UMB W05: przetwarzanie wstępne
import pandas as pd
import matplotlib.pyplot as plt
import umb_tools as umb
# configuration: default matplotlib figure size as [width, height]
plt.rc("figure", figsize=[5, 4])
# render floats with exactly four decimal places in pandas output
pd.set_option("display.float_format", "{:.4f}".format)
1. Wczytanie zbioru danych
# read the TSV file; read_data returns the dataset as a pandas DataFrame
# together with the list of class names
df, c_names = umb.read_data("data/BreastCancer.txt")
# last expression of the cell: display the loaded DataFrame
df
#BreastCancer | labels | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
842302 | 1 | 17.9900 | 10.3800 | 122.8000 | 1001.0000 | 0.1184 | 0.2776 | 0.3001 | 0.1471 | 0.2419 | ... | 25.3800 | 17.3300 | 184.6000 | 2019.0000 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.1189 |
874858 | 1 | 14.2200 | 23.1200 | 94.3700 | 609.9000 | 0.1075 | 0.2413 | 0.1981 | 0.0662 | 0.2384 | ... | 15.7400 | 37.1800 | 106.4000 | 762.4000 | 0.1533 | 0.9327 | 0.8488 | 0.1772 | 0.5166 | 0.1446 |
875263 | 1 | 12.3400 | 26.8600 | 81.1500 | 477.4000 | 0.1034 | 0.1353 | 0.1085 | 0.0456 | 0.1943 | ... | 15.6500 | 39.3400 | 101.7000 | 768.9000 | 0.1785 | 0.4706 | 0.4425 | 0.1459 | 0.3215 | 0.1205 |
87556202 | 1 | 14.8600 | 23.2100 | 100.4000 | 671.4000 | 0.1044 | 0.1980 | 0.1697 | 0.0888 | 0.1737 | ... | 16.0800 | 27.7800 | 118.6000 | 784.7000 | 0.1316 | 0.4648 | 0.4589 | 0.1727 | 0.3000 | 0.0870 |
875938 | 1 | 13.7700 | 22.2900 | 90.6300 | 588.9000 | 0.1200 | 0.1267 | 0.1385 | 0.0653 | 0.1834 | ... | 16.3900 | 34.0100 | 111.6000 | 806.9000 | 0.1737 | 0.3122 | 0.3809 | 0.1673 | 0.3080 | 0.0933 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8910720 | 2 | 10.7100 | 20.3900 | 69.5000 | 344.9000 | 0.1082 | 0.1289 | 0.0845 | 0.0287 | 0.1668 | ... | 11.6900 | 25.2100 | 76.5100 | 410.4000 | 0.1335 | 0.2550 | 0.2534 | 0.0860 | 0.2605 | 0.0870 |
8910506 | 2 | 12.8700 | 16.2100 | 82.3800 | 512.2000 | 0.0943 | 0.0622 | 0.0390 | 0.0162 | 0.2010 | ... | 13.9000 | 23.6400 | 89.2700 | 597.5000 | 0.1256 | 0.1808 | 0.1992 | 0.0578 | 0.3604 | 0.0706 |
8910499 | 2 | 13.5900 | 21.8400 | 87.1600 | 561.0000 | 0.0796 | 0.0826 | 0.0407 | 0.0214 | 0.1635 | ... | 14.8000 | 30.0400 | 97.6600 | 661.5000 | 0.1005 | 0.1730 | 0.1453 | 0.0619 | 0.2446 | 0.0702 |
8912055 | 2 | 11.7400 | 14.0200 | 74.2400 | 427.3000 | 0.0781 | 0.0434 | 0.0225 | 0.0276 | 0.2101 | ... | 13.3100 | 18.2600 | 84.7000 | 533.7000 | 0.1036 | 0.0850 | 0.0673 | 0.0829 | 0.3101 | 0.0669 |
92751 | 2 | 7.7600 | 24.5400 | 47.9200 | 181.0000 | 0.0526 | 0.0436 | 0.0000 | 0.0000 | 0.1587 | ... | 9.4560 | 30.3700 | 59.1600 | 268.6000 | 0.0900 | 0.0644 | 0.0000 | 0.0000 | 0.2871 | 0.0704 |
569 rows × 31 columns
# pick two features from the data matrix by column position; the summary
# printed below shows these are radius_mean and fractal_dimension_mean
feature_positions = [1, 10]
data = df.iloc[:, feature_positions]
# summary statistics of the raw (unscaled) features
data.describe()
#BreastCancer | radius_mean | fractal_dimension_mean |
---|---|---|
count | 569.0000 | 569.0000 |
mean | 14.1273 | 0.0628 |
std | 3.5240 | 0.0071 |
min | 6.9810 | 0.0500 |
25% | 11.7000 | 0.0577 |
50% | 13.3700 | 0.0615 |
75% | 15.7800 | 0.0661 |
max | 28.1100 | 0.0974 |
Standaryzacja
from sklearn.preprocessing import StandardScaler
# the same two features as before, selected by column position
data = df.iloc[:, [1, 10]]
# standardization: fit the scaler on the features and transform them
scaler = StandardScaler()
norm_data = scaler.fit_transform(data)
# fit_transform returns a plain array, so wrap it back into a DataFrame with
# the original column index before summarizing.
# NOTE: describe() reports the sample standard deviation (ddof=1), whereas
# StandardScaler divides by the population one (ddof=0); the displayed std is
# therefore sqrt(n / (n - 1)) ~ 1.0009 rather than exactly 1.
pd.DataFrame(norm_data, columns=data.columns).describe()
#BreastCancer | radius_mean | fractal_dimension_mean |
---|---|---|
count | 569.0000 | 569.0000 |
mean | -0.0000 | 0.0000 |
std | 1.0009 | 1.0009 |
min | -2.0296 | -1.8199 |
25% | -0.6894 | -0.7226 |
50% | -0.2151 | -0.1783 |
75% | 0.4694 | 0.4710 |
max | 3.9713 | 4.9109 |
Skalowanie min-max
from sklearn.preprocessing import MinMaxScaler
# the same two features as before, selected by column position
data = df.iloc[:, [1, 10]]
# min-max scaling with the scaler's default settings; the summary below
# shows each column ending up with min 0 and max 1
scaler = MinMaxScaler()
norm_data = scaler.fit_transform(data)
# wrap the resulting array back into a DataFrame with the original column
# index so the summary keeps the feature names
pd.DataFrame(norm_data, columns=data.columns).describe()
#BreastCancer | radius_mean | fractal_dimension_mean |
---|---|---|
count | 569.0000 | 569.0000 |
mean | 0.3382 | 0.2704 |
std | 0.1668 | 0.1487 |
min | 0.0000 | 0.0000 |
25% | 0.2233 | 0.1630 |
50% | 0.3024 | 0.2439 |
75% | 0.4164 | 0.3404 |
max | 1.0000 | 1.0000 |
Znaczenie normalizacji danych
# class labels: the first column of the DataFrame
labels = df.iloc[:, 0].to_numpy()
# data matrix: every remaining column, i.e. all features
data = df.iloc[:, 1:].to_numpy()
# standardized copy of the full data matrix
data_norm = StandardScaler().fit_transform(data)
# PCA plots side by side: raw data on the left, standardized on the right.
# The features live on very different scales (e.g. area_mean in the hundreds
# vs. smoothness_mean around 0.1), which is what this comparison illustrates.
# NOTE(review): umb.pca_plot is a project helper; presumably it projects the
# data onto principal components and draws on the given axes - confirm there.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
umb.pca_plot(data, labels, c_names, ax[0], "Without standardization")
umb.pca_plot(data_norm, labels, c_names, ax[1], "With standardization")
plt.show()