UMB W07: redukcja wymiarowości
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import umb_tools as umb
# Notebook-wide display configuration.
# pandas: render every float with four decimal places
pd.set_option("display.float_format", lambda value: "%.4f" % value)
# matplotlib: default figure size used when a figure does not set its own
plt.rcParams.update({"figure.figsize": [5, 4]})
1. Zbiór danych
Wczytanie i normalizacja danych
# Read the TSV file; umb.read_data returns the dataset as a pandas DataFrame
# together with the list of class names.
df, c_names = umb.read_data("data/BreastCancer.txt")
# Bare expression: displays the frame as the notebook cell's output.
df
#BreastCancer | labels | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
842302 | 1 | 17.9900 | 10.3800 | 122.8000 | 1001.0000 | 0.1184 | 0.2776 | 0.3001 | 0.1471 | 0.2419 | ... | 25.3800 | 17.3300 | 184.6000 | 2019.0000 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.1189 |
874858 | 1 | 14.2200 | 23.1200 | 94.3700 | 609.9000 | 0.1075 | 0.2413 | 0.1981 | 0.0662 | 0.2384 | ... | 15.7400 | 37.1800 | 106.4000 | 762.4000 | 0.1533 | 0.9327 | 0.8488 | 0.1772 | 0.5166 | 0.1446 |
875263 | 1 | 12.3400 | 26.8600 | 81.1500 | 477.4000 | 0.1034 | 0.1353 | 0.1085 | 0.0456 | 0.1943 | ... | 15.6500 | 39.3400 | 101.7000 | 768.9000 | 0.1785 | 0.4706 | 0.4425 | 0.1459 | 0.3215 | 0.1205 |
87556202 | 1 | 14.8600 | 23.2100 | 100.4000 | 671.4000 | 0.1044 | 0.1980 | 0.1697 | 0.0888 | 0.1737 | ... | 16.0800 | 27.7800 | 118.6000 | 784.7000 | 0.1316 | 0.4648 | 0.4589 | 0.1727 | 0.3000 | 0.0870 |
875938 | 1 | 13.7700 | 22.2900 | 90.6300 | 588.9000 | 0.1200 | 0.1267 | 0.1385 | 0.0653 | 0.1834 | ... | 16.3900 | 34.0100 | 111.6000 | 806.9000 | 0.1737 | 0.3122 | 0.3809 | 0.1673 | 0.3080 | 0.0933 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8910720 | 2 | 10.7100 | 20.3900 | 69.5000 | 344.9000 | 0.1082 | 0.1289 | 0.0845 | 0.0287 | 0.1668 | ... | 11.6900 | 25.2100 | 76.5100 | 410.4000 | 0.1335 | 0.2550 | 0.2534 | 0.0860 | 0.2605 | 0.0870 |
8910506 | 2 | 12.8700 | 16.2100 | 82.3800 | 512.2000 | 0.0943 | 0.0622 | 0.0390 | 0.0162 | 0.2010 | ... | 13.9000 | 23.6400 | 89.2700 | 597.5000 | 0.1256 | 0.1808 | 0.1992 | 0.0578 | 0.3604 | 0.0706 |
8910499 | 2 | 13.5900 | 21.8400 | 87.1600 | 561.0000 | 0.0796 | 0.0826 | 0.0407 | 0.0214 | 0.1635 | ... | 14.8000 | 30.0400 | 97.6600 | 661.5000 | 0.1005 | 0.1730 | 0.1453 | 0.0619 | 0.2446 | 0.0702 |
8912055 | 2 | 11.7400 | 14.0200 | 74.2400 | 427.3000 | 0.0781 | 0.0434 | 0.0225 | 0.0276 | 0.2101 | ... | 13.3100 | 18.2600 | 84.7000 | 533.7000 | 0.1036 | 0.0850 | 0.0673 | 0.0829 | 0.3101 | 0.0669 |
92751 | 2 | 7.7600 | 24.5400 | 47.9200 | 181.0000 | 0.0526 | 0.0436 | 0.0000 | 0.0000 | 0.1587 | ... | 9.4560 | 30.3700 | 59.1600 | 268.6000 | 0.0900 | 0.0644 | 0.0000 | 0.0000 | 0.2871 | 0.0704 |
569 rows × 31 columns
# Class labels are taken from the first column of the frame.
labels = df.iloc[:, 0].to_numpy()
# All remaining columns form the data matrix, which is normalized with
# StandardScaler (fitted and applied in a single step).
data = StandardScaler().fit_transform(df.iloc[:, 1:].to_numpy())
Informacje o klasach
# Gather per-class information from the label vector: the class labels and,
# for each class, the indices of its samples (as returned by umb.class_info).
(c_labels, c_index) = umb.class_info(labels)
# number of classes
c_n = len(c_labels)
# Print the number of classes followed by one summary line per class:
# its name, its label and how many samples belong to it.
print(f"\nClasses: {c_n}")
for i in range(c_n):
    print(f" name = {c_names[i]}, label = {c_labels[i]}, samples = {len(c_index[i])}")
Classes: 2 name = Malignant, label = 1, samples = 212 name = Benign, label = 2, samples = 357
2. Analiza składowych głównych (PCA, Principal Component Analysis)
from sklearn.decomposition import PCA

# Project the data into a three-dimensional principal-component space.
n_components = 3
pca_model = PCA(n_components=n_components)
pc = pca_model.fit_transform(data)

# Report the data dimensions before and after the projection.
print(
    f"\nData size in the original space: {data.shape[0]} samples, {data.shape[1]} features",
    f"Data size in principal components space: {pc.shape[0]} samples, {pc.shape[1]} features",
    sep="\n",
)
Data size in the original space: 569 samples, 30 features Data size in principal components space: 569 samples, 3 features
# Draw the PCA projection twice, side by side: a 2D scatter of the first two
# components and a 3D scatter of all three. Each class is plotted separately
# so that it gets its own colour and legend entry.
fig = plt.figure(figsize=(10, 4))

# 2D plot (left panel): PC 1 vs PC 2
ax = fig.add_subplot(1, 2, 1)
for i in range(c_n):
    ax.scatter(
        pc[c_index[i], 0],
        pc[c_index[i], 1],
        color=umb.class_color(i),
        label=c_names[i],
        s=10,
    )
ax.legend()
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")

# 3D plot (right panel): PC 1, PC 2 and PC 3
ax = fig.add_subplot(1, 2, 2, projection="3d")
for i in range(c_n):
    ax.scatter(
        pc[c_index[i], 0],
        pc[c_index[i], 1],
        pc[c_index[i], 2],
        color=umb.class_color(i),
        label=c_names[i],
        s=10,
    )
ax.legend()
# Tick labels are blanked on all three axes; only the axis names remain.
ax.set(xticklabels=[], yticklabels=[], zticklabels=[])
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")

plt.show()
# Same two-panel layout, this time drawn by the umb_tools plotting helpers.
fig = plt.figure(figsize=(10, 4))

# left panel: 2D plot
ax = fig.add_subplot(121)
umb.pca_plot(data, labels, c_names, ax)

# right panel: 3D plot
ax = fig.add_subplot(122, projection="3d")
umb.pca_plot_3d(data, labels, c_names, ax)

plt.show()