UMB W07: Dimensionality reduction

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

import umb_tools as umb
# configuration
plt.rcParams["figure.figsize"] = [5, 4]
pd.set_option("display.float_format", lambda x: "%.4f" % x)

1. Dataset


Loading and normalizing the data

# read the TSV file (returns the dataset as a Pandas DataFrame and a list of class names)
(df, c_names) = umb.read_data("data/BreastCancer.txt")
df
#BreastCancer  labels  radius_mean  texture_mean  perimeter_mean  area_mean  ...  symmetry_worst  fractal_dimension_worst
842302              1      17.9900       10.3800        122.8000  1001.0000  ...          0.4601                   0.1189
874858              1      14.2200       23.1200         94.3700   609.9000  ...          0.5166                   0.1446
875263              1      12.3400       26.8600         81.1500   477.4000  ...          0.3215                   0.1205
87556202            1      14.8600       23.2100        100.4000   671.4000  ...          0.3000                   0.0870
875938              1      13.7700       22.2900         90.6300   588.9000  ...          0.3080                   0.0933
...               ...          ...           ...             ...        ...  ...             ...                      ...
8910720             2      10.7100       20.3900         69.5000   344.9000  ...          0.2605                   0.0870
8910506             2      12.8700       16.2100         82.3800   512.2000  ...          0.3604                   0.0706
8910499             2      13.5900       21.8400         87.1600   561.0000  ...          0.2446                   0.0702
8912055             2      11.7400       14.0200         74.2400   427.3000  ...          0.3101                   0.0669
92751               2       7.7600       24.5400         47.9200   181.0000  ...          0.2871                   0.0704

569 rows × 31 columns
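If umb_tools is not at hand, a rough stand-in for the loading step can be written with pandas alone; a minimal sketch, assuming the file is a plain tab-separated table whose first column holds the sample ID and whose labels column holds the numeric class (the class-name list is filled in by hand here, since read_data presumably derives it from the file):

import pandas as pd

# hypothetical replacement for umb.read_data() (assumption: tab-separated file,
# first column = sample ID used as the index, "labels" column = numeric class label)
df = pd.read_csv("data/BreastCancer.txt", sep="\t", index_col=0)

# class names assumed from the class summary printed below; read_data() may obtain them differently
c_names = ["Malignant", "Benign"]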

# extract the class labels (first column)
labels = df.iloc[:, 0].to_numpy()

# extract the data matrix (remaining 30 feature columns)
data = df.iloc[:, 1:].to_numpy()

# normalization (standardization: zero mean, unit variance per feature)
data = StandardScaler().fit_transform(data)
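As a quick sanity check (not in the original notebook), each feature should now have a mean close to 0 and a standard deviation close to 1:

# verify the standardization: per-feature means should be ~0 and standard deviations ~1
print(f"max |feature mean| after scaling: {abs(data.mean(axis=0)).max():.4f}")
print(f"feature std range after scaling:  {data.std(axis=0).min():.4f} - {data.std(axis=0).max():.4f}")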

Class information

# get the indices of the samples belonging to each class
(c_labels, c_index) = umb.class_info(labels)

# number of classes
c_n = len(c_labels)

# class summary
print(f"\nClasses: {c_n}")
for i in range(c_n):
    print(f"  name = {c_names[i]}, label = {c_labels[i]}, samples = {len(c_index[i])}")

Classes: 2
  name = Malignant, label = 1, samples = 212
  name = Benign, label = 2, samples = 357


2. Principal Component Analysis (PCA)

from sklearn.decomposition import PCA
# compute the projection of the data onto a three-dimensional space
n_components = 3

pca_model = PCA(n_components)
pc = pca_model.fit_transform(data)

print(f"\nData size in the original space: {data.shape[0]} samples, {data.shape[1]} features")
print(f"Data size in principal components space: {pc.shape[0]} samples, {pc.shape[1]} features")

Data size in the original space: 569 samples, 30 features
Data size in principal components space: 569 samples, 3 features
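A useful follow-up (not shown in the original output) is to check how much of the total variance the three components retain:

# fraction of the total variance captured by each of the three principal components
print(f"explained variance ratio: {pca_model.explained_variance_ratio_}")
print(f"total for 3 components:   {pca_model.explained_variance_ratio_.sum():.4f}")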
fig = plt.figure(figsize=(10, 4))

# 2D scatter plot of the first two principal components
ax = fig.add_subplot(1, 2, 1)

for i in range(c_n):
    ax.scatter(pc[c_index[i], 0], pc[c_index[i], 1], color=umb.class_color(i), label=c_names[i], s=10)  
       
ax.legend()
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")


# 3D scatter plot of the first three principal components
ax = fig.add_subplot(1, 2, 2, projection="3d")

for i in range(c_n):
    ax.scatter(pc[c_index[i], 0], pc[c_index[i], 1], pc[c_index[i], 2], color=umb.class_color(i), label=c_names[i], s=10)  

ax.legend()
ax.set(xticklabels=[], yticklabels=[], zticklabels=[])
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")

plt.show()
(Matplotlib output: 2D and 3D scatter plots of the data in the principal-component space, coloured by class)
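To see how the choice of n_components = 3 relates to the full variance spectrum, the cumulative explained variance can be plotted over all 30 components; a supplementary sketch, not part of the original lab:

import numpy as np
from sklearn.decomposition import PCA

# fit PCA with all components and plot the cumulative explained variance
pca_full = PCA().fit(data)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.plot(range(1, len(cum_var) + 1), cum_var, marker="o")
ax.set_xlabel("number of components")
ax.set_ylabel("cumulative explained variance")
plt.show()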
The same plots can be produced directly with the helper functions from umb_tools:

fig = plt.figure(figsize=(10, 4))

# 2D plot (umb_tools helper)
ax = fig.add_subplot(1, 2, 1)
umb.pca_plot(data, labels, c_names, ax)

# 3D plot (umb_tools helper)
ax = fig.add_subplot(1, 2, 2, projection="3d")
umb.pca_plot_3d(data, labels, c_names, ax)

plt.show()
(Matplotlib output: the same 2D and 3D PCA scatter plots, produced by the umb_tools helpers)
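As an optional extension (not part of the original notebook), the information lost by keeping only three components can be quantified by mapping them back to the 30-dimensional feature space with inverse_transform and measuring the reconstruction error:

import numpy as np

# reconstruct the standardized data from the 3 principal components
data_rec = pca_model.inverse_transform(pc)

# mean squared reconstruction error; 0 would mean no information was lost
mse = np.mean((data - data_rec) ** 2)
print(f"mean squared reconstruction error: {mse:.4f}")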