UMB W03: dostęp do danych

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import umb_tools as umb
# Configuration: default figure size and four-decimal float formatting in pandas output.
plt.rcParams.update({"figure.figsize": [5, 4]})
pd.options.display.float_format = lambda value: "%.4f" % value

1. Wczytywanie zbioru danych

# Load the TSV file; read_data returns the dataset as a pandas DataFrame
# together with the list of class names.
df, c_names = umb.read_data("data/BreastCancer.txt")

# Bare expression on the last line so the notebook renders the DataFrame.
df
#BreastCancer labels radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
842302 1 17.9900 10.3800 122.8000 1001.0000 0.1184 0.2776 0.3001 0.1471 0.2419 ... 25.3800 17.3300 184.6000 2019.0000 0.1622 0.6656 0.7119 0.2654 0.4601 0.1189
874858 1 14.2200 23.1200 94.3700 609.9000 0.1075 0.2413 0.1981 0.0662 0.2384 ... 15.7400 37.1800 106.4000 762.4000 0.1533 0.9327 0.8488 0.1772 0.5166 0.1446
875263 1 12.3400 26.8600 81.1500 477.4000 0.1034 0.1353 0.1085 0.0456 0.1943 ... 15.6500 39.3400 101.7000 768.9000 0.1785 0.4706 0.4425 0.1459 0.3215 0.1205
87556202 1 14.8600 23.2100 100.4000 671.4000 0.1044 0.1980 0.1697 0.0888 0.1737 ... 16.0800 27.7800 118.6000 784.7000 0.1316 0.4648 0.4589 0.1727 0.3000 0.0870
875938 1 13.7700 22.2900 90.6300 588.9000 0.1200 0.1267 0.1385 0.0653 0.1834 ... 16.3900 34.0100 111.6000 806.9000 0.1737 0.3122 0.3809 0.1673 0.3080 0.0933
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8910720 2 10.7100 20.3900 69.5000 344.9000 0.1082 0.1289 0.0845 0.0287 0.1668 ... 11.6900 25.2100 76.5100 410.4000 0.1335 0.2550 0.2534 0.0860 0.2605 0.0870
8910506 2 12.8700 16.2100 82.3800 512.2000 0.0943 0.0622 0.0390 0.0162 0.2010 ... 13.9000 23.6400 89.2700 597.5000 0.1256 0.1808 0.1992 0.0578 0.3604 0.0706
8910499 2 13.5900 21.8400 87.1600 561.0000 0.0796 0.0826 0.0407 0.0214 0.1635 ... 14.8000 30.0400 97.6600 661.5000 0.1005 0.1730 0.1453 0.0619 0.2446 0.0702
8912055 2 11.7400 14.0200 74.2400 427.3000 0.0781 0.0434 0.0225 0.0276 0.2101 ... 13.3100 18.2600 84.7000 533.7000 0.1036 0.0850 0.0673 0.0829 0.3101 0.0669
92751 2 7.7600 24.5400 47.9200 181.0000 0.0526 0.0436 0.0000 0.0000 0.1587 ... 9.4560 30.3700 59.1600 268.6000 0.0900 0.0644 0.0000 0.0000 0.2871 0.0704

569 rows × 31 columns

# Print a summary of the DataFrame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 569 entries, 842302 to 92751
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   labels                   569 non-null    int32  
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  569 non-null    float64
 15  smoothness_se            569 non-null    float64
 16  compactness_se           569 non-null    float64
 17  concavity_se             569 non-null    float64
 18  concave points_se        569 non-null    float64
 19  symmetry_se              569 non-null    float64
 20  fractal_dimension_se     569 non-null    float64
 21  radius_worst             569 non-null    float64
 22  texture_worst            569 non-null    float64
 23  perimeter_worst          569 non-null    float64
 24  area_worst               569 non-null    float64
 25  smoothness_worst         569 non-null    float64
 26  compactness_worst        569 non-null    float64
 27  concavity_worst          569 non-null    float64
 28  concave points_worst     569 non-null    float64
 29  symmetry_worst           569 non-null    float64
 30  fractal_dimension_worst  569 non-null    float64
dtypes: float64(30), int32(1)
memory usage: 140.0+ KB
# Sample names are taken from the DataFrame index.
samples = df.index

# Column 0 holds the class labels; all remaining columns form the data matrix.
labels = df.iloc[:, 0].to_numpy()
data = df.iloc[:, 1:].to_numpy()

# Report the dimensions of the data matrix (rows = samples, columns = features).
n_samples, n_features = data.shape
print(f"\nSamples: {n_samples} \nFeatures: {n_features}")

Samples: 569 
Features: 30
# Get the class labels and, for each class, the indices of its samples.
c_labels, c_index = umb.class_info(labels)

# Number of classes.
c_n = len(c_labels)

# Per-class summary: name, label and number of samples.
print(f"\nClasses: {c_n}")
for cls in range(c_n):
    cls_name, cls_label, cls_size = c_names[cls], c_labels[cls], len(c_index[cls])
    print(f"  name = {cls_name}, label = {cls_label}, samples = {cls_size}")

Classes: 2
  name = Malignant, label = 1, samples = 212
  name = Benign, label = 2, samples = 357


2. Podstawowe właściwości atrybutów

# Zero-based column index (into `data`) of the feature to inspect.
feature_nr = 1

# Values of that feature for every sample.
f_data = data[:, feature_nr]

# Basic statistics over all samples.
overall_min, overall_max = np.min(f_data), np.max(f_data)
overall_mean, overall_std = np.mean(f_data), np.std(f_data)
print(f"\nmin = {overall_min:.4f}\nmax = {overall_max:.4f}")
print(f"mean = {overall_mean:.4f}\nstd. dev. = {overall_std:.4f}")

# The same statistics computed separately within each class.
for cls in range(c_n):
    print(f"\n{c_names[cls]}:")

    # Select this class's samples once and reuse the selection for all four statistics.
    class_values = f_data[c_index[cls]]
    print(f"  min = {np.min(class_values):.4f}\n  max = {np.max(class_values):.4f}")
    print(f"  mean = {np.mean(class_values):.4f}\n  std. dev. = {np.std(class_values):.4f}")

min = 9.7100
max = 39.2800
mean = 19.2896
std. dev. = 4.2973

Malignant:
  min = 10.3800
  max = 39.2800
  mean = 21.6049
  std. dev. = 3.7705

Benign:
  min = 9.7100
  max = 33.8100
  mean = 17.9148
  std. dev. = 3.9895


3. Wizualizacja pojedynczych atrybutów


Wykresy słupkowe wartości i średnich w klasach

# One row with two panels: per-sample values first, class means second.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
ax_values, ax_means = ax

# Values of the individual samples.
umb.bar_plot(f_data, labels, c_names, ax_values)

# Mean values within the classes.
umb.bar_plot_mean(f_data, labels, c_names, ax_means)

plt.show()
Matplotlib output

Estymowane rozkłady prawdopodobieństwa w klasach

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Each panel pairs a plotting helper with its title:
#   - probability density function estimated with a histogram,
#   - probability density function estimated with KDE.
pdf_panels = (
    (umb.histogram, "Feature PDF (histogram)"),
    (umb.kde_plot, "Feature PDF (KDE)"),
)
for panel, (draw, title) in zip(ax, pdf_panels):
    draw(f_data, labels, c_names, panel)
    panel.set_title(title)

plt.show()
Matplotlib output

Rozrzuty wartości w klasach

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# First panel: box plot; second panel: violin plot.
for panel, draw in zip(ax, (umb.box_plot, umb.violin_plot)):
    draw(f_data, labels, c_names, panel)

plt.show()
Matplotlib output


4. Wizualizacja wielu atrybutów


Wykresy punktowe par atrybutów

# Zero-based column indices (into `data`) of the features to visualize together.
feature_nr = [1, 2, 3]

# Values of the selected features: one column per entry of feature_nr.
f_data = data[:,feature_nr]

# NOTE(review): this prints the shape of the full data matrix, not of the
# selection `f_data` made just above — confirm that this is intended.
print(data.shape)

(569, 30)
# Pairwise scatter plots of the selected features, arranged as a lower-triangular
# grid: the column selects the x feature, the row selects the y feature.
n = len(feature_nr)

# squeeze=False keeps `ax` two-dimensional even for a 1x1 grid (n == 2), so the
# ax[row][col] indexing below works for any n >= 2.
fig, ax = plt.subplots(n - 1, n - 1, figsize=(10, 8), squeeze=False)

for row in range(n - 1):
    for col in range(n - 1):
        panel = ax[row][col]

        # Panels above the diagonal have no feature pair assigned; hide them
        # instead of leaving empty frames in the figure.
        if col > row:
            panel.set_visible(False)
            continue

        # x comes from feature `col`, y from feature `row + 1`, so every
        # unordered pair of distinct features is drawn exactly once.
        x = f_data[:, col]
        y = f_data[:, row + 1]

        umb.scatter_plot(x, y, labels, c_names, panel)

        panel.set_xlabel(f"feature {feature_nr[col]}")
        panel.set_ylabel(f"feature {feature_nr[row + 1]}")

plt.show()
Matplotlib output

Heat map

from matplotlib.colors import LinearSegmentedColormap

# Standardize every feature (column of `data`) to zero mean and unit standard deviation.
m = np.mean(data, axis=0)
s = np.std(data, axis=0)
# A constant feature has zero standard deviation; divide by 1 instead so that its
# standardized column is all zeros rather than NaN.
s = np.where(s == 0, 1.0, s)

# Limit the range of the standardized values to [-4, 4].
# The result is named `heat` (not `map`) so the builtin `map` is not shadowed.
heat = np.clip((data - m) / s, -4, 4)

# Colour map running red -> black -> green.
cmap = LinearSegmentedColormap.from_list("RedBlackGreen", ["r", "k", "g"], N=256)
# Alternative: use a named colour map instead, e.g. cmap = "seismic"

# Draw on the Axes created here; the transpose puts features in rows and samples in columns.
fig, ax = plt.subplots(figsize=(12, 4))

ax.imshow(heat.T, interpolation="none", cmap=cmap, aspect=3)
# The trailing semicolon suppresses the echoed return value in the notebook output.
ax.axis("off");
Matplotlib output

Analiza składowych głównych (PCA, Principal Component Analysis)

fig = plt.figure(figsize=(10, 4))

# Each entry: subplot position, extra keyword arguments for add_subplot, plotting helper.
# Position 1 is the 2D plot, position 2 is the 3D plot.
pca_panels = (
    (1, {}, umb.pca_plot),
    (2, {"projection": "3d"}, umb.pca_plot_3d),
)
for position, axes_kwargs, draw in pca_panels:
    ax = fig.add_subplot(1, 2, position, **axes_kwargs)
    draw(data, labels, c_names, ax)

plt.show()
Matplotlib output