UMB W06: klasteryzacja (grupowanie, analiza skupień)¶

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

import umb_tools as umb
# konfiguracja
plt.rcParams["figure.figsize"] = [5, 4]
pd.set_option("display.float_format", lambda x: "%.4f" % x)

1. Zbiór danych¶


Wczytanie i normalizacja danych

# odczyt pliku TSV (zwracane są: zbiór danych w postaci DataFrame biblioteki Pandas oraz lista nazw klas)

(df, c_names) = umb.read_data("data/Leukemia_500_2.txt")
df
#Leukemia_500 labels M23197_at U05259_rna1_at X03934_at U23852_s_at M31523_at M84371_rna1_s_at D88270_at U46499_at D00749_s_at ... U24704_at M71243_f_at U19247_rna1_s_at M60750_f_at U55766_at U25029_at M34516_r_at D13315_at M22638_at U09087_s_at
ALLB 01 1 8.0279 13.1870 9.2831 9.6166 10.3663 11.5073 10.9403 5.4594 10.8895 ... 8.8549 7.5774 7.8642 10.2095 6.2854 7.9715 12.0084 9.3663 9.1085 8.4838
ALLB 46 1 5.2095 11.7326 8.6582 8.1799 9.9366 10.8033 11.7723 5.5069 9.4939 ... 8.5737 5.9773 6.3537 13.1322 6.1898 7.5236 8.6970 9.4717 8.2336 6.2668
ALLB 47 1 8.1649 11.3214 8.2192 9.1898 10.0389 9.6055 10.9099 5.9773 9.6129 ... 8.6257 8.9944 3.5850 9.3509 5.9073 5.4594 11.9908 9.6018 9.0362 6.6439
ALLB 48 1 7.7944 12.4594 8.4051 8.4512 11.2455 11.1911 13.3102 6.1293 10.6027 ... 9.1344 8.3794 8.8138 11.4722 6.4263 8.3794 9.3923 10.9329 8.5430 7.5850
ALLB 49 1 8.6073 13.1812 9.0471 9.5411 10.5186 11.2064 12.9146 6.5236 10.5421 ... 8.1212 7.4094 7.6582 9.8265 6.0388 5.9307 11.4460 8.2143 8.3619 7.7347
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
AML 30 2 10.1421 8.4221 8.3129 7.2574 7.9658 7.9715 5.6724 10.3487 10.1293 ... 6.5236 8.5353 9.2621 9.9454 6.7549 6.2668 11.5464 9.4838 7.7549 3.9069
AML 29 2 9.1163 8.6865 8.0980 5.1293 8.1241 7.8138 5.8826 10.0348 9.8009 ... 8.7381 11.2784 9.1799 8.4594 7.1799 8.1241 9.7228 7.4676 9.4429 5.8580
AML 65 2 9.1293 8.2336 8.4388 7.4178 8.5508 8.5812 7.1895 10.5981 10.8098 ... 8.3659 8.0224 7.4178 9.8345 5.9433 6.0000 9.4676 7.8202 8.7582 6.0549
AML 50 2 9.8580 8.7964 8.4263 8.0768 7.6220 9.4009 8.1359 10.6275 10.4909 ... 7.8826 8.2992 9.1007 9.5622 7.1085 5.1699 9.3815 9.1059 8.2432 5.4919
AML 66 2 8.4136 9.2831 8.7313 8.8392 9.0444 9.0000 7.7482 7.3487 10.7474 ... 8.9396 10.7952 6.9773 7.9484 6.9773 5.8580 10.5746 9.8611 9.0084 5.6147

72 rows × 501 columns

# pobranie etykiet klas
labels = df.iloc[:, 0].to_numpy()

# pobranie nazw próbek
samples = df.index

# pobranie macierzy danych
data = df.iloc[:, 1:].to_numpy()

# normalizacja
data = StandardScaler().fit_transform(data)

Analiza składowych głównych

# wykres PCA
umb.pca_plot(data, labels, c_names, show=True)
Matplotlib output


2. Klasteryzacja hierarchiczna (hierarchical clustering)¶


Macierz niepodobieństwa

import scipy.spatial.distance as dist
# wyznaczenie macierzy niepodobieństwa (miarą jest odległość wynikajaca ze współczynnika korelacji)
dmtx = dist.pdist(data, metric="correlation")


# rysowanie
fig, ax = plt.subplots()

im = ax.imshow(dist.squareform(dmtx), interpolation="none", cmap="hot")
fig.colorbar(im, ax=ax)

ax.set_axis_off()
Matplotlib output

Dendrogram

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
# tworzenie dendrogramu
lnk = linkage(dmtx, "average")


# rysowanie dendrogramu
fig, ax = plt.subplots(figsize=(10, 5))
dendrogram(lnk, ax=ax, labels=samples, color_threshold=0)
plt.show()
Matplotlib output

Tworzenie klastrów

# rysowanie dendrogramu z podziałem na klastry (kryterium jest odległość > 1)
fig, ax = plt.subplots(figsize=(10, 5))
dd = dendrogram(lnk, ax=ax, labels=samples, color_threshold=1)
plt.show()
Matplotlib output
# utworzenie klastrów
clusters = fcluster(lnk, t=1, criterion="distance")

c_df = pd.DataFrame({"samples": samples,
                     "labels": clusters})
c_df
samples labels
0 ALLB 01 3
1 ALLB 46 3
2 ALLB 47 3
3 ALLB 48 3
4 ALLB 49 3
... ... ...
67 AML 30 1
68 AML 29 1
69 AML 65 1
70 AML 50 1
71 AML 66 2

72 rows × 2 columns

# wykres PCA z uwzględnieniem wyników klasteryzacji
umb.pca_plot(data, c_df.labels, title="Hierarchical clustering")
Matplotlib output


3. Algorytm K-średnich (K-means)¶

from sklearn.cluster import KMeans
# wykonanie klasteryzacji
km_model = KMeans(n_clusters=3, init="random", n_init=10, random_state=0)
km_model = km_model.fit(data)


# pobranie informacji o klastrach
c_df = pd.DataFrame({"samples": samples,
                     "labels": km_model.labels_})
c_df

e:\Projects\Python\Anaconda\jupyter\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
samples labels
0 ALLB 01 1
1 ALLB 46 1
2 ALLB 47 1
3 ALLB 48 1
4 ALLB 49 1
... ... ...
67 AML 30 2
68 AML 29 2
69 AML 65 2
70 AML 50 2
71 AML 66 2

72 rows × 2 columns

# wykres PCA z uwzględnieniem wyników klasteryzacji
umb.pca_plot(data, c_df.labels,title="k-means clustering")
Matplotlib output


2.4 Ocena wyników klasteryzacji (miara silhouette)¶

from sklearn.metrics import silhouette_samples, silhouette_score
# wyznaczenie wartości miary silhouette dla próbek
sil = silhouette_samples(data, labels=c_df.labels, metric="euclidean")


# rysowanie wartości miary silhouette dla próbek
index=np.argsort(c_df.labels)

umb.bar_plot(sil[index], c_df.labels[index])
Matplotlib output
# wartość średnia miary silhouette
mean_sil = silhouette_score(data, labels=c_df.labels, metric="euclidean")

print(f"\nmean sil = {mean_sil:.4f} ({np.mean(sil):.4f})")

mean sil = 0.1788 (0.1788)

Estymacja liczby klastrów dla algorytmu K-średnich

# uruchomienie alg. K-średnich dla K = 2, 3, ..., 10 i wyznaczenie średniej miary silhouette

sil_scores = []
for k in range(2, 11):
    km_model = (KMeans(n_clusters=k, init="random", n_init=10, random_state=0)).fit(data)
 
    mean_sil = silhouette_score(data, km_model.labels_)
    sil_scores.append(mean_sil)

plt.plot(range(2, 11), sil_scores, marker="o")
plt.xlabel("Number of clusters (K)")
plt.ylabel("silhouette")
plt.show()

print(f"\nOptimal number of clusters: {np.argmax(sil_scores)+2}")

e:\Projects\Python\Anaconda\jupyter\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
e:\Projects\Python\Anaconda\jupyter\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
e:\Projects\Python\Anaconda\jupyter\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
e:\Projects\Python\Anaconda\jupyter\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
e:\Projects\Python\Anaconda\jupyter\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
e:\Projects\Python\Anaconda\jupyter\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
e:\Projects\Python\Anaconda\jupyter\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
e:\Projects\Python\Anaconda\jupyter\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
e:\Projects\Python\Anaconda\jupyter\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
Matplotlib output

Optimal number of clusters: 3