import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances, confusion_matrix
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn_extra.cluster import KMedoids
# ref: https://pyseer.readthedocs.io/en/master/_modules/pyseer/cmdscale.html
def cmdscale(D):
"""Classical multidimensional scaling (MDS)
Args:
D (numpy.array)
Symmetric distance matrix (n, n)
Returns:
Y (numpy.array)
Configuration matrix (n, p). Each column represents a dimension. Only the
p dimensions corresponding to positive eigenvalues of B are returned.
Note that each dimension is only determined up to an overall sign,
corresponding to a reflection.
"""
# Number of points
n = len(D)
# Centering matrix
H = np.eye(n) - np.ones((n, n))/n
# YY^T
B = -H.dot(D**2).dot(H)/2
# Diagonalize
evals, evecs = np.linalg.eigh(B)
# Sort by eigenvalue in descending order
idx = np.argsort(evals)[::-1]
evals = evals[idx]
evecs = evecs[:, idx]
# Compute the coordinates using positive-eigenvalued components only
w, = np.where(evals > 0)
L = np.diag(np.sqrt(evals[w]))
V = evecs[:, w]
Y = V.dot(L)
return Y
# Source: https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
def silhouette_plot(kmeans, data, ax):
n_clusters = kmeans.n_clusters
cluster_labels = kmeans.labels_
silhouette_avg = silhouette_score(data, cluster_labels)
sample_silhouette_values = silhouette_samples(data, cluster_labels)
y_lower = 10
for i in range(n_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.nipy_spectral(float(i) / n_clusters)
ax.fill_betweenx(
np.arange(y_lower, y_upper),
0,
ith_cluster_silhouette_values,
facecolor=color,
edgecolor=color,
alpha=0.7,
)
# Label the silhouette plots with their cluster numbers at the middle
ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for next plot
y_lower = y_upper + 10 # 10 for the 0 samples
ax.set_title("Silhouette plot of $K=%i$ clusters." % n_clusters)
ax.set_xlabel("Silhouette Width")
ax.set_ylabel("Cluster Label")
# The vertical line for average silhouette score of all the values
ax.axvline(x=silhouette_avg, color="red", linestyle="--")
ax.set_yticks([]) # Clear the yaxis labels / ticks
ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
To elucidate the concept of multidimensional scaling, let's consider a dataset from a study that aimed to determine which pairs of letters people frequently confuse.
The table below is from Wolford and Hollingsworth (1974). The set comprises eight letters: C, D, G, H, M, N, Q, and W. This results in an 8×8 matrix. Each cell in this matrix indicates the frequency with which people confuse the corresponding row and column letters.
D = np.zeros((8, 8))
D[1:8, 0] = [5, 12, 2, 2, 2, 9, 1]
D[2:8, 1] = [2, 4, 3, 4, 20, 5]
D[3:8, 2] = [3, 2, 1, 9, 2]
D[4:8, 3] = [19, 18, 1, 5]
D[5:8, 4] = [16, 2, 18]
D[6:8, 5] = [8, 13]
D[7, 6] = 4
D = D + D.T
letters = ["C", "D", "G", "H", "M", "N", "Q", "W"]
D = pd.DataFrame(D, columns=letters, index=letters)
D
|   | C | D | G | H | M | N | Q | W |
|---|---|---|---|---|---|---|---|---|
| C | 0.0 | 5.0 | 12.0 | 2.0 | 2.0 | 2.0 | 9.0 | 1.0 |
| D | 5.0 | 0.0 | 2.0 | 4.0 | 3.0 | 4.0 | 20.0 | 5.0 |
| G | 12.0 | 2.0 | 0.0 | 3.0 | 2.0 | 1.0 | 9.0 | 2.0 |
| H | 2.0 | 4.0 | 3.0 | 0.0 | 19.0 | 18.0 | 1.0 | 5.0 |
| M | 2.0 | 3.0 | 2.0 | 19.0 | 0.0 | 16.0 | 2.0 | 18.0 |
| N | 2.0 | 4.0 | 1.0 | 18.0 | 16.0 | 0.0 | 8.0 | 13.0 |
| Q | 9.0 | 20.0 | 9.0 | 1.0 | 2.0 | 8.0 | 0.0 | 4.0 |
| W | 1.0 | 5.0 | 2.0 | 5.0 | 18.0 | 13.0 | 4.0 | 0.0 |
The matrix $\mathbf{D}$ is symmetric and represents similarity, not distance. A higher value signifies that the two letters (row and column) are easily confused, indicating their similarity.
First, we convert the similarity matrix $\mathbf{D}$ to a distance matrix $\mathbf{D}_0$ via $21-\mathbf{D}$. This reverses the ordering (high similarity becomes small distance) while keeping all entries positive, since the largest value in $\mathbf{D}$ is 20. We also set the diagonal of $\mathbf{D}_0$ to zero, since the distance from a letter to itself should be zero.
D0 = 21 - D
np.fill_diagonal(D0.values, 0)
Applying classical multidimensional scaling to this distance matrix and plotting the first two coordinates gives a visual representation of the letters. We also tried an alternative similarity-to-distance transformation, $41-\mathbf{D}$; the results were largely consistent, apart from a change of scale.
tmp = cmdscale(D0)
fig, axes = plt.subplots(nrows=1, ncols=2)
axes[0].plot(tmp[:, 0], tmp[:, 1], 'none', markersize=5)
axes[0].set_xlim(-15, 15)
axes[0].set_ylim(-15, 15)
for i, txt in enumerate(letters):
axes[0].annotate(txt, (tmp[i, 0], tmp[i, 1]), ha='center', va='center')
D1 = 41 - D
np.fill_diagonal(D1.values, 0)
tmp = cmdscale(D1)
axes[1].plot(tmp[:, 0], tmp[:, 1], 'none', markersize=5)
axes[1].set_xlim(-20, 20)
axes[1].set_ylim(-20, 20)
for i, txt in enumerate(letters):
axes[1].annotate(txt, (tmp[i, 0], tmp[i, 1]), ha='center', va='center')
fig.set_figwidth(10)
fig.tight_layout()
From the visual representation, it becomes evident that the letters D and Q are similar, as are C and G. Additionally, a cluster of four other letters seems to share certain similarities.
Next, we apply clustering algorithms in Python to the "sportsranks" dataset. This dataset contains 130 rows and seven columns, one per sport. Each row records an individual's ranking of the seven sports, from 1 to 7, without ties.
url = "https://raw.githubusercontent.com/liangfgithub/liangfgithub.github.io/master/Data/sportsranks.csv"
sportsranks = pd.read_csv(url)
sportsranks.head()
|   | Baseball | Football | Basketball | Tennis | Cycling | Swimming | Jogging |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 3 | 7 | 2 | 4 | 5 | 6 |
| 1 | 1 | 3 | 2 | 5 | 4 | 7 | 6 |
| 2 | 1 | 3 | 2 | 5 | 4 | 7 | 6 |
| 3 | 4 | 7 | 3 | 1 | 5 | 6 | 2 |
| 4 | 2 | 3 | 1 | 7 | 6 | 5 | 4 |
To analyze this data, we employed the K-means clustering method with K set to 2 and 3.
km2 = KMeans(n_clusters=2, n_init=10).fit(sportsranks)
km3 = KMeans(n_clusters=3, n_init=10).fit(sportsranks)
To visualize the results, we used a multidimensional scaling (MDS) representation plotted in two dimensions, computed with the `cmdscale` function defined above.
D = pairwise_distances(sportsranks)
Xmds = cmdscale(D)
fig, axes = plt.subplots(nrows=1, ncols=2)
sns.scatterplot(x=Xmds[:, 0], y=Xmds[:, 1], hue=km2.labels_,
style=km2.labels_, palette="Set2", legend=False, ax=axes[0])
sns.scatterplot(x=Xmds[:, 0], y=Xmds[:, 1], hue=km3.labels_,
style=km3.labels_, palette="Set2", legend=False, ax=axes[1])
fig.set_figwidth(10)
fig.tight_layout()
Moreover, we computed the MDS manually: square the distances, double-center, take a singular value decomposition, and keep the first two columns as the two-dimensional configuration. A correlation check between these coordinates and those returned by the `cmdscale` function shows they match up to sign (correlations of ±1), as expected since each MDS dimension is only determined up to a reflection.
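For reference, the double-centering used below (and inside `cmdscale`) can be written as

$$
\mathbf{B} \;=\; -\tfrac{1}{2}\,\mathbf{H}\,\mathbf{D}^{(2)}\,\mathbf{H},
\qquad
\mathbf{H} \;=\; \mathbf{I}_n - \tfrac{1}{n}\mathbf{1}\mathbf{1}^{\top},
$$

where $\mathbf{D}^{(2)}$ is the matrix of squared pairwise distances. The two-dimensional configuration is $\mathbf{V}_2\boldsymbol{\Lambda}_2^{1/2}$, built from the top two eigenvectors and eigenvalues of $\mathbf{B}$; since $\mathbf{B}$ is symmetric and its leading eigenvalues are positive here, an SVD of $\mathbf{B}$ recovers the same components.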
D2 = D ** 2
n = D2.shape[0]
D2_mean = D2.mean()
D2_colMean = np.broadcast_to(np.mean(D2, axis=0), (n, n))
D2_rowMean = D2_colMean.T
tmp = D2 - D2_colMean - D2_rowMean
tmp = -(tmp + D2_mean) / 2
tmp_svd = np.linalg.svd(tmp)
F = tmp_svd[0][:, 0:2] @ np.diag(tmp_svd[1][0:2] ** 0.5)
# check the correlations
print(np.corrcoef(F[:, 0], Xmds[:, 0])[0, 1])
print(np.corrcoef(F[:, 0], Xmds[:, 1])[0, 1])
print(np.corrcoef(F[:, 1], Xmds[:, 0])[0, 1])
print(np.corrcoef(F[:, 1], Xmds[:, 1])[0, 1])
1.0
-1.5331171381692608e-16
4.3823592026289615e-17
-0.9999999999999993
The plots suggest that K = 2 is sufficient, since the K = 3 solution already shows overlapping clusters. To verify our choice of K, we employed the gap statistic method.
We will compute the gap statistic step by step.
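As a reminder, write $W_K$ for the within-cluster sum of squares with $K$ clusters. The gap statistic compares $\log W_K$ on the observed data with its average over $B$ reference datasets drawn from a null distribution (here, uniformly random rankings):

$$
\widehat{\mathrm{Gap}}(K) \;=\; \frac{1}{B}\sum_{b=1}^{B}\log W^{*}_{Kb} \;-\; \log W_K,
\qquad
s_K \;=\; \mathrm{sd}_b\!\left(\log W^{*}_{Kb}\right)\sqrt{1 + 1/B},
$$

and the 1-SE rule selects the smallest $K$ with $\widehat{\mathrm{Gap}}(K) \ge \widehat{\mathrm{Gap}}(K+1) - s_{K+1}$. Below, `lss` holds $\log W_K$ for the observed data and `lss0` holds the corresponding values for $B = 50$ reference datasets.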
Each call to `KMeans` below uses `n_init=10`, meaning K-means is run ten times from different random initializations, selecting the run yielding the smallest within-cluster sum of squares.

n = 130
ss = []
ss.append(np.sum(np.var(sportsranks, ddof=1)) * (n - 1))
for i in range(2, 11):
km_temp = KMeans(n_clusters=i, n_init=10).fit(sportsranks)
ss.append(km_temp.inertia_)
lss = np.log(ss)
m = 7   # each ranking is a permutation of the numbers 1 to 7
B = 50  # number of reference datasets
lss0 = np.zeros((B, 10))
for i in range(B):
    # reference data: n independent uniformly random permutations of 1, ..., 7
    xstar = np.array(
        [np.random.choice(range(1, m + 1), size=m, replace=False) for _ in range(n)])
    lss0[i, 0] = np.log(np.sum(np.var(xstar, axis=0, ddof=1)) * (n - 1))
    for j in range(2, 11):
        km_temp = KMeans(n_clusters=j, n_init=10).fit(xstar)
        lss0[i, j - 1] = np.log(km_temp.inertia_)
Upon plotting the log(SS) against K values, we observed that the red curve, representing the log(SS) from the original data, consistently decreased. In contrast, the 50 grey curves, representing the log(SS) from the reference data, displayed a more varied pattern. This difference in patterns highlighted that while the sum of squares always diminishes with an increasing K, the gap doesn't necessarily follow the same trend.
plt.plot(range(1, 11), lss0.T, color='gray', alpha=0.2)   # reference data
plt.plot(range(1, 11), lss, color='red')                  # observed data
plt.xlabel('K')
plt.ylabel('log(Within SS)')
plt.show()
Next, plot the gap curve along with one-standard-error bands. Based on the 1-SE rule, the gap statistic selects K = 2, so we concluded that K = 2 is the optimal choice, confirming our earlier visual observations. (A programmatic version of the rule is sketched after the plot.)
lss0m = lss0.mean(axis=0)
lss0sd = lss0.std(axis=0, ddof=1) * np.sqrt(1 + 1 / B)
diff = lss0m - lss   # estimated gap for K = 1, ..., 10
plt.plot(range(1, 11), diff, marker='o')
plt.plot(range(1, 11), diff + lss0sd, linestyle='--')
plt.plot(range(1, 11), diff - lss0sd, linestyle='--')
plt.xlabel("K")
plt.ylabel("Gap Statistic")
plt.show()
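As an optional aside (not part of the original analysis), the 1-SE rule can be applied programmatically to the quantities computed above; with `diff[k]` holding the estimated gap at K = k + 1, this should reproduce the choice of K = 2:

ok = diff[:-1] >= diff[1:] - lss0sd[1:]            # Gap(K) >= Gap(K+1) - s(K+1)
K_1se = int(np.argmax(ok)) + 1 if ok.any() else 10  # smallest K satisfying the rule
print(K_1se)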
We also examine silhouette plots for the two K-means solutions (K = 2 and K = 3).

fig, axes = plt.subplots(nrows=1, ncols=2)
silhouette_plot(km2, sportsranks, axes[0])
silhouette_plot(km3, sportsranks, axes[1])
fig.set_figwidth(10)
fig.tight_layout()
Next, compute and plot the average silhouette width over a range of K values.
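Recall that for observation $i$, with $a(i)$ its average distance to the other members of its own cluster and $b(i)$ the smallest average distance to the members of any other cluster, the silhouette width is

$$
s(i) \;=\; \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}},
$$

and `silhouette_score` reports the average of $s(i)$ over all observations.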
k_max = 10
sil = []
for i in range(2, k_max + 1):
km_temp = KMeans(n_clusters=i, n_init=10).fit(sportsranks)
sil_avg = silhouette_score(sportsranks, km_temp.labels_)
sil.append(sil_avg)
plt.plot(range(2, k_max + 1), sil, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Average silhouette score')
plt.axvline(x=sil.index(max(sil)) + 2, linestyle="--")
plt.show()
Next, we fit PAM (K-medoids) with K = 2 and K = 3 and produce the corresponding silhouette plots and MDS scatter plots.

pam2 = KMedoids(n_clusters=2, method='pam').fit(sportsranks)
pam3 = KMedoids(n_clusters=3, method='pam').fit(sportsranks)
fig, axes = plt.subplots(nrows=1, ncols=2)
silhouette_plot(pam2, sportsranks, axes[0])
silhouette_plot(pam3, sportsranks, axes[1])
fig.set_figwidth(10)
fig.tight_layout()
D = pairwise_distances(sportsranks)
Xmds = cmdscale(D)
fig, axes = plt.subplots(nrows=1, ncols=2)
sns.scatterplot(x=Xmds[:, 0], y=Xmds[:, 1], hue=pam2.labels_,
style=pam2.labels_, palette="Set2", legend=False, ax=axes[0])
sns.scatterplot(x=Xmds[:, 0], y=Xmds[:, 1], hue=pam3.labels_,
style=pam3.labels_, palette="Set2", legend=False, ax=axes[1])
fig.set_figwidth(10)
fig.tight_layout()
Compare PAM with K-means using confusion matrices.
confusion_matrix(km2.labels_, pam2.labels_)
array([[60,  2],
       [ 0, 68]], dtype=int64)
confusion_matrix(km3.labels_, pam3.labels_)
array([[29,  4,  0],
       [ 0, 38,  0],
       [ 2,  0, 57]], dtype=int64)
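The two partitions clearly agree closely: for K = 2, only 2 of the 130 observations are assigned differently. As an optional check (not in the original analysis), one can match the cluster labels with the Hungarian algorithm and compute the agreement rate directly:

from scipy.optimize import linear_sum_assignment
cm = confusion_matrix(km2.labels_, pam2.labels_)
row, col = linear_sum_assignment(-cm)        # best one-to-one matching of cluster labels
print(cm[row, col].sum() / cm.sum())         # (60 + 68) / 130, about 0.98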
Finally, we run hierarchical clustering on the same pairwise distances with complete, single, and average linkage, and plot the dendrograms. Since `linkage` expects a condensed distance vector, we pass the upper triangle of D.

dm = D[np.triu_indices(n, k=1)]
Z = linkage(dm, method='complete')
fig, ax = plt.subplots()
d = dendrogram(Z, orientation='top', ax=ax)
ax.set_title('Complete Linkage')
ax.set_xlabel('Label')
ax.set_ylabel('Height')
fig.set_figwidth(16)
Z = linkage(dm, method='single')
fig, ax = plt.subplots()
d = dendrogram(Z, orientation='top', ax=ax)
ax.set_title('Single Linkage')
ax.set_xlabel('Label')
ax.set_ylabel('Height')
fig.set_figwidth(16)
Z = linkage(dm, method='average')
fig, ax = plt.subplots()
d = dendrogram(Z, orientation='top', ax=ax)
ax.set_title('Average Linkage')
ax.set_xlabel('Label')
ax.set_ylabel('Height')
fig.set_figwidth(16)
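To turn a dendrogram into hard cluster assignments, the tree can be cut at a desired number of clusters. A brief sketch (not part of the original analysis) using scipy's `fcluster`, cross-tabulated against the K-means labels:

from scipy.cluster.hierarchy import fcluster
hc2 = fcluster(linkage(dm, method='complete'), t=2, criterion='maxclust')  # two clusters
confusion_matrix(km2.labels_, hc2)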
Let's demonstrate an interesting application of k-means: image compression. Take, for instance, this image of my son alongside his cherished toy, Curious George.
X = pd.read_table("../image.txt", sep=r"\s+", header=None)
X = X.to_numpy()
plt.imshow(X, cmap='gray')
plt.show()
To compress this image, we divide it into small 6x6 patches. For each patch, we take its grayscale values and stack them into a 36x1 vector. The resulting data matrix, which we call `Z`, has dimensions 2700x36: the original image has been divided into 2,700 patches.
p = 36                    # each 6 x 6 patch has 36 grayscale values
Z = np.empty((0, p))
n = (270 * 360) // p      # number of patches: 2700
# slide over the image in 6 x 6 blocks, flattening each block into a row of Z
for i in range(360 // 6):
    for j in range(270 // 6):
        k = 6 * i
        l = 6 * j
        tmpX = X[l:(l + 6), k:(k + 6)].flatten()
        Z = np.vstack((Z, tmpX))
Z.shape
(2700, 36)
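As a note on the implementation, growing `Z` with `np.vstack` copies the whole array on every iteration. Assuming the 270x360 image and 6x6 patches above, a single reshape/transpose produces the same matrix, with rows in the same order as the double loop:

Z_fast = X.reshape(270 // 6, 6, 360 // 6, 6).transpose(2, 0, 1, 3).reshape(-1, 36)
print(np.allclose(Z, Z_fast))   # should print True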
Every row in matrix `Z` represents a patch, depicted as a vector in this 36-dimensional space. We'll then apply K-means to `Z`, setting K=40. Consequently, we'll identify 40 distinct cluster centers. Each of these centers is represented by a 36x1 vector, which can also be visualized as a 6x6 patch.
With these centers in hand, we'll reconstruct the image, replacing the original 6x6 patches with their nearest cluster centers. The end result is an image reconstructed using only 40 unique patches. Naturally, this compressed image isn't as detailed as the original, which utilized 2,700 distinct patches, but it significantly reduces the file size. Despite this compression, the main features remain discernible: we can still identify both my son's face and Curious George. And impressively, the entire compression process hinges on the simple K-means algorithm.
mykm = KMeans(n_clusters=40, n_init=10).fit(Z)
mykm_pred = mykm.cluster_centers_[mykm.predict(Z)]
newX = np.zeros((270, 360))
m = 0
for i in range(360 // 6):
for j in range(270 // 6):
k = 6 * i
l = 6 * j
newX[l:(l + 6), k:(k + 6)] = mykm_pred[m].reshape(6, 6)
m += 1
plt.imshow(newX, cmap='gray')
plt.show()
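To make the size-reduction claim concrete, here is a rough back-of-the-envelope count (assuming we store one cluster label per patch plus the 40 cluster-center patches, and ignoring any further encoding):

n_pixels = 270 * 360            # 97,200 grayscale values in the original image
n_stored = 2700 + 40 * 36       # 2,700 labels plus 40 centers of 36 values = 4,140
print(n_pixels, n_stored, n_pixels / n_stored)   # roughly a 23-fold reduction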