import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances, confusion_matrix
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn_extra.cluster import KMedoids
# ref: https://pyseer.readthedocs.io/en/master/_modules/pyseer/cmdscale.html
def cmdscale(D):
"""Classical multidimensional scaling (MDS)
Args:
D (numpy.array)
Symmetric distance matrix (n, n)
Returns:
Y (numpy.array)
Configuration matrix (n, p). Each column represents a dimension. Only the
p dimensions corresponding to positive eigenvalues of B are returned.
Note that each dimension is only determined up to an overall sign,
corresponding to a reflection.
"""
# Number of points
n = len(D)
# Centering matrix
H = np.eye(n) - np.ones((n, n))/n
# YY^T
B = -H.dot(D**2).dot(H)/2
# Diagonalize
evals, evecs = np.linalg.eigh(B)
# Sort by eigenvalue in descending order
idx = np.argsort(evals)[::-1]
evals = evals[idx]
evecs = evecs[:, idx]
# Compute the coordinates using positive-eigenvalued components only
w, = np.where(evals > 0)
L = np.diag(np.sqrt(evals[w]))
V = evecs[:, w]
Y = V.dot(L)
return Y
# Source: https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
def silhouette_plot(kmeans, data, ax):
n_clusters = kmeans.n_clusters
cluster_labels = kmeans.labels_
silhouette_avg = silhouette_score(data, cluster_labels)
sample_silhouette_values = silhouette_samples(data, cluster_labels)
y_lower = 10
for i in range(n_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.nipy_spectral(float(i) / n_clusters)
ax.fill_betweenx(
np.arange(y_lower, y_upper),
0,
ith_cluster_silhouette_values,
facecolor=color,
edgecolor=color,
alpha=0.7,
)
# Label the silhouette plots with their cluster numbers at the middle
ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for next plot
y_lower = y_upper + 10 # 10 for the 0 samples
ax.set_title("Silhouette plot of $K=%i$ clusters." % n_clusters)
ax.set_xlabel("Silhouette Width")
ax.set_ylabel("Cluster Label")
# The vertical line for average silhouette score of all the values
ax.axvline(x=silhouette_avg, color="red", linestyle="--")
ax.set_yticks([]) # Clear the yaxis labels / ticks
ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
To elucidate the concept of multidimensional scaling, let's consider a dataset from a study that aimed to determine which pairs of letters people frequently confuse.
The table below is from Wolford and Hollingsworth (1974). The set comprises eight letters: C, D, G, H, M, N, Q, and W. This results in an 8×8 matrix. Each cell in this matrix indicates the frequency with which people confuse the corresponding row and column letters.
D = np.zeros((8, 8))
D[1:8, 0] = [5, 12, 2, 2, 2, 9, 1]
D[2:8, 1] = [2, 4, 3, 4, 20, 5]
D[3:8, 2] = [3, 2, 1, 9, 2]
D[4:8, 3] = [19, 18, 1, 5]
D[5:8, 4] = [16, 2, 18]
D[6:8, 5] = [8, 13]
D[7, 6] = 4
D = D + D.T
letters = ["C", "D", "G", "H", "M", "N", "Q", "W"]
D = pd.DataFrame(D, columns=letters, index=letters)
D
|   | C | D | G | H | M | N | Q | W |
|---|---|---|---|---|---|---|---|---|
| C | 0.0 | 5.0 | 12.0 | 2.0 | 2.0 | 2.0 | 9.0 | 1.0 |
| D | 5.0 | 0.0 | 2.0 | 4.0 | 3.0 | 4.0 | 20.0 | 5.0 |
| G | 12.0 | 2.0 | 0.0 | 3.0 | 2.0 | 1.0 | 9.0 | 2.0 |
| H | 2.0 | 4.0 | 3.0 | 0.0 | 19.0 | 18.0 | 1.0 | 5.0 |
| M | 2.0 | 3.0 | 2.0 | 19.0 | 0.0 | 16.0 | 2.0 | 18.0 |
| N | 2.0 | 4.0 | 1.0 | 18.0 | 16.0 | 0.0 | 8.0 | 13.0 |
| Q | 9.0 | 20.0 | 9.0 | 1.0 | 2.0 | 8.0 | 0.0 | 4.0 |
| W | 1.0 | 5.0 | 2.0 | 5.0 | 18.0 | 13.0 | 4.0 | 0.0 |
The matrix $\mathbf{D}$ is symmetric and represents similarity, not distance. A higher value signifies that the two letters (row and column) are easily confused, indicating their similarity.
First, we convert the similarity matrix $\mathbf{D}$ to a distance matrix $\mathbf{D}_0$ via $21-\mathbf{D}$. This reverses the ordering (high similarity becomes small distance) while keeping all entries positive, since the largest value in $\mathbf{D}$ is 20. We also set the diagonal of $\mathbf{D}_0$ to zero, since the distance from a letter to itself should be zero.
D0 = 21 - D
np.fill_diagonal(D0.values, 0)
Applying classical multidimensional scaling to this distance matrix and plotting the first two coordinates gives a visual representation of the letters. We also tried an alternative similarity-to-distance transformation, $41-\mathbf{D}$; the results were largely consistent, apart from a change of scale.
tmp = cmdscale(D0)
fig, axes = plt.subplots(nrows=1, ncols=2)
axes[0].plot(tmp[:, 0], tmp[:, 1], 'none', markersize=5)
axes[0].set_xlim(-15, 15)
axes[0].set_ylim(-15, 15)
for i, txt in enumerate(letters):
axes[0].annotate(txt, (tmp[i, 0], tmp[i, 1]), ha='center', va='center')
D1 = 41 - D
np.fill_diagonal(D1.values, 0)
tmp = cmdscale(D1)
axes[1].plot(tmp[:, 0], tmp[:, 1], 'none', markersize=5)
axes[1].set_xlim(-20, 20)
axes[1].set_ylim(-20, 20)
for i, txt in enumerate(letters):
axes[1].annotate(txt, (tmp[i, 0], tmp[i, 1]), ha='center', va='center')
fig.set_figwidth(10)
fig.tight_layout()
From the visual representation, it becomes evident that the letters D and Q are similar, as are C and G. Additionally, a cluster of four other letters seems to share certain similarities.
Next, we apply clustering algorithms in Python to the "sportsranks" dataset. This dataset contains 130 rows and seven columns, one per sport. Each row records an individual's ranking of the seven sports, from 1 to 7, without ties.
url = "https://raw.githubusercontent.com/liangfgithub/liangfgithub.github.io/master/Data/sportsranks.csv"
sportsranks = pd.read_csv(url)
sportsranks.head()
|   | Baseball | Football | Basketball | Tennis | Cycling | Swimming | Jogging |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 3 | 7 | 2 | 4 | 5 | 6 |
| 1 | 1 | 3 | 2 | 5 | 4 | 7 | 6 |
| 2 | 1 | 3 | 2 | 5 | 4 | 7 | 6 |
| 3 | 4 | 7 | 3 | 1 | 5 | 6 | 2 |
| 4 | 2 | 3 | 1 | 7 | 6 | 5 | 4 |
To analyze this data, we employed the K-means clustering method with K set to 2 and 3.
km2 = KMeans(n_clusters=2, n_init=10).fit(sportsranks)
km3 = KMeans(n_clusters=3, n_init=10).fit(sportsranks)
To visualize the results, we used a multidimensional scaling (MDS) representation plotted in two dimensions, computed with the `cmdscale` function defined above.
D = pairwise_distances(sportsranks)
Xmds = cmdscale(D)
fig, axes = plt.subplots(nrows=1, ncols=2)
sns.scatterplot(x=Xmds[:, 0], y=Xmds[:, 1], hue=km2.labels_,
style=km2.labels_, palette="Set2", legend=False, ax=axes[0])
sns.scatterplot(x=Xmds[:, 0], y=Xmds[:, 1], hue=km3.labels_,
style=km3.labels_, palette="Set2", legend=False, ax=axes[1])
fig.set_figwidth(10)
fig.tight_layout()
Moreover, we computed the MDS manually: square the distances, double-center, take a singular value decomposition, and keep the first two columns as the two-dimensional configuration. A correlation check between these coordinates and those returned by the `cmdscale` function shows they match up to sign (correlations of ±1), as expected since each MDS dimension is only determined up to a reflection.
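For reference, the double-centering used below (and inside `cmdscale`) can be written as

$$
\mathbf{B} \;=\; -\tfrac{1}{2}\,\mathbf{H}\,\mathbf{D}^{(2)}\,\mathbf{H},
\qquad
\mathbf{H} \;=\; \mathbf{I}_n - \tfrac{1}{n}\mathbf{1}\mathbf{1}^{\top},
$$

where $\mathbf{D}^{(2)}$ is the matrix of squared pairwise distances. The two-dimensional configuration is $\mathbf{V}_2\boldsymbol{\Lambda}_2^{1/2}$, built from the top two eigenvectors and eigenvalues of $\mathbf{B}$; since $\mathbf{B}$ is symmetric and its leading eigenvalues are positive here, an SVD of $\mathbf{B}$ recovers the same components.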
D2 = D ** 2
n = D2.shape[0]
D2_mean = D2.mean()
D2_colMean = np.broadcast_to(np.mean(D2, axis=0), (n, n))
D2_rowMean = D2_colMean.T
tmp = D2 - D2_colMean - D2_rowMean
tmp = -(tmp + D2_mean) / 2
tmp_svd = np.linalg.svd(tmp)
F = tmp_svd[0][:, 0:2] @ np.diag(tmp_svd[1][0:2] ** 0.5)
# check the correlations
print(np.corrcoef(F[:, 0], Xmds[:, 0])[0, 1])
print(np.corrcoef(F[:, 0], Xmds[:, 1])[0, 1])
print(np.corrcoef(F[:, 1], Xmds[:, 0])[0, 1])
print(np.corrcoef(F[:, 1], Xmds[:, 1])[0, 1])
1.0
-1.5331171381692608e-16
4.3823592026289615e-17
-0.9999999999999993
The plots suggest that K = 2 is sufficient, since the K = 3 solution already shows overlapping clusters. To verify our choice of K, we employed the gap statistic method.
We will compute the gap statistic step by step.
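As a reminder, write $W_K$ for the within-cluster sum of squares with $K$ clusters. The gap statistic compares $\log W_K$ on the observed data with its average over $B$ reference datasets drawn from a null distribution (here, uniformly random rankings):

$$
\widehat{\mathrm{Gap}}(K) \;=\; \frac{1}{B}\sum_{b=1}^{B}\log W^{*}_{Kb} \;-\; \log W_K,
\qquad
s_K \;=\; \mathrm{sd}_b\!\left(\log W^{*}_{Kb}\right)\sqrt{1 + 1/B},
$$

and the 1-SE rule selects the smallest $K$ with $\widehat{\mathrm{Gap}}(K) \ge \widehat{\mathrm{Gap}}(K+1) - s_{K+1}$. Below, `lss` holds $\log W_K$ for the observed data and `lss0` holds the corresponding values for $B = 50$ reference datasets.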
Each call to `KMeans` below uses `n_init=10`, meaning K-means is run ten times from different random initializations, selecting the run yielding the smallest within-cluster sum of squares.

n = 130
ss = []
ss.append(np.sum(np.var(sportsranks, ddof=1)) * (n - 1))
for i in range(2, 11):
km_temp = KMeans(n_clusters=i, n_init=10).fit(sportsranks)
ss.append(km_temp.inertia_)
lss = np.log(ss)
m = 7   # each ranking is a permutation of the numbers 1 to 7
B = 50  # number of reference datasets
lss0 = np.zeros((B, 10))
for i in range(B):
    # reference data: n independent uniformly random permutations of 1, ..., 7
    xstar = np.array(
        [np.random.choice(range(1, m + 1), size=m, replace=False) for _ in range(n)])
    lss0[i, 0] = np.log(np.sum(np.var(xstar, axis=0, ddof=1)) * (n - 1))
    for j in range(2, 11):
        km_temp = KMeans(n_clusters=j, n_init=10).fit(xstar)
        lss0[i, j - 1] = np.log(km_temp.inertia_)
Upon plotting the log(SS) against K values, we observed that the red curve, representing the log(SS) from the original data, consistently decreased. In contrast, the 50 grey curves, representing the log(SS) from the reference data, displayed a more varied pattern. This difference in patterns highlighted that while the sum of squares always diminishes with an increasing K, the gap doesn't necessarily follow the same trend.
plt.plot(range(1, 11), lss0.T, color='gray', alpha=0.2)   # reference data
plt.plot(range(1, 11), lss, color='red')                  # observed data
plt.xlabel('K')
plt.ylabel('log(Within SS)')
plt.show()
Next, plot the gap curve along with one-standard-error bands. Based on the 1-SE rule, the gap statistic selects K = 2, so we concluded that K = 2 is the optimal choice, confirming our earlier visual observations. (A programmatic version of the rule is sketched after the plot.)
lss0m = lss0.mean(axis=0)
lss0sd = lss0.std(axis=0, ddof=1) * np.sqrt(1 + 1 / B)
diff = lss0m - lss   # estimated gap for K = 1, ..., 10
plt.plot(range(1, 11), diff, marker='o')
plt.plot(range(1, 11), diff + lss0sd, linestyle='--')
plt.plot(range(1, 11), diff - lss0sd, linestyle='--')
plt.xlabel("K")
plt.ylabel("Gap Statistic")
plt.show()
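As an optional aside (not part of the original analysis), the 1-SE rule can be applied programmatically to the quantities computed above; with `diff[k]` holding the estimated gap at K = k + 1, this should reproduce the choice of K = 2:

ok = diff[:-1] >= diff[1:] - lss0sd[1:]            # Gap(K) >= Gap(K+1) - s(K+1)
K_1se = int(np.argmax(ok)) + 1 if ok.any() else 10  # smallest K satisfying the rule
print(K_1se)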
We also examine silhouette plots for the two K-means solutions (K = 2 and K = 3).

fig, axes = plt.subplots(nrows=1, ncols=2)
silhouette_plot(km2, sportsranks, axes[0])
silhouette_plot(km3, sportsranks, axes[1])
fig.set_figwidth(10)
fig.tight_layout()
Next, compute and plot the average silhouette width over a range of K values.
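Recall that for observation $i$, with $a(i)$ its average distance to the other members of its own cluster and $b(i)$ the smallest average distance to the members of any other cluster, the silhouette width is

$$
s(i) \;=\; \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}},
$$

and `silhouette_score` reports the average of $s(i)$ over all observations.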
k_max = 10
sil = []
for i in range(2, k_max + 1):
km_temp = KMeans(n_clusters=i, n_init=10).fit(sportsranks)
sil_avg = silhouette_score(sportsranks, km_temp.labels_)
sil.append(sil_avg)
plt.plot(range(2, k_max + 1), sil, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Average silhouette score')
plt.axvline(x=sil.index(max(sil)) + 2, linestyle="--")
plt.show()
Next, we fit PAM (K-medoids) with K = 2 and K = 3 and produce the corresponding silhouette plots and MDS scatter plots.

pam2 = KMedoids(n_clusters=2, method='pam').fit(sportsranks)
pam3 = KMedoids(n_clusters=3, method='pam').fit(sportsranks)
fig, axes = plt.subplots(nrows=1, ncols=2)
silhouette_plot(pam2, sportsranks, axes[0])
silhouette_plot(pam3, sportsranks, axes[1])
fig.set_figwidth(10)
fig.tight_layout()
D = pairwise_distances(sportsranks)
Xmds = cmdscale(D)
fig, axes = plt.subplots(nrows=1, ncols=2)
sns.scatterplot(x=Xmds[:, 0], y=Xmds[:, 1], hue=pam2.labels_,
style=pam2.labels_, palette="Set2", legend=False, ax=axes[0])
sns.scatterplot(x=Xmds[:, 0], y=Xmds[:, 1], hue=pam3.labels_,
style=pam3.labels_, palette="Set2", legend=False, ax=axes[1])
fig.set_figwidth(10)
fig.tight_layout()
Compare PAM with K-means using confusion matrices.
confusion_matrix(km2.labels_, pam2.labels_)
array([[60,  2],
       [ 0, 68]], dtype=int64)
confusion_matrix(km3.labels_, pam3.labels_)
array([[29,  4,  0],
       [ 0, 38,  0],
       [ 2,  0, 57]], dtype=int64)
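The two partitions clearly agree closely: for K = 2, only 2 of the 130 observations are assigned differently. As an optional check (not in the original analysis), one can match the cluster labels with the Hungarian algorithm and compute the agreement rate directly:

from scipy.optimize import linear_sum_assignment
cm = confusion_matrix(km2.labels_, pam2.labels_)
row, col = linear_sum_assignment(-cm)        # best one-to-one matching of cluster labels
print(cm[row, col].sum() / cm.sum())         # (60 + 68) / 130, about 0.98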
Finally, we run hierarchical clustering on the same pairwise distances with complete, single, and average linkage, and plot the dendrograms. Since `linkage` expects a condensed distance vector, we pass the upper triangle of D.

dm = D[np.triu_indices(n, k=1)]
Z = linkage(dm, method='complete')
fig, ax = plt.subplots()
d = dendrogram(Z, orientation='top', ax=ax)
ax.set_title('Complete Linkage')
ax.set_xlabel('Label')
ax.set_ylabel('Height')
fig.set_figwidth(16)
Z = linkage(dm, method='single')
fig, ax = plt.subplots()
d = dendrogram(Z, orientation='top', ax=ax)
ax.set_title('Single Linkage')
ax.set_xlabel('Label')
ax.set_ylabel('Height')
fig.set_figwidth(16)
Z = linkage(dm, method='average')
fig, ax = plt.subplots()
d = dendrogram(Z, orientation='top', ax=ax)
ax.set_title('Average Linkage')
ax.set_xlabel('Label')
ax.set_ylabel('Height')
fig.set_figwidth(16)
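To turn a dendrogram into hard cluster assignments, the tree can be cut at a desired number of clusters. A brief sketch (not part of the original analysis) using scipy's `fcluster`, cross-tabulated against the K-means labels:

from scipy.cluster.hierarchy import fcluster
hc2 = fcluster(linkage(dm, method='complete'), t=2, criterion='maxclust')  # two clusters
confusion_matrix(km2.labels_, hc2)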
Let's demonstrate an interesting application of k-means: image compression. Take, for instance, this image of my son alongside his cherished toy, Curious George.
X = pd.read_table("../image.txt", sep=r"\s+", header=None)
X = X.to_numpy()
plt.imshow(X, cmap='gray')
plt.show()
To compress this image, we divide it into small 6x6 patches. For each patch, we take its grayscale values and stack them into a 36x1 vector. The resulting data matrix, which we call `Z`, has dimensions 2700x36: the original image has been divided into 2,700 patches.
p = 36                    # each 6 x 6 patch has 36 grayscale values
Z = np.empty((0, p))
n = (270 * 360) // p      # number of patches: 2700
# slide over the image in 6 x 6 blocks, flattening each block into a row of Z
for i in range(360 // 6):
    for j in range(270 // 6):
        k = 6 * i
        l = 6 * j
        tmpX = X[l:(l + 6), k:(k + 6)].flatten()
        Z = np.vstack((Z, tmpX))
Z.shape
(2700, 36)
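As a note on the implementation, growing `Z` with `np.vstack` copies the whole array on every iteration. Assuming the 270x360 image and 6x6 patches above, a single reshape/transpose produces the same matrix, with rows in the same order as the double loop:

Z_fast = X.reshape(270 // 6, 6, 360 // 6, 6).transpose(2, 0, 1, 3).reshape(-1, 36)
print(np.allclose(Z, Z_fast))   # should print True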
Every row in matrix `Z` represents a patch, depicted as a vector in this 36-dimensional space. We'll then apply K-means to `Z`, setting K=40. Consequently, we'll identify 40 distinct cluster centers. Each of these centers is represented by a 36x1 vector, which can also be visualized as a 6x6 patch.
With these centers in hand, we'll reconstruct the image, replacing the original 6x6 patches with their nearest cluster centers. The end result is an image reconstructed using only 40 unique patches. Naturally, this compressed image isn't as detailed as the original, which utilized 2,700 distinct patches, but it significantly reduces the file size. Despite this compression, the main features remain discernible: we can still identify both my son's face and Curious George. And impressively, the entire compression process hinges on the simple K-means algorithm.
mykm = KMeans(n_clusters=40, n_init=10).fit(Z)
mykm_pred = mykm.cluster_centers_[mykm.predict(Z)]
newX = np.zeros((270, 360))
m = 0
for i in range(360 // 6):
for j in range(270 // 6):
k = 6 * i
l = 6 * j
newX[l:(l + 6), k:(k + 6)] = mykm_pred[m].reshape(6, 6)
m += 1
plt.imshow(newX, cmap='gray')
plt.show()
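To make the size-reduction claim concrete, here is a rough back-of-the-envelope count (assuming we store one cluster label per patch plus the 40 cluster-center patches, and ignoring any further encoding):

n_pixels = 270 * 360            # 97,200 grayscale values in the original image
n_stored = 2700 + 40 * 36       # 2,700 labels plus 40 centers of 36 values = 4,140
print(n_pixels, n_stored, n_pixels / n_stored)   # roughly a 23-fold reduction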