To perform our analysis, we construct a toy dataset that is simple to analyze: overlapping 2D Gaussian clusters that are randomly squeezed and rotated, surrounded by a ring of points.
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
#rcParams['font.size'] = 30
np.random.seed(1)
num_samples = 1000
d = 2
num_clusters = 4
colors = ['r', 'g', 'b', 'k', 'y', 'c', 'm']
num_samples_per_cluster = num_samples//num_clusters
avg_intercluster_distance = 9.0
centers = np.sqrt(avg_intercluster_distance)*np.random.randn(num_clusters, d)
def transformation(point, squeeze_factor, theta):
    t = theta
    R = np.array([[np.cos(t), -np.sin(t)], [np.sin(t), np.cos(t)]])
    squeeze = np.array([[squeeze_factor, 0.0], [0.0, 1.0]])
    return np.dot(R, np.dot(squeeze, point))
clusters = []
data = np.zeros((num_samples, d))
for ind_c in range(num_clusters-1):
    cluster = []
    center = centers[ind_c,:]
    covariance = np.eye(d)
    # Random squeeze factor
    squeeze_factor = np.random.rand()*4
    # Random rotation angle
    theta = np.random.rand()*2.0*np.pi
    for ind_sc in range(num_samples_per_cluster):
        point = np.random.multivariate_normal(center, covariance)
        # Apply the random squeeze-and-rotate (linear) transformation.
        point = transformation(point, squeeze_factor, theta)
        cluster.append(point)
        ind_sample = ind_c * num_samples_per_cluster + ind_sc
        data[ind_sample, :] = point
    clusters.append(cluster)
# The last cluster is a ring of points of radius avg_intercluster_distance.
thetas = np.random.rand(num_samples_per_cluster)*2.0*np.pi
xs = avg_intercluster_distance * np.cos(thetas)
ys = avg_intercluster_distance * np.sin(thetas)
cluster = [np.array([xs[ind_t], ys[ind_t]]) for ind_t in range(num_samples_per_cluster)]
clusters.append(cluster)
data[-num_samples_per_cluster:, :] = np.array(cluster)
# Plot the clusters with different colors.
plt.figure()
for ind_c in range(num_clusters):
    cluster = np.array(clusters[ind_c])
    (xs, ys) = cluster.T
    plt.plot(xs, ys, 'o', color=colors[ind_c])
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.show()
Spectral embedding and other "kernel trick" methods use a kernel $K(s_i, s_j)$ that measures the similarity between data points. The choice of kernel is part of the procedure and typically requires some intuition and trial-and-error.
A popular choice is the Gaussian kernel $$ K(s_i, s_j) = \exp\left(- \frac{||s_i - s_j||^2}{2\sigma^2}\right), $$ where the bandwidth $\sigma$ sets the length scale over which points are considered similar.
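For intuition, with the bandwidth $\sigma = 0.4$ used below, two points much closer than $\sigma$ have similarity close to one, while points a few $\sigma$ apart are essentially dissimilar (a quick illustrative check, with separations chosen arbitrarily):
# Gaussian similarity at two separations, with sigma = 0.4 as used below.
sigma = 0.4
for separation in [0.1, 2.0]:
    print(separation, np.exp(-separation**2 / (2.0*sigma**2)))
# Separation 0.1 gives ~0.97; separation 2.0 gives ~3.7e-6.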
%matplotlib notebook
import numpy.linalg as nla
# Gaussian kernel with variance sigma^2.
sigma = 0.4
def kernel(s_i, s_j):
    return np.exp(-nla.norm(s_i - s_j)**2.0 / (2.0*sigma**2.0))
# The kernel matrix of the data.
kernel_matrix = np.zeros((num_samples, num_samples))
for i in range(num_samples):
    for j in range(i, num_samples):
        kernel_matrix[i,j] = kernel(data[i,:], data[j,:])
        kernel_matrix[j,i] = kernel_matrix[i,j]
plt.matshow(kernel_matrix)
A key step of spectral embedding is the construction of the graph Laplacian, $L = D - K$, where $D_{ij} = \delta_{ij} \sum_{l=1}^n K(s_i, s_l)$ is the degree matrix and $K_{ij} = K(s_i, s_j)$ is the edge-weight matrix (the kernel matrix). The graph Laplacian can be interpreted as an effective quadratic Hamiltonian for masses coupled by harmonic springs.
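To make the spring analogy precise, note that for any assignment of scalar coordinates $x_i$ to the data points, $$ x^\top L x = \frac{1}{2} \sum_{i,j=1}^{n} K_{ij} (x_i - x_j)^2 \geq 0, $$ which is exactly the potential energy of harmonic springs of stiffness $K_{ij}$ connecting the points; in particular, $L$ is positive semi-definite.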
%matplotlib notebook
degrees = np.sum(kernel_matrix, axis=0)
D = np.diag(degrees)
K = kernel_matrix
L = D - K
plt.matshow(L)
Compute the spectrum of $L$. We ignore the eigenvector with eigenvalue exactly zero: for the unnormalized Laplacian of a connected graph it is always proportional to $(1,\ldots,1)$, so it carries no information about the clusters.
%matplotlib notebook
import numpy.linalg as nla
(eigvals, eigvecs) = nla.eigh(L)
print('Smallest eigenvalues = {}'.format(eigvals[0:2*num_clusters]))
plt.plot(eigvals[1:], 'ro')
plt.xlabel('Eigenvalue index')
plt.ylabel('Eigenvalue')
plt.show()
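As a sanity check (a minimal snippet reusing L, eigvals, and num_samples from above), we can confirm that the rows of $L$ sum to zero, so the constant vector is indeed annihilated by $L$:
# The rows of L sum to zero by construction, so (1, ..., 1) is an
# eigenvector of L with eigenvalue 0.
ones = np.ones(num_samples)
print('||L (1,...,1)|| = {}'.format(nla.norm(np.dot(L, ones))))
print('Smallest eigenvalue = {}'.format(eigvals[0]))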
The $k$-means clustering algorithm heuristically groups data into a fixed number, $k$, of clusters by greedily minimizing the within-cluster sum of squared distances to the cluster centroids.
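For reference, here is a minimal sketch of the standard Lloyd iteration underlying $k$-means (illustrative only; the kmeans routine imported from our algorithms module below has its own interface and implementation, and lloyd_kmeans here is a hypothetical helper):
def lloyd_kmeans(X, k, num_iterations, seed=0):
    # Minimal sketch of Lloyd's algorithm: alternately assign points to the
    # nearest centroid and recompute each centroid as the mean of its points.
    rng = np.random.RandomState(seed)
    n = X.shape[0]
    # Initialize the centroids with k distinct data points chosen at random.
    centroids = X[rng.choice(n, size=k, replace=False), :]
    for _ in range(num_iterations):
        # Assignment step: nearest centroid in Euclidean distance.
        distances = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = np.argmin(distances, axis=1)
        # Update step: move each centroid to the mean of its assigned points.
        for ind_cluster in range(k):
            members = X[labels == ind_cluster]
            if len(members) > 0:
                centroids[ind_cluster] = members.mean(axis=0)
    return (labels, centroids)
A call like lloyd_kmeans(data, num_clusters, 20) would return a label for each point and the final centroids.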
First, let's test how well $k$-means can cluster the original data:
from algorithms import kmeans
# The number of clusters to use in k-means.
k = num_clusters
# The number of iterations.
num_iterations = 20
(clusters_info, data_point_assignment, centroids) = kmeans(data, k, num_iterations, verbose=False)
%matplotlib notebook
def plot_clusters(clusters_info):
    ind_plot = 0
    for cluster_inds in clusters_info:
        cluster_data = data[cluster_inds, :]
        (xs, ys) = cluster_data.T
        plt.plot(xs, ys, 'o', color=colors[ind_plot])
        ind_plot += 1
    #plt.xlabel('$x$')
    #plt.ylabel('$y$')
plot_clusters(clusters_info)
# Mark the centroids found by k-means.
for ind_cluster in range(k):
    plt.plot([centroids[0,ind_cluster]], [centroids[1,ind_cluster]], '*', markersize=26, color='w', markeredgecolor='k', markeredgewidth=3)
plt.show()
Now, instead of running $k$-means on the original data, we run it on the data projected onto the eigenvectors of the graph Laplacian $L$ with the $k$ smallest eigenvalues (the spectral embedding).
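As an illustrative sketch of the unnormalized variant (not necessarily how spectral_clustering in our algorithms module is implemented), the pipeline builds the kernel matrix and Laplacian, embeds each point as a row of the $k$ lowest-eigenvalue eigenvectors, and then clusters the embedded points with $k$-means:
def spectral_embedding_sketch(X, k, kernel):
    # Kernel (edge-weight) matrix of the data.
    n = X.shape[0]
    K = np.array([[kernel(X[i, :], X[j, :]) for j in range(n)] for i in range(n)])
    # Degree matrix and unnormalized graph Laplacian.
    D = np.diag(np.sum(K, axis=1))
    L = D - K
    # Each data point is embedded as a row of the k lowest-eigenvalue eigenvectors.
    (eigvals, eigvecs) = np.linalg.eigh(L)
    return eigvecs[:, :k]
# Example usage (commented out): embed, then run k-means on the embedded points.
#embedded = spectral_embedding_sketch(data, num_clusters, kernel)
#(emb_clusters_info, emb_assignment, emb_centroids) = kmeans(embedded, num_clusters, 20)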
from algorithms import spectral_clustering
# The number of clusters to use in k-means.
k = num_clusters
# The number of iterations.
num_iterations = 10
# Type of Laplacian to use.
L_type = 'unnormalized'
(spectral_clusters_info, data_point_assignment, centroids) = spectral_clustering(data, k, num_iterations, kernel, L_type=L_type, verbose=False)
L_type = 'symmetric'
(spectral_clusters_sym_info, data_point_assignment, centroids) = spectral_clustering(data, k, num_iterations, kernel, L_type=L_type, verbose=False)
L_type = 'randomwalk'
(spectral_clusters_rw_info, data_point_assignment, centroids) = spectral_clustering(data, k, num_iterations, kernel, L_type=L_type, verbose=False)
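The 'symmetric' and 'randomwalk' options presumably correspond to the standard normalized graph Laplacians (an assumption about our algorithms module; the definitions below are the conventional ones): $$ L_{\mathrm{sym}} = D^{-1/2} L D^{-1/2} = I - D^{-1/2} K D^{-1/2}, \qquad L_{\mathrm{rw}} = D^{-1} L = I - D^{-1} K. $$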
%matplotlib notebook
plot_clusters(spectral_clusters_info)
plt.show()
Here we test our results against $k$-means and spectral clustering methods implemented in sklearn.
from sklearn.cluster import KMeans, SpectralClustering
# k-means
km = KMeans(n_clusters=k).fit(data)
skl_kmeans_clusters_info = [[] for _ in range(k)]
for ind_point in range(num_samples):
    ind_cluster = km.labels_[ind_point]
    skl_kmeans_clusters_info[ind_cluster].append(ind_point)
# Spectral clustering on the precomputed kernel (affinity) matrix.
sc = SpectralClustering(n_clusters=k, affinity='precomputed', assign_labels='kmeans')
# (Alternative: SpectralClustering(n_clusters=k, affinity='rbf', gamma=1.0/sigma**2.0).fit(data))
sc.fit_predict(kernel_matrix)
skl_sc_clusters_info = [[] for _ in range(k)]
for ind_point in range(num_samples):
    ind_cluster = sc.labels_[ind_point]
    skl_sc_clusters_info[ind_cluster].append(ind_point)
Here we show, for reference, the original clusters alongside the $k$-means and spectral clusterings.
%matplotlib notebook
plt.figure()
plt.subplot(2,4,1)
for ind_c in range(num_clusters):
    cluster = np.array(clusters[ind_c])
    (xs, ys) = cluster.T
    plt.plot(xs, ys, 'o', color=colors[ind_c])
plt.title('Original clusters')
#plt.xlabel('$x$')
#plt.ylabel('$y$')
titles = ['$k$-means', 'spectral', 'spectral sym', 'spectral rw', 'SKL $k$-means', 'SKL spectral']
cluster_datas = [clusters_info, spectral_clusters_info, spectral_clusters_sym_info, spectral_clusters_rw_info, skl_kmeans_clusters_info, skl_sc_clusters_info]
ind_subplot = 2
for (subplot_title, cluster_data) in zip(titles, cluster_datas):
    plt.subplot(2,4,ind_subplot)
    plot_clusters(cluster_data)
    ind_subplot += 1
    plt.title(subplot_title)
plt.show()