Spectral embedding and clustering

Data $s_1,\ldots,s_N$

To keep the analysis transparent, we construct a simple toy dataset: overlapping 2D Gaussian clusters that are randomly squeezed and rotated, plus a ring of points surrounding them.

In [1]:
% matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams

#rcParams['font.size'] = 30

np.random.seed(1)

num_samples = 1000

d = 2
num_clusters = 4
colors = ['r', 'g', 'b', 'k', 'y', 'c', 'm']
num_samples_per_cluster = num_samples//num_clusters

avg_intercluster_distance = 9.0

centers  = np.sqrt(avg_intercluster_distance)*np.random.randn(num_clusters, d)

def transformation(point, squeeze_factor, theta):
    t = theta
    R = np.array([[np.cos(t), -np.sin(t)], [np.sin(t), np.cos(t)]])
    squeeze = np.array([[squeeze_factor, 0.0],[0.0, 1.0]])
    return np.dot(R, np.dot(squeeze, point))

clusters = []
data = np.zeros((num_samples, d))
for ind_c in range(num_clusters-1):
    cluster = []
    center = centers[ind_c,:]
    covariance = np.eye(d)
    
    # Squeeze factor
    squeeze_factor = np.random.rand()*4
    
    # Rotation angle
    theta = np.random.rand()*2.0*np.pi
    
    for ind_sc in range(num_samples_per_cluster):
        point = np.random.multivariate_normal(center, covariance)
        
        # Apply a linear transformation: squeeze along x, then rotate by theta.
        point = transformation(point, squeeze_factor, theta)
        
        cluster.append(point)
        
        ind_sample = ind_c * num_samples_per_cluster + ind_sc
        data[ind_sample, :] = point
    clusters.append(cluster)
    
# The last cluster is a ring of points at radius avg_intercluster_distance.
thetas = np.random.rand(num_samples_per_cluster)*2.0*np.pi
xs = avg_intercluster_distance * np.cos(thetas)
ys = avg_intercluster_distance * np.sin(thetas)
cluster = [np.array([xs[ind_t], ys[ind_t]]) for ind_t in range(num_samples_per_cluster)]
clusters.append(cluster)
data[-num_samples_per_cluster:, :] = np.array(cluster)
    
# Plot the clusters with different colors.
plt.figure()
for ind_c in range(num_clusters):
    cluster = np.array(clusters[ind_c])
    (xs, ys) = cluster.T
    plt.plot(xs, ys, 'o', color=colors[ind_c])
    
plt.xlabel('$x$')
plt.ylabel('$y$')

plt.show()

Kernel $K(s_i,s_j)$

Spectral embedding and other "kernel trick" methods use a kernel $K(s_i, s_j)$ that measures similarity between data points. The choice of kernel is part of the procedure and requires some intuition/trial-and-error.

A popular choice of kernel is the Gaussian kernel $$ K(s_i, s_j) = \exp\left(- \frac{||s_i - s_j||^2}{2\sigma^2}\right) $$
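
The bandwidth $\sigma$ controls how local the notion of similarity is. One common starting point before hand-tuning is the median of the pairwise distances, sketched below (this uses scipy, which is not otherwise required by this notebook); the hand-picked $\sigma = 0.4$ used in the next cell is deliberately much smaller, so that only nearby points count as similar.

from scipy.spatial.distance import pdist

# Median pairwise distance: a common rule-of-thumb starting point for sigma.
sigma_guess = np.median(pdist(data))
print('Median-heuristic sigma = {}'.format(sigma_guess))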

In [2]:
% matplotlib notebook

import numpy.linalg as nla

# Gaussian kernel with variance sigma^2.
sigma = 0.4
def kernel(s_i, s_j):
    return np.exp(-nla.norm(s_i - s_j)**2.0 / (2.0*sigma**2.0))

# The kernel matrix of the data.
kernel_matrix = np.zeros((num_samples, num_samples))
for i in range(num_samples):
    for j in range(i, num_samples):
        kernel_matrix[i,j] = kernel(data[i,:], data[j,:])
        kernel_matrix[j,i] = kernel_matrix[i,j]
        
plt.matshow(kernel_matrix)
Out[2]:
<matplotlib.image.AxesImage at 0x7f3d0446e490>

Graph Laplacian $L$

A key step of spectral embedding is the construction of the graph Laplacian, $L = D - K$, where $D_{ij} = \delta_{ij} \sum_{l=1}^{N} K(s_i, s_l)$ is the degree matrix and $K_{ij} = K(s_i, s_j)$ is the edge-weight matrix (the kernel matrix). The graph Laplacian can be interpreted as an effective quadratic Hamiltonian for a network of masses coupled by harmonic springs, with spring constants given by the kernel weights.
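
The spring analogy comes from the quadratic form of $L$: for any vector $x \in \mathbb{R}^N$ assigning a value $x_i$ to data point $s_i$, $$ x^T L x = \frac{1}{2} \sum_{i,j} K(s_i, s_j) (x_i - x_j)^2, $$ so pairs of points with a large kernel value pay a large energy penalty for taking different values of $x$, just as stiff springs resist being stretched.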

In [3]:
% matplotlib notebook

# The degree of each node is the sum of its edge weights (kernel values).
degrees = np.sum(kernel_matrix, axis=0)
D = np.diag(degrees)
K = kernel_matrix

L = D - K

plt.matshow(L)
Out[3]:
<matplotlib.image.AxesImage at 0x7f3d04057890>

Spectrum of $L$

Compute the spectrum of $L$. We ignore the exactly-zero eigenvalue, since its eigenvector is always proportional to $(1,\ldots,1)$ and therefore carries no information about the cluster structure.

In [4]:
% matplotlib notebook

import numpy.linalg as nla

(eigvals, eigvecs) = nla.eigh(L)

print('Smallest eigenvalues = {}'.format(eigvals[0:2*num_clusters]))

plt.plot(eigvals[1:], 'ro')
plt.xlabel('Eigenvalue index')
plt.ylabel('Eigenvalue')

plt.show()
Smallest eigenvalues = [ -3.15161903e-15   1.36346699e-08   6.10323236e-07   3.99972754e-04
   1.77923482e-03   2.29958753e-03   4.23597842e-03   5.73741030e-03]
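
The number of eigenvalues that are numerically close to zero reflects the number of nearly disconnected components of the similarity graph, so the gaps between the smallest eigenvalues give a rough "eigengap" estimate of the number of clusters. A minimal sketch, reusing the sorted eigvals from above (because the clusters here overlap, the gaps are not clear-cut and the heuristic is only a rough guide):

# Eigengap heuristic: a large jump after the m-th smallest eigenvalue
# suggests roughly m well-separated clusters.
gaps = np.diff(eigvals[:2*num_clusters])
print('Gaps between the smallest eigenvalues = {}'.format(gaps))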

$k$-means clustering

The $k$-means clustering algorithm heuristically groups the data into a fixed number, $k$, of clusters by greedily minimizing the within-cluster distances to the cluster centroids.
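
The kmeans routine imported below lives in the accompanying algorithms module, which is not listed in this notebook. For orientation, a minimal Lloyd-style sketch of the algorithm might look like this (an illustrative sketch only, not the algorithms.kmeans implementation, whose return values (clusters_info, data_point_assignment, centroids) differ):

def kmeans_sketch(points, k, num_iterations):
    # Initialize the centroids with k randomly chosen data points.
    centroids = points[np.random.choice(len(points), k, replace=False)]
    for _ in range(num_iterations):
        # Assignment step: attach every point to its nearest centroid.
        dists = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        labels = np.argmin(dists, axis=1)
        # Update step: move each centroid to the mean of its assigned points.
        for ind in range(k):
            if np.any(labels == ind):
                centroids[ind] = points[labels == ind].mean(axis=0)
    return labels, centroids

Because each point is assigned to its nearest centroid, the resulting clusters are always convex (Voronoi) regions, which is why plain $k$-means struggles with the ring-shaped cluster in the original coordinates.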

First, let's test how well $k$-means can cluster the original data:

In [6]:
from algorithms import kmeans

# The number of clusters to use in k-means.
k = num_clusters
# The number of iterations.
num_iterations = 20

(clusters_info, data_point_assignment, centroids) = kmeans(data, k, num_iterations, verbose=False)
In [7]:
% matplotlib notebook

def plot_clusters(clusters_info):
    ind_plot = 0
    for cluster_inds in clusters_info:
        cluster_data = data[cluster_inds, :]

        (xs, ys) = cluster_data.T

        plt.plot(xs, ys, 'o', color=colors[ind_plot])

        ind_plot += 1

    #plt.xlabel('$x$')
    #plt.ylabel('$y$')
    
plot_clusters(clusters_info)
for ind_cluster in range(k):
    plt.plot([centroids[0,ind_cluster]], [centroids[1,ind_cluster]], '*', markersize=26, color='w', markeredgecolor='k', markeredgewidth=3)

plt.show()

Spectral embedding

Now, instead of applying $k$-means to the original coordinates, we apply it to the spectral embedding of the data: each point $s_i$ is represented by the $i$-th components of the eigenvectors of the graph Laplacian $L$ with the $k$ smallest eigenvalues.
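
In code, the key step is just a projection onto those eigenvectors followed by ordinary $k$-means. A minimal sketch, reusing eigvecs from above and the kmeans_sketch helper from the previous section (this is not the algorithms.spectral_clustering implementation used below, which also supports the normalized Laplacians $L_{\mathrm{sym}} = D^{-1/2} L D^{-1/2}$ and $L_{\mathrm{rw}} = D^{-1} L$, selected via the L_type argument):

# Embed each data point as a row of the num_clusters lowest-eigenvalue eigenvectors of L.
embedding = eigvecs[:, :num_clusters]

# Cluster in the embedded space instead of the original 2D coordinates.
embedded_labels, embedded_centroids = kmeans_sketch(embedding, num_clusters, num_iterations=20)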

In [8]:
from algorithms import spectral_clustering

# The number of clusters to use in k-means.
k = num_clusters
# The number of iterations.
num_iterations = 10
# Type of Laplacian to use.
L_type = 'unnormalized'
(spectral_clusters_info, data_point_assignment, centroids) = spectral_clustering(data, k, num_iterations, kernel, L_type=L_type, verbose=False)

L_type = 'symmetric'
(spectral_clusters_sym_info, data_point_assignment, centroids) = spectral_clustering(data, k, num_iterations, kernel, L_type=L_type, verbose=False)

L_type = 'randomwalk'
(spectral_clusters_rw_info, data_point_assignment, centroids) = spectral_clustering(data, k, num_iterations, kernel, L_type=L_type, verbose=False)
/usr/local/lib/python2.7/dist-packages/numpy/core/fromnumeric.py:2909: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/usr/local/lib/python2.7/dist-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
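
(The RuntimeWarnings above most likely indicate that, during some $k$-means iteration, a cluster temporarily ended up with no assigned points, so its centroid update averaged an empty slice.)
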
In [9]:
% matplotlib notebook

plot_clusters(spectral_clusters_info)    
plt.show()

Scikit-learn implementations

Here we compare our results against the $k$-means and spectral clustering implementations in scikit-learn (sklearn).
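
Note that scikit-learn parameterizes its 'rbf' affinity as $\exp(-\gamma ||s_i - s_j||^2)$, so our Gaussian kernel corresponds to $\gamma = 1/(2\sigma^2)$. Below we sidestep the conversion by passing the precomputed kernel matrix with affinity='precomputed'.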

In [10]:
from sklearn.cluster import KMeans, SpectralClustering

# k-means
km = KMeans(n_clusters=k).fit(data)

skl_kmeans_clusters_info = []
for ind_cluster in range(k):
    skl_kmeans_clusters_info.append([])

for ind_point in range(num_samples):
    ind_cluster = km.labels_[ind_point]
    skl_kmeans_clusters_info[ind_cluster].append(ind_point)

# spectral clustering
# Alternatively: affinity='rbf' with gamma=1.0/(2.0*sigma**2.0), fit on the raw data.
sc = SpectralClustering(n_clusters=k, affinity='precomputed', assign_labels='kmeans')
sc.fit_predict(kernel_matrix)

skl_sc_clusters_info = []
for ind_cluster in range(k):
    skl_sc_clusters_info.append([])

for ind_point in range(num_samples):
    ind_cluster = sc.labels_[ind_point]
    skl_sc_clusters_info[ind_cluster].append(ind_point)
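
Since the data array was filled one cluster at a time, we know the ground-truth labels and can also quantify the agreement numerically, for example with the adjusted Rand index from scikit-learn (a short sketch; a score of 1.0 means perfect agreement up to a relabeling of the clusters):

from sklearn.metrics import adjusted_rand_score

# Ground-truth labels: points 0..249 belong to cluster 0, 250..499 to cluster 1, etc.
true_labels = np.repeat(np.arange(num_clusters), num_samples_per_cluster)

print('ARI, sklearn k-means  = {}'.format(adjusted_rand_score(true_labels, km.labels_)))
print('ARI, sklearn spectral = {}'.format(adjusted_rand_score(true_labels, sc.labels_)))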

Side-by-side comparisons

Here we show, for reference, the original clusters alongside the clusterings produced by $k$-means, the three spectral clustering variants, and the scikit-learn implementations.

In [11]:
% matplotlib notebook

plt.figure()
plt.subplot(2,4,1)
for ind_c in range(num_clusters):
    cluster = np.array(clusters[ind_c])
    (xs, ys) = cluster.T
    plt.plot(xs, ys, 'o', color=colors[ind_c])
    
plt.title('Original clusters')
#plt.xlabel('$x$')
#plt.ylabel('$y$')

titles = ['$k$-means', 'spectral', 'spectral sym', 'spectral rw', 'SKL $k$-means', 'SKL spectral']
cluster_datas = [clusters_info, spectral_clusters_info, spectral_clusters_sym_info, spectral_clusters_rw_info, skl_kmeans_clusters_info, skl_sc_clusters_info]

ind_subplot = 2
for (subplot_title, cluster_data) in zip(titles, cluster_datas):
    plt.subplot(2,4,ind_subplot)
    
    plot_clusters(cluster_data)
    
    ind_subplot += 1
    
    plt.title(subplot_title)
    
plt.show()