Instructions
Requirements and Specifications
Source Code
"""
K-means clustering.
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
def analyze_kmeans():
"""
Top-level wrapper to iterate over a bunch of values of k and plot the
distortions and misclassification rates.
"""
X = np.genfromtxt("digit.txt")
y = np.genfromtxt("labels.txt", dtype=int)
distortions = []
errs = []
ks = range(1, 11)
for k in ks:
distortion, err = analyze_one_k(X, y, k)
distortions.append(distortion)
errs.append(err)
fig, ax = plt.subplots(2, figsize=(8, 6))
ax[0].plot(ks, distortions, marker=".")
ax[0].set_ylabel("Distortion")
ax[1].plot(ks, errs, marker=".")
ax[1].set_xlabel("k")
ax[1].set_ylabel("Mistake rate")
ax[0].set_title("k-means performance")
fig.savefig("kmeans.png")
def analyze_one_k(X, y, k):
"""
Run the k-means analysis for a single value of k. Return the distortion and
the mistake rate.
"""
print("Running k-means with k={0}".format(k))
clust = cluster(X, y, k)
print("Computing classification error.")
err = compute_mistake_rate(y, clust)
return clust["distortion"][0], err
def cluster(X, y, k, n_starts=5):
"""
Run k-means a total of n_starts times. Returns the results from the run that
had the lowest within-group sum of squares (i.e. the lowest distortion).
Inputs
------
X is an NxD1 matrix of inputs.(D1=157)
y is a D2x1 vector of labels.(D2=1000)
n_starts says how many times to randomly re-initialize k-means. You don't
need to change this.
Outputs
-------
The output is a dictionary with the following fields:
Mu is a kxD1 matrix of cluster centroids
z is an Nx1 vector assigning points to clusters. So, for instance, if z[4] =
2, then the algorithm has assigned the 4th data point to the second
cluster.
distortion is the within-group sum of squares, a number.
"""
def loop(X, i):
"""
A single run of clustering.
"""
Mu = initialize(X, k) #Done
N = X.shape[0]
z = np.repeat(-1, N) # So that initially all assignments change.-1 repeats N times
while True:
old_z = z
z = assign(X, Mu) # The vector of assignments z.
Mu = update(X, z, k) # Update the centroids
if np.all(z == old_z):
distortion = compute_distortion(X, Mu, z)
return dict(Mu=Mu, z=z, distortion=distortion)
# Main function body
print("Performing clustering.")
results = [loop(X, i) for i in range(n_starts)]
best = min(results, key=lambda entry: entry["distortion"])
best["digits"] = label_clusters(y, k, best["z"])
return best
def assign(X, Mu):
"""
Assign each entry to the closest centroid. Return an Nx1 vector of
assignments z.
X is the NxD matrix of inputs.
Mu is the kxD matrix of cluster centroids.
"""
z=[]
for i in range(0,len(X)):
dist=[]
for j in range(0,len(Mu)):
dist.append(np.linalg.norm(X[i]-Mu[j]))
z.append(dist.index(min(dist)))
return z
def update(X, z, k):
"""
Update the cluster centroids given the new assignments. Return a kxD matrix
of cluster centroids Mu.
X is the NxD inputs as always.
z is the Nx1 vector of cluster assignments.
k is the number of clusters.
"""
# TODO: Compute the cluster centroids Mu.
b=X.shape[1]
Mu=np.zeros(shape=(k,b))
for i in range(0,k):
cluster_index=[]
for j in range(0,(len(z))):
if z[j]==i:
cluster_index.append(j)
Mu[i,:]=np.mean(X[cluster_index,:],axis=0)
return Mu
def compute_distortion(X, Mu, z):
"""
Compute the distortion (i.e. within-group sum of squares) implied by NxD
data X, kxD centroids Mu, and Nx1 assignments z.
"""
# TODO: Compute the within-group sum of squares (the distortion).
k=Mu.shape[0]
distortion = []
for i in range(0,k):
cluster_index=[]
for j in range(0,(len(z))):
if z[j]==i:
cluster_index.append(j)
distortion1=0
for j in cluster_index:
distortion1= distortion1 + (np.linalg.norm(X[j]-Mu[i])**2)
distortion.append(distortion1)
return distortion
def initialize(X, k):
"""
Randomly initialize the kxD matrix of cluster centroids Mu. Do this by
choosing k data points randomly from the data set X.
"""
index = np.random.choice(len(X),k)
Mu=X[index,:]
return Mu
def label_clusters(y, k, z):
"""
Label each cluster with the digit that occurs most frequently for points
assigned to that cluster.
Return a kx1 vector labels with the label for each cluster.
For instance: if 20 points assigned to cluster 0 have label "3", and 40 have
label "5", then labels[0] should be 5.
y is the Nx1 vector of digit labels for the data X
k is the number of clusters
z is the Nx1 vector of cluster assignments.
"""
# TODO: Compute the cluster labelings.
labels=[]
for i in range(0,k):
print("i",i)
list_1=[]
cluster_index=[]
for j in range(0,(len(z))):
if z[j]==i:
cluster_index.append(j)
list_1 = y[cluster_index]
label_data=pd.DataFrame(data=list_1,columns=["Y"])
labels.append(label_data['Y'].value_counts().idxmax())
return np.array(labels)
def compute_mistake_rate(y, clust):
"""
Compute the mistake rate as discussed in section 3.4 of the homework.
y is the Nx1 vector of true labels.
clust is the output of a run of clustering. Two fields are relevant:
"digits" is a kx1 vector giving the majority label for each cluster
"z" is an Nx1 vector of final cluster assignments.
"""
print("start compute")
clust['z']=np.array(clust['z'])
def zero_one_loss(xs, ys):
return sum(xs != ys) / float(len(xs))
y_hat = clust["digits"][clust["z"]]
return zero_one_loss(y, y_hat)
def main():
analyze_kmeans()
if __name__ == '__main__':
main()