# Digit Classification in Python — Assignment Solution

## Instructions

Objective

Write a Python program that implements digit classification using k-means clustering.

## Requirements and Specifications

Source Code

```""" K-means clustering. """ import numpy as np import pandas as pd from matplotlib import pyplot as plt def analyze_kmeans(): """ Top-level wrapper to iterate over a bunch of values of k and plot the distortions and misclassification rates. """ X = np.genfromtxt("digit.txt") y = np.genfromtxt("labels.txt", dtype=int) distortions = [] errs = [] ks = range(1, 11) for k in ks: distortion, err = analyze_one_k(X, y, k) distortions.append(distortion) errs.append(err) fig, ax = plt.subplots(2, figsize=(8, 6)) ax[0].plot(ks, distortions, marker=".") ax[0].set_ylabel("Distortion") ax[1].plot(ks, errs, marker=".") ax[1].set_xlabel("k") ax[1].set_ylabel("Mistake rate") ax[0].set_title("k-means performance") fig.savefig("kmeans.png") def analyze_one_k(X, y, k): """ Run the k-means analysis for a single value of k. Return the distortion and the mistake rate. """ print("Running k-means with k={0}".format(k)) clust = cluster(X, y, k) print("Computing classification error.") err = compute_mistake_rate(y, clust) return clust["distortion"][0], err def cluster(X, y, k, n_starts=5): """ Run k-means a total of n_starts times. Returns the results from the run that had the lowest within-group sum of squares (i.e. the lowest distortion). Inputs ------ X is an NxD1 matrix of inputs.(D1=157) y is a D2x1 vector of labels.(D2=1000) n_starts says how many times to randomly re-initialize k-means. You don't need to change this. Outputs ------- The output is a dictionary with the following fields: Mu is a kxD1 matrix of cluster centroids z is an Nx1 vector assigning points to clusters. So, for instance, if z[4] = 2, then the algorithm has assigned the 4th data point to the second cluster. distortion is the within-group sum of squares, a number. """ def loop(X, i): """ A single run of clustering. """ Mu = initialize(X, k) #Done N = X.shape[0] z = np.repeat(-1, N) # So that initially all assignments change.-1 repeats N times while True: old_z = z z = assign(X, Mu) # The vector of assignments z. 
Mu = update(X, z, k) # Update the centroids if np.all(z == old_z): distortion = compute_distortion(X, Mu, z) return dict(Mu=Mu, z=z, distortion=distortion) # Main function body print("Performing clustering.") results = [loop(X, i) for i in range(n_starts)] best = min(results, key=lambda entry: entry["distortion"]) best["digits"] = label_clusters(y, k, best["z"]) return best def assign(X, Mu): """ Assign each entry to the closest centroid. Return an Nx1 vector of assignments z. X is the NxD matrix of inputs. Mu is the kxD matrix of cluster centroids. """ z=[] for i in range(0,len(X)): dist=[] for j in range(0,len(Mu)): dist.append(np.linalg.norm(X[i]-Mu[j])) z.append(dist.index(min(dist))) return z def update(X, z, k): """ Update the cluster centroids given the new assignments. Return a kxD matrix of cluster centroids Mu. X is the NxD inputs as always. z is the Nx1 vector of cluster assignments. k is the number of clusters. """ # TODO: Compute the cluster centroids Mu. b=X.shape[1] Mu=np.zeros(shape=(k,b)) for i in range(0,k): cluster_index=[] for j in range(0,(len(z))): if z[j]==i: cluster_index.append(j) Mu[i,:]=np.mean(X[cluster_index,:],axis=0) return Mu def compute_distortion(X, Mu, z): """ Compute the distortion (i.e. within-group sum of squares) implied by NxD data X, kxD centroids Mu, and Nx1 assignments z. """ # TODO: Compute the within-group sum of squares (the distortion). k=Mu.shape[0] distortion = [] for i in range(0,k): cluster_index=[] for j in range(0,(len(z))): if z[j]==i: cluster_index.append(j) distortion1=0 for j in cluster_index: distortion1= distortion1 + (np.linalg.norm(X[j]-Mu[i])**2) distortion.append(distortion1) return distortion def initialize(X, k): """ Randomly initialize the kxD matrix of cluster centroids Mu. Do this by choosing k data points randomly from the data set X. 
""" index = np.random.choice(len(X),k) Mu=X[index,:] return Mu def label_clusters(y, k, z): """ Label each cluster with the digit that occurs most frequently for points assigned to that cluster. Return a kx1 vector labels with the label for each cluster. For instance: if 20 points assigned to cluster 0 have label "3", and 40 have label "5", then labels[0] should be 5. y is the Nx1 vector of digit labels for the data X k is the number of clusters z is the Nx1 vector of cluster assignments. """ # TODO: Compute the cluster labelings. labels=[] for i in range(0,k): print("i",i) list_1=[] cluster_index=[] for j in range(0,(len(z))): if z[j]==i: cluster_index.append(j) list_1 = y[cluster_index] label_data=pd.DataFrame(data=list_1,columns=["Y"]) labels.append(label_data['Y'].value_counts().idxmax()) return np.array(labels) def compute_mistake_rate(y, clust): """ Compute the mistake rate as discussed in section 3.4 of the homework. y is the Nx1 vector of true labels. clust is the output of a run of clustering. Two fields are relevant: "digits" is a kx1 vector giving the majority label for each cluster "z" is an Nx1 vector of final cluster assignments. """ print("start compute") clust['z']=np.array(clust['z']) def zero_one_loss(xs, ys): return sum(xs != ys) / float(len(xs)) y_hat = clust["digits"][clust["z"]] return zero_one_loss(y, y_hat) def main(): analyze_kmeans() if __name__ == '__main__': main()```