Program to Implement Digit Classification in Python

Instructions

Objective
Write a python assignment program to implement digit classification.
Requirements and Specifications

program to implement digit classification in python
program to implement digit classification in python 1
Source Code
"""
K-means clustering.
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
def analyze_kmeans():
 """
 Top-level wrapper to iterate over a bunch of values of k and plot the
 distortions and misclassification rates.
 """
 X = np.genfromtxt("digit.txt")
 y = np.genfromtxt("labels.txt", dtype=int)
 distortions = []
 errs = []
 ks = range(1, 11)
 for k in ks:
 distortion, err = analyze_one_k(X, y, k)
 distortions.append(distortion)
 errs.append(err)
 fig, ax = plt.subplots(2, figsize=(8, 6))
 ax[0].plot(ks, distortions, marker=".")
 ax[0].set_ylabel("Distortion")
 ax[1].plot(ks, errs, marker=".")
 ax[1].set_xlabel("k")
 ax[1].set_ylabel("Mistake rate")
 ax[0].set_title("k-means performance")
 fig.savefig("kmeans.png")
def analyze_one_k(X, y, k):
 """
 Run the k-means analysis for a single value of k. Return the distortion and
 the mistake rate.
 """
 print("Running k-means with k={0}".format(k))
 clust = cluster(X, y, k)
 print("Computing classification error.")
 err = compute_mistake_rate(y, clust)
 return clust["distortion"][0], err
 def cluster(X, y, k, n_starts=5):
 """
 Run k-means a total of n_starts times. Returns the results from the run that
 had the lowest within-group sum of squares (i.e. the lowest distortion).
 Inputs
 ------
 X is an NxD1 matrix of inputs.(D1=157)
 y is a D2x1 vector of labels.(D2=1000)
 n_starts says how many times to randomly re-initialize k-means. You don't
 need to change this.
 Outputs
 -------
 The output is a dictionary with the following fields:
 Mu is a kxD1 matrix of cluster centroids
 z is an Nx1 vector assigning points to clusters. So, for instance, if z[4] =
 2, then the algorithm has assigned the 4th data point to the second
 cluster.
 distortion is the within-group sum of squares, a number.
 """
 def loop(X, i):
 """
 A single run of clustering.
 """
 Mu = initialize(X, k) #Done
 N = X.shape[0]
 z = np.repeat(-1, N) # So that initially all assignments change.-1 repeats N times
 while True:
 old_z = z
 z = assign(X, Mu) # The vector of assignments z.
 Mu = update(X, z, k) # Update the centroids
 if np.all(z == old_z):
 distortion = compute_distortion(X, Mu, z)
 return dict(Mu=Mu, z=z, distortion=distortion)
 # Main function body
 print("Performing clustering.")
 results = [loop(X, i) for i in range(n_starts)]
 best = min(results, key=lambda entry: entry["distortion"])
 best["digits"] = label_clusters(y, k, best["z"])
 return best
 def assign(X, Mu):
 """
 Assign each entry to the closest centroid. Return an Nx1 vector of
 assignments z.
 X is the NxD matrix of inputs.
 Mu is the kxD matrix of cluster centroids.
 """
 z=[]
 for i in range(0,len(X)):
 dist=[]
 for j in range(0,len(Mu)):
 dist.append(np.linalg.norm(X[i]-Mu[j]))
 z.append(dist.index(min(dist)))
 return z
 def update(X, z, k):
 """
 Update the cluster centroids given the new assignments. Return a kxD matrix
 of cluster centroids Mu.
 X is the NxD inputs as always.
 z is the Nx1 vector of cluster assignments.
 k is the number of clusters.
 """
 # TODO: Compute the cluster centroids Mu.
 b=X.shape[1]
 Mu=np.zeros(shape=(k,b))
 for i in range(0,k):
 cluster_index=[]
 for j in range(0,(len(z))):
 if z[j]==i:
 cluster_index.append(j)
 Mu[i,:]=np.mean(X[cluster_index,:],axis=0)
 return Mu
 def compute_distortion(X, Mu, z):
 """
 Compute the distortion (i.e. within-group sum of squares) implied by NxD
 data X, kxD centroids Mu, and Nx1 assignments z.
 """
 # TODO: Compute the within-group sum of squares (the distortion).
 k=Mu.shape[0]
 distortion = []
 for i in range(0,k):
 cluster_index=[]
 for j in range(0,(len(z))):
 if z[j]==i:
 cluster_index.append(j)
 distortion1=0
 for j in cluster_index:
 distortion1= distortion1 + (np.linalg.norm(X[j]-Mu[i])**2)
 distortion.append(distortion1)
 return distortion
 def initialize(X, k):
 """
 Randomly initialize the kxD matrix of cluster centroids Mu. Do this by
 choosing k data points randomly from the data set X.
 """
 index = np.random.choice(len(X),k)
 Mu=X[index,:]
 return Mu
 def label_clusters(y, k, z):
 """
 Label each cluster with the digit that occurs most frequently for points
 assigned to that cluster.
 Return a kx1 vector labels with the label for each cluster.
 For instance: if 20 points assigned to cluster 0 have label "3", and 40 have
 label "5", then labels[0] should be 5.
 y is the Nx1 vector of digit labels for the data X
 k is the number of clusters
 z is the Nx1 vector of cluster assignments.
 """
 # TODO: Compute the cluster labelings.
 labels=[]
 for i in range(0,k):
 print("i",i)
 list_1=[]
 cluster_index=[]
 for j in range(0,(len(z))):
 if z[j]==i:
 cluster_index.append(j)
 list_1 = y[cluster_index]
 label_data=pd.DataFrame(data=list_1,columns=["Y"])
 labels.append(label_data['Y'].value_counts().idxmax())
 return np.array(labels)
 def compute_mistake_rate(y, clust):
 """
 Compute the mistake rate as discussed in section 3.4 of the homework.
 y is the Nx1 vector of true labels.
 clust is the output of a run of clustering. Two fields are relevant:
 "digits" is a kx1 vector giving the majority label for each cluster
 "z" is an Nx1 vector of final cluster assignments.
 """
 print("start compute")
 clust['z']=np.array(clust['z'])
 def zero_one_loss(xs, ys):
 return sum(xs != ys) / float(len(xs))
 y_hat = clust["digits"][clust["z"]]
 return zero_one_loss(y, y_hat)
 def main():
 analyze_kmeans()
 if __name__ == '__main__':
 main()
Create a Program to Implement Digit Classification in Python Assignment Solution.

Instructions

Requirements and Specifications