Instructions
Requirements and Specifications
Source Code
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
"""# Read original data with all coefficients
We will read the original .csv file and then extract the desired column
"""
# Load the raw table and discard every row that has a missing value in any
# column (DataFrame.dropna() with its default arguments).
data = pd.read_csv('original_data.csv').dropna()
# Preview of the first rows; the returned frame is not assigned, so this has
# no effect when the file runs as a plain script (presumably a notebook
# leftover).
data.head()
"""# Get data for column 'Degree=4 Coefficients'
The column contains the points in a string '[ .. ]', so we will have to parse that string to remove the brackets and extract the float values
"""
def _parse_coefficients(raw):
    """Parse a stringified coefficient vector into a list of floats.

    Accepts the bracketed, whitespace-separated form stored in the CSV
    (e.g. ``'[ 1.5 -2.0 3e-4 ]'``) and also tolerates surrounding
    whitespace and comma separators (e.g. ``' [1.5, -2.0, 3e-4] '``).

    Raises:
        ValueError: if any token is not a valid float literal.
    """
    # Strip outer whitespace first so the brackets really are the outermost
    # characters, then treat commas like whitespace before tokenizing.
    cleaned = raw.strip().strip('[]').replace(',', ' ')
    return [float(token) for token in cleaned.split()]


# Number of values expected per sample; this is the column count of X.
N_COEFFICIENTS = 5

# Extract the stringified coefficient vectors and convert to a NumPy array.
pointsraw = data['Degree=4 Coefficients'].to_numpy()

# Feature matrix: one row per sample, one column per coefficient.
X = np.zeros((pointsraw.shape[0], N_COEFFICIENTS))

for i, points_str in enumerate(pointsraw):
    points_i = _parse_coefficients(points_str)
    # Check the length explicitly: NumPy would silently broadcast a
    # single-value row across all columns instead of failing.
    if len(points_i) != N_COEFFICIENTS:
        raise ValueError(
            f"Row {i}: expected {N_COEFFICIENTS} coefficients, "
            f"got {len(points_i)} from {points_str!r}"
        )
    X[i, :] = points_i
"""# Hierarchical Clustering"""
# Agglomerative clustering of the rows of X using Ward's method on Euclidean
# distances; Z is the linkage matrix that dendrogram() consumes below.
Z = linkage(
    X,
    method='ward',
    metric='euclidean',
)
"""# Dendrogram"""
# Font size shared by the title and both axis labels.
label_style = {'fontsize': 25}

# Open a large figure; dendrogram() draws into the current axes.
plt.figure(figsize=(25, 15))

# Draw the dendrogram from the linkage matrix, with the x-axis leaf labels
# rotated to vertical and set in a small font.
dendrogram(Z, leaf_rotation=90.0, leaf_font_size=8.0)

# Annotate the plot and display it.
plt.title('Hierarchical Clustering Dendrogram', **label_style)
plt.xlabel('Index', **label_style)
plt.ylabel('Euclidean Distance', **label_style)
plt.show()