# Create a Program to Implement Neighbour Joining Algorithm in Python Assignment Solution.

## Instructions

Objective
Write a python assignment to implement the neighbor-joining algorithm in Python. This algorithm constructs phylogenetic trees, crucial in evolutionary biology. Develop a program that takes a distance matrix, identifies closest neighbors iteratively, and builds the tree. This assignment hones Python skills while solving a practical biological problem.

## Requirements and Specifications

Source Code

```# -*- coding: utf-8 -*- """ @author: """ # do not change or add any import statements from os.path import basename from glob import glob import numpy as np import matplotlib.pyplot as plt from scipy.cluster.hierarchy import dendrogram # %% def read(filename, n=3): """ Please fill in this docstring template Parameters ---------- filename : str String containing the file name n : int, optional number of the n-grams. The default is 3. Returns ------- ngrams : list of tuples List of tuples where each tuple contain n words """ # Open file ngrams = list() with open(filename, 'r') as file: # Read content content = file.read() # Split into words words = content.split(" ") # Now, construct the ngrams for i in range(len(words)-n+1): sublst = list() for j in range(i, i+n): w = words[j].strip() sublst.append(w) ngrams.append(tuple(sublst)) return ngrams # %% def vocabulary(ngrams_list): """ Please fill in this docstring template Parameters ---------- ngrams_list : list of tuples list of tuples containing the ngrams obtained from the function read(filename, n=3) Returns ------- ret: list of tuples Returns a list of the unique tuples in ngrams """ ret = set() for ngram in ngrams_list: for tup in ngram: ret.add(tup) return list(ret) # %% def vectorize(ngrams, vocabulary): """ Please fill in this docstring template Parameters ---------- ngrams : list of tuples list of n-grams (obtained from 'read') vocabulary : list of tuples list of n-grams obtained from 'vocabulary' Returns ------- features : NumPy array NumPy array with the same length of vocabulary, containing 1 and zeros """ # Get the length of vocabulary n = len(vocabulary) # Create the array filled with zeros features = np.zeros(n) # Iterate through the tuples in vocabulary i = 0 for tup in vocabulary: if tup in ngrams: features[i] = 1 i += 1 return features # %% def tfidf(vectors): """ Please fill in this docstring template Parameters ---------- vectors : Numpy Array Numpy Array with 1 and 0 Returns ------- result : TYPE DESCRIPTION. """ N = len(vectors) n = len(vectors[0]) tfidf_ret = np.zeros((N, n)) result = np.zeros((N, n)) for i in range(len(vectors)): vector = vectors[i] for j in range(len(vector)): tf = vector[j] nt = list(vector[:j+1]).count(tf) idf = np.log(N/(1 + nt))+1 tfidf = tf*idf tfidf_ret[i,j] = tfidf return tfidf_ret # %% def dissimilarity(v1, v2): """ Please fill in this docstring template Parameters ---------- v1 : TYPE DESCRIPTION. v2 : TYPE DESCRIPTION. Returns ------- TYPE DESCRIPTION. """ sumv1_2 = np.sum(np.power(v1, 2)) sumv2_2 = np.sum(np.power(v2,2)) sum = 0 for i in range(len(v1)): sum += v1[i]*v2[i] d = 1 - sum/(sumv1_2*sumv2_2)**(1/2) return d # %% def distance_matrix(vectors): """ Please fill in this docstring template Parameters ---------- vectors : TYPE DESCRIPTION. Returns ------- distances : TYPE DESCRIPTION. """ n = len(vectors) d = np.zeros((n,n)) for i in range(n): for j in range(n): v1 = vectors[i] v2 = vectors[j] d[i,j] = dissimilarity(v1,v2) return d # %% def nearest_pair(distances, mask): """ Please fill in this docstring template Parameters ---------- distances : 2-D Numpy Array Numpy Array containing the dissimilarity values mask : List of boolean List of boolean values Returns ------- pair : List List with two values containing the index of the smallest values in distances such that maks[i] = mask[j] = False """ n = distances.shape[0] smallest = np.inf ret = [] for i in range(n): for j in range(n): if i != j and distances[i,j] < smallest and mask[i] == False and mask[j] == False: smallest = distances[i,j] ret = [i,j] return ret # %% def merge_pair(vectors, linkage, mask, pair): """ Please fill in this docstring template Parameters ---------- vectors : TYPE DESCRIPTION. linkage : TYPE DESCRIPTION. mask : TYPE DESCRIPTION. pair : TYPE DESCRIPTION. Returns ------- None. """ # First, select the vectors defined in the pair of index v1 = vectors[pair[0]] v2 = vectors[pair[1]] # Merge vectors v3 = (v1+v2)/2 np.append(vectors, v3) # Calculate the distance between the merged vectors, and the average between each merged vector to the new one dv1v2 = dissimilarity(v1,v2)/2.0 davg = (dissimilarity(v1, v3) + dissimilarity(v2, v3))/2.0 dmax = dv1v2 if davg > dmax: dmax = davg # Create new linkage row linkage_row = [pair[0], pair[1], dmax, len(pair)] linkage.append(linkage_row) mask[pair[0]] = True mask[pair[1]] = True mask.append(False) # %% def update_distance_matrix(distances, vectors): """ Please fill in this docstring template Parameters ---------- distances : TYPE DESCRIPTION. vectors : TYPE DESCRIPTION. Returns ------- new_distances : TYPE DESCRIPTION. """ # Pick the last vector v = vectors[-1] # Create a new distance matrix, with one more row and column new_dist = np.zeros((len(vectors), len(vectors))) # Copy the first N-1 elements into the new matrix # Fill the elements for i in range(len(vectors)-1): for j in range(len(vectors)-1): new_dist[i,j] = distances[i,j] # Now, compute the distances to the new vector for i in range(len(vectors)): v_ = vectors[i] new_dist[len(vectors)-1, i] = dissimilarity(v, v_) new_dist[i, len(vectors)-1] = dissimilarity(v, v_) return new_dist # %% def cluster(vectors): """ Please fill in this docstring template Parameters ---------- vectors : TYPE DESCRIPTION. Returns ------- TYPE DESCRIPTION. """ # First, compute the distance matrix distances = distance_matrix(vectors) linkage = [] mask = [False for i in range(len(vectors))] # Repeat until no pair of vectors needs to be merged while True: pair = nearest_pair(distances, mask) if not pair: break merge_pair(vectors, linkage, mask, pair) new_distances = update_distance_matrix(distances, vectors) distances = new_distances.copy() return linkage # %% # Do not delete or change code beneath this line. def main(): """ Run a clustering analysis on the assignments in the "assignments" directory, which should be in your current working directory. Displays the resulting heirarchy as a dendrogram. Raises ------ RuntimeError Raised if it can't find the assignments. Returns ------- fig : a matplotlib figure A dendrogram of the heirarchy. """ # read in the assignments data = [] labels = [] files = list(glob('assignments/*')) if not files: raise RuntimeError("I can't find the assignments. " "Are they in your current working directory?") # Uncomment the following line to test on a subset of assignments # files = np.random.choice(glob('assignments/*'), size=50, replace=False) for filename in files: print(f"Reading and analyzing {filename}... ") data.append(read(filename)) labels.append(basename(filename)[:-4]) # extract the vocab and convert assignments to vectors print("Calculating vocabulary...") vocab = vocabulary(data) print("Vectorizing...") vectors = [vectorize(d, vocab) for d in data] # transform the features to make rare features more important print("Calculating tfidf...") vectors = tfidf(vectors) # cluster the assignments print("Calculating linkage...") linkage = cluster(vectors) # plot the result print("Plotting...") fig = plt.figure(figsize=(10, 10)) dendrogram(linkage, labels=labels, orientation='left') return fig main()```