Program to Implement Neighbour Joining Algorithm in Python

Instructions

Objective
Write a python assignment to implement the neighbor-joining algorithm in Python. This algorithm constructs phylogenetic trees, crucial in evolutionary biology. Develop a program that takes a distance matrix, identifies closest neighbors iteratively, and builds the tree. This assignment hones Python skills while solving a practical biological problem.
Requirements and Specifications

program to implement neighbour joining algorithm in python
Source Code
# -*- coding: utf-8 -*-
"""
@author: 
"""
# do not change or add any import statements
from os.path import basename
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
# %%
def read(filename, n=3):
 """
 Please fill in this docstring template
 Parameters
 ----------
 filename : str
 String containing the file name
 n : int, optional
 number of the n-grams. The default is 3.
 Returns
 -------
 ngrams : list of tuples
 List of tuples where each tuple contain n words
 """
 # Open file
 ngrams = list()
 with open(filename, 'r') as file:
 # Read content
 content = file.read()
 # Split into words
 words = content.split(" ")
 # Now, construct the ngrams
 for i in range(len(words)-n+1):
 sublst = list()
 for j in range(i, i+n):
 w = words[j].strip()
 sublst.append(w)
 ngrams.append(tuple(sublst))
 return ngrams
# %%
def vocabulary(ngrams_list):
 """
 Please fill in this docstring template
 Parameters
 ----------
 ngrams_list : list of tuples
 list of tuples containing the ngrams obtained from the function read(filename, n=3)
 Returns
 -------
 ret: list of tuples
 Returns a list of the unique tuples in ngrams
 """
 ret = set()
 for ngram in ngrams_list:
 for tup in ngram:
 ret.add(tup)
 return list(ret)
# %%
def vectorize(ngrams, vocabulary):
 """
 Please fill in this docstring template
 Parameters
 ----------
 ngrams : list of tuples
 list of n-grams (obtained from 'read')
 vocabulary : list of tuples
 list of n-grams obtained from 'vocabulary'
 Returns
 -------
 features : NumPy array
 NumPy array with the same length of vocabulary, containing 1 and zeros
 """
 # Get the length of vocabulary
 n = len(vocabulary)
 # Create the array filled with zeros
 features = np.zeros(n)
 # Iterate through the tuples in vocabulary
 i = 0
 for tup in vocabulary:
 if tup in ngrams:
 features[i] = 1
 i += 1
 return features
# %%
def tfidf(vectors):
 """
 Please fill in this docstring template
 Parameters
 ----------
 vectors : Numpy Array
 Numpy Array with 1 and 0
 Returns
 -------
 result : TYPE
 DESCRIPTION.
 """
 N = len(vectors)
 n = len(vectors[0])
 tfidf_ret = np.zeros((N, n))
 result = np.zeros((N, n))
 for i in range(len(vectors)):
 vector = vectors[i]
 for j in range(len(vector)):
 tf = vector[j]
 nt = list(vector[:j+1]).count(tf)
 idf = np.log(N/(1 + nt))+1
 tfidf = tf*idf
 tfidf_ret[i,j] = tfidf
 return tfidf_ret
# %%
def dissimilarity(v1, v2):
 """
 Please fill in this docstring template
 Parameters
 ----------
 v1 : TYPE
 DESCRIPTION.
 v2 : TYPE
 DESCRIPTION.
 Returns
 -------
 TYPE
 DESCRIPTION.
 """
 sumv1_2 = np.sum(np.power(v1, 2))
 sumv2_2 = np.sum(np.power(v2,2))
 sum = 0
 for i in range(len(v1)):
 sum += v1[i]*v2[i]
 d = 1 - sum/(sumv1_2*sumv2_2)**(1/2)
 return d
# %%
def distance_matrix(vectors):
 """
 Please fill in this docstring template
 Parameters
 ----------
 vectors : TYPE
 DESCRIPTION.
 Returns
 -------
 distances : TYPE
 DESCRIPTION.
 """
 n = len(vectors)
 d = np.zeros((n,n))
 for i in range(n):
 for j in range(n):
 v1 = vectors[i]
 v2 = vectors[j]
 d[i,j] = dissimilarity(v1,v2)
 return d
# %%
def nearest_pair(distances, mask):
 """
 Please fill in this docstring template
 Parameters
 ----------
 distances : 2-D Numpy Array
 Numpy Array containing the dissimilarity values
 mask : List of boolean
 List of boolean values
 Returns
 -------
 pair : List
 List with two values containing the index of the smallest values in distances such that maks[i] = mask[j] = False
 """
 n = distances.shape[0]
 smallest = np.inf
 ret = []
 for i in range(n):
 for j in range(n):
 if i != j and distances[i,j] < smallest and mask[i] == False and mask[j] == False:
 smallest = distances[i,j]
 ret = [i,j]
 return ret
# %%
def merge_pair(vectors, linkage, mask, pair):
 """
 Please fill in this docstring template
 Parameters
 ----------
 vectors : TYPE
 DESCRIPTION.
 linkage : TYPE
 DESCRIPTION.
 mask : TYPE
 DESCRIPTION.
 pair : TYPE
 DESCRIPTION.
 Returns
 -------
 None.
 """
 # First, select the vectors defined in the pair of index
 v1 = vectors[pair[0]]
 v2 = vectors[pair[1]]
 # Merge vectors
 v3 = (v1+v2)/2
 np.append(vectors, v3)
 # Calculate the distance between the merged vectors, and the average between each merged vector to the new one
 dv1v2 = dissimilarity(v1,v2)/2.0
 davg = (dissimilarity(v1, v3) + dissimilarity(v2, v3))/2.0
 dmax = dv1v2
 if davg > dmax:
 dmax = davg
 # Create new linkage row
 linkage_row = [pair[0], pair[1], dmax, len(pair)]
 linkage.append(linkage_row)
 mask[pair[0]] = True
 mask[pair[1]] = True
 mask.append(False)
# %%
def update_distance_matrix(distances, vectors):
 """
 Please fill in this docstring template
 Parameters
 ----------
 distances : TYPE
 DESCRIPTION.
 vectors : TYPE
 DESCRIPTION.
 Returns
 -------
 new_distances : TYPE
 DESCRIPTION.
 """
 # Pick the last vector
 v = vectors[-1]
 # Create a new distance matrix, with one more row and column
 new_dist = np.zeros((len(vectors), len(vectors)))
 # Copy the first N-1 elements into the new matrix
 # Fill the elements
 for i in range(len(vectors)-1):
 for j in range(len(vectors)-1):
 new_dist[i,j] = distances[i,j]
 # Now, compute the distances to the new vector
 for i in range(len(vectors)):
 v_ = vectors[i]
 new_dist[len(vectors)-1, i] = dissimilarity(v, v_)
 new_dist[i, len(vectors)-1] = dissimilarity(v, v_)
 return new_dist
# %%
def cluster(vectors):
 """
 Please fill in this docstring template
 Parameters
 ----------
 vectors : TYPE
 DESCRIPTION.
 Returns
 -------
 TYPE
 DESCRIPTION.
 """
 # First, compute the distance matrix
 distances = distance_matrix(vectors)
 linkage = []
 mask = [False for i in range(len(vectors))]
 # Repeat until no pair of vectors needs to be merged
 while True:
 pair = nearest_pair(distances, mask)
 if not pair:
 break
 merge_pair(vectors, linkage, mask, pair)
 new_distances = update_distance_matrix(distances, vectors)
 distances = new_distances.copy()
 return linkage
# %%
# Do not delete or change code beneath this line.
def main():
 """
 Run a clustering analysis on the assignments in the "assignments"
 directory, which should be in your current working directory.
 Displays the resulting heirarchy as a dendrogram.
 Raises
 ------
 RuntimeError
 Raised if it can't find the assignments.
 Returns
 -------
 fig : a matplotlib figure
 A dendrogram of the heirarchy.
 """
 # read in the assignments
 data = []
 labels = []
 files = list(glob('assignments/*'))
 if not files:
 raise RuntimeError("I can't find the assignments. "
 "Are they in your current working directory?")
 # Uncomment the following line to test on a subset of assignments
 # files = np.random.choice(glob('assignments/*'), size=50, replace=False)
 for filename in files:
 print(f"Reading and analyzing {filename}... ")
 data.append(read(filename))
 labels.append(basename(filename)[:-4])
 # extract the vocab and convert assignments to vectors
 print("Calculating vocabulary...")
 vocab = vocabulary(data)
 print("Vectorizing...")
 vectors = [vectorize(d, vocab) for d in data]
 # transform the features to make rare features more important
 print("Calculating tfidf...")
 vectors = tfidf(vectors)
 # cluster the assignments
 print("Calculating linkage...")
 linkage = cluster(vectors)
 # plot the result
 print("Plotting...")
 fig = plt.figure(figsize=(10, 10))
 dendrogram(linkage, labels=labels, orientation='left')
 return fig
main()
Create a Program to Implement Neighbour Joining Algorithm in Python Assignment Solution.

Instructions

Requirements and Specifications