# N-gram Language Model Assignment Solution

Question

Write a Python assignment to create an n-gram language model.

Solution:

``` import argparse import math import random from nltk.tokenize import sent_tokenize, word_tokenize from typing import List from typing import Tuple from typing import Generator import nltk nltk.download('punkt') # Generator for all n-grams in text # n is a (non-negative) int # text is a list of strings # Yields n-gram tuples of the form (string, context), where context is a tuple of strings def get_ngrams(n: int, text: List[str]) -> Generator[Tuple[str, Tuple[str, ...]], None, None]: token_sep = '' text_edited = [token_sep]*(n-1) text_edited.extend(text) text_edited.append(token_sep) list_ngrams = list(zip(*[text_edited[i:] for i in range(n)])) for i in list_ngrams: yield tuple((i[n-1], tuple(i[:n-1]))) # Loads and tokenizes a corpus # corpus_path is a string # Returns a list of sentences, where each sentence is a list of strings def load_corpus(corpus_path: str) -> List[List[str]]: corpus = open(corpus_path).read() paragraph = corpus.split('\n\n') sentence = [] words = [] for par in paragraph: sentence.extend(sent_tokenize(par)) for sent in sentence: words.append(word_tokenize(sent)) return words # Builds an n-gram model from a corpus # n is a (non-negative) int # corpus_path is a string # Returns an NGramLM def create_ngram_lm(n: int, corpus_path: str) -> 'NGramLM': corpus_data = load_corpus(corpus_path) ngramLM = NGramLM(n) for corpus in corpus_data: ngramLM.update(corpus) return ngramLM # An n-gram language model class NGramLM: def __init__(self, n: int): self.n = n self.ngram_counts = {} self.context_counts = {} self.vocabulary = set() # Updates internal counts based on the n-grams in text # text is a list of strings # No return value def update(self, text: List[str]) -> None: self.vocabulary |= set(text) self.vocabulary |= set(['']) for ngram in get_ngrams(self.n, text): self.ngram_counts[ngram] = 1 + self.ngram_counts.get(ngram, 0) self.context_counts[ngram[1]] = 1 + self.context_counts.get(ngram[1], 0) # Calculates the MLE probability of an n-gram # word 
is a string # context is a tuple of strings # delta is an float # Returns a float def get_ngram_prob(self, word: str, context: Tuple[str, ...], delta=.0) -> float: context_count = self.context_counts.get(context, 0) ngram_count = self.ngram_counts.get((word, context), 0) if delta == 0: if context_count == 0: return 1/(len(self.vocabulary)) else: return ngram_count/context_count else: return (delta + ngram_counts)/(context_count + (delta * len(self.vocabulary))) # Calculates the log probability of a sentence # sent is a list of strings # delta is a float # Returns a float def get_sent_log_prob(self, sent: List[str], delta=.0) -> float: probability = 0 for ngram in get_ngrams(self.n, sent): ngram_prob = self.get_ngram_prob(ngram[0], ngram[1], delta) try: probability += math.log2(ngram_prob) except: probability += float('-inf') return probability # Calculates the perplexity of a language model on a test corpus # corpus is a list of lists of strings # Returns a float def get_perplexity(self, corpus: List[List[str]]) -> float: size = 0 corpus_sent_probability = 0 for sent in corpus: size += len(sent) sent_prob = self.get_sent_log_prob(sent) corpus_sent_probability += sent_prob avg_log_prob = (-1)*(corpus_sent_probability/size) return math.pow(2, avg_log_prob) # Samples a word from the probability distribution for a given context # context is a tuple of strings # delta is an float # Returns a string def generate_random_word(self, context: Tuple[str, ...], delta=.0) -> str: prob_before = 0 prob_current = 0 sorted_vocabulary = sorted(self.vocabulary) r = random.random() for word in sorted_vocabulary: ngram_probability = self.get_ngram_prob(word, context, delta) prob_current += ngram_probability if prob_before < r <= prob_current: return word prob_before += ngram_probability # Generates a random sentence # max_length is an int # delta is a float # Returns a string def generate_random_text(self, max_length: int, delta=.0) -> str: end_sentence = False len_current = 0 text = 
['']*(self.n-1) if max_length == 0: return '' first_word = self.generate_random_word(tuple(text), delta) len_current += 1 text.append(first_word) while len_current < max_length and not end_sentence: word = self.generate_random_word(tuple(text[-(self.n-1):]), delta) text.append(word) len_current += 1 if word == '': end_sentence = True #print(text) return ' '.join(text[self.n-1:]) def main(corpus_path: str, delta: float, seed: int): trigram_lm = create_ngram_lm(3, corpus_path) s1 = 'God has given it to me, let him who touches it beware!' s2 = 'Where is the prince, my Dauphin?' print(trigram_lm.get_perplexity([word_tokenize(s1)])) print(trigram_lm.get_perplexity([word_tokenize(s2)])) if __name__ == '__main__': parser = argparse.ArgumentParser(description="CS6320 HW1") parser.add_argument('corpus_path', nargs="?", type=str, default='sentence.txt', help='Path to corpus file') parser.add_argument('delta', nargs="?", type=float, default=.5, help='Delta value used for smoothing') parser.add_argument('seed', nargs="?", type=int, default=82761904, help='Random seed used for text generation') args = parser.parse_args() random.seed(args.seed) main(args.corpus_path, args.delta, args.seed) ```