Question
Create an n-gram language model for text in Python
Solution:
import argparse
import math
import random
from typing import Generator, List, Tuple

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')
# Generator for all n-grams in text
# n is a positive int
# text is a list of strings
# Yields n-gram tuples of the form (string, context), where context is a tuple of strings
def get_ngrams(n: int, text: List[str]) -> Generator[Tuple[str, Tuple[str, ...]], None, None]:
    # Pad with the empty string, which doubles as the sentence-boundary token
    token_sep = ''
    text_edited = [token_sep] * (n - 1)
    text_edited.extend(text)
    text_edited.append(token_sep)
    for gram in zip(*[text_edited[i:] for i in range(n)]):
        yield (gram[-1], tuple(gram[:-1]))
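# A quick sanity check of the generator, shown as comments so the module still
# imports cleanly (the bigram case, on a hypothetical two-word text):
#   >>> list(get_ngrams(2, ['the', 'cat']))
#   [('the', ('',)), ('cat', ('the',)), ('', ('cat',))]
# Each word is paired with the (n-1)-word context preceding it, and the
# empty-string padding marks the sentence start and end.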
# Loads and tokenizes a corpus
# corpus_path is a string
# Returns a list of sentences, where each sentence is a list of strings
def load_corpus(corpus_path: str) -> List[List[str]]:
    with open(corpus_path) as f:
        corpus = f.read()
    paragraphs = corpus.split('\n\n')
    sentences = []
    words = []
    for par in paragraphs:
        sentences.extend(sent_tokenize(par))
    for sent in sentences:
        words.append(word_tokenize(sent))
    return words
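# As a sketch (assuming NLTK's default English tokenizers), the text
# "It rained. We stayed in." would come back as:
#   [['It', 'rained', '.'], ['We', 'stayed', 'in', '.']]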
# Builds an n-gram model from a corpus
# n is a positive int
# corpus_path is a string
# Returns an NGramLM
def create_ngram_lm(n: int, corpus_path: str) -> 'NGramLM':
    sentences = load_corpus(corpus_path)
    ngram_lm = NGramLM(n)
    for sentence in sentences:
        ngram_lm.update(sentence)
    return ngram_lm
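# Usage sketch ('corpus.txt' is a hypothetical plain-text file):
#   lm = create_ngram_lm(3, 'corpus.txt')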
# An n-gram language model
class NGramLM:
    def __init__(self, n: int):
        self.n = n
        self.ngram_counts = {}    # (word, context) -> count
        self.context_counts = {}  # context -> count
        self.vocabulary = set()
    # Updates internal counts based on the n-grams in text
    # text is a list of strings
    # No return value
    def update(self, text: List[str]) -> None:
        self.vocabulary |= set(text)
        self.vocabulary.add('')  # the padding/boundary token is part of the vocabulary
        for word, context in get_ngrams(self.n, text):
            self.ngram_counts[(word, context)] = self.ngram_counts.get((word, context), 0) + 1
            self.context_counts[context] = self.context_counts.get(context, 0) + 1
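    # Illustrative counts after updating a trigram model with ['the', 'cat', 'sat'],
    # following get_ngrams above:
    #   ngram_counts[('the', ('', ''))] == 1
    #   ngram_counts[('sat', ('the', 'cat'))] == 1
    #   context_counts[('the', 'cat')] == 1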
    # Calculates the MLE probability of an n-gram, with optional add-delta smoothing
    # word is a string
    # context is a tuple of strings
    # delta is a float
    # Returns a float
    def get_ngram_prob(self, word: str, context: Tuple[str, ...], delta=0.0) -> float:
        context_count = self.context_counts.get(context, 0)
        ngram_count = self.ngram_counts.get((word, context), 0)
        if delta == 0:
            # Unseen context: back off to a uniform distribution over the vocabulary
            if context_count == 0:
                return 1 / len(self.vocabulary)
            return ngram_count / context_count
        return (ngram_count + delta) / (context_count + delta * len(self.vocabulary))
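    # The smoothed branch is the standard add-delta (Lidstone) estimate,
    #   P(w | c) = (count(w, c) + delta) / (count(c) + delta * |V|),
    # which reduces to Laplace smoothing at delta = 1. Worked example with
    # made-up counts: count(w, c) = 2, count(c) = 10, |V| = 100, delta = 0.5
    # gives (2 + 0.5) / (10 + 0.5 * 100) = 2.5 / 60 ≈ 0.0417.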
    # Calculates the log probability of a sentence
    # sent is a list of strings
    # delta is a float
    # Returns a float
    def get_sent_log_prob(self, sent: List[str], delta=0.0) -> float:
        probability = 0
        for word, context in get_ngrams(self.n, sent):
            ngram_prob = self.get_ngram_prob(word, context, delta)
            try:
                probability += math.log2(ngram_prob)
            except ValueError:
                # math.log2(0) raises ValueError; a zero-probability n-gram
                # makes the whole sentence log-probability -inf
                probability += float('-inf')
        return probability
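    # Log probabilities are summed rather than multiplying raw probabilities,
    # which avoids floating-point underflow on long sentences. E.g. a sentence
    # yielding three n-grams of probability 0.5 each scores
    #   log2(0.5) + log2(0.5) + log2(0.5) = -3.0
    # instead of the tiny product 0.125.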
    # Calculates the perplexity of a language model on a test corpus
    # corpus is a list of lists of strings
    # Returns a float
    def get_perplexity(self, corpus: List[List[str]]) -> float:
        size = 0
        corpus_log_prob = 0
        for sent in corpus:
            size += len(sent)
            corpus_log_prob += self.get_sent_log_prob(sent)
        avg_log_prob = -corpus_log_prob / size
        return math.pow(2, avg_log_prob)
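    # This is perplexity in its usual base-2 form,
    #   PP = 2 ** (-(1/N) * sum(log2 P(sent))),
    # where N is the total number of test tokens; lower perplexity means the
    # model assigns the test corpus higher probability.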
    # Samples a word from the probability distribution for a given context
    # context is a tuple of strings
    # delta is a float
    # Returns a string
    def generate_random_word(self, context: Tuple[str, ...], delta=0.0) -> str:
        # Inverse-CDF sampling: walk the vocabulary in sorted order and return
        # the first word whose cumulative probability reaches the random draw
        cumulative = 0.0
        r = random.random()
        sorted_vocabulary = sorted(self.vocabulary)
        for word in sorted_vocabulary:
            cumulative += self.get_ngram_prob(word, context, delta)
            if r <= cumulative:
                return word
        # Guard against floating-point rounding leaving the total just below r
        return sorted_vocabulary[-1]
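    # The vocabulary is sorted before sampling so that, for a fixed random seed,
    # the draw is reproducible: iterating a raw set can visit words in an order
    # that varies between runs, changing which word a given r selects.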
    # Generates a random sentence
    # max_length is an int
    # delta is a float
    # Returns a string
    def generate_random_text(self, max_length: int, delta=0.0) -> str:
        if max_length == 0:
            return ''
        end_sentence = False
        len_current = 0
        # Start from the empty-string padding context, mirroring get_ngrams
        text = [''] * (self.n - 1)
        first_word = self.generate_random_word(tuple(text), delta)
        len_current += 1
        text.append(first_word)
        while len_current < max_length and not end_sentence:
            word = self.generate_random_word(tuple(text[-(self.n - 1):]), delta)
            text.append(word)
            len_current += 1
            # The padding token doubles as the end-of-sentence marker
            if word == '':
                end_sentence = True
        return ' '.join(text[self.n - 1:])
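    # Usage sketch on a built model (smoothing is advisable here, since
    # unsmoothed sampling can only follow contexts seen in training):
    #   lm.generate_random_text(10, delta=0.5)
    # might return something like 'the cat sat .' (illustrative only).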
def main(corpus_path: str, delta: float, seed: int):
    # delta and seed are passed through from the CLI but unused here: seeding
    # happens in __main__, and the perplexities below are unsmoothed
    trigram_lm = create_ngram_lm(3, corpus_path)
    s1 = 'God has given it to me, let him who touches it beware!'
    s2 = 'Where is the prince, my Dauphin?'
    print(trigram_lm.get_perplexity([word_tokenize(s1)]))
    print(trigram_lm.get_perplexity([word_tokenize(s2)]))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="CS6320 HW1")
    parser.add_argument('corpus_path', nargs="?", type=str, default='sentence.txt', help='Path to corpus file')
    parser.add_argument('delta', nargs="?", type=float, default=.5, help='Delta value used for smoothing')
    parser.add_argument('seed', nargs="?", type=int, default=82761904, help='Random seed used for text generation')
    args = parser.parse_args()
    random.seed(args.seed)
    main(args.corpus_path, args.delta, args.seed)
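# Example invocation ('ngram_lm.py' is a hypothetical filename for this script;
# omitted positional arguments fall back to the defaults above):
#   python ngram_lm.py corpus.txt 0.5 82761904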