+1 (315) 557-6473 

Implement a Function `topBigrams` That Returns the 10 Most Frequent Pairs of Adjacent Letters in an English Text: Python Assignment Solution


Instructions

Objective
Write a Python program that implements a function `topBigrams`, which returns the 10 most frequent pairs of adjacent letters in an English text.

Requirements and Specifications

Task 1
Implement a function `topBigrams` which returns the 10 most frequent character pairs of adjacent letters in an English text. The function must return a `list` holding the 10 most frequently occurring character pairs in descending order. This function can be used, for example, to optimise the key assignment on a keyboard.
Important:
  •  You are meant to work on a particular input text available from the supplemental script `corpus.py`.
  • `corpus.py`, when loaded, will provide a corpus of 20 candidate texts.
  • Work on the text in this corpus identified by a computed position. Your position is computed as the integer obtained from your `student_id` (trim any leading zeroes!) modulo `20` (there are 20 candidate texts in `corpus`, hence this divisor).
Hints:
  •  The modulo operation yields the remainder of an integer division, e.g.: `55 % 10 = 5` (see also Task 2).
  • In a first step, remove any character other than [A-Z, a-z] from the input string for bigram frequency analysis.
  •  When counting the occurrences of character pairs, uppercase letters should be treated the same as lowercase letters.
The `corpus.py` file must be omitted from the submitted zip file.
Source Code
def topBigrams(text):
    """Return the most frequent pairs of adjacent letters in ``text``.

    Only the ASCII letters A-Z and a-z take part in the analysis.  Every
    other character (digits, punctuation, whitespace, accented or other
    non-ASCII letters) is removed first, so two letters separated only by
    such characters count as adjacent.  Upper- and lower-case letters are
    treated as the same letter.

    Args:
        text: The string to analyse.

    Returns:
        A list of at most 10 lower-case two-letter strings, ordered by
        descending frequency.  Bigrams with equal counts keep the order in
        which they first occur in the text.  The list has fewer than 10
        entries when the text contains fewer distinct bigrams.
    """
    # Imported locally so the function stays self-contained.
    from collections import Counter

    # Filter on the original characters *before* lower-casing: this keeps
    # exactly [A-Za-z] and avoids characters whose lower-case form happens
    # to be (or to contain) an ASCII letter.
    letters = "".join(
        ch for ch in text if "a" <= ch <= "z" or "A" <= ch <= "Z"
    ).lower()

    # Pair every letter with its successor; Counter remembers first-seen
    # order, which most_common() uses to break ties.
    counts = Counter(a + b for a, b in zip(letters, letters[1:]))
    return [bigram for bigram, _ in counts.most_common(10)]
from nose.tools import assert_equal
# Checks for topBigrams.
# Pick the text to analyse: strip leading "h" and "0" characters from
# student_id, convert the remainder to an int and reduce it modulo the number
# of texts in the corpus.
# NOTE(review): `corpus` and `student_id` are not defined in this file --
# presumably supplied by corpus.py and the grading environment; confirm.
text = corpus[(int(student_id.lstrip("h0")) % len(corpus))]
# For the selected text the result must be a list with exactly 10 entries.
assert_equal(type(topBigrams(text)), list)
assert_equal(len(topBigrams(text)), 10)
# Every bigram here occurs once, so the expected result is the first ten in
# order of appearance.
assert_equal(topBigrams("NotImplementedError"), ['no', 'ot', 'ti', 'im', 'mp', 'pl', 'le', 'em', 'me', 'en'])
# 'he' and 'co' each occur twice and therefore lead; spaces and the comma are
# ignored, so "Yes, I" contributes the bigram 'si'.
assert_equal(topBigrams("Yes, I will adhere to the Code of Conduct"), ['he', 'co', 'ye', 'es', 'si', 'iw', 'wi', 'il', 'll', 'la'])
def compute(a, b):
    """Return the greatest common divisor of two positive integers.

    Args:
        a: First operand; must be an ``int`` greater than zero.
        b: Second operand; must be an ``int`` greater than zero.

    Returns:
        The greatest common divisor of ``a`` and ``b``.  Invalid input is
        reported through a string instead of an exception:
        ``"Wrong input type"`` when either argument is not an ``int``, and
        ``"This is illegal, only positive numbers allowed"`` when either
        argument is zero or negative.
    """
    if not isinstance(a, int) or not isinstance(b, int):
        return "Wrong input type"
    if a <= 0 or b <= 0:
        return "This is illegal, only positive numbers allowed"
    # Iterative Euclidean algorithm.  Using the remainder instead of repeated
    # subtraction, and a loop instead of recursion, keeps very unequal
    # operands (e.g. 1 and 50000) from exhausting the recursion limit.
    while b:
        a, b = b, a % b
    return a
from nose.tools import assert_equal
# Checks for compute.
# A non-positive operand is rejected with a message rather than an exception.
assert_equal(compute(-168, 4), "This is illegal, only positive numbers allowed")
# 2205 is a multiple of 5, so the result is 5.
assert_equal(compute(5, 2205), 5)
# Valid input yields an int.
assert_equal(type(compute(99, 11)), int)
# 4231 and 1324 share no common factor.
assert_equal(compute(4231, 1324), 1)
# Non-int arguments are rejected with a message.
assert_equal(compute("整数", "int"), "Wrong input type")
def decode(input_string):
    """Decode a message whose characters were written column by column.

    The input is split on whitespace into groups.  The message is rebuilt by
    taking the first character of every group in order, then the second
    character of every group, and so on.  Each ``'-'`` stands for a space;
    leading and trailing whitespace of the rebuilt message is removed.

    Args:
        input_string: The encoded message, groups separated by whitespace.

    Returns:
        The decoded message, or ``"Wrong input type"`` when ``input_string``
        is not a ``str``.  An empty or whitespace-only input decodes to
        ``""``.

    Raises:
        IndexError: If a later group is shorter than the first one.
    """
    if not isinstance(input_string, str):
        return "Wrong input type"
    groups = input_string.split()
    # Covers both "" and whitespace-only input, which has no groups at all.
    if not groups:
        return ""
    # The first group fixes how many columns are read; characters beyond
    # that length in longer groups are ignored.
    width = len(groups[0])
    message = "".join(group[col] for col in range(width) for group in groups)
    return message.replace('-', ' ').strip()
from nose.tools import assert_equal
# Checks for decode.
# Reading the first character of each group, then the second, and so on,
# spells out the message; '-' stands for a space and trailing padding is
# stripped.
assert_equal(decode("H-VCG EIEOS L--D- LLEI- OONN-"), 'HELLO I LOVE ENCODINGS')
assert_equal(decode("T-A-I HA-SN ELHE- RWICD EADRA -YDET ISETA S-N--"), 'THERE IS ALWAYS A HIDDEN SECRET IN DATA')
assert_equal(decode("NHRHA OA-IG BLRSE OLE-- D-AM- YEDE- -V-S- SETS-"), 'NOBODY SHALL EVER READ THIS MESSAGE')
# Valid input yields a str.
assert_equal(type(decode("ts et")), str)
# 19 groups of 2 characters give 38 characters; the two trailing pad
# characters are stripped, leaving 36.
assert_equal(len(decode("T- HS IE SN -T WE IN LC LE -- BI E- -G AU -E LS OS N- G-")), 36)
# Edge cases: empty input and a non-str argument.
assert_equal(decode(""), "")
assert_equal(decode([]), "Wrong input type")
# English stop words, all lower case, including contractions and the fragments
# left when contractions are split (e.g. "don't", 'don', 't').  Passed as the
# `stopword_list` argument of cleaning() in the checks further down.
# NOTE(review): looks like NLTK's English stop-word list -- confirm the source.
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
             'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
             'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
             'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
             'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
             'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
             'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
             'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all',
             'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
             'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've",
             'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
             'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
             "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
             'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
def cleaning(input_string, stopword_list):
    """Split a noisy string into words and drop the stop words.

    Digits are removed wherever they occur (so ``"Tr7y"`` becomes ``"Try"``).
    The remaining text is split on the separator characters
    ``- _ + \\ ^ , # . ; ! ? : > <`` and the space character.  Other
    whitespace, such as tabs or newlines, is not a separator and stays part
    of a word.  Stop-word matching is exact and case-sensitive.

    Args:
        input_string: The text to clean.
        stopword_list: Collection of words to leave out of the result.

    Returns:
        The remaining words in their original order, or
        ``"Wrong input type"`` when ``input_string`` is not a ``str``.
        An empty string yields ``[]``.
    """
    if not isinstance(input_string, str):
        return "Wrong input type"

    separators = '-_+\\^ ,#.;!?:><'
    # Drop digits first so that they neither end up in words nor split them.
    without_digits = ''.join(ch for ch in input_string if not ch.isdigit())
    # Map every separator to a space, then split on that single character.
    # split(' ') rather than split() keeps tabs and newlines inside words.
    to_space = str.maketrans(separators, ' ' * len(separators))
    candidates = without_digits.translate(to_space).split(' ')
    # Runs of separators produce empty strings; skip them with the stop words.
    return [word for word in candidates if word and word not in stopword_list]
from nose.tools import assert_equal
# Checks for cleaning.
# FIXME(review): the four commented-out checks below are truncated in this
# copy of the file -- each one ends inside an unterminated string literal,
# which made the module unparseable.  Restore the full input strings and
# expected values from the original assignment, then re-enable them.
# assert_equal(cleaning("Goo7d.day you won
# assert_equal(cleaning("Plea8se.h3elp me
# assert_equal(cleaning("Plea8se.h3elp me
# Digits are dropped and 'this' is removed only when it is a stop word.
assert_equal(cleaning("Tr7y this;,<+ 2o3n5e", stopwords), ['Try', 'one'])
assert_equal(cleaning("Tr7y this;,<+ 2o3n5e", []), ['Try', 'this', 'one'])
# 'was', 'so' and 'for' are stop words; 'The' is kept because matching is
# case-sensitive, leaving The / chair / happy / alice.
assert_equal(len(cleaning("The_ch3air,w2as+so-ha76ppy<>f7o2356r,al9ice!", stopwords)), 4)
# assert_equal(type(cleaning("Goo7d.day you won
# Edge cases: non-str arguments and the empty string.
assert_equal(cleaning([], stopwords), "Wrong input type")
assert_equal(cleaning(24, stopwords), "Wrong input type")
assert_equal(cleaning("", stopwords), [])