
Create a Program to Implement Web Scraping in Python Assignment Solution.


Instructions

Objective
Write a program to implement web scraping in the Python language.

Requirements and Specifications

HLT
Homework 5: Web Scraping
Worth 200 points
Objective: Create a knowledge base scraped from the web. This knowledge base will be used in a later homework to create a chatbot that can carry on a limited conversation in a particular domain using the knowledge base, as well as knowledge it learns from the user.
  • You may work alone if you prefer, or you can partner with one other person.
  • Upload your code and all output files, zipped together.
Instructions
  1. Build a web crawler function that starts with a URL representing a topic (a sport, your favorite film, a celebrity, a political issue, etc.) and outputs a list of at least 15 relevant URLs. The URLs can be pages within the original domain, but a few should be outside it.
  2. Write a function to loop through your URLs and scrape all text off each page. Store each page’s text in its own file.
  3. Write a function to clean up the text. You might need to delete newlines and tabs. Extract sentences with NLTK’s sentence tokenizer. Write the sentences for each file to a new file. That is, if you have 15 files in, you have 15 files out.
  4. Write a function to extract at least 25 important terms from the pages using an importance measure such as term frequency or tf-idf. First, it's a good idea to lower-case everything and remove stopwords and punctuation. Print the top 25-40 terms.
  5. Manually determine the top 10 terms from step 4, based on your domain knowledge.
  6. Build a searchable knowledge base of facts that a chatbot (to be developed later) can share related to the 10 terms. The “knowledge base” can be as simple as a Python dict which you can pickle. More points for something more sophisticated, like SQL.
  7. In a doc: (1) describe how you created your knowledge base, include screenshots of the knowledge base, and indicate your top 10 terms; (2) write up a sample dialog you would like to create with a chatbot based on your knowledge base.

Source Code

from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist

# nltk.download('punkt')

# Words that come from HTML/CSS markup rather than page content
htmlwords = ['https', 'http', 'display', 'button', 'hover',
             'color', 'background', 'height', 'none', 'target',
             'WebPage', 'reload', 'fieldset', 'padding', 'input',
             'select', 'textarea', 'html', 'form', 'cursor',
             'overflow', 'format', 'italic', 'normal', 'truetype',
             'before', 'name', 'label', 'float', 'title', 'arial', 'type',
             'block', 'audio', 'inline', 'canvas', 'margin', 'serif', 'menu',
             'woff', 'content', 'fixed', 'media', 'position', 'relative', 'hidden',
             'width', 'clear', 'body', 'standard', 'expandable', 'helvetica',
             'fullwidth', 'embed', 'expandfull', 'fullstandardwidth', 'left', 'middle',
             'iframe', 'rgba', 'selected', 'scroll', 'opacity',
             'center', 'false', 'right', 'href']

# Markup symbols to strip from scraped text
symbols = ['<', '/', '\\', '>', '-', '_', '|', '!--', '--', '=', ':', '="']

"""

Question 1

"""

def crawler(url: str):

"""

This function scrappes in the given url and returns a list with 15 relevant URLS found in the given url

:param url: URL str

:return: List of URLs

"""

# Get HTML code from url

page = requests.get(url)

# Create soup

soup = BeautifulSoup(page.content, features='html.parser')

# Create list with results

result = list()

n = 0

for a in soup.find_all('a', href=True, attrs={'target': '_blank'}):

result.append(a['href'])

n += 1

if n == 15: # We have already added 15 URLs

break

return result
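# Possible variant (not part of the graded solution): the crawler above only keeps
# links that open in a new tab (target='_blank'), which tend to point outside the
# original domain. The hypothetical helper below also resolves relative hrefs with
# urllib.parse.urljoin so in-domain pages are collected as absolute URLs.
from urllib.parse import urljoin

def crawler_all_links(url: str, limit: int = 15):
    soup = BeautifulSoup(requests.get(url).content, features='html.parser')
    links = list()
    for a in soup.find_all('a', href=True):
        absolute = urljoin(url, a['href'])  # turn relative hrefs into absolute URLs
        if absolute.startswith('http') and absolute not in links:
            links.append(absolute)
        if len(links) == limit:
            break
    return links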

"""

QUESTION 2

"""

def scrappe_urls(url):

"""

This function received an URL.

It then uses the function 'crawler' to get 15 relevant URLs from this url

and then this function will get the HTML code for each url and save it into a file

:param url: str

:return: None

"""

# First, get relevant URLs

urls = crawler(url)

# Iterate through each url, get HTML code and save it into a file

for u in urls:

file_name = u.split(".")[1] + ".txt"

html = requests.get(url).content

# Define the file_name as the url

file = open(file_name, 'w+')

file.write(str(html))

file.close()
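# Note: step 2 of the assignment asks for the *text* of each page, while scrappe_urls
# above stores the raw HTML. A minimal text-only sketch is shown below; it assumes
# BeautifulSoup's get_text() is acceptable for stripping markup, and the function
# name scrape_urls_text is hypothetical.
def scrape_urls_text(url):
    urls = crawler(url)
    for u in urls:
        soup = BeautifulSoup(requests.get(u).content, features='html.parser')
        text = soup.get_text(separator=' ')  # visible text only, tags removed
        file_name = u.split(".")[1] + "_text.txt"  # same naming scheme as scrappe_urls
        file = open(file_name, 'w+', encoding='utf-8')
        file.write(text)
        file.close()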

"""

QUESTION 3

"""

def clean_html(url: str):

# First, get relevant URLs

urls = crawler(url)

for u in urls:

# Get html

html_code = str(requests.get(u).content)

# Clear all html words

for w in htmlwords:

html_code = html_code.replace(w, "")

# Clear symbols

for s in symbols:

html_code = html_code.replace(s, "")

# First, clean new lines and tabs

html_code = html_code.replace("\n", " ")

html_code = html_code.replace("\t", " ")

# Tokenize sentences

tokens = sent_tokenize(html_code)

# Get sentences that only contains alphabetic chars (not symbols)

#sentences = [w for w in tokens if w.isalpha() and len(w) > 4]

sentences = tokens

# Now, write each token in a new line and save it to a file

res = ""

file_name = u.split(".")[1] + "_token.txt"

file = open(file_name, 'w+')

for sentence in sentences:

res += sentence + "\n"

file.write(res)

file.close()

"""

QUESTION 4

"""

def important_words(url: str):

# Get html

html_code = str(requests.get(url).content)

# First, clean new lines and tabs

html_code = html_code.replace("\n", " ")

html_code = html_code.replace("\t", " ")

# Tokenize sentences

tokens = word_tokenize(html_code)

# Get sentences that only contains alphabetic chars (not symbols)

words = [w for w in tokens if w.isalpha() and len(w) > 4 and w.lower() not in htmlwords]

# Convert to NLTK text

text = nltk.Text(words)

# Calculate frequency distribution

dist = dict(FreqDist(text))

# Sort in descending order and output the first 25

dist = dict(sorted(dist.items(), key=lambda x: x[1], reverse=True))

# Save the 40 most frequent terms into a file

res = ""

file_name = url.split(".")[1] + "_most_frequent_terms.txt"

file = open(file_name, 'w+')

for tup in list(dist.items())[:40]:

res += "(" + tup[0] + "," + str(tup[1]) + ")\n"

file.write(res)

file.close()

return list(dist.items())[:40]
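# Step 4 also allows tf-idf as the importance measure; important_words above uses raw
# term frequency only. The sketch below is one way tf-idf could be computed over the
# pages found by crawler(); the function name tfidf_terms is hypothetical and the word
# filtering mirrors important_words.
import math

def tfidf_terms(url, top_n=40):
    docs = list()
    for u in crawler(url):
        tokens = word_tokenize(requests.get(u).text)
        words = [w.lower() for w in tokens
                 if w.isalpha() and len(w) > 4 and w.lower() not in htmlwords]
        if words:
            docs.append(words)
    # Document frequency: in how many pages each term appears
    df = dict()
    for words in docs:
        for w in set(words):
            df[w] = df.get(w, 0) + 1
    # Sum the tf-idf score of each term over all pages
    scores = dict()
    for words in docs:
        tf = FreqDist(words)
        for w, count in tf.items():
            scores[w] = scores.get(w, 0.0) + (count / len(words)) * math.log(len(docs) / df[w])
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]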

def searchable_base(url):
    # We will create a tree whose roots are the important words.
    # Then we check in which sentences each word appears, and the rest of the words
    # in those sentences are added as children.
    tree = dict()
    queue = list()
    # Get html
    html_code = requests.get(url).text
    # Clear all html words
    for w in htmlwords:
        html_code = html_code.replace(w, "")
    # Clear symbols
    for s in symbols:
        html_code = html_code.replace(s, "")
    # Clean new lines and tabs
    html_code = html_code.replace("\n", " ")
    html_code = html_code.replace("\t", " ")
    # Tokenize sentences
    tokens = sent_tokenize(html_code)
    sentences = tokens
    # Start the queue with the 10 most important words
    mi_words = [x[0] for x in important_words(url)[:10]]
    queue = mi_words.copy()
    while len(queue) > 0:
        w = queue.pop()
        if w not in tree and len(w) > 0 and w.isalpha():
            tree[w] = list()
            for s in sentences:
                if w in s:
                    words = s.split(" ")
                    for w2 in words:
                        if w2 != w and len(w2) > 0 and w2.isalpha():
                            queue.append(w2)
                            tree[w].append(w2)
    return tree
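# Step 6 suggests pickling the knowledge base so the later chatbot homework can load
# it again. A minimal sketch, assuming the dict returned by searchable_base is the
# knowledge base; the file name knowledge_base.pkl is only an example.
import pickle

def save_knowledge_base(kb, file_name="knowledge_base.pkl"):
    with open(file_name, "wb") as f:
        pickle.dump(kb, f)

def load_knowledge_base(file_name="knowledge_base.pkl"):
    with open(file_name, "rb") as f:
        return pickle.load(f)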

if __name__ == '__main__':
    """
    QUESTION 1
    """
    # Test with site listing TOP news sites
    url = "http://www.ebizmba.com/articles/news-websites"
    print(crawler(url))
    """
    QUESTION 2
    """
    # scrappe_urls(url)
    """
    QUESTION 3
    """
    # clean_html(url)
    """
    QUESTION 4
    """
    words = important_words(url)
    print(f"The {len(words)} most frequent terms from {url} are:")
    for i in range(len(words)):
        tup = words[i]
        print(tup[0], end="")
        if i < len(words) - 1:
            print(", ", end="")
        else:
            print("")
    """
    QUESTION 5
    """
    # The top 10 words based on the frequency histogram are:
    # Sites, Popular, eBizMBA, strong, Websites, script, October, Quantcast, Alexa, border
    """
    QUESTION 6
    """
    print(searchable_base(url))