
Create a Program to Implement Web Scraping in Python Assignment Solution.


Instructions

Objective
Write a program to implement web scraping in the Python language.

Requirements and Specifications

HLT
Homework 5: Web Scraping
Worth 200 points
Objective: Create a knowledge base scraped from the web. This knowledge base will be used in a later homework to create a chatbot that can carry on a limited conversation in a particular domain using the knowledge base, as well as knowledge it learns from the user.
  • You may work alone if you prefer, or you can partner with one other person.
  • Upload your code and all output files, zipped together.
Instructions
  1. Build a web crawler function that starts with a URL representing a topic (a sport, your favorite film, a celebrity, a political issue, etc.) and outputs a list of at least 15 relevant URLs. The URLs can be pages within the original domain, but a few should be outside it.
  2. Write a function to loop through your URLs and scrape all text off each page. Store each page’s text in its own file.
  3. Write a function to clean up the text. You might need to delete newlines and tabs. Extract sentences with NLTK’s sentence tokenizer. Write the sentences for each file to a new file. That is, if you have 15 files in, you have 15 files out.
  4. Write a function to extract at least 25 important terms from the pages using an importance measure such as term frequency or tf-idf. First, it's a good idea to lower-case everything and remove stopwords and punctuation. Print the top 25-40 terms.
  5. Manually determine the top 10 terms from step 4, based on your domain knowledge.
  6. Build a searchable knowledge base of facts that a chatbot (to be developed later) can share related to the 10 terms. The “knowledge base” can be as simple as a Python dict which you can pickle. More points for something more sophisticated, like SQL.
  7. In a doc: (1) describe how you created your knowledge base, include screenshots of the knowledge base, and indicate your top 10 terms; (2) write up a sample dialog you would like to create with a chatbot based on your knowledge base.

Source Code

from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist

# nltk.download('punkt')

# Words that come from HTML/CSS markup rather than page content
htmlwords = ['https', 'http', 'display', 'button', 'hover',
             'color', 'background', 'height', 'none', 'target',
             'WebPage', 'reload', 'fieldset', 'padding', 'input',
             'select', 'textarea', 'html', 'form', 'cursor',
             'overflow', 'format', 'italic', 'normal', 'truetype',
             'before', 'name', 'label', 'float', 'title', 'arial', 'type',
             'block', 'audio', 'inline', 'canvas', 'margin', 'serif', 'menu',
             'woff', 'content', 'fixed', 'media', 'position', 'relative', 'hidden',
             'width', 'clear', 'body', 'standard', 'expandable', 'helvetica',
             'fullwidth', 'embed', 'expandfull', 'fullstandardwidth', 'left', 'middle',
             'iframe', 'rgba', 'selected', 'scroll', 'opacity',
             'center', 'false', 'right', 'href']

# Markup symbols to strip from scraped text
symbols = ['<', '/', '\\', '>', '-', '_', '|', '!--', '--', '=', ':', '="']

"""

Question 1

"""

def crawler(url: str):

"""

This function scrappes in the given url and returns a list with 15 relevant URLS found in the given url

:param url: URL str

:return: List of URLs

"""

# Get HTML code from url

page = requests.get(url)

# Create soup

soup = BeautifulSoup(page.content, features='html.parser')

# Create list with results

result = list()

n = 0

for a in soup.find_all('a', href=True, attrs={'target': '_blank'}):

result.append(a['href'])

n += 1

if n == 15: # We have already added 15 URLs

break

return result
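# Possible variant (not part of the graded solution): the crawler above only keeps
# links that open in a new tab (target='_blank'), which tend to point outside the
# original domain. The hypothetical helper below also resolves relative hrefs with
# urllib.parse.urljoin so in-domain pages are collected as absolute URLs.
from urllib.parse import urljoin

def crawler_all_links(url: str, limit: int = 15):
    soup = BeautifulSoup(requests.get(url).content, features='html.parser')
    links = list()
    for a in soup.find_all('a', href=True):
        absolute = urljoin(url, a['href'])  # turn relative hrefs into absolute URLs
        if absolute.startswith('http') and absolute not in links:
            links.append(absolute)
        if len(links) == limit:
            break
    return links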

"""

QUESTION 2

"""

def scrappe_urls(url):

"""

This function received an URL.

It then uses the function 'crawler' to get 15 relevant URLs from this url

and then this function will get the HTML code for each url and save it into a file

:param url: str

:return: None

"""

# First, get relevant URLs

urls = crawler(url)

# Iterate through each url, get HTML code and save it into a file

for u in urls:

file_name = u.split(".")[1] + ".txt"

html = requests.get(url).content

# Define the file_name as the url

file = open(file_name, 'w+')

file.write(str(html))

file.close()
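# Note: step 2 of the assignment asks for the *text* of each page, while scrappe_urls
# above stores the raw HTML. A minimal text-only sketch is shown below; it assumes
# BeautifulSoup's get_text() is acceptable for stripping markup, and the function
# name scrape_urls_text is hypothetical.
def scrape_urls_text(url):
    urls = crawler(url)
    for u in urls:
        soup = BeautifulSoup(requests.get(u).content, features='html.parser')
        text = soup.get_text(separator=' ')  # visible text only, tags removed
        file_name = u.split(".")[1] + "_text.txt"  # same naming scheme as scrappe_urls
        file = open(file_name, 'w+', encoding='utf-8')
        file.write(text)
        file.close()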

"""

QUESTION 3

"""

def clean_html(url: str):

# First, get relevant URLs

urls = crawler(url)

for u in urls:

# Get html

html_code = str(requests.get(u).content)

# Clear all html words

for w in htmlwords:

html_code = html_code.replace(w, "")

# Clear symbols

for s in symbols:

html_code = html_code.replace(s, "")

# First, clean new lines and tabs

html_code = html_code.replace("\n", " ")

html_code = html_code.replace("\t", " ")

# Tokenize sentences

tokens = sent_tokenize(html_code)

# Get sentences that only contains alphabetic chars (not symbols)

#sentences = [w for w in tokens if w.isalpha() and len(w) > 4]

sentences = tokens

# Now, write each token in a new line and save it to a file

res = ""

file_name = u.split(".")[1] + "_token.txt"

file = open(file_name, 'w+')

for sentence in sentences:

res += sentence + "\n"

file.write(res)

file.close()

"""

QUESTION 4

"""

def important_words(url: str):

# Get html

html_code = str(requests.get(url).content)

# First, clean new lines and tabs

html_code = html_code.replace("\n", " ")

html_code = html_code.replace("\t", " ")

# Tokenize sentences

tokens = word_tokenize(html_code)

# Get sentences that only contains alphabetic chars (not symbols)

words = [w for w in tokens if w.isalpha() and len(w) > 4 and w.lower() not in htmlwords]

# Convert to NLTK text

text = nltk.Text(words)

# Calculate frequency distribution

dist = dict(FreqDist(text))

# Sort in descending order and output the first 25

dist = dict(sorted(dist.items(), key=lambda x: x[1], reverse=True))

# Save the 40 most frequent terms into a file

res = ""

file_name = url.split(".")[1] + "_most_frequent_terms.txt"

file = open(file_name, 'w+')

for tup in list(dist.items())[:40]:

res += "(" + tup[0] + "," + str(tup[1]) + ")\n"

file.write(res)

file.close()

return list(dist.items())[:40]
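# Step 4 also allows tf-idf as the importance measure; important_words above uses raw
# term frequency only. The sketch below is one way tf-idf could be computed over the
# pages found by crawler(); the function name tfidf_terms is hypothetical and the word
# filtering mirrors important_words.
import math

def tfidf_terms(url, top_n=40):
    docs = list()
    for u in crawler(url):
        tokens = word_tokenize(requests.get(u).text)
        words = [w.lower() for w in tokens
                 if w.isalpha() and len(w) > 4 and w.lower() not in htmlwords]
        if words:
            docs.append(words)
    # Document frequency: in how many pages each term appears
    df = dict()
    for words in docs:
        for w in set(words):
            df[w] = df.get(w, 0) + 1
    # Sum the tf-idf score of each term over all pages
    scores = dict()
    for words in docs:
        tf = FreqDist(words)
        for w, count in tf.items():
            scores[w] = scores.get(w, 0.0) + (count / len(words)) * math.log(len(docs) / df[w])
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]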

def searchable_base(url):
    # We will create a tree whose roots are the important words.
    # Then we check in which sentences each word appears, and the rest of the words
    # in those sentences are added as children.
    tree = dict()
    queue = list()
    # Get html
    html_code = requests.get(url).text
    # Clear all html words
    for w in htmlwords:
        html_code = html_code.replace(w, "")
    # Clear symbols
    for s in symbols:
        html_code = html_code.replace(s, "")
    # Clean new lines and tabs
    html_code = html_code.replace("\n", " ")
    html_code = html_code.replace("\t", " ")
    # Tokenize sentences
    tokens = sent_tokenize(html_code)
    sentences = tokens
    # Start the queue with the 10 most important words
    mi_words = [x[0] for x in important_words(url)[:10]]
    queue = mi_words.copy()
    while len(queue) > 0:
        w = queue.pop()
        if w not in tree and len(w) > 0 and w.isalpha():
            tree[w] = list()
            for s in sentences:
                if w in s:
                    words = s.split(" ")
                    for w2 in words:
                        if w2 != w and len(w2) > 0 and w2.isalpha():
                            queue.append(w2)
                            tree[w].append(w2)
    return tree
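# Step 6 suggests pickling the knowledge base so the later chatbot homework can load
# it again. A minimal sketch, assuming the dict returned by searchable_base is the
# knowledge base; the file name knowledge_base.pkl is only an example.
import pickle

def save_knowledge_base(kb, file_name="knowledge_base.pkl"):
    with open(file_name, "wb") as f:
        pickle.dump(kb, f)

def load_knowledge_base(file_name="knowledge_base.pkl"):
    with open(file_name, "rb") as f:
        return pickle.load(f)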

if __name__ == '__main__':
    """
    QUESTION 1
    """
    # Test with site listing TOP news sites
    url = "http://www.ebizmba.com/articles/news-websites"
    print(crawler(url))
    """
    QUESTION 2
    """
    # scrappe_urls(url)
    """
    QUESTION 3
    """
    # clean_html(url)
    """
    QUESTION 4
    """
    words = important_words(url)
    print(f"The {len(words)} most frequent terms from {url} are:")
    for i in range(len(words)):
        tup = words[i]
        print(tup[0], end="")
        if i < len(words) - 1:
            print(", ", end="")
        else:
            print("")
    """
    QUESTION 5
    """
    # The top 10 words based on the frequency histogram are:
    # Sites, Popular, eBizMBA, strong, Websites, script, October, Quantcast, Alexa, border
    """
    QUESTION 6
    """
    print(searchable_base(url))