webCrawling.py
"""
Example webcrawler or 'spider' code. This spider integrates both of the code routines provided including:
PorterStemmer - this code implements a porter stemmer. We have integrated this code
into the program and called the stem method. This routine could have been implemented as a library and
included into the program but for the sake of simplicity an clarity when debugging, I have simply included
the source code into my web crawler script.
BeautifulSoup - this code which can be found in the resources section of this unit is a python module
that allows text to be read from a html page. Essentially it returns the text of the html page with all of the HTML tags
and other formatting removed making providing a simple string containing the contents of a web page
that can be parsed and indexed by our indexer code. This module can be downloaded from unit resources
section which also has instructions for installing the module on your system.
"""
import re
import urllib.request
from urllib.parse import urlparse



import sqlite3
import math
import time
from bs4 import BeautifulSoup, NavigableString
from porterstemmer import PorterStemmer

stopwords = ['the', 'of', 'and', 'to', 'in', 'you', 'it', 'with', 'that', 'or', 'was', 'he', 'is', 'for', 'this', 'his', 'as', 'not', 'at', 'by', 'all', 'they', 'but', 'be', 'on', 'from', 'had', 'her', 'work', 'are', 'any', 'she', 'if', 'said', 'so', 'which', 'have', 'do', 'we', 'no', 'my', 'were', 'them', 'their', 'him', 'one', 'will', 'me', 'there', 'who', 'up', 'other', 'an', 'its', 'when', 'what', 'can', 'may', 'into', 'out', 'must', 'your', 'then', 'would', 'could', 'more', 'now', 'has', 'like', 'down', 'where', 'been', 'through', 'did', 'away', 'these', 'such', 'set', 'back', 'some', 'than', 'way', 'made', 'our', 'after', 'well', 'should', 'get', 'even', 'am', 'go', 'saw', 'just', 'put', 'while', 'ever', 'off', 'here', 'also']
# regular expressions: split text into words, and extract a numeric document ID from a path
chars = re.compile(r'\W+')
pattid = re.compile(r'(\d{3})/(\d{3})/(\d{3})')


# global counters for the number of tokens, documents, and unique terms
tokens = 0
documents = 0
terms = 0

#
# We will create a Term object for each unique term encountered in the collection
#
class Term():
    termid = 0
    termfreq = 0
    docs = 0
    docids = {}

# split the line on any non-word characters
def splitchars(line):
    return chars.split(line)

def stripTags(s):
    intag = False
    s2 = ""
    for c in s:
        if c == '<':
            intag = True
        elif c == '>':
            intag = False
        if not intag:
            s2 = s2 + c
    return s2

def printText(tags):
    for tag in tags:
        if tag.__class__ == NavigableString:
            print(tag)
        else:
            printText(tag)
            print("")


# process the tokens of the text extracted from a crawled page
def parsetoken(db, line):
    global documents
    global tokens
    global terms

    #
    # Create an instance of the PorterStemmer object; we call its stem method to
    # 'stem' the tokens extracted from the line.
    #
    p = PorterStemmer()

    # replace any tab characters with a space and strip surrounding whitespace
    line = line.replace('\t', ' ')
    line = line.strip()

    #
    # Split the contents of the line into tokens
    #
    l = splitchars(line)

    # process each token in the line
    for elmt in l:
        # remove the newline character if found
        elmt = elmt.replace('\n', '')

        # convert all letters to lower case
        lowerElmt = elmt.lower().strip()

        #
        # Increment the counter of the number of tokens processed. This value gives
        # the total size of the corpus in terms of the number of tokens in the
        # entire collection
        #
        tokens += 1

        #
        # if the token is less than 2 characters in length we assume
        # that it is not a valid term and ignore it
        #
        if len(lowerElmt) < 2:
            continue

        #
        # if the token is in the stopwords list then do not include it in the term
        # dictionary and do not index it
        #
        if lowerElmt in stopwords:
            continue

        #
        # Do not add numbers to the index. We attempt to convert the term into an
        # integer: if the term contains non-numeric characters this raises a
        # ValueError, which we catch so that we can continue processing (and
        # indexing) the term. If the conversion succeeds the term is a number,
        # and the 'continue' statement skips it and moves on to the next token.
        #
        try:
            int(lowerElmt)
        except ValueError:
            # value is not a number, so we can index it
            stemword = lowerElmt
        else:
            # value is a number, so we will NOT add it to the index
            continue

        #
        # Call the Porter stemmer code included in our indexer process. Stemming
        # the tokens reduces the size of our data dictionary.
        #
        lowerElmt = p.stem(stemword, 0, len(stemword) - 1)

        # if the term doesn't currently exist in the term dictionary, add it
        if lowerElmt not in db.keys():
            terms += 1
            db[lowerElmt] = Term()
            db[lowerElmt].termid = terms
            db[lowerElmt].docids = dict()
            db[lowerElmt].docs = 0

        #
        # if the current document is not yet in the postings list for the term, add it
        #
        if documents not in db[lowerElmt].docids.keys():
            db[lowerElmt].docs += 1
            db[lowerElmt].docids[documents] = 0

        # increment the counter that tracks the term frequency in this document
        db[lowerElmt].docids[documents] += 1
    return l
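# A minimal sketch of how parsetoken builds the in-memory index (illustrative input
# and stems only; this example is not executed by the crawler):
#
#   db = {}
#   documents = 1
#   parsetoken(db, "Information retrieval systems retrieve information")
#   # After stopword removal and stemming, db maps each stemmed term (e.g. 'inform',
#   # 'retriev', 'system') to a Term object whose docids dict records how many times
#   # the term occurred in document 1, e.g. db['inform'].docids == {1: 2}.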

#
# Write the inverted index tables.
#
# Insert a row into TermDictionary for each unique term along with its termid, which is
# an integer assigned to each term by incrementing a counter
#
# Insert a row into the Posting table for each unique combination of DocId and TermId
#
def writeindex(db):
    for k in db.keys():
        cur.execute('insert into TermDictionary values (?, ?)', (k, db[k].termid))
        docfreq = db[k].docs
        ratio = float(documents) / float(docfreq)
        idf = math.log10(ratio)

        for i in db[k].docids.keys():
            termfreq = db[k].docids[i]
            tfidf = float(termfreq) * float(idf)
            if tfidf > 0:
                cur.execute('insert into Posting values (?, ?, ?, ?, ?)',
                            (db[k].termid, i, tfidf, docfreq, termfreq))
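# Worked example of the weighting above (illustrative numbers only): with 100 crawled
# documents and a term that appears in 10 of them, idf = log10(100 / 10) = 1.0; if that
# term occurs 3 times in a given document, the tf-idf stored for that posting is
# 3 * 1.0 = 3.0.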




if __name__ == '__main__':

    #
    # Get the starting URL to crawl
    #
    line = input("Enter URL to crawl (must be in the form http://www.domain.com): ")

    # the database is a simple dictionary
    db = {}

    #
    # Capture the start time of the routine so that we can determine the total running
    # time required to process the corpus
    #
    t2 = time.localtime()
    print('Start Time: %.2d:%.2d' % (t2.tm_hour, t2.tm_min))

    #
    # Create a sqlite database to hold the inverted index. Setting isolation_level to
    # None turns on autocommit, which means that changes made in the database are
    # committed automatically
    #
    con = sqlite3.connect("webcrawler.db")
    con.isolation_level = None
    cur = con.cursor()

    #
    # In the following section three tables and their associated indexes will be created.
    # Before we create each table or index we drop any existing version of it
    #
    # Document Dictionary Table
    cur.execute("drop table if exists DocumentDictionary")
    cur.execute("drop index if exists idxDocumentDictionary")
    cur.execute("create table if not exists DocumentDictionary (DocumentName text, DocId int)")
    cur.execute("create index if not exists idxDocumentDictionary on DocumentDictionary (DocId)")

    # Term Dictionary Table
    cur.execute("drop table if exists TermDictionary")
    cur.execute("drop index if exists idxTermDictionary")
    cur.execute("create table if not exists TermDictionary (Term text, TermId int)")
    cur.execute("create index if not exists idxTermDictionary on TermDictionary (TermId)")

    # Postings Table
    cur.execute("drop table if exists Posting")
    cur.execute("drop index if exists idxPosting1")
    cur.execute("drop index if exists idxPosting2")
    cur.execute("create table if not exists Posting (TermId int, DocId int, tfidf real, docfreq int, termfreq int)")
    cur.execute("create index if not exists idxPosting1 on Posting (TermId)")
    cur.execute("create index if not exists idxPosting2 on Posting (DocId)")

    #
    # Initialize variables
    #
    crawled = []          # list of pages that have already been crawled
    tocrawl = [line]      # queue of URLs that will be crawled
    links_queue = 0       # counts the number of links in the queue to limit the depth of the crawl
    crawlcomplete = True  # flag that exits the while loop when the crawl is finished

    #
    # Crawl the starting web page and the links in the web page, up to the limit.
    #
    while crawlcomplete:

        #
        # Pop the next URL off of the queue and process it.
        #
        try:
            crawling = tocrawl.pop()
        except IndexError:
            # the queue is empty, so the crawl is finished
            crawlcomplete = False
            continue

        # skip URLs that point to non-HTML resources
        ext = crawling[-4:]
        if ext in ['.pdf', '.png', '.jpg', '.gif', '.asp']:
            crawled.append(crawling)
            continue

        #
        # Parse the URL and open it.
        #
        url = urlparse(crawling)
        try:
            response = urllib.request.urlopen(crawling).read()
        except Exception:
            # the page could not be retrieved, so move on to the next URL
            continue

        #
        # Use the BeautifulSoup module to reduce the web page to plain text that can
        # be parsed and indexed
        #
        soup = BeautifulSoup(response, 'html.parser')
        paragraphs = soup.findAll("p")
        tok = ""
        for para in paragraphs:
            tok = tok + para.get_text(' ', strip=True)

        # pass the text extracted from the web page to the parsetoken routine for indexing
        parsetoken(db, tok)
        documents += 1

        #
        # Assign each unique document a document id (documents) and store it in DocumentDictionary
        #
        cur.execute("insert into DocumentDictionary values (?, ?)", (crawling, documents))

        #
        # Find all of the links on the page and put them on the queue to crawl
        #
        if links_queue < 500:
            links = re.findall(r'''href=["'](.[^"']+)["']''', response.decode('iso-8859-1'), re.I)
            for link in links:
                # convert relative links into absolute URLs using the parsed components
                # of the current page: url[1] is the network location (netloc) and
                # url[2] is the path
                if link.startswith('/'):
                    link = 'http://' + url[1] + link
                elif link.startswith('#'):
                    link = 'http://' + url[1] + url[2] + link
                elif not link.startswith('http'):
                    link = 'http://' + url[1] + '/' + link
                if link not in crawled:
                    links_queue += 1
                    tocrawl.append(link)
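        # For example (illustrative URL only), urlparse('http://www.example.com/docs/index.html')
        # gives url[1] == 'www.example.com' and url[2] == '/docs/index.html', so a relative
        # href such as '/about.html' resolves to 'http://www.example.com/about.html' before
        # being queued.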
        # record that this page has been crawled
        crawled.append(crawling)

    #
    # Display the time at which the indexing of the crawled pages completed and the
    # write of the inverted index to disk begins
    #
    t2 = time.localtime()
    print('Indexing Complete, write to disk: %.2d:%.2d' % (t2.tm_hour, t2.tm_min))

    #
    # Write the inverted index to disk
    #
    writeindex(db)

    #
    # Commit and close the database
    #
    con.commit()
    con.close()

    #
    # Print processing statistics
    # Documents - every document opened and read by the indexer
    # Terms - every unique term extracted and added to the term dictionary
    # Tokens - every token extracted from the crawled pages
    #
    print("Documents %i" % documents)
    print("Terms %i" % terms)
    print("Tokens %i" % tokens)
    t2 = time.localtime()
    print('End Time: %.2d:%.2d' % (t2.tm_hour, t2.tm_min))
