""" | ||
Example webcrawler or 'spider' code. This spider integrates both of the code routines provided including: | ||
PorterStemmer - this code implements a porter stemmer. We have integrated this code | ||
into the program and called the stem method. This routine could have been implemented as a library and | ||
included into the program but for the sake of simplicity an clarity when debugging, I have simply included | ||
the source code into my web crawler script. | ||
BeautifulSoup - this code which can be found in the resources section of this unit is a python module | ||
that allows text to be read from a html page. Essentially it returns the text of the html page with all of the HTML tags | ||
and other formatting removed making providing a simple string containing the contents of a web page | ||
that can be parsed and indexed by our indexer code. This module can be downloaded from unit resources | ||
section which also has instructions for installing the module on your system. | ||
""" | ||
import sys, os, re
import sqlite3
import math
import time
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup, NavigableString
from porterstemmer import PorterStemmer

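
# A minimal illustration (as comments, with illustrative values) of how the two helper
# modules imported above are used further down in this script:
#
#   soup = BeautifulSoup("<p>Crawling and indexing</p>", 'html.parser')
#   soup.get_text(' ', strip=True)               ->  'Crawling and indexing'
#
#   p = PorterStemmer()
#   p.stem('crawling', 0, len('crawling') - 1)   ->  'crawl'
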
stopwords = ['the', 'of', 'and', 'to', 'in', 'you', 'it', 'with', 'that', 'or', 'was', 'he', 'is', 'for', 'this', 'his', 'as', 'not', 'at', 'by', 'all', 'they', 'but', 'be', 'on', 'from', 'had', 'her', 'work', 'are', 'any', 'she', 'if', 'said', 'so', 'which', 'have', 'do', 'we', 'no', 'my', 'were', 'them', 'their', 'him', 'one', 'will', 'me', 'there', 'who', 'up', 'other', 'an', 'its', 'when', 'what', 'can', 'may', 'into', 'out', 'must', 'your', 'then', 'would', 'could', 'more', 'now', 'has', 'like', 'down', 'where', 'been', 'through', 'did', 'away', 'these', 'such', 'set', 'back', 'some', 'than', 'way', 'made', 'our', 'after', 'well', 'should', 'get', 'even', 'am', 'go', 'saw', 'just', 'put', 'while', 'ever', 'off', 'here', 'also']
# regular expressions: split text into words, and extract a document ID from a path
chars = re.compile(r'\W+')
pattid = re.compile(r'(\d{3})/(\d{3})/(\d{3})')

# global counters: total tokens processed, documents indexed, and unique terms found
tokens = 0
documents = 0
terms = 0

#
# We will create a Term object for each unique term encountered
#
class Term():
    def __init__(self):
        self.termid = 0
        self.termfreq = 0
        self.docs = 0
        self.docids = {}
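
# For illustration (hypothetical values): after a crawl, the in-memory index might hold
# an entry such as db['crawl'], a Term with termid=12, docs=3 and docids={1: 4, 2: 1, 5: 2},
# meaning the stem 'crawl' occurs in three documents and four times in document 1.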

# split a line on any non-word characters
def splitchars(line):
    return chars.split(line)


# remove everything between '<' and '>' from a string; BeautifulSoup does the HTML
# stripping in this script, so this helper is kept only for reference
def stripTags(s):
    intag = False
    s2 = ""
    for c in s:
        if c == '<':
            intag = True
        elif c == '>':
            intag = False
        elif not intag:
            s2 = s2 + c
    return s2

# recursively print the text of a tag tree returned by BeautifulSoup (debugging helper)
def printText(tags):
    for tag in tags:
        if tag.__class__ == NavigableString:
            print(tag)
        else:
            printText(tag)
    print("")

# process the tokens extracted from one line of text taken from a web page
def parsetoken(db, line):
    global documents
    global tokens
    global terms

    #
    # Create an instance of the PorterStemmer object; we will call its stem method
    # to 'stem' the tokens extracted from the line.
    #
    p = PorterStemmer()

    # replace any tab characters with a space character in the line read from the file
    line = line.replace('\t', ' ')
    line = line.strip()

    #
    # split the contents of the line into tokens
    #
    l = splitchars(line)

    # process each token in the line
    for elmt in l:
        # remove the newline character if found
        elmt = elmt.replace('\n', '')

        # convert all letters to lower case
        lowerElmt = elmt.lower().strip()

        #
        # Increment the counter of the number of tokens processed. This value gives
        # the total size of the corpus in terms of the number of tokens in the
        # entire collection.
        #
        tokens += 1

        # if the token is less than 2 characters in length we assume
        # that it is not a valid term and ignore it
        if len(lowerElmt) < 2:
            continue

        #
        # if the token is in the stopwords list then do not include it in the term
        # dictionary and do not index it
        #
        if lowerElmt in stopwords:
            continue

        #
        # Do not add numbers to the index. We attempt to convert the token to an
        # integer: if the token contains non-numeric characters the conversion fails,
        # we catch the ValueError and index the token; if the conversion succeeds the
        # token is a number and the continue statement skips to the next token of the
        # 'for' loop.
        #
        try:
            dummy = int(lowerElmt)
        except ValueError:
            # value is not a number, so we can index it
            stemword = lowerElmt
        else:
            # value is a number, so we will NOT add it to the index
            continue

        #
        # Call the Porter stemmer code that we have included in our indexer.
        # Stemming the tokens reduces the size of our data dictionary.
        #
        lowerElmt = p.stem(stemword, 0, len(stemword) - 1)

        # if the term doesn't currently exist in the term dictionary, add it
        if lowerElmt not in db:
            terms += 1
            db[lowerElmt] = Term()
            db[lowerElmt].termid = terms

        # if the document is not currently in the postings list for the term, add it
        if documents not in db[lowerElmt].docids:
            db[lowerElmt].docs += 1
            db[lowerElmt].docids[documents] = 0

        # increment the counter that tracks the term frequency within this document
        db[lowerElmt].docids[documents] += 1
    return l
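
# For illustration (hypothetical call): starting from an empty index,
#   parsetoken(db, "Crawling the web and indexing crawled pages")
# skips the stopwords 'the' and 'and', stems 'crawling'/'crawled' to 'crawl' and
# 'indexing' to 'index', and leaves db with entries for 'crawl', 'web', 'index'
# and 'page', each recording a term frequency for the current document id.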

#
# Write the inverted index to the database.
#
# Insert a row into TermDictionary for each unique term along with its termid, an
# integer assigned to each term by incrementing a counter.
#
# Insert a row into the Posting table for each unique combination of TermId and DocId.
#
def writeindex(db):
    for k in db.keys():
        cur.execute('insert into TermDictionary values (?, ?)', (k, db[k].termid))
        docfreq = db[k].docs
        ratio = float(documents) / float(docfreq)
        idf = math.log10(ratio)

        for i in db[k].docids.keys():
            termfreq = db[k].docids[i]
            tfidf = float(termfreq) * float(idf)
            if tfidf > 0:
                cur.execute('insert into Posting values (?, ?, ?, ?, ?)',
                            (db[k].termid, i, tfidf, docfreq, termfreq))
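
# The weighting used above is the standard tf-idf scheme:
#   idf   = log10(N / df)   where N is the total number of documents crawled and
#                           df is the number of documents containing the term
#   tfidf = tf * idf        where tf is the number of occurrences of the term in the document
# For example (hypothetical numbers): with N = 100 documents, a term that appears in
# df = 10 of them has idf = log10(100 / 10) = 1.0, so a document containing that term
# tf = 3 times gets a Posting row with tfidf = 3.0.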


if __name__ == '__main__':

    #
    # Get the starting URL to crawl
    #
    line = input("Enter URL to crawl (must be in the form http://www.domain.com): ")

    # the in-memory index is a simple dictionary
    db = {}

    #
    # Capture the start time of the routine so that we can determine the total running
    # time required to process the corpus
    #
    t2 = time.localtime()
    print('Start Time: %.2d:%.2d' % (t2.tm_hour, t2.tm_min))

    #
    # Create a sqlite database to hold the inverted index. Setting isolation_level to
    # None turns on autocommit, which means that changes made in the database are
    # committed automatically.
    #
    con = sqlite3.connect("webcrawler.db")
    con.isolation_level = None
    cur = con.cursor()

    #
    # In the following section three tables and their associated indexes will be created.
    # Before creating each table or index we drop any existing version in case it
    # already exists.
    #
    # Document Dictionary table
    cur.execute("drop table if exists DocumentDictionary")
    cur.execute("drop index if exists idxDocumentDictionary")
    cur.execute("create table if not exists DocumentDictionary (DocumentName text, DocId int)")
    cur.execute("create index if not exists idxDocumentDictionary on DocumentDictionary (DocId)")

    # Term Dictionary table
    cur.execute("drop table if exists TermDictionary")
    cur.execute("drop index if exists idxTermDictionary")
    cur.execute("create table if not exists TermDictionary (Term text, TermId int)")
    cur.execute("create index if not exists idxTermDictionary on TermDictionary (TermId)")

    # Postings table
    cur.execute("drop table if exists Posting")
    cur.execute("drop index if exists idxPosting1")
    cur.execute("drop index if exists idxPosting2")
    cur.execute("create table if not exists Posting (TermId int, DocId int, tfidf real, docfreq int, termfreq int)")
    cur.execute("create index if not exists idxPosting1 on Posting (TermId)")
    cur.execute("create index if not exists idxPosting2 on Posting (DocId)")

    #
    # Initialize variables
    #
    crawled = []          # list of pages that have already been crawled
    tocrawl = [line]      # queue of URLs that will be crawled
    links_queue = 0       # counts the number of links in the queue, to limit the depth of the crawl
    crawlcomplete = True  # flag that will exit the while loop when the crawl is finished

    #
    # Crawl the starting web page and the links found in each page, up to the limit.
    #
    while crawlcomplete:

        #
        # Pop the next URL off of the queue and process it; when the queue is empty
        # the crawl is complete.
        #
        try:
            crawling = tocrawl.pop()
        except IndexError:
            crawlcomplete = False
            continue

        # skip URLs whose extension marks a resource we cannot index as HTML text
        ext = crawling[-4:]
        if ext in ['.pdf', '.png', '.jpg', '.gif', '.asp']:
            crawled.append(crawling)
            continue

        #
        # Print the current length of the queue of URLs to crawl.
        #
        print("URLs queued to crawl: %i" % len(tocrawl))

        #
        # Parse the URL and open it.
        #
        url = urlparse(crawling)
        try:
            response = urlopen(crawling).read()
        except Exception:
            # the page could not be retrieved; skip it and move on to the next URL
            continue

        #
        # Use BeautifulSoup to reduce the web page to plain text that can be
        # parsed and indexed.
        #
        soup = BeautifulSoup(response, 'html.parser')
        paragraphs = soup.findAll("p")
        tok = ""
        for para in paragraphs:
            tok = tok + para.get_text(' ', strip=True) + ' '

        # assign the next document id to this page and index the text extracted from it
        documents += 1
        parsetoken(db, tok)

        #
        # For each document, store the page URL and its document id in the
        # DocumentDictionary table (DocumentName, DocId).
        #
        cur.execute("insert into DocumentDictionary values (?, ?)", (crawling, documents))

        #
        # Find all of the links on the page and add them to the queue of pages to
        # crawl, up to a limit of 500 queued links.
        #
        if links_queue < 500:
            links = re.findall(r'''href=["'](.[^"']+)["']''', response.decode('iso-8859-1'), re.I)
            for link in links:
                # turn relative links into absolute URLs; url[1] is the host (netloc)
                # and url[2] is the path of the page currently being crawled
                if link.startswith('/'):
                    link = 'http://' + url[1] + link
                elif link.startswith('#'):
                    link = 'http://' + url[1] + url[2] + link
                elif not link.startswith('http'):
                    link = 'http://' + url[1] + '/' + link
                if link not in crawled:
                    links_queue += 1
                    tocrawl.append(link)

        # mark the current page as crawled
        crawled.append(crawling)
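
        # A simpler alternative to the manual prefixing above would be urllib.parse.urljoin,
        # which resolves relative links (including https pages and '..' segments) against
        # the URL of the page being crawled, e.g. link = urljoin(crawling, link).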

    #
    # Display the time at which crawling and indexing completed and the write of the
    # index to disk begins.
    #
    t2 = time.localtime()
    print('Indexing Complete, write to disk: %.2d:%.2d' % (t2.tm_hour, t2.tm_min))

    #
    # Write the inverted index to disk
    #
    writeindex(db)

    #
    # Commit and close the database
    #
    con.commit()
    con.close()

    #
    # Print processing statistics:
    #   Documents - every document opened and read by the indexer
    #   Terms     - unique terms (after stemming) added to the term dictionary
    #   Tokens    - every token extracted from the crawled pages
    #
    print("Documents %i" % documents)
    print("Terms %i" % terms)
    print("Tokens %i" % tokens)
    t2 = time.localtime()
    print('End Time: %.2d:%.2d' % (t2.tm_hour, t2.tm_min))