# coding: utf-8
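"""spider.py -- a minimal single-purpose crawler: pops URLs off a shared
queue, fetches each page's <title>, and records (id, url, title) rows in
a SQLite table. Written for Python 2 (urllib2)."""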
from general import *  # provides file_to_arr(), used in boot()
from bs4 import BeautifulSoup
import urllib2
import html5lib  # parser backend requested by name in BeautifulSoup(...)
import sqlite3
from time import sleep
import gc
class Spider:
    # Class variables (shared among all instances)
    queue_file = ''
    queue = []
    crawled = []

    def __init__(self, queue_file):
        Spider.queue_file = queue_file
        self.boot()
    @staticmethod
    def boot():
        # Load the pending URLs from disk and start with an empty crawled list.
        Spider.queue = file_to_arr(Spider.queue_file)
        Spider.crawled = []
    @staticmethod
    def getTitle(url):
        # Fetch the page and return its <title> text, or an error string.
        try:
            resource = urllib2.urlopen(url)
            page = resource.read()
            resource.close()
            soup = BeautifulSoup(page, "html5lib")
            if soup.title is None or soup.title.string is None:
                return 'title does not exist'
            title = soup.title.string
        except IOError:
            title = 'ERROR: invalid URL "%s"' % url
        return title
    @staticmethod
    def update_db(db_path, table_name, id, url, title):
        # Insert one (id, url, title) row. Binding the values with "?"
        # placeholders handles quoting, so titles need no sanitizing;
        # only the table name has to be interpolated into the SQL.
        sqlite_db = sqlite3.connect(db_path)
        sqlite_cu = sqlite_db.cursor()
        sql_str = "INSERT INTO " + table_name + " VALUES (?, ?, ?)"
        sqlite_cu.execute(sql_str, (id, url, title))
        sqlite_db.commit()
        sqlite_db.close()
    @staticmethod
    def crawl_page(thread_name, page_url, db_path, table_name):
        # Already crawled: just drop it from the queue and move on.
        if page_url in Spider.crawled:
            if page_url in Spider.queue:
                Spider.queue.remove(page_url)
            return
        print(thread_name + ' now crawling ' + page_url)
        print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
        #Spider.filter_links(page_url)
        Spider.queue.remove(page_url)
        Spider.crawled.append(page_url)
        db_id = len(Spider.crawled)
        title = Spider.getTitle(page_url)
        Spider.update_db(db_path, table_name, db_id, page_url, title)
        # Pause and collect garbage every 1000 pages.
        if len(Spider.crawled) % 1000 == 0:
            sleep(60)
            Spider.free()
        return title
    @staticmethod
    def free():
        # The queue/crawled lists are kept; just ask the garbage collector
        # to reclaim whatever the last batch of pages left behind.
        gc.collect()
    @staticmethod
    def filter_links(links):
        # Queue only URLs that are neither pending nor already crawled.
        for url in links:
            if url in Spider.queue or url in Spider.crawled:
                continue
            Spider.queue.append(url)
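

# A minimal usage sketch, assuming a queue file with one URL per line and
# a table whose schema matches update_db's three columns. The file name
# 'queue.txt', database 'crawler.db', and table name 'pages' are
# illustrative, not part of the original project.
if __name__ == '__main__':
    db = sqlite3.connect('crawler.db')
    db.execute("CREATE TABLE IF NOT EXISTS pages (id INTEGER, url TEXT, title TEXT)")
    db.commit()
    db.close()
    spider = Spider('queue.txt')
    while Spider.queue:
        Spider.crawl_page('Thread-1', Spider.queue[0], 'crawler.db', 'pages')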