-
Notifications
You must be signed in to change notification settings - Fork 0
/
mapper.py
83 lines (67 loc) · 1.54 KB
/
mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from urlparse import urlparse
import sys
def get_scraper(content, uri):
    """Map the URI to the appropriate scraper class.

    The host of ``uri`` is reduced with ``simplify_uri`` and matched by
    suffix against a table of known sites.  The matching ``sites.<name>``
    module is imported and asked for a scraper.

    Args:
        content: Page content, handed through unchanged to the scraper.
        uri: Address the content belongs to.

    Returns:
        The result of the site module's ``get_scraper(content)`` when the
        module defines one, otherwise the result of
        ``anonymous_inst(module, content, uri)``.  ``False`` when no entry
        of the table matches the URI.

    Raises:
        ImportError: If the mapped ``sites`` module cannot be imported.
    """
    dom = simplify_uri(uri)
    print("simplified uri: %s" % dom)

    # Mapping of domain suffix -> module name inside the ``sites`` package.
    mapp = {
        '.aaronsw.com': 'aaronsw',
        '.biologynews.net': 'biologynews',
        '.broadbandmechanics.com': 'marcsvoice',
        '.dataportability.org': 'dataportability',
        '.lesswrong.com': 'lesswrong',
        '.reddit.com': 'reddit',
        '.scobleizer.com': 'scobleizer',
        '.slashdot.org': 'slashdot',
        '.techcrunch.com': 'techcrunch',
        '.techdirt.com': 'techdirt',
    }

    # The leading dot anchors the suffix test on a label boundary, so
    # "notreddit.com" does not match ".reddit.com".
    dom = '.' + dom
    mod = None
    for suffix, name in mapp.items():
        if dom.endswith(suffix):
            mod = name
            break
    print("module: %s" % mod)
    if not mod:
        return False

    mod = 'sites.' + mod
    # __import__ returns the top-level package for a dotted name, so the
    # submodule itself is fetched from sys.modules afterwards.
    __import__(mod)
    module = sys.modules[mod]

    # Look the factory up explicitly instead of wrapping the call in
    # ``except AttributeError``: an AttributeError raised *inside* a site's
    # get_scraper() is a real bug and must not be mistaken for "this module
    # has no factory".
    factory = getattr(module, 'get_scraper', None)
    if factory is None:
        return anonymous_inst(module, content, uri)
    return factory(content)
def simplify_uri(uri):
    """Removes 'www', etc.

    Parses ``uri`` and returns its host name with every ``www`` label
    dropped, e.g. ``http://www.reddit.com/r/python`` -> ``reddit.com``.

    Args:
        uri: The address to simplify.

    Returns:
        The host name without ``www`` labels.  ``uri`` itself is returned
        unchanged when no host name can be extracted from it (for example
        when it has no ``scheme://`` prefix) or when nothing is left after
        dropping the ``www`` labels.
    """
    hostname = urlparse(uri).hostname
    if not hostname:
        # urlparse leaves hostname unset when the URI has no "//" network
        # location; fall back to the raw input instead of failing on None.
        return uri

    # Compare by value: ``is`` tests object identity, which is not reliable
    # for strings produced by split().
    labels = [part for part in hostname.split('.') if part != 'www']
    if not labels:
        return uri
    return '.'.join(labels)
def anonymous_inst(module, content, uri):
    """Instantiate scraper anonymously.

    Walks the attributes of ``module`` in ``dir()`` order, picks the first
    one that is a subclass of ``web2feed.Scraper`` (other than the base
    class itself) and instantiates it with ``(content, uri)``.

    Args:
        module: Module object to search for a scraper class.
        content: First argument passed to the scraper class.
        uri: Second argument passed to the scraper class.

    Returns:
        The new scraper instance, or ``None`` (after printing a notice) when
        the module exposes no suitable class.
    """
    from web2feed import Scraper

    found = None
    for attr_name in dir(module):
        candidate = getattr(module, attr_name)
        try:
            is_scraper = issubclass(candidate, Scraper)
        except TypeError:
            # issubclass() rejects attributes that are not classes; only
            # that error is expected here, anything else should surface.
            continue
        if not is_scraper:
            continue
        # Skip the base class itself, also when it was imported under an
        # alias; the original name-based filter is kept as well.
        if candidate is Scraper or attr_name in ('Scraper', 'web2feed.Scraper'):
            continue
        found = candidate
        break

    if found is None:
        print("Anonyomous instance not found")
        return None

    sc = found(content, uri)
    print(sc)
    print(sc.uri)
    return sc