-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathclient.py
104 lines (88 loc) · 3.01 KB
/
client.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import requests
class DiffbotClient(object):
    """
    Thin wrapper around the Diffbot HTTP API.
    """

    # Root of every endpoint URL; note that it already ends with a slash.
    base_url = 'http://api.diffbot.com/'

    def request(self, url, token, api, fields=None, version=3, **kwargs):
        """
        Query a Diffbot API endpoint and return the decoded JSON response.

        ``url`` and ``token`` are always sent as query parameters; ``fields``
        is sent only when truthy. Any extra keyword arguments are merged into
        the query last, so they override the parameters above on a name clash.
        ``api`` and ``version`` select the endpoint (see ``compose_url``).

        ``raise_for_status()`` is called on the response before it is decoded.
        """
        query = dict(url=url, token=token)
        if fields:
            query['fields'] = fields
        query.update(kwargs)

        endpoint = self.compose_url(api, version)
        reply = requests.get(endpoint, params=query)
        reply.raise_for_status()
        return reply.json()

    def compose_url(self, api, version_number):
        """
        Build the endpoint URI string: ``<base_url><version>/<api>``.
        """
        return '{base}{version}/{api}'.format(
            base=self.base_url,
            version=self.format_version_string(version_number),
            api=api,
        )

    @staticmethod
    def format_version_string(version_number):
        """
        Render the API version as it appears in URLs, e.g. ``3`` -> ``'v3'``.
        """
        return 'v{0}'.format(version_number)
class DiffbotJob(DiffbotClient):
    """
    Various calls for managing a Crawlbot or Bulk API job.

    The methods below read two instance attributes that this class does not
    set itself: ``self.params`` (a dict; ``download`` reads its ``token`` and
    ``name`` keys) and ``self.jobType`` (the endpoint name used to build the
    request URL).
    """

    def request(self, params):
        """
        Send ``params`` as a GET query to the v3 endpoint for this job type.

        Returns the decoded JSON body. If the body is not valid JSON, the raw
        response text is printed and ``None`` is returned.

        Raises ``requests.exceptions.HTTPError`` for 4xx/5xx responses.
        """
        response = requests.get(self.compose_url(self.jobType, 3), params=params)
        # Must be *called*; a bare attribute reference is a no-op and lets
        # HTTP error responses fall through to the JSON parsing below.
        response.raise_for_status()
        try:
            return response.json()
        except ValueError:
            # JSON decoding errors are ValueError subclasses. Keep the
            # best-effort fallback without also swallowing unrelated
            # exceptions such as KeyboardInterrupt.
            print(response.text)
            return None

    def start(self, params):
        """
        Issue the request that creates/starts the job with ``params``.
        """
        return self.request(params)

    def status(self):
        """
        Query the job using only the stored ``self.params``.
        """
        return self.request(self.params)

    def update(self, **kwargs):
        """
        Send the stored parameters merged with ``kwargs`` in a single request.

        ``self.params`` is left unchanged, so the extra settings are not
        re-sent by later calls.
        """
        request_params = dict(self.params)
        request_params.update(kwargs)
        return self.request(request_params)

    def delete(self):
        """
        Send the stored parameters plus ``delete=1``.
        """
        # Work on a copy: writing the flag into self.params would make every
        # subsequent status()/update() call delete the job again.
        request_params = dict(self.params)
        request_params['delete'] = 1
        return self.request(request_params)

    def restart(self):
        """
        Send the stored parameters plus ``restart=1``.
        """
        # Copy for the same reason as in delete(): the flag is one-shot.
        request_params = dict(self.params)
        request_params['restart'] = 1
        return self.request(request_params)

    def download(self, data_format="json"):
        """
        Download the output of a crawl or bulk job.

        Returns the raw response bytes when ``data_format`` is ``"csv"``;
        otherwise returns the decoded JSON body.

        Raises ``requests.exceptions.HTTPError`` for 4xx/5xx responses.
        """
        # compose_url already accounts for base_url's trailing slash, which
        # avoids building a URL containing '//v3/'.
        download_url = '{}/download/{}-{}_data.{}'.format(
            self.compose_url(self.jobType, 3),
            self.params['token'],
            self.params['name'],
            data_format,
        )
        download = requests.get(download_url)
        download.raise_for_status()
        if data_format == "csv":
            return download.content
        return download.json()
class DiffbotCrawl(DiffbotJob):
    """
    A Diffbot crawl job. Constructing an instance immediately sends the
    start request; pass additional arguments as necessary.
    """

    def __init__(self, token, name, seeds=None, api=None, apiVersion=3, **kwargs):
        """
        Store the job identity and send the start request.

        ``self.params`` keeps only ``token`` and ``name``. The start request
        additionally carries ``seeds`` (when truthy), ``apiUrl`` (built from
        ``api``/``apiVersion`` when ``api`` is truthy) and any extra keyword
        arguments, which are merged last.
        """
        self.jobType = "crawl"
        self.params = dict(token=token, name=name)

        # The start request gets its own dict so the optional settings below
        # do not end up in the stored parameters.
        launch_params = self.params.copy()
        if seeds:
            launch_params['seeds'] = seeds
        if api:
            launch_params['apiUrl'] = self.compose_url(api, apiVersion)
        launch_params.update(kwargs)

        self.start(launch_params)