Skip to content

Commit

Permalink
Merge pull request #91 from sleepless-se/master
Browse files Browse the repository at this point in the history
fix google parser error
  • Loading branch information
ZhiyuanChen authored Jan 16, 2021
2 parents fc5471d + f29d55e commit 7ccec64
Showing 1 changed file with 14 additions and 12 deletions.
26 changes: 14 additions & 12 deletions icrawler/builtin/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,24 +140,26 @@ def feed(self, keyword, offset, max_num, language=None, filters=None):


class GoogleParser(Parser):
    """Parser that pulls image URLs out of a Google image-search page."""

    # Shortest run that starts with "http" and ends in a known image
    # extension. Double quotes and whitespace are excluded from the middle
    # so a match cannot begin inside one quoted URL and end inside a later
    # one, which the plain ``http.*?`` form allowed.
    _IMAGE_URL_RE = re.compile(r'http[^"\s]*?\.(?:jpg|png|bmp)')

    def parse(self, response):
        """Return download tasks for the image URLs found in ``response``.

        The page is scanned for the first ``<script>`` element that mentions
        ``AF_initDataCallback`` and ``ds:1`` but not ``ds:0``; every image
        URL found in that element's text becomes one task.

        Args:
            response: HTTP response whose ``content`` attribute holds the
                raw bytes of the result page.

        Returns:
            list: One ``{'file_url': <url>}`` dict per URL found in the
            first matching script. The list is empty when that script
            contains no URL or when no script matches, so callers can
            always iterate the result.
        """
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        for script in soup.find_all(name='script'):
            # The whole element is stringified (not just its ``.string``),
            # so the callback is detected by substring, not by prefix.
            txt = str(script)
            if 'AF_initDataCallback' not in txt:
                continue
            # Only the 'ds:1' payload is scanned; any script that also
            # mentions 'ds:0' is skipped.
            if 'ds:0' in txt or 'ds:1' not in txt:
                continue
            uris = self._IMAGE_URL_RE.findall(txt)
            return [{'file_url': uri} for uri in uris]
        # No script carried the expected payload.
        return []


Expand Down

0 comments on commit 7ccec64

Please sign in to comment.