Skip to content

Commit

Permalink
fix google parser error
Browse files Browse the repository at this point in the history
Fixed the json.decoder.JSONDecodeError that the Google parser raised after Google changed the format of its search results page.
  • Loading branch information
sleepless-se authored Jan 5, 2021
1 parent fc5471d commit f29d55e
Showing 1 changed file with 14 additions and 12 deletions.
26 changes: 14 additions & 12 deletions icrawler/builtin/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,24 +140,26 @@ def feed(self, keyword, offset, max_num, language=None, filters=None):


class GoogleParser(Parser):
    """Parser that pulls image URLs out of a Google Images result page."""

    def parse(self, response):
        """Extract image URLs from the first qualifying ``<script>`` tag.

        A script tag qualifies when its markup contains
        ``AF_initDataCallback`` and ``ds:1`` but not ``ds:0``. Only the
        first qualifying tag is inspected.

        Args:
            response: HTTP response object; its ``content`` attribute is
                decoded as UTF-8, ignoring undecodable bytes.

        Returns:
            list: One ``{'file_url': url}`` dict per URL found in the
            qualifying tag. Empty when no tag qualifies or the tag holds
            no matching URL.
        """
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        for script in soup.find_all(name='script'):
            # str() serialises the whole tag, including the <script>
            # markup, so the marker is looked up as a substring rather
            # than as a prefix of the script text.
            txt = str(script)
            if 'AF_initDataCallback' not in txt:
                continue
            if 'ds:0' in txt or 'ds:1' not in txt:
                continue
            # The callback payload used to be isolated with a regex and
            # decoded with json.loads, but that raised JSONDecodeError once
            # Google changed the page format. URLs are now matched directly
            # in the raw script text.
            uris = re.findall(r'http.*?\.(?:jpg|png|bmp)', txt)
            return [{'file_url': uri} for uri in uris]
        # No script tag qualified: return an empty list instead of falling
        # through with an implicit None, so the result is always a list.
        return []


Expand Down

0 comments on commit f29d55e

Please sign in to comment.