From f29d55e8cf97ab98438d2afec0a51d80dc6e7f63 Mon Sep 17 00:00:00 2001
From: sleepless-se
Date: Tue, 5 Jan 2021 11:19:05 +0900
Subject: [PATCH] fix google parser error

Fix the json.decoder.JSONDecodeError that the Google parser raised
after a change in Google's page format.
---
 icrawler/builtin/google.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/icrawler/builtin/google.py b/icrawler/builtin/google.py
index b0198e3..9c03691 100644
--- a/icrawler/builtin/google.py
+++ b/icrawler/builtin/google.py
@@ -140,24 +140,26 @@ def feed(self, keyword, offset, max_num, language=None, filters=None):
 
 
 class GoogleParser(Parser):
-
     def parse(self, response):
         soup = BeautifulSoup(
             response.content.decode('utf-8', 'ignore'), 'lxml')
-        image_divs = soup.find_all('script')
+        #image_divs = soup.find_all('script')
+        image_divs = soup.find_all(name='script')
         for div in image_divs:
-            txt = div.string
-            if txt is None or not txt.startswith('AF_initDataCallback'):
+            #txt = div.text
+            txt = str(div)
+            #if not txt.startswith('AF_initDataCallback'):
+            if 'AF_initDataCallback' not in txt:
                 continue
-            if 'ds:1' not in txt:
+            if 'ds:0' in txt or 'ds:1' not in txt:
                 continue
-            txt = re.sub(r"^AF_initDataCallback\({.*key: 'ds:(\d)'.+data:(.+), sideChannel: {.*}}\);?$",
-                         "\\2", txt, 0, re.DOTALL)
-
-            meta = json.loads(txt)
-            data = meta[31][0][12][2]
-
-            uris = [img[1][3][0] for img in data if img[0] == 1]
+            #txt = re.sub(r"^AF_initDataCallback\({.*key: 'ds:(\d)'.+data:function\(\){return (.+)}}\);?$",
+            #             "\\2", txt, 0, re.DOTALL)
+            #meta = json.loads(txt)
+            #data = meta[31][0][12][2]
+            #uris = [img[1][3][0] for img in data if img[0] == 1]
+
+            uris = re.findall(r'http.*?\.(?:jpg|png|bmp)', txt)
             return [{'file_url': uri} for uri in uris]
 
 