From aa5e51e987be63415b6d77613305ef3eceaee086 Mon Sep 17 00:00:00 2001 From: bluelul Date: Thu, 18 Jan 2024 17:59:58 +0700 Subject: [PATCH 1/2] Update new url_pattern for Google image crawling --- crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawler.py b/crawler.py index d94b763c..169c4338 100644 --- a/crawler.py +++ b/crawler.py @@ -120,13 +120,13 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): image_elements = driver.find_elements(By.CLASS_NAME, "islib") image_urls = list() - url_pattern = r"imgurl=\S*&imgrefurl" + url_pattern = r"imgurl=\S*&tbnid" for image_element in image_elements[:max_number]: outer_html = image_element.get_attribute("outerHTML") re_group = re.search(url_pattern, outer_html) if re_group is not None: - image_url = unquote(re_group.group()[7:-14]) + image_url = unquote(re_group.group()[7:-10]) image_urls.append(image_url) return image_urls From f8e4a8fba95fff8487275cb47b637248b99082fc Mon Sep 17 00:00:00 2001 From: bluelul Date: Thu, 18 Jan 2024 18:04:20 +0700 Subject: [PATCH 2/2] Click only max_number images in Google Image; the total number of downloaded images may be smaller than max_number due to URLs not being found, but we can control precisely how many images should be crawled --- crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler.py b/crawler.py index 169c4338..7beb977d 100644 --- a/crawler.py +++ b/crawler.py @@ -97,7 +97,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): my_print("Click on each thumbnail image to get image url, may take a moment ...", quiet) retry_click = [] - for i, elem in enumerate(thumb_elements): + for i, elem in enumerate(thumb_elements[:max_number]): try: if i != 0 and i % 50 == 0: my_print("{} thumbnail clicked.".format(i), quiet)