From de560c86d13af5081f970f736971e93cca8d0fae Mon Sep 17 00:00:00 2001 From: "qingmu.li" Date: Tue, 16 Apr 2024 16:03:11 +0800 Subject: [PATCH 1/2] feat: Adapt to Google page updates --- crawler.py | 30 +++++++++++++++++------------- requirements.txt | 2 +- utils.py | 5 +++-- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/crawler.py b/crawler.py index d94b763c..c4819008 100644 --- a/crawler.py +++ b/crawler.py @@ -42,12 +42,12 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type= base_url = "https://www.google.com/search?tbm=isch&hl=en" keywords_str = "&q=" + quote(keywords) query_url = base_url + keywords_str - + if safe_mode is True: query_url += "&safe=on" else: query_url += "&safe=off" - + filter_url = "&tbs=" if color is not None: @@ -55,12 +55,12 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type= filter_url += "ic:gray%2C" else: filter_url += "ic:specific%2Cisc:{}%2C".format(color.lower()) - + if image_type is not None: if image_type.lower() == "linedrawing": image_type = "lineart" filter_url += "itp:{}".format(image_type) - + if face_only is True: filter_url += "itp:face" @@ -73,7 +73,10 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): thumb_elements = [] while True: try: - thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i") + # old way to get thumb_elements + # thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i") + # Adapt to the updated Google image search page + thumb_elements = driver.find_elements(By.CSS_SELECTOR, ".H8Rx8c > g-img > img") my_print("Find {} images.".format(len(thumb_elements)), quiet) if len(thumb_elements) >= max_number: break @@ -90,7 +93,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): except Exception as e: print("Exception ", e) pass - + if len(thumb_elements) == 0: return [] @@ -109,7 +112,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): print("Error while clicking in 
thumbnail:", e) retry_click.append(elem) - if len(retry_click) > 0: + if len(retry_click) > 0: my_print("Retry some failed clicks ...", quiet) for elem in retry_click: try: @@ -117,8 +120,9 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): elem.click() except Exception as e: print("Error while retrying click:", e) - - image_elements = driver.find_elements(By.CLASS_NAME, "islib") + + # image_elements = driver.find_elements(By.CLASS_NAME, "islib") + image_elements = driver.find_elements(By.CSS_SELECTOR, ".ob5Hkd > a") image_urls = list() url_pattern = r"imgurl=\S*&imgrefurl" @@ -138,10 +142,10 @@ def bing_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=No filter_url = "&qft=" if face_only is True: filter_url += "+filterui:face-face" - + if image_type is not None: filter_url += "+filterui:photo-{}".format(image_type) - + if color is not None: if color == "bw" or color == "color": filter_url += "+filterui:color2-{}".format(color.lower()) @@ -183,7 +187,7 @@ def bing_get_image_url_using_api(keywords, max_number=10000, face_only=False, proxies = None if proxy and proxy_type: proxies = {"http": "{}://{}".format(proxy_type, proxy), - "https": "{}://{}".format(proxy_type, proxy)} + "https": "{}://{}".format(proxy_type, proxy)} start = 1 image_urls = [] while start <= max_number: @@ -309,7 +313,7 @@ def process_batch(batch_no, batch_size): def crawl_image_urls(keywords, engine="Google", max_number=10000, - face_only=False, safe_mode=False, proxy=None, + face_only=False, safe_mode=False, proxy=None, proxy_type="http", quiet=False, browser="chrome_headless", image_type=None, color=None): """ Scrape image urls of keywords from Google Image Search diff --git a/requirements.txt b/requirements.txt index fa29d1ec..7f690487 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ chromedriver-autoinstaller==0.4.0 pyinstaller==5.9.0 -PyQt5==5.15.9 +PyQt5==5.15.10 requests==2.31.0 selenium==4.8.3 diff --git a/utils.py b/utils.py 
index ba4f7038..e120cd36 100644 --- a/utils.py +++ b/utils.py @@ -13,7 +13,7 @@ def gen_valid_dir_name_for_keywords(keywords): class AppConfig(object): def __init__(self): self.engine = "Google" - + self.driver = "chrome_headless" self.keywords = "" @@ -33,7 +33,7 @@ def __init__(self): def to_command_paras(self): str_paras = "" - + str_paras += ' -e ' + self.engine str_paras += ' -d ' + self.driver @@ -72,6 +72,7 @@ def gen_keywords_list_from_file(filepath): def resolve_dependencies(driver=str): if "chrome" in driver: print("Checking Google Chrome and chromedriver ...") + # if you have installed chromium/chrome and chromedriver of the same version and still get an error, you can try commenting out the following three lines. driver_path = chromedriver_autoinstaller.install() if not driver_path: return False From 99b2ebdd67569a2389893412ff0dc3fcb17bf59a Mon Sep 17 00:00:00 2001 From: "qingmu.li" Date: Tue, 16 Apr 2024 16:34:21 +0800 Subject: [PATCH 2/2] fix: Comment if necessary --- utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils.py b/utils.py index e120cd36..9b5a9e4f 100644 --- a/utils.py +++ b/utils.py @@ -73,8 +73,8 @@ def resolve_dependencies(driver=str): if "chrome" in driver: print("Checking Google Chrome and chromedriver ...") # if you have installed chromium/chrome and chromedriver of the same version and still get an error, you can try commenting out the following three lines. - driver_path = chromedriver_autoinstaller.install() - if not driver_path: - return False + # driver_path = chromedriver_autoinstaller.install() + # if not driver_path: + # return False print("OK.") return True