From de560c86d13af5081f970f736971e93cca8d0fae Mon Sep 17 00:00:00 2001
From: "qingmu.li" <qingmu.li@upai.com>
Date: Tue, 16 Apr 2024 16:03:11 +0800
Subject: [PATCH 1/2] feat: Adapt to Google page updates

---
 crawler.py       | 30 +++++++++++++++++-------------
 requirements.txt |  2 +-
 utils.py         |  5 +++--
 3 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/crawler.py b/crawler.py
index d94b763c..c4819008 100644
--- a/crawler.py
+++ b/crawler.py
@@ -42,12 +42,12 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=
     base_url = "https://www.google.com/search?tbm=isch&hl=en"
     keywords_str = "&q=" + quote(keywords)
     query_url = base_url + keywords_str
-    
+
     if safe_mode is True:
         query_url += "&safe=on"
     else:
         query_url += "&safe=off"
-    
+
     filter_url = "&tbs="
 
     if color is not None:
@@ -55,12 +55,12 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=
             filter_url += "ic:gray%2C"
         else:
             filter_url += "ic:specific%2Cisc:{}%2C".format(color.lower())
-    
+
     if image_type is not None:
         if image_type.lower() == "linedrawing":
             image_type = "lineart"
         filter_url += "itp:{}".format(image_type)
-        
+
     if face_only is True:
         filter_url += "itp:face"
 
@@ -73,7 +73,10 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
     thumb_elements = []
     while True:
         try:
-            thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
+            # old way to get thumb_elements
+            # thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
+            # Adapt to the updated Google image search page
+            thumb_elements = driver.find_elements(By.CSS_SELECTOR, ".H8Rx8c > g-img > img")
             my_print("Find {} images.".format(len(thumb_elements)), quiet)
             if len(thumb_elements) >= max_number:
                 break
@@ -90,7 +93,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
         except Exception as e:
             print("Exception ", e)
             pass
-    
+
     if len(thumb_elements) == 0:
         return []
 
@@ -109,7 +112,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
             print("Error while clicking in thumbnail:", e)
             retry_click.append(elem)
 
-    if len(retry_click) > 0:    
+    if len(retry_click) > 0:
         my_print("Retry some failed clicks ...", quiet)
         for elem in retry_click:
             try:
@@ -117,8 +120,9 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
                     elem.click()
             except Exception as e:
                 print("Error while retrying click:", e)
-    
-    image_elements = driver.find_elements(By.CLASS_NAME, "islib")
+
+    # image_elements = driver.find_elements(By.CLASS_NAME, "islib")
+    image_elements = driver.find_elements(By.CSS_SELECTOR, ".ob5Hkd > a")
     image_urls = list()
     url_pattern = r"imgurl=\S*&amp;imgrefurl"
 
@@ -138,10 +142,10 @@ def bing_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=No
     filter_url = "&qft="
     if face_only is True:
         filter_url += "+filterui:face-face"
-    
+
     if image_type is not None:
         filter_url += "+filterui:photo-{}".format(image_type)
-    
+
     if color is not None:
         if color == "bw" or color == "color":
             filter_url += "+filterui:color2-{}".format(color.lower())
@@ -183,7 +187,7 @@ def bing_get_image_url_using_api(keywords, max_number=10000, face_only=False,
     proxies = None
     if proxy and proxy_type:
         proxies = {"http": "{}://{}".format(proxy_type, proxy),
-                   "https": "{}://{}".format(proxy_type, proxy)}                             
+                   "https": "{}://{}".format(proxy_type, proxy)}
     start = 1
     image_urls = []
     while start <= max_number:
@@ -309,7 +313,7 @@ def process_batch(batch_no, batch_size):
 
 
 def crawl_image_urls(keywords, engine="Google", max_number=10000,
-                     face_only=False, safe_mode=False, proxy=None, 
+                     face_only=False, safe_mode=False, proxy=None,
                      proxy_type="http", quiet=False, browser="chrome_headless", image_type=None, color=None):
     """
     Scrape image urls of keywords from Google Image Search
diff --git a/requirements.txt b/requirements.txt
index fa29d1ec..7f690487 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 chromedriver-autoinstaller==0.4.0
 pyinstaller==5.9.0
-PyQt5==5.15.9
+PyQt5==5.15.10
 requests==2.31.0
 selenium==4.8.3
diff --git a/utils.py b/utils.py
index ba4f7038..e120cd36 100644
--- a/utils.py
+++ b/utils.py
@@ -13,7 +13,7 @@ def gen_valid_dir_name_for_keywords(keywords):
 class AppConfig(object):
     def __init__(self):
         self.engine = "Google"
-        
+
         self.driver = "chrome_headless"
 
         self.keywords = ""
@@ -33,7 +33,7 @@ def __init__(self):
 
     def to_command_paras(self):
         str_paras = ""
- 
+
         str_paras += ' -e ' + self.engine
 
         str_paras += ' -d ' + self.driver
@@ -72,6 +72,7 @@ def gen_keywords_list_from_file(filepath):
 def resolve_dependencies(driver=str):
     if "chrome" in driver:
         print("Checking Google Chrome and chromedriver ...")
+        # If you have installed Chromium/Chrome and a chromedriver of the same version and still get an error, you can try commenting out the following three lines.
         driver_path = chromedriver_autoinstaller.install()
         if not driver_path:
             return False

From 99b2ebdd67569a2389893412ff0dc3fcb17bf59a Mon Sep 17 00:00:00 2001
From: "qingmu.li" <qingmu.li@upai.com>
Date: Tue, 16 Apr 2024 16:34:21 +0800
Subject: [PATCH 2/2] fix: Comment if necessary

---
 utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils.py b/utils.py
index e120cd36..9b5a9e4f 100644
--- a/utils.py
+++ b/utils.py
@@ -73,8 +73,8 @@ def resolve_dependencies(driver=str):
     if "chrome" in driver:
         print("Checking Google Chrome and chromedriver ...")
         # If you have installed Chromium/Chrome and a chromedriver of the same version and still get an error, you can try commenting out the following three lines.
-        driver_path = chromedriver_autoinstaller.install()
-        if not driver_path:
-            return False
+        # driver_path = chromedriver_autoinstaller.install()
+        # if not driver_path:
+        #     return False
         print("OK.")
     return True