domain suffixes and regexes removed

HTTPArchive · Oct 1, 2024 · fe31518
1 parent ac6e895
commit fe31518
Showing 1 changed file with 8 additions and 8 deletions.
diff --git a/sql/util/populate_easylist_adserver.py b/sql/util/populate_easylist_adserver.py
@@ -5,21 +5,21 @@
 
 
 def extract_domains(content):
-    domains_list = []
+    domains_set = set()
     for line in content.splitlines():
 
-        # Skip comments
-        if line.startswith("!"):
+        # Skip comments and regexes
+        if line.startswith("!") or line.startswith("/"):
             continue
 
-        # Remove the '||' prefix and '^' suffix
-        domain = line.strip().lstrip("||").rstrip("^")
+        # Remove the '||' prefix and '^.*' suffix
+        domain = line.strip().lstrip("||").split('^')[0]
 
         # Ensure the domain is not empty
         if domain:
-            domains_list.append(domain)
+            domains_set.add(domain)
 
-    return domains_list
+    return domains_set
 
 
 # URL to the text file containing the regex patterns
@@ -32,7 +32,7 @@ def extract_domains(content):
 domains = extract_domains(response.text)
 
 # Create a DataFrame from the list of domains
-df = pd.DataFrame(domains, columns=["Domain"])
+df = pd.DataFrame(domains, columns=["Domain"]).sort_values("Domain").reset_index(drop=True)
 
 write_to_bq(
     df,