From fe315181729979c1519371eef5a9c4dbbd61abcd Mon Sep 17 00:00:00 2001
From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com>
Date: Tue, 1 Oct 2024 20:03:40 +0200
Subject: [PATCH] Skip regex rules and strip '^...' suffixes from domains

---
 sql/util/populate_easylist_adserver.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/sql/util/populate_easylist_adserver.py b/sql/util/populate_easylist_adserver.py
index 70e58d4bf87..e048d16208a 100644
--- a/sql/util/populate_easylist_adserver.py
+++ b/sql/util/populate_easylist_adserver.py
@@ -5,21 +5,21 @@
 
 
 def extract_domains(content):
-    domains_list = []
+    domains_set = set()
     for line in content.splitlines():
 
-        # Skip comments
-        if line.startswith("!"):
+        # Skip comments and regexes
+        if line.startswith("!") or line.startswith("/"):
             continue
 
-        # Remove the '||' prefix and '^' suffix
-        domain = line.strip().lstrip("||").rstrip("^")
+        # Remove the '||' prefix and '^.*' suffix
+        domain = line.strip().lstrip("||").split('^')[0]
 
         # Ensure the domain is not empty
         if domain:
-            domains_list.append(domain)
+            domains_set.add(domain)
 
-    return domains_list
+    return domains_set
 
 
 # URL to the text file containing the regex patterns
@@ -32,7 +32,7 @@ def extract_domains(content):
 domains = extract_domains(response.text)
 
 # Create a DataFrame from the list of domains
-df = pd.DataFrame(domains, columns=["Domain"])
+df = pd.DataFrame(domains, columns=["Domain"]).sort_values("Domain").reset_index(drop=True)
 
 write_to_bq(
     df,