From fe315181729979c1519371eef5a9c4dbbd61abcd Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Tue, 1 Oct 2024 20:03:40 +0200 Subject: [PATCH] domain suffixes and regexes removed --- sql/util/populate_easylist_adserver.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sql/util/populate_easylist_adserver.py b/sql/util/populate_easylist_adserver.py index 70e58d4bf87..e048d16208a 100644 --- a/sql/util/populate_easylist_adserver.py +++ b/sql/util/populate_easylist_adserver.py @@ -5,21 +5,21 @@ def extract_domains(content): - domains_list = [] + domains_set = set() for line in content.splitlines(): - # Skip comments - if line.startswith("!"): + # Skip comments and regexes + if line.startswith("!") or line.startswith("/"): continue - # Remove the '||' prefix and '^' suffix - domain = line.strip().lstrip("||").rstrip("^") + # Remove the '||' prefix and '^.*' suffix + domain = line.strip().lstrip("||").split('^')[0] # Ensure the domain is not empty if domain: - domains_list.append(domain) + domains_set.add(domain) - return domains_list + return domains_set # URL to the text file containing the regex patterns @@ -32,7 +32,7 @@ def extract_domains(content): domains = extract_domains(response.text) # Create a DataFrame from the list of domains -df = pd.DataFrame(domains, columns=["Domain"]) +df = pd.DataFrame(domains, columns=["Domain"]).sort_values("Domain").reset_index(drop=True) write_to_bq( df,