Skip to content

Commit

Permalink
domain suffixes and regexes removed
Browse files (browse the repository at this point in the history)
  • Loading branch information
max-ostapenko committed Oct 1, 2024
1 parent ac6e895 commit fe31518
Showing 1 changed file with 8 additions and 8 deletions.
16 changes: 8 additions & 8 deletions sql/util/populate_easylist_adserver.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,21 +5,21 @@


def extract_domains(content):
domains_list = []
domains_set = set()
for line in content.splitlines():

# Skip comments
if line.startswith("!"):
# Skip comments and regexes
if line.startswith("!") or line.startswith("/"):
continue

# Remove the '||' prefix and '^' suffix
domain = line.strip().lstrip("||").rstrip("^")
# Remove the '||' prefix and '^.*' suffix
domain = line.strip().lstrip("||").split('^')[0]

# Ensure the domain is not empty
if domain:
domains_list.append(domain)
domains_set.add(domain)

return domains_list
return domains_set


# URL to the text file containing the regex patterns
Expand All @@ -32,7 +32,7 @@ def extract_domains(content):
domains = extract_domains(response.text)

# Create a DataFrame from the list of domains
df = pd.DataFrame(domains, columns=["Domain"])
df = pd.DataFrame(domains, columns=["Domain"]).sort_values("Domain").reset_index(drop=True)

write_to_bq(
df,
Expand Down

0 comments on commit fe31518

Please sign in to comment.