Commit
add comments
kouloumos committed Apr 26, 2024
1 parent eadf4c9 commit 0d7cf28
Showing 8 changed files with 123 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/homepage_json_gen_cron_job.yml
@@ -2,7 +2,7 @@ name: Daily Python Homepage Update Script Execution

on:
schedule:
- cron: "0 3 * * *"
- cron: "0 3 * * *" # every day at 03:00 AM UTC
workflow_dispatch:
repository_dispatch:

2 changes: 1 addition & 1 deletion .github/workflows/push_combined_summary_to_es_cron_job.yml
@@ -2,7 +2,7 @@ name: Daily Push Combined Summary From XML Files to ES INDEX

on:
schedule:
- cron: "30 2 * * *"
- cron: "30 2 * * *" # every day at 02:30 AM UTC
workflow_dispatch:
repository_dispatch:

@@ -2,7 +2,7 @@ name: Daily Push Summary From XML Files to ES INDEX

on:
schedule:
- cron: "0 2 * * *"
- cron: "0 2 * * *" # every day at 02:00 AM UTC
workflow_dispatch:
repository_dispatch:

2 changes: 1 addition & 1 deletion .github/workflows/xmls_gen_cron_job.yml
@@ -1,7 +1,7 @@
name: Daily XML Generation Script
on:
schedule:
- cron: "0 1 * * *"
- cron: "0 1 * * *" # every day at 01:00 AM UTC
workflow_dispatch:
repository_dispatch:

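All four workflows use the same five-field cron syntax (minute, hour, day of month, month, day of week), staggered so the pipeline stages run in sequence: XML generation at 01:00, the two Elasticsearch pushes at 02:00 and 02:30, and the homepage update at 03:00 UTC. A quick way to sanity-check such an expression is sketched below; it uses the third-party croniter package, which is assumed here purely for illustration and is not a dependency of this repository.

```python
# Sanity-check a GitHub Actions cron schedule with the third-party
# `croniter` package (illustrative only; not a dependency of this repo).
from datetime import datetime, timezone

from croniter import croniter

schedule = "0 1 * * *"  # minute=0, hour=1, every day -> 01:00 UTC daily
it = croniter(schedule, datetime(2024, 4, 26, tzinfo=timezone.utc))
for _ in range(3):
    print(it.get_next(datetime))
# 2024-04-26 01:00:00+00:00
# 2024-04-27 01:00:00+00:00
# 2024-04-28 01:00:00+00:00
```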
4 changes: 3 additions & 1 deletion generate_homepage_xml.py
@@ -145,7 +145,7 @@ def page_data_handling(data_list: list, get_unique_per_dev=False):
counts, contributors = elastic_search.es_fetch_contributors_and_threads(
es_index=ES_INDEX, title=title, domain=dev_url
)

+ # exclude the post authors
for author in data['_source']['authors']:
contributors.remove(author)
data['_source']['n_threads'] = counts
@@ -167,6 +167,7 @@ def page_data_handling(data_list: list, get_unique_per_dev=False):
counts, contributors = elastic_search.es_fetch_contributors_and_threads(
es_index=ES_INDEX, title=title, domain=dev_url
)
+ # exclude the post authors
for author in data['_source']['authors']:
contributors.remove(author)
data['_source']['n_threads'] = counts
@@ -230,6 +231,7 @@ def page_data_handling(data_list: list, get_unique_per_dev=False):
continue

if contributors:
+ # exclude the post authors
for author in doc['_source']['authors']:
contributors.remove(author)
doc['_source']['n_threads'] = counts
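Each comment added above marks the same pattern: `es_fetch_contributors_and_threads` returns every author who posted in the thread, and the post's own authors are then removed so that only other participants count as contributors. Since `list.remove` raises `ValueError` for a missing element, a defensive variant of the pattern can be sketched as follows (the helper name and the set-difference approach are illustrative, not part of this codebase):

```python
def exclude_post_authors(contributors: list[str], authors: list[str]) -> list[str]:
    """Drop a post's own authors from its contributor list.

    Mirrors the `contributors.remove(author)` loops commented in this commit,
    but silently skips authors that are absent instead of raising ValueError.
    """
    author_set = set(authors)
    return [name for name in contributors if name not in author_set]

# The original poster does not count as a contributor to their own thread.
print(exclude_post_authors(["alice", "bob", "carol"], ["alice"]))  # ['bob', 'carol']
```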
2 changes: 2 additions & 0 deletions generate_weekly_newsletter_json.py
@@ -57,6 +57,8 @@
logger.success(f"TOTAL THREADS RECEIVED FOR '{dev_name}': {len(data_list)}")

# NEW THREADS POSTS
+ # @TODO you already identify the original post by type==original_post
+ # so you could get the posts in order by date and check if the original post is there
seen_titles = set()
for i in data_list:
this_title = i['_source']['title']
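The @TODO added here points at a simpler design than deduplicating titles with a `seen_titles` set: since each document already identifies the thread starter via its `type` field, new threads could be selected directly. A sketch of that suggested refactor is below; the `created_at` field name is an assumption for illustration, while `type == 'original_post'` comes from the TODO itself.

```python
# Sketch of the refactor proposed by the @TODO above: select thread-starting
# posts by their `type` field rather than tracking seen titles.
# `created_at` is an assumed field name, used here only for illustration.
def new_threads(data_list: list[dict]) -> list[dict]:
    posts = sorted(data_list, key=lambda d: d['_source']['created_at'])
    return [p for p in posts if p['_source'].get('type') == 'original_post']
```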
24 changes: 20 additions & 4 deletions src/elasticsearch_utils.py
@@ -127,12 +127,20 @@ def fetch_data_based_on_title(self, es_index, title, url):

def extract_data_from_es(self, es_index, url, start_date_str, current_date_str,
exclude_combined_summary_docs=False):
"""
Fetches and extracts documents from a specified Elasticsearch index based on URL,
date range, and an optional exclusion flag for combined summary documents.
The method returns a list of documents that match the query criteria.
"""
output_list = []
start_time = time.time()

if self._es_client.ping():
logger.success("connected to the ElasticSearch")

+ # Construct a search query to filter documents by domain,
+ # date range (start to end date) and optionally exclude
+ # 'combined-summary' documents
domain_query = self.get_domain_query(url)

if exclude_combined_summary_docs:
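The comment added above summarizes the query this method assembles: match the domain, bound the results to a date range, and, when the flag is set, filter out combined-summary documents. The shape of such a bool query in Elasticsearch's DSL looks roughly like the sketch below; the field names (`domain.keyword`, `created_at`, `type.keyword`) and values are assumptions for illustration, not necessarily this repository's exact mapping.

```python
# Illustrative shape of the query the comment above describes. Field names
# are assumptions, not necessarily this repository's exact mapping.
query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"domain.keyword": "https://example.com/list"}},
                {"range": {"created_at": {"gte": "2024-03-26", "lte": "2024-04-26"}}},
            ],
            # Applied only when exclude_combined_summary_docs is True.
            "must_not": [
                {"term": {"type.keyword": "combined_summary"}},
            ],
        }
    }
}
```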
@@ -228,7 +236,7 @@ def filter_top_recent_posts(self, es_results, top_n):
def filter_top_active_posts(self, es_results, top_n):
unique_results = []

- thread_dict = {}
+ thread_dict = {} # maps post titles to their respective activity levels
# create dictionary with title as key and thread count as value
for result in es_results:
title = result['_source']['title']
@@ -238,6 +246,7 @@
domain=result['_source']['domain']
)
result['_source']['n_threads'] = counts
+ # exclude the post authors
for author in result['_source']['authors']:
contributors.remove(author)
result['_source']['n_threads'] = counts
@@ -246,7 +255,8 @@
# add counts as value to thread_dict with a key as title
thread_dict[title] = counts

- # Use the dictionary created above, to sort the results
+ # Use the dictionary created above to sort the results
+ # posts with a higher thread count are placed at the top
es_results_sorted = sorted(
es_results,
key=lambda x: thread_dict[x['_source']['title']], reverse=True
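The two comments added here make the ranking explicit: `thread_dict` maps each title to its reply count, and the `sorted(..., reverse=True)` call then places the most active threads first. In miniature, with illustrative data:

```python
# Miniature of the ranking commented above: map titles to activity,
# then sort so the busiest threads come first (illustrative data).
results = [{"title": "taproot"}, {"title": "covenants"}, {"title": "utreexo"}]
thread_dict = {"taproot": 3, "covenants": 11, "utreexo": 7}

ranked = sorted(results, key=lambda r: thread_dict[r["title"]], reverse=True)
print([r["title"] for r in ranked])  # ['covenants', 'utreexo', 'taproot']
```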
@@ -370,9 +380,13 @@ def get_earliest_posts_by_title(self, es_index, url, title):
return earliest_post

def es_fetch_contributors_and_threads(self, es_index, title, domain):
"""
Fetches the count of threads and unique contributors for a given post based on title and domain
"""
# The search query
domain_query = self.get_domain_query(domain)
query = {
"size": 0,
"size": 0, # no search hits are returned, the focus is solely on the aggregations and counts
"query": {
"bool": {
"must": [
@@ -381,6 +395,7 @@ def es_fetch_contributors_and_threads(self, es_index, title, domain):
]
}
},
+ # count unique authors associated with the matching documents
"aggs": {
"authors_list": {
"terms": {
@@ -394,6 +409,7 @@
response = self._es_client.search(index=es_index, body=query)
counts = response['hits']['total']['value']
if int(counts) > 0:
+ # exclude original post
counts = int(counts) - 1
contributors = [author['key'] for author in response['aggregations']['authors_list']['buckets']]
return counts, contributors
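The comments added in this method capture its two subtleties: `"size": 0` tells Elasticsearch to return no document hits, only the total count and the aggregations, and the count is then decremented so the original post itself is not tallied as a reply. A miniature of that response handling, using Elasticsearch's standard terms-aggregation response shape (the sample values are illustrative):

```python
# Miniature of the response handling above, with a mocked response in
# Elasticsearch's standard terms-aggregation shape (values illustrative).
response = {
    "hits": {"total": {"value": 4}},
    "aggregations": {
        "authors_list": {"buckets": [{"key": "alice"}, {"key": "bob"}]}
    },
}

counts = response["hits"]["total"]["value"]
if int(counts) > 0:
    counts = int(counts) - 1  # exclude the original post from the thread count
contributors = [a["key"] for a in response["aggregations"]["authors_list"]["buckets"]]
print(counts, contributors)  # 3 ['alice', 'bob']
```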
@@ -427,7 +443,7 @@ def fetch_data_in_date_range(self, es_index, start_date, end_date, domain):
return selected_threads

def fetch_data_with_empty_summary(self, es_index, url=None, start_date_str=None, current_date_str=None):
logger.info(f"connecting ElasticSearch to fetch the docs with summary ... ")
logger.info(f"connecting ElasticSearch to fetch the docs with no summary ... ")
output_list = []
start_time = time.time()
