Commit
add comments
kouloumos committed Apr 26, 2024
1 parent eadf4c9 commit 0d7cf28
Showing 8 changed files with 123 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/homepage_json_gen_cron_job.yml
@@ -2,7 +2,7 @@ name: Daily Python Homepage Update Script Execution

on:
schedule:
- cron: "0 3 * * *"
- cron: "0 3 * * *" # every day at 03:00 AM UTC
workflow_dispatch:
repository_dispatch:

2 changes: 1 addition & 1 deletion .github/workflows/push_combined_summary_to_es_cron_job.yml
@@ -2,7 +2,7 @@ name: Daily Push Combined Summary From XML Files to ES INDEX

on:
schedule:
- cron: "30 2 * * *"
- cron: "30 2 * * *" # every day at 02:30 AM UTC
workflow_dispatch:
repository_dispatch:

@@ -2,7 +2,7 @@ name: Daily Push Summary From XML Files to ES INDEX

on:
schedule:
- cron: "0 2 * * *"
- cron: "0 2 * * *" # every day at 02:00 AM UTC
workflow_dispatch:
repository_dispatch:

2 changes: 1 addition & 1 deletion .github/workflows/xmls_gen_cron_job.yml
@@ -1,7 +1,7 @@
name: Daily XML Generation Script
on:
schedule:
- cron: "0 1 * * *"
- cron: "0 1 * * *" # every day at 01:00 AM UTC
workflow_dispatch:
repository_dispatch:

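All four workflows use the same five-field cron syntax (minute, hour, day of month, month, day of week), staggered so the pipeline stages run in sequence: XML generation at 01:00, the two Elasticsearch pushes at 02:00 and 02:30, and the homepage update at 03:00 UTC. A quick way to sanity-check such an expression is sketched below; it uses the third-party croniter package, which is assumed here purely for illustration and is not a dependency of this repository.

```python
# Sanity-check a GitHub Actions cron schedule with the third-party
# `croniter` package (illustrative only; not a dependency of this repo).
from datetime import datetime, timezone

from croniter import croniter

schedule = "0 1 * * *"  # minute=0, hour=1, every day -> 01:00 UTC daily
it = croniter(schedule, datetime(2024, 4, 26, tzinfo=timezone.utc))
for _ in range(3):
    print(it.get_next(datetime))
# 2024-04-26 01:00:00+00:00
# 2024-04-27 01:00:00+00:00
# 2024-04-28 01:00:00+00:00
```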
4 changes: 3 additions & 1 deletion generate_homepage_xml.py
@@ -145,7 +145,7 @@ def page_data_handling(data_list: list, get_unique_per_dev=False):
counts, contributors = elastic_search.es_fetch_contributors_and_threads(
es_index=ES_INDEX, title=title, domain=dev_url
)

+ # exclude the post authors
for author in data['_source']['authors']:
contributors.remove(author)
data['_source']['n_threads'] = counts
@@ -167,6 +167,7 @@ def page_data_handling(data_list: list, get_unique_per_dev=False):
counts, contributors = elastic_search.es_fetch_contributors_and_threads(
es_index=ES_INDEX, title=title, domain=dev_url
)
+ # exclude the post authors
for author in data['_source']['authors']:
contributors.remove(author)
data['_source']['n_threads'] = counts
@@ -230,6 +231,7 @@ def page_data_handling(data_list: list, get_unique_per_dev=False):
continue

if contributors:
+ # exclude the post authors
for author in doc['_source']['authors']:
contributors.remove(author)
doc['_source']['n_threads'] = counts
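Each comment added above marks the same pattern: `es_fetch_contributors_and_threads` returns every author who posted in the thread, and the post's own authors are then removed so that only other participants count as contributors. Since `list.remove` raises `ValueError` for a missing element, a defensive variant of the pattern can be sketched as follows (the helper name and the set-difference approach are illustrative, not part of this codebase):

```python
def exclude_post_authors(contributors: list[str], authors: list[str]) -> list[str]:
    """Drop a post's own authors from its contributor list.

    Mirrors the `contributors.remove(author)` loops commented in this commit,
    but silently skips authors that are absent instead of raising ValueError.
    """
    author_set = set(authors)
    return [name for name in contributors if name not in author_set]

# The original poster does not count as a contributor to their own thread.
print(exclude_post_authors(["alice", "bob", "carol"], ["alice"]))  # ['bob', 'carol']
```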
2 changes: 2 additions & 0 deletions generate_weekly_newsletter_json.py
@@ -57,6 +57,8 @@
logger.success(f"TOTAL THREADS RECEIVED FOR '{dev_name}': {len(data_list)}")

# NEW THREADS POSTS
+ # @TODO you already identify the original post by type==original_post
+ # so you could get the posts in order by date and check if the original post is there
seen_titles = set()
for i in data_list:
this_title = i['_source']['title']
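The @TODO added here points at a simpler design than deduplicating titles with a `seen_titles` set: since each document already identifies the thread starter via its `type` field, new threads could be selected directly. A sketch of that suggested refactor is below; the `created_at` field name is an assumption for illustration, while `type == 'original_post'` comes from the TODO itself.

```python
# Sketch of the refactor proposed by the @TODO above: select thread-starting
# posts by their `type` field rather than tracking seen titles.
# `created_at` is an assumed field name, used here only for illustration.
def new_threads(data_list: list[dict]) -> list[dict]:
    posts = sorted(data_list, key=lambda d: d['_source']['created_at'])
    return [p for p in posts if p['_source'].get('type') == 'original_post']
```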
24 changes: 20 additions & 4 deletions src/elasticsearch_utils.py
@@ -127,12 +127,20 @@ def fetch_data_based_on_title(self, es_index, title, url):

def extract_data_from_es(self, es_index, url, start_date_str, current_date_str,
exclude_combined_summary_docs=False):
"""
Fetches and extracts documents from a specified Elasticsearch index based on URL,
date range, and an optional exclusion flag for combined summary documents.
The method returns a list of documents that match the query criteria.
"""
output_list = []
start_time = time.time()

if self._es_client.ping():
logger.success("connected to the ElasticSearch")

+ # Construct a search query to filter documents by domain,
+ # date range (start to end date) and optionally exclude
+ # 'combined-summary' documents
domain_query = self.get_domain_query(url)

if exclude_combined_summary_docs:
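The comment added above summarizes the query this method assembles: match the domain, bound the results to a date range, and, when the flag is set, filter out combined-summary documents. The shape of such a bool query in Elasticsearch's DSL looks roughly like the sketch below; the field names (`domain.keyword`, `created_at`, `type.keyword`) and values are assumptions for illustration, not necessarily this repository's exact mapping.

```python
# Illustrative shape of the query the comment above describes. Field names
# are assumptions, not necessarily this repository's exact mapping.
query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"domain.keyword": "https://example.com/list"}},
                {"range": {"created_at": {"gte": "2024-03-26", "lte": "2024-04-26"}}},
            ],
            # Applied only when exclude_combined_summary_docs is True.
            "must_not": [
                {"term": {"type.keyword": "combined_summary"}},
            ],
        }
    }
}
```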
@@ -228,7 +236,7 @@ def filter_top_recent_posts(self, es_results, top_n):
def filter_top_active_posts(self, es_results, top_n):
unique_results = []

- thread_dict = {}
+ thread_dict = {} # maps post titles to their respective activity levels
# create dictionary with title as key and thread count as value
for result in es_results:
title = result['_source']['title']
@@ -238,6 +246,7 @@
domain=result['_source']['domain']
)
result['_source']['n_threads'] = counts
+ # exclude the post authors
for author in result['_source']['authors']:
contributors.remove(author)
result['_source']['n_threads'] = counts
@@ -246,7 +255,8 @@
# add counts as value to thread_dict with a key as title
thread_dict[title] = counts

- # Use the dictionary created above, to sort the results
+ # Use the dictionary created above to sort the results
+ # posts with a higher thread count are placed at the top
es_results_sorted = sorted(
es_results,
key=lambda x: thread_dict[x['_source']['title']], reverse=True
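The two comments added here make the ranking explicit: `thread_dict` maps each title to its reply count, and the `sorted(..., reverse=True)` call then places the most active threads first. In miniature, with illustrative data:

```python
# Miniature of the ranking commented above: map titles to activity,
# then sort so the busiest threads come first (illustrative data).
results = [{"title": "taproot"}, {"title": "covenants"}, {"title": "utreexo"}]
thread_dict = {"taproot": 3, "covenants": 11, "utreexo": 7}

ranked = sorted(results, key=lambda r: thread_dict[r["title"]], reverse=True)
print([r["title"] for r in ranked])  # ['covenants', 'utreexo', 'taproot']
```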
@@ -370,9 +380,13 @@ def get_earliest_posts_by_title(self, es_index, url, title):
return earliest_post

def es_fetch_contributors_and_threads(self, es_index, title, domain):
"""
Fetches the count of threads and unique contributors for a given post based on title and domain
"""
# The search query
domain_query = self.get_domain_query(domain)
query = {
"size": 0,
"size": 0, # no search hits are returned, the focus is solely on the aggregations and counts
"query": {
"bool": {
"must": [
@@ -381,6 +395,7 @@ def es_fetch_contributors_and_threads(self, es_index, title, domain):
]
}
},
+ # count unique authors associated with the matching documents
"aggs": {
"authors_list": {
"terms": {
@@ -394,6 +409,7 @@
response = self._es_client.search(index=es_index, body=query)
counts = response['hits']['total']['value']
if int(counts) > 0:
+ # exclude original post
counts = int(counts) - 1
contributors = [author['key'] for author in response['aggregations']['authors_list']['buckets']]
return counts, contributors
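The comments added in this method capture its two subtleties: `"size": 0` tells Elasticsearch to return no document hits, only the total count and the aggregations, and the count is then decremented so the original post itself is not tallied as a reply. A miniature of that response handling, using Elasticsearch's standard terms-aggregation response shape (the sample values are illustrative):

```python
# Miniature of the response handling above, with a mocked response in
# Elasticsearch's standard terms-aggregation shape (values illustrative).
response = {
    "hits": {"total": {"value": 4}},
    "aggregations": {
        "authors_list": {"buckets": [{"key": "alice"}, {"key": "bob"}]}
    },
}

counts = response["hits"]["total"]["value"]
if int(counts) > 0:
    counts = int(counts) - 1  # exclude the original post from the thread count
contributors = [a["key"] for a in response["aggregations"]["authors_list"]["buckets"]]
print(counts, contributors)  # 3 ['alice', 'bob']
```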
@@ -427,7 +443,7 @@ def fetch_data_in_date_range(self, es_index, start_date, end_date, domain):
return selected_threads

def fetch_data_with_empty_summary(self, es_index, url=None, start_date_str=None, current_date_str=None):
logger.info(f"connecting ElasticSearch to fetch the docs with summary ... ")
logger.info(f"connecting ElasticSearch to fetch the docs with no summary ... ")
output_list = []
start_time = time.time()
