Enhanced code and added comments
urvishp80 committed Apr 26, 2024
1 parent e5445b7 commit 87be4b8
Showing 9 changed files with 541 additions and 506 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/homepage_json_gen_cron_job.yml
@@ -37,7 +37,7 @@ jobs:
pip install -r requirements.txt
- name: Execute Python script
-run: python generate_homepage_xml.py
+run: python generate_homepage_json.py

- name: Configure Git
run: |
2 changes: 1 addition & 1 deletion README.md
@@ -16,7 +16,7 @@ Utilizing data collected by the [scraper](https://github.com/bitcoinsearch/scraper),
- Queries Elasticsearch for documents lacking summaries, extracts summaries from corresponding XML files, and then updates these documents with their summaries in the Elasticsearch index.
3. Daily [Push Combined Summary From XML Files to ES INDEX](.github/workflows/push_combined_summary_to_es_cron_job.yml) ([source](push_combined_summary_to_es.py))
- Processes each combined thread summary XML file, transforming it into a document format, checks for its existence in Elasticsearch, and updates or inserts the document as needed.
-4. Daily [Python Homepage Update Script](.github/workflows/homepage_json_gen_cron_job.yml) ([source](generate_homepage_xml.py))
+4. Daily [Python Homepage Update Script](.github/workflows/homepage_json_gen_cron_job.yml) ([source](generate_homepage_json.py))
- Queries the last 7 days of data from Elasticsearch for each source to compile lists of active threads, recent threads, and historical threads for 'Today in History'. It generates a summary of recent threads if available; otherwise, for active threads. The resulting [`homepage.json`](static/homepage.json) is then committed to GitHub to be used by [Bitcoin TLDR](https://github.com/bitcoinsearch/tldr).
5. Weekly [Python Newsletter Generation Script](.github/workflows/weekly_newsletter_gen_cron_job.yml) ([source](generate_weekly_newsletter_json.py))
- Generates a newsletter by compiling lists of new and active threads from the past week's data for each source. It generates a summary of new threads if available; otherwise, for active threads. The resulting [`newsletter.json`](static/newsletters/newsletter.json) is then committed to GitHub to be used by [Bitcoin TLDR](https://github.com/bitcoinsearch/tldr).
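
All of these jobs share one pattern: query Elasticsearch for a window of recent documents per source, then post-process the hits into JSON or XML. A minimal sketch of that first step is below; the index name, field names, and client setup are assumptions for illustration, since the repository wraps them in its own ElasticSearchClient class.

```python
# A rough sketch of the "recent documents per source" query these jobs run.
# Index name, field names, and client setup are illustrative assumptions.
from datetime import datetime, timedelta

from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://localhost:9200")  # assumed local instance

end = datetime.now().strftime("%Y-%m-%d")
start = (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d")

query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"domain.keyword": "https://delvingbitcoin.org/"}},
                {"range": {"created_at": {"gte": start, "lte": end}}},
            ]
        }
    }
}
response = es_client.search(index="summaries", body=query, size=100)
print(f"{response['hits']['total']['value']} documents in the last 7 days")
```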
764 changes: 392 additions & 372 deletions generate_homepage_xml.py → generate_homepage_json.py

Large diffs are not rendered by default.

103 changes: 49 additions & 54 deletions generate_weekly_newsletter_json.py
@@ -1,92 +1,89 @@
+import json
+import os
+import sys
import time
import traceback
from datetime import datetime, timedelta

from loguru import logger
-import os
-import sys
-import json
from tqdm import tqdm

from src.config import ES_INDEX
from src.elasticsearch_utils import ElasticSearchClient
from src.json_utils import GenerateJSON
-from src.xml_utils import GenerateXML
from src.utils import month_dict
+from src.xml_utils import GenerateXML

if __name__ == "__main__":

-gen = GenerateJSON()
+# Instantiating objects for generating JSON, XML and connecting to ElasticSearch
+json_gen = GenerateJSON()
xml_gen = GenerateXML()
elastic_search = ElasticSearchClient()

+# URLs for development mailing lists and forums
dev_urls = [
["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/",
"https://gnusha.org/pi/bitcoindev/"],
"https://lists.linuxfoundation.org/pipermail/lightning-dev/",
"https://delvingbitcoin.org/"
]

+# Set the date range for data extraction: last week to yesterday.
current_date = datetime.now()
-current_date_str = current_date.strftime("%Y-%m-%d")

start_date = current_date - timedelta(days=7)
-start_date_str = start_date.strftime("%Y-%m-%d")

end_date = current_date - timedelta(days=1)

+current_date_str = current_date.strftime("%Y-%m-%d")
+start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")

logger.info(f"Newsletter publish date: {current_date_str}")
logger.info(f"Gathering data for newsletter from {start_date_str} to {end_date_str}")

+# Convert month from number to name for filename construction
month_name = month_dict[int(current_date.month)]
str_month_year = f"{month_name}_{int(current_date.year)}"

active_data_list = []
new_threads_list = []

+# Process each URL in the dev_urls list
for dev_url in dev_urls:

data_list = elastic_search.extract_data_from_es(
ES_INDEX, dev_url, start_date_str, end_date_str, exclude_combined_summary_docs=True
)

-if isinstance(dev_url, list):
-dev_name = dev_url[0].split("/")[-2]
-else:
-dev_name = dev_url.split("/")[-2]

-logger.success(f"TOTAL THREADS RECEIVED FOR '{dev_name}': {len(data_list)}")
+dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2]
+logger.success(f"Retrieved {len(data_list)} threads for {dev_name}")

# NEW THREADS POSTS
-# @TODO you already identify the original post by type==original_post
-# so you could get the posts in order by date and check if the original posts is there
seen_titles = set()
for i in data_list:
this_title = i['_source']['title']
if this_title in seen_titles:
continue
seen_titles.add(this_title)

-# check if the first post for this title is in the past week
-original_post = elastic_search.get_earliest_posts_by_title(es_index=ES_INDEX, url=dev_url, title=this_title)

-if original_post['_source'] and i['_source']['created_at'] == original_post['_source']['created_at']:
-logger.success(f"new thread created on: {original_post['_source']['created_at']} || TITLE: {this_title}")
+# Check if any new thread started in given week
+if i['_source']['type'] == 'original_post':
+logger.success(f"New thread created on: {i['_source']['created_at']} || TITLE: {this_title}")

counts, contributors = elastic_search.es_fetch_contributors_and_threads(
es_index=ES_INDEX, title=this_title, domain=dev_url
)

+# Separate an original author and contributors
for author in i['_source']['authors']:
contributors.remove(author)
i['_source']['n_threads'] = counts
i['_source']['contributors'] = contributors
i['_source']['dev_name'] = dev_name
new_threads_list.append(i)
logger.info(f"number of new threads started this week: {len(new_threads_list)}")
logger.info(f"No. of new threads started this week: {len(new_threads_list)}")

# TOP ACTIVE POSTS
active_posts_data = elastic_search.filter_top_active_posts(es_results=data_list, top_n=15)
logger.info(f"number of filtered top active post: {len(active_posts_data)}")
logger.info(f"No. of filtered top active post: {len(active_posts_data)}")

new_threads_titles_list = [i['_source']['title'] for i in new_threads_list]

@@ -103,14 +100,15 @@
seen_titles.add(title)
active_data_list.append(data)
# active_posts_data_counter += 1
logger.info(f"number of active posts collected: {len(active_data_list)}")
logger.info(f"No. of active posts collected: {len(active_data_list)}")

-# gather titles of docs from json file
+# Determine if there's any update in the data compared to stored JSON
+# Gather titles from stored JSON file
json_file_path = fr"static/newsletters/newsletter.json"

current_directory = os.getcwd()
json_full_path = os.path.join(current_directory, json_file_path)
-json_xml_ids = set()
+stored_json_titles = set()
if os.path.exists(json_full_path):
with open(json_full_path, 'r') as j:
try:
@@ -119,22 +117,22 @@
logger.info(f"Error reading json file:{json_full_path} :: {e}")
json_data = {}

-json_xml_ids = set(
+stored_json_titles = set(
[item['title'] for item in json_data.get('new_threads_this_week', [])] +
[item['title'] for item in json_data.get('active_posts_this_week', [])]
)
else:
logger.warning(f"No existing newsletter.json file found: {json_full_path}")

-# gather ids of docs from active posts and new thread posts
-filtered_docs_ids = set(
+# Gather titles from collected Active data and New Threads list
+collected_json_titles = set(
[data['_source']['title'] for data in active_data_list] +
[data['_source']['title'] for data in new_threads_list]
)

-# check if there are any updates in the xml file
-if filtered_docs_ids != json_xml_ids:
-logger.info("changes found in recent posts ... ")
+# Generate a new newsletter.json file if changes found in stored JSON file
+if collected_json_titles != stored_json_titles:
+logger.info("Changes found as compared to previously stored JSON file... ")

delay = 5
count = 0
@@ -144,23 +142,21 @@
logger.success(f"Total no. of active posts collected: {len(active_data_list)}")
logger.success(f"Total no. of new threads started this week: {len(new_threads_list)}")

logger.info("creating newsletter.json file ... ")
logger.info("Creating newsletter.json file ... ")
if len(active_data_list) > 0 or len(new_threads_list) > 0:

+# Prepare New Threads data for newsletter
new_threads_page_data = []
-active_page_data = []
new_threads_summary = ""

if new_threads_list:
-new_threads_summary += gen.generate_recent_posts_summary(new_threads_list, verbose=True)
+new_threads_summary += json_gen.generate_recent_posts_summary(new_threads_list, verbose=True)
logger.success(new_threads_summary)

for data in tqdm(new_threads_list):
try:
-# check and generate any missing file
+# Generate all XML files for given title, if not present
xml_gen.start(dict_data=[data], url=data['_source']['domain'])

-entry_data = gen.create_single_entry(
+entry_data = json_gen.create_single_entry(
data,
base_url_for_xml="https://tldr.bitcoinsearch.xyz/summary",
look_for_combined_summary=True,
@@ -173,16 +169,17 @@
else:
logger.warning(f"No new threads started this week, generating summary of active posts this "
f"week ...")
-# if no new threads started this week, generate summary from active post this week
-new_threads_summary += gen.generate_recent_posts_summary(active_data_list)
+# If no new threads started this week, generate summary from active posts of the given week
+new_threads_summary += json_gen.generate_recent_posts_summary(active_data_list)
logger.success(new_threads_summary)

+# Prepare active posts data for newsletter
+active_page_data = []
for data in tqdm(active_data_list):
try:
-# check and generate any missing file
+# Generate all XML files for given title, if not present
xml_gen.start(dict_data=[data], url=data['_source']['domain'])

-entry_data = gen.create_single_entry(
+entry_data = json_gen.create_single_entry(
data, base_url_for_xml="https://tldr.bitcoinsearch.xyz/summary",
look_for_combined_summary=True, remove_xml_extension=True
)
@@ -191,19 +188,17 @@
logger.error(
f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}")

+# Compile and save data for newsletter file
json_string = {
"summary_of_threads_started_this_week": new_threads_summary,
"new_threads_this_week": new_threads_page_data,
"active_posts_this_week": active_page_data
}
-gen.write_json_file(json_string, json_file_path)

+json_gen.write_json_file(json_string, json_file_path)
archive_json_file_path = fr"static/newsletters/{str_month_year}/{current_date_str}-newsletter.json"
-gen.store_file_in_archive(json_file_path, archive_json_file_path)

+json_gen.store_file_in_archive(json_file_path, archive_json_file_path)
else:
logger.error(f"Data list empty! Please check the data again.")

break
except Exception as ex:
logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}")
@@ -212,8 +207,8 @@
if count > 1:
sys.exit(f"{ex}")
else:
+# If no changes found in stored JSON file, save the previous one with updated name in the archive directory
logger.success("No change in the posts, no need to update newsletter.json file")
-# save the previous one with updated name in archive
if os.path.exists(json_full_path):
archive_json_file_path = fr"static/newsletters/{str_month_year}/{current_date_str}-newsletter.json"
-gen.store_file_in_archive(json_file_path, archive_json_file_path)
+json_gen.store_file_in_archive(json_file_path, archive_json_file_path)
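
Beyond the gen to json_gen rename, the substantive change in this file is new-thread detection: the per-title get_earliest_posts_by_title round trip is dropped in favor of the indexed type field, as the removed @TODO suggested. A minimal sketch of the new check, using hypothetical hit dicts:

```python
# Hypothetical ES hits; only the fields the check uses are shown.
hits = [
    {"_source": {"title": "Example thread", "type": "original_post",
                 "created_at": "2024-04-22"}},
    {"_source": {"title": "Example thread", "type": "reply",
                 "created_at": "2024-04-23"}},
]

# Old approach (per the removed lines): an extra query per title to find the
# earliest post, then a timestamp comparison against the current hit.
# New approach: the index already tags thread openers, so a local check suffices.
seen_titles = set()
new_threads = []
for hit in hits:
    title = hit["_source"]["title"]
    if title in seen_titles:
        continue
    seen_titles.add(title)
    if hit["_source"]["type"] == "original_post":
        new_threads.append(hit)

print(len(new_threads))  # 1
```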
16 changes: 9 additions & 7 deletions push_combined_summary_to_es.py
@@ -13,40 +13,42 @@

REMOVE_TIMESTAMPS_IN_AUTHORS = True

+# Instantiating objects for reading XML and connecting to ElasticSearch
xml_reader = XMLReader()
elastic_search = ElasticSearchClient()

-total_combined_files = []
+# Static directory names to look into for respective combined summary xml files
static_dirs = [
'bitcoin-dev',
'lightning-dev',
'delvingbitcoin'
]
pattern = "combined*.xml"

+total_combined_files = []
for static_dir in static_dirs:
combined_files = glob.glob(f"static/{static_dir}/**/{pattern}")
total_combined_files.extend(combined_files)
logger.info(f"Total combined files: {(len(total_combined_files))}")

-# get unique combined file paths
+# Get unique combined file paths
total_combined_files_dict = {os.path.splitext(os.path.basename(i))[0]: i for i in total_combined_files}

logger.info(f"Total unique combined files: {len(total_combined_files_dict)}")

+# Loop through all locally stored combined summary XML files and insert/update them accordingly
for file_name, full_path in tqdm.tqdm(total_combined_files_dict.items()):
try:
-# get data from xml file
+# Get data from xml file
xml_file_data = xml_reader.read_xml_file(full_path)

if REMOVE_TIMESTAMPS_IN_AUTHORS:
-# remove timestamps from author's names and collect unique names only
+# Remove timestamps from author's names and collect unique names only
xml_file_data['authors'] = remove_timestamps_from_author_names(xml_file_data['authors'])

-# check if doc exist in ES index
+# Check if doc exist in ES index
doc_exists = elastic_search.es_client.exists(index=ES_INDEX, id=file_name)

-# insert the doc in ES index if it does not exist, else update it
+# Insert the doc in ES index if it does not exist, else update it
if not doc_exists:
res = elastic_search.es_client.index(
index=ES_INDEX,
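
The truncated loop above uses the usual exists/index/update pattern against Elasticsearch. A self-contained sketch, with the index name, document id, and document shape assumed for illustration:

```python
# Sketch of the insert-or-update pattern; names and shapes are assumptions.
from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://localhost:9200")  # assumed local instance

doc_id = "combined_example-thread"  # the script derives this from the XML file name
doc_body = {"title": "Example thread", "summary": "Combined summary text."}

if not es_client.exists(index="summaries", id=doc_id):
    # Not indexed yet: create the document, keyed by the file name
    res = es_client.index(index="summaries", id=doc_id, body=doc_body)
else:
    # Already indexed: merge the supplied fields into the stored document
    res = es_client.update(index="summaries", id=doc_id, body={"doc": doc_body})
print(res["result"])  # "created", "updated", or "noop"
```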
15 changes: 8 additions & 7 deletions push_summary_to_es.py
@@ -6,23 +6,25 @@
from src.elasticsearch_utils import ElasticSearchClient
from src.xml_utils import XMLReader


if __name__ == "__main__":

APPLY_DATE_RANGE = False

+# Instantiating objects for reading XML and connecting to ElasticSearch
xml_reader = XMLReader()
elastic_search = ElasticSearchClient()

+# URLs for development mailing lists and forums
dev_urls = [
"https://lists.linuxfoundation.org/pipermail/bitcoin-dev/",
"https://lists.linuxfoundation.org/pipermail/lightning-dev/",
"https://delvingbitcoin.org/",
"https://gnusha.org/pi/bitcoindev/"
]

+# Process each URL in the dev_urls list
for dev_url in dev_urls:

+# Set the date range for data extraction
if APPLY_DATE_RANGE:
current_date_str = None
if not current_date_str:
@@ -35,21 +37,20 @@
start_date_str = None
current_date_str = None

+# Fetch doc with an empty summary field
docs_list = elastic_search.fetch_data_with_empty_summary(ES_INDEX, dev_url, start_date_str, current_date_str)

-if isinstance(dev_url, list):
-dev_name = dev_url[0].split("/")[-2]
-else:
-dev_name = dev_url.split("/")[-2]

+dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2]
logger.success(f"Total threads received with empty summary for '{dev_name}': {len(docs_list)}")

+# Loop through all fetched docs and update them by adding the summary from xml files
for doc in tqdm.tqdm(docs_list):
res = None
try:
doc_id = doc['_id']
doc_index = doc['_index']
if not doc['_source'].get('summary'):
+# Get summary text from locally stored XML files
xml_summary = xml_reader.get_xml_summary(doc, dev_name)
if xml_summary:
elastic_search.es_client.update(
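
"Docs with an empty summary field" is the kind of query typically expressed with a must_not/exists clause, after which each hit is updated in place. A sketch under assumed index and field names; in the script itself the summary text comes from locally stored XML files via XMLReader:

```python
# Sketch of "fetch docs lacking a summary, then fill it in"; the index,
# field names, and summary text are assumptions for illustration.
from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://localhost:9200")  # assumed local instance

query = {
    "query": {
        "bool": {
            "must": [{"term": {"domain.keyword": "https://delvingbitcoin.org/"}}],
            "must_not": [{"exists": {"field": "summary"}}],
        }
    }
}
hits = es_client.search(index="summaries", body=query, size=50)["hits"]["hits"]

for doc in hits:
    # In the real script the summary comes from the matching XML file
    es_client.update(index="summaries", id=doc["_id"],
                     body={"doc": {"summary": "summary text from XML"}})
```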
2 changes: 1 addition & 1 deletion src/config.py
@@ -1,7 +1,7 @@
import os
-import openai
import warnings

+import openai
import tiktoken
from dotenv import load_dotenv
