From 87be4b899a7c533637a0b2650909f405bd80c985 Mon Sep 17 00:00:00 2001
From: urvishp80 <urvishp80@gmail.com>
Date: Fri, 26 Apr 2024 10:46:32 -0600
Subject: [PATCH] Enhanced code and added comments

---
 .../workflows/homepage_json_gen_cron_job.yml  |   2 +-
 README.md                                     |   2 +-
 ...mepage_xml.py => generate_homepage_json.py | 764 +++++++++---------
 generate_weekly_newsletter_json.py            | 103 ++-
 push_combined_summary_to_es.py                |  16 +-
 push_summary_to_es.py                         |  15 +-
 src/config.py                                 |   2 +-
 src/xml_utils.py                              | 129 +--
 xmls_generator_production.py                  |  14 +-
 9 files changed, 541 insertions(+), 506 deletions(-)
 rename generate_homepage_xml.py => generate_homepage_json.py (75%)

diff --git a/.github/workflows/homepage_json_gen_cron_job.yml b/.github/workflows/homepage_json_gen_cron_job.yml
index b6493212b..545c85313 100644
--- a/.github/workflows/homepage_json_gen_cron_job.yml
+++ b/.github/workflows/homepage_json_gen_cron_job.yml
@@ -37,7 +37,7 @@ jobs:
          pip install -r requirements.txt
 
     - name: Execute Python script
-      run: python generate_homepage_xml.py
+      run: python generate_homepage_json.py
 
     - name: Configure Git
       run: |
diff --git a/README.md b/README.md
index 6e5ad1a99..1b3315129 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ Utilizing data collected by the [scraper](https://github.com/bitcoinsearch/scrap
    - Queries Elasticsearch for documents lacking summaries, extracts summaries from corresponding XML files, and then updates these documents with their summaries in the Elasticsearch index.
 3. Daily [Push Combined Summary From XML Files to ES INDEX](.github/workflows/push_combined_summary_to_es_cron_job.yml) ([source](push_combined_summary_to_es.py))
    - Processes each combined thread summary XML file, transforming it into a document format, checks for its existence in Elasticsearch, and updates or inserts the document as needed.
-4. Daily [Python Homepage Update Script](.github/workflows/homepage_json_gen_cron_job.yml) ([source](generate_homepage_xml.py))
+4. Daily [Python Homepage Update Script](.github/workflows/homepage_json_gen_cron_job.yml) ([source](generate_homepage_json.py))
    - Queries the last 7 days of data from Elasticsearch for each source to compile lists of active threads, recent threads, and historical threads for 'Today in History'. It generates a summary of recent threads if available; otherwise, for active threads. The resulting [`homepage.json`](static/homepage.json) is then committed to GitHub to be used by [Bitcoin TLDR](https://github.com/bitcoinsearch/tldr).
 5. Weekly [Python Newsletter Generation Script](.github/workflows/weekly_newsletter_gen_cron_job.yml) ([source](generate_weekly_newsletter_json.py))
    - Generates a newsletter by compiling lists of new and active threads from the past week's data for each source. It generates a summary of new threads if available; otherwise, for active threads. The resulting [`newsletter.json`](static/newsletters/newsletter.json) is then committed to GitHub to be used by [Bitcoin TLDR](https://github.com/bitcoinsearch/tldr).
diff --git a/generate_homepage_xml.py b/generate_homepage_json.py
similarity index 75%
rename from generate_homepage_xml.py
rename to generate_homepage_json.py
index ec79e966d..f29626a75 100644
--- a/generate_homepage_xml.py
+++ b/generate_homepage_json.py
@@ -1,372 +1,392 @@
-import random
-import time
-import traceback
-from datetime import datetime, timedelta
-from loguru import logger
-import os
-import sys
-import warnings
-import json
-from tqdm import tqdm
-
-from src.config import ES_INDEX
-from src.elasticsearch_utils import ElasticSearchClient
-from src.json_utils import GenerateJSON
-from src.xml_utils import GenerateXML
-from src.utils import month_dict
-
-warnings.filterwarnings("ignore")
-
-
-def page_data_handling(data_list: list, get_unique_per_dev=False):
-    page_data = []
-    collected_dev_data = []
-    for data in tqdm(data_list):
-        try:
-            # check and generate any missing file
-            xml_gen.start(dict_data=[data], url=data['_source']['domain'])
-            entry_data = gen.create_single_entry(data, look_for_combined_summary=True)
-
-            if get_unique_per_dev:
-                if entry_data['dev_name'] not in collected_dev_data:
-                    collected_dev_data.append(entry_data['dev_name'])
-                    logger.info(f"collected data for: {collected_dev_data}")
-                    page_data.append(entry_data)
-            else:
-                page_data.append(entry_data)
-        except Exception as ex:
-            logger.error(
-                f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}")
-    return page_data
-
-
-if __name__ == "__main__":
-
-    gen = GenerateJSON()
-    xml_gen = GenerateXML()
-    elastic_search = ElasticSearchClient()
-    dev_urls = [
-        ["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/",
-         "https://gnusha.org/pi/bitcoindev/"],
-        "https://lists.linuxfoundation.org/pipermail/lightning-dev/",
-        "https://delvingbitcoin.org/"
-    ]
-
-    current_date = datetime.now()
-    current_date_str = current_date.strftime("%Y-%m-%d")
-
-    start_date = current_date - timedelta(days=7)
-    start_date_str = start_date.strftime("%Y-%m-%d")
-    logger.info(f"start_date: {start_date_str}")
-    logger.info(f"current_date_str: {current_date_str}")
-
-    month_name = month_dict[int(current_date.month)]
-    str_month_year = f"{month_name}_{int(current_date.year)}"
-
-    json_file_path = fr"static/homepage.json"
-
-    recent_data_list = []
-    active_data_list = []
-    today_in_history_data_list = []
-    history_data_collected_from_yesterday = False
-
-    random_years_ago = None
-
-    for dev_url in dev_urls:
-        logger.info(f"Working on URL: {dev_url}")
-        fetch_today_in_history = True
-
-        data_list = elastic_search.extract_data_from_es(
-            ES_INDEX, dev_url, start_date_str, current_date_str, exclude_combined_summary_docs=True
-        )
-
-        if isinstance(dev_url, list):
-            dev_name = dev_url[0].split("/")[-2]
-        else:
-            dev_name = dev_url.split("/")[-2]
-
-        logger.success(f"TOTAL THREADS RECEIVED FOR - '{dev_name}': {len(data_list)}")
-
-        seen_titles = set()
-
-        # TOP ACTIVE POSTS
-        active_posts_data = elastic_search.filter_top_active_posts(
-            es_results=data_list, top_n=10
-        )
-
-        active_posts_data_counter = 0
-        for data in active_posts_data:
-            if active_posts_data_counter >= 3:
-                break
-
-            title = data['_source']['title']
-            if title in seen_titles:
-                continue
-            seen_titles.add(title)
-
-            # get the first post's info of this title
-            original_post = elastic_search.get_earliest_posts_by_title(
-                es_index=ES_INDEX, url=dev_url, title=title
-            )
-
-            counts, contributors = elastic_search.es_fetch_contributors_and_threads(
-                es_index=ES_INDEX, title=title, domain=dev_url
-            )
-
-            # if you want to show the first post of each selected title,
-            # then do the below operations on - 'original_post', else on 'data'
-            for author in original_post['_source']['authors']:
-                contributors.remove(author)
-            original_post['_source']['n_threads'] = counts
-            original_post['_source']['contributors'] = contributors
-            original_post['_source']['dev_name'] = dev_name
-            active_data_list.append(original_post)
-            active_posts_data_counter += 1
-
-        logger.success(f"Number of active posts collected: {len(active_data_list)}, for URL: {dev_url}")
-
-        # TOP RECENT POSTS
-        recent_data_post_counter = 0
-        recent_posts_data = elastic_search.filter_top_recent_posts(es_results=data_list, top_n=20)
-
-        for data in recent_posts_data:
-            # if preprocess body text not longer than token_threshold, skip that post
-            if not gen.is_body_text_long(data=data, sent_threshold=2):
-                logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}")
-                continue
-
-            title = data['_source']['title']
-            if title in seen_titles:
-                continue
-            seen_titles.add(title)
-            if recent_data_post_counter >= 3:
-                break
-
-            counts, contributors = elastic_search.es_fetch_contributors_and_threads(
-                es_index=ES_INDEX, title=title, domain=dev_url
-            )
-            # exclude the post authors
-            for author in data['_source']['authors']:
-                contributors.remove(author)
-            data['_source']['n_threads'] = counts
-            data['_source']['contributors'] = contributors
-            data['_source']['dev_name'] = dev_name
-            recent_data_list.append(data)
-            recent_data_post_counter += 1
-
-        if not recent_data_list:
-            for data in recent_posts_data:
-                # if preprocess body text not longer than token_threshold, skip that post
-                if not gen.is_body_text_long(data=data, sent_threshold=2):
-                    logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}")
-                    continue
-
-                title = data['_source']['title']
-                if recent_data_post_counter >= 3:
-                    break
-                counts, contributors = elastic_search.es_fetch_contributors_and_threads(
-                    es_index=ES_INDEX, title=title, domain=dev_url
-                )
-                # exclude the post authors
-                for author in data['_source']['authors']:
-                    contributors.remove(author)
-                data['_source']['n_threads'] = counts
-                data['_source']['contributors'] = contributors
-                data['_source']['dev_name'] = dev_name
-                recent_data_list.append(data)
-                recent_data_post_counter += 1
-
-        logger.success(f"Number of recent posts collected: {len(recent_data_list)}, for URL: {dev_url}")
-
-        # TODAY IN HISTORY POSTS
-        logger.info(f"fetching 'Today in history' posts... ")
-
-        if not random_years_ago:
-            at_least_years_ago = 3
-            at_max_years_ago = current_date.year - 2015
-            random_years_ago = random.randint(at_least_years_ago, at_max_years_ago)
-            logger.info(f"random years ago between {at_least_years_ago} to {at_max_years_ago}: {random_years_ago}")
-
-        if dev_url == "https://delvingbitcoin.org/":
-            random_years_ago = random.randint(1, current_date.year - 2022)
-            logger.info(
-                f"for delving-bitcoin - random years ago between {1} to {current_date.year - 2022}: {random_years_ago}")
-
-        default_days_to_look_back = 6
-        loop_counter = 1
-
-        while fetch_today_in_history:
-            days_to_look_back = default_days_to_look_back * loop_counter
-            selected_random_date = current_date - timedelta(days=365 * random_years_ago)
-
-            start_of_time = selected_random_date - timedelta(days=selected_random_date.weekday())
-            end_of_time = start_of_time + timedelta(days=days_to_look_back)
-
-            start_of_time_str = start_of_time.strftime("%Y-%m-%dT%H:%M:%S")
-            end_of_time_str = end_of_time.strftime("%Y-%m-%dT%H:%M:%S")
-
-            logger.info(f"collecting the data from {days_to_look_back} days range ... || Start of week: {start_of_time} | "
-                        f"End of week: {end_of_time}")
-
-            selected_threads = elastic_search.fetch_data_in_date_range(
-                es_index=ES_INDEX,
-                start_date=start_of_time_str,
-                end_date=end_of_time_str,
-                domain=dev_url
-            )
-
-            if len(selected_threads) > 0:
-                for doc in selected_threads:
-                    doc_title = doc['_source']['title']
-                    doc_created_at = doc['_source']['created_at']
-
-                    if doc['_source']['type'] == 'original_post':
-
-                        counts, contributors = elastic_search.es_fetch_contributors_and_threads(
-                            es_index=ES_INDEX, title=doc_title, domain=dev_url
-                        )
-
-                        if counts < 5:
-                            logger.info(f"No. of replies are less than 5, skipping it... ")
-                            continue
-
-                        if contributors:
-                            # exclude the post authors
-                            for author in doc['_source']['authors']:
-                                contributors.remove(author)
-                        doc['_source']['n_threads'] = counts
-                        doc['_source']['contributors'] = contributors
-                        doc['_source']['dev_name'] = dev_name
-                        today_in_history_data_list.append(doc)
-                        logger.info(f"collected doc created on: {doc_created_at} || TITLE: {doc_title}")
-                        fetch_today_in_history = False
-                        break
-            loop_counter += 1
-
-        # add history data from yesterday's homepage.json
-        if not today_in_history_data_list:
-            logger.info("Collecting yesterday's history threads!")
-            current_directory = os.getcwd()
-            full_path = os.path.join(current_directory, json_file_path)
-            if os.path.exists(full_path):
-                with open(full_path, 'r') as j:
-                    try:
-                        data = json.load(j)
-                    except Exception as e:
-                        logger.info(f"Error reading json file:{full_path} :: {e}")
-                        data = {}
-                    today_in_history_data_list.extend(data.get('today_in_history_posts', []))
-                    history_data_collected_from_yesterday = True
-
-        logger.success(f"No. of 'Today in history' posts collected: {len(today_in_history_data_list)}")
-
-    current_directory = os.getcwd()
-    full_path = os.path.join(current_directory, json_file_path)
-    if os.path.exists(full_path):
-        with open(full_path, 'r') as j:
-            try:
-                yesterday_data = json.load(j)
-            except Exception as e:
-                logger.info(f"Error reading json file:{full_path} :: {e}")
-                yesterday_data = {}
-
-    xml_ids_title = gen.get_existing_json_title(file_path=json_file_path)
-    recent_post_ids = [data['_source']['title'] for data in recent_data_list]
-    active_post_ids = [data['_source']['title'] for data in active_data_list]
-    all_post_titles = set(recent_post_ids + active_post_ids)
-
-    if all_post_titles != set(xml_ids_title):
-        logger.info("changes found in recent posts ... ")
-
-        delay = 5
-        count = 0
-
-        while True:
-            try:
-                logger.info(
-                    f"active posts: {len(active_data_list)}, "
-                    f"recent posts: {len(recent_data_list)}, "
-                    f"today in history posts: {len(today_in_history_data_list)}"
-                )
-                logger.info("Creating homepage.json file ... ")
-
-                recent_post_summ = ""
-                if len(active_data_list) > 0 or len(recent_data_list) > 0:
-
-                    # header summary
-                    if len(recent_data_list) > 0:
-                        recent_post_summ = gen.generate_recent_posts_summary(recent_data_list)
-                    else:
-                        recent_post_summ = gen.generate_recent_posts_summary(active_data_list)
-                    logger.success(recent_post_summ)
-
-                    # recent data
-                    recent_page_data = page_data_handling(recent_data_list)
-
-                    # active data
-                    active_page_data = page_data_handling(active_data_list)
-
-                else:
-                    logger.error(f"'Active' and 'Recent' data list empty! Please check the data again.")
-                    recent_page_data, active_page_data = [], []
-
-                # today in history
-                if history_data_collected_from_yesterday:
-                    logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.")
-                    today_in_history_data = yesterday_data.get('today_in_history_posts', [])
-                else:
-                    if len(today_in_history_data_list) > 0:
-                        today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True)
-                    else:
-                        logger.error(f"'Today in history' data list empty! Please check the data again.")
-                        today_in_history_data = []
-
-                json_string = {
-                    "header_summary": recent_post_summ,
-                    "recent_posts": recent_page_data,
-                    "active_posts": active_page_data,
-                    "today_in_history_posts": today_in_history_data
-                }
-                gen.write_json_file(json_string, json_file_path)
-
-                archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json"
-                gen.store_file_in_archive(json_file_path, archive_json_file_path)
-                break
-
-            except Exception as ex:
-                logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}")
-                time.sleep(delay)
-                count += 1
-                if count > 1:
-                    sys.exit(f"{ex}")
-    else:
-        logger.info("No change in 'Recent' or 'Active' posts.")
-        rewrite_json_file = False
-
-        # update today in history and save file if no change in Recent or Active posts
-        if history_data_collected_from_yesterday:
-            logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.")
-            today_in_history_data = yesterday_data.get('today_in_history_posts', [])
-        else:
-            rewrite_json_file = True
-            if len(today_in_history_data_list) > 0:
-                today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True)
-            else:
-                logger.error(f"'Today in history' data list empty! Please check the data again.")
-                today_in_history_data = []
-
-        if rewrite_json_file:
-            logger.info(f'Rewriting the homepage.json file')
-            json_string = {
-                "header_summary": yesterday_data.get('header_summary', []),
-                "recent_posts": yesterday_data.get('recent_posts', []),
-                "active_posts": yesterday_data.get('recent_posts', []),
-                "today_in_history_posts": today_in_history_data
-            }
-            gen.write_json_file(json_string, json_file_path)
-        else:
-            logger.info("No need to rewrite homepage.json file")
-
-        if os.path.exists(full_path):
-            archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json"
-            gen.store_file_in_archive(json_file_path, archive_json_file_path)
+import json
+import os
+import random
+import sys
+import time
+import traceback
+import warnings
+from datetime import datetime, timedelta
+
+from loguru import logger
+from tqdm import tqdm
+
+from src.config import ES_INDEX
+from src.elasticsearch_utils import ElasticSearchClient
+from src.json_utils import GenerateJSON
+from src.utils import month_dict
+from src.xml_utils import GenerateXML
+
+warnings.filterwarnings("ignore")
+
+
+def page_data_handling(data_list: list, get_unique_per_dev=False):
+    page_data = []
+    collected_dev_data = []
+    for data in tqdm(data_list):
+        try:
+            # Generate all XML files for each given title, if not present
+            xml_gen.start(dict_data=[data], url=data['_source']['domain'])
+            entry_data = json_gen.create_single_entry(data, look_for_combined_summary=True)
+            if get_unique_per_dev:  # Ensure that there is only one document per domain
+                if entry_data['dev_name'] not in collected_dev_data:
+                    collected_dev_data.append(entry_data['dev_name'])
+                    logger.info(f"collected data for: {collected_dev_data}")
+                    page_data.append(entry_data)
+            else:
+                page_data.append(entry_data)
+        except Exception as ex:
+            logger.error(
+                f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}")
+    return page_data
+
+
+if __name__ == "__main__":
+
+    # Instantiating objects for generating JSON, XML and connecting to ElasticSearch
+    json_gen = GenerateJSON()
+    xml_gen = GenerateXML()
+    elastic_search = ElasticSearchClient()
+
+    # URLs of mailing lists and forums
+    dev_urls = [
+        ["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/",
+         "https://gnusha.org/pi/bitcoindev/"],
+        "https://lists.linuxfoundation.org/pipermail/lightning-dev/",
+        "https://delvingbitcoin.org/"
+    ]
+
+    # Set the date range for data extraction
+    current_date = datetime.now()
+    start_date = current_date - timedelta(days=7)
+
+    start_date_str = start_date.strftime("%Y-%m-%d")
+    current_date_str = current_date.strftime("%Y-%m-%d")
+
+    logger.info(f"start_date: {start_date_str}")
+    logger.info(f"current_date_str: {current_date_str}")
+
+    # Convert month from number to name for filename construction
+    month_name = month_dict[int(current_date.month)]
+    str_month_year = f"{month_name}_{int(current_date.year)}"
+
+    recent_data_list = []
+    active_data_list = []
+    today_in_history_data_list = []
+    history_data_collected_from_yesterday = False
+    random_years_ago = None
+
+    # path to the stored homepage.json file
+    json_file_path = fr"static/homepage.json"
+
+    # Process each URL in the dev_urls list
+    for dev_url in dev_urls:
+        logger.info(f"Working on URL: {dev_url}")
+        fetch_today_in_history = True
+
+        # Fetch docs from an elasticsearch index
+        data_list = elastic_search.extract_data_from_es(
+            ES_INDEX, dev_url, start_date_str, current_date_str, exclude_combined_summary_docs=True
+        )
+
+        dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2]
+        logger.success(f"Retrieved {len(data_list)} threads for {dev_name}")
+
+        seen_titles = set()
+
+        # TOP ACTIVE POSTS
+        active_posts_data = elastic_search.filter_top_active_posts(
+            es_results=data_list, top_n=10
+        )
+
+        # Collect N active posts per domain
+        active_posts_data_counter = 0
+        for data in active_posts_data:
+            if active_posts_data_counter >= 3:
+                break
+
+            title = data['_source']['title']
+            if title in seen_titles:
+                continue
+            seen_titles.add(title)
+
+            # Fetch the first post for given title and domain
+            original_post = elastic_search.get_earliest_posts_by_title(
+                es_index=ES_INDEX, url=dev_url, title=title
+            )
+
+            # Gather post counts for given title and its total contributors
+            counts, contributors = elastic_search.es_fetch_contributors_and_threads(
+                es_index=ES_INDEX, title=title, domain=dev_url
+            )
+
+            # The homepage shows the original (first) post of each active thread, so the
+            # fields below are set on 'original_post'. To show the filtered post itself
+            # instead, apply the same operations to 'data'.
+
+            # Remove the original post's authors from the contributors list
+            for author in original_post['_source']['authors']:
+                contributors.remove(author)
+            original_post['_source']['n_threads'] = counts
+            original_post['_source']['contributors'] = contributors
+            original_post['_source']['dev_name'] = dev_name
+            active_data_list.append(original_post)
+            active_posts_data_counter += 1
+
+        logger.success(f"Number of active posts collected: {len(active_data_list)}, for URL: {dev_url}")
+
+        # TOP RECENT POSTS
+        recent_data_post_counter = 0
+        recent_posts_data = elastic_search.filter_top_recent_posts(es_results=data_list, top_n=20)
+
+        for data in recent_posts_data:
+            # Skip the doc if its preprocessed body text is not long enough (checked with sent_threshold=2)
+            if not json_gen.is_body_text_long(data=data, sent_threshold=2):
+                logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}")
+                continue
+
+            title = data['_source']['title']
+            if title in seen_titles:
+                continue
+            seen_titles.add(title)
+
+            # Collect N recent posts per domain
+            if recent_data_post_counter >= 3:
+                break
+
+            # Gather post counts for given title and its total contributors
+            counts, contributors = elastic_search.es_fetch_contributors_and_threads(
+                es_index=ES_INDEX, title=title, domain=dev_url
+            )
+
+            # Remove the post's own authors from the contributors list
+            for author in data['_source']['authors']:
+                contributors.remove(author)
+            data['_source']['n_threads'] = counts
+            data['_source']['contributors'] = contributors
+            data['_source']['dev_name'] = dev_name
+            recent_data_list.append(data)
+            recent_data_post_counter += 1
+
+        if not recent_data_list:
+            for data in recent_posts_data:
+                # Skip the post if its preprocessed body text is not long enough (checked with sent_threshold=2)
+                if not json_gen.is_body_text_long(data=data, sent_threshold=2):
+                    logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}")
+                    continue
+
+                title = data['_source']['title']
+                # Collect N recent posts per domain
+                if recent_data_post_counter >= 3:
+                    break
+                counts, contributors = elastic_search.es_fetch_contributors_and_threads(
+                    es_index=ES_INDEX, title=title, domain=dev_url
+                )
+
+                # Remove the post's own authors from the contributors list
+                for author in data['_source']['authors']:
+                    contributors.remove(author)
+                data['_source']['n_threads'] = counts
+                data['_source']['contributors'] = contributors
+                data['_source']['dev_name'] = dev_name
+                recent_data_list.append(data)
+                recent_data_post_counter += 1
+
+        logger.success(f"Number of recent posts collected: {len(recent_data_list)}, for URL: {dev_url}")
+
+        # TODAY IN HISTORY POSTS
+        logger.info(f"fetching 'Today in history' posts... ")
+
+        # Randomly choose N within the given range and look for data from N years ago;
+        # for bitcoin-dev and lightning-dev we have data from 2015, and for delving-bitcoin from 2022
+        if not random_years_ago:
+            at_least_years_ago = 3
+            at_max_years_ago = current_date.year - 2015
+            random_years_ago = random.randint(at_least_years_ago, at_max_years_ago)
+            logger.info(f"Random years ago between {at_least_years_ago} to {at_max_years_ago}: {random_years_ago}")
+
+        if dev_url == "https://delvingbitcoin.org/":
+            random_years_ago = random.randint(1, current_date.year - 2022)
+            logger.info(
+                f"for delving-bitcoin - random years ago between {1} to {current_date.year - 2022}: {random_years_ago}")
+
+        default_days_to_look_back = 6
+        loop_counter = 1
+
+        while fetch_today_in_history:
+            days_to_look_back = default_days_to_look_back * loop_counter
+            selected_random_date = current_date - timedelta(days=365 * random_years_ago)
+
+            start_of_time = selected_random_date - timedelta(days=selected_random_date.weekday())
+            end_of_time = start_of_time + timedelta(days=days_to_look_back)
+
+            start_of_time_str = start_of_time.strftime("%Y-%m-%dT%H:%M:%S")
+            end_of_time_str = end_of_time.strftime("%Y-%m-%dT%H:%M:%S")
+
+            logger.info(
+                f"collecting the data from {days_to_look_back} days range ... || Start of week: {start_of_time} | "
+                f"End of week: {end_of_time}")
+
+            selected_threads = elastic_search.fetch_data_in_date_range(
+                es_index=ES_INDEX,
+                start_date=start_of_time_str,
+                end_date=end_of_time_str,
+                domain=dev_url
+            )
+
+            if len(selected_threads) > 0:
+                for doc in selected_threads:
+                    doc_title = doc['_source']['title']
+                    doc_created_at = doc['_source']['created_at']
+
+                    if doc['_source']['type'] == 'original_post':
+
+                        counts, contributors = elastic_search.es_fetch_contributors_and_threads(
+                            es_index=ES_INDEX, title=doc_title, domain=dev_url
+                        )
+
+                        if counts < 5:
+                            logger.info(f"No. of replies are less than 5, skipping it... ")
+                            continue
+
+                        if contributors:
+                            # Remove the post's own authors from the contributors list
+                            for author in doc['_source']['authors']:
+                                contributors.remove(author)
+                        doc['_source']['n_threads'] = counts
+                        doc['_source']['contributors'] = contributors
+                        doc['_source']['dev_name'] = dev_name
+                        today_in_history_data_list.append(doc)
+                        logger.info(f"collected doc created on: {doc_created_at} || TITLE: {doc_title}")
+                        fetch_today_in_history = False
+                        break
+            loop_counter += 1
+
+        # If no 'Today in history' data has been collected so far, reuse the entries from the stored homepage.json file
+        if not today_in_history_data_list:
+            logger.info("Collecting yesterday's history threads!")
+            current_directory = os.getcwd()
+            full_path = os.path.join(current_directory, json_file_path)
+            if os.path.exists(full_path):
+                with open(full_path, 'r') as j:
+                    try:
+                        data = json.load(j)
+                    except Exception as e:
+                        logger.info(f"Error reading json file:{full_path} :: {e}")
+                        data = {}
+                    today_in_history_data_list.extend(data.get('today_in_history_posts', []))
+                    history_data_collected_from_yesterday = True
+
+        logger.success(f"No. of 'Today in history' posts collected: {len(today_in_history_data_list)}")
+
+    # Determine if there's any update in the data as compared to stored JSON file
+    current_directory = os.getcwd()
+    full_path = os.path.join(current_directory, json_file_path)
+    yesterday_data = {}  # default so later .get() calls work when the stored file is missing or unreadable
+    if os.path.exists(full_path):
+        with open(full_path, 'r') as j:
+            try:
+                yesterday_data = json.load(j)
+            except Exception as e:
+                logger.info(f"Error reading json file:{full_path} :: {e}")
+
+    stored_json_titles = json_gen.get_existing_json_title(file_path=json_file_path)
+    collected_post_titles = set([data['_source']['title'] for data in recent_data_list] +
+                                [data['_source']['title'] for data in active_data_list])
+
+    if collected_post_titles != set(stored_json_titles):
+        logger.info("Changes found as compared to previously stored JSON file... ")
+
+        delay = 5
+        count = 0
+
+        while True:
+            try:
+                logger.info(
+                    f"Active posts: {len(active_data_list)}, "
+                    f"Recent posts: {len(recent_data_list)}, "
+                    f"Today in history posts: {len(today_in_history_data_list)}"
+                )
+                logger.info("Creating homepage.json file ... ")
+
+                recent_post_summ = ""
+                if len(active_data_list) > 0 or len(recent_data_list) > 0:
+
+                    # Generate the header summary from recent posts,
+                    # and if no recent data is collected then from active posts
+                    if len(recent_data_list) > 0:
+                        recent_post_summ = json_gen.generate_recent_posts_summary(recent_data_list)
+                    else:
+                        recent_post_summ = json_gen.generate_recent_posts_summary(active_data_list)
+                    logger.success(recent_post_summ)
+
+                    # Compile recent posts data
+                    recent_page_data = page_data_handling(recent_data_list)
+
+                    # Compile active posts data
+                    active_page_data = page_data_handling(active_data_list)
+
+                else:
+                    logger.error(f"'Active' and 'Recent' data list empty! Please check the data again.")
+                    recent_page_data, active_page_data = [], []
+
+                # Compile today in history posts
+                if history_data_collected_from_yesterday:
+                    logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.")
+                    today_in_history_data = yesterday_data.get('today_in_history_posts', [])
+                else:
+                    if len(today_in_history_data_list) > 0:
+                        today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True)
+                    else:
+                        logger.error(f"'Today in history' data list empty! Please check the data again.")
+                        today_in_history_data = []
+
+                json_string = {
+                    "header_summary": recent_post_summ,
+                    "recent_posts": recent_page_data,
+                    "active_posts": active_page_data,
+                    "today_in_history_posts": today_in_history_data
+                }
+                json_gen.write_json_file(json_string, json_file_path)
+
+                archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json"
+                json_gen.store_file_in_archive(json_file_path, archive_json_file_path)
+                break
+
+            except Exception as ex:
+                logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}")
+                time.sleep(delay)
+                count += 1
+                if count > 1:
+                    sys.exit(f"{ex}")
+    else:
+        # If no changes found in Recent or Active posts,
+        # simply gather all data from yesterday's stored json file and save it with an updated name in the archive directory
+        logger.info("No change in 'Recent' or 'Active' posts.")
+        rewrite_json_file = False
+
+        if history_data_collected_from_yesterday:
+            logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.")
+            today_in_history_data = yesterday_data.get('today_in_history_posts', [])
+        else:
+            rewrite_json_file = True
+            if len(today_in_history_data_list) > 0:
+                today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True)
+            else:
+                logger.error(f"'Today in history' data list empty! Please check the data again.")
+                today_in_history_data = []
+
+        if rewrite_json_file:
+            logger.info(f'Rewriting the homepage.json file')
+            json_string = {
+                "header_summary": yesterday_data.get('header_summary', ""),  # summary is a string, not a list
+                "recent_posts": yesterday_data.get('recent_posts', []),
+                "active_posts": yesterday_data.get('active_posts', []),  # carry over active posts, not recent ones
+                "today_in_history_posts": today_in_history_data
+            }
+            json_gen.write_json_file(json_string, json_file_path)
+        else:
+            logger.info("No need to rewrite homepage.json file")
+
+        if os.path.exists(full_path):
+            archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json"
+            json_gen.store_file_in_archive(json_file_path, archive_json_file_path)
diff --git a/generate_weekly_newsletter_json.py b/generate_weekly_newsletter_json.py
index ce5de7781..fc8b98801 100644
--- a/generate_weekly_newsletter_json.py
+++ b/generate_weekly_newsletter_json.py
@@ -1,23 +1,27 @@
+import json
+import os
+import sys
 import time
 import traceback
 from datetime import datetime, timedelta
+
 from loguru import logger
-import os
-import sys
-import json
 from tqdm import tqdm
 
 from src.config import ES_INDEX
 from src.elasticsearch_utils import ElasticSearchClient
 from src.json_utils import GenerateJSON
-from src.xml_utils import GenerateXML
 from src.utils import month_dict
+from src.xml_utils import GenerateXML
 
 if __name__ == "__main__":
 
-    gen = GenerateJSON()
+    # Instantiating objects for generating JSON, XML and connecting to ElasticSearch
+    json_gen = GenerateJSON()
     xml_gen = GenerateXML()
     elastic_search = ElasticSearchClient()
+
+    # URLs for development mailing lists and forums
     dev_urls = [
         ["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/",
          "https://gnusha.org/pi/bitcoindev/"],
@@ -25,40 +29,35 @@
         "https://delvingbitcoin.org/"
     ]
 
+    # Set the date range for data extraction: last week to yesterday.
     current_date = datetime.now()
-    current_date_str = current_date.strftime("%Y-%m-%d")
-
     start_date = current_date - timedelta(days=7)
-    start_date_str = start_date.strftime("%Y-%m-%d")
-
     end_date = current_date - timedelta(days=1)
+
+    current_date_str = current_date.strftime("%Y-%m-%d")
+    start_date_str = start_date.strftime("%Y-%m-%d")
     end_date_str = end_date.strftime("%Y-%m-%d")
 
     logger.info(f"Newsletter publish date: {current_date_str}")
     logger.info(f"Gathering data for newsletter from {start_date_str} to {end_date_str}")
 
+    # Convert month from number to name for filename construction
     month_name = month_dict[int(current_date.month)]
     str_month_year = f"{month_name}_{int(current_date.year)}"
 
     active_data_list = []
     new_threads_list = []
 
+    # Process each URL in the dev_urls list
     for dev_url in dev_urls:
-
         data_list = elastic_search.extract_data_from_es(
             ES_INDEX, dev_url, start_date_str, end_date_str, exclude_combined_summary_docs=True
         )
 
-        if isinstance(dev_url, list):
-            dev_name = dev_url[0].split("/")[-2]
-        else:
-            dev_name = dev_url.split("/")[-2]
-
-        logger.success(f"TOTAL THREADS RECEIVED FOR '{dev_name}': {len(data_list)}")
+        dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2]
+        logger.success(f"Retrieved {len(data_list)} threads for {dev_name}")
 
         # NEW THREADS POSTS
-        # @TODO you already identify the original post by type==original_post
-        # so you could get the posts in order by date and check if the original posts is there
         seen_titles = set()
         for i in data_list:
             this_title = i['_source']['title']
@@ -66,27 +65,25 @@
                 continue
             seen_titles.add(this_title)
 
-            # check if the first post for this title is in the past week
-            original_post = elastic_search.get_earliest_posts_by_title(es_index=ES_INDEX, url=dev_url, title=this_title)
-
-            if original_post['_source'] and i['_source']['created_at'] == original_post['_source']['created_at']:
-                logger.success(f"new thread created on: {original_post['_source']['created_at']} || TITLE: {this_title}")
+            # Check if any new thread started in given week
+            if i['_source']['type'] == 'original_post':
+                logger.success(f"New thread created on: {i['_source']['created_at']} || TITLE: {this_title}")
 
                 counts, contributors = elastic_search.es_fetch_contributors_and_threads(
                     es_index=ES_INDEX, title=this_title, domain=dev_url
                 )
-
+                # Separate an original author and contributors
                 for author in i['_source']['authors']:
                     contributors.remove(author)
                 i['_source']['n_threads'] = counts
                 i['_source']['contributors'] = contributors
                 i['_source']['dev_name'] = dev_name
                 new_threads_list.append(i)
-        logger.info(f"number of new threads started this week: {len(new_threads_list)}")
+        logger.info(f"No. of new threads started this week: {len(new_threads_list)}")
 
         # TOP ACTIVE POSTS
         active_posts_data = elastic_search.filter_top_active_posts(es_results=data_list, top_n=15)
-        logger.info(f"number of filtered top active post: {len(active_posts_data)}")
+        logger.info(f"No. of filtered top active post: {len(active_posts_data)}")
 
         new_threads_titles_list = [i['_source']['title'] for i in new_threads_list]
 
@@ -103,14 +100,15 @@
             seen_titles.add(title)
             active_data_list.append(data)
             # active_posts_data_counter += 1
-        logger.info(f"number of active posts collected: {len(active_data_list)}")
+        logger.info(f"No. of active posts collected: {len(active_data_list)}")
 
-    # gather titles of docs from json file
+    # Determine if there's any update in the data compared to stored JSON
+    # Gather titles from stored JSON file
     json_file_path = fr"static/newsletters/newsletter.json"
 
     current_directory = os.getcwd()
     json_full_path = os.path.join(current_directory, json_file_path)
-    json_xml_ids = set()
+    stored_json_titles = set()
     if os.path.exists(json_full_path):
         with open(json_full_path, 'r') as j:
             try:
@@ -119,22 +117,22 @@
                 logger.info(f"Error reading json file:{json_full_path} :: {e}")
                 json_data = {}
 
-        json_xml_ids = set(
+        stored_json_titles = set(
             [item['title'] for item in json_data.get('new_threads_this_week', [])] +
             [item['title'] for item in json_data.get('active_posts_this_week', [])]
         )
     else:
         logger.warning(f"No existing newsletter.json file found: {json_full_path}")
 
-    # gather ids of docs from active posts and new thread posts
-    filtered_docs_ids = set(
+    # Gather titles from collected Active data and New Threads list
+    collected_json_titles = set(
         [data['_source']['title'] for data in active_data_list] +
         [data['_source']['title'] for data in new_threads_list]
     )
 
-    # check if there are any updates in the xml file
-    if filtered_docs_ids != json_xml_ids:
-        logger.info("changes found in recent posts ... ")
+    # Generate a new newsletter.json file if changes found in stored JSON file
+    if collected_json_titles != stored_json_titles:
+        logger.info("Changes found as compared to previously stored JSON file... ")
 
         delay = 5
         count = 0
@@ -144,23 +142,21 @@
                 logger.success(f"Total no. of active posts collected: {len(active_data_list)}")
                 logger.success(f"Total no. of new threads started this week: {len(new_threads_list)}")
 
-                logger.info("creating newsletter.json file ... ")
+                logger.info("Creating newsletter.json file ... ")
                 if len(active_data_list) > 0 or len(new_threads_list) > 0:
 
+                    # Prepare New Threads data for newsletter
                     new_threads_page_data = []
-                    active_page_data = []
                     new_threads_summary = ""
-
                     if new_threads_list:
-                        new_threads_summary += gen.generate_recent_posts_summary(new_threads_list, verbose=True)
+                        new_threads_summary += json_gen.generate_recent_posts_summary(new_threads_list, verbose=True)
                         logger.success(new_threads_summary)
 
                         for data in tqdm(new_threads_list):
                             try:
-                                # check and generate any missing file
+                                # Generate all XML files for given title, if not present
                                 xml_gen.start(dict_data=[data], url=data['_source']['domain'])
-
-                                entry_data = gen.create_single_entry(
+                                entry_data = json_gen.create_single_entry(
                                     data,
                                     base_url_for_xml="https://tldr.bitcoinsearch.xyz/summary",
                                     look_for_combined_summary=True,
@@ -173,16 +169,17 @@
                     else:
                         logger.warning(f"No new threads started this week, generating summary of active posts this "
                                        f"week ...")
-                        # if no new threads started this week, generate summary from active post this week
-                        new_threads_summary += gen.generate_recent_posts_summary(active_data_list)
+                        # If no new threads started this week, generate summary from active posts of the given week
+                        new_threads_summary += json_gen.generate_recent_posts_summary(active_data_list)
                         logger.success(new_threads_summary)
 
+                    # Prepare active posts data for newsletter
+                    active_page_data = []
                     for data in tqdm(active_data_list):
                         try:
-                            # check and generate any missing file
+                            # Generate all XML files for given title, if not present
                             xml_gen.start(dict_data=[data], url=data['_source']['domain'])
-
-                            entry_data = gen.create_single_entry(
+                            entry_data = json_gen.create_single_entry(
                                 data, base_url_for_xml="https://tldr.bitcoinsearch.xyz/summary",
                                 look_for_combined_summary=True, remove_xml_extension=True
                             )
@@ -191,19 +188,17 @@
                             logger.error(
                                 f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}")
 
+                    # Compile and save data for newsletter file
                     json_string = {
                         "summary_of_threads_started_this_week": new_threads_summary,
                         "new_threads_this_week": new_threads_page_data,
                         "active_posts_this_week": active_page_data
                     }
-                    gen.write_json_file(json_string, json_file_path)
-
+                    json_gen.write_json_file(json_string, json_file_path)
                     archive_json_file_path = fr"static/newsletters/{str_month_year}/{current_date_str}-newsletter.json"
-                    gen.store_file_in_archive(json_file_path, archive_json_file_path)
-
+                    json_gen.store_file_in_archive(json_file_path, archive_json_file_path)
                 else:
                     logger.error(f"Data list empty! Please check the data again.")
-
                 break
             except Exception as ex:
                 logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}")
@@ -212,8 +207,8 @@
                 if count > 1:
                     sys.exit(f"{ex}")
     else:
+        # If no changes found in stored JSON file, save the previous one with updated name in the archive directory
         logger.success("No change in the posts, no need to update newsletter.json file")
-        # save the previous one with updated name in archive
         if os.path.exists(json_full_path):
             archive_json_file_path = fr"static/newsletters/{str_month_year}/{current_date_str}-newsletter.json"
-            gen.store_file_in_archive(json_file_path, archive_json_file_path)
+            json_gen.store_file_in_archive(json_file_path, archive_json_file_path)
diff --git a/push_combined_summary_to_es.py b/push_combined_summary_to_es.py
index 018a838de..c3a53a70b 100644
--- a/push_combined_summary_to_es.py
+++ b/push_combined_summary_to_es.py
@@ -13,10 +13,11 @@
 
     REMOVE_TIMESTAMPS_IN_AUTHORS = True
 
+    # Instantiating objects for reading XML and connecting to ElasticSearch
     xml_reader = XMLReader()
     elastic_search = ElasticSearchClient()
 
-    total_combined_files = []
+    # Static directory names to look into for respective combined summary xml files
     static_dirs = [
         'bitcoin-dev',
         'lightning-dev',
@@ -24,29 +25,30 @@
     ]
     pattern = "combined*.xml"
 
+    total_combined_files = []
     for static_dir in static_dirs:
         combined_files = glob.glob(f"static/{static_dir}/**/{pattern}")
         total_combined_files.extend(combined_files)
     logger.info(f"Total combined files: {(len(total_combined_files))}")
 
-    # get unique combined file paths
+    # Get unique combined file paths
     total_combined_files_dict = {os.path.splitext(os.path.basename(i))[0]: i for i in total_combined_files}
-
     logger.info(f"Total unique combined files: {len(total_combined_files_dict)}")
 
+    # Loop through all locally stored combined summary XML files and insert/update them accordingly
     for file_name, full_path in tqdm.tqdm(total_combined_files_dict.items()):
         try:
-            # get data from xml file
+            # Get data from xml file
             xml_file_data = xml_reader.read_xml_file(full_path)
 
             if REMOVE_TIMESTAMPS_IN_AUTHORS:
-                # remove timestamps from author's names and collect unique names only
+                # Remove timestamps from author's names and collect unique names only
                 xml_file_data['authors'] = remove_timestamps_from_author_names(xml_file_data['authors'])
 
-            # check if doc exist in ES index
+            # Check if doc exist in ES index
             doc_exists = elastic_search.es_client.exists(index=ES_INDEX, id=file_name)
 
-            # insert the doc in ES index if it does not exist, else update it
+            # Insert the doc in ES index if it does not exist, else update it
             if not doc_exists:
                 res = elastic_search.es_client.index(
                     index=ES_INDEX,
diff --git a/push_summary_to_es.py b/push_summary_to_es.py
index 930e46c0b..6ea9e0480 100644
--- a/push_summary_to_es.py
+++ b/push_summary_to_es.py
@@ -6,14 +6,15 @@
 from src.elasticsearch_utils import ElasticSearchClient
 from src.xml_utils import XMLReader
 
-
 if __name__ == "__main__":
 
     APPLY_DATE_RANGE = False
 
+    # Instantiating objects for reading XML and connecting to ElasticSearch
     xml_reader = XMLReader()
     elastic_search = ElasticSearchClient()
 
+    # URLs for development mailing lists and forums
     dev_urls = [
         "https://lists.linuxfoundation.org/pipermail/bitcoin-dev/",
         "https://lists.linuxfoundation.org/pipermail/lightning-dev/",
@@ -21,8 +22,9 @@
         "https://gnusha.org/pi/bitcoindev/"
     ]
 
+    # Process each URL in the dev_urls list
     for dev_url in dev_urls:
-
+        # Set the date range for data extraction
         if APPLY_DATE_RANGE:
             current_date_str = None
             if not current_date_str:
@@ -35,21 +37,20 @@
             start_date_str = None
             current_date_str = None
 
+        # Fetch doc with an empty summary field
         docs_list = elastic_search.fetch_data_with_empty_summary(ES_INDEX, dev_url, start_date_str, current_date_str)
 
-        if isinstance(dev_url, list):
-            dev_name = dev_url[0].split("/")[-2]
-        else:
-            dev_name = dev_url.split("/")[-2]
-
+        dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2]
         logger.success(f"Total threads received with empty summary for '{dev_name}': {len(docs_list)}")
 
+        # Loop through all fetched docs and update them by adding the summary from xml files
         for doc in tqdm.tqdm(docs_list):
             res = None
             try:
                 doc_id = doc['_id']
                 doc_index = doc['_index']
                 if not doc['_source'].get('summary'):
+                    # Get summary text from locally stored XML files
                     xml_summary = xml_reader.get_xml_summary(doc, dev_name)
                     if xml_summary:
                         elastic_search.es_client.update(
diff --git a/src/config.py b/src/config.py
index 9b2dba280..a07c31892 100644
--- a/src/config.py
+++ b/src/config.py
@@ -1,7 +1,7 @@
 import os
-import openai
 import warnings
 
+import openai
 import tiktoken
 from dotenv import load_dotenv
 
diff --git a/src/xml_utils.py b/src/xml_utils.py
index cf4fe58fe..a74873d9e 100644
--- a/src/xml_utils.py
+++ b/src/xml_utils.py
@@ -1,23 +1,24 @@
-import re
-import pandas as pd
-from feedgen.feed import FeedGenerator
-from tqdm import tqdm
+import glob
+import os
 import platform
+import re
 import shutil
+import traceback
+import xml.etree.ElementTree as ET
 from datetime import datetime, timezone
+
+import pandas as pd
 import pytz
-import glob
-import xml.etree.ElementTree as ET
-import os
-import traceback
+from feedgen.feed import FeedGenerator
 from loguru import logger
+from tqdm import tqdm
 
+from src.config import ES_INDEX
+from src.elasticsearch_utils import ElasticSearchClient
+from src.gpt_utils import create_summary
 from src.utils import preprocess_email, month_dict, get_id, clean_title, convert_to_tuple, create_folder, \
     remove_multiple_whitespaces, add_utc_if_not_present
-from src.gpt_utils import create_summary
 
-from src.config import ES_INDEX
-from src.elasticsearch_utils import ElasticSearchClient
 elastic_search = ElasticSearchClient()
 
 
@@ -149,8 +150,6 @@ def append_columns(self, df_dict, file, title, namespace):
 
         # The title is directly provided as a parameter
         df_dict["title"].append(title)
-        # formatted_file_name = file.split("/static")[1]
-        # logger.info(formatted_file_name)
 
         # Parse the XML file to extract and append relevant data
         tree = ET.parse(file)
@@ -174,51 +173,46 @@ def append_columns(self, df_dict, file, title, namespace):
         df_dict["authors"].append([author_result.strip()])
 
     def file_not_present_df(self, columns, source_cols, df_dict, files_list, dict_data, data,
-                            title, combined_filename, namespace):
+                            title, namespace):
         """
         Processes data directly from the given document (`data`) as no XML summary is
         available for that document. Also, for each individual summary (XML file) that
         already exists for the given thread, extracts and appends its content to the dictionary.
         """
-        # Append basic data from dict_data for each column into df_dict
+        # Append basic data from dict_data for each column into df_dict
         for col in columns:
             df_dict[col].append(dict_data[data][col])
 
+        # Processing source_cols with conditional append
         for col in source_cols:
+            value = dict_data[data]['_source'][col]
             if "created_at" in col:
-                datetime_obj = add_utc_if_not_present(dict_data[data]['_source'][col], iso_format=False)
-                df_dict[col].append(datetime_obj)
-            else:
-                df_dict[col].append(dict_data[data]['_source'][col])
+                value = add_utc_if_not_present(value, iso_format=False)
+            df_dict[col].append(value)
+
+        # Normalize Windows-style backslashes in the file paths to forward slashes
+        files_list = [file.replace("\\", "/") for file in files_list]
+
+        # Use dictionary to store parsed XML trees to avoid redundant parsing
+        parsed_files = {}
 
         # For each individual summary (XML file) that exists for the
         # given thread, extract and append their content to the dictionary
-        # TODO: 
-        # This method is called for every post without a summary, which means that
-        # existing inidividual summaries for a thread are added n-1 times the amount
-        # of new posts in the thread at the time of execution of the cron job.
-        # this is not an issue because we then drop duplicates, but it's extra complexity. 
         for file in files_list:
-            file = file.replace("\\", "/")
             if os.path.exists(file):
-                tree = ET.parse(file)
-                root = tree.getroot()
+                if file not in parsed_files:
+                    tree = ET.parse(file)
+                    root = tree.getroot()
+                    parsed_files[file] = (tree, root)
+
+                tree, root = parsed_files[file]
                 file_title = root.find('atom:entry/atom:title', namespace).text
 
                 if title == file_title:
                     self.append_columns(df_dict, file, title, namespace)
 
-                    if combined_filename in file:
-                        # TODO: the code will never reach this point
-                        # as we are already filtering per thread title so no
-                        # "Combined summary - X" filename will pass though
-                        tree = ET.parse(file)
-                        root = tree.getroot()
-                        summary = root.find('atom:entry/atom:summary', namespace).text
-                        df_dict["body"].append(summary)
-                    else:
-                        summary = root.find('atom:entry/atom:summary', namespace).text
-                        df_dict["body"].append(summary)
+                    summary = root.find('atom:entry/atom:summary', namespace).text
+                    df_dict["body"].append(summary)
             else:
                 logger.info(f"file not present: {file}")
 
@@ -230,39 +224,45 @@ def file_present_df(self, files_list, namespace, combined_filename, title, indiv
         summary exists, it extracts the content of individual summaries, appending it
         to the data dictionary.
         """
-        combined_file_fullpath = None # the combined XML file if found
         # List to keep track of the month folders that contain
         # the XML files for the posts of the current thread
-        month_folders = []
+        month_folders = set()
+
+        # Cached listdir calls to avoid repeated disk access
+        folder_contents = {}
+
+        # Identifying combined file and processing individual summaries in a single loop
+        combined_file_fullpath = None
 
-        # Iterate through the list of local XML file paths
         for file in files_list:
-            file = file.replace("\\", "/")
+            normalized_file = file.replace("\\", "/")
             # Check if the current file is the combined XML file for the thread
-            if combined_filename in file:
-                combined_file_fullpath = file
+            if combined_filename in normalized_file:
+                combined_file_fullpath = normalized_file
             # Parse the XML file to find the title and compare it with the current title
             # in order to understand if the post/file is part of the current thread
-            tree = ET.parse(file)
+            tree = ET.parse(normalized_file)
             root = tree.getroot()
             file_title = root.find('atom:entry/atom:title', namespace).text
             # If titles match, add the file to the list of relevant XMLs and track its month folder
             if title == file_title:
-                individual_summaries_xmls_list.append(file)
-                month_folder_path = "/".join(file.split("/")[:-1])
-                if month_folder_path not in month_folders:
-                    month_folders.append(month_folder_path)
+                individual_summaries_xmls_list.append(normalized_file)
+                month_folder_path = "/".join(normalized_file.split("/")[:-1])
+                month_folders.add(month_folder_path)
 
         # Ensure the combined XML file is copied to all relevant month folders
         for month_folder in month_folders:
-            if combined_file_fullpath and combined_filename not in os.listdir(month_folder):
-                if combined_filename not in os.listdir(month_folder):
-                    shutil.copy(combined_file_fullpath, month_folder)
+            if month_folder not in folder_contents:
+                folder_contents[month_folder] = os.listdir(month_folder)
+
+            if combined_file_fullpath and combined_filename not in folder_contents[month_folder]:
+                shutil.copy(combined_file_fullpath, month_folder)
 
         # If individual summaries exist but no combined summary,
         # extract and append their content to the dictionary
-        if len(individual_summaries_xmls_list) > 0 and not any(combined_filename in item for item in files_list):
-            logger.info("individual summaries are present but not combined ones ...")
+        combined_exists = any(combined_filename in item for item in files_list)
+        if individual_summaries_xmls_list and not combined_exists:
+            logger.info("Individual summaries are present but not combined ones.")
             for file in individual_summaries_xmls_list:
                 self.append_columns(df_dict, file, title, namespace)
                 tree = ET.parse(file)
@@ -283,12 +283,18 @@ def get_local_xml_file_paths(self, dev_url):
         files_list = glob.glob(os.path.join(current_directory, "static", directory, "**/*.xml"), recursive=True)
         return files_list
 
+    def get_local_xml_file_paths_for_title(self, dev_url, title):
+        """Return paths of local XML files under the `dev_url` directory whose names end with `title`."""
+        directory = get_base_directory(dev_url)
+        # Escape glob metacharacters ([, ], *, ?) so the title is matched literally, not as a pattern
+        pattern = f"**/*{glob.escape(title)}.xml"
+        search_path = os.path.join(os.getcwd(), "static", directory, pattern)
+        files_list = glob.glob(search_path, recursive=True)
+        return files_list
+
     def generate_new_emails_df(self, main_dict_data, dev_url):
         # Define XML namespace for parsing XML files
         namespaces = {'atom': 'http://www.w3.org/2005/Atom'}
- 
-        # Retrieve all existing XML files (summaries) for the given source
-        files_list = self.get_local_xml_file_paths(dev_url)
 
         # Initialize a dictionary to store data for DataFrame construction, with predefined columns
         columns = ['_index', '_id', '_score']
@@ -297,9 +303,9 @@ def generate_new_emails_df(self, main_dict_data, dev_url):
         df_dict = {col: [] for col in (columns + source_cols)}
 
         seen_titles = set()
-        # Process each document in the input data   
+        # Process each document in the input data
         for idx in range(len(main_dict_data)):
-            xmls_list = [] # the existing XML files for the thread that the fetched document is part of
+            xmls_list = []  # the existing XML files for the thread that the fetched document is part of
             thread_title = main_dict_data[idx]["_source"]["title"]
             if thread_title in seen_titles:
                 continue
@@ -322,11 +328,14 @@ def generate_new_emails_df(self, main_dict_data, dev_url):
                 combined_filename = f"combined_{xml_name}.xml"
                 created_at = title_dict_data[data_idx]["_source"]["created_at"]
 
+                # Retrieve all existing XML files (summaries) for the given source and title
+                files_list = self.get_local_xml_file_paths_for_title(dev_url=dev_url, title=xml_name)
+
                 # Check if the XML file for the document exists
                 if not any(file_name in item for item in files_list):
                     logger.info(f"Not present: {created_at} | {file_name}")
                     self.file_not_present_df(columns, source_cols, df_dict, files_list, title_dict_data, data_idx,
-                                             title, combined_filename, namespaces)
+                                             title, namespaces)
                 else:
                     logger.info(f"Present: {created_at} | {file_name}")
                     self.file_present_df(files_list, namespaces, combined_filename, title, xmls_list, df_dict)
diff --git a/xmls_generator_production.py b/xmls_generator_production.py
index e87792a38..ccc466791 100644
--- a/xmls_generator_production.py
+++ b/xmls_generator_production.py
@@ -1,9 +1,11 @@
+import sys
 import time
+import warnings
 from datetime import datetime, timedelta
-import sys
+
 from loguru import logger
-import warnings
 from openai.error import APIError, PermissionError, AuthenticationError, InvalidAPIType, ServiceUnavailableError
+
 from src.config import ES_INDEX
 from src.elasticsearch_utils import ElasticSearchClient
 from src.xml_utils import GenerateXML
@@ -11,8 +13,12 @@
 warnings.filterwarnings("ignore")
 
 if __name__ == "__main__":
+
+    # Instantiate the objects used to generate JSON/XML and to connect to Elasticsearch
     gen = GenerateXML()
     elastic_search = ElasticSearchClient()
+
+    # URLs of mailing lists and forums
     dev_urls = [
         "https://lists.linuxfoundation.org/pipermail/bitcoin-dev/",
         "https://lists.linuxfoundation.org/pipermail/lightning-dev/",
@@ -20,15 +26,17 @@
         "https://gnusha.org/pi/bitcoindev/"
     ]
 
+    # Set the date range for data extraction
     end_date = datetime.now()
     start_date = end_date - timedelta(days=30)
 
-    # yyyy-mm-dd
     end_date_str = end_date.strftime("%Y-%m-%d")
     start_date_str = start_date.strftime("%Y-%m-%d")
+
     logger.info(f"start_data: {start_date_str}")
     logger.info(f"end_date_str: {end_date_str}")
 
+    # Process each URL in the dev_urls list
     for dev_url in dev_urls:
         data_list = elastic_search.extract_data_from_es(
             ES_INDEX, dev_url, start_date_str, end_date_str, exclude_combined_summary_docs=True