From 87be4b899a7c533637a0b2650909f405bd80c985 Mon Sep 17 00:00:00 2001 From: urvishp80 Date: Fri, 26 Apr 2024 10:46:32 -0600 Subject: [PATCH] Enhanced code and added comments --- .../workflows/homepage_json_gen_cron_job.yml | 2 +- README.md | 2 +- ...mepage_xml.py => generate_homepage_json.py | 764 +++++++++--------- generate_weekly_newsletter_json.py | 103 ++- push_combined_summary_to_es.py | 16 +- push_summary_to_es.py | 15 +- src/config.py | 2 +- src/xml_utils.py | 129 +-- xmls_generator_production.py | 14 +- 9 files changed, 541 insertions(+), 506 deletions(-) rename generate_homepage_xml.py => generate_homepage_json.py (75%) diff --git a/.github/workflows/homepage_json_gen_cron_job.yml b/.github/workflows/homepage_json_gen_cron_job.yml index b6493212b..545c85313 100644 --- a/.github/workflows/homepage_json_gen_cron_job.yml +++ b/.github/workflows/homepage_json_gen_cron_job.yml @@ -37,7 +37,7 @@ jobs: pip install -r requirements.txt - name: Execute Python script - run: python generate_homepage_xml.py + run: python generate_homepage_json.py - name: Configure Git run: | diff --git a/README.md b/README.md index 6e5ad1a99..1b3315129 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Utilizing data collected by the [scraper](https://github.com/bitcoinsearch/scrap - Queries Elasticsearch for documents lacking summaries, extracts summaries from corresponding XML files, and then updates these documents with their summaries in the Elasticsearch index. 3. Daily [Push Combined Summary From XML Files to ES INDEX](.github/workflows/push_combined_summary_to_es_cron_job.yml) ([source](push_combined_summary_to_es.py)) - Processes each combined thread summary XML file, transforming it into a document format, checks for its existence in Elasticsearch, and updates or inserts the document as needed. -4. Daily [Python Homepage Update Script](.github/workflows/homepage_json_gen_cron_job.yml) ([source](generate_homepage_xml.py)) +4. 
Daily [Python Homepage Update Script](.github/workflows/homepage_json_gen_cron_job.yml) ([source](generate_homepage_json.py)) - Queries the last 7 days of data from Elasticsearch for each source to compile lists of active threads, recent threads, and historical threads for 'Today in History'. It generates a summary of recent threads if available; otherwise, for active threads. The resulting [`homepage.json`](static/homepage.json) is then committed to GitHub to be used by [Bitcoin TLDR](https://github.com/bitcoinsearch/tldr). 5. Weekly [Python Newsletter Generation Script](.github/workflows/weekly_newsletter_gen_cron_job.yml) ([source](generate_weekly_newsletter_json.py)) - Generates a newsletter by compiling lists of new and active threads from the past week's data for each source. It generates a summary of new threads if available; otherwise, for active threads. The resulting [`newsletter.json`](static/newsletters/newsletter.json) is then committed to GitHub to be used by [Bitcoin TLDR](https://github.com/bitcoinsearch/tldr). 
diff --git a/generate_homepage_xml.py b/generate_homepage_json.py similarity index 75% rename from generate_homepage_xml.py rename to generate_homepage_json.py index ec79e966d..f29626a75 100644 --- a/generate_homepage_xml.py +++ b/generate_homepage_json.py @@ -1,372 +1,392 @@ -import random -import time -import traceback -from datetime import datetime, timedelta -from loguru import logger -import os -import sys -import warnings -import json -from tqdm import tqdm - -from src.config import ES_INDEX -from src.elasticsearch_utils import ElasticSearchClient -from src.json_utils import GenerateJSON -from src.xml_utils import GenerateXML -from src.utils import month_dict - -warnings.filterwarnings("ignore") - - -def page_data_handling(data_list: list, get_unique_per_dev=False): - page_data = [] - collected_dev_data = [] - for data in tqdm(data_list): - try: - # check and generate any missing file - xml_gen.start(dict_data=[data], url=data['_source']['domain']) - entry_data = gen.create_single_entry(data, look_for_combined_summary=True) - - if get_unique_per_dev: - if entry_data['dev_name'] not in collected_dev_data: - collected_dev_data.append(entry_data['dev_name']) - logger.info(f"collected data for: {collected_dev_data}") - page_data.append(entry_data) - else: - page_data.append(entry_data) - except Exception as ex: - logger.error( - f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}") - return page_data - - -if __name__ == "__main__": - - gen = GenerateJSON() - xml_gen = GenerateXML() - elastic_search = ElasticSearchClient() - dev_urls = [ - ["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/", - "https://gnusha.org/pi/bitcoindev/"], - "https://lists.linuxfoundation.org/pipermail/lightning-dev/", - "https://delvingbitcoin.org/" - ] - - current_date = datetime.now() - current_date_str = current_date.strftime("%Y-%m-%d") - - start_date = current_date - timedelta(days=7) - start_date_str = start_date.strftime("%Y-%m-%d") - 
logger.info(f"start_date: {start_date_str}") - logger.info(f"current_date_str: {current_date_str}") - - month_name = month_dict[int(current_date.month)] - str_month_year = f"{month_name}_{int(current_date.year)}" - - json_file_path = fr"static/homepage.json" - - recent_data_list = [] - active_data_list = [] - today_in_history_data_list = [] - history_data_collected_from_yesterday = False - - random_years_ago = None - - for dev_url in dev_urls: - logger.info(f"Working on URL: {dev_url}") - fetch_today_in_history = True - - data_list = elastic_search.extract_data_from_es( - ES_INDEX, dev_url, start_date_str, current_date_str, exclude_combined_summary_docs=True - ) - - if isinstance(dev_url, list): - dev_name = dev_url[0].split("/")[-2] - else: - dev_name = dev_url.split("/")[-2] - - logger.success(f"TOTAL THREADS RECEIVED FOR - '{dev_name}': {len(data_list)}") - - seen_titles = set() - - # TOP ACTIVE POSTS - active_posts_data = elastic_search.filter_top_active_posts( - es_results=data_list, top_n=10 - ) - - active_posts_data_counter = 0 - for data in active_posts_data: - if active_posts_data_counter >= 3: - break - - title = data['_source']['title'] - if title in seen_titles: - continue - seen_titles.add(title) - - # get the first post's info of this title - original_post = elastic_search.get_earliest_posts_by_title( - es_index=ES_INDEX, url=dev_url, title=title - ) - - counts, contributors = elastic_search.es_fetch_contributors_and_threads( - es_index=ES_INDEX, title=title, domain=dev_url - ) - - # if you want to show the first post of each selected title, - # then do the below operations on - 'original_post', else on 'data' - for author in original_post['_source']['authors']: - contributors.remove(author) - original_post['_source']['n_threads'] = counts - original_post['_source']['contributors'] = contributors - original_post['_source']['dev_name'] = dev_name - active_data_list.append(original_post) - active_posts_data_counter += 1 - - logger.success(f"Number of 
active posts collected: {len(active_data_list)}, for URL: {dev_url}") - - # TOP RECENT POSTS - recent_data_post_counter = 0 - recent_posts_data = elastic_search.filter_top_recent_posts(es_results=data_list, top_n=20) - - for data in recent_posts_data: - # if preprocess body text not longer than token_threshold, skip that post - if not gen.is_body_text_long(data=data, sent_threshold=2): - logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}") - continue - - title = data['_source']['title'] - if title in seen_titles: - continue - seen_titles.add(title) - if recent_data_post_counter >= 3: - break - - counts, contributors = elastic_search.es_fetch_contributors_and_threads( - es_index=ES_INDEX, title=title, domain=dev_url - ) - # exclude the post authors - for author in data['_source']['authors']: - contributors.remove(author) - data['_source']['n_threads'] = counts - data['_source']['contributors'] = contributors - data['_source']['dev_name'] = dev_name - recent_data_list.append(data) - recent_data_post_counter += 1 - - if not recent_data_list: - for data in recent_posts_data: - # if preprocess body text not longer than token_threshold, skip that post - if not gen.is_body_text_long(data=data, sent_threshold=2): - logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}") - continue - - title = data['_source']['title'] - if recent_data_post_counter >= 3: - break - counts, contributors = elastic_search.es_fetch_contributors_and_threads( - es_index=ES_INDEX, title=title, domain=dev_url - ) - # exclude the post authors - for author in data['_source']['authors']: - contributors.remove(author) - data['_source']['n_threads'] = counts - data['_source']['contributors'] = contributors - data['_source']['dev_name'] = dev_name - recent_data_list.append(data) - recent_data_post_counter += 1 - - logger.success(f"Number of recent posts collected: {len(recent_data_list)}, for URL: {dev_url}") - - # TODAY IN HISTORY POSTS - 
logger.info(f"fetching 'Today in history' posts... ") - - if not random_years_ago: - at_least_years_ago = 3 - at_max_years_ago = current_date.year - 2015 - random_years_ago = random.randint(at_least_years_ago, at_max_years_ago) - logger.info(f"random years ago between {at_least_years_ago} to {at_max_years_ago}: {random_years_ago}") - - if dev_url == "https://delvingbitcoin.org/": - random_years_ago = random.randint(1, current_date.year - 2022) - logger.info( - f"for delving-bitcoin - random years ago between {1} to {current_date.year - 2022}: {random_years_ago}") - - default_days_to_look_back = 6 - loop_counter = 1 - - while fetch_today_in_history: - days_to_look_back = default_days_to_look_back * loop_counter - selected_random_date = current_date - timedelta(days=365 * random_years_ago) - - start_of_time = selected_random_date - timedelta(days=selected_random_date.weekday()) - end_of_time = start_of_time + timedelta(days=days_to_look_back) - - start_of_time_str = start_of_time.strftime("%Y-%m-%dT%H:%M:%S") - end_of_time_str = end_of_time.strftime("%Y-%m-%dT%H:%M:%S") - - logger.info(f"collecting the data from {days_to_look_back} days range ... || Start of week: {start_of_time} | " - f"End of week: {end_of_time}") - - selected_threads = elastic_search.fetch_data_in_date_range( - es_index=ES_INDEX, - start_date=start_of_time_str, - end_date=end_of_time_str, - domain=dev_url - ) - - if len(selected_threads) > 0: - for doc in selected_threads: - doc_title = doc['_source']['title'] - doc_created_at = doc['_source']['created_at'] - - if doc['_source']['type'] == 'original_post': - - counts, contributors = elastic_search.es_fetch_contributors_and_threads( - es_index=ES_INDEX, title=doc_title, domain=dev_url - ) - - if counts < 5: - logger.info(f"No. of replies are less than 5, skipping it... 
") - continue - - if contributors: - # exclude the post authors - for author in doc['_source']['authors']: - contributors.remove(author) - doc['_source']['n_threads'] = counts - doc['_source']['contributors'] = contributors - doc['_source']['dev_name'] = dev_name - today_in_history_data_list.append(doc) - logger.info(f"collected doc created on: {doc_created_at} || TITLE: {doc_title}") - fetch_today_in_history = False - break - loop_counter += 1 - - # add history data from yesterday's homepage.json - if not today_in_history_data_list: - logger.info("Collecting yesterday's history threads!") - current_directory = os.getcwd() - full_path = os.path.join(current_directory, json_file_path) - if os.path.exists(full_path): - with open(full_path, 'r') as j: - try: - data = json.load(j) - except Exception as e: - logger.info(f"Error reading json file:{full_path} :: {e}") - data = {} - today_in_history_data_list.extend(data.get('today_in_history_posts', [])) - history_data_collected_from_yesterday = True - - logger.success(f"No. of 'Today in history' posts collected: {len(today_in_history_data_list)}") - - current_directory = os.getcwd() - full_path = os.path.join(current_directory, json_file_path) - if os.path.exists(full_path): - with open(full_path, 'r') as j: - try: - yesterday_data = json.load(j) - except Exception as e: - logger.info(f"Error reading json file:{full_path} :: {e}") - yesterday_data = {} - - xml_ids_title = gen.get_existing_json_title(file_path=json_file_path) - recent_post_ids = [data['_source']['title'] for data in recent_data_list] - active_post_ids = [data['_source']['title'] for data in active_data_list] - all_post_titles = set(recent_post_ids + active_post_ids) - - if all_post_titles != set(xml_ids_title): - logger.info("changes found in recent posts ... 
") - - delay = 5 - count = 0 - - while True: - try: - logger.info( - f"active posts: {len(active_data_list)}, " - f"recent posts: {len(recent_data_list)}, " - f"today in history posts: {len(today_in_history_data_list)}" - ) - logger.info("Creating homepage.json file ... ") - - recent_post_summ = "" - if len(active_data_list) > 0 or len(recent_data_list) > 0: - - # header summary - if len(recent_data_list) > 0: - recent_post_summ = gen.generate_recent_posts_summary(recent_data_list) - else: - recent_post_summ = gen.generate_recent_posts_summary(active_data_list) - logger.success(recent_post_summ) - - # recent data - recent_page_data = page_data_handling(recent_data_list) - - # active data - active_page_data = page_data_handling(active_data_list) - - else: - logger.error(f"'Active' and 'Recent' data list empty! Please check the data again.") - recent_page_data, active_page_data = [], [] - - # today in history - if history_data_collected_from_yesterday: - logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.") - today_in_history_data = yesterday_data.get('today_in_history_posts', []) - else: - if len(today_in_history_data_list) > 0: - today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True) - else: - logger.error(f"'Today in history' data list empty! 
Please check the data again.") - today_in_history_data = [] - - json_string = { - "header_summary": recent_post_summ, - "recent_posts": recent_page_data, - "active_posts": active_page_data, - "today_in_history_posts": today_in_history_data - } - gen.write_json_file(json_string, json_file_path) - - archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json" - gen.store_file_in_archive(json_file_path, archive_json_file_path) - break - - except Exception as ex: - logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}") - time.sleep(delay) - count += 1 - if count > 1: - sys.exit(f"{ex}") - else: - logger.info("No change in 'Recent' or 'Active' posts.") - rewrite_json_file = False - - # update today in history and save file if no change in Recent or Active posts - if history_data_collected_from_yesterday: - logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.") - today_in_history_data = yesterday_data.get('today_in_history_posts', []) - else: - rewrite_json_file = True - if len(today_in_history_data_list) > 0: - today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True) - else: - logger.error(f"'Today in history' data list empty! 
Please check the data again.") - today_in_history_data = [] - - if rewrite_json_file: - logger.info(f'Rewriting the homepage.json file') - json_string = { - "header_summary": yesterday_data.get('header_summary', []), - "recent_posts": yesterday_data.get('recent_posts', []), - "active_posts": yesterday_data.get('recent_posts', []), - "today_in_history_posts": today_in_history_data - } - gen.write_json_file(json_string, json_file_path) - else: - logger.info("No need to rewrite homepage.json file") - - if os.path.exists(full_path): - archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json" - gen.store_file_in_archive(json_file_path, archive_json_file_path) +import json +import os +import random +import sys +import time +import traceback +import warnings +from datetime import datetime, timedelta + +from loguru import logger +from tqdm import tqdm + +from src.config import ES_INDEX +from src.elasticsearch_utils import ElasticSearchClient +from src.json_utils import GenerateJSON +from src.utils import month_dict +from src.xml_utils import GenerateXML + +warnings.filterwarnings("ignore") + + +def page_data_handling(data_list: list, get_unique_per_dev=False): + page_data = [] + collected_dev_data = [] + for data in tqdm(data_list): + try: + # Generate all XML files for each given title, if not present + xml_gen.start(dict_data=[data], url=data['_source']['domain']) + entry_data = json_gen.create_single_entry(data, look_for_combined_summary=True) + if get_unique_per_dev: # Ensure that there is only one document per domain + if entry_data['dev_name'] not in collected_dev_data: + collected_dev_data.append(entry_data['dev_name']) + logger.info(f"collected data for: {collected_dev_data}") + page_data.append(entry_data) + else: + page_data.append(entry_data) + except Exception as ex: + logger.error( + f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}") + return page_data + + +if __name__ == "__main__": + + 
# Instantiating objects for generating JSON, XML and connecting to ElasticSearch + json_gen = GenerateJSON() + xml_gen = GenerateXML() + elastic_search = ElasticSearchClient() + + # URLs of mailing lists and forums + dev_urls = [ + ["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/", + "https://gnusha.org/pi/bitcoindev/"], + "https://lists.linuxfoundation.org/pipermail/lightning-dev/", + "https://delvingbitcoin.org/" + ] + + # Set the date range for data extraction + current_date = datetime.now() + start_date = current_date - timedelta(days=7) + + start_date_str = start_date.strftime("%Y-%m-%d") + current_date_str = current_date.strftime("%Y-%m-%d") + + logger.info(f"start_date: {start_date_str}") + logger.info(f"current_date_str: {current_date_str}") + + # Convert month from number to name for filename construction + month_name = month_dict[int(current_date.month)] + str_month_year = f"{month_name}_{int(current_date.year)}" + + recent_data_list = [] + active_data_list = [] + today_in_history_data_list = [] + history_data_collected_from_yesterday = False + random_years_ago = None + + # path to the stored homepage.json file + json_file_path = fr"static/homepage.json" + + # Process each URL in the dev_urls list + for dev_url in dev_urls: + logger.info(f"Working on URL: {dev_url}") + fetch_today_in_history = True + + # Fetch docs from an elasticsearch index + data_list = elastic_search.extract_data_from_es( + ES_INDEX, dev_url, start_date_str, current_date_str, exclude_combined_summary_docs=True + ) + + dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2] + logger.success(f"Retrieved {len(data_list)} threads for {dev_name}") + + seen_titles = set() + + # TOP ACTIVE POSTS + active_posts_data = elastic_search.filter_top_active_posts( + es_results=data_list, top_n=10 + ) + + # Collect N active posts per domain + active_posts_data_counter = 0 + for data in active_posts_data: + if active_posts_data_counter >= 3: + break + + 
title = data['_source']['title'] + if title in seen_titles: + continue + seen_titles.add(title) + + # Fetch the first post for given title and domain + original_post = elastic_search.get_earliest_posts_by_title( + es_index=ES_INDEX, url=dev_url, title=title + ) + + # Gather post counts for given title and its total contributors + counts, contributors = elastic_search.es_fetch_contributors_and_threads( + es_index=ES_INDEX, title=title, domain=dev_url + ) + + # As we want to show the original/first post of the filtered active post, + # we are parsing information from 'original_post', + # otherwise we would parse the information from 'data' if we want to show the filtered post itself + + # Separate out an original author from contributor's list + for author in original_post['_source']['authors']: + contributors.remove(author) + original_post['_source']['n_threads'] = counts + original_post['_source']['contributors'] = contributors + original_post['_source']['dev_name'] = dev_name + active_data_list.append(original_post) + active_posts_data_counter += 1 + + logger.success(f"Number of active posts collected: {len(active_data_list)}, for URL: {dev_url}") + + # TOP RECENT POSTS + recent_data_post_counter = 0 + recent_posts_data = elastic_search.filter_top_recent_posts(es_results=data_list, top_n=20) + + for data in recent_posts_data: + # If preprocessed body text shorter than token_threshold, skip the doc + if not json_gen.is_body_text_long(data=data, sent_threshold=2): + logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}") + continue + + title = data['_source']['title'] + if title in seen_titles: + continue + seen_titles.add(title) + + # Collect N recent posts per domain + if recent_data_post_counter >= 3: + break + + # Gather post counts for given title and its total contributors + counts, contributors = elastic_search.es_fetch_contributors_and_threads( + es_index=ES_INDEX, title=title, domain=dev_url + ) + + # Separate out an original author 
from contributor's list + for author in data['_source']['authors']: + contributors.remove(author) + data['_source']['n_threads'] = counts + data['_source']['contributors'] = contributors + data['_source']['dev_name'] = dev_name + recent_data_list.append(data) + recent_data_post_counter += 1 + + if not recent_data_list: + for data in recent_posts_data: + # If the preprocessed body text shorter than token_threshold, skip that post + if not json_gen.is_body_text_long(data=data, sent_threshold=2): + logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}") + continue + + title = data['_source']['title'] + # Collect N recent posts per domain + if recent_data_post_counter >= 3: + break + counts, contributors = elastic_search.es_fetch_contributors_and_threads( + es_index=ES_INDEX, title=title, domain=dev_url + ) + + # Separate out an original author from contributor's list + for author in data['_source']['authors']: + contributors.remove(author) + data['_source']['n_threads'] = counts + data['_source']['contributors'] = contributors + data['_source']['dev_name'] = dev_name + recent_data_list.append(data) + recent_data_post_counter += 1 + + logger.success(f"Number of recent posts collected: {len(recent_data_list)}, for URL: {dev_url}") + + # TODAY IN HISTORY POSTS + logger.info(f"fetching 'Today in history' posts... 
") + + # Randomly choose a number N within given range and look back N for the data N years ago + # for bitcoin-dev and lighting-dev we have data from 2015, and for delving-bitcoin we have it from 2022 + if not random_years_ago: + at_least_years_ago = 3 + at_max_years_ago = current_date.year - 2015 + random_years_ago = random.randint(at_least_years_ago, at_max_years_ago) + logger.info(f"Random years ago between {at_least_years_ago} to {at_max_years_ago}: {random_years_ago}") + + if dev_url == "https://delvingbitcoin.org/": + random_years_ago = random.randint(1, current_date.year - 2022) + logger.info( + f"for delving-bitcoin - random years ago between {1} to {current_date.year - 2022}: {random_years_ago}") + + default_days_to_look_back = 6 + loop_counter = 1 + + while fetch_today_in_history: + days_to_look_back = default_days_to_look_back * loop_counter + selected_random_date = current_date - timedelta(days=365 * random_years_ago) + + start_of_time = selected_random_date - timedelta(days=selected_random_date.weekday()) + end_of_time = start_of_time + timedelta(days=days_to_look_back) + + start_of_time_str = start_of_time.strftime("%Y-%m-%dT%H:%M:%S") + end_of_time_str = end_of_time.strftime("%Y-%m-%dT%H:%M:%S") + + logger.info( + f"collecting the data from {days_to_look_back} days range ... || Start of week: {start_of_time} | " + f"End of week: {end_of_time}") + + selected_threads = elastic_search.fetch_data_in_date_range( + es_index=ES_INDEX, + start_date=start_of_time_str, + end_date=end_of_time_str, + domain=dev_url + ) + + if len(selected_threads) > 0: + for doc in selected_threads: + doc_title = doc['_source']['title'] + doc_created_at = doc['_source']['created_at'] + + if doc['_source']['type'] == 'original_post': + + counts, contributors = elastic_search.es_fetch_contributors_and_threads( + es_index=ES_INDEX, title=doc_title, domain=dev_url + ) + + if counts < 5: + logger.info(f"No. of replies are less than 5, skipping it... 
") + continue + + if contributors: + # Separate out an original author from contributor's list + for author in doc['_source']['authors']: + contributors.remove(author) + doc['_source']['n_threads'] = counts + doc['_source']['contributors'] = contributors + doc['_source']['dev_name'] = dev_name + today_in_history_data_list.append(doc) + logger.info(f"collected doc created on: {doc_created_at} || TITLE: {doc_title}") + fetch_today_in_history = False + break + loop_counter += 1 + + # If not data found for given time period, collect the history data from stored homepage.json file + if not today_in_history_data_list: + logger.info("Collecting yesterday's history threads!") + current_directory = os.getcwd() + full_path = os.path.join(current_directory, json_file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as j: + try: + data = json.load(j) + except Exception as e: + logger.info(f"Error reading json file:{full_path} :: {e}") + data = {} + today_in_history_data_list.extend(data.get('today_in_history_posts', [])) + history_data_collected_from_yesterday = True + + logger.success(f"No. of 'Today in history' posts collected: {len(today_in_history_data_list)}") + + # Determine if there's any update in the data as compared to stored JSON file + current_directory = os.getcwd() + full_path = os.path.join(current_directory, json_file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as j: + try: + yesterday_data = json.load(j) + except Exception as e: + logger.info(f"Error reading json file:{full_path} :: {e}") + yesterday_data = {} + + stored_json_titles = json_gen.get_existing_json_title(file_path=json_file_path) + collected_post_titles = set([data['_source']['title'] for data in recent_data_list] + + [data['_source']['title'] for data in active_data_list]) + + if collected_post_titles != set(stored_json_titles): + logger.info("Changes found as compared to previously stored JSON file... 
") + + delay = 5 + count = 0 + + while True: + try: + logger.info( + f"Active posts: {len(active_data_list)}, " + f"Recent posts: {len(recent_data_list)}, " + f"Today in history posts: {len(today_in_history_data_list)}" + ) + logger.info("Creating homepage.json file ... ") + + recent_post_summ = "" + if len(active_data_list) > 0 or len(recent_data_list) > 0: + + # Generate the header summary from recent posts, + # and if no recent data is collected then from active posts + if len(recent_data_list) > 0: + recent_post_summ = json_gen.generate_recent_posts_summary(recent_data_list) + else: + recent_post_summ = json_gen.generate_recent_posts_summary(active_data_list) + logger.success(recent_post_summ) + + # Compile recent posts data + recent_page_data = page_data_handling(recent_data_list) + + # Compile active posts data + active_page_data = page_data_handling(active_data_list) + + else: + logger.error(f"'Active' and 'Recent' data list empty! Please check the data again.") + recent_page_data, active_page_data = [], [] + + # Compile today in history posts + if history_data_collected_from_yesterday: + logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.") + today_in_history_data = yesterday_data.get('today_in_history_posts', []) + else: + if len(today_in_history_data_list) > 0: + today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True) + else: + logger.error(f"'Today in history' data list empty! 
Please check the data again.") + today_in_history_data = [] + + json_string = { + "header_summary": recent_post_summ, + "recent_posts": recent_page_data, + "active_posts": active_page_data, + "today_in_history_posts": today_in_history_data + } + json_gen.write_json_file(json_string, json_file_path) + + archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json" + json_gen.store_file_in_archive(json_file_path, archive_json_file_path) + break + + except Exception as ex: + logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}") + time.sleep(delay) + count += 1 + if count > 1: + sys.exit(f"{ex}") + else: + # If no changes found in Recent or Active posts, + # simply gather all data from yesterday's stored json file and save it with an updated name in the archive directory + logger.info("No change in 'Recent' or 'Active' posts.") + rewrite_json_file = False + + if history_data_collected_from_yesterday: + logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.") + today_in_history_data = yesterday_data.get('today_in_history_posts', []) + else: + rewrite_json_file = True + if len(today_in_history_data_list) > 0: + today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True) + else: + logger.error(f"'Today in history' data list empty! 
Please check the data again.") + today_in_history_data = [] + + if rewrite_json_file: + logger.info(f'Rewriting the homepage.json file') + json_string = { + "header_summary": yesterday_data.get('header_summary', []), + "recent_posts": yesterday_data.get('recent_posts', []), + "active_posts": yesterday_data.get('active_posts', []), + "today_in_history_posts": today_in_history_data + } + json_gen.write_json_file(json_string, json_file_path) + else: + logger.info("No need to rewrite homepage.json file") + + if os.path.exists(full_path): + archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json" + json_gen.store_file_in_archive(json_file_path, archive_json_file_path) diff --git a/generate_weekly_newsletter_json.py b/generate_weekly_newsletter_json.py index ce5de7781..fc8b98801 100644 --- a/generate_weekly_newsletter_json.py +++ b/generate_weekly_newsletter_json.py @@ -1,23 +1,27 @@ +import json +import os +import sys import time import traceback from datetime import datetime, timedelta + from loguru import logger -import os -import sys -import json from tqdm import tqdm from src.config import ES_INDEX from src.elasticsearch_utils import ElasticSearchClient from src.json_utils import GenerateJSON -from src.xml_utils import GenerateXML from src.utils import month_dict +from src.xml_utils import GenerateXML if __name__ == "__main__": - gen = GenerateJSON() + # Instantiating objects for generating JSON, XML and connecting to ElasticSearch + json_gen = GenerateJSON() xml_gen = GenerateXML() elastic_search = ElasticSearchClient() + + # URLs for development mailing lists and forums dev_urls = [ ["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/", "https://gnusha.org/pi/bitcoindev/"], @@ -25,40 +29,35 @@ "https://delvingbitcoin.org/" ] + # Set the date range for data extraction: last week to yesterday. 
current_date = datetime.now() - current_date_str = current_date.strftime("%Y-%m-%d") - start_date = current_date - timedelta(days=7) - start_date_str = start_date.strftime("%Y-%m-%d") - end_date = current_date - timedelta(days=1) + + current_date_str = current_date.strftime("%Y-%m-%d") + start_date_str = start_date.strftime("%Y-%m-%d") end_date_str = end_date.strftime("%Y-%m-%d") logger.info(f"Newsletter publish date: {current_date_str}") logger.info(f"Gathering data for newsletter from {start_date_str} to {end_date_str}") + # Convert month from number to name for filename construction month_name = month_dict[int(current_date.month)] str_month_year = f"{month_name}_{int(current_date.year)}" active_data_list = [] new_threads_list = [] + # Process each URL in the dev_urls list for dev_url in dev_urls: - data_list = elastic_search.extract_data_from_es( ES_INDEX, dev_url, start_date_str, end_date_str, exclude_combined_summary_docs=True ) - if isinstance(dev_url, list): - dev_name = dev_url[0].split("/")[-2] - else: - dev_name = dev_url.split("/")[-2] - - logger.success(f"TOTAL THREADS RECEIVED FOR '{dev_name}': {len(data_list)}") + dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2] + logger.success(f"Retrieved {len(data_list)} threads for {dev_name}") # NEW THREADS POSTS - # @TODO you already identify the original post by type==original_post - # so you could get the posts in order by date and check if the original posts is there seen_titles = set() for i in data_list: this_title = i['_source']['title'] @@ -66,27 +65,25 @@ continue seen_titles.add(this_title) - # check if the first post for this title is in the past week - original_post = elastic_search.get_earliest_posts_by_title(es_index=ES_INDEX, url=dev_url, title=this_title) - - if original_post['_source'] and i['_source']['created_at'] == original_post['_source']['created_at']: - logger.success(f"new thread created on: {original_post['_source']['created_at']} || TITLE: 
{this_title}") + # Check if any new thread started in given week + if i['_source']['type'] == 'original_post': + logger.success(f"New thread created on: {i['_source']['created_at']} || TITLE: {this_title}") counts, contributors = elastic_search.es_fetch_contributors_and_threads( es_index=ES_INDEX, title=this_title, domain=dev_url ) - + # Separate an original author and contributors for author in i['_source']['authors']: contributors.remove(author) i['_source']['n_threads'] = counts i['_source']['contributors'] = contributors i['_source']['dev_name'] = dev_name new_threads_list.append(i) - logger.info(f"number of new threads started this week: {len(new_threads_list)}") + logger.info(f"No. of new threads started this week: {len(new_threads_list)}") # TOP ACTIVE POSTS active_posts_data = elastic_search.filter_top_active_posts(es_results=data_list, top_n=15) - logger.info(f"number of filtered top active post: {len(active_posts_data)}") + logger.info(f"No. of filtered top active post: {len(active_posts_data)}") new_threads_titles_list = [i['_source']['title'] for i in new_threads_list] @@ -103,14 +100,15 @@ seen_titles.add(title) active_data_list.append(data) # active_posts_data_counter += 1 - logger.info(f"number of active posts collected: {len(active_data_list)}") + logger.info(f"No. 
of active posts collected: {len(active_data_list)}") - # gather titles of docs from json file + # Determine if there's any update in the data compared to stored JSON + # Gather titles from stored JSON file json_file_path = fr"static/newsletters/newsletter.json" current_directory = os.getcwd() json_full_path = os.path.join(current_directory, json_file_path) - json_xml_ids = set() + stored_json_titles = set() if os.path.exists(json_full_path): with open(json_full_path, 'r') as j: try: @@ -119,22 +117,22 @@ logger.info(f"Error reading json file:{json_full_path} :: {e}") json_data = {} - json_xml_ids = set( + stored_json_titles = set( [item['title'] for item in json_data.get('new_threads_this_week', [])] + [item['title'] for item in json_data.get('active_posts_this_week', [])] ) else: logger.warning(f"No existing newsletter.json file found: {json_full_path}") - # gather ids of docs from active posts and new thread posts - filtered_docs_ids = set( + # Gather titles from collected Active data and New Threads list + collected_json_titles = set( [data['_source']['title'] for data in active_data_list] + [data['_source']['title'] for data in new_threads_list] ) - # check if there are any updates in the xml file - if filtered_docs_ids != json_xml_ids: - logger.info("changes found in recent posts ... ") + # Generate a new newsletter.json file if changes found in stored JSON file + if collected_json_titles != stored_json_titles: + logger.info("Changes found as compared to previously stored JSON file... ") delay = 5 count = 0 @@ -144,23 +142,21 @@ logger.success(f"Total no. of active posts collected: {len(active_data_list)}") logger.success(f"Total no. of new threads started this week: {len(new_threads_list)}") - logger.info("creating newsletter.json file ... ") + logger.info("Creating newsletter.json file ... 
") if len(active_data_list) > 0 or len(new_threads_list) > 0: + # Prepare New Threads data for newsletter new_threads_page_data = [] - active_page_data = [] new_threads_summary = "" - if new_threads_list: - new_threads_summary += gen.generate_recent_posts_summary(new_threads_list, verbose=True) + new_threads_summary += json_gen.generate_recent_posts_summary(new_threads_list, verbose=True) logger.success(new_threads_summary) for data in tqdm(new_threads_list): try: - # check and generate any missing file + # Generate all XML files for given title, if not present xml_gen.start(dict_data=[data], url=data['_source']['domain']) - - entry_data = gen.create_single_entry( + entry_data = json_gen.create_single_entry( data, base_url_for_xml="https://tldr.bitcoinsearch.xyz/summary", look_for_combined_summary=True, @@ -173,16 +169,17 @@ else: logger.warning(f"No new threads started this week, generating summary of active posts this " f"week ...") - # if no new threads started this week, generate summary from active post this week - new_threads_summary += gen.generate_recent_posts_summary(active_data_list) + # If no new threads started this week, generate summary from active posts of the given week + new_threads_summary += json_gen.generate_recent_posts_summary(active_data_list) logger.success(new_threads_summary) + # Prepare active posts data for newsletter + active_page_data = [] for data in tqdm(active_data_list): try: - # check and generate any missing file + # Generate all XML files for given title, if not present xml_gen.start(dict_data=[data], url=data['_source']['domain']) - - entry_data = gen.create_single_entry( + entry_data = json_gen.create_single_entry( data, base_url_for_xml="https://tldr.bitcoinsearch.xyz/summary", look_for_combined_summary=True, remove_xml_extension=True ) @@ -191,19 +188,17 @@ logger.error( f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}") + # Compile and save data for newsletter file json_string = { 
"summary_of_threads_started_this_week": new_threads_summary, "new_threads_this_week": new_threads_page_data, "active_posts_this_week": active_page_data } - gen.write_json_file(json_string, json_file_path) - + json_gen.write_json_file(json_string, json_file_path) archive_json_file_path = fr"static/newsletters/{str_month_year}/{current_date_str}-newsletter.json" - gen.store_file_in_archive(json_file_path, archive_json_file_path) - + json_gen.store_file_in_archive(json_file_path, archive_json_file_path) else: logger.error(f"Data list empty! Please check the data again.") - break except Exception as ex: logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}") @@ -212,8 +207,8 @@ if count > 1: sys.exit(f"{ex}") else: + # If no changes found in stored JSON file, save the previous one with updated name in the archive directory logger.success("No change in the posts, no need to update newsletter.json file") - # save the previous one with updated name in archive if os.path.exists(json_full_path): archive_json_file_path = fr"static/newsletters/{str_month_year}/{current_date_str}-newsletter.json" - gen.store_file_in_archive(json_file_path, archive_json_file_path) + json_gen.store_file_in_archive(json_file_path, archive_json_file_path) diff --git a/push_combined_summary_to_es.py b/push_combined_summary_to_es.py index 018a838de..c3a53a70b 100644 --- a/push_combined_summary_to_es.py +++ b/push_combined_summary_to_es.py @@ -13,10 +13,11 @@ REMOVE_TIMESTAMPS_IN_AUTHORS = True + # Instantiating objects for reading XML and connecting to ElasticSearch xml_reader = XMLReader() elastic_search = ElasticSearchClient() - total_combined_files = [] + # Static directory names to look into for respective combined summary xml files static_dirs = [ 'bitcoin-dev', 'lightning-dev', @@ -24,29 +25,30 @@ ] pattern = "combined*.xml" + total_combined_files = [] for static_dir in static_dirs: combined_files = glob.glob(f"static/{static_dir}/**/{pattern}") 
total_combined_files.extend(combined_files) logger.info(f"Total combined files: {(len(total_combined_files))}") - # get unique combined file paths + # Get unique combined file paths total_combined_files_dict = {os.path.splitext(os.path.basename(i))[0]: i for i in total_combined_files} - logger.info(f"Total unique combined files: {len(total_combined_files_dict)}") + # Loop through all locally stored combined summary XML files and insert/update them accordingly for file_name, full_path in tqdm.tqdm(total_combined_files_dict.items()): try: - # get data from xml file + # Get data from xml file xml_file_data = xml_reader.read_xml_file(full_path) if REMOVE_TIMESTAMPS_IN_AUTHORS: - # remove timestamps from author's names and collect unique names only + # Remove timestamps from author's names and collect unique names only xml_file_data['authors'] = remove_timestamps_from_author_names(xml_file_data['authors']) - # check if doc exist in ES index + # Check if doc exist in ES index doc_exists = elastic_search.es_client.exists(index=ES_INDEX, id=file_name) - # insert the doc in ES index if it does not exist, else update it + # Insert the doc in ES index if it does not exist, else update it if not doc_exists: res = elastic_search.es_client.index( index=ES_INDEX, diff --git a/push_summary_to_es.py b/push_summary_to_es.py index 930e46c0b..6ea9e0480 100644 --- a/push_summary_to_es.py +++ b/push_summary_to_es.py @@ -6,14 +6,15 @@ from src.elasticsearch_utils import ElasticSearchClient from src.xml_utils import XMLReader - if __name__ == "__main__": APPLY_DATE_RANGE = False + # Instantiating objects for reading XML and connecting to ElasticSearch xml_reader = XMLReader() elastic_search = ElasticSearchClient() + # URLs for development mailing lists and forums dev_urls = [ "https://lists.linuxfoundation.org/pipermail/bitcoin-dev/", "https://lists.linuxfoundation.org/pipermail/lightning-dev/", @@ -21,8 +22,9 @@ "https://gnusha.org/pi/bitcoindev/" ] + # Process each URL in the dev_urls 
list for dev_url in dev_urls: - + # Set the date range for data extraction if APPLY_DATE_RANGE: current_date_str = None if not current_date_str: @@ -35,21 +37,20 @@ start_date_str = None current_date_str = None + # Fetch doc with an empty summary field docs_list = elastic_search.fetch_data_with_empty_summary(ES_INDEX, dev_url, start_date_str, current_date_str) - if isinstance(dev_url, list): - dev_name = dev_url[0].split("/")[-2] - else: - dev_name = dev_url.split("/")[-2] - + dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2] logger.success(f"Total threads received with empty summary for '{dev_name}': {len(docs_list)}") + # Loop through all fetched docs and update them by adding the summary from xml files for doc in tqdm.tqdm(docs_list): res = None try: doc_id = doc['_id'] doc_index = doc['_index'] if not doc['_source'].get('summary'): + # Get summary text from locally stored XML files xml_summary = xml_reader.get_xml_summary(doc, dev_name) if xml_summary: elastic_search.es_client.update( diff --git a/src/config.py b/src/config.py index 9b2dba280..a07c31892 100644 --- a/src/config.py +++ b/src/config.py @@ -1,7 +1,7 @@ import os -import openai import warnings +import openai import tiktoken from dotenv import load_dotenv diff --git a/src/xml_utils.py b/src/xml_utils.py index cf4fe58fe..a74873d9e 100644 --- a/src/xml_utils.py +++ b/src/xml_utils.py @@ -1,23 +1,24 @@ -import re -import pandas as pd -from feedgen.feed import FeedGenerator -from tqdm import tqdm +import glob +import os import platform +import re import shutil +import traceback +import xml.etree.ElementTree as ET from datetime import datetime, timezone + +import pandas as pd import pytz -import glob -import xml.etree.ElementTree as ET -import os -import traceback +from feedgen.feed import FeedGenerator from loguru import logger +from tqdm import tqdm +from src.config import ES_INDEX +from src.elasticsearch_utils import ElasticSearchClient +from src.gpt_utils 
import create_summary from src.utils import preprocess_email, month_dict, get_id, clean_title, convert_to_tuple, create_folder, \ remove_multiple_whitespaces, add_utc_if_not_present -from src.gpt_utils import create_summary -from src.config import ES_INDEX -from src.elasticsearch_utils import ElasticSearchClient elastic_search = ElasticSearchClient() @@ -149,8 +150,6 @@ def append_columns(self, df_dict, file, title, namespace): # The title is directly provided as a parameter df_dict["title"].append(title) - # formatted_file_name = file.split("/static")[1] - # logger.info(formatted_file_name) # Parse the XML file to extract and append relevant data tree = ET.parse(file) @@ -174,51 +173,46 @@ def append_columns(self, df_dict, file, title, namespace): df_dict["authors"].append([author_result.strip()]) def file_not_present_df(self, columns, source_cols, df_dict, files_list, dict_data, data, - title, combined_filename, namespace): + title, namespace): """ Processes data directly from the given document (`data`) as no XML summary is available for that document. Also, for each individual summary (XML file) that already exists for the given thread, extracts and appends its content to the dictionary. 
""" - # Append basic data from dict_data for each column into df_dict + # Append basic data from dict_data for each column into df_dict using list comprehension for col in columns: df_dict[col].append(dict_data[data][col]) + # Processing source_cols with conditional append for col in source_cols: + value = dict_data[data]['_source'][col] if "created_at" in col: - datetime_obj = add_utc_if_not_present(dict_data[data]['_source'][col], iso_format=False) - df_dict[col].append(datetime_obj) - else: - df_dict[col].append(dict_data[data]['_source'][col]) + value = add_utc_if_not_present(value, iso_format=False) + df_dict[col].append(value) + + # Iterate over files with transformed file paths + files_list = [file.replace("\\", "/") for file in files_list] + + # Use dictionary to store parsed XML trees to avoid redundant parsing + parsed_files = {} # For each individual summary (XML file) that exists for the # given thread, extract and append their content to the dictionary - # TODO: - # This method is called for every post without a summary, which means that - # existing inidividual summaries for a thread are added n-1 times the amount - # of new posts in the thread at the time of execution of the cron job. - # this is not an issue because we then drop duplicates, but it's extra complexity. 
for file in files_list: - file = file.replace("\\", "/") if os.path.exists(file): - tree = ET.parse(file) - root = tree.getroot() + if file not in parsed_files: + tree = ET.parse(file) + root = tree.getroot() + parsed_files[file] = (tree, root) + + tree, root = parsed_files[file] file_title = root.find('atom:entry/atom:title', namespace).text if title == file_title: self.append_columns(df_dict, file, title, namespace) - if combined_filename in file: - # TODO: the code will never reach this point - # as we are already filtering per thread title so no - # "Combined summary - X" filename will pass though - tree = ET.parse(file) - root = tree.getroot() - summary = root.find('atom:entry/atom:summary', namespace).text - df_dict["body"].append(summary) - else: - summary = root.find('atom:entry/atom:summary', namespace).text - df_dict["body"].append(summary) + summary = root.find('atom:entry/atom:summary', namespace).text + df_dict["body"].append(summary) else: logger.info(f"file not present: {file}") @@ -230,39 +224,45 @@ def file_present_df(self, files_list, namespace, combined_filename, title, indiv summary exists, it extracts the content of individual summaries, appending it to the data dictionary. 
""" - combined_file_fullpath = None # the combined XML file if found # List to keep track of the month folders that contain # the XML files for the posts of the current thread - month_folders = [] + month_folders = set() + + # Cached listdir calls to avoid repeated disk access + folder_contents = {} + + # Identifying combined file and processing individual summaries in a single loop + combined_file_fullpath = None - # Iterate through the list of local XML file paths for file in files_list: - file = file.replace("\\", "/") + normalized_file = file.replace("\\", "/") # Check if the current file is the combined XML file for the thread - if combined_filename in file: - combined_file_fullpath = file + if combined_filename in normalized_file: + combined_file_fullpath = normalized_file # Parse the XML file to find the title and compare it with the current title # in order to understand if the post/file is part of the current thread - tree = ET.parse(file) + tree = ET.parse(normalized_file) root = tree.getroot() file_title = root.find('atom:entry/atom:title', namespace).text # If titles match, add the file to the list of relevant XMLs and track its month folder if title == file_title: - individual_summaries_xmls_list.append(file) - month_folder_path = "/".join(file.split("/")[:-1]) - if month_folder_path not in month_folders: - month_folders.append(month_folder_path) + individual_summaries_xmls_list.append(normalized_file) + month_folder_path = "/".join(normalized_file.split("/")[:-1]) + month_folders.add(month_folder_path) # Ensure the combined XML file is copied to all relevant month folders for month_folder in month_folders: - if combined_file_fullpath and combined_filename not in os.listdir(month_folder): - if combined_filename not in os.listdir(month_folder): - shutil.copy(combined_file_fullpath, month_folder) + if month_folder not in folder_contents: + folder_contents[month_folder] = os.listdir(month_folder) + + if combined_file_fullpath and combined_filename not in 
folder_contents[month_folder]: + shutil.copy(combined_file_fullpath, month_folder) # If individual summaries exist but no combined summary, # extract and append their content to the dictionary - if len(individual_summaries_xmls_list) > 0 and not any(combined_filename in item for item in files_list): - logger.info("individual summaries are present but not combined ones ...") + combined_exists = any(combined_filename in item for item in files_list) + if individual_summaries_xmls_list and not combined_exists: + logger.info("Individual summaries are present but not combined ones.") for file in individual_summaries_xmls_list: self.append_columns(df_dict, file, title, namespace) tree = ET.parse(file) @@ -283,12 +283,18 @@ def get_local_xml_file_paths(self, dev_url): files_list = glob.glob(os.path.join(current_directory, "static", directory, "**/*.xml"), recursive=True) return files_list + def get_local_xml_file_paths_for_title(self, dev_url, title): + """ + Retrieve paths for all relevant local XML files based on the given domain and title + """ + current_directory = os.getcwd() + directory = get_base_directory(dev_url) + files_list = glob.glob(os.path.join(current_directory, "static", directory, f"**/*{title}.xml"), recursive=True) + return files_list + def generate_new_emails_df(self, main_dict_data, dev_url): # Define XML namespace for parsing XML files namespaces = {'atom': 'http://www.w3.org/2005/Atom'} - - # Retrieve all existing XML files (summaries) for the given source - files_list = self.get_local_xml_file_paths(dev_url) # Initialize a dictionary to store data for DataFrame construction, with predefined columns columns = ['_index', '_id', '_score'] @@ -297,9 +303,9 @@ def generate_new_emails_df(self, main_dict_data, dev_url): df_dict = {col: [] for col in (columns + source_cols)} seen_titles = set() - # Process each document in the input data + # Process each document in the input data for idx in range(len(main_dict_data)): - xmls_list = [] # the existing XML 
files for the thread that the fetched document is part of + xmls_list = [] # the existing XML files for the thread that the fetched document is part of thread_title = main_dict_data[idx]["_source"]["title"] if thread_title in seen_titles: continue @@ -322,11 +328,14 @@ def generate_new_emails_df(self, main_dict_data, dev_url): combined_filename = f"combined_{xml_name}.xml" created_at = title_dict_data[data_idx]["_source"]["created_at"] + # Retrieve all existing XML files (summaries) for the given source and title + files_list = self.get_local_xml_file_paths_for_title(dev_url=dev_url, title=xml_name) + # Check if the XML file for the document exists if not any(file_name in item for item in files_list): logger.info(f"Not present: {created_at} | {file_name}") self.file_not_present_df(columns, source_cols, df_dict, files_list, title_dict_data, data_idx, - title, combined_filename, namespaces) + title, namespaces) else: logger.info(f"Present: {created_at} | {file_name}") self.file_present_df(files_list, namespaces, combined_filename, title, xmls_list, df_dict) diff --git a/xmls_generator_production.py b/xmls_generator_production.py index e87792a38..ccc466791 100644 --- a/xmls_generator_production.py +++ b/xmls_generator_production.py @@ -1,9 +1,11 @@ +import sys import time +import warnings from datetime import datetime, timedelta -import sys + from loguru import logger -import warnings from openai.error import APIError, PermissionError, AuthenticationError, InvalidAPIType, ServiceUnavailableError + from src.config import ES_INDEX from src.elasticsearch_utils import ElasticSearchClient from src.xml_utils import GenerateXML @@ -11,8 +13,12 @@ warnings.filterwarnings("ignore") if __name__ == "__main__": + + # Instantiating objects for generating JSON, XML and connecting to ElasticSearch gen = GenerateXML() elastic_search = ElasticSearchClient() + + # URLs of mailing lists and forums dev_urls = [ "https://lists.linuxfoundation.org/pipermail/bitcoin-dev/", 
"https://lists.linuxfoundation.org/pipermail/lightning-dev/", @@ -20,15 +26,17 @@ "https://gnusha.org/pi/bitcoindev/" ] + # Set the date range for data extraction end_date = datetime.now() start_date = end_date - timedelta(days=30) - # yyyy-mm-dd end_date_str = end_date.strftime("%Y-%m-%d") start_date_str = start_date.strftime("%Y-%m-%d") + logger.info(f"start_data: {start_date_str}") logger.info(f"end_date_str: {end_date_str}") + # Process each URL in the dev_urls list for dev_url in dev_urls: data_list = elastic_search.extract_data_from_es( ES_INDEX, dev_url, start_date_str, end_date_str, exclude_combined_summary_docs=True