From 5e0915b2b8f4df0366e7e638374e8e1b5cc7d696 Mon Sep 17 00:00:00 2001 From: urvishp80 Date: Fri, 26 Apr 2024 10:31:15 -0600 Subject: [PATCH] Enhanced code and added comments --- .../workflows/homepage_json_gen_cron_job.yml | 2 +- README.md | 2 +- ...mepage_xml.py => generate_homepage_json.py | 764 +++++++++--------- generate_weekly_newsletter_json.py | 103 ++- push_combined_summary_to_es.py | 16 +- push_summary_to_es.py | 15 +- src/config.py | 2 +- src/xml_utils.py | 129 +-- static/homepage.json | 105 +-- xmls_generator_production.py | 14 +- 10 files changed, 595 insertions(+), 557 deletions(-) rename generate_homepage_xml.py => generate_homepage_json.py (75%) diff --git a/.github/workflows/homepage_json_gen_cron_job.yml b/.github/workflows/homepage_json_gen_cron_job.yml index b6493212b..545c85313 100644 --- a/.github/workflows/homepage_json_gen_cron_job.yml +++ b/.github/workflows/homepage_json_gen_cron_job.yml @@ -37,7 +37,7 @@ jobs: pip install -r requirements.txt - name: Execute Python script - run: python generate_homepage_xml.py + run: python generate_homepage_json.py - name: Configure Git run: | diff --git a/README.md b/README.md index 6e5ad1a99..1b3315129 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Utilizing data collected by the [scraper](https://github.com/bitcoinsearch/scrap - Queries Elasticsearch for documents lacking summaries, extracts summaries from corresponding XML files, and then updates these documents with their summaries in the Elasticsearch index. 3. Daily [Push Combined Summary From XML Files to ES INDEX](.github/workflows/push_combined_summary_to_es_cron_job.yml) ([source](push_combined_summary_to_es.py)) - Processes each combined thread summary XML file, transforming it into a document format, checks for its existence in Elasticsearch, and updates or inserts the document as needed. -4. Daily [Python Homepage Update Script](.github/workflows/homepage_json_gen_cron_job.yml) ([source](generate_homepage_xml.py)) +4. Daily [Python Homepage Update Script](.github/workflows/homepage_json_gen_cron_job.yml) ([source](generate_homepage_json.py)) - Queries the last 7 days of data from Elasticsearch for each source to compile lists of active threads, recent threads, and historical threads for 'Today in History'. It generates a summary of recent threads if available; otherwise, for active threads. The resulting [`homepage.json`](static/homepage.json) is then committed to GitHub to be used by [Bitcoin TLDR](https://github.com/bitcoinsearch/tldr). 5. Weekly [Python Newsletter Generation Script](.github/workflows/weekly_newsletter_gen_cron_job.yml) ([source](generate_weekly_newsletter_json.py)) - Generates a newsletter by compiling lists of new and active threads from the past week's data for each source. It generates a summary of new threads if available; otherwise, for active threads. The resulting [`newsletter.json`](static/newsletters/newsletter.json) is then committed to GitHub to be used by [Bitcoin TLDR](https://github.com/bitcoinsearch/tldr). 
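For orientation, the sketch below illustrates the kind of 7-day range query that `ElasticSearchClient.extract_data_from_es` presumably issues against `ES_INDEX` for each source before the homepage script groups the hits into recent and active threads. The client internals are not part of this patch, so the endpoint, index name, and exact query fields ("domain", "created_at", and the "Combined summary" title prefix behind `exclude_combined_summary_docs=True`) are illustrative assumptions only, not the project's actual implementation.

from datetime import datetime, timedelta

from elasticsearch import Elasticsearch

# Placeholder endpoint and index name; the real values come from src/config.py
# and the ElasticSearchClient wrapper, which are not shown in this patch.
es = Elasticsearch("http://localhost:9200")

current_date = datetime.now()
start_date = current_date - timedelta(days=7)

# Restrict results to one source domain and to documents created in the last 7 days,
# and (presumably) exclude the combined-summary documents the pipeline generates itself.
query = {
    "bool": {
        "must": [
            {"term": {"domain.keyword": "https://delvingbitcoin.org/"}},
            {"range": {"created_at": {"gte": start_date.strftime("%Y-%m-%d"),
                                      "lte": current_date.strftime("%Y-%m-%d")}}},
        ],
        "must_not": [
            {"prefix": {"title.keyword": "Combined summary"}}
        ],
    }
}

response = es.search(index="bitcoin-summaries-index", query=query, size=1000)
docs = response["hits"]["hits"]  # same shape as the data_list the script iterates over
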
diff --git a/generate_homepage_xml.py b/generate_homepage_json.py similarity index 75% rename from generate_homepage_xml.py rename to generate_homepage_json.py index ec79e966d..f29626a75 100644 --- a/generate_homepage_xml.py +++ b/generate_homepage_json.py @@ -1,372 +1,392 @@ -import random -import time -import traceback -from datetime import datetime, timedelta -from loguru import logger -import os -import sys -import warnings -import json -from tqdm import tqdm - -from src.config import ES_INDEX -from src.elasticsearch_utils import ElasticSearchClient -from src.json_utils import GenerateJSON -from src.xml_utils import GenerateXML -from src.utils import month_dict - -warnings.filterwarnings("ignore") - - -def page_data_handling(data_list: list, get_unique_per_dev=False): - page_data = [] - collected_dev_data = [] - for data in tqdm(data_list): - try: - # check and generate any missing file - xml_gen.start(dict_data=[data], url=data['_source']['domain']) - entry_data = gen.create_single_entry(data, look_for_combined_summary=True) - - if get_unique_per_dev: - if entry_data['dev_name'] not in collected_dev_data: - collected_dev_data.append(entry_data['dev_name']) - logger.info(f"collected data for: {collected_dev_data}") - page_data.append(entry_data) - else: - page_data.append(entry_data) - except Exception as ex: - logger.error( - f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}") - return page_data - - -if __name__ == "__main__": - - gen = GenerateJSON() - xml_gen = GenerateXML() - elastic_search = ElasticSearchClient() - dev_urls = [ - ["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/", - "https://gnusha.org/pi/bitcoindev/"], - "https://lists.linuxfoundation.org/pipermail/lightning-dev/", - "https://delvingbitcoin.org/" - ] - - current_date = datetime.now() - current_date_str = current_date.strftime("%Y-%m-%d") - - start_date = current_date - timedelta(days=7) - start_date_str = start_date.strftime("%Y-%m-%d") - logger.info(f"start_date: {start_date_str}") - logger.info(f"current_date_str: {current_date_str}") - - month_name = month_dict[int(current_date.month)] - str_month_year = f"{month_name}_{int(current_date.year)}" - - json_file_path = fr"static/homepage.json" - - recent_data_list = [] - active_data_list = [] - today_in_history_data_list = [] - history_data_collected_from_yesterday = False - - random_years_ago = None - - for dev_url in dev_urls: - logger.info(f"Working on URL: {dev_url}") - fetch_today_in_history = True - - data_list = elastic_search.extract_data_from_es( - ES_INDEX, dev_url, start_date_str, current_date_str, exclude_combined_summary_docs=True - ) - - if isinstance(dev_url, list): - dev_name = dev_url[0].split("/")[-2] - else: - dev_name = dev_url.split("/")[-2] - - logger.success(f"TOTAL THREADS RECEIVED FOR - '{dev_name}': {len(data_list)}") - - seen_titles = set() - - # TOP ACTIVE POSTS - active_posts_data = elastic_search.filter_top_active_posts( - es_results=data_list, top_n=10 - ) - - active_posts_data_counter = 0 - for data in active_posts_data: - if active_posts_data_counter >= 3: - break - - title = data['_source']['title'] - if title in seen_titles: - continue - seen_titles.add(title) - - # get the first post's info of this title - original_post = elastic_search.get_earliest_posts_by_title( - es_index=ES_INDEX, url=dev_url, title=title - ) - - counts, contributors = elastic_search.es_fetch_contributors_and_threads( - es_index=ES_INDEX, title=title, domain=dev_url - ) - - # if you want to show the first post of 
each selected title, - # then do the below operations on - 'original_post', else on 'data' - for author in original_post['_source']['authors']: - contributors.remove(author) - original_post['_source']['n_threads'] = counts - original_post['_source']['contributors'] = contributors - original_post['_source']['dev_name'] = dev_name - active_data_list.append(original_post) - active_posts_data_counter += 1 - - logger.success(f"Number of active posts collected: {len(active_data_list)}, for URL: {dev_url}") - - # TOP RECENT POSTS - recent_data_post_counter = 0 - recent_posts_data = elastic_search.filter_top_recent_posts(es_results=data_list, top_n=20) - - for data in recent_posts_data: - # if preprocess body text not longer than token_threshold, skip that post - if not gen.is_body_text_long(data=data, sent_threshold=2): - logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}") - continue - - title = data['_source']['title'] - if title in seen_titles: - continue - seen_titles.add(title) - if recent_data_post_counter >= 3: - break - - counts, contributors = elastic_search.es_fetch_contributors_and_threads( - es_index=ES_INDEX, title=title, domain=dev_url - ) - # exclude the post authors - for author in data['_source']['authors']: - contributors.remove(author) - data['_source']['n_threads'] = counts - data['_source']['contributors'] = contributors - data['_source']['dev_name'] = dev_name - recent_data_list.append(data) - recent_data_post_counter += 1 - - if not recent_data_list: - for data in recent_posts_data: - # if preprocess body text not longer than token_threshold, skip that post - if not gen.is_body_text_long(data=data, sent_threshold=2): - logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}") - continue - - title = data['_source']['title'] - if recent_data_post_counter >= 3: - break - counts, contributors = elastic_search.es_fetch_contributors_and_threads( - es_index=ES_INDEX, title=title, domain=dev_url - ) - # exclude the post authors - for author in data['_source']['authors']: - contributors.remove(author) - data['_source']['n_threads'] = counts - data['_source']['contributors'] = contributors - data['_source']['dev_name'] = dev_name - recent_data_list.append(data) - recent_data_post_counter += 1 - - logger.success(f"Number of recent posts collected: {len(recent_data_list)}, for URL: {dev_url}") - - # TODAY IN HISTORY POSTS - logger.info(f"fetching 'Today in history' posts... 
") - - if not random_years_ago: - at_least_years_ago = 3 - at_max_years_ago = current_date.year - 2015 - random_years_ago = random.randint(at_least_years_ago, at_max_years_ago) - logger.info(f"random years ago between {at_least_years_ago} to {at_max_years_ago}: {random_years_ago}") - - if dev_url == "https://delvingbitcoin.org/": - random_years_ago = random.randint(1, current_date.year - 2022) - logger.info( - f"for delving-bitcoin - random years ago between {1} to {current_date.year - 2022}: {random_years_ago}") - - default_days_to_look_back = 6 - loop_counter = 1 - - while fetch_today_in_history: - days_to_look_back = default_days_to_look_back * loop_counter - selected_random_date = current_date - timedelta(days=365 * random_years_ago) - - start_of_time = selected_random_date - timedelta(days=selected_random_date.weekday()) - end_of_time = start_of_time + timedelta(days=days_to_look_back) - - start_of_time_str = start_of_time.strftime("%Y-%m-%dT%H:%M:%S") - end_of_time_str = end_of_time.strftime("%Y-%m-%dT%H:%M:%S") - - logger.info(f"collecting the data from {days_to_look_back} days range ... || Start of week: {start_of_time} | " - f"End of week: {end_of_time}") - - selected_threads = elastic_search.fetch_data_in_date_range( - es_index=ES_INDEX, - start_date=start_of_time_str, - end_date=end_of_time_str, - domain=dev_url - ) - - if len(selected_threads) > 0: - for doc in selected_threads: - doc_title = doc['_source']['title'] - doc_created_at = doc['_source']['created_at'] - - if doc['_source']['type'] == 'original_post': - - counts, contributors = elastic_search.es_fetch_contributors_and_threads( - es_index=ES_INDEX, title=doc_title, domain=dev_url - ) - - if counts < 5: - logger.info(f"No. of replies are less than 5, skipping it... ") - continue - - if contributors: - # exclude the post authors - for author in doc['_source']['authors']: - contributors.remove(author) - doc['_source']['n_threads'] = counts - doc['_source']['contributors'] = contributors - doc['_source']['dev_name'] = dev_name - today_in_history_data_list.append(doc) - logger.info(f"collected doc created on: {doc_created_at} || TITLE: {doc_title}") - fetch_today_in_history = False - break - loop_counter += 1 - - # add history data from yesterday's homepage.json - if not today_in_history_data_list: - logger.info("Collecting yesterday's history threads!") - current_directory = os.getcwd() - full_path = os.path.join(current_directory, json_file_path) - if os.path.exists(full_path): - with open(full_path, 'r') as j: - try: - data = json.load(j) - except Exception as e: - logger.info(f"Error reading json file:{full_path} :: {e}") - data = {} - today_in_history_data_list.extend(data.get('today_in_history_posts', [])) - history_data_collected_from_yesterday = True - - logger.success(f"No. 
of 'Today in history' posts collected: {len(today_in_history_data_list)}") - - current_directory = os.getcwd() - full_path = os.path.join(current_directory, json_file_path) - if os.path.exists(full_path): - with open(full_path, 'r') as j: - try: - yesterday_data = json.load(j) - except Exception as e: - logger.info(f"Error reading json file:{full_path} :: {e}") - yesterday_data = {} - - xml_ids_title = gen.get_existing_json_title(file_path=json_file_path) - recent_post_ids = [data['_source']['title'] for data in recent_data_list] - active_post_ids = [data['_source']['title'] for data in active_data_list] - all_post_titles = set(recent_post_ids + active_post_ids) - - if all_post_titles != set(xml_ids_title): - logger.info("changes found in recent posts ... ") - - delay = 5 - count = 0 - - while True: - try: - logger.info( - f"active posts: {len(active_data_list)}, " - f"recent posts: {len(recent_data_list)}, " - f"today in history posts: {len(today_in_history_data_list)}" - ) - logger.info("Creating homepage.json file ... ") - - recent_post_summ = "" - if len(active_data_list) > 0 or len(recent_data_list) > 0: - - # header summary - if len(recent_data_list) > 0: - recent_post_summ = gen.generate_recent_posts_summary(recent_data_list) - else: - recent_post_summ = gen.generate_recent_posts_summary(active_data_list) - logger.success(recent_post_summ) - - # recent data - recent_page_data = page_data_handling(recent_data_list) - - # active data - active_page_data = page_data_handling(active_data_list) - - else: - logger.error(f"'Active' and 'Recent' data list empty! Please check the data again.") - recent_page_data, active_page_data = [], [] - - # today in history - if history_data_collected_from_yesterday: - logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.") - today_in_history_data = yesterday_data.get('today_in_history_posts', []) - else: - if len(today_in_history_data_list) > 0: - today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True) - else: - logger.error(f"'Today in history' data list empty! Please check the data again.") - today_in_history_data = [] - - json_string = { - "header_summary": recent_post_summ, - "recent_posts": recent_page_data, - "active_posts": active_page_data, - "today_in_history_posts": today_in_history_data - } - gen.write_json_file(json_string, json_file_path) - - archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json" - gen.store_file_in_archive(json_file_path, archive_json_file_path) - break - - except Exception as ex: - logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}") - time.sleep(delay) - count += 1 - if count > 1: - sys.exit(f"{ex}") - else: - logger.info("No change in 'Recent' or 'Active' posts.") - rewrite_json_file = False - - # update today in history and save file if no change in Recent or Active posts - if history_data_collected_from_yesterday: - logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.") - today_in_history_data = yesterday_data.get('today_in_history_posts', []) - else: - rewrite_json_file = True - if len(today_in_history_data_list) > 0: - today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True) - else: - logger.error(f"'Today in history' data list empty! 
Please check the data again.") - today_in_history_data = [] - - if rewrite_json_file: - logger.info(f'Rewriting the homepage.json file') - json_string = { - "header_summary": yesterday_data.get('header_summary', []), - "recent_posts": yesterday_data.get('recent_posts', []), - "active_posts": yesterday_data.get('recent_posts', []), - "today_in_history_posts": today_in_history_data - } - gen.write_json_file(json_string, json_file_path) - else: - logger.info("No need to rewrite homepage.json file") - - if os.path.exists(full_path): - archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json" - gen.store_file_in_archive(json_file_path, archive_json_file_path) +import json +import os +import random +import sys +import time +import traceback +import warnings +from datetime import datetime, timedelta + +from loguru import logger +from tqdm import tqdm + +from src.config import ES_INDEX +from src.elasticsearch_utils import ElasticSearchClient +from src.json_utils import GenerateJSON +from src.utils import month_dict +from src.xml_utils import GenerateXML + +warnings.filterwarnings("ignore") + + +def page_data_handling(data_list: list, get_unique_per_dev=False): + page_data = [] + collected_dev_data = [] + for data in tqdm(data_list): + try: + # Generate all XML files for each given title, if not present + xml_gen.start(dict_data=[data], url=data['_source']['domain']) + entry_data = json_gen.create_single_entry(data, look_for_combined_summary=True) + if get_unique_per_dev: # Ensure that there is only one document per domain + if entry_data['dev_name'] not in collected_dev_data: + collected_dev_data.append(entry_data['dev_name']) + logger.info(f"collected data for: {collected_dev_data}") + page_data.append(entry_data) + else: + page_data.append(entry_data) + except Exception as ex: + logger.error( + f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}") + return page_data + + +if __name__ == "__main__": + + # Instantiating objects for generating JSON, XML and connecting to ElasticSearch + json_gen = GenerateJSON() + xml_gen = GenerateXML() + elastic_search = ElasticSearchClient() + + # URLs of mailing lists and forums + dev_urls = [ + ["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/", + "https://gnusha.org/pi/bitcoindev/"], + "https://lists.linuxfoundation.org/pipermail/lightning-dev/", + "https://delvingbitcoin.org/" + ] + + # Set the date range for data extraction + current_date = datetime.now() + start_date = current_date - timedelta(days=7) + + start_date_str = start_date.strftime("%Y-%m-%d") + current_date_str = current_date.strftime("%Y-%m-%d") + + logger.info(f"start_date: {start_date_str}") + logger.info(f"current_date_str: {current_date_str}") + + # Convert month from number to name for filename construction + month_name = month_dict[int(current_date.month)] + str_month_year = f"{month_name}_{int(current_date.year)}" + + recent_data_list = [] + active_data_list = [] + today_in_history_data_list = [] + history_data_collected_from_yesterday = False + random_years_ago = None + + # path to the stored homepage.json file + json_file_path = fr"static/homepage.json" + + # Process each URL in the dev_urls list + for dev_url in dev_urls: + logger.info(f"Working on URL: {dev_url}") + fetch_today_in_history = True + + # Fetch docs from an elasticsearch index + data_list = elastic_search.extract_data_from_es( + ES_INDEX, dev_url, start_date_str, current_date_str, exclude_combined_summary_docs=True + ) + + dev_name = 
dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2] + logger.success(f"Retrieved {len(data_list)} threads for {dev_name}") + + seen_titles = set() + + # TOP ACTIVE POSTS + active_posts_data = elastic_search.filter_top_active_posts( + es_results=data_list, top_n=10 + ) + + # Collect N active posts per domain + active_posts_data_counter = 0 + for data in active_posts_data: + if active_posts_data_counter >= 3: + break + + title = data['_source']['title'] + if title in seen_titles: + continue + seen_titles.add(title) + + # Fetch the first post for given title and domain + original_post = elastic_search.get_earliest_posts_by_title( + es_index=ES_INDEX, url=dev_url, title=title + ) + + # Gather post counts for given title and its total contributors + counts, contributors = elastic_search.es_fetch_contributors_and_threads( + es_index=ES_INDEX, title=title, domain=dev_url + ) + + # As we want to show the original/first post of the filtered active post, + # we are parsing information from 'original_post', + # otherwise we would parse the information from 'data' if we want to show the filtered post itself + + # Separate out an original author from contributor's list + for author in original_post['_source']['authors']: + contributors.remove(author) + original_post['_source']['n_threads'] = counts + original_post['_source']['contributors'] = contributors + original_post['_source']['dev_name'] = dev_name + active_data_list.append(original_post) + active_posts_data_counter += 1 + + logger.success(f"Number of active posts collected: {len(active_data_list)}, for URL: {dev_url}") + + # TOP RECENT POSTS + recent_data_post_counter = 0 + recent_posts_data = elastic_search.filter_top_recent_posts(es_results=data_list, top_n=20) + + for data in recent_posts_data: + # If preprocessed body text shorter than token_threshold, skip the doc + if not json_gen.is_body_text_long(data=data, sent_threshold=2): + logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}") + continue + + title = data['_source']['title'] + if title in seen_titles: + continue + seen_titles.add(title) + + # Collect N recent posts per domain + if recent_data_post_counter >= 3: + break + + # Gather post counts for given title and its total contributors + counts, contributors = elastic_search.es_fetch_contributors_and_threads( + es_index=ES_INDEX, title=title, domain=dev_url + ) + + # Separate out an original author from contributor's list + for author in data['_source']['authors']: + contributors.remove(author) + data['_source']['n_threads'] = counts + data['_source']['contributors'] = contributors + data['_source']['dev_name'] = dev_name + recent_data_list.append(data) + recent_data_post_counter += 1 + + if not recent_data_list: + for data in recent_posts_data: + # If the preprocessed body text shorter than token_threshold, skip that post + if not json_gen.is_body_text_long(data=data, sent_threshold=2): + logger.info(f"skipping: {data['_source']['title']} - {data['_source']['url']}") + continue + + title = data['_source']['title'] + # Collect N recent posts per domain + if recent_data_post_counter >= 3: + break + counts, contributors = elastic_search.es_fetch_contributors_and_threads( + es_index=ES_INDEX, title=title, domain=dev_url + ) + + # Separate out an original author from contributor's list + for author in data['_source']['authors']: + contributors.remove(author) + data['_source']['n_threads'] = counts + data['_source']['contributors'] = contributors + data['_source']['dev_name'] = 
dev_name + recent_data_list.append(data) + recent_data_post_counter += 1 + + logger.success(f"Number of recent posts collected: {len(recent_data_list)}, for URL: {dev_url}") + + # TODAY IN HISTORY POSTS + logger.info(f"fetching 'Today in history' posts... ") + + # Randomly choose a number N within given range and look back N for the data N years ago + # for bitcoin-dev and lighting-dev we have data from 2015, and for delving-bitcoin we have it from 2022 + if not random_years_ago: + at_least_years_ago = 3 + at_max_years_ago = current_date.year - 2015 + random_years_ago = random.randint(at_least_years_ago, at_max_years_ago) + logger.info(f"Random years ago between {at_least_years_ago} to {at_max_years_ago}: {random_years_ago}") + + if dev_url == "https://delvingbitcoin.org/": + random_years_ago = random.randint(1, current_date.year - 2022) + logger.info( + f"for delving-bitcoin - random years ago between {1} to {current_date.year - 2022}: {random_years_ago}") + + default_days_to_look_back = 6 + loop_counter = 1 + + while fetch_today_in_history: + days_to_look_back = default_days_to_look_back * loop_counter + selected_random_date = current_date - timedelta(days=365 * random_years_ago) + + start_of_time = selected_random_date - timedelta(days=selected_random_date.weekday()) + end_of_time = start_of_time + timedelta(days=days_to_look_back) + + start_of_time_str = start_of_time.strftime("%Y-%m-%dT%H:%M:%S") + end_of_time_str = end_of_time.strftime("%Y-%m-%dT%H:%M:%S") + + logger.info( + f"collecting the data from {days_to_look_back} days range ... || Start of week: {start_of_time} | " + f"End of week: {end_of_time}") + + selected_threads = elastic_search.fetch_data_in_date_range( + es_index=ES_INDEX, + start_date=start_of_time_str, + end_date=end_of_time_str, + domain=dev_url + ) + + if len(selected_threads) > 0: + for doc in selected_threads: + doc_title = doc['_source']['title'] + doc_created_at = doc['_source']['created_at'] + + if doc['_source']['type'] == 'original_post': + + counts, contributors = elastic_search.es_fetch_contributors_and_threads( + es_index=ES_INDEX, title=doc_title, domain=dev_url + ) + + if counts < 5: + logger.info(f"No. of replies are less than 5, skipping it... ") + continue + + if contributors: + # Separate out an original author from contributor's list + for author in doc['_source']['authors']: + contributors.remove(author) + doc['_source']['n_threads'] = counts + doc['_source']['contributors'] = contributors + doc['_source']['dev_name'] = dev_name + today_in_history_data_list.append(doc) + logger.info(f"collected doc created on: {doc_created_at} || TITLE: {doc_title}") + fetch_today_in_history = False + break + loop_counter += 1 + + # If not data found for given time period, collect the history data from stored homepage.json file + if not today_in_history_data_list: + logger.info("Collecting yesterday's history threads!") + current_directory = os.getcwd() + full_path = os.path.join(current_directory, json_file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as j: + try: + data = json.load(j) + except Exception as e: + logger.info(f"Error reading json file:{full_path} :: {e}") + data = {} + today_in_history_data_list.extend(data.get('today_in_history_posts', [])) + history_data_collected_from_yesterday = True + + logger.success(f"No. 
of 'Today in history' posts collected: {len(today_in_history_data_list)}") + + # Determine if there's any update in the data as compared to stored JSON file + current_directory = os.getcwd() + full_path = os.path.join(current_directory, json_file_path) + if os.path.exists(full_path): + with open(full_path, 'r') as j: + try: + yesterday_data = json.load(j) + except Exception as e: + logger.info(f"Error reading json file:{full_path} :: {e}") + yesterday_data = {} + + stored_json_titles = json_gen.get_existing_json_title(file_path=json_file_path) + collected_post_titles = set([data['_source']['title'] for data in recent_data_list] + + [data['_source']['title'] for data in active_data_list]) + + if collected_post_titles != set(stored_json_titles): + logger.info("Changes found as compared to previously stored JSON file... ") + + delay = 5 + count = 0 + + while True: + try: + logger.info( + f"Active posts: {len(active_data_list)}, " + f"Recent posts: {len(recent_data_list)}, " + f"Today in history posts: {len(today_in_history_data_list)}" + ) + logger.info("Creating homepage.json file ... ") + + recent_post_summ = "" + if len(active_data_list) > 0 or len(recent_data_list) > 0: + + # Generate the header summary from recent posts, + # and if no recent data is collected then from active posts + if len(recent_data_list) > 0: + recent_post_summ = json_gen.generate_recent_posts_summary(recent_data_list) + else: + recent_post_summ = json_gen.generate_recent_posts_summary(active_data_list) + logger.success(recent_post_summ) + + # Compile recent posts data + recent_page_data = page_data_handling(recent_data_list) + + # Compile active posts data + active_page_data = page_data_handling(active_data_list) + + else: + logger.error(f"'Active' and 'Recent' data list empty! Please check the data again.") + recent_page_data, active_page_data = [], [] + + # Compile today in history posts + if history_data_collected_from_yesterday: + logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.") + today_in_history_data = yesterday_data.get('today_in_history_posts', []) + else: + if len(today_in_history_data_list) > 0: + today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True) + else: + logger.error(f"'Today in history' data list empty! 
Please check the data again.")
+                        today_in_history_data = []
+
+                json_string = {
+                    "header_summary": recent_post_summ,
+                    "recent_posts": recent_page_data,
+                    "active_posts": active_page_data,
+                    "today_in_history_posts": today_in_history_data
+                }
+                json_gen.write_json_file(json_string, json_file_path)
+
+                archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json"
+                json_gen.store_file_in_archive(json_file_path, archive_json_file_path)
+                break
+
+            except Exception as ex:
+                logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}")
+                time.sleep(delay)
+                count += 1
+                if count > 1:
+                    sys.exit(f"{ex}")
+    else:
+        # If no changes are found in 'Recent' or 'Active' posts, gather all data from
+        # yesterday's stored JSON file and save it under an updated name in the archive directory
+        logger.info("No change in 'Recent' or 'Active' posts.")
+        rewrite_json_file = False
+
+        if history_data_collected_from_yesterday:
+            logger.info("No change in 'Today in History' data posts, gathering data from yesterday's post.")
+            today_in_history_data = yesterday_data.get('today_in_history_posts', [])
+        else:
+            rewrite_json_file = True
+            if len(today_in_history_data_list) > 0:
+                today_in_history_data = page_data_handling(today_in_history_data_list, get_unique_per_dev=True)
+            else:
+                logger.error(f"'Today in history' data list empty! Please check the data again.")
+                today_in_history_data = []
+
+        if rewrite_json_file:
+            logger.info(f'Rewriting the homepage.json file')
+            json_string = {
+                "header_summary": yesterday_data.get('header_summary', []),
+                "recent_posts": yesterday_data.get('recent_posts', []),
+                "active_posts": yesterday_data.get('active_posts', []),
+                "today_in_history_posts": today_in_history_data
+            }
+            json_gen.write_json_file(json_string, json_file_path)
+        else:
+            logger.info("No need to rewrite homepage.json file")
+
+        if os.path.exists(full_path):
+            archive_json_file_path = fr"static/homepage/{str_month_year}/{current_date_str}-homepage.json"
+            json_gen.store_file_in_archive(json_file_path, archive_json_file_path)
diff --git a/generate_weekly_newsletter_json.py b/generate_weekly_newsletter_json.py index ce5de7781..fc8b98801 100644 --- a/generate_weekly_newsletter_json.py +++ b/generate_weekly_newsletter_json.py @@ -1,23 +1,27 @@ +import json +import os +import sys import time import traceback from datetime import datetime, timedelta + from loguru import logger -import os -import sys -import json from tqdm import tqdm from src.config import ES_INDEX from src.elasticsearch_utils import ElasticSearchClient from src.json_utils import GenerateJSON -from src.xml_utils import GenerateXML from src.utils import month_dict +from src.xml_utils import GenerateXML if __name__ == "__main__": - gen = GenerateJSON() + # Instantiating objects for generating JSON, XML and connecting to ElasticSearch + json_gen = GenerateJSON() + xml_gen = GenerateXML() + elastic_search = ElasticSearchClient() + + # URLs for development mailing lists and forums + dev_urls = [ ["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/", "https://gnusha.org/pi/bitcoindev/"], @@ -25,40 +29,35 @@ "https://delvingbitcoin.org/" ] + # Set the date range for data extraction: last week to yesterday. 
current_date = datetime.now() - current_date_str = current_date.strftime("%Y-%m-%d") - start_date = current_date - timedelta(days=7) - start_date_str = start_date.strftime("%Y-%m-%d") - end_date = current_date - timedelta(days=1) + + current_date_str = current_date.strftime("%Y-%m-%d") + start_date_str = start_date.strftime("%Y-%m-%d") end_date_str = end_date.strftime("%Y-%m-%d") logger.info(f"Newsletter publish date: {current_date_str}") logger.info(f"Gathering data for newsletter from {start_date_str} to {end_date_str}") + # Convert month from number to name for filename construction month_name = month_dict[int(current_date.month)] str_month_year = f"{month_name}_{int(current_date.year)}" active_data_list = [] new_threads_list = [] + # Process each URL in the dev_urls list for dev_url in dev_urls: - data_list = elastic_search.extract_data_from_es( ES_INDEX, dev_url, start_date_str, end_date_str, exclude_combined_summary_docs=True ) - if isinstance(dev_url, list): - dev_name = dev_url[0].split("/")[-2] - else: - dev_name = dev_url.split("/")[-2] - - logger.success(f"TOTAL THREADS RECEIVED FOR '{dev_name}': {len(data_list)}") + dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2] + logger.success(f"Retrieved {len(data_list)} threads for {dev_name}") # NEW THREADS POSTS - # @TODO you already identify the original post by type==original_post - # so you could get the posts in order by date and check if the original posts is there seen_titles = set() for i in data_list: this_title = i['_source']['title'] @@ -66,27 +65,25 @@ continue seen_titles.add(this_title) - # check if the first post for this title is in the past week - original_post = elastic_search.get_earliest_posts_by_title(es_index=ES_INDEX, url=dev_url, title=this_title) - - if original_post['_source'] and i['_source']['created_at'] == original_post['_source']['created_at']: - logger.success(f"new thread created on: {original_post['_source']['created_at']} || TITLE: {this_title}") + # Check if any new thread started in given week + if i['_source']['type'] == 'original_post': + logger.success(f"New thread created on: {i['_source']['created_at']} || TITLE: {this_title}") counts, contributors = elastic_search.es_fetch_contributors_and_threads( es_index=ES_INDEX, title=this_title, domain=dev_url ) - + # Separate an original author and contributors for author in i['_source']['authors']: contributors.remove(author) i['_source']['n_threads'] = counts i['_source']['contributors'] = contributors i['_source']['dev_name'] = dev_name new_threads_list.append(i) - logger.info(f"number of new threads started this week: {len(new_threads_list)}") + logger.info(f"No. of new threads started this week: {len(new_threads_list)}") # TOP ACTIVE POSTS active_posts_data = elastic_search.filter_top_active_posts(es_results=data_list, top_n=15) - logger.info(f"number of filtered top active post: {len(active_posts_data)}") + logger.info(f"No. of filtered top active post: {len(active_posts_data)}") new_threads_titles_list = [i['_source']['title'] for i in new_threads_list] @@ -103,14 +100,15 @@ seen_titles.add(title) active_data_list.append(data) # active_posts_data_counter += 1 - logger.info(f"number of active posts collected: {len(active_data_list)}") + logger.info(f"No. 
of active posts collected: {len(active_data_list)}") - # gather titles of docs from json file + # Determine if there's any update in the data compared to stored JSON + # Gather titles from stored JSON file json_file_path = fr"static/newsletters/newsletter.json" current_directory = os.getcwd() json_full_path = os.path.join(current_directory, json_file_path) - json_xml_ids = set() + stored_json_titles = set() if os.path.exists(json_full_path): with open(json_full_path, 'r') as j: try: @@ -119,22 +117,22 @@ logger.info(f"Error reading json file:{json_full_path} :: {e}") json_data = {} - json_xml_ids = set( + stored_json_titles = set( [item['title'] for item in json_data.get('new_threads_this_week', [])] + [item['title'] for item in json_data.get('active_posts_this_week', [])] ) else: logger.warning(f"No existing newsletter.json file found: {json_full_path}") - # gather ids of docs from active posts and new thread posts - filtered_docs_ids = set( + # Gather titles from collected Active data and New Threads list + collected_json_titles = set( [data['_source']['title'] for data in active_data_list] + [data['_source']['title'] for data in new_threads_list] ) - # check if there are any updates in the xml file - if filtered_docs_ids != json_xml_ids: - logger.info("changes found in recent posts ... ") + # Generate a new newsletter.json file if changes found in stored JSON file + if collected_json_titles != stored_json_titles: + logger.info("Changes found as compared to previously stored JSON file... ") delay = 5 count = 0 @@ -144,23 +142,21 @@ logger.success(f"Total no. of active posts collected: {len(active_data_list)}") logger.success(f"Total no. of new threads started this week: {len(new_threads_list)}") - logger.info("creating newsletter.json file ... ") + logger.info("Creating newsletter.json file ... 
") if len(active_data_list) > 0 or len(new_threads_list) > 0: + # Prepare New Threads data for newsletter new_threads_page_data = [] - active_page_data = [] new_threads_summary = "" - if new_threads_list: - new_threads_summary += gen.generate_recent_posts_summary(new_threads_list, verbose=True) + new_threads_summary += json_gen.generate_recent_posts_summary(new_threads_list, verbose=True) logger.success(new_threads_summary) for data in tqdm(new_threads_list): try: - # check and generate any missing file + # Generate all XML files for given title, if not present xml_gen.start(dict_data=[data], url=data['_source']['domain']) - - entry_data = gen.create_single_entry( + entry_data = json_gen.create_single_entry( data, base_url_for_xml="https://tldr.bitcoinsearch.xyz/summary", look_for_combined_summary=True, @@ -173,16 +169,17 @@ else: logger.warning(f"No new threads started this week, generating summary of active posts this " f"week ...") - # if no new threads started this week, generate summary from active post this week - new_threads_summary += gen.generate_recent_posts_summary(active_data_list) + # If no new threads started this week, generate summary from active posts of the given week + new_threads_summary += json_gen.generate_recent_posts_summary(active_data_list) logger.success(new_threads_summary) + # Prepare active posts data for newsletter + active_page_data = [] for data in tqdm(active_data_list): try: - # check and generate any missing file + # Generate all XML files for given title, if not present xml_gen.start(dict_data=[data], url=data['_source']['domain']) - - entry_data = gen.create_single_entry( + entry_data = json_gen.create_single_entry( data, base_url_for_xml="https://tldr.bitcoinsearch.xyz/summary", look_for_combined_summary=True, remove_xml_extension=True ) @@ -191,19 +188,17 @@ logger.error( f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}") + # Compile and save data for newsletter file json_string = { "summary_of_threads_started_this_week": new_threads_summary, "new_threads_this_week": new_threads_page_data, "active_posts_this_week": active_page_data } - gen.write_json_file(json_string, json_file_path) - + json_gen.write_json_file(json_string, json_file_path) archive_json_file_path = fr"static/newsletters/{str_month_year}/{current_date_str}-newsletter.json" - gen.store_file_in_archive(json_file_path, archive_json_file_path) - + json_gen.store_file_in_archive(json_file_path, archive_json_file_path) else: logger.error(f"Data list empty! 
Please check the data again.") - break except Exception as ex: logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}") @@ -212,8 +207,8 @@ if count > 1: sys.exit(f"{ex}") else: + # If no changes found in stored JSON file, save the previous one with updated name in the archive directory logger.success("No change in the posts, no need to update newsletter.json file") - # save the previous one with updated name in archive if os.path.exists(json_full_path): archive_json_file_path = fr"static/newsletters/{str_month_year}/{current_date_str}-newsletter.json" - gen.store_file_in_archive(json_file_path, archive_json_file_path) + json_gen.store_file_in_archive(json_file_path, archive_json_file_path) diff --git a/push_combined_summary_to_es.py b/push_combined_summary_to_es.py index 018a838de..c3a53a70b 100644 --- a/push_combined_summary_to_es.py +++ b/push_combined_summary_to_es.py @@ -13,10 +13,11 @@ REMOVE_TIMESTAMPS_IN_AUTHORS = True + # Instantiating objects for reading XML and connecting to ElasticSearch xml_reader = XMLReader() elastic_search = ElasticSearchClient() - total_combined_files = [] + # Static directory names to look into for respective combined summary xml files static_dirs = [ 'bitcoin-dev', 'lightning-dev', @@ -24,29 +25,30 @@ ] pattern = "combined*.xml" + total_combined_files = [] for static_dir in static_dirs: combined_files = glob.glob(f"static/{static_dir}/**/{pattern}") total_combined_files.extend(combined_files) logger.info(f"Total combined files: {(len(total_combined_files))}") - # get unique combined file paths + # Get unique combined file paths total_combined_files_dict = {os.path.splitext(os.path.basename(i))[0]: i for i in total_combined_files} - logger.info(f"Total unique combined files: {len(total_combined_files_dict)}") + # Loop through all locally stored combined summary XML files and insert/update them accordingly for file_name, full_path in tqdm.tqdm(total_combined_files_dict.items()): try: - # get data from xml file + # Get data from xml file xml_file_data = xml_reader.read_xml_file(full_path) if REMOVE_TIMESTAMPS_IN_AUTHORS: - # remove timestamps from author's names and collect unique names only + # Remove timestamps from author's names and collect unique names only xml_file_data['authors'] = remove_timestamps_from_author_names(xml_file_data['authors']) - # check if doc exist in ES index + # Check if doc exist in ES index doc_exists = elastic_search.es_client.exists(index=ES_INDEX, id=file_name) - # insert the doc in ES index if it does not exist, else update it + # Insert the doc in ES index if it does not exist, else update it if not doc_exists: res = elastic_search.es_client.index( index=ES_INDEX, diff --git a/push_summary_to_es.py b/push_summary_to_es.py index 930e46c0b..6ea9e0480 100644 --- a/push_summary_to_es.py +++ b/push_summary_to_es.py @@ -6,14 +6,15 @@ from src.elasticsearch_utils import ElasticSearchClient from src.xml_utils import XMLReader - if __name__ == "__main__": APPLY_DATE_RANGE = False + # Instantiating objects for reading XML and connecting to ElasticSearch xml_reader = XMLReader() elastic_search = ElasticSearchClient() + # URLs for development mailing lists and forums dev_urls = [ "https://lists.linuxfoundation.org/pipermail/bitcoin-dev/", "https://lists.linuxfoundation.org/pipermail/lightning-dev/", @@ -21,8 +22,9 @@ "https://gnusha.org/pi/bitcoindev/" ] + # Process each URL in the dev_urls list for dev_url in dev_urls: - + # Set the date range for data extraction if APPLY_DATE_RANGE: current_date_str = None if not 
current_date_str: @@ -35,21 +37,20 @@ start_date_str = None current_date_str = None + # Fetch doc with an empty summary field docs_list = elastic_search.fetch_data_with_empty_summary(ES_INDEX, dev_url, start_date_str, current_date_str) - if isinstance(dev_url, list): - dev_name = dev_url[0].split("/")[-2] - else: - dev_name = dev_url.split("/")[-2] - + dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2] logger.success(f"Total threads received with empty summary for '{dev_name}': {len(docs_list)}") + # Loop through all fetched docs and update them by adding the summary from xml files for doc in tqdm.tqdm(docs_list): res = None try: doc_id = doc['_id'] doc_index = doc['_index'] if not doc['_source'].get('summary'): + # Get summary text from locally stored XML files xml_summary = xml_reader.get_xml_summary(doc, dev_name) if xml_summary: elastic_search.es_client.update( diff --git a/src/config.py b/src/config.py index 9b2dba280..a07c31892 100644 --- a/src/config.py +++ b/src/config.py @@ -1,7 +1,7 @@ import os -import openai import warnings +import openai import tiktoken from dotenv import load_dotenv diff --git a/src/xml_utils.py b/src/xml_utils.py index cf4fe58fe..a74873d9e 100644 --- a/src/xml_utils.py +++ b/src/xml_utils.py @@ -1,23 +1,24 @@ -import re -import pandas as pd -from feedgen.feed import FeedGenerator -from tqdm import tqdm +import glob +import os import platform +import re import shutil +import traceback +import xml.etree.ElementTree as ET from datetime import datetime, timezone + +import pandas as pd import pytz -import glob -import xml.etree.ElementTree as ET -import os -import traceback +from feedgen.feed import FeedGenerator from loguru import logger +from tqdm import tqdm +from src.config import ES_INDEX +from src.elasticsearch_utils import ElasticSearchClient +from src.gpt_utils import create_summary from src.utils import preprocess_email, month_dict, get_id, clean_title, convert_to_tuple, create_folder, \ remove_multiple_whitespaces, add_utc_if_not_present -from src.gpt_utils import create_summary -from src.config import ES_INDEX -from src.elasticsearch_utils import ElasticSearchClient elastic_search = ElasticSearchClient() @@ -149,8 +150,6 @@ def append_columns(self, df_dict, file, title, namespace): # The title is directly provided as a parameter df_dict["title"].append(title) - # formatted_file_name = file.split("/static")[1] - # logger.info(formatted_file_name) # Parse the XML file to extract and append relevant data tree = ET.parse(file) @@ -174,51 +173,46 @@ def append_columns(self, df_dict, file, title, namespace): df_dict["authors"].append([author_result.strip()]) def file_not_present_df(self, columns, source_cols, df_dict, files_list, dict_data, data, - title, combined_filename, namespace): + title, namespace): """ Processes data directly from the given document (`data`) as no XML summary is available for that document. Also, for each individual summary (XML file) that already exists for the given thread, extracts and appends its content to the dictionary. 
""" - # Append basic data from dict_data for each column into df_dict + # Append basic data from dict_data for each column into df_dict using list comprehension for col in columns: df_dict[col].append(dict_data[data][col]) + # Processing source_cols with conditional append for col in source_cols: + value = dict_data[data]['_source'][col] if "created_at" in col: - datetime_obj = add_utc_if_not_present(dict_data[data]['_source'][col], iso_format=False) - df_dict[col].append(datetime_obj) - else: - df_dict[col].append(dict_data[data]['_source'][col]) + value = add_utc_if_not_present(value, iso_format=False) + df_dict[col].append(value) + + # Iterate over files with transformed file paths + files_list = [file.replace("\\", "/") for file in files_list] + + # Use dictionary to store parsed XML trees to avoid redundant parsing + parsed_files = {} # For each individual summary (XML file) that exists for the # given thread, extract and append their content to the dictionary - # TODO: - # This method is called for every post without a summary, which means that - # existing inidividual summaries for a thread are added n-1 times the amount - # of new posts in the thread at the time of execution of the cron job. - # this is not an issue because we then drop duplicates, but it's extra complexity. for file in files_list: - file = file.replace("\\", "/") if os.path.exists(file): - tree = ET.parse(file) - root = tree.getroot() + if file not in parsed_files: + tree = ET.parse(file) + root = tree.getroot() + parsed_files[file] = (tree, root) + + tree, root = parsed_files[file] file_title = root.find('atom:entry/atom:title', namespace).text if title == file_title: self.append_columns(df_dict, file, title, namespace) - if combined_filename in file: - # TODO: the code will never reach this point - # as we are already filtering per thread title so no - # "Combined summary - X" filename will pass though - tree = ET.parse(file) - root = tree.getroot() - summary = root.find('atom:entry/atom:summary', namespace).text - df_dict["body"].append(summary) - else: - summary = root.find('atom:entry/atom:summary', namespace).text - df_dict["body"].append(summary) + summary = root.find('atom:entry/atom:summary', namespace).text + df_dict["body"].append(summary) else: logger.info(f"file not present: {file}") @@ -230,39 +224,45 @@ def file_present_df(self, files_list, namespace, combined_filename, title, indiv summary exists, it extracts the content of individual summaries, appending it to the data dictionary. 
""" - combined_file_fullpath = None # the combined XML file if found # List to keep track of the month folders that contain # the XML files for the posts of the current thread - month_folders = [] + month_folders = set() + + # Cached listdir calls to avoid repeated disk access + folder_contents = {} + + # Identifying combined file and processing individual summaries in a single loop + combined_file_fullpath = None - # Iterate through the list of local XML file paths for file in files_list: - file = file.replace("\\", "/") + normalized_file = file.replace("\\", "/") # Check if the current file is the combined XML file for the thread - if combined_filename in file: - combined_file_fullpath = file + if combined_filename in normalized_file: + combined_file_fullpath = normalized_file # Parse the XML file to find the title and compare it with the current title # in order to understand if the post/file is part of the current thread - tree = ET.parse(file) + tree = ET.parse(normalized_file) root = tree.getroot() file_title = root.find('atom:entry/atom:title', namespace).text # If titles match, add the file to the list of relevant XMLs and track its month folder if title == file_title: - individual_summaries_xmls_list.append(file) - month_folder_path = "/".join(file.split("/")[:-1]) - if month_folder_path not in month_folders: - month_folders.append(month_folder_path) + individual_summaries_xmls_list.append(normalized_file) + month_folder_path = "/".join(normalized_file.split("/")[:-1]) + month_folders.add(month_folder_path) # Ensure the combined XML file is copied to all relevant month folders for month_folder in month_folders: - if combined_file_fullpath and combined_filename not in os.listdir(month_folder): - if combined_filename not in os.listdir(month_folder): - shutil.copy(combined_file_fullpath, month_folder) + if month_folder not in folder_contents: + folder_contents[month_folder] = os.listdir(month_folder) + + if combined_file_fullpath and combined_filename not in folder_contents[month_folder]: + shutil.copy(combined_file_fullpath, month_folder) # If individual summaries exist but no combined summary, # extract and append their content to the dictionary - if len(individual_summaries_xmls_list) > 0 and not any(combined_filename in item for item in files_list): - logger.info("individual summaries are present but not combined ones ...") + combined_exists = any(combined_filename in item for item in files_list) + if individual_summaries_xmls_list and not combined_exists: + logger.info("Individual summaries are present but not combined ones.") for file in individual_summaries_xmls_list: self.append_columns(df_dict, file, title, namespace) tree = ET.parse(file) @@ -283,12 +283,18 @@ def get_local_xml_file_paths(self, dev_url): files_list = glob.glob(os.path.join(current_directory, "static", directory, "**/*.xml"), recursive=True) return files_list + def get_local_xml_file_paths_for_title(self, dev_url, title): + """ + Retrieve paths for all relevant local XML files based on the given domain and title + """ + current_directory = os.getcwd() + directory = get_base_directory(dev_url) + files_list = glob.glob(os.path.join(current_directory, "static", directory, f"**/*{title}.xml"), recursive=True) + return files_list + def generate_new_emails_df(self, main_dict_data, dev_url): # Define XML namespace for parsing XML files namespaces = {'atom': 'http://www.w3.org/2005/Atom'} - - # Retrieve all existing XML files (summaries) for the given source - files_list = self.get_local_xml_file_paths(dev_url) # 
Initialize a dictionary to store data for DataFrame construction, with predefined columns columns = ['_index', '_id', '_score'] @@ -297,9 +303,9 @@ def generate_new_emails_df(self, main_dict_data, dev_url): df_dict = {col: [] for col in (columns + source_cols)} seen_titles = set() - # Process each document in the input data + # Process each document in the input data for idx in range(len(main_dict_data)): - xmls_list = [] # the existing XML files for the thread that the fetched document is part of + xmls_list = [] # the existing XML files for the thread that the fetched document is part of thread_title = main_dict_data[idx]["_source"]["title"] if thread_title in seen_titles: continue @@ -322,11 +328,14 @@ def generate_new_emails_df(self, main_dict_data, dev_url): combined_filename = f"combined_{xml_name}.xml" created_at = title_dict_data[data_idx]["_source"]["created_at"] + # Retrieve all existing XML files (summaries) for the given source and title + files_list = self.get_local_xml_file_paths_for_title(dev_url=dev_url, title=xml_name) + # Check if the XML file for the document exists if not any(file_name in item for item in files_list): logger.info(f"Not present: {created_at} | {file_name}") self.file_not_present_df(columns, source_cols, df_dict, files_list, title_dict_data, data_idx, - title, combined_filename, namespaces) + title, namespaces) else: logger.info(f"Present: {created_at} | {file_name}") self.file_present_df(files_list, namespaces, combined_filename, title, xmls_list, df_dict) diff --git a/static/homepage.json b/static/homepage.json index 0b6b71254..08124e715 100644 --- a/static/homepage.json +++ b/static/homepage.json @@ -1,5 +1,5 @@ { - "header_summary": "Andrew Poelstra's analysis of the BIP-342 upgrade highlights the introduction of CHECKSIG From Stack (CSFS) and enhancements in batch verification, alongside discussing the uniformity of public key sets between CSFS and CHECKSIG operations. Poelstra's contemplation on the future of softforks concerning public key divergence reveals a preference for maintaining consistency across functionalities, underlining the importance for future protocol developments. Further details can be found on [Andrew Poelstra's webpage](https://www.wpsoftware.net/andrew).\n\nBrandon Black's proposal introduces `OP_INTERNALKEY` to the tapscript framework, aiming to replace `OP_SUCCESS203` and improve Bitcoin scripting efficiency by facilitating direct access to the taproot internal key. This initiative is poised to streamline key spend transactions and is detailed within a [GitHub pull request](https://github.com/bitcoin/bitcoin/pull/29269), despite noting compatibility concerns with existing behaviors.\n\nEthan Heilman and Sipa address distinct technical proceedings within the Bitcoin community; Heilman clarifies the confusion surrounding the assignment of a BIP number to OP_CAT, led by Armin and Ali Sherief, and emphasizes the discrepancy between official and social media-driven communications. Simultaneously, Sipa explores transaction linearization improvements through the revised Double-LIMO approach, showcasing a commitment to enhancing processing efficiency in Bitcoin Core.\n\nDavid Harding and Adiabat delve into transaction efficiency advancements; Harding quantifies the benefits of payment batching and Cross-Input Signature Aggregation (CISA) through [Optech's calculator](https://bitcoinops.org/en/tools/calc-size/), highlighting modest yet valuable improvements for Bitcoin. 
On another innovative front, Adiabat introduces exploding keys as a method to increase transaction efficiency by eliminating the need for signatures through pre-committed public keys to specific outputs, signaling a forward-thinking approach to Bitcoin transaction mechanisms despite potential security implications.", + "header_summary": "Andrew Poelstra's analysis of the BIP-342 upgrade highlights the introduction of significant improvements to the Bitcoin protocol, such as treating unknown public keys as OP_SUCCESS and mandating empty vectors for invalid signatures. Poelstra also reflects on the need for uniformity in public key sets across different Bitcoin functionalities, acknowledging its importance for future developments despite not being an immediate concern within the CSFS proposal.\n\nBrandon Black discusses the potential of a new tapscript opcode, `OP_INTERNALKEY`, to enhance Bitcoin's scripting capabilities through a Bitcoin Improvement Proposal (BIP). This proposal aims to improve byte efficiency in transactions involving multiple parties and script-based restrictions, offering a backward-compatible integration through a soft fork. However, it also acknowledges potential post-deployment compatibility challenges.\n\nEthan Heilman addresses the confusion surrounding the assignment of a Bitcoin Improvement Proposal (BIP) number to OP_CAT, contrasting official procedures with speculative social media narratives. Meanwhile, sipa explores the Double-LIMO strategy's refinement in Bitcoin Core's block builder, presenting an optimized approach for transaction processing efficiency. David Harding and Adiabat further contribute to the discussion on enhancing transaction efficiency, with Harding examining payment batching and the speculative benefits of Cross-Input Signature Aggregation (CISA) for privacy protocols, and Adiabat introducing the concept of exploding keys to potentially revolutionize Bitcoin transactions by allowing for streamlined covenant constructions without traditional signature requirements.", "recent_posts": [ { "id": "mfd0d3cd1c351cf61e87288f3e752562f89a84132", @@ -9,7 +9,7 @@ "Andrew Poelstra" ], "published_at": "2024-04-25T11:44:00+00:00", - "summary": "- Andrew Poelstra discusses BIP-342 upgrade benefits, including new features for protocol enhancement.\n- He debates maintaining uniform public key sets for CSFS and CHECKSIG operations' future compatibility.\n- Poelstra prefers no divergence in public key types between the two operations, seeing no benefit.", + "summary": "- Andrew Poelstra discusses BIP-342's impact, praising its batch verification and CSFS features.\n- He questions the need for different public keys in CSFS and CHECKSIG, favoring uniformity.\n- Poelstra sees no benefit in diverging public key sets, stressing consistency for future softforks.", "n_threads": 1, "dev_name": "bitcoin-dev", "contributors": [ @@ -26,7 +26,7 @@ "Brandon Black" ], "published_at": "2024-04-25T05:22:00+00:00", - "summary": "- A new tapscript opcode, `OP_INTERNALKEY`, aims to enhance Bitcoin scripting under taproot.\n- It proposes a more efficient way to handle transactions by directly accessing the taproot internal key.\n- The implementation challenges and community discussions on its deployment are ongoing.", + "summary": "- The BIP introduces `OP_INTERNALKEY` for efficient Bitcoin scripting under taproot.\n- `OP_INTERNALKEY` aims to save 8 vBytes in transactions by enabling script-based restrictions.\n- Its implementation and deployment details are still under discussion 
by developers.", "n_threads": 0, "dev_name": "bitcoin-dev", "contributors": [], @@ -41,7 +41,7 @@ "Ethan Heilman" ], "published_at": "2024-04-22T21:51:00+00:00", - "summary": "- Ethan Heilman raised concerns about the OP_CAT BIP status.\n- Armin and Ali Sherief formally requested a BIP number for OP_CAT.\n- No BIP number has been assigned to OP_CAT, contrary to social media rumors.", + "summary": "- Ethan Heilman discusses the unclear status of the OP_CAT Bitcoin Improvement Proposal.\n- Armin and Ali Sherief formally requested a BIP number for OP_CAT, awaiting assignment.\n- Social media rumors of an assigned BIP number for OP_CAT are inaccurate, contrasting official updates.", "n_threads": 2, "dev_name": "bitcoin-dev", "contributors": [ @@ -58,7 +58,7 @@ "sipa" ], "published_at": "2024-04-25T22:37:19.185000+00:00", - "summary": "- Double-LIMO's efficacy in Bitcoin transaction linearization has faced scrutiny and limitations.\n- Insights show the goal is meeting practical needs rather than exceeding theoretical ideals in linearization.\n- A new Double (and Triple) LIMO approach shows promise in refining transaction linearization techniques.", + "summary": "- Double-LIMO's efficacy in Bitcoin Core's transaction linearization has been questioned.\n- A new understanding emphasizes meeting practical CPFP transaction requirements over theoretical ideals.\n- Simplified Double (and Triple) LIMO methods show promise for enhanced transaction processing efficiency.", "n_threads": 6, "dev_name": "delvingbitcoin", "contributors": [ @@ -76,7 +76,7 @@ "harding" ], "published_at": "2024-04-24T19:27:03.356000+00:00", - "summary": "- Payment batching reduces Bitcoin transaction sizes significantly, saving up to 48%.\n- With CISA, further reductions bring savings to 7.1% beyond just batching.\n- CISA could enhance privacy protocols like coinjoins, making them cheaper and more effective.", + "summary": "- Payment batching in Bitcoin reduces transaction sizes by up to 48%.\n- Cross-Input Signature Aggregation with batching can save an additional 7.1%.\n- CISA may lower costs for privacy protocols, enhancing Bitcoin's privacy features.", "n_threads": 1, "dev_name": "delvingbitcoin", "contributors": [ @@ -93,7 +93,7 @@ "adiabat" ], "published_at": "2024-04-24T03:12:24.329000+00:00", - "summary": "- Exploding keys enhance Bitcoin transactions by pre-committing to outputs, negating signatures.\n- They function through cryptographic tweaks and aggregation, ensuring security without witnesses.\n- This innovation opens avenues for sophisticated Bitcoin covenants and future research development.", + "summary": "- Exploding keys enhance Bitcoin transaction efficiency by pre-committing to output sets.\n- They utilize cryptographic techniques for security without requiring witness data.\n- This innovation paves the way for advanced Bitcoin covenants and future research.", "n_threads": 0, "dev_name": "delvingbitcoin", "contributors": [], @@ -110,7 +110,7 @@ "Ava Chow" ], "published_at": "2024-02-27T18:53:00+00:00", - "summary": "- Friction in Bitcoin Improvement Proposals process requires appointing new editors.\n- New BIP editors must agree on BIP numbering and have a proven Bitcoin development track record.\n- Kanzure and RubenSomsen are proposed as capable candidates for BIP editor roles.", + "summary": "- Friction in the BIP process necessitates additional editors due to management challenges.\n- New editors must agree on BIP numbering and have a strong Bitcoin development background.\n- Kanzure and RubenSomsen are 
proposed as capable candidates to enhance BIP management.", "n_threads": 94, "dev_name": "bitcoin-dev", "contributors": [ @@ -161,7 +161,7 @@ "Jameson Lopp" ], "published_at": "2024-03-31T13:19:00+00:00", - "summary": "- Testnet3 faces operational challenges, including a reduced block reward and a difficulty reset bug.\n- Misuse of testnet3 for scammy airdrops has created a marketplace for TBTC, against its principles.\n- Discussions are ongoing about a reset, fixing bugs, or potentially replacing testnet3 with signet.", + "summary": "- Testnet3 faces challenges with coin distribution and a bug affecting mining difficulty.\n- Misuse of Testnet3 for scams has led to an unintended marketplace for TBTC.\n- Todd suggests a reset, fixing the bug, or moving to a different testing environment.", "n_threads": 33, "dev_name": "bitcoin-dev", "contributors": [ @@ -188,6 +188,24 @@ "file_path": "static/bitcoin-dev/March_2024/mc0042b7121a2d8687d25a719fe0ed03188b7a3d2_The-Future-of-Bitcoin-Testnet.xml", "combined_summ_file_path": "static/bitcoin-dev/March_2024/combined_The-Future-of-Bitcoin-Testnet.xml" }, + { + "id": "m9eb5b0869377b3c1e2f29b8f65eafbfd354fea2b", + "title": "Great Consensus Cleanup Revival", + "link": "https://gnusha.org/pi/bitcoindev/gnM89sIQ7MhDgI62JciQEGy63DassEv7YZAMhj0IEuIo0EdnafykF6RH4OqjTTHIHsIoZvC2MnTUzJI7EfET4o-UQoD-XAQRDcct994VarE=@protonmail.com/T/#u#m9eb5b0869377b3c1e2f29b8f65eafbfd354fea2b", + "authors": [ + "Antoine Poinsot" + ], + "published_at": "2024-03-24T18:10:00+00:00", + "summary": "- Antoine Poinsot discussed the Great Consensus Cleanup for Bitcoin on DelvingBitcoin.org.\n- He suggested restricting legacy transaction sizes and fixing the timewarp bug for better security.\n- Poinsot seeks community feedback on his proposals to improve the network's consensus mechanism.", + "n_threads": 6, + "dev_name": "bitcoin-dev", + "contributors": [ + "Antoine Riard", + "Mark F" + ], + "file_path": "static/bitcoin-dev/March_2024/m9eb5b0869377b3c1e2f29b8f65eafbfd354fea2b_Great-Consensus-Cleanup-Revival.xml", + "combined_summ_file_path": "static/bitcoin-dev/March_2024/combined_Great-Consensus-Cleanup-Revival.xml" + }, { "id": "1583", "title": "Basic vault prototype using OP_CAT", @@ -196,7 +214,7 @@ "rijndael" ], "published_at": "2024-02-15T22:18:50.558000+00:00", - "summary": "- The demo on GitHub shows OP_CAT's use in blockchain transactions with scripts for consistency.\n- \"Trigger Withdrawal\" needs two specific inputs and outputs, ensuring transaction integrity with strict conditions.\n- \"Complete\" and \"Cancel Withdrawal\" transactions introduce intricate checks and simplified processes for security.", + "summary": "- The demo at GitHub showcases OP_CAT's application in blockchain transactions.\n- It includes mechanisms for transaction integrity and security across different processes.\n- Utilizes OP_CAT for state assertions and rule enforcement, indicating potential enhancements.", "n_threads": 17, "dev_name": "delvingbitcoin", "contributors": [ @@ -213,7 +231,7 @@ "instagibbs" ], "published_at": "2024-04-16T17:45:35.589000+00:00", - "summary": "- This proposal enhances compact block relay efficiency via weak blocks to improve mining fairness.\n- A Proof of Concept shows feasibility, urging further discussion on technical adjustments and parameters.\n- Exploration includes assessing miner interest and broader network implications, requiring community feedback.", + "summary": "- This proposal suggests utilizing compact block infrastructure to enable weak block 
transmission.\n- A developed Proof of Concept shows the potential for integrating weak block transactions into the mempool.\n- Future steps include seeking feedback and researching miner willingness to adopt this system for network efficiency.", "n_threads": 16, "dev_name": "delvingbitcoin", "contributors": [ @@ -234,7 +252,7 @@ "jungly" ], "published_at": "2024-03-29T16:50:26.252000+00:00", - "summary": "- The new DSL aims to simplify bitcoin contract processes with a comprehensive approach.\n- It offers a high-level syntax for transactions and automatic witness program management.\n- Documentation and examples are available, enhancing its utility for advanced bitcoin use cases.", + "summary": "- The DSL for bitcoin contracts simplifies transaction descriptions and node interactions.\n- It streamlines creating scripts and managing witness programs, enhancing contract flexibility.\n- Documentation provides practical examples for using the tool in regtest environments.", "n_threads": 14, "dev_name": "delvingbitcoin", "contributors": [ @@ -252,56 +270,41 @@ ], "today_in_history_posts": [ { - "id": "017801", - "title": "BIP-341: Committing to all scriptPubKeys in the signature message", - "link": "https://lists.linuxfoundation.org/pipermail/bitcoin-dev/2020-April/017801.html", + "id": "016894", + "title": "Adding xpub field to PSBT to make multisig more secure", + "link": "https://lists.linuxfoundation.org/pipermail/bitcoin-dev/2019-April/016894.html", "authors": [ - "Andrew Kozlik" + "Stepan Snigirev" ], - "published_at": "2020-04-29T14:57:46+00:00", - "summary": "- Andrew Kozlik suggests signature messages should include all transaction inputs' scriptPubKeys.\n- This change aids in preventing theft in applications like CoinJoin by ensuring accurate spend amounts.\n- Proposes adding sha_scriptPubKeys hash to the signature message for better input ownership verification.", - "n_threads": 12, + "published_at": "2019-04-26T15:21:06+00:00", + "summary": "- Stepan Snigirev highlights a flaw in bip174 PSBT that could lead to fund theft in M of N setups.\n- He suggests adding xpub fields to metadata for verifying public keys used in transactions.\n- Proposes new key-value pairs for PSBT to enhance security and suggests reviewing other wallets' protocols.", + "n_threads": 6, "dev_name": "bitcoin-dev", "contributors": [ - "Russell O'Connor", - "Jeremy", - "Anthony Towns", - "David A. Harding", - "Greg Sanders", - "Jonas Nick", - "Pieter Wuille" + "Dmitry Petukhov", + "Ava Chow", + "Peter D. 
Gray", + "jan matejek" ], - "file_path": "static/bitcoin-dev/April_2020/017801_BIP-341-Committing-to-all-scriptPubKeys-in-the-signature-message.xml", - "combined_summ_file_path": "static/bitcoin-dev/April_2020/combined_BIP-341-Committing-to-all-scriptPubKeys-in-the-signature-message.xml" + "file_path": "static/bitcoin-dev/April_2019/016894_Adding-xpub-field-to-PSBT-to-make-multisig-more-secure.xml", + "combined_summ_file_path": "static/bitcoin-dev/April_2019/combined_Adding-xpub-field-to-PSBT-to-make-multisig-more-secure.xml" }, { - "id": "002678", - "title": "On the scalability issues of onboarding millions of LN mobile clients", - "link": "https://lists.linuxfoundation.org/pipermail/lightning-dev/2020-May/002678.html", + "id": "001986", + "title": "Improving Payment Latency by Fast Forwards", + "link": "https://lists.linuxfoundation.org/pipermail/lightning-dev/2019-April/001986.html", "authors": [ - "Antoine Riard" + "ZmnSCPxj" ], - "published_at": "2020-05-05T10:17:37+00:00", - "summary": "- BIP 157's development enhances Bitcoin light client protocols, impacting future mobile client support.\n- LN's adoption could change Bitcoin's security reliance from full-nodes to more user-friendly models.\n- Introducing monetary incentives for light client services may align with the watchtower concept for better security.", - "n_threads": 32, + "published_at": "2019-04-24T08:32:26+00:00", + "summary": "- The Lightning Network experiences payment latency due to 1.5 round trips required for safe forwarding.\n- Fast forwards could reduce latency but introduce higher risks and require higher off-chain fees.\n- Decker-Russell-Osuntokun suggests eliminating fast forwards through proper link-level protocol design.", + "n_threads": 12, "dev_name": "lightning-dev", "contributors": [ - "Keagan McClelland", - "ZmnSCPxj", - "Braydon Fuller", - "Christopher Allen", - "Richard Myers", - "Andr\u00e9s G. 
Aragoneses", - "Chris Belcher", - "Igor Cota", - "John Newbery", - "Lloyd Fournier", - "Luke Dashjr", - "Olaoluwa Osuntokun", - "William Casarin" + "Lloyd Fournier" ], - "file_path": "static/lightning-dev/May_2020/002678_On-the-scalability-issues-of-onboarding-millions-of-LN-mobile-clients.xml", - "combined_summ_file_path": "static/lightning-dev/May_2020/combined_On-the-scalability-issues-of-onboarding-millions-of-LN-mobile-clients.xml" + "file_path": "static/lightning-dev/April_2019/001986_Improving-Payment-Latency-by-Fast-Forwards.xml", + "combined_summ_file_path": "static/lightning-dev/April_2019/combined_Improving-Payment-Latency-by-Fast-Forwards.xml" }, { "id": "62", @@ -311,7 +314,7 @@ "jamesob" ], "published_at": "2023-08-16T15:22:13.243000+00:00", - "summary": "- Bitcoin scalability aims for 50,000 off-chain \"bitcoin banks\" to support 1 billion weekly users.\n- Off-chain solutions include federated sidechains and coinpools, avoiding larger block sizes to prevent centralization.\n- Future development focuses on enhancing security with tools like `OP_VAULT` and ensuring regulatory compliance through networking.", + "summary": "- Bitcoin's scalability involves establishing 50,000 off-chain \"bitcoin banks\" for user transactions.\n- Proposed solutions include federated sidechains and time-sensitive smart contracts without third-party reliance.\n- A focus on Layer 2 protocols and secure, scalable infrastructure is crucial for regulatory compliance.", "n_threads": 5, "dev_name": "delvingbitcoin", "contributors": [ diff --git a/xmls_generator_production.py b/xmls_generator_production.py index e87792a38..ccc466791 100644 --- a/xmls_generator_production.py +++ b/xmls_generator_production.py @@ -1,9 +1,11 @@ +import sys import time +import warnings from datetime import datetime, timedelta -import sys + from loguru import logger -import warnings from openai.error import APIError, PermissionError, AuthenticationError, InvalidAPIType, ServiceUnavailableError + from src.config import ES_INDEX from src.elasticsearch_utils import ElasticSearchClient from src.xml_utils import GenerateXML @@ -11,8 +13,12 @@ warnings.filterwarnings("ignore") if __name__ == "__main__": + + # Instantiating objects for generating JSON, XML and connecting to ElasticSearch gen = GenerateXML() elastic_search = ElasticSearchClient() + + # URLs of mailing lists and forums dev_urls = [ "https://lists.linuxfoundation.org/pipermail/bitcoin-dev/", "https://lists.linuxfoundation.org/pipermail/lightning-dev/", @@ -20,15 +26,17 @@ "https://gnusha.org/pi/bitcoindev/" ] + # Set the date range for data extraction end_date = datetime.now() start_date = end_date - timedelta(days=30) - # yyyy-mm-dd end_date_str = end_date.strftime("%Y-%m-%d") start_date_str = start_date.strftime("%Y-%m-%d") + logger.info(f"start_data: {start_date_str}") logger.info(f"end_date_str: {end_date_str}") + # Process each URL in the dev_urls list for dev_url in dev_urls: data_list = elastic_search.extract_data_from_es( ES_INDEX, dev_url, start_date_str, end_date_str, exclude_combined_summary_docs=True