Enhanced code and added comments
urvishp80 committed Apr 26, 2024
1 parent e5445b7 commit 87be4b8
Showing 9 changed files with 541 additions and 506 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/homepage_json_gen_cron_job.yml
@@ -37,7 +37,7 @@ jobs:
pip install -r requirements.txt
- name: Execute Python script
-run: python generate_homepage_xml.py
+run: python generate_homepage_json.py

- name: Configure Git
run: |
2 changes: 1 addition & 1 deletion README.md
@@ -16,7 +16,7 @@ Utilizing data collected by the [scraper](https://github.com/bitcoinsearch/scraper),
- Queries Elasticsearch for documents lacking summaries, extracts summaries from corresponding XML files, and then updates these documents with their summaries in the Elasticsearch index.
3. Daily [Push Combined Summary From XML Files to ES INDEX](.github/workflows/push_combined_summary_to_es_cron_job.yml) ([source](push_combined_summary_to_es.py))
- Processes each combined thread summary XML file, transforming it into a document format, checks for its existence in Elasticsearch, and updates or inserts the document as needed.
-4. Daily [Python Homepage Update Script](.github/workflows/homepage_json_gen_cron_job.yml) ([source](generate_homepage_xml.py))
+4. Daily [Python Homepage Update Script](.github/workflows/homepage_json_gen_cron_job.yml) ([source](generate_homepage_json.py))
- Queries the last 7 days of data from Elasticsearch for each source to compile lists of active threads, recent threads, and historical threads for 'Today in History'. It generates a summary of recent threads if available; otherwise, for active threads. The resulting [`homepage.json`](static/homepage.json) is then committed to GitHub to be used by [Bitcoin TLDR](https://github.com/bitcoinsearch/tldr).
5. Weekly [Python Newsletter Generation Script](.github/workflows/weekly_newsletter_gen_cron_job.yml) ([source](generate_weekly_newsletter_json.py))
- Generates a newsletter by compiling lists of new and active threads from the past week's data for each source. It generates a summary of new threads if available; otherwise, for active threads. The resulting [`newsletter.json`](static/newsletters/newsletter.json) is then committed to GitHub to be used by [Bitcoin TLDR](https://github.com/bitcoinsearch/tldr).
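
All of these jobs share one pattern: query Elasticsearch for a window of recent documents per source, then post-process the hits into JSON or XML. A minimal sketch of that first step is below; the index name, field names, and client setup are assumptions for illustration, since the repository wraps them in its own ElasticSearchClient class.

```python
# A rough sketch of the "recent documents per source" query these jobs run.
# Index name, field names, and client setup are illustrative assumptions.
from datetime import datetime, timedelta

from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://localhost:9200")  # assumed local instance

end = datetime.now().strftime("%Y-%m-%d")
start = (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d")

query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"domain.keyword": "https://delvingbitcoin.org/"}},
                {"range": {"created_at": {"gte": start, "lte": end}}},
            ]
        }
    }
}
response = es_client.search(index="summaries", body=query, size=100)
print(f"{response['hits']['total']['value']} documents in the last 7 days")
```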
764 changes: 392 additions & 372 deletions generate_homepage_xml.py → generate_homepage_json.py

Large diffs are not rendered by default.

103 changes: 49 additions & 54 deletions generate_weekly_newsletter_json.py
@@ -1,92 +1,89 @@
+import json
+import os
+import sys
import time
import traceback
from datetime import datetime, timedelta

from loguru import logger
-import os
-import sys
-import json
from tqdm import tqdm

from src.config import ES_INDEX
from src.elasticsearch_utils import ElasticSearchClient
from src.json_utils import GenerateJSON
-from src.xml_utils import GenerateXML
from src.utils import month_dict
+from src.xml_utils import GenerateXML

if __name__ == "__main__":

-gen = GenerateJSON()
+# Instantiating objects for generating JSON, XML and connecting to ElasticSearch
+json_gen = GenerateJSON()
xml_gen = GenerateXML()
elastic_search = ElasticSearchClient()

+# URLs for development mailing lists and forums
dev_urls = [
["https://lists.linuxfoundation.org/pipermail/bitcoin-dev/",
"https://gnusha.org/pi/bitcoindev/"],
"https://lists.linuxfoundation.org/pipermail/lightning-dev/",
"https://delvingbitcoin.org/"
]

+# Set the date range for data extraction: last week to yesterday.
current_date = datetime.now()
-current_date_str = current_date.strftime("%Y-%m-%d")

start_date = current_date - timedelta(days=7)
-start_date_str = start_date.strftime("%Y-%m-%d")

end_date = current_date - timedelta(days=1)

+current_date_str = current_date.strftime("%Y-%m-%d")
+start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")

logger.info(f"Newsletter publish date: {current_date_str}")
logger.info(f"Gathering data for newsletter from {start_date_str} to {end_date_str}")

+# Convert month from number to name for filename construction
month_name = month_dict[int(current_date.month)]
str_month_year = f"{month_name}_{int(current_date.year)}"

active_data_list = []
new_threads_list = []

+# Process each URL in the dev_urls list
for dev_url in dev_urls:

data_list = elastic_search.extract_data_from_es(
ES_INDEX, dev_url, start_date_str, end_date_str, exclude_combined_summary_docs=True
)

-if isinstance(dev_url, list):
-dev_name = dev_url[0].split("/")[-2]
-else:
-dev_name = dev_url.split("/")[-2]

-logger.success(f"TOTAL THREADS RECEIVED FOR '{dev_name}': {len(data_list)}")
+dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2]
+logger.success(f"Retrieved {len(data_list)} threads for {dev_name}")

# NEW THREADS POSTS
-# @TODO you already identify the original post by type==original_post
-# so you could get the posts in order by date and check if the original posts is there
seen_titles = set()
for i in data_list:
this_title = i['_source']['title']
if this_title in seen_titles:
continue
seen_titles.add(this_title)

-# check if the first post for this title is in the past week
-original_post = elastic_search.get_earliest_posts_by_title(es_index=ES_INDEX, url=dev_url, title=this_title)

-if original_post['_source'] and i['_source']['created_at'] == original_post['_source']['created_at']:
-logger.success(f"new thread created on: {original_post['_source']['created_at']} || TITLE: {this_title}")
+# Check if any new thread started in given week
+if i['_source']['type'] == 'original_post':
+logger.success(f"New thread created on: {i['_source']['created_at']} || TITLE: {this_title}")

counts, contributors = elastic_search.es_fetch_contributors_and_threads(
es_index=ES_INDEX, title=this_title, domain=dev_url
)

+# Separate an original author and contributors
for author in i['_source']['authors']:
contributors.remove(author)
i['_source']['n_threads'] = counts
i['_source']['contributors'] = contributors
i['_source']['dev_name'] = dev_name
new_threads_list.append(i)
logger.info(f"number of new threads started this week: {len(new_threads_list)}")
logger.info(f"No. of new threads started this week: {len(new_threads_list)}")

# TOP ACTIVE POSTS
active_posts_data = elastic_search.filter_top_active_posts(es_results=data_list, top_n=15)
logger.info(f"number of filtered top active post: {len(active_posts_data)}")
logger.info(f"No. of filtered top active post: {len(active_posts_data)}")

new_threads_titles_list = [i['_source']['title'] for i in new_threads_list]

@@ -103,14 +100,15 @@
seen_titles.add(title)
active_data_list.append(data)
# active_posts_data_counter += 1
logger.info(f"number of active posts collected: {len(active_data_list)}")
logger.info(f"No. of active posts collected: {len(active_data_list)}")

-# gather titles of docs from json file
+# Determine if there's any update in the data compared to stored JSON
+# Gather titles from stored JSON file
json_file_path = fr"static/newsletters/newsletter.json"

current_directory = os.getcwd()
json_full_path = os.path.join(current_directory, json_file_path)
-json_xml_ids = set()
+stored_json_titles = set()
if os.path.exists(json_full_path):
with open(json_full_path, 'r') as j:
try:
@@ -119,22 +117,22 @@
logger.info(f"Error reading json file:{json_full_path} :: {e}")
json_data = {}

-json_xml_ids = set(
+stored_json_titles = set(
[item['title'] for item in json_data.get('new_threads_this_week', [])] +
[item['title'] for item in json_data.get('active_posts_this_week', [])]
)
else:
logger.warning(f"No existing newsletter.json file found: {json_full_path}")

-# gather ids of docs from active posts and new thread posts
-filtered_docs_ids = set(
+# Gather titles from collected Active data and New Threads list
+collected_json_titles = set(
[data['_source']['title'] for data in active_data_list] +
[data['_source']['title'] for data in new_threads_list]
)

-# check if there are any updates in the xml file
-if filtered_docs_ids != json_xml_ids:
-logger.info("changes found in recent posts ... ")
+# Generate a new newsletter.json file if changes found in stored JSON file
+if collected_json_titles != stored_json_titles:
+logger.info("Changes found as compared to previously stored JSON file... ")

delay = 5
count = 0
@@ -144,23 +142,21 @@
logger.success(f"Total no. of active posts collected: {len(active_data_list)}")
logger.success(f"Total no. of new threads started this week: {len(new_threads_list)}")

logger.info("creating newsletter.json file ... ")
logger.info("Creating newsletter.json file ... ")
if len(active_data_list) > 0 or len(new_threads_list) > 0:

+# Prepare New Threads data for newsletter
new_threads_page_data = []
-active_page_data = []
new_threads_summary = ""

if new_threads_list:
-new_threads_summary += gen.generate_recent_posts_summary(new_threads_list, verbose=True)
+new_threads_summary += json_gen.generate_recent_posts_summary(new_threads_list, verbose=True)
logger.success(new_threads_summary)

for data in tqdm(new_threads_list):
try:
-# check and generate any missing file
+# Generate all XML files for given title, if not present
xml_gen.start(dict_data=[data], url=data['_source']['domain'])

-entry_data = gen.create_single_entry(
+entry_data = json_gen.create_single_entry(
data,
base_url_for_xml="https://tldr.bitcoinsearch.xyz/summary",
look_for_combined_summary=True,
@@ -173,16 +169,17 @@
else:
logger.warning(f"No new threads started this week, generating summary of active posts this "
f"week ...")
-# if no new threads started this week, generate summary from active post this week
-new_threads_summary += gen.generate_recent_posts_summary(active_data_list)
+# If no new threads started this week, generate summary from active posts of the given week
+new_threads_summary += json_gen.generate_recent_posts_summary(active_data_list)
logger.success(new_threads_summary)

+# Prepare active posts data for newsletter
+active_page_data = []
for data in tqdm(active_data_list):
try:
-# check and generate any missing file
+# Generate all XML files for given title, if not present
xml_gen.start(dict_data=[data], url=data['_source']['domain'])

-entry_data = gen.create_single_entry(
+entry_data = json_gen.create_single_entry(
data, base_url_for_xml="https://tldr.bitcoinsearch.xyz/summary",
look_for_combined_summary=True, remove_xml_extension=True
)
@@ -191,19 +188,17 @@
logger.error(
f"Error occurred for doc id: {data['_source']['id']}\n{ex} \n{traceback.format_exc()}")

+# Compile and save data for newsletter file
json_string = {
"summary_of_threads_started_this_week": new_threads_summary,
"new_threads_this_week": new_threads_page_data,
"active_posts_this_week": active_page_data
}
-gen.write_json_file(json_string, json_file_path)

+json_gen.write_json_file(json_string, json_file_path)
archive_json_file_path = fr"static/newsletters/{str_month_year}/{current_date_str}-newsletter.json"
-gen.store_file_in_archive(json_file_path, archive_json_file_path)

+json_gen.store_file_in_archive(json_file_path, archive_json_file_path)
else:
logger.error(f"Data list empty! Please check the data again.")

break
except Exception as ex:
logger.error(f"Error occurred: {ex} \n{traceback.format_exc()}")
@@ -212,8 +207,8 @@
if count > 1:
sys.exit(f"{ex}")
else:
+# If no changes found in stored JSON file, save the previous one with updated name in the archive directory
logger.success("No change in the posts, no need to update newsletter.json file")
-# save the previous one with updated name in archive
if os.path.exists(json_full_path):
archive_json_file_path = fr"static/newsletters/{str_month_year}/{current_date_str}-newsletter.json"
-gen.store_file_in_archive(json_file_path, archive_json_file_path)
+json_gen.store_file_in_archive(json_file_path, archive_json_file_path)
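
Beyond the gen to json_gen rename, the substantive change in this file is new-thread detection: the per-title get_earliest_posts_by_title round trip is dropped in favor of the indexed type field, as the removed @TODO suggested. A minimal sketch of the new check, using hypothetical hit dicts:

```python
# Hypothetical ES hits; only the fields the check uses are shown.
hits = [
    {"_source": {"title": "Example thread", "type": "original_post",
                 "created_at": "2024-04-22"}},
    {"_source": {"title": "Example thread", "type": "reply",
                 "created_at": "2024-04-23"}},
]

# Old approach (per the removed lines): an extra query per title to find the
# earliest post, then a timestamp comparison against the current hit.
# New approach: the index already tags thread openers, so a local check suffices.
seen_titles = set()
new_threads = []
for hit in hits:
    title = hit["_source"]["title"]
    if title in seen_titles:
        continue
    seen_titles.add(title)
    if hit["_source"]["type"] == "original_post":
        new_threads.append(hit)

print(len(new_threads))  # 1
```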
16 changes: 9 additions & 7 deletions push_combined_summary_to_es.py
@@ -13,40 +13,42 @@

REMOVE_TIMESTAMPS_IN_AUTHORS = True

+# Instantiating objects for reading XML and connecting to ElasticSearch
xml_reader = XMLReader()
elastic_search = ElasticSearchClient()

-total_combined_files = []
+# Static directory names to look into for respective combined summary xml files
static_dirs = [
'bitcoin-dev',
'lightning-dev',
'delvingbitcoin'
]
pattern = "combined*.xml"

+total_combined_files = []
for static_dir in static_dirs:
combined_files = glob.glob(f"static/{static_dir}/**/{pattern}")
total_combined_files.extend(combined_files)
logger.info(f"Total combined files: {(len(total_combined_files))}")

-# get unique combined file paths
+# Get unique combined file paths
total_combined_files_dict = {os.path.splitext(os.path.basename(i))[0]: i for i in total_combined_files}

logger.info(f"Total unique combined files: {len(total_combined_files_dict)}")

+# Loop through all locally stored combined summary XML files and insert/update them accordingly
for file_name, full_path in tqdm.tqdm(total_combined_files_dict.items()):
try:
-# get data from xml file
+# Get data from xml file
xml_file_data = xml_reader.read_xml_file(full_path)

if REMOVE_TIMESTAMPS_IN_AUTHORS:
-# remove timestamps from author's names and collect unique names only
+# Remove timestamps from author's names and collect unique names only
xml_file_data['authors'] = remove_timestamps_from_author_names(xml_file_data['authors'])

-# check if doc exist in ES index
+# Check if doc exist in ES index
doc_exists = elastic_search.es_client.exists(index=ES_INDEX, id=file_name)

-# insert the doc in ES index if it does not exist, else update it
+# Insert the doc in ES index if it does not exist, else update it
if not doc_exists:
res = elastic_search.es_client.index(
index=ES_INDEX,
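
The truncated loop above uses the usual exists/index/update pattern against Elasticsearch. A self-contained sketch, with the index name, document id, and document shape assumed for illustration:

```python
# Sketch of the insert-or-update pattern; names and shapes are assumptions.
from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://localhost:9200")  # assumed local instance

doc_id = "combined_example-thread"  # the script derives this from the XML file name
doc_body = {"title": "Example thread", "summary": "Combined summary text."}

if not es_client.exists(index="summaries", id=doc_id):
    # Not indexed yet: create the document, keyed by the file name
    res = es_client.index(index="summaries", id=doc_id, body=doc_body)
else:
    # Already indexed: merge the supplied fields into the stored document
    res = es_client.update(index="summaries", id=doc_id, body={"doc": doc_body})
print(res["result"])  # "created", "updated", or "noop"
```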
15 changes: 8 additions & 7 deletions push_summary_to_es.py
@@ -6,23 +6,25 @@
from src.elasticsearch_utils import ElasticSearchClient
from src.xml_utils import XMLReader


if __name__ == "__main__":

APPLY_DATE_RANGE = False

+# Instantiating objects for reading XML and connecting to ElasticSearch
xml_reader = XMLReader()
elastic_search = ElasticSearchClient()

+# URLs for development mailing lists and forums
dev_urls = [
"https://lists.linuxfoundation.org/pipermail/bitcoin-dev/",
"https://lists.linuxfoundation.org/pipermail/lightning-dev/",
"https://delvingbitcoin.org/",
"https://gnusha.org/pi/bitcoindev/"
]

+# Process each URL in the dev_urls list
for dev_url in dev_urls:

+# Set the date range for data extraction
if APPLY_DATE_RANGE:
current_date_str = None
if not current_date_str:
@@ -35,21 +37,20 @@
start_date_str = None
current_date_str = None

+# Fetch doc with an empty summary field
docs_list = elastic_search.fetch_data_with_empty_summary(ES_INDEX, dev_url, start_date_str, current_date_str)

-if isinstance(dev_url, list):
-dev_name = dev_url[0].split("/")[-2]
-else:
-dev_name = dev_url.split("/")[-2]

+dev_name = dev_url[0].split("/")[-2] if isinstance(dev_url, list) else dev_url.split("/")[-2]
logger.success(f"Total threads received with empty summary for '{dev_name}': {len(docs_list)}")

+# Loop through all fetched docs and update them by adding the summary from xml files
for doc in tqdm.tqdm(docs_list):
res = None
try:
doc_id = doc['_id']
doc_index = doc['_index']
if not doc['_source'].get('summary'):
+# Get summary text from locally stored XML files
xml_summary = xml_reader.get_xml_summary(doc, dev_name)
if xml_summary:
elastic_search.es_client.update(
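
"Docs with an empty summary field" is the kind of query typically expressed with a must_not/exists clause, after which each hit is updated in place. A sketch under assumed index and field names; in the script itself the summary text comes from locally stored XML files via XMLReader:

```python
# Sketch of "fetch docs lacking a summary, then fill it in"; the index,
# field names, and summary text are assumptions for illustration.
from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://localhost:9200")  # assumed local instance

query = {
    "query": {
        "bool": {
            "must": [{"term": {"domain.keyword": "https://delvingbitcoin.org/"}}],
            "must_not": [{"exists": {"field": "summary"}}],
        }
    }
}
hits = es_client.search(index="summaries", body=query, size=50)["hits"]["hits"]

for doc in hits:
    # In the real script the summary comes from the matching XML file
    es_client.update(index="summaries", id=doc["_id"],
                     body={"doc": {"summary": "summary text from XML"}})
```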
2 changes: 1 addition & 1 deletion src/config.py
@@ -1,7 +1,7 @@
import os
-import openai
import warnings

+import openai
import tiktoken
from dotenv import load_dotenv
