# Main_functions_top_10_article_summaries.py
import requests
from bs4 import BeautifulSoup
import streamlit as st
import pandas as pd
import sqlite3
from apscheduler.schedulers.background import BackgroundScheduler
import datetime
import os
import re # Import regex
from random import randint
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')  # Download the Punkt sentence tokenizer (no-op if already present)
nltk.download('stopwords')  # Download the stopwords corpus (no-op if already present)
# Constants
URL = "https://www.mediapool.bg/"
DATABASE_NAME = 'articles.db'
FOLDER_NAME = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
os.makedirs(FOLDER_NAME, exist_ok=True)
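# Example: at import time FOLDER_NAME might be '2024-05-01_12-30-00' (illustrative
# value); each run of the script creates one such directory for its article files.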
# Sanitize title to create safe file names
def sanitize_title(title):
    # Remove characters that are invalid in file names
    safe_title = re.sub(r'[\\/*?:"<>|]', '', title)
    return safe_title[:50]  # Limit the length of the file name
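# A quick illustration with a hypothetical input:
#   sanitize_title('News: "Who/What?" <Update>')  ->  'News WhoWhat Update'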
# Setup Database
def setup_database():
    with sqlite3.connect(DATABASE_NAME) as conn:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS articles (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                link TEXT NOT NULL,
                summary TEXT,
                views INTEGER,
                retrieval_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
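# A minimal sanity check for the schema (a sketch, run e.g. from a REPL):
#   with sqlite3.connect(DATABASE_NAME) as conn:
#       print(conn.execute("SELECT COUNT(*) FROM articles").fetchone()[0])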
# Fetch and Summarize Articles
def fetch_news(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    articles = []
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        news_items = soup.find_all('article', limit=10)  # Limit to the first 10 articles
        # One pass per item: collect the result, insert it into SQLite and
        # write one text file per article
        with sqlite3.connect(DATABASE_NAME) as conn:
            for item in news_items:
                a_tag = item.find('a')
                if not a_tag:
                    continue
                title = a_tag.get_text(strip=True)
                link = a_tag.get('href')
                summary = "Накратко... " + title  # "In short..."; placeholder for actual summarization
                views = randint(1, 100)  # Placeholder view count
                print(f"Views for {title}: {views}")  # Debug output
                sanitized_title = sanitize_title(title)
                articles.append({"title": title, "link": link, "summary": summary})
                # Save to SQLite DB, including the placeholder view count
                conn.execute("INSERT INTO articles (title, link, summary, views) VALUES (?, ?, ?, ?)",
                             (title, link, summary, views))
                # Save to a per-article text file
                with open(f"{FOLDER_NAME}/{sanitized_title}.txt", 'w', encoding='utf-8') as file:
                    file.write(f"Title: {title}\nLink: {link}\nSummary: {summary}\n")
    except requests.RequestException as e:
        st.error(f"Failed to retrieve articles: {str(e)}")
    return articles
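# fetch_news returns a list of dicts shaped like this (illustrative values):
#   [{"title": "...", "link": "https://www.mediapool.bg/...", "summary": "Накратко... ..."}]
# The selector above assumes mediapool.bg wraps each story in an <article> tag
# whose first <a> holds the headline; if the site's markup changes, the scrape
# will silently come back empty.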
# Summarize with NLTK, as an alternative to Gensim, which raised an issue with SciPy
def summarize(text):
    # NB: NLTK ships no Bulgarian stopword list, so the English one is used;
    # for Bulgarian text the stopword filter is effectively a no-op.
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text)
    # Build a frequency table of non-stopword tokens
    freqTable = dict()
    for word in words:
        word = word.lower()
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1
    # Score each sentence by the summed frequencies of the words it contains
    sentences = sent_tokenize(text)
    sentenceValue = dict()
    for sentence in sentences:
        for word, freq in freqTable.items():
            if word in sentence.lower():
                if sentence in sentenceValue:
                    sentenceValue[sentence] += freq
                else:
                    sentenceValue[sentence] = freq
    if not sentenceValue:
        return text  # Nothing scored (e.g. empty input); avoid division by zero
    sumValues = 0
    for sentence in sentenceValue:
        sumValues += sentenceValue[sentence]
    average = int(sumValues / len(sentenceValue))
    # Keep only sentences scoring well above the average
    summary = ''
    for sentence in sentences:
        if (sentence in sentenceValue) and (sentenceValue[sentence] > (1.2 * average)):
            summary += " " + sentence
    return summary
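# Illustrative call (exact output depends on tokenization and the 1.2x threshold;
# 'long_article_text' is a hypothetical variable):
#   summarize(long_article_text)
# Sentences whose non-stopword tokens recur throughout the text score highest;
# only those scoring more than 1.2x the average sentence score are kept.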
# Display Articles Function
def display_articles(articles):
    if not articles:  # Check if the list is empty
        st.write("Не са намерени статии.")  # "No articles were found."
    else:
        # Convert articles to a DataFrame
        articles = pd.DataFrame(articles)
        if articles.empty:  # Check if the DataFrame is empty
            st.write("The DataFrame is empty.")
        else:
            articles.index = range(1, len(articles) + 1)  # Start index from 1
            print(articles)  # Debug print of the DataFrame to the console
            for _, row in articles.iterrows():
                with st.expander(f"{row['title']}"):
                    # Generate a summary of the article
                    summary = summarize(row['summary'])
                    st.write(f"Накратко: {summary}")  # "In short: ..."
                    st.write(f"Прочетете повече: [link]({row['link']})")  # "Read more: ..."
# Retrieve and Display Function for APScheduler
def retrieve_and_display_articles():
    articles = fetch_news(URL)
    if articles:  # Check if the list is not empty
        display_articles(articles)
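# NB: Streamlit only renders st.* calls made from its own script run. When the
# APScheduler worker thread invokes this function, the display_articles output
# is not shown in the browser (Streamlit warns about a missing ScriptRunContext);
# the scheduled run still refreshes the database and the per-article text files.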
# Streamlit App Layout
def main():
    st.title("Scheduled News Summary from MediaPool")
    setup_database()
    # Scheduling fetches; keep one scheduler per session so Streamlit reruns
    # don't start a new BackgroundScheduler on every interaction
    if "scheduler" not in st.session_state:
        scheduler = BackgroundScheduler()
        scheduler.add_job(retrieve_and_display_articles, 'interval', hours=12,
                          next_run_time=datetime.datetime.now())
        scheduler.start()
        st.session_state["scheduler"] = scheduler
    if st.button("Fetch News Now"):
        retrieve_and_display_articles()


if __name__ == "__main__":
    main()
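# Launch the app with:
#   streamlit run Main_functions_top_10_article_summaries.py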