-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMain-sidebar_functions_work_try_NLTK_to_summarize.py
113 lines (102 loc) · 4.54 KB
/
Main-sidebar_functions_work_try_NLTK_to_summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Standard library
import datetime
import os
import re  # regex for filename sanitization
import sqlite3
from random import randint
from urllib.parse import urljoin

# Third-party
import nltk
import pandas as pd
import requests
import streamlit as st
from apscheduler.schedulers.background import BackgroundScheduler
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt') # Download the Punkt sentence tokenizer
nltk.download('stopwords') # Download the stopwords
# Custom Bulgarian stopwords list
bulgarian_stopwords = set([
"и", "в", "на", "с", "за", "не"
])
# Constants
BASE_URL = "https://www.mediapool.bg/"
DATABASE_NAME = 'articles.db'
FOLDER_NAME = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
os.makedirs(FOLDER_NAME, exist_ok=True)
def sanitize_title(title):
    """Strip characters that are unsafe in filenames and cap the result at 50 chars."""
    cleaned = re.sub(r'[\\/*?:"<>|]', '', title)
    return cleaned[:50]
def setup_database():
    """Ensure the `articles` table exists in the SQLite database."""
    ddl = '''
        CREATE TABLE IF NOT EXISTS articles (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            link TEXT NOT NULL,
            summary TEXT,
            views INTEGER DEFAULT 0,
            retrieval_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    '''
    with sqlite3.connect(DATABASE_NAME) as conn:
        conn.execute(ddl)
def fetch_news(url):
    """Scrape up to 10 articles from *url*, summarize each, and store them in SQLite.

    Rows already stored for today are deleted first so re-fetching does not
    create duplicates. Network failures are surfaced through Streamlit rather
    than raised.

    Args:
        url: Absolute URL of the news listing page to scrape.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Timeout prevents the Streamlit worker from hanging on a dead server.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            # Previously a non-200 page was silently ignored; report it instead.
            st.error("News page returned HTTP " + str(response.status_code))
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        news_items = soup.find_all('article', limit=10)
        with sqlite3.connect(DATABASE_NAME) as conn:
            # Clear today's existing entries to prevent duplicates.
            conn.execute("DELETE FROM articles WHERE date(retrieval_date) = date('now')")
            for item in news_items:
                a_tag = item.find('a')
                if not a_tag:
                    continue
                title = a_tag.text.strip()
                # Resolve possibly-relative hrefs against the listing page URL.
                link = urljoin(url, a_tag.get('href'))
                # Fetch the article text and summarize it.
                article_response = requests.get(link, headers=headers, timeout=10)
                article_soup = BeautifulSoup(article_response.text, 'html.parser')
                # NOTE(review): assumes the article body lives in <p> tags — confirm selector.
                article_text = ' '.join(p.text for p in article_soup.find_all('p'))
                summary = summarize(article_text)
                views = randint(1, 100)  # placeholder popularity metric
                conn.execute(
                    "INSERT INTO articles (title, link, summary, views, retrieval_date) VALUES (?, ?, ?, ?, ?)",
                    (title, link, summary, views, datetime.datetime.now()))
            # Single commit after all rows instead of one commit per article.
            conn.commit()
    except requests.RequestException as e:
        st.error("Exception during news fetch: " + str(e))
def get_popular_articles(selected_date):
    """Return a DataFrame of the five most-viewed articles stored on *selected_date*."""
    query = """
        SELECT title, link, summary
        FROM articles
        WHERE date(retrieval_date) = date(?)
        ORDER BY views DESC
        LIMIT 5
    """
    with sqlite3.connect(DATABASE_NAME) as conn:
        return pd.read_sql(query, conn, params=(selected_date,))
def display_articles(articles):
    """Render each article row as an expandable Streamlit entry.

    Shows a localized "no articles" notice when the DataFrame is empty.
    """
    if articles.empty:
        st.write("Не са намерени статии.")
        return
    articles.index = range(1, len(articles) + 1)  # human-friendly 1-based index
    for _, row in articles.iterrows():
        with st.expander(f"{row['title']}"):
            st.write(f"Накратко: {row['title']}")
            st.write(f"Прочетете повече: [link]({row['link']})")
def main():
    """Streamlit entry point: set up storage, optionally fetch news, render results."""
    st.title("TOP 10 News Summary from MediaPool")
    setup_database()
    selected_date = st.sidebar.date_input("Select a date", datetime.date.today())
    if st.sidebar.button("Fetch News Now"):
        fetch_news(BASE_URL + "bulgaria-cat2.html")  # Assume today's news is always on page 1
    # Display today's articles. The original leaked this connection; close it
    # deterministically once the DataFrame is materialized.
    # NOTE(review): SQLite's date('now') is UTC while rows are inserted with the
    # local-time datetime.now() — near midnight these can disagree; confirm.
    conn = sqlite3.connect(DATABASE_NAME)
    try:
        todays_articles = pd.read_sql(
            "SELECT title, link, summary FROM articles WHERE date(retrieval_date) = date('now')",
            conn)
    finally:
        conn.close()
    display_articles(todays_articles)
    # Display popular articles in the sidebar.
    popular_articles = get_popular_articles(selected_date)
    if not popular_articles.empty:
        st.sidebar.write(f"Most Popular Articles for {selected_date}:")
        for index, row in popular_articles.iterrows():
            st.sidebar.write(f"{row['title']} - {row['link']}")
    else:
        st.sidebar.write("No popular articles found.")
# Standard script-entry guard so importing this module does not launch the app.
if __name__ == "__main__":
    main()