-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest6.py
119 lines (106 loc) · 4.28 KB
/
test6.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import datetime
import os
import re  # Import regex
import sqlite3
from contextlib import closing
from urllib.parse import urljoin

import pandas as pd
import requests
import streamlit as st
from apscheduler.schedulers.background import BackgroundScheduler
from bs4 import BeautifulSoup
# Constants
# MediaPool front page — presumably the URL handed to fetch_news().
URL = "https://www.mediapool.bg/"
# SQLite file opened by every sqlite3.connect() call in this file
# (relative path, so it is resolved against the current working directory).
DATABASE_NAME = 'articles.db'
# Folder named after the moment this module was loaded.
# NOTE(review): this runs at import time, so each run of the script creates a
# new timestamped folder; nothing else in the visible code writes into it.
FOLDER_NAME = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
os.makedirs(FOLDER_NAME, exist_ok=True)
# Sanitize title to create safe file names
def sanitize_title(title, max_length=50):
    """Turn an article title into text that is safe to use as a file name.

    Removes the characters that are reserved in file names on common
    platforms (backslash, slash, ``* ? : " < > |``) as well as ASCII control
    characters (newline, tab, NUL, ...), then truncates the result.

    Args:
        title: Raw title text.
        max_length: Maximum number of characters to keep (default 50).

    Returns:
        The cleaned title, at most ``max_length`` characters long. The result
        is empty when the title consisted only of removed characters.
    """
    # \x00-\x1f is stripped too: scraped titles can carry embedded newlines
    # or tabs, which are not valid in file names either.
    safe_title = re.sub(r'[\\/*?:"<>|\x00-\x1f]', '', title)
    return safe_title[:max_length]
def setup_database():
    """Create a fresh, empty ``articles`` table in the SQLite database.

    Any existing ``articles`` table is dropped first, so every call discards
    the rows stored so far.

    NOTE(review): main() calls this on every run, so stored articles never
    survive a rerun. ``CREATE TABLE IF NOT EXISTS`` would keep them, but that
    changes what the rest of the app sees, so the drop is preserved here.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement fails.
    """
    # sqlite3's own context manager only commits / rolls back; closing() is
    # what actually releases the connection once the block is left.
    with closing(sqlite3.connect(DATABASE_NAME)) as conn:
        with conn:  # commit on success, roll back on error
            conn.execute("DROP TABLE IF EXISTS articles")
            conn.execute('''
                CREATE TABLE articles (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT NOT NULL,
                    link TEXT NOT NULL,
                    summary TEXT,
                    retrieval_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')
    # The former "add retrieval_date if missing" step was removed: the table
    # is always created just above with that column, so the branch could never
    # run — and SQLite rejects ALTER TABLE ... ADD COLUMN with a
    # CURRENT_TIMESTAMP default anyway.
# Fetch and Summarize Articles
def fetch_news(url, timeout=10):
    """Scrape up to ten article links from *url* and store them in the database.

    For each of the first ten ``<article>`` elements, the first anchor that
    has both an ``href`` and visible text supplies the title and link. A
    placeholder summary is built from the title, and each row is inserted
    into the ``articles`` table (``retrieval_date`` is filled in by the
    column default).

    Args:
        url: Page to scrape; also used as the base for resolving relative links.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. HTTP/network failures are reported through ``st.error`` instead
        of being raised.

    Raises:
        sqlite3.Error: If the rows cannot be written.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        # Without a timeout, requests can block indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        st.error(f"Failed to retrieve articles: {str(e)}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    rows = []
    for item in soup.find_all('article', limit=10):
        # The first <a> in a teaser is often an image link with no text, and
        # anchors without href exist too; take the first usable one instead.
        a_tag = next(
            (a for a in item.find_all('a', href=True) if a.get_text(strip=True)),
            None,
        )
        if a_tag is None:
            continue
        title = a_tag.get_text(strip=True)
        # Resolve site-relative hrefs so the stored link works on its own.
        link = urljoin(url, a_tag['href'])
        summary = "Summary of article... " + title
        rows.append((title, link, summary))

    # closing() releases the connection; the inner `with conn` commits the
    # batch or rolls it back if an insert fails.
    with closing(sqlite3.connect(DATABASE_NAME)) as conn:
        with conn:
            # Insert without specifying retrieval_date
            conn.executemany(
                "INSERT INTO articles (title, link, summary) VALUES (?, ?, ?)",
                rows,
            )
# Retrieve recent popular articles
def get_recent_articles(hours=3, limit=5):
    """Return the most recently stored articles as a DataFrame.

    Args:
        hours: Only rows retrieved within this many hours are returned
            (default 3).
        limit: Maximum number of rows (default 5).

    Returns:
        A pandas DataFrame with columns ``Title``, ``Link`` and ``Summary``,
        newest first. The capitalised names match what main() and
        display_articles() index by. The frame is empty when nothing matches.

    Raises:
        Whatever pandas/sqlite3 raise if the query fails, e.g. when the
        ``articles`` table does not exist.
    """
    query = """
        SELECT title AS Title, link AS Link, summary AS Summary
        FROM articles
        WHERE retrieval_date > datetime('now', ?)
        ORDER BY retrieval_date DESC
        LIMIT ?
    """
    # Values are bound as parameters rather than formatted into the SQL.
    params = (f"-{float(hours):g} hours", int(limit))
    # closing() makes sure the connection is released; sqlite3's own context
    # manager would only commit and leave it open.
    with closing(sqlite3.connect(DATABASE_NAME)) as conn:
        return pd.read_sql(query, conn, params=params)
# Display Articles Function
def display_articles(articles):
    """Render each article as a collapsible Streamlit section.

    Args:
        articles: Records with ``Title``, ``Summary`` and ``Link`` fields —
            a list of dicts, a pandas DataFrame, or ``None``.

    Returns:
        None. Writes to the Streamlit page; a placeholder message is shown
        when there is nothing to display.
    """
    # Normalise first: `if articles:` raises ValueError for a DataFrame, while
    # DataFrame.empty works for every accepted input (None and [] included).
    df = pd.DataFrame(articles)
    if df.empty:
        st.write("No articles found or unable to fetch articles.")
        return
    for _, row in df.iterrows():
        with st.expander(f"{row['Title']}"):
            st.write(f"Summary: {row['Summary']}")
            st.write(f"Read more at: [link]({row['Link']})")
def retrieve_and_display_articles():
    """Scrape URL into the database, then render the most recent stored rows.

    This name is used by main() both as the scheduled job and as the button
    handler, but it was not defined anywhere in the file.

    NOTE(review): when run by the scheduler this executes on a background
    thread, where Streamlit display calls are unlikely to reach the page —
    only the fetch/store part is useful there.
    """
    fetch_news(URL)
    # Capitalise column names so this works whether the query returns
    # title/link/summary or Title/Link/Summary.
    recent = get_recent_articles().rename(columns=str.capitalize)
    # Hand over plain records; display_articles() accepts a list of dicts.
    display_articles(recent.to_dict("records"))


def _create_scheduler():
    """Start the background scheduler that refreshes articles every 12 hours."""
    scheduler = BackgroundScheduler()
    # next_run_time=now triggers one fetch right away instead of waiting 12 h.
    scheduler.add_job(
        retrieve_and_display_articles,
        'interval',
        hours=12,
        next_run_time=datetime.datetime.now(),
    )
    scheduler.start()
    return scheduler


# Streamlit App Layout
def main():
    """Build the page: title, sidebar with recent articles, scheduler, button."""
    st.title("Scheduled News Summary from MediaPool")
    setup_database()

    # Sidebar for popular articles
    st.sidebar.title("Most Popular Articles (Last 3 Hours)")
    # Tolerate either column casing from get_recent_articles().
    popular_articles = get_recent_articles().rename(columns=str.capitalize)
    if popular_articles.empty:
        st.sidebar.write("No popular articles found.")
    else:
        for _, row in popular_articles.iterrows():
            st.sidebar.header(row['Title'])
            st.sidebar.write(row['Summary'])
            st.sidebar.write(f"Read more at: [link]({row['Link']})")

    # Scheduling fetches.
    # Streamlit re-executes the script on every interaction; building the
    # scheduler through st.cache_resource keeps a single instance per process
    # instead of starting a new scheduler (and an immediate job) on each rerun.
    # NOTE(review): st.cache_resource requires Streamlit >= 1.18.
    st.cache_resource(_create_scheduler)()

    if st.button("Fetch News Now"):
        retrieve_and_display_articles()


if __name__ == "__main__":
    main()