scrap.py
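"""Scrape recent InfoMoney articles via the site's Google News sitemap.

Workflow: download the news sitemap, collect each article's URL and title,
fetch every article page, extract its paragraph text (skipping ad blocks),
and save the results to a CSV file.
"""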
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree

def fetch_sitemap(url):
    # Download the sitemap and parse it with an XML-aware parser so the
    # namespaced <news:...> tags are preserved.
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, features="xml")
        return soup
    else:
        print(f"Failed to fetch sitemap: {response.status_code}")
        return None
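
# For reference, Google News sitemaps follow this general shape (a sketch of
# the standard schema; the exact fields in InfoMoney's feed may vary):
#
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
#           xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
#     <url>
#       <loc>https://www.infomoney.com.br/some-article/</loc>
#       <news:news>
#         <news:title>Some headline</news:title>
#       </news:news>
#     </url>
#   </urlset>
#
# extract_news_metadata() below pulls out the <loc>/<news:title> pairs.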

def extract_news_metadata(soup):
    # Collect the URL and headline of every <url> entry that carries
    # Google News metadata (<news:news>).
    news_items = []
    for url in soup.find_all("url"):
        loc = url.find("loc").text if url.find("loc") else None
        news = url.find("news:news")
        if news:
            title = news.find("news:title").text if news.find("news:title") else None
            news_items.append(
                {
                    "url": loc,
                    "title": title,
                }
            )
            print(f"URL: {loc} | Title: {title}")
    return news_items

def clean_text(text):
    # Strip numeric HTML entities (e.g. &#8220;) and collapse runs of
    # whitespace into single spaces.
    text = re.sub(r"&#[0-9]+;", "", text)
    text = " ".join(text.split())
    return text


def ends_with_punctuation(text):
    return text.endswith((".", "!", "?"))
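
# Quick sanity checks (expected behaviour, not part of the script's output):
#   clean_text("Mercado&#8230;   em alta")   -> "Mercado em alta"
#   ends_with_punctuation("Fed mantém juros") -> False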

def scrape_article(news_item):
    # Fetch an article page and extract its text, skipping paragraphs that
    # sit inside ad containers.
    response = requests.get(news_item["url"])
    if response.status_code == 200:
        tree = etree.HTML(response.content)
        content_element = tree.xpath('//article[@data-ds-component="article"]')
        if not content_element:
            print(f"No article element found for URL: {news_item['url']}")
            return None
        content_element = content_element[0]
        # Keep every <p> except those directly inside an ad <div>.
        paragraphs = content_element.xpath(
            './/p[not(parent::div[@data-ds-component="ad"])]'
        )
        content = " ".join(
            etree.tostring(para, method="text", encoding="unicode").strip()
            for para in paragraphs
        )
        title = clean_text(news_item["title"])
        content = clean_text(content)
        # Join title and body, adding a period when the title lacks final
        # punctuation so the two read as separate sentences.
        if not ends_with_punctuation(title):
            full_content = f"{title}. {content}"
        else:
            full_content = f"{title} {content}"
        return {
            "url": news_item["url"],
            "full_content": full_content,
        }
    else:
        print(f"Failed to fetch article: {response.status_code}")
        return None

if __name__ == "__main__":
    soup = fetch_sitemap("https://www.infomoney.com.br/news-sitemap.xml")
    if soup:
        news_metadata = extract_news_metadata(soup)
        articles_data = []
        for item in news_metadata:
            article_data = scrape_article(item)
            if article_data:
                articles_data.append(article_data)
        if articles_data:
            df = pd.DataFrame(articles_data)
            print(df)
            # to_csv() does not create missing directories, so make sure the
            # output folder exists first.
            os.makedirs("datasets", exist_ok=True)
            df.to_csv("datasets/infomoney_news.csv", index=False)
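
# Run with:  python scrap.py
# Output: datasets/infomoney_news.csv with columns `url` and `full_content`.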