scraper.py
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import time
import re
from urllib.parse import urljoin
def find_class_text(element, pattern):
    """Return the cleaned text of the first descendant whose class matches pattern, or ''."""
    child = element.find(class_=re.compile(pattern))
    return clean_text(child.text) if child else ""

def scrape_scholarships():
    programs = []
    # Extended list of websites to scrape
    websites = [
        "https://opportunitiescorners.com/",
        "https://scholarshipscorner.website/",
        "https://www.youthop.com/",
        "https://opportunitiesforyouth.org/",
        "https://greatyop.com/",
        "https://www.un.org/en/",
        "https://www.levels.fyi/",
        "https://www.indeed.com/",
        "https://www.unesco.org/",
        "https://www.scholarshipportal.com/",
        "https://www.findaphd.com/",
        "https://www.educations.com/",
        "https://www.scholars4dev.com/",
        "https://www.opportunitiesforafricans.com/",
        "https://www.mladiinfo.eu/",
        "https://www.afterschoolafrica.com/",
        "https://www.european-funding-guide.eu/"
    ]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    for website in websites:
        try:
            # Skip LinkedIn and Google as they require authentication/special handling
            if "linkedin.com" in website or "google.com" in website:
                continue
            response = requests.get(website, headers=headers, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Look for containers whose class names suggest a program listing
            program_elements = soup.find_all(['div', 'article'], class_=re.compile(r'program|scholarship|opportunity|job|position|grant'))
            for element in program_elements:
                # Extract year from deadline or posting date
                date_text = element.find(string=re.compile(r'\d{4}'))
                year = extract_year(date_text) if date_text else "Ongoing"
                program = {
                    'title': find_class_text(element, r'title|heading'),
                    'university': find_class_text(element, r'university|institution|organization|company'),
                    'country': find_class_text(element, r'country|location'),
                    'deadline': find_class_text(element, r'deadline|date|apply-by'),
                    'type': determine_program_type(element),
                    'funding': extract_funding_info(element),
                    'year': year,
                    'link': extract_link(element, website),
                    'description': find_class_text(element, r'description|content|summary'),
                    'requirements': extract_requirements(element),
                    'posted_date': extract_posted_date(element),
                    'source': website
                }
                programs.append(program)
            # Throttle between sites to be gentle on servers (a robots.txt
            # check is sketched below)
            time.sleep(3)
        except Exception as e:
            print(f"Error scraping {website}: {str(e)}")
            continue
    return programs
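
# Note: the sleep in scrape_scholarships only throttles requests; it does not
# actually consult robots.txt. A minimal sketch of a real check using the
# standard library's urllib.robotparser (the helper name and cache below are
# illustrative additions, not part of the original script):
from urllib.robotparser import RobotFileParser

_robots_cache = {}

def allowed_by_robots(url, user_agent='*'):
    """Return True if the site's robots.txt permits fetching url."""
    robots_url = urljoin(url, '/robots.txt')
    parser = _robots_cache.get(robots_url)
    if parser is None:
        parser = RobotFileParser(robots_url)
        try:
            parser.read()
        except Exception:
            # robots.txt unreachable: fall back to allowing the fetch
            return True
        _robots_cache[robots_url] = parser
    return parser.can_fetch(user_agent, url)

# Hypothetical usage inside the scraping loop:
#     if not allowed_by_robots(website):
#         continue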
def clean_text(text):
    """Clean and normalize whitespace in text content"""
    return ' '.join(text.strip().split())

def extract_year(text):
    """Extract a four-digit year (20xx) from text"""
    match = re.search(r'\b20\d{2}\b', text)
    return match.group(0) if match else "Ongoing"
def determine_program_type(element):
    """Determine program type based on keywords in the element's text"""
    text_content = element.get_text().lower()
    # Insertion order matters: the first matching keyword wins
    type_keywords = {
        'competition': 'competitions',
        'conference': 'conferences',
        'papers': 'papers_conferences',
        'code camp': 'code_camp',
        'hackathon': 'hackathons',
        'exchange': 'exchange_programs',
        'entrepreneur': 'entrepreneurial',
        'internship': 'internships',
        'fellowship': 'fellowships',
        'course': 'online_courses',
        'leadership': 'leadership',
        'sdg': 'sdgs',
        'summer program': 'summer_programs',
        'summer school': 'summer_schools',
        'training': 'training',
        'youth forum': 'youth_forums',
        'united nations': 'united_nations',
        'workshop': 'workshops',
        'government': 'gov_scholarships',
        'high school': 'highschool',
        'master': 'masters',
        'mba': 'mba',
        'phd': 'phd',
        'postdoc': 'postdoc',
        'undergraduate': 'undergraduate'
    }
    for keyword, program_type in type_keywords.items():
        if keyword in text_content:
            return program_type
    return 'miscellaneous'
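
# Dict insertion order sets match priority: text containing both 'fellowship'
# and 'phd' returns 'fellowships' because 'fellowship' is checked first.
# A small illustration with a hypothetical snippet:
# >>> snippet = BeautifulSoup('<div>Fully funded PhD fellowship</div>', 'html.parser')
# >>> determine_program_type(snippet.div)
# 'fellowships'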
def extract_funding_info(element):
    """Extract funding information"""
    funding_element = element.find(class_=re.compile(r'funding|scholarship|award'))
    if funding_element:
        return clean_text(funding_element.text)
    return "Contact institution for funding details"
def extract_link(element, base_url):
    """Extract the program link and resolve it against the page URL"""
    link_element = element.find('a', href=True)
    if link_element:
        # urljoin handles absolute, root-relative, and page-relative hrefs
        return urljoin(base_url, link_element['href'])
    return base_url
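
# How the urljoin resolution above behaves (URLs are illustrative examples):
# >>> urljoin('https://greatyop.com/', 'awards/2024/')        # page-relative
# 'https://greatyop.com/awards/2024/'
# >>> urljoin('https://greatyop.com/list/', '/awards/2024/')  # root-relative
# 'https://greatyop.com/awards/2024/'
# >>> urljoin('https://greatyop.com/', 'https://example.org/x')  # absolute wins
# 'https://example.org/x'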
def extract_requirements(element):
    """Extract program requirements"""
    req_element = element.find(class_=re.compile(r'requirements|eligibility'))
    if req_element:
        return clean_text(req_element.text)
    return "See program website for detailed requirements"

def extract_posted_date(element):
    """Extract posting date"""
    date_element = element.find(class_=re.compile(r'posted|published|date'))
    if date_element:
        return clean_text(date_element.text)
    return "Not specified"
if __name__ == "__main__":
    programs = scrape_scholarships()
    # Save to a JSON file with a timestamped name
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f'programs_{timestamp}.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(programs, f, indent=2, ensure_ascii=False)
    print(f"Scraped {len(programs)} programs and saved to {filename}")