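"""GS-Crawler: a small Flask app that crawls Google Scholar profile and search-result pages,
keeps results with at least one Coventry University (CU) author, and re-crawls on a schedule."""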
# Import all the required libraries and methods
from flask import Flask, render_template, request, redirect, flash, url_for
from .forms import *
import urllib.request
from urllib.parse import urlparse,urljoin
from bs4 import BeautifulSoup,SoupStrainer
import requests
import validators
import json
import re
import itertools
from .index import *
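# Note: the wildcard import above is presumably what supplies scheduleTask (the job run by
# the scheduler at the bottom of this file) and any index-building helpers used elsewhere.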
from scholarly import scholarly
from urllib.request import urlopen
from flask_apscheduler import APScheduler
from .pages import pages
from scholarly import ProxyGenerator
# Initialise the Flask app and the scheduler
app = Flask(__name__)
app.secret_key = '67eadccda3bc198f2b9712c77912bd55'
scheduler = APScheduler()
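# The recurring crawl job itself is registered in the __main__ block at the bottom of this file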
# Optional proxy setup from scholarly
# pg = ProxyGenerator()
# pg.FreeProxies()
# scholarly.use_proxy(pg)
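# Uncommenting the three lines above routes scholarly's requests through free proxies,
# which can help when Google Scholar starts blocking or rate-limiting direct requests.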
@app.route("/profiles",methods=("GET", "POST"), strict_slashes=False)
def index():
try:
#A list to store all the author data
authorpack = list()
        # Loop through each of the three pages; for each page, crawl it and append the data
        # to the authorpack list
for page in pages:
#Assign a url to a variable requested url
requested_url = page
# Print each page url for demo purposes
print(requested_url)
            # Fetch the page contents as text for Beautiful Soup to parse
source = requests.get(requested_url).text
            # Parse the page contents with the lxml parser
soup = BeautifulSoup(source, 'lxml')
            # From the page contents, extract only the divs with class gsc_1usr;
            # at this point we have several divs that match the given class name
authors = soup.findAll('div', attrs={'class':'gsc_1usr'})
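            # Each gsc_1usr div is one author card on the Scholar profiles listing page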
            # Base URL to prepend to relative image paths
imgUrl = 'https://scholar.googleusercontent.com'
            # Base URL to prepend to relative profile paths
profileUrl = 'https://scholar.google.com'
            # Loop through each author block and extract all the available information
for author in authors:
author_image = author.find('img')
author_image = author_image["src"]
#Validate image path using validators
valid_imgpath = validators.url(author_image)
                # If the image path is not already an absolute URL, join it with imgUrl to form a complete path
                if not valid_imgpath:
                    author_image = urljoin(imgUrl, author_image)
author_name = author.find('h3').text
print(author_name)
profile_link = author.select('.gs_ai_name a')
profile_link = profileUrl + profile_link[0]['href']
author_affiliation = author.find("div", {"class": "gs_ai_aff"}).text
author_email = author.find("div", {"class": "gs_ai_eml"}).text
author_citations = author.find("div", {"class": "gs_ai_cby"}).text
author_tags = author.find("div", {"class": "gs_ai_int"}).text
                # To fetch all the papers for each author, we also parse the author's profile page
r = requests.get(profile_link).text
aProfile = BeautifulSoup(r,"html.parser")
#Find all paper links using the 'a' tag
papers = aProfile.find_all('a', class_='gsc_a_at')
#A list to store all the papers
paperpack = []
                # Loop through each paper link and store its title text
for paper in papers:
paperpack.append(paper.text)
                # Pack all the author details into a single tuple for organised storage
                x = (author_image, author_name, profile_link, author_affiliation, author_email, author_citations, author_tags, paperpack, index)
                # Now add the tuple to the authorpack list defined above
                authorpack.append(x)
        # After the loop, render the template populated with the results
return render_template("index.html",authorpack=authorpack,title="GS-Crawler | Profiles")
#Catch errors from blocked connections
except requests.exceptions.ConnectionError:
flash(f"Connection Refused !", "danger")
#Default route template
return render_template("index.html",title="GS-Crawler | Profiles")
@app.route("/",methods=("GET", "POST"), strict_slashes=False)
def search():
#Flask form to render the text field
form = Url()
    # Get the value the user typed in the search bar
query = request.form.get('text')
try:
        # If the query is not empty, proceed
if query:
            # A list of the URLs to be crawled; only one is used for demo purposes
q_pages = ['http://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q='+query,
# 'https://scholar.google.com/scholar?start=20&q=' +query +'&hl=en&as_sdt=0,5',
# 'https://scholar.google.com/scholar?start=30&q=' +query +'&hl=en&as_sdt=0,5',
# 'https://scholar.google.com/scholar?start=40&q=' +query +'&hl=en&as_sdt=0,5',
]
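            # More result pages can be crawled by uncommenting the extra entries above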
#Loop through all the available pages
for r_page in q_pages:
                # Assign the page for this loop iteration to the url variable
requested_url = r_page
#Show on the terminal the url being crawled
print('Crawling page ....',requested_url)
                # Google Scholar base URL
baseUrl = 'https://scholar.google.com'
                # Fetch the page contents as text for Beautiful Soup to parse
source = requests.get(requested_url).text
                # Parse the page contents with the lxml parser
soup = BeautifulSoup(source, 'lxml')
#Fetch all divs with a class name gs_ri
articles = soup.findAll('div', attrs={'class':'gs_ri'})
                # Author IDs list, used to keep only the required authors
author_ids = list()
#A list to store all the articles and their data
articlepack = list()
                # Loop through each article and fetch all the relevant info
for article in articles:
title = article.find('h3').text
paper_link = article.select('.gs_rt a')
paper_link = paper_link[0]['href']
author = article.find("div", {"class": "gs_a"}).text
abstract =article.find("div",{"class" : "gs_rs"}).text
extra = article.find("div",{"class" : "gs_fl"}).text
citations = article.select('.gs_fl a:nth-of-type(3)')
citations = baseUrl + citations[0]['href']
related = article.select('.gs_fl a:nth-of-type(4)')
related = baseUrl + related[0]['href']
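                    # The nth-of-type selectors above assume the "Cited by" and "Related articles"
                    # links are the third and fourth anchors in each result's gs_fl footer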
                    # Show only results with at least one author from CU
                    # Search the publication by title, then fetch the author IDs
p_title = scholarly.search_pubs(title)
p_title = next(p_title)
# Process Author ID
a_id = p_title['author_id']
author_ids.append(a_id)
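                    # scholarly returns author_id as a list with one entry per listed author;
                    # entries may be empty strings for authors without a Scholar profile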
                    # Loop over every author ID on the publication; the paper is kept if any
                    # author or co-author turns out to be from CU
                    from_cu = False
                    for individual_id in a_id:
                        if not individual_id:
                            continue
                        # Author ID extracted, check whether this author/co-author is from CU
                        f_author = scholarly.search_author_id(individual_id)
                        # Extract the email domain from the author record
                        f_author_email = f_author['email_domain']
                        print(f_author_email)
                        if f_author_email == '@coventry.ac.uk':
                            from_cu = True
                            break
                    if from_cu:
                        # flash(f"At least one coauthor is from CU !", "info")
                        pack = (title, paper_link, author, abstract, extra, citations, related)
                        articlepack.append(pack)
                    # else:
                    #     flash(f"No coauthor is from CU !", "danger")
# print(author_ids)
return render_template("results.html",form=form,articlepack=articlepack,title="GS-Crawler | Search")
except requests.exceptions.ConnectionError:
flash(f"Connection Refused !", "danger")
return render_template("search.html",form=form,title="GS-Crawler | Search")
if __name__ == "__main__":
    # Schedule the crawler to run with the Flask app, automated to crawl every 3 days
scheduler.add_job(id = 'Scheduled Task', func=scheduleTask, trigger="interval", days=3)
scheduler.start()
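    # Note: with debug=True, Flask's reloader imports this module twice, which can register the
    # scheduled job twice; passing use_reloader=False to app.run avoids the duplicate job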
app.run(debug=True)