Skip to content

Commit

Permalink
Fix #2386 title issue in CEURWS scraper
Browse files Browse the repository at this point in the history
If there was a newline in the title of the CEURWS proceedings, then
incorrect Quickstatements would be generated.
This fix normalizes consecutive whitespace to a single regular space.
  • Loading branch information
fnielsen committed Nov 24, 2023
1 parent de6d358 commit e73c309
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions scholia/scrape/ceurws.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def tree_to_papers(tree, proceedings, proceedings_q, iso639='en'):
paper['full_text_url'] = os.path.join(
proceedings['url'],
element.xpath(".//a")[0].attrib['href'])
paper['title'] = re.sub(r'\s+', ' ', title_elements[0].text)
paper['title'] = re.sub(r'\s+', ' ', title_elements[0].text).strip()

# Authors
authors = [
Expand Down Expand Up @@ -267,6 +267,9 @@ def paper_to_q(paper):
response = requests.get(WDQS_URL,
params={'query': query, 'format': 'json'},
headers=HEADERS)
if not response.ok:
response.raise_for_status()

data = response.json()['results']['bindings']

if len(data) == 0 or not data[0]:
Expand Down Expand Up @@ -355,8 +358,9 @@ def proceedings_url_to_proceedings(url, return_tree=False):
proceedings['urn'] = \
tree.xpath("//span[@class='CEURURN']")[0].text

proceedings['title'] = \
tree.xpath("//span[@class='CEURFULLTITLE']")[0].text
proceedings['title'] = re.sub(
r'\s+', ' ',
tree.xpath("//span[@class='CEURFULLTITLE']")[0].text).strip()

proceedings['date'] = \
tree.xpath("//span[@class='CEURPUBDATE']")[0].text
Expand Down

0 comments on commit e73c309

Please sign in to comment.