Fix #2386 title issue in CEURWS scraper

If the title of a CEURWS proceedings contained a newline, incorrect QuickStatements would be generated. This fix normalizes consecutive whitespace in titles to a single regular space and strips leading and trailing whitespace.
WDscholia · Nov 24, 2023 · e73c309 · e73c309
1 parent de6d358
commit e73c309
Showing 1 changed file with 7 additions and 3 deletions.
diff --git a/scholia/scrape/ceurws.py b/scholia/scrape/ceurws.py
@@ -130,7 +130,7 @@ def tree_to_papers(tree, proceedings, proceedings_q, iso639='en'):
         paper['full_text_url'] = os.path.join(
             proceedings['url'],
             element.xpath(".//a")[0].attrib['href'])
-        paper['title'] = re.sub(r'\s+', ' ', title_elements[0].text)
+        paper['title'] = re.sub(r'\s+', ' ', title_elements[0].text).strip()
 
         # Authors
         authors = [
@@ -267,6 +267,9 @@ def paper_to_q(paper):
     response = requests.get(WDQS_URL,
                             params={'query': query, 'format': 'json'},
                             headers=HEADERS)
+    if not response.ok:
+        response.raise_for_status()
+
     data = response.json()['results']['bindings']
 
     if len(data) == 0 or not data[0]:
@@ -355,8 +358,9 @@ def proceedings_url_to_proceedings(url, return_tree=False):
     proceedings['urn'] = \
         tree.xpath("//span[@class='CEURURN']")[0].text
 
-    proceedings['title'] = \
-        tree.xpath("//span[@class='CEURFULLTITLE']")[0].text
+    proceedings['title'] = re.sub(
+        r'\s+', ' ',
+        tree.xpath("//span[@class='CEURFULLTITLE']")[0].text).strip()
 
     proceedings['date'] = \
         tree.xpath("//span[@class='CEURPUBDATE']")[0].text