From fac50ebb54350fea602c05816ffbf22025faef6f Mon Sep 17 00:00:00 2001
From: Ewan Cahen
Date: Wed, 9 Oct 2024 20:22:38 +0200
Subject: [PATCH] feat: add scraper for mentions with an OpenAlex ID

---
 .../rsd/scraper/doi/MainCitations.java        |  4 +-
 .../rsd/scraper/doi/MainMentions.java         | 66 +++++++++++++------
 ...xCitations.java => OpenAlexConnector.java} | 44 +++++++++++--
 .../rsd/scraper/doi/OpenalexId.java           |  4 ++
 .../doi/PostgrestMentionRepository.java       |  2 +-
 .../scraper/doi/OpenAlexCitationsTest.java    |  2 +-
 6 files changed, 95 insertions(+), 27 deletions(-)
 rename scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/{OpenAlexCitations.java => OpenAlexConnector.java} (82%)

diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java
index 050d9069d..da9559c11 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java
@@ -36,7 +36,7 @@ public static void main(String[] args) {
 		PostgrestCitationRepository localCitationRepository = new PostgrestCitationRepository(backendUrl);
 		Collection<CitationData> referencePapersToScrape = localCitationRepository.leastRecentlyScrapedCitations(5);
 
-		OpenAlexCitations openAlexCitations = new OpenAlexCitations();
+		OpenAlexConnector openAlexConnector = new OpenAlexConnector();
 		PostgrestMentionRepository localMentionRepository = new PostgrestMentionRepository(backendUrl);
 		String email = Config.crossrefContactEmail().orElse(null);
 		Instant now = Instant.now();
@@ -47,7 +47,7 @@ public static void main(String[] args) {
 
 			LOGGER.info("Scraping for DOI {}, OpenAlex ID {}", citationData.doi(), citationData.openalexId());
 
-			Collection<ExternalMentionRecord> citingMentions = openAlexCitations.citations(citationData.openalexId(), citationData.doi(), email, citationData.id());
+			Collection<ExternalMentionRecord> citingMentions = openAlexConnector.citations(citationData.openalexId(), citationData.doi(), email, citationData.id());
 
 			// we don't update mentions that have a DOI in the database with OpenAlex data, as they can already be
 			// scraped through Crossref or DataCite
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java
index 7b996e0ad..b8fb0f767 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java
@@ -15,10 +15,13 @@
 import org.slf4j.LoggerFactory;
 
 import java.time.Instant;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
+import java.util.UUID;
 import java.util.stream.Collectors;
 
 public class MainMentions {
@@ -36,13 +39,27 @@ public static void main(String[] args) {
 		// we will remove successfully scraped mentions from here,
 		// we use this to set scrapedAt even for failed mentions,
 		// to put them back at the scraping queue
-		Map<Doi, RsdMentionIds> mentionsFailedToScrape = new HashMap<>();
+		Map<UUID, RsdMentionIds> mentionsFailedToScrape = new HashMap<>();
+		Map<Doi, UUID> doiToId = new HashMap<>();
+		Map<OpenalexId, UUID> openalexIdToId = new HashMap<>();
 		for (RsdMentionIds mentionIds : mentionsToScrape) {
-			mentionsFailedToScrape.put(mentionIds.doi(), mentionIds);
+			UUID id = mentionIds.id();
+			mentionsFailedToScrape.put(id, mentionIds);
+
+			Doi doi = mentionIds.doi();
+			if (doi != null) {
+				doiToId.put(doi, id);
+			}
+
+			OpenalexId openalexId = mentionIds.openalexId();
+			if (openalexId != null) {
+				openalexIdToId.put(openalexId, id);
+			}
 		}
 
 		String doisJoined = mentionsToScrape.stream()
 				.map(RsdMentionIds::doi)
+				.filter(Objects::nonNull)
 				.map(Doi::toUrlEncodedString)
 				.collect(Collectors.joining(","));
 		String jsonSources = null;
@@ -72,14 +89,14 @@ public static void main(String[] args) {
 		}
 
 		for (ExternalMentionRecord scrapedMention : scrapedDataciteMentions) {
 			Doi doi = scrapedMention.doi();
-			RsdMentionIds ids = mentionsFailedToScrape.get(doi);
+			UUID id = doiToId.get(doi);
 			try {
-				RsdMentionRecord mentionToUpdate = new RsdMentionRecord(ids.id(), scrapedMention, now);
+				RsdMentionRecord mentionToUpdate = new RsdMentionRecord(id, scrapedMention, now);
 				localMentionRepository.updateMention(mentionToUpdate, false);
-				mentionsFailedToScrape.remove(doi);
+				mentionsFailedToScrape.remove(id);
 			} catch (Exception e) {
 				LOGGER.error("Failed to update a DataCite mention with DOI {}", scrapedMention.doi());
-				Utils.saveExceptionInDatabase("Mention scraper", "mention", ids.id(), e);
+				Utils.saveExceptionInDatabase("Mention scraper", "mention", id, e);
 			}
 		}
@@ -94,30 +111,30 @@ public static void main(String[] args) {
 				.toList();
 		for (Doi crossrefDoi : crossrefDois) {
 			ExternalMentionRecord scrapedMention;
+			UUID id = doiToId.get(crossrefDoi);
 			try {
 				scrapedMention = new CrossrefMention(crossrefDoi).mentionData();
 			} catch (Exception e) {
 				LOGGER.error("Failed to scrape a Crossref mention with DOI {}", crossrefDoi);
 				RuntimeException exceptionWithMessage = new RuntimeException("Failed to scrape a Crossref mention with DOI " + crossrefDoi, e);
-				Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", mentionsFailedToScrape.get(crossrefDoi).id(), exceptionWithMessage);
+				Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", id, exceptionWithMessage);
 				continue;
 			}
-			Doi doi = scrapedMention.doi();
-			RsdMentionIds ids = mentionsFailedToScrape.get(doi);
-			RsdMentionRecord mentionToUpdate = new RsdMentionRecord(ids.id(), scrapedMention, now);
+			RsdMentionRecord mentionToUpdate = new RsdMentionRecord(id, scrapedMention, now);
 			try {
 				localMentionRepository.updateMention(mentionToUpdate, false);
-				mentionsFailedToScrape.remove(doi);
+				mentionsFailedToScrape.remove(id);
 			} catch (Exception e) {
 				RuntimeException exceptionWithMessage = new RuntimeException("Failed to update a Crossref mention with DOI " + crossrefDoi, e);
-				Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", ids.id(), exceptionWithMessage);
+				Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", id, exceptionWithMessage);
 			}
 		}
 		// END CROSSREF
 
 		// OPENALEX (for European Publication Office DOIs)
 		String email = Config.crossrefContactEmail().orElse(null);
-		Collection<ExternalMentionRecord> scrapedOpenalexMentions = List.of();
+		Collection<ExternalMentionRecord> scrapedOpenalexMentions = new ArrayList<>();
+		OpenAlexConnector openAlexConnector = new OpenAlexConnector();
 		Collection<Doi> europeanPublicationsOfficeDois = doiToSource.entrySet()
 				.stream()
 				.filter(doiSourceEntry -> doiSourceEntry.getValue().equals("OP"))
@@ -125,20 +142,31 @@ public static void main(String[] args) {
 				.map(Doi::fromString)
 				.toList();
 		try {
-			scrapedOpenalexMentions = new OpenAlexCitations().mentionData(europeanPublicationsOfficeDois, email);
+			scrapedOpenalexMentions.addAll(openAlexConnector.mentionDataByDois(europeanPublicationsOfficeDois, email));
 		} catch (Exception e) {
 			Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e);
 		}
+		Collection<OpenalexId> openalexIdsToScrape = mentionsToScrape
+				.stream()
+				.filter(ids -> ids.doi() == null && ids.openalexId() != null)
+				.map(RsdMentionIds::openalexId)
+				.toList();
+		try {
+			scrapedOpenalexMentions.addAll(openAlexConnector.mentionDataByOpenalexIds(openalexIdsToScrape, email));
+		} catch (Exception e) {
+			Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e);
+		}
+
 		for (ExternalMentionRecord scrapedMention : scrapedOpenalexMentions) {
-			Doi doi = scrapedMention.doi();
-			RsdMentionIds ids = mentionsFailedToScrape.get(doi);
-			RsdMentionRecord mentionToUpdate = new RsdMentionRecord(ids.id(), scrapedMention, now);
+			OpenalexId openalexId = scrapedMention.openalexId();
+			UUID id = openalexIdToId.get(openalexId);
+			RsdMentionRecord mentionToUpdate = new RsdMentionRecord(id, scrapedMention, now);
 			try {
 				localMentionRepository.updateMention(mentionToUpdate, true);
-				mentionsFailedToScrape.remove(doi);
+				mentionsFailedToScrape.remove(id);
 			} catch (Exception e) {
 				LOGGER.error("Failed to update an OpenAlex mention with DOI {}", scrapedMention.doi());
-				Utils.saveExceptionInDatabase("Mention scraper", "mention", ids.id(), e);
+				Utils.saveExceptionInDatabase("Mention scraper", "mention", id, e);
 			}
 		}
 		// END OPENALEX
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexConnector.java
similarity index 82%
rename from scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java
rename to scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexConnector.java
index 1e997b8b8..c9504a972 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexConnector.java
@@ -26,19 +26,20 @@
 import java.util.stream.Collectors;
 import java.util.stream.StreamSupport;
 
-class OpenAlexCitations {
+class OpenAlexConnector {
 
-	private static final Logger LOGGER = LoggerFactory.getLogger(OpenAlexCitations.class);
+	private static final Logger LOGGER = LoggerFactory.getLogger(OpenAlexConnector.class);
 
 	static final String DOI_FILTER_URL_UNFORMATTED = "https://api.openalex.org/works?filter=doi:%s";
 	static final String OPENALEX_ID_URL_UNFORMATTED = "https://api.openalex.org/works?filter=ids.openalex:%s";
 
-	public Collection<ExternalMentionRecord> mentionData(Collection<Doi> dataciteDois, String email) throws IOException, InterruptedException {
-		String filter = dataciteDois
+	public Collection<ExternalMentionRecord> mentionDataByDois(Collection<Doi> dois, String email) throws IOException, InterruptedException {
+		String filter = dois
 				.stream()
 				.filter(Objects::nonNull)
 				.map(Doi::toString)
 				.collect(Collectors.joining("|"));
+		// e.g. https://api.openalex.org/works?filter=doi:10.1038%2Fs41598-024-73248-4|10.5194%2Ftc-2022-249-rc1&per-page=200
 		String worksUri = DOI_FILTER_URL_UNFORMATTED.formatted(Utils.urlEncode(filter)) + "&per-page=200";
 
 		HttpResponse<String> response;
@@ -67,6 +68,41 @@ public Collection<ExternalMentionRecord> mentionData(Collection<Doi> dataciteDoi
 		return mentions;
 	}
 
+	public Collection<ExternalMentionRecord> mentionDataByOpenalexIds(Collection<OpenalexId> openalexIds, String email) throws IOException, InterruptedException {
+		String filter = openalexIds
+				.stream()
+				.filter(Objects::nonNull)
+				.map(OpenalexId::getOpenalexKey)
+				.collect(Collectors.joining("|"));
+		// e.g. https://api.openalex.org/works?filter=ids.openalex:W4402994101|W4319593220&per-page=200
+		String worksUri = OPENALEX_ID_URL_UNFORMATTED.formatted(Utils.urlEncode(filter)) + "&per-page=200";
+
+		HttpResponse<String> response;
+		if (email == null || email.isBlank()) {
+			response = Utils.getAsHttpResponse(worksUri);
+		} else {
+			response = Utils.getAsHttpResponse(worksUri, "User-Agent", "mailto:" + email);
+		}
+
+		JsonObject tree = JsonParser.parseString(response.body()).getAsJsonObject();
+		JsonArray citationsArray = tree
+				.getAsJsonArray("results");
+
+		Collection<ExternalMentionRecord> mentions = new ArrayList<>();
+		for (JsonElement citation : citationsArray) {
+			ExternalMentionRecord citationAsMention;
+			try {
+				citationAsMention = parseCitationAsMention(citation);
+			} catch (RuntimeException e) {
+				Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e);
+				continue;
+			}
+			mentions.add(citationAsMention);
+		}
+
+		return mentions;
+	}
+
 	public Collection<ExternalMentionRecord> citations(OpenalexId openalexId, Doi doi, String email, UUID id) throws IOException, InterruptedException {
 		// This shouldn't happen, but let's check it to prevent unexpected exceptions:
 		if (doi == null && openalexId == null) {
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenalexId.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenalexId.java
index def269a88..03ac67991 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenalexId.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenalexId.java
@@ -45,6 +45,10 @@ public String toUrlEncodedString() {
 		return Utils.urlEncode(toString());
 	}
 
+	public String getOpenalexKey() {
+		return openalexKey;
+	}
+
 	@Override
 	public String toString() {
 		return OPENALEX_ID_BASE + openalexKey;
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java
index 1b67a7068..1e841adae 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java
@@ -59,7 +59,7 @@ static RsdMentionIds parseSingleRsdIds(String json) {
 	}
 
 	public Collection<RsdMentionIds> leastRecentlyScrapedMentions(int limit) {
-		String data = Utils.getAsAdmin(backendUrl + "/mention?doi=not.is.null&order=scraped_at.asc.nullsfirst&select=id,doi,openalex_id&limit=" + limit);
+		String data = Utils.getAsAdmin(backendUrl + "/mention?or=(doi.not.is.null,openalex_id.not.is.null)&order=scraped_at.asc.nullsfirst&select=id,doi,openalex_id&limit=" + limit);
 		return parseMultipleRsdIds(data);
 	}
 
diff --git a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitationsTest.java b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitationsTest.java
index 1750e2de0..9addd5482 100644
--- a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitationsTest.java
+++ b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitationsTest.java
@@ -22,7 +22,7 @@ void givenLocationWithBackSlashes_whenExtractedAsLocation_thenSlashesUrlEncoded(
 		location.addProperty("landing_page_url", "https://www.example.com/path\\with\\slash");
 		array.add(location);
 
-		URI result = OpenAlexCitations.extractUrlFromLocation(array);
+		URI result = OpenAlexConnector.extractUrlFromLocation(array);
 
 		Assertions.assertNotNull(result);
 		Assertions.assertEquals("https://www.example.com/path%5Cwith%5Cslash", result.toString());