feat: add scraper for mentions with an OpenAlex ID
ewan-escience committed Oct 10, 2024
1 parent 55918e9 commit fac50eb
Showing 6 changed files with 95 additions and 27 deletions.
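In short: the mention scraper previously queued only mentions that have a DOI. This commit renames OpenAlexCitations to OpenAlexConnector, adds a bulk lookup of works by OpenAlex ID, and re-keys the failed-mention bookkeeping by RSD mention UUID instead of DOI, since a DOI may now be absent. A condensed sketch of the flow the commit wires up in MainMentions below — not code from the commit itself; the limit and email are illustrative, and backendUrl comes from the scraper configuration:

    // Fetch the mentions that are due for scraping (limit illustrative):
    PostgrestMentionRepository repo = new PostgrestMentionRepository(backendUrl);
    Collection<RsdMentionIds> toScrape = repo.leastRecentlyScrapedMentions(20);

    // Mentions without a DOI can now still be scraped via their OpenAlex ID:
    Collection<OpenalexId> openalexIds = toScrape.stream()
        .filter(ids -> ids.doi() == null && ids.openalexId() != null)
        .map(RsdMentionIds::openalexId)
        .toList();
    try {
        Collection<ExternalMentionRecord> scraped =
            new OpenAlexConnector().mentionDataByOpenalexIds(openalexIds, email);
    } catch (Exception e) {
        // MainMentions records such failures via Utils.saveExceptionInDatabase
    }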
==================== changed file 1 of 6 ====================
@@ -36,7 +36,7 @@ public static void main(String[] args) {
         PostgrestCitationRepository localCitationRepository = new PostgrestCitationRepository(backendUrl);
 
         Collection<CitationData> referencePapersToScrape = localCitationRepository.leastRecentlyScrapedCitations(5);
-        OpenAlexCitations openAlexCitations = new OpenAlexCitations();
+        OpenAlexConnector openAlexConnector = new OpenAlexConnector();
         PostgrestMentionRepository localMentionRepository = new PostgrestMentionRepository(backendUrl);
         String email = Config.crossrefContactEmail().orElse(null);
         Instant now = Instant.now();
@@ -47,7 +47,7 @@ public static void main(String[] args) {
 
             LOGGER.info("Scraping for DOI {}, OpenAlex ID {}", citationData.doi(), citationData.openalexId());
 
-            Collection<ExternalMentionRecord> citingMentions = openAlexCitations.citations(citationData.openalexId(), citationData.doi(), email, citationData.id());
+            Collection<ExternalMentionRecord> citingMentions = openAlexConnector.citations(citationData.openalexId(), citationData.doi(), email, citationData.id());
             // we don't update mentions that have a DOI in the database with OpenAlex data, as they can already be
             // scraped through Crossref or DataCite
==================== changed file 2 of 6 ====================
@@ -15,10 +15,13 @@
 import org.slf4j.LoggerFactory;
 
 import java.time.Instant;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
+import java.util.UUID;
 import java.util.stream.Collectors;
 
 public class MainMentions {
@@ -36,13 +39,27 @@ public static void main(String[] args) {
         // we will remove successfully scraped mentions from here,
         // we use this to set scrapedAt even for failed mentions,
         // to put them back at the scraping queue
-        Map<Doi, RsdMentionIds> mentionsFailedToScrape = new HashMap<>();
+        Map<UUID, RsdMentionIds> mentionsFailedToScrape = new HashMap<>();
+        Map<Doi, UUID> doiToId = new HashMap<>();
+        Map<OpenalexId, UUID> openalexIdToId = new HashMap<>();
         for (RsdMentionIds mentionIds : mentionsToScrape) {
-            mentionsFailedToScrape.put(mentionIds.doi(), mentionIds);
+            UUID id = mentionIds.id();
+            mentionsFailedToScrape.put(id, mentionIds);
+
+            Doi doi = mentionIds.doi();
+            if (doi != null) {
+                doiToId.put(doi, id);
+            }
+
+            OpenalexId openalexId = mentionIds.openalexId();
+            if (openalexId != null) {
+                openalexIdToId.put(openalexId, id);
+            }
         }
 
         String doisJoined = mentionsToScrape.stream()
             .map(RsdMentionIds::doi)
+            .filter(Objects::nonNull)
             .map(Doi::toUrlEncodedString)
             .collect(Collectors.joining(","));
         String jsonSources = null;
@@ -72,14 +89,14 @@ public static void main(String[] args) {
         }
         for (ExternalMentionRecord scrapedMention : scrapedDataciteMentions) {
             Doi doi = scrapedMention.doi();
-            RsdMentionIds ids = mentionsFailedToScrape.get(doi);
+            UUID id = doiToId.get(doi);
             try {
-                RsdMentionRecord mentionToUpdate = new RsdMentionRecord(ids.id(), scrapedMention, now);
+                RsdMentionRecord mentionToUpdate = new RsdMentionRecord(id, scrapedMention, now);
                 localMentionRepository.updateMention(mentionToUpdate, false);
-                mentionsFailedToScrape.remove(doi);
+                mentionsFailedToScrape.remove(id);
             } catch (Exception e) {
                 LOGGER.error("Failed to update a DataCite mention with DOI {}", scrapedMention.doi());
-                Utils.saveExceptionInDatabase("Mention scraper", "mention", ids.id(), e);
+                Utils.saveExceptionInDatabase("Mention scraper", "mention", id, e);
             }
 
         }
@@ -94,51 +111,62 @@ public static void main(String[] args) {
             .toList();
         for (Doi crossrefDoi : crossrefDois) {
             ExternalMentionRecord scrapedMention;
+            UUID id = doiToId.get(crossrefDoi);
             try {
                 scrapedMention = new CrossrefMention(crossrefDoi).mentionData();
             } catch (Exception e) {
                 LOGGER.error("Failed to scrape a Crossref mention with DOI {}", crossrefDoi);
                 RuntimeException exceptionWithMessage = new RuntimeException("Failed to scrape a Crossref mention with DOI " + crossrefDoi, e);
-                Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", mentionsFailedToScrape.get(crossrefDoi).id(), exceptionWithMessage);
+                Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", id, exceptionWithMessage);
                 continue;
             }
-            Doi doi = scrapedMention.doi();
-            RsdMentionIds ids = mentionsFailedToScrape.get(doi);
-            RsdMentionRecord mentionToUpdate = new RsdMentionRecord(ids.id(), scrapedMention, now);
+            RsdMentionRecord mentionToUpdate = new RsdMentionRecord(id, scrapedMention, now);
             try {
                 localMentionRepository.updateMention(mentionToUpdate, false);
-                mentionsFailedToScrape.remove(doi);
+                mentionsFailedToScrape.remove(id);
             } catch (Exception e) {
                 RuntimeException exceptionWithMessage = new RuntimeException("Failed to update a Crossref mention with DOI " + crossrefDoi, e);
-                Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", ids.id(), exceptionWithMessage);
+                Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", id, exceptionWithMessage);
             }
         }
         // END CROSSREF
 
         // OPENALEX (for European Publication Office DOIs)
         String email = Config.crossrefContactEmail().orElse(null);
-        Collection<ExternalMentionRecord> scrapedOpenalexMentions = List.of();
+        Collection<ExternalMentionRecord> scrapedOpenalexMentions = new ArrayList<>();
+        OpenAlexConnector openAlexConnector = new OpenAlexConnector();
         Collection<Doi> europeanPublicationsOfficeDois = doiToSource.entrySet()
             .stream()
             .filter(doiSourceEntry -> doiSourceEntry.getValue().equals("OP"))
             .map(Map.Entry::getKey)
             .map(Doi::fromString)
             .toList();
         try {
-            scrapedOpenalexMentions = new OpenAlexCitations().mentionData(europeanPublicationsOfficeDois, email);
+            scrapedOpenalexMentions.addAll(openAlexConnector.mentionDataByDois(europeanPublicationsOfficeDois, email));
         } catch (Exception e) {
             Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e);
         }
+        Collection<OpenalexId> openalexIdsToScrape = mentionsToScrape
+            .stream()
+            .filter(ids -> ids.doi() == null && ids.openalexId() != null)
+            .map(RsdMentionIds::openalexId)
+            .toList();
+        try {
+            scrapedOpenalexMentions.addAll(openAlexConnector.mentionDataByOpenalexIds(openalexIdsToScrape, email));
+        } catch (Exception e) {
+            Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e);
+        }
 
         for (ExternalMentionRecord scrapedMention : scrapedOpenalexMentions) {
-            Doi doi = scrapedMention.doi();
-            RsdMentionIds ids = mentionsFailedToScrape.get(doi);
-            RsdMentionRecord mentionToUpdate = new RsdMentionRecord(ids.id(), scrapedMention, now);
+            OpenalexId openalexId = scrapedMention.openalexId();
+            UUID id = openalexIdToId.get(openalexId);
+            RsdMentionRecord mentionToUpdate = new RsdMentionRecord(id, scrapedMention, now);
             try {
                 localMentionRepository.updateMention(mentionToUpdate, true);
-                mentionsFailedToScrape.remove(doi);
+                mentionsFailedToScrape.remove(id);
             } catch (Exception e) {
                 LOGGER.error("Failed to update an OpenAlex mention with DOI {}", scrapedMention.doi());
-                Utils.saveExceptionInDatabase("Mention scraper", "mention", ids.id(), e);
+                Utils.saveExceptionInDatabase("Mention scraper", "mention", id, e);
             }
         }
         // END OPENALEX
==================== changed file 3 of 6 ====================
@@ -26,19 +26,20 @@
 import java.util.stream.Collectors;
 import java.util.stream.StreamSupport;
 
-class OpenAlexCitations {
+class OpenAlexConnector {
 
-    private static final Logger LOGGER = LoggerFactory.getLogger(OpenAlexCitations.class);
+    private static final Logger LOGGER = LoggerFactory.getLogger(OpenAlexConnector.class);
 
     static final String DOI_FILTER_URL_UNFORMATTED = "https://api.openalex.org/works?filter=doi:%s";
+    static final String OPENALEX_ID_URL_UNFORMATTED = "https://api.openalex.org/works?filter=ids.openalex:%s";
 
-    public Collection<ExternalMentionRecord> mentionData(Collection<Doi> dataciteDois, String email) throws IOException, InterruptedException {
-        String filter = dataciteDois
+    public Collection<ExternalMentionRecord> mentionDataByDois(Collection<Doi> dois, String email) throws IOException, InterruptedException {
+        String filter = dois
             .stream()
             .filter(Objects::nonNull)
             .map(Doi::toString)
             .collect(Collectors.joining("|"));
+        // e.g. https://api.openalex.org/works?filter=doi:10.1038%2Fs41598-024-73248-4|10.5194%2Ftc-2022-249-rc1&per-page=200
         String worksUri = DOI_FILTER_URL_UNFORMATTED.formatted(Utils.urlEncode(filter)) + "&per-page=200";
 
         HttpResponse<String> response;
Expand Down Expand Up @@ -67,6 +68,41 @@ public Collection<ExternalMentionRecord> mentionData(Collection<Doi> dataciteDoi
return mentions;
}

public Collection<ExternalMentionRecord> mentionDataByOpenalexIds(Collection<OpenalexId> openalexIds, String email) throws IOException, InterruptedException {
String filter = openalexIds
.stream()
.filter(Objects::nonNull)
.map(OpenalexId::getOpenalexKey)
.collect(Collectors.joining("|"));
// e.g. https://api.openalex.org/works?filter=ids.openalex:W4402994101|W4319593220&per-page=200
String worksUri = OPENALEX_ID_URL_UNFORMATTED.formatted(Utils.urlEncode(filter)) + "&per-page=200";

HttpResponse<String> response;
if (email == null || email.isBlank()) {
response = Utils.getAsHttpResponse(worksUri);
} else {
response = Utils.getAsHttpResponse(worksUri, "User-Agent", "mailto:" + email);
}

JsonObject tree = JsonParser.parseString(response.body()).getAsJsonObject();
JsonArray citationsArray = tree
.getAsJsonArray("results");

Collection<ExternalMentionRecord> mentions = new ArrayList<>();
for (JsonElement citation : citationsArray) {
ExternalMentionRecord citationAsMention;
try {
citationAsMention = parseCitationAsMention(citation);
} catch (RuntimeException e) {
Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e);
continue;
}
mentions.add(citationAsMention);
}

return mentions;
}

public Collection<ExternalMentionRecord> citations(OpenalexId openalexId, Doi doi, String email, UUID id) throws IOException, InterruptedException {
// This shouldn't happen, but let's check it to prevent unexpected exceptions:
if (doi == null && openalexId == null) {
Expand Down
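For reference, the request the new mentionDataByOpenalexIds method builds decodes to the example in its inline comment; the mailto User-Agent header is only sent when a contact email is configured, which OpenAlex uses to route the request to its more reliable "polite pool". A sketch of the resulting request, with a placeholder email:

    GET https://api.openalex.org/works?filter=ids.openalex:W4402994101|W4319593220&per-page=200
    User-Agent: mailto:contact@example.org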
==================== changed file 4 of 6 ====================
@@ -45,6 +45,10 @@ public String toUrlEncodedString() {
         return Utils.urlEncode(toString());
     }
 
+    public String getOpenalexKey() {
+        return openalexKey;
+    }
+
     @Override
     public String toString() {
         return OPENALEX_ID_BASE + openalexKey;
==================== changed file 5 of 6 ====================
@@ -59,7 +59,7 @@ static RsdMentionIds parseSingleRsdIds(String json) {
     }
 
     public Collection<RsdMentionIds> leastRecentlyScrapedMentions(int limit) {
-        String data = Utils.getAsAdmin(backendUrl + "/mention?doi=not.is.null&order=scraped_at.asc.nullsfirst&select=id,doi,openalex_id&limit=" + limit);
+        String data = Utils.getAsAdmin(backendUrl + "/mention?or=(doi.not.is.null,openalex_id.not.is.null)&order=scraped_at.asc.nullsfirst&select=id,doi,openalex_id&limit=" + limit);
         return parseMultipleRsdIds(data);
     }
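The broadened PostgREST filter above queues a mention for scraping when either identifier is present, not just a DOI, using PostgREST's or=(...) syntax. Decoded, the request reads as follows (limit illustrative, {backendUrl} taken from configuration):

    GET {backendUrl}/mention?or=(doi.not.is.null,openalex_id.not.is.null)&order=scraped_at.asc.nullsfirst&select=id,doi,openalex_id&limit=5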
==================== changed file 6 of 6 ====================
@@ -22,7 +22,7 @@ void givenLocationWithBackSlashes_whenExtractedAsLocation_thenSlashesUrlEncoded(
         location.addProperty("landing_page_url", "https://www.example.com/path\\with\\slash");
         array.add(location);
 
-        URI result = OpenAlexCitations.extractUrlFromLocation(array);
+        URI result = OpenAlexConnector.extractUrlFromLocation(array);
 
         Assertions.assertNotNull(result);
         Assertions.assertEquals("https://www.example.com/path%5Cwith%5Cslash", result.toString());
