From a7d84ea9dc667edb8cf9669e95c22ff53c7a5a76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Jacewicz?= Date: Thu, 10 Sep 2020 16:22:53 +0200 Subject: [PATCH 1/4] Closes #1027: Remove console log statements in tests --- .../GenerateOoziePropertiesMojo.java | 8 +++- .../AffMatchingAffOrgQualityTest.java | 6 ++- .../AffMatchingDocOrgQualityTest.java | 12 +++-- .../affmatching/AffMatchingResultPrinter.java | 44 ++++++++++--------- ...OrgMatchVoterStrengthEstimatorAndTest.java | 43 ++++++++---------- .../src/test/resources/log4j.properties | 9 ++++ .../content/ObjectStoresProvider.java | 6 ++- 7 files changed, 74 insertions(+), 54 deletions(-) create mode 100644 iis-wf/iis-wf-affmatching/src/test/resources/log4j.properties diff --git a/iis-build/iis-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java b/iis-build/iis-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java index 389208e2b..db35ccf55 100644 --- a/iis-build/iis-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java +++ b/iis-build/iis-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java @@ -9,6 +9,8 @@ import org.apache.maven.plugin.AbstractMojo; import org.apache.maven.plugin.MojoExecutionException; import org.apache.maven.plugin.MojoFailureException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Generates oozie properties which were not provided from commandline. 
@@ -18,6 +20,8 @@ */ public class GenerateOoziePropertiesMojo extends AbstractMojo { + private static final Logger logger = LoggerFactory.getLogger(GenerateOoziePropertiesMojo.class); + public static final String PROPERTY_NAME_WF_SOURCE_DIR = "workflow.source.dir"; public static final String PROPERTY_NAME_SANDBOX_NAME = "sandboxName"; @@ -33,8 +37,8 @@ public void execute() throws MojoExecutionException, MojoFailureException { System.getProperties().setProperty(PROPERTY_NAME_SANDBOX_NAME, generatedSandboxName); } else { - System.out.println("unable to generate sandbox name from path: " + - System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); + logger.warn("unable to generate sandbox name from path: {}", + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); } } } diff --git a/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingAffOrgQualityTest.java b/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingAffOrgQualityTest.java index 3039ca5ea..a33e78188 100644 --- a/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingAffOrgQualityTest.java +++ b/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingAffOrgQualityTest.java @@ -20,6 +20,8 @@ import org.apache.spark.api.java.JavaSparkContext; import org.junit.*; import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import scala.Tuple2; import java.io.File; @@ -57,6 +59,8 @@ @Category(IntegrationTest.class) public class AffMatchingAffOrgQualityTest { + private static final Logger logger = LoggerFactory.getLogger(AffMatchingAffOrgQualityTest.class); + private final static boolean PRINT_NOT_MATCHED = true; private final static boolean PRINT_FALSE_POSITIVE_MATCHES = true; @@ -197,7 +201,7 @@ private void printQualityFactor(String factorName, int goodCount, int totalCount double factorPercentage = ((double) goodCount / totalCount) * 
100; String text = String.format("%-30s %5.2f%% (%d/%d)", factorName + ":", factorPercentage, goodCount, totalCount); - System.out.println(text); + logger.trace(text); } private AffMatchingService createAffMatchingService() throws IOException { diff --git a/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingDocOrgQualityTest.java b/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingDocOrgQualityTest.java index 1380a01ce..7435ffe9e 100644 --- a/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingDocOrgQualityTest.java +++ b/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingDocOrgQualityTest.java @@ -27,6 +27,8 @@ import eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject; import eu.dnetlib.iis.wf.affmatching.model.MatchedOrganization; import eu.dnetlib.iis.wf.affmatching.model.SimpleAffMatchResult; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import pl.edu.icm.sparkutils.test.SparkJob; import pl.edu.icm.sparkutils.test.SparkJobBuilder; import pl.edu.icm.sparkutils.test.SparkJobExecutor; @@ -46,7 +48,9 @@ */ @Category(IntegrationTest.class) public class AffMatchingDocOrgQualityTest { - + + private static final Logger logger = LoggerFactory.getLogger(AffMatchingDocOrgQualityTest.class); + private final static String INPUT_DATA_DIR_PATH = "src/test/resources/experimentalData/input"; private SparkJobExecutor executor = new SparkJobExecutor(); @@ -116,8 +120,8 @@ public void affiliationMatchingJob_combined_data() throws IOException { // log - - System.out.println("\nALL TEST DATA"); + + logger.trace("ALL TEST DATA"); readResultsAndPrintQualityRate(of( "src/test/resources/experimentalData/expectedOutput/matched_aff.json")); @@ -208,7 +212,7 @@ private void printQualityFactor(String factorName, int goodCount, int totalCount double factorPercentage = ((double)goodCount/totalCount)*100; String text = String.format("%-20s 
%5.2f%% (%d/%d)", factorName + ":", factorPercentage, goodCount, totalCount); - System.out.println(text); + logger.trace(text); } diff --git a/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingResultPrinter.java b/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingResultPrinter.java index 56ed5de1d..a2ed3b786 100644 --- a/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingResultPrinter.java +++ b/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/AffMatchingResultPrinter.java @@ -13,6 +13,8 @@ import eu.dnetlib.iis.metadataextraction.schemas.Affiliation; import eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata; import eu.dnetlib.iis.wf.affmatching.model.SimpleAffMatchResult; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A helper that prints results of affiliation matching (actual matched affiliations in relation to @@ -23,7 +25,9 @@ */ public class AffMatchingResultPrinter { - + + private static final Logger logger = LoggerFactory.getLogger(AffMatchingResultPrinter.class); + private static final Comparator RESULT_COMPARATOR = Comparator .comparing(SimpleAffMatchResult::getDocumentId) .thenComparingInt(SimpleAffMatchResult::getAffiliationPosition); @@ -49,8 +53,8 @@ public static void printFalsePositives(String inputAffDirPath, String inputOrgDi .filter(x -> !expectedMatches.contains(x)) .sorted(RESULT_COMPARATOR) .collect(toList()); - - System.out.println("\n\t-------------------- false positives ---------------------"); + + logger.trace("-------------------- false positives ---------------------"); for (SimpleAffMatchResult falsePositive : falsePositives) { @@ -63,14 +67,14 @@ public static void printFalsePositives(String inputAffDirPath, String inputOrgDi List expectedOrgs = expectedOrgIds.stream().map(x -> fetchOrganization(organizations, x)).collect(toList()); Organization actualOrg = fetchOrganization(organizations, 
falsePositive.getOrganizationId()); - - System.out.println("Document id: " + documentId + " \tPosition: " + affiliationPosition); - System.out.println("Affiliation: " + affiliation); - System.out.println("Was matched to: " + actualOrg); + + logger.trace("Document id: " + documentId + " \tPosition: " + affiliationPosition); + logger.trace("Affiliation: " + affiliation); + logger.trace("Was matched to: " + actualOrg); if (expectedOrgs.isEmpty()) { - System.out.println("Should match to: null"); + logger.trace("Should match to: null"); } for (int i=0; i !actualMatches.contains(x)) .sorted(RESULT_COMPARATOR) .collect(toList()); - - - System.out.println("\n\t--------------------- not matched --------------------"); + + + logger.trace("--------------------- not matched --------------------"); for (SimpleAffMatchResult match : notMatched) { Affiliation affiliation = fetchAffiliation(docsAffiliations, match.getDocumentId(), match.getAffiliationPosition()); Organization expectedOrg = fetchOrganization(organizations, match.getOrganizationId()); - - - System.out.println("Document id: " + match.getDocumentId() + " \tPosition: " + match.getAffiliationPosition()); - System.out.println("Affiliation: " + affiliation); - System.out.println("Should match to: " + expectedOrg); - System.out.println(); + + + logger.trace("Document id: " + match.getDocumentId() + " \tPosition: " + match.getAffiliationPosition()); + logger.trace("Affiliation: " + affiliation); + logger.trace("Should match to: " + expectedOrg); } } diff --git a/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/match/voter/AffOrgMatchVoterStrengthEstimatorAndTest.java b/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/match/voter/AffOrgMatchVoterStrengthEstimatorAndTest.java index 382a70c93..887ea2f9f 100644 --- a/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/match/voter/AffOrgMatchVoterStrengthEstimatorAndTest.java +++ 
b/iis-wf/iis-wf-affmatching/src/test/java/eu/dnetlib/iis/wf/affmatching/match/voter/AffOrgMatchVoterStrengthEstimatorAndTest.java @@ -29,6 +29,8 @@ import org.junit.BeforeClass; import org.junit.Test; import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; @@ -48,7 +50,6 @@ import static eu.dnetlib.iis.wf.affmatching.match.FirstWordsHashBucketMatcherFactory.createNameFirstWordsHashBucketMatcher; import static eu.dnetlib.iis.wf.affmatching.match.FirstWordsHashBucketMatcherFactory.createNameFirstWordsHashBucketMatcherVoters; import static eu.dnetlib.iis.wf.affmatching.match.MainSectionHashBucketMatcherFactory.*; -import static java.lang.System.out; import static java.util.stream.Collectors.toList; import static org.hamcrest.MatcherAssert.assertThat; @@ -63,6 +64,8 @@ @Category(IntegrationTest.class) public class AffOrgMatchVoterStrengthEstimatorAndTest { + private static final Logger logger = LoggerFactory.getLogger(AffOrgMatchVoterStrengthEstimatorAndTest.class); + private final static boolean PRINT_NOT_MATCHED = false; private final static boolean PRINT_FALSE_POSITIVES = true; @@ -141,8 +144,8 @@ public void estimateAndCheckVoterStrengths() throws IOException { // assert if (CollectionUtils.isNotEmpty(invalidVoterStrengths)) { - System.out.println("Invalid Voter Strengths. Change them manually to the calculated values (in the code):\n"); - invalidVoterStrengths.forEach(System.out::println); + logger.trace("Invalid Voter Strengths. 
Change them manually to the calculated values (in the code):"); + invalidVoterStrengths.stream().map(InvalidVoterStrength::toString).forEach(logger::debug); } assertThat(invalidVoterStrengths, Matchers.emptyIterable()); @@ -221,8 +224,6 @@ private void estimateVoterMatchStrengths(AffOrgMatcher affOrgMatcher, String aff FileUtils.deleteDirectory(new File(outputDirPath)); } - out.println("\n\n"); - FileUtils.deleteDirectory(workingDir); } @@ -238,15 +239,14 @@ private void checkIfVoterStrengthSetCorrectly(String affOrgMatcherName, AffOrgMa } private void printVoterHeader(AffOrgMatchVoter voter) { - out.println("\n\n"); - out.println("---------------------------------- VOTER ----------------------------------------"); - out.println(voter.toString() + "\n"); + logger.trace("---------------------------------- VOTER ----------------------------------------"); + logger.trace(voter.toString()); } private void printMatcherHeader(String affOrgMatcherName) { - out.println("\n\n=================================================================================="); - out.println("========================= " + affOrgMatcherName + " ==========================="); - out.println("=================================================================================="); + logger.trace("=================================================================================="); + logger.trace("========================= " + affOrgMatcherName + " ==========================="); + logger.trace("=================================================================================="); } private void createInputData() throws IOException { @@ -281,8 +281,6 @@ private float calcAndPrintResult(List expectedResultsJsonPaths) throws I printNumberDetails(expectedMatches.size(), actualMatches.size(), correctMatches.size(), falsePositives.size()); } - out.println(); - if (PRINT_FALSE_POSITIVES) { printFalsePositives(inputAffDirPath, inputOrgDirPath, expectedMatches, actualMatches); } @@ -299,25 +297,20 @@ private 
float calcMatchStrength(int numberOfActualMatches, int numberOfCorrectMa } private void printMatchStrength(float matchStrength) { - out.printf("%s %1." + VOTER_MATCH_STRENGTH_SCALE + "f", "MATCH STRENGTH: ", matchStrength); + logger.trace(String.format("%s %1." + VOTER_MATCH_STRENGTH_SCALE + "f", "MATCH STRENGTH: ", matchStrength)); } private void printNumberDetails(int numberOfExpectedMatches, int numberOfActualMatches, int numberOfCorrectMatches, int numberOfFalsePositives) { - out.print(" ["); - printQualityFactor("All matches", numberOfActualMatches, numberOfExpectedMatches); - out.print(", "); - printQualityFactor("Correct matches", numberOfCorrectMatches, numberOfActualMatches); - out.print(", "); - printQualityFactor("False positives", numberOfFalsePositives, numberOfActualMatches); - out.print("]"); + logger.trace("[{}, {}, {}]", + qualityFactor("All matches", numberOfActualMatches, numberOfExpectedMatches), + qualityFactor("Correct matches", numberOfCorrectMatches, numberOfActualMatches), + qualityFactor("False positives", numberOfFalsePositives, numberOfActualMatches)); } - private void printQualityFactor(String factorName, int goodCount, int totalCount) { + private String qualityFactor(String factorName, int goodCount, int totalCount) { double factorPercentage = ((double)goodCount/totalCount)*100; - String text = String.format("%s %3.2f%% (%d/%d)", factorName + ":", factorPercentage, goodCount, totalCount); - - System.out.print(text); + return String.format("%s %3.2f%% (%d/%d)", factorName + ":", factorPercentage, goodCount, totalCount); } private AffMatchingService createAffMatchingService() throws IOException { diff --git a/iis-wf/iis-wf-affmatching/src/test/resources/log4j.properties b/iis-wf/iis-wf-affmatching/src/test/resources/log4j.properties new file mode 100644 index 000000000..80c17380a --- /dev/null +++ b/iis-wf/iis-wf-affmatching/src/test/resources/log4j.properties @@ -0,0 +1,9 @@ +# Set everything to be logged to the console 
+log4j.rootCategory=WARN, console + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +log4j.logger.eu.dnetlib.iis=DEBUG diff --git a/iis-wf/iis-wf-import/src/test/java/eu/dnetlib/iis/wf/importer/content/ObjectStoresProvider.java b/iis-wf/iis-wf-import/src/test/java/eu/dnetlib/iis/wf/importer/content/ObjectStoresProvider.java index e8a551759..da8beaf4d 100644 --- a/iis-wf/iis-wf-import/src/test/java/eu/dnetlib/iis/wf/importer/content/ObjectStoresProvider.java +++ b/iis-wf/iis-wf-import/src/test/java/eu/dnetlib/iis/wf/importer/content/ObjectStoresProvider.java @@ -6,6 +6,8 @@ import eu.dnetlib.data.objectstore.rmi.ObjectStoreService; import eu.dnetlib.enabling.tools.JaxwsServiceResolverImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Provides set of all ObjectStore records available in given ObjectStore service. 
@@ -15,6 +17,8 @@ */ public class ObjectStoresProvider { + private static final Logger logger = LoggerFactory.getLogger(ObjectStoresProvider.class); + // -------------------- CONSTRUCTORS ------------------------- private ObjectStoresProvider() {} @@ -28,6 +32,6 @@ public static void main(String[] args) { eprBuilder.build(); ObjectStoreService objectStore = new JaxwsServiceResolverImpl().getService(ObjectStoreService.class, eprBuilder.build()); - System.out.println(StringUtils.join(objectStore.getListOfObjectStores(), ',')); + logger.info(StringUtils.join(objectStore.getListOfObjectStores(), ',')); } } From bc6cdb23c919842ed9ddbe6a8bd557eadfd83dd4 Mon Sep 17 00:00:00 2001 From: Marek Horst Date: Fri, 11 Sep 2020 09:21:55 +0200 Subject: [PATCH 2/4] Closes #1137: Handle rate limiting and retrying in webcrawler module Handling various HTTP response codes explicitly including: * 200: accepting result * 404: return empty result * 429: wait and retry * 301, 302, 303: following redirects Returning RetryLimitExceededException when exceeding retry limit, relying on CloseableHttpClient in HttpContentRetriever. 
--- .../iis/wf/importer/HttpClientUtils.java | 27 ++ .../SoftwareHeritageOriginsImporter.java | 8 +- .../iis/wf/importer/HttpClientUtilsTest.java | 23 ++ .../processing/ClasspathContentRetriever.java | 5 +- .../ClasspathContentRetrieverFactory.java | 21 ++ .../sampledataproducer/oozie_app/workflow.xml | 4 +- .../RetryLimitExceededException.java | 21 ++ .../patent/OpenPatentWebServiceFacade.java | 27 +- .../softwareurl/CachedWebCrawlerJob.java | 46 +-- .../softwareurl/ContentRetriever.java | 5 +- .../softwareurl/ContentRetrieverContext.java | 94 ------ .../softwareurl/HttpContentRetriever.java | 218 ++++++++++--- .../HttpContentRetrieverFactory.java | 40 +++ .../softwareurl/WebCrawlerUtils.java | 7 +- .../softwareurl/main/oozie_app/workflow.xml | 33 +- .../OpenPatentWebServiceFacadeTest.java | 17 +- .../softwareurl/CachedWebCrawlerJobTest.java | 11 +- .../ClasspathContentRetriever.java | 3 +- .../ClasspathContentRetrieverFactory.java | 25 ++ .../ExceptionThrowingContentRetriever.java | 3 +- ...eptionThrowingContentRetrieverFactory.java | 20 ++ .../softwareurl/HttpContentRetrieverTest.java | 289 ++++++++++++++++++ .../main/sampletest/oozie_app/workflow.xml | 4 +- .../oozie_app/workflow.xml | 4 +- .../oozie_app/workflow.xml | 4 +- 25 files changed, 732 insertions(+), 227 deletions(-) create mode 100644 iis-wf/iis-wf-import/src/main/java/eu/dnetlib/iis/wf/importer/HttpClientUtils.java create mode 100644 iis-wf/iis-wf-import/src/test/java/eu/dnetlib/iis/wf/importer/HttpClientUtilsTest.java create mode 100644 iis-wf/iis-wf-primary/src/test/java/eu/dnetlib/iis/wf/primary/processing/ClasspathContentRetrieverFactory.java create mode 100644 iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/RetryLimitExceededException.java delete mode 100644 iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ContentRetrieverContext.java create mode 100644 
iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetrieverFactory.java create mode 100644 iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ClasspathContentRetrieverFactory.java create mode 100644 iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ExceptionThrowingContentRetrieverFactory.java create mode 100644 iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetrieverTest.java diff --git a/iis-wf/iis-wf-import/src/main/java/eu/dnetlib/iis/wf/importer/HttpClientUtils.java b/iis-wf/iis-wf-import/src/main/java/eu/dnetlib/iis/wf/importer/HttpClientUtils.java new file mode 100644 index 000000000..049543171 --- /dev/null +++ b/iis-wf/iis-wf-import/src/main/java/eu/dnetlib/iis/wf/importer/HttpClientUtils.java @@ -0,0 +1,27 @@ +package eu.dnetlib.iis.wf.importer; + +import java.io.Closeable; + +import org.apache.http.client.config.RequestConfig; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClientBuilder; + +/** + * HTTP client utility class. + * + * @author mhorst + * + */ +public class HttpClientUtils { + + /** + * Builds {@link Closeable} HTTP client issuing requests to remote endpoint. 
+ */ + public static CloseableHttpClient buildHttpClient(int connectionTimeout, int readTimeout) { + HttpClientBuilder httpClientBuilder = HttpClientBuilder.create(); + httpClientBuilder.setDefaultRequestConfig(RequestConfig.custom().setConnectTimeout(connectionTimeout) + .setConnectionRequestTimeout(connectionTimeout).setSocketTimeout(readTimeout).build()); + return httpClientBuilder.build(); + } + +} diff --git a/iis-wf/iis-wf-import/src/main/java/eu/dnetlib/iis/wf/importer/software/origins/SoftwareHeritageOriginsImporter.java b/iis-wf/iis-wf-import/src/main/java/eu/dnetlib/iis/wf/importer/software/origins/SoftwareHeritageOriginsImporter.java index 7cbebafe9..75786843c 100644 --- a/iis-wf/iis-wf-import/src/main/java/eu/dnetlib/iis/wf/importer/software/origins/SoftwareHeritageOriginsImporter.java +++ b/iis-wf/iis-wf-import/src/main/java/eu/dnetlib/iis/wf/importer/software/origins/SoftwareHeritageOriginsImporter.java @@ -31,11 +31,9 @@ import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpRequest; -import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; @@ -52,6 +50,7 @@ import eu.dnetlib.iis.common.java.porttype.AvroPortType; import eu.dnetlib.iis.common.java.porttype.PortType; import eu.dnetlib.iis.referenceextraction.softwareurl.schemas.SoftwareHeritageOrigin; +import eu.dnetlib.iis.wf.importer.HttpClientUtils; /** * Importer module retrieving (incrementally) origins from Software Heritage RESTful endpoint. @@ -206,10 +205,7 @@ protected DataFileWriter getWriter(FileSystem fs, PortBi * Builds HTTP client issuing requests to SH endpoint. 
*/ protected CloseableHttpClient buildHttpClient(int connectionTimeout, int readTimeout) { - HttpClientBuilder httpClientBuilder = HttpClientBuilder.create(); - httpClientBuilder.setDefaultRequestConfig(RequestConfig.custom().setConnectTimeout(connectionTimeout) - .setConnectionRequestTimeout(connectionTimeout).setSocketTimeout(readTimeout).build()); - return httpClientBuilder.build(); + return HttpClientUtils.buildHttpClient(connectionTimeout, readTimeout); } protected static void storeNextElementIndex(int nextElementIndex) throws IOException { diff --git a/iis-wf/iis-wf-import/src/test/java/eu/dnetlib/iis/wf/importer/HttpClientUtilsTest.java b/iis-wf/iis-wf-import/src/test/java/eu/dnetlib/iis/wf/importer/HttpClientUtilsTest.java new file mode 100644 index 000000000..5913f5a61 --- /dev/null +++ b/iis-wf/iis-wf-import/src/test/java/eu/dnetlib/iis/wf/importer/HttpClientUtilsTest.java @@ -0,0 +1,23 @@ +package eu.dnetlib.iis.wf.importer; + +import static org.junit.Assert.assertNotNull; + +import org.apache.http.impl.client.CloseableHttpClient; +import org.junit.Test; + +public class HttpClientUtilsTest { + + + @Test + public void testBuildHttpClient() throws Exception { + // given + int connectionTimeout = 1; + int readTimeout = 2; + + // execute + CloseableHttpClient client = HttpClientUtils.buildHttpClient(connectionTimeout, readTimeout); + + // assert + assertNotNull(client); + } +} diff --git a/iis-wf/iis-wf-primary/src/test/java/eu/dnetlib/iis/wf/primary/processing/ClasspathContentRetriever.java b/iis-wf/iis-wf-primary/src/test/java/eu/dnetlib/iis/wf/primary/processing/ClasspathContentRetriever.java index fe744cb87..4359e855b 100644 --- a/iis-wf/iis-wf-primary/src/test/java/eu/dnetlib/iis/wf/primary/processing/ClasspathContentRetriever.java +++ b/iis-wf/iis-wf-primary/src/test/java/eu/dnetlib/iis/wf/primary/processing/ClasspathContentRetriever.java @@ -22,7 +22,7 @@ public class ClasspathContentRetriever implements ContentRetriever { private final Map 
urlToClasspathMap; - public ClasspathContentRetriever() throws IOException { + public ClasspathContentRetriever() { urlToClasspathMap = new HashMap<>(); urlToClasspathMap.put("https://github.com/madgik/madis", "/eu/dnetlib/iis/wf/primary/processing/sampledataproducer/input/html/madis.html"); @@ -30,8 +30,7 @@ public ClasspathContentRetriever() throws IOException { } @Override - public ContentRetrieverResponse retrieveUrlContent(CharSequence url, int connectionTimeout, int readTimeout, - int maxPageContentLength) { + public ContentRetrieverResponse retrieveUrlContent(CharSequence url) { if (url != null) { String classPathLocation = urlToClasspathMap.get(url.toString()); if (classPathLocation != null) { diff --git a/iis-wf/iis-wf-primary/src/test/java/eu/dnetlib/iis/wf/primary/processing/ClasspathContentRetrieverFactory.java b/iis-wf/iis-wf-primary/src/test/java/eu/dnetlib/iis/wf/primary/processing/ClasspathContentRetrieverFactory.java new file mode 100644 index 000000000..ef6204300 --- /dev/null +++ b/iis-wf/iis-wf-primary/src/test/java/eu/dnetlib/iis/wf/primary/processing/ClasspathContentRetrieverFactory.java @@ -0,0 +1,21 @@ +package eu.dnetlib.iis.wf.primary.processing; + +import java.util.Map; + +import eu.dnetlib.iis.wf.importer.facade.ServiceFacadeFactory; +import eu.dnetlib.iis.wf.referenceextraction.softwareurl.ContentRetriever; + +/** + * Factory class building {@link ClasspathContentRetriever}. 
+ * + * @author mhorst + * + */ +public class ClasspathContentRetrieverFactory implements ServiceFacadeFactory { + + @Override + public ContentRetriever instantiate(Map parameters) { + return new ClasspathContentRetriever(); + } + +} diff --git a/iis-wf/iis-wf-primary/src/test/resources/eu/dnetlib/iis/wf/primary/processing/sampledataproducer/oozie_app/workflow.xml b/iis-wf/iis-wf-primary/src/test/resources/eu/dnetlib/iis/wf/primary/processing/sampledataproducer/oozie_app/workflow.xml index 9dfa5d326..4f89ef3b2 100644 --- a/iis-wf/iis-wf-primary/src/test/resources/eu/dnetlib/iis/wf/primary/processing/sampledataproducer/oozie_app/workflow.xml +++ b/iis-wf/iis-wf-primary/src/test/resources/eu/dnetlib/iis/wf/primary/processing/sampledataproducer/oozie_app/workflow.xml @@ -411,8 +411,8 @@ -Xmx1g - webcrawlContentRetrieverClassName - eu.dnetlib.iis.wf.primary.processing.ClasspathContentRetriever + webcrawlContentRetrieverFactoryClassName + eu.dnetlib.iis.wf.primary.processing.ClasspathContentRetrieverFactory webcrawlLockManagerFactoryClassName diff --git a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/RetryLimitExceededException.java b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/RetryLimitExceededException.java new file mode 100644 index 000000000..b5c8154d2 --- /dev/null +++ b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/RetryLimitExceededException.java @@ -0,0 +1,21 @@ +package eu.dnetlib.iis.wf.referenceextraction; + +/** + * Exception indicating number of retries exceeded the predefined limit. 
+ * + * @author mhorst + * + */ +public class RetryLimitExceededException extends Exception { + + + private static final long serialVersionUID = -1084913230112190824L; + + + //------------------------ CONSTRUCTORS ------------------- + + public RetryLimitExceededException(String message) { + super(message); + } + +} diff --git a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/patent/OpenPatentWebServiceFacade.java b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/patent/OpenPatentWebServiceFacade.java index cb6eb13f0..4f1c03e55 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/patent/OpenPatentWebServiceFacade.java +++ b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/patent/OpenPatentWebServiceFacade.java @@ -14,22 +14,22 @@ import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpRequest; -import org.apache.http.client.config.RequestConfig; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.message.BasicNameValuePair; import org.apache.http.util.EntityUtils; -import org.apache.log4j.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.google.gson.JsonParser; import eu.dnetlib.iis.referenceextraction.patent.schemas.ImportedPatent; +import eu.dnetlib.iis.wf.importer.HttpClientUtils; /** * Remote EPO endpoint based patent service facade. 
@@ -41,7 +41,7 @@ public class OpenPatentWebServiceFacade implements PatentServiceFacade { private static final long serialVersionUID = -9154710658560662015L; - private static final Logger log = Logger.getLogger(OpenPatentWebServiceFacade.class); + private static final Logger log = LoggerFactory.getLogger(OpenPatentWebServiceFacade.class); private String authUriRoot; @@ -114,6 +114,13 @@ public String getPatentMetadata(ImportedPatent patent) throws Exception { // ------------------- PRIVATE ------------------------- + /** + * Retrieves patent metadata from EPO endpoint. + * + * This method is recursive and requires response entity to be consumed in order + * not to hit the ConnectionPoolTimeoutException when connecting the same host + * more than 2 times within recursion (e.g. when reattempting). + */ private String getPatentMetadata(ImportedPatent patent, String securityToken, int retryCount) throws Exception { if (retryCount > maxRetriesCount) { @@ -136,12 +143,13 @@ private String getPatentMetadata(ImportedPatent patent, String securityToken, in return EntityUtils.toString(entity); } case 400: { - log.info("got 400 HTTP code in response, potential reason: access token invalid or expired"); + log.info("got 400 HTTP code in response, potential reason: access token invalid or expired, " + + "server response: {}", EntityUtils.toString(httpResponse.getEntity())); return getPatentMetadata(patent, reauthenticate(), ++retryCount); } case 403: { - log.warn("got 403 HTTP code in response, potential reason: endpoint rate limit reached. Delaying for " - + throttleSleepTime + " ms, server response: " + EntityUtils.toString(httpResponse.getEntity())); + log.warn("got 403 HTTP code in response, potential reason: endpoint rate limit reached. 
Delaying for {} ms, " + + "server response: {}", throttleSleepTime, EntityUtils.toString(httpResponse.getEntity())); Thread.sleep(throttleSleepTime); return getPatentMetadata(patent, securityToken, ++retryCount); } @@ -182,10 +190,7 @@ private void reinitialize(SerDe serDe, String authUriRoot, String opsUriRoot, * Builds HTTP client issuing requests to SH endpoint. */ protected static CloseableHttpClient buildHttpClient(int connectionTimeout, int readTimeout) { - HttpClientBuilder httpClientBuilder = HttpClientBuilder.create(); - httpClientBuilder.setDefaultRequestConfig(RequestConfig.custom().setConnectTimeout(connectionTimeout) - .setConnectionRequestTimeout(connectionTimeout).setSocketTimeout(readTimeout).build()); - return httpClientBuilder.build(); + return HttpClientUtils.buildHttpClient(connectionTimeout, readTimeout); } protected String getSecurityToken() throws Exception { diff --git a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/CachedWebCrawlerJob.java b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/CachedWebCrawlerJob.java index 98def207b..5990383b4 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/CachedWebCrawlerJob.java +++ b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/CachedWebCrawlerJob.java @@ -1,5 +1,7 @@ package eu.dnetlib.iis.wf.referenceextraction.softwareurl; +import java.util.Map; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.log4j.Logger; @@ -10,10 +12,12 @@ import org.apache.spark.api.java.Optional; import org.apache.spark.storage.StorageLevel; +import com.beust.jcommander.DynamicParameter; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import com.google.common.collect.Lists; +import 
com.google.common.collect.Maps; import eu.dnetlib.iis.audit.schemas.Fault; import eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess; @@ -30,6 +34,8 @@ import eu.dnetlib.iis.metadataextraction.schemas.DocumentText; import eu.dnetlib.iis.referenceextraction.softwareurl.schemas.DocumentToSoftwareUrl; import eu.dnetlib.iis.referenceextraction.softwareurl.schemas.DocumentToSoftwareUrlWithSource; +import eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters; +import eu.dnetlib.iis.wf.importer.facade.ServiceFacadeUtils; import pl.edu.icm.sparkutils.avro.SparkAvroLoader; import pl.edu.icm.sparkutils.avro.SparkAvroSaver; import scala.Tuple2; @@ -63,10 +69,6 @@ public static void main(String[] args) throws Exception { JCommander jcommander = new JCommander(params); jcommander.parse(args); - ContentRetrieverContext contentRetrieverContext = new ContentRetrieverContext(params.contentRetrieverClassName, - params.connectionTimeout, params.readTimeout, params.maxPageContentLength, params.numberOfEmittedFiles, - params.numberOfPartitionsForCrawling); - try (JavaSparkContext sc = JavaSparkContextFactory.withConfAndKryo(new SparkConf())) { Configuration hadoopConf = sc.hadoopConfiguration(); @@ -75,6 +77,11 @@ public static void main(String[] args) throws Exception { HdfsUtils.remove(hadoopConf, params.getOutputFaultPath()); HdfsUtils.remove(hadoopConf, params.getOutputReportPath()); + ContentRetriever contentRetriever = ServiceFacadeUtils + .instantiate(prepareFacadeParameters(params.contentRetrieverFactoryClassName, params.contentRetrieverParams)); + int numberOfPartitionsForCrawling = params.numberOfPartitionsForCrawling; + int numberOfEmittedFiles = params.numberOfEmittedFiles; + LockManager lockManager = LockManagerUtils.instantiateLockManager(params.getLockManagerFactoryClassName(), hadoopConf); @@ -97,7 +104,8 @@ public static void main(String[] args) throws Exception { JavaRDD entitiesReturnedFromCache = inputJoinedWithCache.filter(x -> 
x._2._2.isPresent()).values().map(x -> attachSource(x._1, x._2.get())); entitiesReturnedFromCache.persist(CACHE_STORAGE_DEFAULT_LEVEL); - Tuple2, JavaRDD> returnedFromWebcrawlTuple = WebCrawlerUtils.obtainSources(toBeProcessed, contentRetrieverContext); + Tuple2, JavaRDD> returnedFromWebcrawlTuple = WebCrawlerUtils + .obtainSources(toBeProcessed, contentRetriever, numberOfPartitionsForCrawling); JavaRDD webcrawledEntities; JavaRDD entitiesToBeWritten; @@ -113,7 +121,7 @@ public static void main(String[] args) throws Exception { // storing new cache entry DocumentTextCacheStorageUtils.storeInCache(avroSaver, cachedSources.union(returnedFromWebcrawlTuple._1), cachedFaults.union(returnedFromWebcrawlTuple._2), - cacheRootDir, lockManager, cacheManager, hadoopConf, contentRetrieverContext.getNumberOfEmittedFiles()); + cacheRootDir, lockManager, cacheManager, hadoopConf, numberOfEmittedFiles); // merging final results webcrawledEntities = produceEntitiesToBeStored(toBeProcessed, returnedFromWebcrawlTuple._1); @@ -135,7 +143,7 @@ public static void main(String[] args) throws Exception { //notice: we do not propagate faults from cache, only new faults are written faultsToBeStored, generateReportEntries(sc, entitiesReturnedFromCache, webcrawledEntities, faultsToBeStored), - new OutputPaths(params), contentRetrieverContext.getNumberOfEmittedFiles()); + new OutputPaths(params), numberOfEmittedFiles); } } @@ -209,29 +217,29 @@ private static DocumentToSoftwareUrlWithSource attachSource(DocumentToSoftwareUr return builder.build(); } + private static Map prepareFacadeParameters(String patentFacadeFactoryClassname, Map facadeParams) { + Map resultParams = Maps.newHashMap(); + resultParams.put(ImportWorkflowRuntimeParameters.IMPORT_FACADE_FACTORY_CLASS, patentFacadeFactoryClassname); + resultParams.putAll(facadeParams); + return resultParams; + } + @Parameters(separators = "=") private static class WebCrawlerJobParameters extends CachedStorageJobParameters { @Parameter(names = 
"-inputPath", required = true) private String inputPath; - @Parameter(names = "-contentRetrieverClassName", required = true) - private String contentRetrieverClassName; - - @Parameter(names = "-connectionTimeout", required = true) - private int connectionTimeout; - - @Parameter(names = "-readTimeout", required = true) - private int readTimeout; - - @Parameter(names = "-maxPageContentLength", required = true) - private int maxPageContentLength; - @Parameter(names = "-numberOfEmittedFiles", required = true) private int numberOfEmittedFiles; @Parameter(names = "-numberOfPartitionsForCrawling", required = true) private int numberOfPartitionsForCrawling; + @Parameter(names = "-contentRetrieverFactoryClassName", required = true) + private String contentRetrieverFactoryClassName; + + @DynamicParameter(names = "-D", description = "dynamic parameters related to content retriever", required = false) + private Map contentRetrieverParams = Maps.newHashMap(); } } diff --git a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ContentRetriever.java b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ContentRetriever.java index 79971bf64..1c11920dc 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ContentRetriever.java +++ b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ContentRetriever.java @@ -15,10 +15,7 @@ public interface ContentRetriever extends Serializable { /** * Retrieves content for given url. Should never return null. 
* @param url location the page content should be retrieved from - * @param connectionTimeout connection timeout - * @param readTimeout read timeout - * @param maxPageContentLength maximum size of the retieved content */ - ContentRetrieverResponse retrieveUrlContent(CharSequence url, int connectionTimeout, int readTimeout, int maxPageContentLength); + ContentRetrieverResponse retrieveUrlContent(CharSequence url); } diff --git a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ContentRetrieverContext.java b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ContentRetrieverContext.java deleted file mode 100644 index 82fe9e404..000000000 --- a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ContentRetrieverContext.java +++ /dev/null @@ -1,94 +0,0 @@ -package eu.dnetlib.iis.wf.referenceextraction.softwareurl; - -import java.io.Serializable; - -/** - * Content retriveal context. 
- * - * @author mhorst - * - */ -public class ContentRetrieverContext implements Serializable { - - /** - * - */ - private static final long serialVersionUID = 6360286121671381083L; - - private ContentRetriever contentRetriever; - - private int connectionTimeout; - - private int readTimeout; - - private int maxPageContentLength; - - private int numberOfEmittedFiles; - - private int numberOfPartitionsForCrawling; - - public ContentRetrieverContext() {} - - public ContentRetrieverContext(String contentRetrieverClassName, int connectionTimeout, int readTimeout, - int maxPageContentLength, int numberOfEmittedFiles, int numberOfPartitionsForCrawling) - throws Exception { - @SuppressWarnings("unchecked") - Class clazz = (Class) Class.forName(contentRetrieverClassName); - this.contentRetriever = clazz.getConstructor().newInstance(); - this.connectionTimeout = connectionTimeout; - this.readTimeout = readTimeout; - this.maxPageContentLength = maxPageContentLength; - this.numberOfEmittedFiles = numberOfEmittedFiles; - this.numberOfPartitionsForCrawling = numberOfPartitionsForCrawling; - } - - - public ContentRetriever getContentRetriever() { - return contentRetriever; - } - - public int getConnectionTimeout() { - return connectionTimeout; - } - - public int getReadTimeout() { - return readTimeout; - } - - public int getMaxPageContentLength() { - return maxPageContentLength; - } - - public int getNumberOfEmittedFiles() { - return numberOfEmittedFiles; - } - - public int getNumberOfPartitionsForCrawling() { - return numberOfPartitionsForCrawling; - } - - public void setContentRetriever(ContentRetriever contentRetriever) { - this.contentRetriever = contentRetriever; - } - - public void setConnectionTimeout(int connectionTimeout) { - this.connectionTimeout = connectionTimeout; - } - - public void setReadTimeout(int readTimeout) { - this.readTimeout = readTimeout; - } - - public void setMaxPageContentLength(int maxPageContentLength) { - this.maxPageContentLength = 
maxPageContentLength; - } - - public void setNumberOfEmittedFiles(int numberOfEmittedFiles) { - this.numberOfEmittedFiles = numberOfEmittedFiles; - } - - public void setNumberOfPartitionsForCrawling(int numberOfPartitionsForCrawling) { - this.numberOfPartitionsForCrawling = numberOfPartitionsForCrawling; - } - -} diff --git a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetriever.java b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetriever.java index 21fda5332..780036356 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetriever.java +++ b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetriever.java @@ -1,77 +1,209 @@ package eu.dnetlib.iis.wf.referenceextraction.softwareurl; import java.io.BufferedReader; +import java.io.IOException; import java.io.InputStreamReader; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; import java.net.HttpURLConnection; -import java.net.URL; import java.nio.charset.StandardCharsets; +import java.util.NoSuchElementException; -import org.apache.log4j.Logger; +import org.apache.commons.lang.StringUtils; +import org.apache.http.Header; +import org.apache.http.HttpEntity; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.util.EntityUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import eu.dnetlib.iis.wf.importer.HttpClientUtils; import eu.dnetlib.iis.wf.referenceextraction.ContentRetrieverResponse; +import eu.dnetlib.iis.wf.referenceextraction.RetryLimitExceededException; /** * HTTP based content retriever. 
+ * * @author mhorst * */ public class HttpContentRetriever implements ContentRetriever { - private static final long serialVersionUID = -6879262115292175343L; - private static final Logger log = Logger.getLogger(HttpContentRetriever.class); + private static final Logger log = LoggerFactory.getLogger(HttpContentRetriever.class); + + private static final String HEADER_LOCATION = "Location"; + + /** + * HTTP Status-Code 429: Too many requests. + */ + private static final int HTTP_TOO_MANY_REQUESTS = 429; + + private int connectionTimeout; + + private int readTimeout; + + private int maxPageContentLength; + + private long throttleSleepTime; + + private int maxRetriesCount; + + // to be reinitialized after deserialization + private transient CloseableHttpClient httpClient; + + + // ----------------------------------------- CONSTRUCTORS --------------------------------------- + + + public HttpContentRetriever(int connectionTimeout, int readTimeout, int maxPageContentLength, + long throttleSleepTime, int maxRetriesCount) { + initialize(connectionTimeout, readTimeout, maxPageContentLength, throttleSleepTime, maxRetriesCount); + } + // ----------------------------------------- LOGIC ---------------------------------------------- + @Override - public ContentRetrieverResponse retrieveUrlContent(CharSequence url, int connectionTimeout, int readTimeout, - int maxPageContentLength) { + public ContentRetrieverResponse retrieveUrlContent(CharSequence url) { + long startTime = System.currentTimeMillis(); - String currentUrl = url.toString(); - + + log.info("starting content retrieval for url: {}", url); try { - log.info("starting content retrieval for url: " + currentUrl); - - HttpURLConnection conn = (HttpURLConnection) new URL(currentUrl).openConnection(); - conn.setReadTimeout(readTimeout); - conn.setConnectTimeout(connectionTimeout); + return retrieveUrlContent(url.toString(), 0); + } catch (Exception e) { + log.error("content retrieval failed for url: " + url, e); + return 
new ContentRetrieverResponse(e); + } finally { + log.info("finished content retrieval for url: {} in {} ms", url, (System.currentTimeMillis() - startTime)); + } + } + + // ----------------------------------------- PRIVATE ------------------------------------------------ + + /** + * Retrieves web page content from given url. + * + * This method is recursive and requires response entity to be consumed in order + * not to hit the ConnectionPoolTimeoutException when connecting the same host + * more than 2 times within recursion (e.g. when reattempting). + */ + private ContentRetrieverResponse retrieveUrlContent(String currentUrl, int retryCount) throws Exception { + + if (retryCount > maxRetriesCount) { + String message = String.format("number of maximum retries exceeded: '%d' for url: %s", maxRetriesCount, currentUrl); + log.error(message); + return new ContentRetrieverResponse(new RetryLimitExceededException(message)); + } + + try (CloseableHttpResponse httpResponse = httpClient.execute(new HttpGet(currentUrl))) { - int status = conn.getResponseCode(); + int statusCode = httpResponse.getStatusLine().getStatusCode(); - if (status == HttpURLConnection.HTTP_MOVED_TEMP - || status == HttpURLConnection.HTTP_MOVED_PERM - || status == HttpURLConnection.HTTP_SEE_OTHER) { - currentUrl = conn.getHeaderField("Location"); - log.info("redirecting to: " + currentUrl); - conn.disconnect(); - conn = (HttpURLConnection) new URL(currentUrl).openConnection(); + switch (statusCode) { + case HttpURLConnection.HTTP_OK: { + return readPageContent(httpResponse.getEntity(), maxPageContentLength, currentUrl); } - - try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) { - StringBuilder pageContent = new StringBuilder(); - String inputLine; - while ((inputLine = reader.readLine()) != null) { - if (pageContent.length() < maxPageContentLength) { - if (pageContent.length() > 0) { - pageContent.append('\n'); - } - 
pageContent.append(inputLine); - } else { - log.warn("page content from URL: " + currentUrl + " exceeded maximum page length limit: " + maxPageContentLength + ", returning truncated page content"); - return new ContentRetrieverResponse(pageContent.toString()); + case HttpURLConnection.HTTP_NOT_FOUND: { + return new ContentRetrieverResponse(new NoSuchElementException("unable to find page at: " + currentUrl)); + } + case HttpURLConnection.HTTP_MOVED_TEMP: + case HttpURLConnection.HTTP_MOVED_PERM: + case HttpURLConnection.HTTP_SEE_OTHER: { + String redirectedUrl = getHeaderValue(httpResponse.getAllHeaders(), HEADER_LOCATION); + if (StringUtils.isNotBlank(redirectedUrl)) { + log.info("got {} response code, redirecting to {}, server response: {}", statusCode, + redirectedUrl, EntityUtils.toString(httpResponse.getEntity())); + return retrieveUrlContent(redirectedUrl, ++retryCount); + } else { + return new ContentRetrieverResponse( + new RuntimeException("resource was moved, missing redirect header for the url: " + currentUrl)); + } + } + case HTTP_TOO_MANY_REQUESTS: { + log.warn("got {} response code, potential reason: rate limit reached. 
Delaying for {} ms, server response: {}", statusCode, + throttleSleepTime, EntityUtils.toString(httpResponse.getEntity())); + Thread.sleep(throttleSleepTime); + return retrieveUrlContent(currentUrl, ++retryCount); + } + default: { + return new ContentRetrieverResponse(new RuntimeException(String.format( + "got unsupported HTTP response code: %d when accessing page at url: %s", statusCode, currentUrl))); + } + } + } + } + + private static String getHeaderValue(Header[] headers, String headerName) { + if (headers != null) { + for (Header header : headers) { + if (headerName.equals(header.getName())) { + return header.getValue(); + } + } + } + return null; + } + + private static ContentRetrieverResponse readPageContent(HttpEntity httpEntity, int maxPageContentLength, String url) throws IOException { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(httpEntity.getContent(), StandardCharsets.UTF_8))) { + StringBuilder pageContent = new StringBuilder(); + String inputLine; + while ((inputLine = reader.readLine()) != null) { + if (pageContent.length() < maxPageContentLength) { + if (pageContent.length() > 0) { + pageContent.append('\n'); } + pageContent.append(inputLine); + } else { + log.warn("page content from URL: '{}' exceeded page length limit: {}, returning truncated page content", + url, maxPageContentLength); + return new ContentRetrieverResponse(pageContent.toString()); } - return new ContentRetrieverResponse(pageContent.toString()); - - } finally { - log.info("finished content retrieval for url: " + currentUrl + " in " + - (System.currentTimeMillis()-startTime) + " ms"); } - - } catch (Exception e) { - log.error("content retrieval failed for url: " + currentUrl, e); - return new ContentRetrieverResponse(e); + return new ContentRetrieverResponse(pageContent.toString()); } } + + /** + * Builds HTTP client issuing requests to a remote endpoint. 
+ */ + protected CloseableHttpClient buildHttpClient(int connectionTimeout, int readTimeout) { + return HttpClientUtils.buildHttpClient(connectionTimeout, readTimeout); + } + + /** + * Initializes the object either during construction or deserialization. + */ + private void initialize(int connectionTimeout, int readTimeout, int maxPageContentLength, long throttleSleepTime, int maxRetriesCount) { + this.connectionTimeout = connectionTimeout; + this.readTimeout = readTimeout; + this.httpClient = buildHttpClient(connectionTimeout, readTimeout); + this.maxPageContentLength = maxPageContentLength; + this.throttleSleepTime = throttleSleepTime; + this.maxRetriesCount = maxRetriesCount; + } + + + // -------------------------- SerDe -------------------------------- + + private void writeObject(ObjectOutputStream oos) throws IOException { + oos.defaultWriteObject(); + oos.writeObject(this.connectionTimeout); + oos.writeObject(this.readTimeout); + oos.writeObject(this.maxPageContentLength); + oos.writeObject(this.throttleSleepTime); + oos.writeObject(this.maxRetriesCount); + } + + private void readObject(ObjectInputStream ois) throws ClassNotFoundException, IOException { + ois.defaultReadObject(); + initialize((Integer) ois.readObject(), (Integer) ois.readObject(), (Integer) ois.readObject(), + (Long) ois.readObject(), (Integer) ois.readObject()); + } } diff --git a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetrieverFactory.java b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetrieverFactory.java new file mode 100644 index 000000000..1006f554e --- /dev/null +++ b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetrieverFactory.java @@ -0,0 +1,40 @@ +package eu.dnetlib.iis.wf.referenceextraction.softwareurl; + +import java.util.Map; + +import 
eu.dnetlib.iis.wf.importer.facade.ServiceFacadeFactory; + +/** + * Factory class instantiating {@link HttpContentRetriever}. + * + * @author mhorst + * + */ +public class HttpContentRetrieverFactory implements ServiceFacadeFactory { + + public static final String PARAM_READ_TIMEOUT = "readTimeout"; + public static final String PARAM_CONNECTION_TIMEOUT = "connectionTimeout"; + + public static final String PARAM_MAX_PAGE_CONTENT_LENGTH = "maxPageContentLength"; + + public static final String PARAM_THROTTLE_SLEEP_TIME = "throttleSleepTime"; + public static final String PARAM_RETRIES_COUNT = "retriesCount"; + + + @Override + public ContentRetriever instantiate(Map conf) { + + String connectionTimeout = conf.getOrDefault(PARAM_CONNECTION_TIMEOUT, "60000"); + String readTimeout = conf.getOrDefault(PARAM_READ_TIMEOUT, "60000"); + + String maxPageContentLength = conf.getOrDefault(PARAM_MAX_PAGE_CONTENT_LENGTH, "500000"); + + String throttleSleepTime = conf.getOrDefault(PARAM_THROTTLE_SLEEP_TIME, "10000"); + String retriesCount = conf.getOrDefault(PARAM_RETRIES_COUNT, "10"); + + return new HttpContentRetriever(Integer.parseInt(connectionTimeout), Integer.parseInt(readTimeout), + Integer.parseInt(maxPageContentLength), Long.parseLong(throttleSleepTime), Integer.parseInt(retriesCount)); + + } + +} diff --git a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/WebCrawlerUtils.java b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/WebCrawlerUtils.java index cb1e7e344..80fb511f3 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/WebCrawlerUtils.java +++ b/iis-wf/iis-wf-referenceextraction/src/main/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/WebCrawlerUtils.java @@ -21,13 +21,12 @@ public class WebCrawlerUtils { * Obtains sources using content retriever. 
*/ public static Tuple2, JavaRDD> obtainSources(JavaRDD documentToSoftwareUrl, - ContentRetrieverContext ctx) { + ContentRetriever contentRetriever, int numberOfPartitionsForCrawling) { JavaRDD uniqueSoftwareUrl = documentToSoftwareUrl.map(e -> e.getSoftwareUrl()).distinct(); JavaPairRDD uniqueFilteredSoftwareUrlToSource = uniqueSoftwareUrl - .repartition(ctx.getNumberOfPartitionsForCrawling()) - .mapToPair(e -> new Tuple2(e, ctx.getContentRetriever().retrieveUrlContent(e, - ctx.getConnectionTimeout(), ctx.getReadTimeout(), ctx.getMaxPageContentLength()))); + .repartition(numberOfPartitionsForCrawling) + .mapToPair(e -> new Tuple2(e, contentRetriever.retrieveUrlContent(e))); return new Tuple2<>( uniqueFilteredSoftwareUrlToSource.map(e -> DocumentText.newBuilder().setId(e._1).setText(e._2.getContent()).build()), diff --git a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/oozie_app/workflow.xml b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/oozie_app/workflow.xml index 068da58b0..3d3ff51f8 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/oozie_app/workflow.xml +++ b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/oozie_app/workflow.xml @@ -25,9 +25,9 @@ directory for storing report (relative to output_report_root_path) - webcrawlContentRetrieverClassName - eu.dnetlib.iis.wf.referenceextraction.softwareurl.HttpContentRetriever - module responsible for obtaining contents for given url + webcrawlContentRetrieverFactoryClassName + eu.dnetlib.iis.wf.referenceextraction.softwareurl.HttpContentRetrieverFactory + factory class producing the module responsible for obtaining contents for given url webcrawlLockManagerFactoryClassName @@ -37,18 +37,28 @@ webcrawlConnectionTimeout 60000 - connection timeout during page retrieval 
phase + connection timeout during page retrieval phase (expressed in milliseconds) webcrawlReadTimeout 60000 - read timeout during page retrieval phase + read timeout during page retrieval phase (expressed in milliseconds) webcrawlMaxPageContentLength 500000 maximum length (expressed in characters number) of page content + + webcrawlThrottleSleepTime + 10000 + sleep time between retries when crawling for web page sources (expressed in milliseconds) + + + webcrawlRetriesCount + 10 + maximum number of retries when crawling for web page sources + webcrawlNumberOfEmittedFiles 1000 @@ -216,12 +226,15 @@ -outputPath=${workingDir}/referenceextraction_softwareurl_webcrawl/out -outputFaultPath=${workingDir}/referenceextraction_softwareurl_webcrawl/fault -outputReportPath=${output_report_root_path}/${output_report_relative_path} - -contentRetrieverClassName=${webcrawlContentRetrieverClassName} -lockManagerFactoryClassName=${webcrawlLockManagerFactoryClassName} - - -connectionTimeout=${webcrawlConnectionTimeout} - -readTimeout=${webcrawlReadTimeout} - -maxPageContentLength=${webcrawlMaxPageContentLength} + + -contentRetrieverFactoryClassName=${webcrawlContentRetrieverFactoryClassName} + -DconnectionTimeout=${webcrawlConnectionTimeout} + -DreadTimeout=${webcrawlReadTimeout} + -DmaxPageContentLength=${webcrawlMaxPageContentLength} + -DthrottleSleepTime=${webcrawlThrottleSleepTime} + -DretriesCount=${webcrawlRetriesCount} + -numberOfEmittedFiles=${webcrawlNumberOfEmittedFiles} -numberOfPartitionsForCrawling=${webcrawlNumberOfPartitionsForCrawling} diff --git a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/patent/OpenPatentWebServiceFacadeTest.java b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/patent/OpenPatentWebServiceFacadeTest.java index cfe8c84a2..f64cdc4a2 100644 --- 
a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/patent/OpenPatentWebServiceFacadeTest.java +++ b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/patent/OpenPatentWebServiceFacadeTest.java @@ -22,7 +22,6 @@ import org.apache.http.HttpHost; import org.apache.http.HttpRequest; import org.apache.http.StatusLine; -import org.apache.http.client.HttpClient; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; @@ -275,8 +274,11 @@ public void testGetPatentMetadataForHttp400() throws Exception { // metadata retrieval mock CloseableHttpResponse getPatentHttpResponse1 = mock(CloseableHttpResponse.class); StatusLine getPatentStatusLine1 = mock(StatusLine.class); + HttpEntity getPatentHttpEntity1 = mock(HttpEntity.class); when(getPatentHttpResponse1.getStatusLine()).thenReturn(getPatentStatusLine1); when(getPatentStatusLine1.getStatusCode()).thenReturn(400); + when(getPatentHttpResponse1.getEntity()).thenReturn(getPatentHttpEntity1); + when(getPatentHttpEntity1.getContent()).thenReturn(new ByteArrayInputStream("".getBytes())); CloseableHttpResponse getPatentHttpResponse2 = mock(CloseableHttpResponse.class); StatusLine getPatentStatusLine2 = mock(StatusLine.class); @@ -452,19 +454,6 @@ public void testGetPatentMetadataForHttp500() throws Exception { service.getPatentMetadata(patentBuilder.build()); } - @Test - public void testBuildHttpClient() throws Exception { - // given - int connectionTimeout = 1; - int readTimeout = 2; - - // execute - HttpClient client = OpenPatentWebServiceFacade.buildHttpClient(connectionTimeout, readTimeout); - - // assert - assertNotNull(client); - } - @Test public void testSerializeAndDeserialize() throws Exception { // given diff --git a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/CachedWebCrawlerJobTest.java 
b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/CachedWebCrawlerJobTest.java index 803af94c4..32b5546fb 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/CachedWebCrawlerJobTest.java +++ b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/CachedWebCrawlerJobTest.java @@ -246,7 +246,7 @@ public void obtainPageSourceFromCache() throws IOException { // execute executor.execute(buildWebCrawlerJob(inputPath, outputPath, outputFaultPath, outputReportPath)); executor.execute(buildWebCrawlerJob(inputPath, output2Path, outputFault2Path, outputReport2Path, - "eu.dnetlib.iis.wf.referenceextraction.softwareurl.ExceptionThrowingContentRetriever")); + "eu.dnetlib.iis.wf.referenceextraction.softwareurl.ExceptionThrowingContentRetrieverFactory")); // assert AvroAssertTestUtil.assertEqualsWithJsonIgnoreOrder(outputPath, jsonOutputFile, DocumentToSoftwareUrlWithSource.class); @@ -274,17 +274,14 @@ public void obtainPageSourceFromCache() throws IOException { //------------------------ PRIVATE -------------------------- private SparkJob buildWebCrawlerJob(String inputPath, String outputPath, String outputFaultPath, - String outputReportPath, String contentRetrieverClassName) { + String outputReportPath, String contentRetrieverFactoryClassName) { SparkJob sparkJob = SparkJobBuilder .create() .setAppName("Spark WebCrawler") .setMainClass(CachedWebCrawlerJob.class) .addArg("-inputPath", inputPath) - .addArg("-contentRetrieverClassName", contentRetrieverClassName) + .addArg("-contentRetrieverFactoryClassName", contentRetrieverFactoryClassName) .addArg("-lockManagerFactoryClassName", ZookeeperLockManagerFactory.class.getName()) - .addArg("-connectionTimeout", "0") - .addArg("-readTimeout", "0") - .addArg("-maxPageContentLength", "0") .addArg("-numberOfEmittedFiles", "1") .addArg("-numberOfPartitionsForCrawling", "1") 
.addArg("-cacheRootDir", cacheRootDir.toString()) @@ -301,6 +298,6 @@ private SparkJob buildWebCrawlerJob(String inputPath, String outputPath, String private SparkJob buildWebCrawlerJob(String inputPath, String outputPath, String outputFaultPath, String outputReportPath) { return buildWebCrawlerJob(inputPath, outputPath, outputFaultPath, outputReportPath, - "eu.dnetlib.iis.wf.referenceextraction.softwareurl.ClasspathContentRetriever"); + "eu.dnetlib.iis.wf.referenceextraction.softwareurl.ClasspathContentRetrieverFactory"); } } diff --git a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ClasspathContentRetriever.java b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ClasspathContentRetriever.java index 113832cf9..3df8b18c8 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ClasspathContentRetriever.java +++ b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ClasspathContentRetriever.java @@ -31,8 +31,7 @@ public ClasspathContentRetriever() throws IOException { } @Override - public ContentRetrieverResponse retrieveUrlContent(CharSequence url, int connectionTimeout, int readTimeout, - int maxPageContentLength) { + public ContentRetrieverResponse retrieveUrlContent(CharSequence url) { if (url != null) { String classPathLocation = urlToClasspathMap.getProperty(url.toString()); if (classPathLocation != null) { diff --git a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ClasspathContentRetrieverFactory.java b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ClasspathContentRetrieverFactory.java new file mode 100644 index 000000000..c577c0d3a --- /dev/null +++ 
b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ClasspathContentRetrieverFactory.java @@ -0,0 +1,25 @@ +package eu.dnetlib.iis.wf.referenceextraction.softwareurl; + +import java.io.IOException; +import java.util.Map; + +import eu.dnetlib.iis.wf.importer.facade.ServiceFacadeFactory; + +/** + * Factory class building {@link ClasspathContentRetriever}. + * + * @author mhorst + * + */ +public class ClasspathContentRetrieverFactory implements ServiceFacadeFactory { + + @Override + public ContentRetriever instantiate(Map parameters) { + try { + return new ClasspathContentRetriever(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ExceptionThrowingContentRetriever.java b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ExceptionThrowingContentRetriever.java index 231f456ce..94cf64429 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ExceptionThrowingContentRetriever.java +++ b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ExceptionThrowingContentRetriever.java @@ -17,8 +17,7 @@ public class ExceptionThrowingContentRetriever implements ContentRetriever { private static final long serialVersionUID = -5244888543422890414L; @Override - public ContentRetrieverResponse retrieveUrlContent(CharSequence url, int connectionTimeout, int readTimeout, - int maxPageContentLength) { + public ContentRetrieverResponse retrieveUrlContent(CharSequence url) { throw new RuntimeException("unexpected content retrieval call!"); } diff --git a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ExceptionThrowingContentRetrieverFactory.java 
b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ExceptionThrowingContentRetrieverFactory.java new file mode 100644 index 000000000..2299dafd5 --- /dev/null +++ b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/ExceptionThrowingContentRetrieverFactory.java @@ -0,0 +1,20 @@ +package eu.dnetlib.iis.wf.referenceextraction.softwareurl; + +import java.util.Map; + +import eu.dnetlib.iis.wf.importer.facade.ServiceFacadeFactory; + +/** + * Factory class building {@link ExceptionThrowingContentRetriever}. + * + * @author mhorst + * + */ +public class ExceptionThrowingContentRetrieverFactory implements ServiceFacadeFactory { + + @Override + public ContentRetriever instantiate(Map parameters) { + return new ExceptionThrowingContentRetriever(); + } + +} diff --git a/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetrieverTest.java b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetrieverTest.java new file mode 100644 index 000000000..60f83fa9d --- /dev/null +++ b/iis-wf/iis-wf-referenceextraction/src/test/java/eu/dnetlib/iis/wf/referenceextraction/softwareurl/HttpContentRetrieverTest.java @@ -0,0 +1,289 @@ +package eu.dnetlib.iis.wf.referenceextraction.softwareurl; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.mockito.Matchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.NoSuchElementException; + +import org.apache.http.Header; +import org.apache.http.HttpEntity; +import 
org.apache.http.StatusLine; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.runners.MockitoJUnitRunner; + +import eu.dnetlib.iis.wf.referenceextraction.ContentRetrieverResponse; +import eu.dnetlib.iis.wf.referenceextraction.RetryLimitExceededException; + +/** + * {@link HttpContentRetriever} test class. + * + * @author mhorst + * + */ +@RunWith(MockitoJUnitRunner.class) +public class HttpContentRetrieverTest { + + private int connectionTimeout = 10000; + + private int readTimeout = 20000; + + private int maxPageContentLength = 1000000; + + private long throttleSleepTime = 1; + + private int maxRetriesCount = 2; + + @Mock + private CloseableHttpClient httpClient; + + @Test + public void testGetContentForHttp200() throws Exception { + // given + String expectedResult = "this is expected result"; + HttpContentRetriever service = prepareValidService(); + + // content retrieval mock + CloseableHttpResponse getContentHttpResponse = mock(CloseableHttpResponse.class); + StatusLine getContentStatusLine = mock(StatusLine.class); + HttpEntity getContentHttpEntity = mock(HttpEntity.class); + when(getContentHttpResponse.getStatusLine()).thenReturn(getContentStatusLine); + when(getContentStatusLine.getStatusCode()).thenReturn(200); + when(getContentHttpResponse.getEntity()).thenReturn(getContentHttpEntity); + when(getContentHttpEntity.getContent()).thenReturn(new ByteArrayInputStream(expectedResult.getBytes())); + + when(httpClient.execute(any(HttpGet.class))).thenReturn(getContentHttpResponse); + + // execute + ContentRetrieverResponse response = service.retrieveUrlContent("someUrl"); + + // assert + assertNotNull(response); + assertNull(response.getException()); + assertEquals(expectedResult, response.getContent()); + } + + @Test + public void 
testGetNoContentForHttp404() throws Exception { + // given + HttpContentRetriever service = prepareValidService(); + + // content retrieval mock + CloseableHttpResponse getContentHttpResponse = mock(CloseableHttpResponse.class); + StatusLine getContentStatusLine = mock(StatusLine.class); + when(getContentHttpResponse.getStatusLine()).thenReturn(getContentStatusLine); + when(getContentStatusLine.getStatusCode()).thenReturn(404); + + when(httpClient.execute(any(HttpGet.class))).thenReturn(getContentHttpResponse); + + // execute + ContentRetrieverResponse response = service.retrieveUrlContent("someUrl"); + + // assert + assertNotNull(response); + assertEquals("", response.getContent()); + assertTrue(response.getException() instanceof NoSuchElementException); + } + + @Test + public void testGetContentResultsInExceptionForHttp500() throws Exception { + // given + HttpContentRetriever service = prepareValidService(); + + // content retrieval mock + CloseableHttpResponse getContentHttpResponse = mock(CloseableHttpResponse.class); + StatusLine getContentStatusLine = mock(StatusLine.class); + when(getContentHttpResponse.getStatusLine()).thenReturn(getContentStatusLine); + when(getContentStatusLine.getStatusCode()).thenReturn(500); + + when(httpClient.execute(any(HttpGet.class))).thenReturn(getContentHttpResponse); + + // execute + ContentRetrieverResponse response = service.retrieveUrlContent("someUrl"); + + // assert + assertNotNull(response); + assertEquals("", response.getContent()); + assertTrue(response.getException() instanceof RuntimeException); + } + + @Test + public void testGetMovedContentForHttp301NoLocationHeader() throws Exception { + // given + String originalResult = "this is original result"; + String movedResult = "this is moved result"; + HttpContentRetriever service = prepareValidService(); + + // initial response mock + CloseableHttpResponse getContentHttpResponse = mock(CloseableHttpResponse.class); + StatusLine getContentStatusLine = 
mock(StatusLine.class); + HttpEntity getContentHttpEntity = mock(HttpEntity.class); + when(getContentHttpResponse.getStatusLine()).thenReturn(getContentStatusLine); + when(getContentStatusLine.getStatusCode()).thenReturn(301); + when(getContentHttpResponse.getEntity()).thenReturn(getContentHttpEntity); + when(getContentHttpEntity.getContent()).thenReturn(new ByteArrayInputStream(originalResult.getBytes())); + + // moved response + CloseableHttpResponse getMovedContentHttpResponse = mock(CloseableHttpResponse.class); + StatusLine getMovedContentStatusLine = mock(StatusLine.class); + HttpEntity getMovedContentHttpEntity = mock(HttpEntity.class); + when(getMovedContentHttpResponse.getStatusLine()).thenReturn(getMovedContentStatusLine); + when(getMovedContentStatusLine.getStatusCode()).thenReturn(200); + when(getMovedContentHttpResponse.getEntity()).thenReturn(getMovedContentHttpEntity); + when(getMovedContentHttpEntity.getContent()).thenReturn(new ByteArrayInputStream(movedResult.getBytes())); + when(httpClient.execute(any(HttpGet.class))).thenReturn(getContentHttpResponse, getMovedContentHttpResponse); + + // execute + ContentRetrieverResponse response = service.retrieveUrlContent("someUrl"); + + // assert + assertNotNull(response); + assertTrue(response.getException() instanceof RuntimeException); + assertEquals("", response.getContent()); + } + + @Test + public void testGetMovedContentForHttp301And200() throws Exception { + // given + String originalResult = "this is original result"; + String movedResult = "this is moved result"; + HttpContentRetriever service = prepareValidService(); + Header mockedHeader = mock(Header.class); + when(mockedHeader.getName()).thenReturn("Location"); + when(mockedHeader.getValue()).thenReturn("newUrl"); + Header[] headers = new Header[] {mockedHeader}; + + // initial response mock + CloseableHttpResponse getContentHttpResponse = mock(CloseableHttpResponse.class); + StatusLine getContentStatusLine = mock(StatusLine.class); + 
HttpEntity getContentHttpEntity = mock(HttpEntity.class); + when(getContentHttpResponse.getStatusLine()).thenReturn(getContentStatusLine); + when(getContentStatusLine.getStatusCode()).thenReturn(301); + when(getContentHttpResponse.getAllHeaders()).thenReturn(headers); + when(getContentHttpResponse.getEntity()).thenReturn(getContentHttpEntity); + when(getContentHttpEntity.getContent()).thenReturn(new ByteArrayInputStream(originalResult.getBytes())); + + // moved response mock + CloseableHttpResponse getMovedContentHttpResponse = mock(CloseableHttpResponse.class); + StatusLine getMovedContentStatusLine = mock(StatusLine.class); + HttpEntity getMovedContentHttpEntity = mock(HttpEntity.class); + when(getMovedContentHttpResponse.getStatusLine()).thenReturn(getMovedContentStatusLine); + when(getMovedContentStatusLine.getStatusCode()).thenReturn(200); + when(getMovedContentHttpResponse.getEntity()).thenReturn(getMovedContentHttpEntity); + when(getMovedContentHttpEntity.getContent()).thenReturn(new ByteArrayInputStream(movedResult.getBytes())); + when(httpClient.execute(any(HttpGet.class))).thenReturn(getContentHttpResponse, getMovedContentHttpResponse); + + // execute + ContentRetrieverResponse response = service.retrieveUrlContent("someUrl"); + + // assert + assertNotNull(response); + assertNull(response.getException()); + assertEquals(movedResult, response.getContent()); + } + + @Test + public void testGetContentForHttp429RetryLimitExceeded() throws Exception { + // given + HttpContentRetriever service = prepareValidService(); + + // rate-limited response mock + CloseableHttpResponse getContentHttpResponse = mock(CloseableHttpResponse.class); + StatusLine getContentStatusLine = mock(StatusLine.class); + when(getContentHttpResponse.getStatusLine()).thenReturn(getContentStatusLine); + when(getContentStatusLine.getStatusCode()).thenReturn(429); + HttpEntity getContentHttpEntity = mock(HttpEntity.class); +
when(getContentHttpResponse.getEntity()).thenReturn(getContentHttpEntity); + when(getContentHttpEntity.getContent()).thenReturn(new ByteArrayInputStream("".getBytes())); + when(httpClient.execute(any(HttpGet.class))).thenReturn(getContentHttpResponse); + + // execute + ContentRetrieverResponse response = service.retrieveUrlContent("someUrl"); + + // assert + assertNotNull(response); + assertTrue(response.getException() instanceof RetryLimitExceededException); + assertEquals("", response.getContent()); + } + + @Test + public void testGetContentForHttp429And200() throws Exception { + // given + String validResult = "this is valid result"; + HttpContentRetriever service = prepareValidService(); + + // rate-limited response mock + CloseableHttpResponse getRateLimitedContentHttpResponse = mock(CloseableHttpResponse.class); + StatusLine getContentStatusLine = mock(StatusLine.class); + when(getRateLimitedContentHttpResponse.getStatusLine()).thenReturn(getContentStatusLine); + when(getContentStatusLine.getStatusCode()).thenReturn(429); + HttpEntity getContentHttpEntity = mock(HttpEntity.class); + when(getRateLimitedContentHttpResponse.getEntity()).thenReturn(getContentHttpEntity); + when(getContentHttpEntity.getContent()).thenReturn(new ByteArrayInputStream("".getBytes())); + + // valid response mock + CloseableHttpResponse getValidContentHttpResponse = mock(CloseableHttpResponse.class); + StatusLine getMovedContentStatusLine = mock(StatusLine.class); + HttpEntity getMovedContentHttpEntity = mock(HttpEntity.class); + when(getValidContentHttpResponse.getStatusLine()).thenReturn(getMovedContentStatusLine); + when(getMovedContentStatusLine.getStatusCode()).thenReturn(200); + when(getValidContentHttpResponse.getEntity()).thenReturn(getMovedContentHttpEntity); + when(getMovedContentHttpEntity.getContent()).thenReturn(new ByteArrayInputStream(validResult.getBytes())); + when(httpClient.execute(any(HttpGet.class))).thenReturn(getRateLimitedContentHttpResponse,
getValidContentHttpResponse); + + // execute + ContentRetrieverResponse response = service.retrieveUrlContent("someUrl"); + + // assert + assertNotNull(response); + assertNull(response.getException()); + assertEquals(validResult, response.getContent()); + } + + @Test + public void testSerializeAndDeserialize() throws Exception { + // given + HttpContentRetriever service = new HttpContentRetriever(connectionTimeout, readTimeout, maxPageContentLength, + throttleSleepTime, maxRetriesCount); + + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(bos); + + // execute + oos.writeObject(service); + oos.flush(); + oos.close(); + ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray())); + HttpContentRetriever deserService = (HttpContentRetriever) ois.readObject(); + ois.close(); + + // assert + assertNotNull(deserService); + } + + private HttpContentRetriever prepareValidService() { + return new HttpContentRetriever(connectionTimeout, readTimeout, maxPageContentLength, throttleSleepTime, + maxRetriesCount) { + + private static final long serialVersionUID = 1L; + + protected CloseableHttpClient buildHttpClient(int connectionTimeout, int readTimeout) { + return httpClient; + } + }; + } + +} diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest/oozie_app/workflow.xml b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest/oozie_app/workflow.xml index 2d27afbae..fa0bf42fc 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest/oozie_app/workflow.xml +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest/oozie_app/workflow.xml @@ -72,8 +72,8 @@ - webcrawlContentRetrieverClassName - 
eu.dnetlib.iis.wf.referenceextraction.softwareurl.ClasspathContentRetriever + webcrawlContentRetrieverFactoryClassName + eu.dnetlib.iis.wf.referenceextraction.softwareurl.ClasspathContentRetrieverFactory webcrawlLockManagerFactoryClassName diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest_empty_input/oozie_app/workflow.xml b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest_empty_input/oozie_app/workflow.xml index 378889608..bb327e214 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest_empty_input/oozie_app/workflow.xml +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest_empty_input/oozie_app/workflow.xml @@ -72,8 +72,8 @@ - webcrawlContentRetrieverClassName - eu.dnetlib.iis.wf.referenceextraction.softwareurl.ClasspathContentRetriever + webcrawlContentRetrieverFactoryClassName + eu.dnetlib.iis.wf.referenceextraction.softwareurl.ClasspathContentRetrieverFactory webcrawlLockManagerFactoryClassName diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest_without_references/oozie_app/workflow.xml b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest_without_references/oozie_app/workflow.xml index bb75d2229..b5481991b 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest_without_references/oozie_app/workflow.xml +++ b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/softwareurl/main/sampletest_without_references/oozie_app/workflow.xml @@ -72,8 +72,8 @@ - webcrawlContentRetrieverClassName - 
eu.dnetlib.iis.wf.referenceextraction.softwareurl.ClasspathContentRetriever + webcrawlContentRetrieverFactoryClassName + eu.dnetlib.iis.wf.referenceextraction.softwareurl.ClasspathContentRetrieverFactory webcrawlLockManagerFactoryClassName From 3181f4953b682ef380ed47a203f83aca462e7800 Mon Sep 17 00:00:00 2001 From: Marek Horst Date: Fri, 11 Sep 2020 18:10:00 +0200 Subject: [PATCH 3/4] Closes #1146: Integrate textsnippet improved generation for Canadian funders --- .../main_sqlite/oozie_app/lib/scripts/projects.sql | 6 ++---- .../project/data/document_to_project.json | 8 ++++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql index 0469c3370..30b0e02d8 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql +++ b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql @@ -62,16 +62,14 @@ regexprmatches("support|project|grant|fund|thanks|agreement|research|acknowledge --DFG union all - -- Canadian funders - select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', textsnippet) as C1, docid, id, fundingclass1, grantid from ( select docid, case when regexprmatches(".*(?:(?:CIHR|IRSC)|(?i)(?:canad(?:ian|a) institute(?:s)? health research|institut(?:(?:e)?(?:s)?)? recherche sant(?:é|e) canada)).*", prev||" "||middle||" "||next) then (select id from grants where fundingclass1 = 'CIHR') when regexprmatches(".*(?:(?:NSERC|CRSNG)|(?i)(?:nat(?:ural|ional) science(?:s)?(?:\sengineering(?:\sresearch)?|\sresearch) co(?:u)?n(?:c|se)(?:i)?l|conseil(?:s)? 
recherche(?:s)? science(?:s)? naturel(?:les)?(?:\sg(?:e|é)nie)? canada)).*", prev||" "||middle||" "||next) then (select id from grants where fundingclass1 = 'NSERC') when regexprmatches(".*(?:(?:SSHRC|CRSH|SSRCC)|(?i)(?:social science(?:s)?|conseil(?:s)? recherche(?:s)?(?:\ssciences humaines)? canada|humanities\sresearch)).*", prev||" "||middle||" "||next) then (select id from grants where fundingclass1 = 'SSHRC') else 'canadian_unspecified_id' - end as id, "unidentified" as grantid, "Canadian" as fundingclass1, (prev||" "||middle||" "||next) as textsnippet + end as id, "unidentified" as grantid, "Canadian" as fundingclass1, (prev||" <<< "||middle||" >>> "||next) as textsnippet from (setschema 'docid,prev,middle,next' select c1, textwindow2s(filterstopwords(keywords(c2)), 15,1,15, "^(?:(?:(?:CIHR|IRSC)|(?:NSERC|CRSNG)|(?:SSHRC|CRSH))|(?i)(?:co(?:(?:un(?:cil|sel))|(?:nseil(?:s)?))|canad(?:a|ian)))$") from pubs where c2 is not null) where @@ -271,4 +269,4 @@ select C1 from secondary_output_table union all select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8,'textsnippet','') from matched_undefined_miur_only union all -select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8,'textsnippet','') from matched_undefined_wt_only; +select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8,'textsnippet','') from matched_undefined_wt_only; \ No newline at end of file diff --git a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json index 03e06fc63..d6a564d07 100644 --- a/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json +++ 
b/iis-wf/iis-wf-referenceextraction/src/test/resources/eu/dnetlib/iis/wf/referenceextraction/project/data/document_to_project.json @@ -8,13 +8,13 @@ "documentId": "50|dedup_wf_001::966fa995ef22da40b8ee1a5c1ad3eab4", "projectId": "40|sshrc_______::1e5e62235d094afd01cd56e65112fc63", "confidenceLevel": 0.8, - "textsnippet": "grateful nancial support Social Sciences Humanities Research Council Canada SSHRC grant 410 2009 0183" + "textsnippet": "grateful nancial support Social Sciences Humanities Research Council Canada \u003c\u003c\u003c SSHRC \u003e\u003e\u003e grant 410 2009 0183" } { "documentId": "50|dedup_wf_001::96932fdbc6d5a50f289da74bff297d5e", "projectId": "40|nserc_______::1e5e62235d094afd01cd56e65112fc63", "confidenceLevel": 0.8, - "textsnippet": "GT RGM acknowledge support NSERC Canada Research Chair program Ontario Trillium Foundation Perimeter Institute Theoretical Physics" + "textsnippet": "GT RGM acknowledge support \u003c\u003c\u003c NSERC \u003e\u003e\u003e Canada Research Chair program Ontario Trillium Foundation Perimeter Institute Theoretical Physics" } { "documentId": "50|narcis______::0b552824e655d0cd2251691e1722aa20", @@ -98,7 +98,7 @@ "documentId": "50|sharebioRxiv::ff9de1c352dcee69956d01e77149b708", "projectId": "40|cihr________::1e5e62235d094afd01cd56e65112fc63", "confidenceLevel": 0.8, - "textsnippet": "work supported Canadian Institutes Health Research FDN 154328 128090 CHA FDN 148430 201512MSH 360794 228629 DDC" + "textsnippet": "work supported \u003c\u003c\u003c Canadian \u003e\u003e\u003e Institutes Health Research FDN 154328 128090 CHA FDN 148430 201512MSH 360794 228629 DDC" } { "documentId": "PMC3386204", @@ -308,7 +308,7 @@ "documentId": "WOS:000316616100009", "projectId": "40|cihr________::1e5e62235d094afd01cd56e65112fc63", "confidenceLevel": 0.8, - "textsnippet": "work authors laboratory supported operating grants group grant Canadian Institutes Health Research well Canadian Foundation Innovation P K Alberta Heritage Foundation 
Medical Research AIHS" + "textsnippet": "work authors laboratory supported operating grants group grant \u003c\u003c\u003c Canadian \u003e\u003e\u003e Institutes Health Research well Canadian Foundation Innovation P K Alberta Heritage Foundation Medical Research AIHS" } { "documentId": "WOS:000316616100009", From 53d6b9874b9f095d490370bc9526f463ec373871 Mon Sep 17 00:00:00 2001 From: Marek Horst Date: Tue, 15 Sep 2020 13:38:41 +0200 Subject: [PATCH 4/4] Closes #1148: Optimize performance for DFG funder --- .../main_sqlite/oozie_app/lib/scripts/projects.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql index 30b0e02d8..40c42e1c9 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql +++ b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql @@ -55,10 +55,10 @@ WHERE fundingclass1="RCUK" and middle = grantid union all --DFG -select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8,'textsnippet',j2s(prev,middle,next)) as C1, docid, id, fundingclass1, grantid from -(setschema 'docid,prev,middle,next' select c1, textwindow2s(regexpr("\n",filterstopwords(keywords(c2)),"\s"),10,2,7,"\w{3}\s\d{1,4}" ) from pubs where c2 is not null), grants -where lower(regexpr("\b(\w{3}\s\d{1,4})\b",middle)) = grantid and -regexprmatches("support|project|grant|fund|thanks|agreement|research|acknowledge|centre|center|nstitution|program|priority|dfg|german|dutch|deutche",lower(j2s(prev,middle,next))) group by docid, id +select jdict('documentId', docid, 
'projectId', id, 'confidenceLevel', 0.8,'textsnippet', prev||" "||middle||" "||next) as C1, docid, id, fundingclass1, grantid from +(setschema 'docid,prev,middle,next' select c1, textwindow2s(filterstopwords(keywords(c2)),10,2,7,"\w{3}\s\d{1,4}") from pubs where c2 is not null), grants +where lower(regexpr("\b(\w{3}\s\d{1,4})\b",middle)) = grantid and +regexprmatches("support|project|grant|fund|thanks|agreement|research|acknowledge|centre|center|nstitution|program|priority|dfg|german|dutch|deutche",lower(prev||" "||next)) group by docid, id --DFG union all