diff --git a/build.xml b/build.xml index db163c6207..6cd470c92d 100644 --- a/build.xml +++ b/build.xml @@ -1002,7 +1002,7 @@ - StringBuffer. - * + * StringBuffer without the mentioned element names in the Set. + * *

* * If abortOnNestedAnchors is true, DOM traversal will be aborted @@ -116,9 +114,9 @@ public void setConf(Configuration conf) { * * @return true if nested anchors were found */ - public boolean getText(StringBuffer sb, Node node, - boolean abortOnNestedAnchors) { - if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + private boolean getText(StringBuffer sb, Node node, + boolean abortOnNestedAnchors, Set excludedElementNames) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0, excludedElementNames)) { return true; } return false; @@ -126,19 +124,27 @@ public boolean getText(StringBuffer sb, Node node, /** * This is a convinience method, equivalent to - * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * {@link #getText(StringBuffer,Node,boolean, Set) getText(sb, node, false, excludedElementNames)}. * */ - public void getText(StringBuffer sb, Node node) { - getText(sb, node, false); + public void getText(StringBuffer sb, Node node, Set excludedElementNames) { + getText(sb, node, false, excludedElementNames); } // returns true if abortOnNestedAnchors is true and we find nested // anchors private boolean getTextHelper(StringBuffer sb, Node node, - boolean abortOnNestedAnchors, int anchorDepth) { + boolean abortOnNestedAnchors, int anchorDepth, Set excludedElementNames) { boolean abort = false; NodeWalker walker = new NodeWalker(node); + Set lcExcludedElementNames = new HashSet<>(); + if (excludedElementNames != null) { + for (String excludedElementName : excludedElementNames) { + if (excludedElementName != null) { + lcExcludedElementNames.add(excludedElementName.toLowerCase()); + } + } + } while (walker.hasNext()) { @@ -146,6 +152,12 @@ private boolean getTextHelper(StringBuffer sb, Node node, String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); + if (nodeName != null) { + if (lcExcludedElementNames.contains(nodeName.toLowerCase())) { + walker.skipChildren(); + } + } + if ("script".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } @@ -244,7 +256,7 @@ public boolean getTitle(StringBuffer sb, Node node) { if (nodeType == Node.ELEMENT_NODE) { if ("title".equalsIgnoreCase(nodeName)) { - getText(sb, currentNode); + getText(sb, currentNode, null); return true; } } @@ -380,7 +392,7 @@ public void getOutlinks(URL base, ArrayList outlinks, Node node) { if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { StringBuffer linkText = new StringBuffer(); - getText(linkText, currentNode, true); + getText(linkText, currentNode, true, null); if (linkText.toString().trim().length() == 0) { // try harder - use img alt if present NodeWalker subWalker = new NodeWalker(currentNode); diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java index 9ed9fa4ee1..ea5347efd7 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java @@ -18,8 +18,7 @@ package org.apache.nutch.parse.html; import java.lang.invoke.MethodHandles; -import java.util.ArrayList; -import java.util.Map; +import java.util.*; import java.net.URL; import java.net.MalformedURLException; import java.nio.charset.StandardCharsets; @@ -49,6 +48,7 @@ public class HtmlParser implements Parser { // (e.g. http://cn.promo.yahoo.com/customcare/music.html) // NUTCH-2042 (cf. TIKA-357): increased to 8 kB private static final int CHUNK_SIZE = 8192; + public static final String ELEMENT_NAMES_SEPARATOR = ","; // NUTCH-1006 Meta equiv with single quotes not accepted private static Pattern metaPattern = Pattern.compile( @@ -132,6 +132,9 @@ private static String sniffCharacterEncoding(byte[] content) { public ParseResult getParse(Content content) { HTMLMetaTags metaTags = new HTMLMetaTags(); + String excludedElementNamesString = getConf().get("html.content.exclude.element.names"); + Set excludedElementNames = excludedElementNamesString == null ? null : new HashSet<>(Arrays.asList(excludedElementNamesString.split(ELEMENT_NAMES_SEPARATOR))); + URL base; try { base = new URL(content.getBaseUrl()); @@ -195,7 +198,7 @@ public ParseResult getParse(Content content) { if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); } - utils.getText(sb, root); // extract text + utils.getText(sb, root, excludedElementNames); // extract text text = sb.toString(); sb.setLength(0); if (LOG.isTraceEnabled()) { diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java index 0faa013e98..4fa3856457 100644 --- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java +++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java @@ -24,8 +24,7 @@ import java.io.ByteArrayInputStream; import java.net.MalformedURLException; import java.net.URL; -import java.util.ArrayList; -import java.util.StringTokenizer; +import java.util.*; import org.cyberneko.html.parsers.*; import org.junit.Assert; @@ -168,6 +167,40 @@ public class TestDOMContentUtils { "my title", "my title", "my title", "my title", "", "", "", "title", "title", "title", "" }; + private static final Set[] contentRemoveTags = new Set[]{ + null, + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + new HashSet(Arrays.asList("title", "h1")), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + null + }; + + private static final String[] answerContent = { + "title body anchor", + "body home bots", + "separate this from this", + "body home 1 2", + "", + "", + "Whitespace test whitespace test " + + "This is a whitespace test . Newlines should appear as space too. " + + "Tabs are spaces too. This is a break -> and the line after break . " + + "one two three space here space there no space " + + "one two two three three four put some text here and there. " + + "End this madness ! . . . .", "ignore ignore", "test1 test2", + "test1 test2", "anchor1 anchor2 anchor3", + "anchor1 anchor2 anchor3 anchor4 anchor5", "", "" + }; + // note: should be in page-order private static Outlink[][] answerOutlinks; @@ -265,7 +298,7 @@ public void testGetText() { setup(); for (int i = 0; i < testPages.length; i++) { StringBuffer sb = new StringBuffer(); - utils.getText(sb, testDOMs[i]); + utils.getText(sb, testDOMs[i], null); String text = sb.toString(); Assert.assertTrue( "expecting text: " + answerText[i] @@ -292,7 +325,23 @@ public void testGetTitle() { } @Test - public void testGetOutlinks() { + public void testGetContent() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + StringBuffer sb = new StringBuffer(); + utils.getText(sb, testDOMs[i], contentRemoveTags[i]); + String text = sb.toString(); + Assert.assertTrue( + "expecting text: " + answerContent[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerContent[i], text)); + } + } + + @Test + public void testGetOutlinks() throws Exception { if (testDOMs[0] == null) setup(); for (int i = 0; i < testPages.length; i++) { diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java index d40958912d..fca93861e9 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java @@ -19,11 +19,7 @@ import java.net.MalformedURLException; import java.net.URL; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; +import java.util.*; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.MapWritable; @@ -49,10 +45,10 @@ public class DOMContentUtils { private boolean keepNodenames; private static class LinkParams { + private String elName; private String attrName; private int childLen; - private LinkParams(String elName, String attrName, int childLen) { this.elName = elName; this.attrName = attrName; @@ -62,12 +58,12 @@ private LinkParams(String elName, String attrName, int childLen) { public String toString() { return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; } - } + } private HashMap linkParams = new HashMap(); + private HashSet ignoredTags = new HashSet(); private Configuration conf; - public DOMContentUtils(Configuration conf) { setConf(conf); } @@ -107,23 +103,24 @@ public void setConf(Configuration conf) { } /** - * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will + * This method takes a {@link StringBuffer}, a DOM {@link Node} + * and an excluded element {@link Set}, and will * append all the content text found beneath the DOM node to the - * StringBuffer. - * + * StringBuffer without the mentioned element names in the Set. + * *

- * + * * If abortOnNestedAnchors is true, DOM traversal will be aborted * and the StringBuffer will not contain any text encountered * after a nested anchor is found. - * + * *

- * + * * @return true if nested anchors were found */ private boolean getText(StringBuffer sb, Node node, - boolean abortOnNestedAnchors) { - if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + boolean abortOnNestedAnchors, Set excludedElementNames) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0, excludedElementNames)) { return true; } return false; @@ -131,19 +128,27 @@ private boolean getText(StringBuffer sb, Node node, /** * This is a convinience method, equivalent to - * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. - * + * {@link #getText(StringBuffer, Node, boolean, Set) getText(sb, node, false, excludedElementNames)}. + * */ - public void getText(StringBuffer sb, Node node) { - getText(sb, node, false); + public void getText(StringBuffer sb, Node node, Set excludedElementNames) { + getText(sb, node, false, excludedElementNames); } // returns true if abortOnNestedAnchors is true and we find nested // anchors private boolean getTextHelper(StringBuffer sb, Node node, - boolean abortOnNestedAnchors, int anchorDepth) { + boolean abortOnNestedAnchors, int anchorDepth, Set excludedElementNames) { boolean abort = false; NodeWalker walker = new NodeWalker(node); + Set lcExcludedElementNames = new HashSet<>(); + if (excludedElementNames != null) { + for (String excludedElementName : excludedElementNames) { + if (excludedElementName != null) { + lcExcludedElementNames.add(excludedElementName.toLowerCase()); + } + } + } while (walker.hasNext()) { @@ -151,6 +156,12 @@ private boolean getTextHelper(StringBuffer sb, Node node, String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); + if (nodeName != null) { + if (lcExcludedElementNames.contains(nodeName.toLowerCase())) { + walker.skipChildren(); + } + } + if ("script".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } @@ -230,7 +241,7 @@ private void appendSpace(StringBuffer buffer) { * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will * append the content text found beneath the first title node to * the StringBuffer. - * + * * @return true if a title node was found, false otherwise */ public boolean getTitle(StringBuffer sb, Node node) { @@ -249,7 +260,7 @@ public boolean getTitle(StringBuffer sb, Node node) { if (nodeType == Node.ELEMENT_NODE) { if ("title".equalsIgnoreCase(nodeName)) { - getText(sb, currentNode); + getText(sb, currentNode, null); return true; } } @@ -385,7 +396,7 @@ public void getOutlinks(URL base, ArrayList outlinks, Node node) { if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { StringBuffer linkText = new StringBuffer(); - getText(linkText, currentNode, true); + getText(linkText, currentNode, true, null); NamedNodeMap attrs = currentNode.getAttributes(); String target = null; diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index ea864bec25..e88c968677 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -20,9 +20,7 @@ import java.io.ByteArrayInputStream; import java.net.MalformedURLException; import java.net.URL; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; +import java.util.*; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -64,6 +62,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); + public static final String ELEMENT_NAMES_SEPARATOR = ","; private Configuration conf; private TikaConfig tikaConfig = null; @@ -76,10 +75,13 @@ public class TikaParser implements org.apache.nutch.parse.Parser { @SuppressWarnings("deprecation") public ParseResult getParse(Content content) { String mimeType = content.getContentType(); - + boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe"); String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor"); + String excludedElementNamesString = getConf().get("tika.content.exclude.element.names"); + Set excludedElementNames = excludedElementNamesString == null ? null : new HashSet<>(Arrays.asList(excludedElementNamesString.split(ELEMENT_NAMES_SEPARATOR))); + URL base; try { base = new URL(content.getBaseUrl()); @@ -109,7 +111,7 @@ public ParseResult getParse(Content content) { DocumentFragment root = doc.createDocumentFragment(); ContentHandler domHandler; - + // Check whether to use Tika's BoilerplateContentHandler if (useBoilerpipe) { BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root), @@ -127,7 +129,7 @@ public ParseResult getParse(Content content) { ParseContext context = new ParseContext(); TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler); - + if (HTMLMapper != null) context.set(HtmlMapper.class, HTMLMapper); tikamd.set(Metadata.CONTENT_TYPE, mimeType); @@ -159,7 +161,7 @@ public ParseResult getParse(Content content) { if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); } - utils.getText(sb, root); // extract text + utils.getText(sb, root, excludedElementNames); // extract text text = sb.toString(); sb.setLength(0); if (LOG.isTraceEnabled()) { diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java index 2159b9d5a8..4b9acc7dc4 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java @@ -24,8 +24,7 @@ import java.io.ByteArrayInputStream; import java.net.URL; -import java.util.ArrayList; -import java.util.StringTokenizer; +import java.util.*; import org.xml.sax.*; import org.w3c.dom.*; @@ -170,6 +169,39 @@ public class TestDOMContentUtils { "my title", "my title", "my title", "my title", "", "", "", "title", "title", "" }; + private static final Set[] contentRemoveTags = new Set[]{ + null, + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + new HashSet(Arrays.asList("title", "h1")), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + Collections.singleton("title"), + }; + + private static final String[] answerContent = { + "title body anchor", + "body home bots", + "separate this from this", + "body home 1 2", + "", + "", + "Whitespace test whitespace test " + + "This is a whitespace test . Newlines should appear as space too. " + + "Tabs are spaces too. This is a break -> and the line after break . " + + "one two three space here space there no space " + + "one two two three three four put some text here and there. " + + "End this madness ! . . . .", "ignore ignore", "test1 test2", + "test1 test2", "anchor1 anchor2 anchor3", + "anchor1 anchor2 anchor3 anchor4 anchor5", "" + }; + // note: should be in page-order private static Outlink[][] answerOutlinks; @@ -255,7 +287,7 @@ public void testGetText() throws Exception { setup(); for (int i = 0; i < testPages.length; i++) { StringBuffer sb = new StringBuffer(); - utils.getText(sb, testDOMs[i]); + utils.getText(sb, testDOMs[i], null); String text = sb.toString(); Assert.assertTrue( "expecting text: " + answerText[i] @@ -281,6 +313,22 @@ public void testGetTitle() throws Exception { } } + @Test + public void testGetContent() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + StringBuffer sb = new StringBuffer(); + utils.getText(sb, testDOMs[i], contentRemoveTags[i]); + String text = sb.toString(); + Assert.assertTrue( + "expecting text: " + answerContent[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerContent[i], text)); + } + } + @Test public void testGetOutlinks() throws Exception { if (testDOMs[0] == null)