Fix breadcrumb extraction when the markup contains HTML comment nodes

This may also fix other extractions in which an HTML comment node is passed as input.
zytedata · Apr 15, 2024 · 0300f11 · 0300f11
1 parent 636a2d4
commit 0300f11
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 6 deletions.
diff --git a/tests/data/breadcrumb_items_extract.json b/tests/data/breadcrumb_items_extract.json
@@ -5724,5 +5724,26 @@
     ],
     "snippet_path": "generated/snippet0300.html",
     "base_url": "https://www.officedepot.cz/zavesne-u-obaly-niceday-a4-55-mikronu-cire-50-ks/"
+  },
+  {
+    "expected": [
+      {
+        "name": "Haberler",
+        "url": "https://www.hurriyet.com.tr"
+      },
+      {
+        "name": "Yerel Haberler",
+        "url": "https://www.hurriyet.com.tr/yerel-haberler/"
+      },
+      {
+        "name": "Ankara",
+        "url": "https://www.hurriyet.com.tr/ankara-haberleri/"
+      },
+      {
+        "name": "ABB’den Balcı’ya Ulus yanıtı"
+      }
+    ],
+    "snippet_path": "comments1.html",
+    "base_url": "https://www.hurriyet.com.tr/yerel-haberler/ankara/abbden-balciya-ulus-yaniti-41547223"
   }
 ]
diff --git a/tests/data/breadcrumb_items_snippets/comments1.html b/tests/data/breadcrumb_items_snippets/comments1.html
@@ -0,0 +1,12 @@
+<div class="breadcrumb-body">
+    <span class="clearfix" style="margin-left:0;">
+        <a href="https://www.hurriyet.com.tr">Haberler</a>
+    </span><!----><span> &gt;</span>
+    <span>
+        <a href="https://www.hurriyet.com.tr/yerel-haberler/" title="Yerel Haberler">Yerel Haberler </a>
+    </span><!----><span> &gt;</span>
+    <span>
+        <a href="https://www.hurriyet.com.tr/ankara-haberleri/" title="Ankara">Ankara </a>
+    </span><!----><span>&gt;</span>
+    <span>ABB’den Balcı’ya Ulus yanıtı</span>
+</div>
diff --git a/zyte_parsers/api.py b/zyte_parsers/api.py
@@ -1,9 +1,9 @@
 from typing import Union
 
-from lxml.html import HtmlElement
+from lxml.html import HtmlComment, HtmlElement
 from parsel import Selector
 
-SelectorOrElement = Union[Selector, HtmlElement]
+SelectorOrElement = Union[Selector, HtmlElement, HtmlComment]
 
 
 def input_to_selector(node: SelectorOrElement) -> Selector:
@@ -13,8 +13,8 @@ def input_to_selector(node: SelectorOrElement) -> Selector:
     return Selector(root=node)
 
 
-def input_to_element(node: SelectorOrElement) -> HtmlElement:
-    """Convert a supported input object to a HtmlElement."""
-    if isinstance(node, HtmlElement):
+def input_to_element(node: SelectorOrElement) -> Union[HtmlElement, HtmlComment]:
+    """Convert a supported input object to a HtmlElement or HtmlComment."""
+    if isinstance(node, (HtmlElement, HtmlComment)):
         return node
     return node.root
diff --git a/zyte_parsers/utils.py b/zyte_parsers/utils.py
@@ -3,7 +3,12 @@
 from urllib.parse import urljoin
 
 import html_text
-from lxml.html import HtmlElement, fromstring  # noqa: F401
+from lxml.html import (  # noqa: F401
+    HtmlComment,
+    HtmlElement,
+    fragment_fromstring,
+    fromstring,
+)
 from parsel import Selector  # noqa: F401
 from w3lib.html import strip_html5_whitespace
 
@@ -87,10 +92,14 @@ def extract_text(
     'foo bar'
     >>> extract_text(Selector(text="<p>foo  bar </p>"))
     'foo bar'
+    >>> extract_text(fragment_fromstring("<!-- a comment -->"))
+    >>> extract_text(Selector(text="<!-- a comment -->"))
     """
     if node is None:
         return None
     node = input_to_element(node)
+    if isinstance(node, HtmlComment):
+        return None
     value = html_text.extract_text(node, guess_layout=guess_layout)
     if value:
         return value