Skip to content

Commit

Permalink
Fix breadcrumb extraction when the markup contains HTML comments
Browse files Browse the repository at this point in the history
This may also fix other extractions where comment nodes are passed in.
  • Loading branch information
lopuhin committed Apr 15, 2024
1 parent 636a2d4 commit 0300f11
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 6 deletions.
21 changes: 21 additions & 0 deletions tests/data/breadcrumb_items_extract.json
Original file line number Diff line number Diff line change
Expand Up @@ -5724,5 +5724,26 @@
],
"snippet_path": "generated/snippet0300.html",
"base_url": "https://www.officedepot.cz/zavesne-u-obaly-niceday-a4-55-mikronu-cire-50-ks/"
},
{
"expected": [
{
"name": "Haberler",
"url": "https://www.hurriyet.com.tr"
},
{
"name": "Yerel Haberler",
"url": "https://www.hurriyet.com.tr/yerel-haberler/"
},
{
"name": "Ankara",
"url": "https://www.hurriyet.com.tr/ankara-haberleri/"
},
{
"name": "ABB’den Balcı’ya Ulus yanıtı"
}
],
"snippet_path": "comments1.html",
"base_url": "https://www.hurriyet.com.tr/yerel-haberler/ankara/abbden-balciya-ulus-yaniti-41547223"
}
]
12 changes: 12 additions & 0 deletions tests/data/breadcrumb_items_snippets/comments1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<div class="breadcrumb-body">
<span class="clearfix" style="margin-left:0;">
<a href="https://www.hurriyet.com.tr">Haberler</a>
</span><!----><span> &gt;</span>
<span>
<a href="https://www.hurriyet.com.tr/yerel-haberler/" title="Yerel Haberler">Yerel Haberler </a>
</span><!----><span> &gt;</span>
<span>
<a href="https://www.hurriyet.com.tr/ankara-haberleri/" title="Ankara">Ankara </a>
</span><!----><span>&gt;</span>
<span>ABB’den Balcı’ya Ulus yanıtı</span>
</div>
10 changes: 5 additions & 5 deletions zyte_parsers/api.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import Union

from lxml.html import HtmlElement
from lxml.html import HtmlComment, HtmlElement
from parsel import Selector

SelectorOrElement = Union[Selector, HtmlElement]
SelectorOrElement = Union[Selector, HtmlElement, HtmlComment]


def input_to_selector(node: SelectorOrElement) -> Selector:
Expand All @@ -13,8 +13,8 @@ def input_to_selector(node: SelectorOrElement) -> Selector:
return Selector(root=node)


def input_to_element(node: SelectorOrElement) -> HtmlElement:
"""Convert a supported input object to a HtmlElement."""
if isinstance(node, HtmlElement):
def input_to_element(node: SelectorOrElement) -> Union[HtmlElement, HtmlComment]:
"""Convert a supported input object to a HtmlElement or HtmlComment."""
if isinstance(node, (HtmlElement, HtmlComment)):
return node
return node.root
11 changes: 10 additions & 1 deletion zyte_parsers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@
from urllib.parse import urljoin

import html_text
from lxml.html import HtmlElement, fromstring # noqa: F401
from lxml.html import ( # noqa: F401
HtmlComment,
HtmlElement,
fragment_fromstring,
fromstring,
)
from parsel import Selector # noqa: F401
from w3lib.html import strip_html5_whitespace

Expand Down Expand Up @@ -87,10 +92,14 @@ def extract_text(
'foo bar'
>>> extract_text(Selector(text="<p>foo bar </p>"))
'foo bar'
>>> extract_text(fragment_fromstring("<!-- a comment -->"))
>>> extract_text(Selector(text="<!-- a comment -->"))
"""
if node is None:
return None
node = input_to_element(node)
if isinstance(node, HtmlComment):
return None
value = html_text.extract_text(node, guess_layout=guess_layout)
if value:
return value
Expand Down

0 comments on commit 0300f11

Please sign in to comment.