From d4645ef96bdb8c2ab8129f496660992f48c32abd Mon Sep 17 00:00:00 2001 From: Cristi Constantin Date: Wed, 17 Jul 2019 11:56:36 +0100 Subject: [PATCH] Strip empty prop and content tags Updated the tests --- extruct/opengraph.py | 4 ++-- requirements.txt | 2 +- tests/samples/songkick/elysianfields.html | 1 + tests/samples/songkick/elysianfields.json | 3 +++ tests/test_extruct.py | 10 +++++++--- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/extruct/opengraph.py b/extruct/opengraph.py index e5b97dae..978d25ab 100644 --- a/extruct/opengraph.py +++ b/extruct/opengraph.py @@ -30,8 +30,8 @@ def extract_items(self, document, base_url=None): namespaces.update(self.get_namespaces(head)) props = [] for el in head.xpath('meta[@property and @content]'): - prop = el.attrib['property'] - val = el.attrib['content'] + prop = el.attrib['property'].strip() + val = el.attrib['content'].strip() if prop == '' or val == '': continue ns = prop.partition(':')[0] diff --git a/requirements.txt b/requirements.txt index 87a27224..820557a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ requests rdflib rdflib-jsonld mf2py>=1.1.0 -six +six>=1.11 w3lib diff --git a/tests/samples/songkick/elysianfields.html b/tests/samples/songkick/elysianfields.html index c7a00b4a..4fa2ba0a 100644 --- a/tests/samples/songkick/elysianfields.html +++ b/tests/samples/songkick/elysianfields.html @@ -27,6 +27,7 @@ + diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json index 0f94c14e..ba8e9f56 100644 --- a/tests/samples/songkick/elysianfields.json +++ b/tests/samples/songkick/elysianfields.json @@ -253,6 +253,9 @@ "http://ogp.me/ns#title": [ { "@value": "Elysian Fields" + }, + { + "@value": " " } ], "http://ogp.me/ns#type": [ diff --git a/tests/test_extruct.py b/tests/test_extruct.py index bb79af8f..a2ba8003 100644 --- a/tests/test_extruct.py +++ b/tests/test_extruct.py @@ -16,9 +16,13 @@ def test_all(self): body = get_testdata('songkick', 'elysianfields.html') expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8')) data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields') - # See test_rdfa_not_preserving_order() - del data['rdfa'][0]['http://ogp.me/ns#image'] - del expected['rdfa'][0]['http://ogp.me/ns#image'] + # Sorting the values here because RDFa is not preserving ordering on duplicated properties. + # See https://github.com/scrapinghub/extruct/issues/116 + # Also see test_rdfa_not_preserving_order() + for rdf in data['rdfa']: + for key, pairs in rdf.items(): + if ':' in key and isinstance(pairs, list): + rdf[key] = sorted(pairs, key=lambda e: e["@value"], reverse=True) self.assertEqual(jsonize_dict(data), expected) @pytest.mark.xfail