From d4645ef96bdb8c2ab8129f496660992f48c32abd Mon Sep 17 00:00:00 2001
From: Cristi Constantin <cristi.constantin@sent.com>
Date: Wed, 17 Jul 2019 11:56:36 +0100
Subject: [PATCH] Strip whitespace from meta property and content values

Updated the tests: added a whitespace-only og:title sample and sorted duplicated RDFa values before comparing.
---
 extruct/opengraph.py                      |  4 ++--
 requirements.txt                          |  2 +-
 tests/samples/songkick/elysianfields.html |  1 +
 tests/samples/songkick/elysianfields.json |  3 +++
 tests/test_extruct.py                     | 10 +++++++---
 5 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/extruct/opengraph.py b/extruct/opengraph.py
index e5b97dae..978d25ab 100644
--- a/extruct/opengraph.py
+++ b/extruct/opengraph.py
@@ -30,8 +30,8 @@ def extract_items(self, document, base_url=None):
             namespaces.update(self.get_namespaces(head))
             props = []
             for el in head.xpath('meta[@property and @content]'):
-                prop = el.attrib['property']
-                val = el.attrib['content']
+                prop = el.attrib['property'].strip()
+                val = el.attrib['content'].strip()
                 if prop == '' or val == '':
                     continue
                 ns = prop.partition(':')[0]
diff --git a/requirements.txt b/requirements.txt
index 87a27224..820557a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,5 +7,5 @@ requests
 rdflib
 rdflib-jsonld
 mf2py>=1.1.0
-six
+six>=1.11
 w3lib
diff --git a/tests/samples/songkick/elysianfields.html b/tests/samples/songkick/elysianfields.html
index c7a00b4a..4fa2ba0a 100644
--- a/tests/samples/songkick/elysianfields.html
+++ b/tests/samples/songkick/elysianfields.html
@@ -27,6 +27,7 @@
     <meta property="og:site_name" content="Songkick">
     <meta property="og:type" content="songkick-concerts:artist">
     <meta property="og:title" content="Elysian Fields">
+    <meta property="og:title" content="  ">
     <meta property="og:description" content="Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.">
     <meta property="og:description" content="" />
     <meta property="og:url" content="http://www.songkick.com/artists/236156-elysian-fields">
diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json
index 0f94c14e..ba8e9f56 100644
--- a/tests/samples/songkick/elysianfields.json
+++ b/tests/samples/songkick/elysianfields.json
@@ -253,6 +253,9 @@
             "http://ogp.me/ns#title": [
                 {
                     "@value": "Elysian Fields"
+                },
+                {
+                    "@value": "  "
                 }
             ],
             "http://ogp.me/ns#type": [
diff --git a/tests/test_extruct.py b/tests/test_extruct.py
index bb79af8f..a2ba8003 100644
--- a/tests/test_extruct.py
+++ b/tests/test_extruct.py
@@ -16,9 +16,13 @@ def test_all(self):
         body = get_testdata('songkick', 'elysianfields.html')
         expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
         data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields')
-        # See test_rdfa_not_preserving_order()
-        del data['rdfa'][0]['http://ogp.me/ns#image']
-        del expected['rdfa'][0]['http://ogp.me/ns#image']
+        # Sort the values here because RDFa does not preserve the order of duplicated properties.
+        # See https://github.com/scrapinghub/extruct/issues/116
+        # Also see test_rdfa_not_preserving_order()
+        for rdf in data['rdfa']:
+            for key, pairs in rdf.items():
+                if ':' in key and isinstance(pairs, list):
+                    rdf[key] = sorted(pairs, key=lambda e: e["@value"], reverse=True)
         self.assertEqual(jsonize_dict(data), expected)
 
     @pytest.mark.xfail