diff --git a/.gitignore b/.gitignore index 9e040fff..ef4884db 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ /build/ /dist/ *.egg-info +venv/ # Mac OS *.DS_Store diff --git a/extruct/__init__.py b/extruct/__init__.py index a9363b55..14a63f59 100644 --- a/extruct/__init__.py +++ b/extruct/__init__.py @@ -4,4 +4,5 @@ from .w3cmicrodata import MicrodataExtractor from .opengraph import OpenGraphExtractor from .microformat import MicroformatExtractor +from .twittercard import TwitterCardExtractor from .xmldom import XmlDomHTMLParser diff --git a/extruct/_extruct.py b/extruct/_extruct.py index 5bc247f8..24cdd121 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -3,6 +3,7 @@ from extruct.jsonld import JsonLdExtractor from extruct.rdfa import RDFaExtractor +from extruct.twittercard import TwitterCardExtractor from extruct.w3cmicrodata import MicrodataExtractor from extruct.opengraph import OpenGraphExtractor from extruct.microformat import MicroformatExtractor @@ -11,7 +12,7 @@ from extruct.utils import parse_xmldom_html logger = logging.getLogger(__name__) -SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore'] +SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore', 'twittercard'] def extract(htmlstring, @@ -102,6 +103,11 @@ def extract(htmlstring, ('dublincore', DublinCoreExtractor().extract_items, tree, )) + if 'twittercard' in syntaxes: + processors.append( + ('twittercard', TwitterCardExtractor().extract_items, + tree, + )) output = {} for syntax, extract, document in processors: try: @@ -162,7 +168,7 @@ def extract(htmlstring, logger.exception( 'Failed to uniform extracted for {}, raises {}' .format(syntax, e) - ) + ) if errors == 'strict': raise diff --git a/extruct/twittercard.py b/extruct/twittercard.py new file mode 100644 index 00000000..51a9a3a4 --- /dev/null +++ b/extruct/twittercard.py @@ -0,0 +1,71 @@ +import re + +from extruct.utils import parse_html + + +# _PREFIX_PATTERN = 
# Matches one "prefix: IRI" pair inside an RDFa-style ``prefix`` attribute.
# Two capture groups are required: ``findall`` must yield (prefix, iri)
# tuples so the result can be fed straight to ``dict()``.
_PREFIX_PATTERN = re.compile(r'\s*(\w+):\s*([^\s]+)')

# Prefixes that are recognised even when the page does not declare them.
# NOTE(review): besides ``twitter`` this table carries generic RDFa prefixes
# (og, dc, schema, ...), so <meta name="og:..."> tags are collected as well --
# confirm that this is intended for a Twitter Card extractor.
_TW_NAMESPACES = {
    'twitter': 'https://dev.twitter.com/cards#',
    'owl': 'http://www.w3.org/2002/07/owl#',
    'gr': 'http://purl.org/goodrelations/v1#',
    'ctag': 'http://commontag.org/ns#',
    'cc': 'http://creativecommons.org/ns#',
    'grddl': 'http://www.w3.org/2003/g/data-view#',
    'rif': 'http://www.w3.org/2007/rif#',
    'sioc': 'http://rdfs.org/sioc/ns#',
    'skos': 'http://www.w3.org/2004/02/skos/core#',
    'xml': 'http://www.w3.org/XML/1998/namespace',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'rev': 'http://purl.org/stuff/rev#',
    'rdfa': 'http://www.w3.org/ns/rdfa#',
    'dc': 'http://purl.org/dc/terms/',
    'foaf': 'http://xmlns.com/foaf/0.1/',
    'void': 'http://rdfs.org/ns/void#',
    'ical': 'http://www.w3.org/2002/12/cal/icaltzd#',
    'vcard': 'http://www.w3.org/2006/vcard/ns#',
    'wdrs': 'http://www.w3.org/2007/05/powder-s#',
    'og': 'http://ogp.me/ns#',
    'wdr': 'http://www.w3.org/2007/05/powder#',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'xhv': 'http://www.w3.org/1999/xhtml/vocab#',
    'xsd': 'http://www.w3.org/2001/XMLSchema#',
    'v': 'http://rdf.data-vocabulary.org/#',
    'skosxl': 'http://www.w3.org/2008/05/skos-xl#',
    'schema': 'http://schema.org/',
}


class TwitterCardExtractor(object):
    """TwitterCard extractor following extruct API.

    Collects ``<meta name="..." content="...">`` tags found directly under
    ``<head>`` whose name starts with a known or declared prefix
    (e.g. ``twitter:card``).
    """

    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
        """Parse *htmlstring* and return the extracted items as a list.

        :param htmlstring: HTML document to parse.
        :param base_url: passed through to :meth:`extract_items`.
        :param encoding: encoding handed to the HTML parser.
        :returns: list of ``{'namespace': ..., 'properties': ...}`` dicts.
        """
        tree = parse_html(htmlstring, encoding=encoding)
        return list(self.extract_items(tree, base_url=base_url))

    def extract_items(self, document, base_url=None):
        """Yield one item per ``<head>`` that carries matching meta tags.

        :param document: parsed tree exposing ``xpath()``.
        :param base_url: accepted for API symmetry; not used here.
        :returns: generator of dicts with a ``namespace`` mapping
            (prefix -> IRI) and a ``properties`` list of
            ``(name, content)`` tuples in document order.
        """
        # TwitterCard defines a web page as a single rich object.
        for head in document.xpath('//head'):
            # Resolve the parent of *this* head instead of going through
            # ``document.head``, which always points at the first head and
            # only exists on lxml.html elements.
            html_elems = head.xpath('parent::html')
            namespaces = (
                self.get_namespaces(html_elems[0]) if html_elems else {}
            )
            # Prefixes declared on <head> override those on <html>.
            namespaces.update(self.get_namespaces(head))
            props = []
            for el in head.xpath('meta[@name and @content]'):
                prop = el.attrib['name']
                val = el.attrib['content']
                ns = prop.partition(':')[0]
                if ns in _TW_NAMESPACES:
                    # Well-known prefix: register it even if undeclared.
                    namespaces[ns] = _TW_NAMESPACES[ns]
                if ns in namespaces:
                    props.append((prop, val))
            if props:
                yield {'namespace': namespaces, 'properties': props}

    def get_namespaces(self, element):
        """Return the prefix -> IRI mapping declared on *element*.

        Reads the element's ``prefix`` attribute; a missing or empty
        attribute gives an empty dict.
        """
        return dict(
            _PREFIX_PATTERN.findall(element.attrib.get('prefix', ''))
        )

Chess.com - Play Chess Online - Free Games

+ + + \ No newline at end of file diff --git a/tests/samples/misc/twittercard_chess_test.json b/tests/samples/misc/twittercard_chess_test.json new file mode 100644 index 00000000..a7801a6c --- /dev/null +++ b/tests/samples/misc/twittercard_chess_test.json @@ -0,0 +1,29 @@ +[ + { + "namespace": { + "twitter": "https://dev.twitter.com/cards#" + }, + "properties": [ + [ + "twitter:title", + "Chess.com - Play Chess Online - Free Games" + ], + [ + "twitter:card", + "summary_large_image" + ], + [ + "twitter:site", + "@chesscom" + ], + [ + "twitter:description", + "Play chess online for free on Chess.com with over 50 million members from around the world. Have fun playing with friends or challenging the computer!" + ], + [ + "twitter:image", + "https://www.chess.com/bundles/web/images/social/share-home.a3e2cdbb.png" + ] + ] + } +] \ No newline at end of file diff --git a/tests/samples/misc/twittercard_optimizesmart_test.html b/tests/samples/misc/twittercard_optimizesmart_test.html new file mode 100644 index 00000000..c9deaceb --- /dev/null +++ b/tests/samples/misc/twittercard_optimizesmart_test.html @@ -0,0 +1,28 @@ + + + + + + + + + Open Graph Protocol for Facebook Explained with Examples - Optimize Smart + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/samples/misc/twittercard_optimizesmart_test.json b/tests/samples/misc/twittercard_optimizesmart_test.json new file mode 100644 index 00000000..172d3ac8 --- /dev/null +++ b/tests/samples/misc/twittercard_optimizesmart_test.json @@ -0,0 +1,49 @@ +[ + { + "namespace": { + "twitter": "https://dev.twitter.com/cards#" + }, + "properties": [ + [ + "twitter:card", + "summary_large_image" + ], + [ + "twitter:title", + "#Open #Graph #Protocol for #Facebook explained with examples" + ], + [ + "twitter:description", + "What is Open Graph Protocol and why you need it? Learn to implement Open Graph Protocol for Facebook on your website. Open Graph Protocol Meta Tags." 
+ ], + [ + "twitter:site", + "@optimizesmart" + ], + [ + "twitter:creator", + "@optimizesmart" + ], + [ + "twitter:image", + "https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg" + ], + [ + "twitter:label1", + "Written by" + ], + [ + "twitter:data1", + "Himanshu" + ], + [ + "twitter:label2", + "Time to read" + ], + [ + "twitter:data2", + "13 minutes" + ] + ] + } +] \ No newline at end of file diff --git a/tests/samples/misc/twittercard_spinneyslebanon_test.html b/tests/samples/misc/twittercard_spinneyslebanon_test.html new file mode 100644 index 00000000..eb2b6114 --- /dev/null +++ b/tests/samples/misc/twittercard_spinneyslebanon_test.html @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/samples/misc/twittercard_spinneyslebanon_test.json b/tests/samples/misc/twittercard_spinneyslebanon_test.json new file mode 100644 index 00000000..d2e1940d --- /dev/null +++ b/tests/samples/misc/twittercard_spinneyslebanon_test.json @@ -0,0 +1,81 @@ +[ + { + "namespace": { + "twitter": "https://dev.twitter.com/cards#" + }, + "properties": [ + [ + "twitter:card", + "summary_large_image" + ], + [ + "twitter:site", + "@spinneyslebanon" + ], + [ + "twitter:creator", + "@spinneyslebanon" + ], + [ + "twitter:title", + "Mevgal Bio Feta Cheese 200g | Chilled & Deli | Spinneys Lebanon" + ], + [ + "twitter:url", + "https://www.spinneyslebanon.com/mevgal-bio-feta-cheese-200g.html" + ], + [ + "twitter:description", + "This bio feta cheese is smooth and creamy and will make a great addition to your salads or cheese dips." 
# -*- coding: utf-8 -*-
"""Sample-based tests for :class:`extruct.twittercard.TwitterCardExtractor`."""
import json
import unittest

from extruct.twittercard import TwitterCardExtractor
from tests import get_testdata, jsonize_dict


class TestTwittercard(unittest.TestCase):
    """Compare extractor output with the expected JSON of each sample."""

    # Show the complete diff when the (large) structures differ.
    maxDiff = None

    def _test_twittercard(self, name):
        """Run the extractor on ``<name>.html`` and compare with ``<name>.json``.

        Both files are fetched through ``get_testdata('misc', ...)``; the
        JSON payload is decoded as UTF-8 before parsing.
        """
        body = get_testdata('misc', name + '.html')
        expected = json.loads(
            get_testdata('misc', name + '.json').decode('UTF-8'))

        twittercard = TwitterCardExtractor()
        data = twittercard.extract(body)
        self.assertEqual(jsonize_dict(data), expected)

    # Method names must start with "test": unittest's loader ignores any
    # other name, so the former ``twittercard_*_test`` methods never ran.
    def test_twittercard_spinneyslebanon(self):
        self._test_twittercard('twittercard_spinneyslebanon_test')

    def test_twittercard_optimizesmart(self):
        self._test_twittercard('twittercard_optimizesmart_test')

    def test_twittercard_chess(self):
        self._test_twittercard('twittercard_chess_test')