diff --git a/.gitignore b/.gitignore
index 9e040fff..ef4884db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
/build/
/dist/
*.egg-info
+venv/
# Mac OS
*.DS_Store
diff --git a/extruct/__init__.py b/extruct/__init__.py
index a9363b55..14a63f59 100644
--- a/extruct/__init__.py
+++ b/extruct/__init__.py
@@ -4,4 +4,5 @@
from .w3cmicrodata import MicrodataExtractor
from .opengraph import OpenGraphExtractor
from .microformat import MicroformatExtractor
+from .twittercard import TwitterCardExtractor
from .xmldom import XmlDomHTMLParser
diff --git a/extruct/_extruct.py b/extruct/_extruct.py
index 5bc247f8..24cdd121 100644
--- a/extruct/_extruct.py
+++ b/extruct/_extruct.py
@@ -3,6 +3,7 @@
from extruct.jsonld import JsonLdExtractor
from extruct.rdfa import RDFaExtractor
+from extruct.twittercard import TwitterCardExtractor
from extruct.w3cmicrodata import MicrodataExtractor
from extruct.opengraph import OpenGraphExtractor
from extruct.microformat import MicroformatExtractor
@@ -11,7 +12,7 @@
from extruct.utils import parse_xmldom_html
logger = logging.getLogger(__name__)
-SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore']
+SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore', 'twittercard']
def extract(htmlstring,
@@ -102,6 +103,11 @@ def extract(htmlstring,
('dublincore', DublinCoreExtractor().extract_items,
tree,
))
+ if 'twittercard' in syntaxes:
+ processors.append(
+ ('twittercard', TwitterCardExtractor().extract_items,
+ tree,
+ ))
output = {}
for syntax, extract, document in processors:
try:
@@ -162,7 +168,7 @@ def extract(htmlstring,
logger.exception(
'Failed to uniform extracted for {}, raises {}'
.format(syntax, e)
- )
+ )
if errors == 'strict':
raise
diff --git a/extruct/twittercard.py b/extruct/twittercard.py
new file mode 100644
index 00000000..51a9a3a4
--- /dev/null
+++ b/extruct/twittercard.py
@@ -0,0 +1,99 @@
+"""Twitter Card metadata extractor built on ``<meta name=...>`` tags."""
+
+import re
+
+from extruct.utils import parse_html
+
+
+# One ``prefix: IRI`` pair from an RDFa-style ``prefix`` attribute, e.g.
+# ``prefix="og: http://ogp.me/ns#"``. The pattern must expose exactly two
+# groups: get_namespaces() passes the findall() result straight to dict().
+_PREFIX_PATTERN = re.compile(r'\s*(\w+):\s*([^\s]+)')
+
+# Prefixes recognised without an explicit declaration in the page.
+_TW_NAMESPACES = {
+    'twitter': 'https://dev.twitter.com/cards#',
+    'owl': 'http://www.w3.org/2002/07/owl#',
+    'gr': 'http://purl.org/goodrelations/v1#',
+    'ctag': 'http://commontag.org/ns#',
+    'cc': 'http://creativecommons.org/ns#',
+    'grddl': 'http://www.w3.org/2003/g/data-view#',
+    'rif': 'http://www.w3.org/2007/rif#',
+    'sioc': 'http://rdfs.org/sioc/ns#',
+    'skos': 'http://www.w3.org/2004/02/skos/core#',
+    'xml': 'http://www.w3.org/XML/1998/namespace',
+    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
+    'rev': 'http://purl.org/stuff/rev#',
+    'rdfa': 'http://www.w3.org/ns/rdfa#',
+    'dc': 'http://purl.org/dc/terms/',
+    'foaf': 'http://xmlns.com/foaf/0.1/',
+    'void': 'http://rdfs.org/ns/void#',
+    'ical': 'http://www.w3.org/2002/12/cal/icaltzd#',
+    'vcard': 'http://www.w3.org/2006/vcard/ns#',
+    'wdrs': 'http://www.w3.org/2007/05/powder-s#',
+    'og': 'http://ogp.me/ns#',
+    'wdr': 'http://www.w3.org/2007/05/powder#',
+    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
+    'xhv': 'http://www.w3.org/1999/xhtml/vocab#',
+    'xsd': 'http://www.w3.org/2001/XMLSchema#',
+    'v': 'http://rdf.data-vocabulary.org/#',
+    'skosxl': 'http://www.w3.org/2008/05/skos-xl#',
+    'schema': 'http://schema.org/',
+}
+
+
+class TwitterCardExtractor(object):
+    """TwitterCard extractor following extruct API.
+
+    Properties are read from ``<meta name="..." content="...">`` elements that
+    are direct children of a ``<head>``. A property is kept only when the part
+    of its name before the first ``:`` is a known or page-declared prefix.
+    """
+
+    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
+        """Parse ``htmlstring`` and return the extracted items as a list.
+
+        ``encoding`` is handed to ``parse_html``; ``base_url`` is forwarded to
+        ``extract_items``.
+        """
+        tree = parse_html(htmlstring, encoding=encoding)
+        return list(self.extract_items(tree, base_url=base_url))
+
+    def extract_items(self, document, base_url=None):
+        """Yield one item per ``<head>`` that has at least one property.
+
+        Each item is a dict with two keys: ``namespace`` maps every prefix
+        in effect to its IRI, and ``properties`` is the list of
+        ``(name, content)`` tuples in the order they are found. ``base_url``
+        is accepted but not used here.
+        """
+        # TwitterCard defines a web page as a single rich object.
+        for head in document.xpath('//head'):
+            # Prefixes may be declared on <html> and on <head>; resolve them
+            # against the <head> being processed, with <head> winning.
+            html_elems = head.xpath("parent::html")
+            namespaces = {}
+            if html_elems:
+                namespaces.update(self.get_namespaces(html_elems[0]))
+            namespaces.update(self.get_namespaces(head))
+            props = []
+            for el in head.xpath('meta[@name and @content]'):
+                prop = el.attrib['name']
+                val = el.attrib['content']
+                ns = prop.partition(':')[0]
+                if ns in _TW_NAMESPACES:
+                    # Built-in prefixes apply even if the page declares none.
+                    namespaces[ns] = _TW_NAMESPACES[ns]
+                if ns in namespaces:
+                    props.append((prop, val))
+            if props:
+                yield {'namespace': namespaces, 'properties': props}
+
+    def get_namespaces(self, element):
+        """Return the ``{prefix: IRI}`` pairs from ``element``'s ``prefix``.
+
+        A missing attribute yields an empty dict.
+        """
+        return dict(
+            _PREFIX_PATTERN.findall(element.attrib.get('prefix', ''))
+        )
diff --git a/tests/samples/misc/twittercard_chess_test.html b/tests/samples/misc/twittercard_chess_test.html
new file mode 100644
index 00000000..d3afbaaf
--- /dev/null
+++ b/tests/samples/misc/twittercard_chess_test.html
@@ -0,0 +1,23 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Chess.com - Play Chess Online - Free Games
+
+
+
\ No newline at end of file
diff --git a/tests/samples/misc/twittercard_chess_test.json b/tests/samples/misc/twittercard_chess_test.json
new file mode 100644
index 00000000..a7801a6c
--- /dev/null
+++ b/tests/samples/misc/twittercard_chess_test.json
@@ -0,0 +1,29 @@
+[
+ {
+ "namespace": {
+ "twitter": "https://dev.twitter.com/cards#"
+ },
+ "properties": [
+ [
+ "twitter:title",
+ "Chess.com - Play Chess Online - Free Games"
+ ],
+ [
+ "twitter:card",
+ "summary_large_image"
+ ],
+ [
+ "twitter:site",
+ "@chesscom"
+ ],
+ [
+ "twitter:description",
+ "Play chess online for free on Chess.com with over 50 million members from around the world. Have fun playing with friends or challenging the computer!"
+ ],
+ [
+ "twitter:image",
+ "https://www.chess.com/bundles/web/images/social/share-home.a3e2cdbb.png"
+ ]
+ ]
+ }
+]
\ No newline at end of file
diff --git a/tests/samples/misc/twittercard_optimizesmart_test.html b/tests/samples/misc/twittercard_optimizesmart_test.html
new file mode 100644
index 00000000..c9deaceb
--- /dev/null
+++ b/tests/samples/misc/twittercard_optimizesmart_test.html
@@ -0,0 +1,28 @@
+
+
+
+
+
+
+
+
+ Open Graph Protocol for Facebook Explained with Examples - Optimize Smart
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/samples/misc/twittercard_optimizesmart_test.json b/tests/samples/misc/twittercard_optimizesmart_test.json
new file mode 100644
index 00000000..172d3ac8
--- /dev/null
+++ b/tests/samples/misc/twittercard_optimizesmart_test.json
@@ -0,0 +1,49 @@
+[
+ {
+ "namespace": {
+ "twitter": "https://dev.twitter.com/cards#"
+ },
+ "properties": [
+ [
+ "twitter:card",
+ "summary_large_image"
+ ],
+ [
+ "twitter:title",
+ "#Open #Graph #Protocol for #Facebook explained with examples"
+ ],
+ [
+ "twitter:description",
+ "What is Open Graph Protocol and why you need it? Learn to implement Open Graph Protocol for Facebook on your website. Open Graph Protocol Meta Tags."
+ ],
+ [
+ "twitter:site",
+ "@optimizesmart"
+ ],
+ [
+ "twitter:creator",
+ "@optimizesmart"
+ ],
+ [
+ "twitter:image",
+ "https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg"
+ ],
+ [
+ "twitter:label1",
+ "Written by"
+ ],
+ [
+ "twitter:data1",
+ "Himanshu"
+ ],
+ [
+ "twitter:label2",
+ "Time to read"
+ ],
+ [
+ "twitter:data2",
+ "13 minutes"
+ ]
+ ]
+ }
+]
\ No newline at end of file
diff --git a/tests/samples/misc/twittercard_spinneyslebanon_test.html b/tests/samples/misc/twittercard_spinneyslebanon_test.html
new file mode 100644
index 00000000..eb2b6114
--- /dev/null
+++ b/tests/samples/misc/twittercard_spinneyslebanon_test.html
@@ -0,0 +1,34 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/samples/misc/twittercard_spinneyslebanon_test.json b/tests/samples/misc/twittercard_spinneyslebanon_test.json
new file mode 100644
index 00000000..d2e1940d
--- /dev/null
+++ b/tests/samples/misc/twittercard_spinneyslebanon_test.json
@@ -0,0 +1,81 @@
+[
+ {
+ "namespace": {
+ "twitter": "https://dev.twitter.com/cards#"
+ },
+ "properties": [
+ [
+ "twitter:card",
+ "summary_large_image"
+ ],
+ [
+ "twitter:site",
+ "@spinneyslebanon"
+ ],
+ [
+ "twitter:creator",
+ "@spinneyslebanon"
+ ],
+ [
+ "twitter:title",
+ "Mevgal Bio Feta Cheese 200g | Chilled & Deli | Spinneys Lebanon"
+ ],
+ [
+ "twitter:url",
+ "https://www.spinneyslebanon.com/mevgal-bio-feta-cheese-200g.html"
+ ],
+ [
+ "twitter:description",
+ "This bio feta cheese is smooth and creamy and will make a great addition to your salads or cheese dips."
+ ],
+ [
+ "twitter:image",
+ "https://d145dj1pf6foch.cloudfront.net/catalog/product/cache/96439f90b2da9c0500e3c88801966a5f/4/8/489874-v001-1_1.jpg"
+ ],
+ [
+ "twitter:card",
+ "summary_large_image"
+ ],
+ [
+ "twitter:domain",
+ "https://www.spinneyslebanon.com/"
+ ],
+ [
+ "twitter:site",
+ "@https://twitter.com/spinneyslebanon"
+ ],
+ [
+ "twitter:creator",
+ "@spinneyslebanon"
+ ],
+ [
+ "twitter:title",
+ "Mevgal Bio Feta Cheese 200g "
+ ],
+ [
+ "twitter:description",
+ "This bio feta cheese is smooth and creamy and will make a great addition to your salads or cheese dips. "
+ ],
+ [
+ "twitter:image",
+ "https://d145dj1pf6foch.cloudfront.net/catalog/product/cache/1dfbce20f903bb776ac5277b1934e5b0/4/8/489874-v001-1_1.jpg"
+ ],
+ [
+ "twitter:data1",
+ "LBP139999.00"
+ ],
+ [
+ "twitter:label1",
+ "PRICE"
+ ],
+ [
+ "twitter:data2",
+ "LB"
+ ],
+ [
+ "twitter:label2",
+ "LOCATION"
+ ]
+ ]
+ }
+]
\ No newline at end of file
diff --git a/tests/test_twittercard.py b/tests/test_twittercard.py
new file mode 100644
index 00000000..0fa11426
--- /dev/null
+++ b/tests/test_twittercard.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+import json
+import unittest
+
+from extruct.twittercard import TwitterCardExtractor
+from tests import get_testdata, jsonize_dict
+
+
+class TestTwittercard(unittest.TestCase):
+    """Compare TwitterCardExtractor output with the stored JSON samples."""
+
+    # Show complete diffs when an assertion fails.
+    maxDiff = None
+
+    def _test_twittercard(self, name):
+        """Check ``<name>.html`` extraction against ``<name>.json``."""
+        body = get_testdata('misc', name + '.html')
+        expected = json.loads(
+            get_testdata('misc', name + '.json').decode('UTF-8'))
+
+        twittercard = TwitterCardExtractor()
+        data = twittercard.extract(body)
+        self.assertEqual(jsonize_dict(data), expected)
+
+    # Method names must start with ``test``; otherwise the unittest loader
+    # (and pytest) silently skips them and the samples are never checked.
+    def test_twittercard_spinneyslebanon(self):
+        self._test_twittercard('twittercard_spinneyslebanon_test')
+
+    def test_twittercard_optimizesmart(self):
+        self._test_twittercard('twittercard_optimizesmart_test')
+
+    def test_twittercard_chess(self):
+        self._test_twittercard('twittercard_chess_test')