Skip to content

Commit

Permalink
Fix case regressions and issues (#96)
Browse files Browse the repository at this point in the history
Tag names should be case sensitive for XML. XHTML should be treated like
XML not HTML.
  • Loading branch information
facelessuser authored Jan 23, 2019
1 parent cdeaa6a commit 0f489f4
Show file tree
Hide file tree
Showing 8 changed files with 213 additions and 35 deletions.
6 changes: 6 additions & 0 deletions docs/src/markdown/about/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## 1.7.3

- **FIX**: Fix regression in the case sensitivity of tag names, and ensure there are tests to prevent breakage
in the future.
- **FIX**: XHTML should always be case sensitive like XML.

## 1.7.2

- **FIX**: Fix HTML detection `type` selector.
Expand Down
9 changes: 4 additions & 5 deletions docs/src/markdown/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,10 @@ When detecting XHTML, Soup Sieve simply looks to see if the root element of an X
and does not currently look at the `doctype`. If in the future there is a need for stricter XHTML detection, this may
change.

- All HTML document types (HTML, HTML5, and XHTML) will have their tag names and attribute names treated without case
sensitivity, like most browsers do. Though XHTML is XML, which traditionally is case sensitive, it will still be treated
like HTML in this respect.
- HTML document types (HTML, HTML5) will have their tag names and attribute names treated without case
sensitivity, like most browsers do.

- XML document types (not including XHTML) will have their tag names and attribute names treated with case sensitivity.
- XML document types (including XHTML) will have their tag names and attribute names treated with case sensitivity.

- HTML5, XHTML and XML documents will have namespaces evaluated per the document's support (provided via the
parser). Some additional configuration is required when using namespaces, see [Namespace](#namespaces) for more
Expand All @@ -27,7 +26,7 @@ information.
For XML, the `lxml-xml` parser (`xml` for short) will provide proper namespaces. It is generally suggested that
`lxml-xml` is used to parse XHTML documents to take advantage of namespaces.

- While attribute values are generally treated as case sensitive, HTML5, XHTML, and HTML treat the `type` attribute
- While attribute values are generally treated as case sensitive, HTML5 and HTML treat the `type` attribute
specially. The `type` attribute's value is always case insensitive. This is generally how most browsers treat `type`. If
you need `type` to be case sensitive, you can use the `s` flag: `#!css [type="submit" s]`.

Expand Down
2 changes: 1 addition & 1 deletion soupsieve/__meta__.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,5 +186,5 @@ def parse_version(ver, pre=False):
return Version(major, minor, micro, release, pre, post, dev)


__version_info__ = Version(1, 7, 2, "final")
__version_info__ = Version(1, 7, 3, "final")
__version__ = __version_info__._get_canonical()
52 changes: 24 additions & 28 deletions soupsieve/css_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
REL_HAS_CLOSE_SIBLING = ':+'

NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE
Expand Down Expand Up @@ -244,11 +245,11 @@ def split_namespace(el, attr_name):
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

@staticmethod
def get_attribute_by_name(el, name, default=None, is_xml=False):
def get_attribute_by_name(el, name, default=None):
"""Get attribute by name."""

value = default
if is_xml:
if el._is_xml:
try:
value = el.attrs[name]
except KeyError:
Expand All @@ -268,10 +269,10 @@ def iter_attributes(el):
yield k, v

@classmethod
def get_classes(cls, el, is_xml=False):
def get_classes(cls, el):
"""Get classes."""

classes = cls.get_attribute_by_name(el, 'class', [], is_xml)
classes = cls.get_attribute_by_name(el, 'class', [])
if isinstance(classes, util.ustr):
classes = RE_NOT_WS.findall(classes)
return classes
Expand Down Expand Up @@ -413,7 +414,7 @@ def __init__(self, selectors, scope, namespaces, flags):
self.root = root
self.scope = scope if scope is not doc else root
self.html_namespace = self.is_html_ns(root)
self.is_xml = self.is_xml_tree(doc) and not self.html_namespace
self.is_xml = self.is_xml_tree(doc)

def supports_namespaces(self):
"""Check if namespaces are supported in the HTML type."""
Expand All @@ -424,13 +425,13 @@ def get_tag(self, el):
"""Get tag."""

name = self.get_tag_name(el)
return util.lower(name) if name is not None and self.is_xml else name
return util.lower(name) if name is not None and not self.is_xml else name

def get_prefix(self, el):
"""Get prefix."""

prefix = self.get_prefix_name(el)
return util.lower(prefix) if prefix is not None and self.is_xml else prefix
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix

def find_bidi(self, el):
"""Get directionality from element text."""
Expand Down Expand Up @@ -501,14 +502,10 @@ def match_attribute_name(self, el, attr, prefix):
if namespace is None or ns != namespace and prefix != '*':
continue

if self.is_xml:
# The attribute doesn't match.
if attr != name:
continue
else:
# The attribute doesn't match.
if util.lower(attr) != util.lower(name):
continue
# The attribute doesn't match.
if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
continue

value = v
break
else:
Expand Down Expand Up @@ -563,9 +560,10 @@ def match_attributes(self, el, attributes):
def match_tagname(self, el, tag):
"""Match tag name."""

name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
return not (
tag.name and
tag.name not in (self.get_tag(el), '*')
name is not None and
name not in (self.get_tag(el), '*')
)

def match_tag(self, el, tag):
Expand Down Expand Up @@ -652,15 +650,15 @@ def match_id(self, el, ids):

found = True
for i in ids:
if i != self.get_attribute_by_name(el, 'id', '', self.is_xml):
if i != self.get_attribute_by_name(el, 'id', ''):
found = False
break
return found

def match_classes(self, el, classes):
"""Match element's classes."""

current_classes = self.get_classes(el, self.is_xml)
current_classes = self.get_classes(el)
found = True
for c in classes:
if c not in current_classes:
Expand Down Expand Up @@ -929,15 +927,14 @@ def match_lang(self, el, langs):
parent = el
found_lang = None
while parent and self.get_parent(parent) and not found_lang:
ns = self.is_html_ns(parent)
is_html_ns = self.is_html_ns(parent)
for k, v in self.iter_attributes(parent):
attr_ns, attr = self.split_namespace(parent, k)
if (
(self.is_xml and k == 'xml:lang') or
((not has_ns or is_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
(
not self.is_xml and (
((not has_ns or ns) and util.lower(k) == 'lang') or
(has_ns and not ns and util.lower(k) == 'xml:lang')
)
has_ns and not is_html_ns and attr_ns == NS_XML and
(util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
)
):
found_lang = v
Expand All @@ -949,7 +946,7 @@ def match_lang(self, el, langs):
found_lang = self.cached_meta_lang

# If we couldn't find a language, and the document is HTML, look to meta to determine language.
if found_lang is None and not self.is_xml:
if found_lang is None and (not self.is_xml or (self.html_namespace and self.root.name == 'html')):
# Find head
found = False
for tag in ('html', 'head'):
Expand Down Expand Up @@ -1116,7 +1113,6 @@ def match_defined(self, el):

name = self.get_tag(el)
return (
self.is_xml or
name.find('-') == -1 or
name.find(':') != -1 or
self.get_prefix(el) is not None
Expand All @@ -1128,7 +1124,7 @@ def match_selectors(self, el, selectors):
match = False
is_not = selectors.is_not
is_html = selectors.is_html
if not (is_html and self.is_xml):
if not (is_html and (self.is_xml and not self.html_namespace)):
for selector in selectors:
match = is_not
# We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
Expand Down
53 changes: 53 additions & 0 deletions tests/test_bs4_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,56 @@ def test_parent_nth_of_type(self):
els = sv.select('div:nth-of-type(1) > h1', self.soup)
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, 'An H1')


# Minimal XML document without namespaces; tag names use mixed case on purpose.
SIMPLE_XML = """<Envelope><Header>...</Header></Envelope>"""

# Namespaced XML sample; the surrounding blank lines are stripped so the
# XML declaration is the very first thing in the string.
NAMESPACE_XML = """
<?xml version="1.0"?>
<s:Envelope xmlns:s="http://www.w3.org/2003/05/soap-envelope" xmlns:a="http://www.w3.org/2005/08/addressing"
xmlns:u="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd">
<a:Action s:mustUnderstand="1">http://docs.oasis-open.org/ws-sx/ws-trust/200512/RST/Issue</a:Action>
<o:UsernameToken u:Id="uuid-00000043-0000-4000-0000-000000000000">
</s:Envelope>
""".strip()

# Selector prefixes (x, y, z) mapped to the namespace URIs that the document
# above declares under the prefixes s, a and u respectively.
NAMESPACES = {
    "x": "http://www.w3.org/2003/05/soap-envelope",
    "y": "http://www.w3.org/2005/08/addressing",
    "z": "http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd",
}


def test_simple_xml():
    """Check case-sensitive tag selection on a plain XML document."""
    soup = BeautifulSoup(SIMPLE_XML, "xml")

    # Selectors spelled exactly like the document's tags must find an element.
    for selector in ("Envelope", "Envelope Header", "Header"):
        assert soup.select_one(selector)

    # Lower-cased spellings must find nothing: XML tag names are case sensitive.
    for selector in ("envelope", "envelope header", "header"):
        assert not soup.select_one(selector)


def test_namespace_xml():
    """Check case-sensitive tag selection on namespaced XML without prefixes."""
    soup = BeautifulSoup(NAMESPACE_XML, "xml")

    # Un-prefixed selectors with the document's exact casing must find an element.
    for selector in ("Envelope", "Envelope Action", "Action"):
        assert soup.select_one(selector)

    # The same selectors in lower case must find nothing.
    for selector in ("envelope", "envelope action", "action"):
        assert not soup.select_one(selector)


def test_namespace_xml_with_namespace():
    """Check case-sensitive tag selection when selectors carry namespace prefixes."""
    soup = BeautifulSoup(NAMESPACE_XML, "xml")

    # Prefixed selectors with the document's exact casing must find an element.
    for selector in ("x|Envelope", "x|Envelope y|Action", "y|Action"):
        assert soup.select_one(selector, namespaces=NAMESPACES)

    # Lower-casing the tag part of the selector must make the lookup fail.
    for selector in ("x|envelope", "x|envelope y|action", "y|action"):
        assert not soup.select_one(selector, namespaces=NAMESPACES)
102 changes: 102 additions & 0 deletions tests/test_level1.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,108 @@ def test_tag(self):
flags=util.HTML
)

def test_tag_html(self):
    """Verify that HTML tag selectors match regardless of letter case."""

    markup = """
<Tag id="1">
<tag id="2"></tag>
<TAG id="3"></TAG>
</Tag>
"""

    # Every casing of the selector is expected to yield the same three results.
    for selector in ("tag", "Tag", "TAG"):
        self.assert_selector(
            markup,
            selector,
            ["1", "2", "3"],
            flags=util.HTML
        )

def test_tag_xhtml(self):
    """Verify that XHTML tag selectors only match tags with identical letter case."""

    markup = """
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
</head>
<body>
<Tag id="1">
<tag id="2"></tag>
<TAG id="3"></TAG>
</Tag>
</body>
</html>
"""

    # Each selector casing is paired with the single expected result
    # (presumably the `id` of the matching element in the markup above).
    expectations = (
        ("tag", ["2"]),
        ("Tag", ["1"]),
        ("TAG", ["3"]),
    )
    for selector, expected in expectations:
        self.assert_selector(
            markup,
            selector,
            expected,
            flags=util.XHTML
        )

def test_tag_xml(self):
    """Verify that XML tag selectors only match tags with identical letter case."""

    markup = """
<Tag id="1">
<tag id="2"></tag>
<TAG id="3"></TAG>
</Tag>
"""

    # Each selector casing is paired with the single expected result
    # (presumably the `id` of the matching element in the markup above).
    expectations = (
        ("tag", ["2"]),
        ("Tag", ["1"]),
        ("TAG", ["3"]),
    )
    for selector, expected in expectations:
        self.assert_selector(
            markup,
            selector,
            expected,
            flags=util.XML
        )

def test_tags(self):
"""Test multiple selectors."""

Expand Down
2 changes: 1 addition & 1 deletion tests/test_level2.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ def test_attribute_type(self):
self.assert_selector(
markup,
'[type="test"]',
["0", '2'],
['2'],
flags=util.XML
)

Expand Down
Loading

0 comments on commit 0f489f4

Please sign in to comment.