Fix case regressions and issues (#96)

Tag names should be case sensitive for XML. XHTML should be treated like XML, not HTML.
facelessuser · Jan 23, 2019 · 0f489f4 · 0f489f4
1 parent cdeaa6a
commit 0f489f4
Show file tree

Hide file tree

Showing 8 changed files with 213 additions and 35 deletions.
diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## 1.7.3
+
+- **FIX**: Fix a regression in the case sensitivity of tag names, and ensure there are tests to prevent breakage
+  in the future.
+- **FIX**: XHTML should always be case sensitive like XML.
+
 ## 1.7.2
 
 - **FIX**: Fix HTML detection `type` selector.

diff --git a/docs/src/markdown/api.md b/docs/src/markdown/api.md
@@ -10,11 +10,10 @@ When detecting XHTML, Soup Sieve simply looks to see if the root element of an X
 and does not currently look at the `doctype`. If in the future there is a need for stricter XHTML detection, this may
 change.
 
-- All HTML document types (HTML, HTML5, and XHTML) will have their tag names and attribute names treated without case
-sensitivity, like most browsers do. Though XHTML is XML, which traditionally is case sensitive, it will still be treated
-like HTML in this respect.
+- HTML document types (HTML, HTML5) will have their tag names and attribute names treated without case
+sensitivity, like most browsers do.
 
-- XML document types (not including XHTML) will have their tag names and attribute names treated with case sensitivity.
+- XML document types (including XHTML) will have their tag names and attribute names treated with case sensitivity.
 
 - HTML5, XHTML and XML documents will have namespaces evaluated per the document's support (provided via the
 parser). Some additional configuration is required when using namespaces, see [Namespace](#namespaces) for more
@@ -27,7 +26,7 @@ information.
         For XML, the `lxml-xml` parser (`xml` for short) will provide proper namespaces. It is generally suggested that
         `lxml-xml` is used to parse XHTML documents to take advantage of namespaces.
 
-- While attribute values are generally treated as case sensitive, HTML5, XHTML, and HTML treat the `type` attribute
+- While attribute values are generally treated as case sensitive, HTML5 and HTML treat the `type` attribute
 specially. The `type` attribute's value is always case insensitive. This is generally how most browsers treat `type`. If
 you need `type` to be sensitive, you can use the `s` flag: `#!css [type="submit" s]`.
 

diff --git a/soupsieve/__meta__.py b/soupsieve/__meta__.py
@@ -186,5 +186,5 @@ def parse_version(ver, pre=False):
     return Version(major, minor, micro, release, pre, post, dev)
 
 
-__version_info__ = Version(1, 7, 2, "final")
+__version_info__ = Version(1, 7, 3, "final")
 __version__ = __version_info__._get_canonical()
diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py
@@ -24,6 +24,7 @@
 REL_HAS_CLOSE_SIBLING = ':+'
 
 NS_XHTML = 'http://www.w3.org/1999/xhtml'
+NS_XML = 'http://www.w3.org/XML/1998/namespace'
 
 DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
 RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE
@@ -244,11 +245,11 @@ def split_namespace(el, attr_name):
         return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
 
     @staticmethod
-    def get_attribute_by_name(el, name, default=None, is_xml=False):
+    def get_attribute_by_name(el, name, default=None):
         """Get attribute by name."""
 
         value = default
-        if is_xml:
+        if el._is_xml:
             try:
                 value = el.attrs[name]
             except KeyError:
@@ -268,10 +269,10 @@ def iter_attributes(el):
             yield k, v
 
     @classmethod
-    def get_classes(cls, el, is_xml=False):
+    def get_classes(cls, el):
         """Get classes."""
 
-        classes = cls.get_attribute_by_name(el, 'class', [], is_xml)
+        classes = cls.get_attribute_by_name(el, 'class', [])
         if isinstance(classes, util.ustr):
             classes = RE_NOT_WS.findall(classes)
         return classes
@@ -413,7 +414,7 @@ def __init__(self, selectors, scope, namespaces, flags):
         self.root = root
         self.scope = scope if scope is not doc else root
         self.html_namespace = self.is_html_ns(root)
-        self.is_xml = self.is_xml_tree(doc) and not self.html_namespace
+        self.is_xml = self.is_xml_tree(doc)
 
     def supports_namespaces(self):
         """Check if namespaces are supported in the HTML type."""
@@ -424,13 +425,13 @@ def get_tag(self, el):
         """Get tag."""
 
         name = self.get_tag_name(el)
-        return util.lower(name) if name is not None and self.is_xml else name
+        return util.lower(name) if name is not None and not self.is_xml else name
 
     def get_prefix(self, el):
         """Get prefix."""
 
         prefix = self.get_prefix_name(el)
-        return util.lower(prefix) if prefix is not None and self.is_xml else prefix
+        return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
 
     def find_bidi(self, el):
         """Get directionality from element text."""
@@ -501,14 +502,10 @@ def match_attribute_name(self, el, attr, prefix):
                 if namespace is None or ns != namespace and prefix != '*':
                     continue
 
-                if self.is_xml:
-                    # The attribute doesn't match.
-                    if attr != name:
-                        continue
-                else:
-                    # The attribute doesn't match.
-                    if util.lower(attr) != util.lower(name):
-                        continue
+                # The attribute doesn't match.
+                if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
+                    continue
+
                 value = v
                 break
         else:
@@ -563,9 +560,10 @@ def match_attributes(self, el, attributes):
     def match_tagname(self, el, tag):
         """Match tag name."""
 
+        name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
         return not (
-            tag.name and
-            tag.name not in (self.get_tag(el), '*')
+            name is not None and
+            name not in (self.get_tag(el), '*')
         )
 
     def match_tag(self, el, tag):
@@ -652,15 +650,15 @@ def match_id(self, el, ids):
 
         found = True
         for i in ids:
-            if i != self.get_attribute_by_name(el, 'id', '', self.is_xml):
+            if i != self.get_attribute_by_name(el, 'id', ''):
                 found = False
                 break
         return found
 
     def match_classes(self, el, classes):
         """Match element's classes."""
 
-        current_classes = self.get_classes(el, self.is_xml)
+        current_classes = self.get_classes(el)
         found = True
         for c in classes:
             if c not in current_classes:
@@ -929,15 +927,14 @@ def match_lang(self, el, langs):
         parent = el
         found_lang = None
         while parent and self.get_parent(parent) and not found_lang:
-            ns = self.is_html_ns(parent)
+            is_html_ns = self.is_html_ns(parent)
             for k, v in self.iter_attributes(parent):
+                attr_ns, attr = self.split_namespace(parent, k)
                 if (
-                    (self.is_xml and k == 'xml:lang') or
+                    ((not has_ns or is_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                     (
-                        not self.is_xml and (
-                            ((not has_ns or ns) and util.lower(k) == 'lang') or
-                            (has_ns and not ns and util.lower(k) == 'xml:lang')
-                        )
+                        has_ns and not is_html_ns and attr_ns == NS_XML and
+                        (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                     )
                 ):
                     found_lang = v
@@ -949,7 +946,7 @@ def match_lang(self, el, langs):
             found_lang = self.cached_meta_lang
 
         # If we couldn't find a language, and the document is HTML, look to meta to determine language.
-        if found_lang is None and not self.is_xml:
+        if found_lang is None and (not self.is_xml or (self.html_namespace and self.root.name == 'html')):
             # Find head
             found = False
             for tag in ('html', 'head'):
@@ -1116,7 +1113,6 @@ def match_defined(self, el):
 
         name = self.get_tag(el)
         return (
-            self.is_xml or
             name.find('-') == -1 or
             name.find(':') != -1 or
             self.get_prefix(el) is not None
@@ -1128,7 +1124,7 @@ def match_selectors(self, el, selectors):
         match = False
         is_not = selectors.is_not
         is_html = selectors.is_html
-        if not (is_html and self.is_xml):
+        if not (is_html and (self.is_xml and not self.html_namespace)):
             for selector in selectors:
                 match = is_not
                 # We have an un-matchable situation (like `:focus`, as you can't focus an element in this environment)

diff --git a/tests/test_bs4_cases.py b/tests/test_bs4_cases.py
@@ -89,3 +89,56 @@ def test_parent_nth_of_type(self):
         els = sv.select('div:nth-of-type(1) > h1', self.soup)
         self.assertEqual(len(els), 1)
         self.assertEqual(els[0].string, 'An H1')
+
+
+SIMPLE_XML = """<Envelope><Header>...</Header></Envelope>"""
+NAMESPACE_XML = """
+<?xml version="1.0"?>
+<s:Envelope xmlns:s="http://www.w3.org/2003/05/soap-envelope" xmlns:a="http://www.w3.org/2005/08/addressing"
+            xmlns:u="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd">
+  <a:Action s:mustUnderstand="1">http://docs.oasis-open.org/ws-sx/ws-trust/200512/RST/Issue</a:Action>
+  <o:UsernameToken u:Id="uuid-00000043-0000-4000-0000-000000000000">
+</s:Envelope>
+""".strip()
+NAMESPACES = dict(x="http://www.w3.org/2003/05/soap-envelope",
+                  y="http://www.w3.org/2005/08/addressing",
+                  z="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd")
+
+
+def test_simple_xml():
+    """Test basic XML."""
+    xml = BeautifulSoup(SIMPLE_XML, "xml")
+
+    assert xml.select_one("Envelope")
+    assert xml.select_one("Envelope Header")
+    assert xml.select_one("Header")
+
+    assert not xml.select_one("envelope")
+    assert not xml.select_one("envelope header")
+    assert not xml.select_one("header")
+
+
+def test_namespace_xml():
+    """Test namespace XML."""
+    xml = BeautifulSoup(NAMESPACE_XML, "xml")
+
+    assert xml.select_one("Envelope")
+    assert xml.select_one("Envelope Action")
+    assert xml.select_one("Action")
+
+    assert not xml.select_one("envelope")
+    assert not xml.select_one("envelope action")
+    assert not xml.select_one("action")
+
+
+def test_namespace_xml_with_namespace():
+    """Test namespace selectors with XML."""
+    xml = BeautifulSoup(NAMESPACE_XML, "xml")
+
+    assert xml.select_one("x|Envelope", namespaces=NAMESPACES)
+    assert xml.select_one("x|Envelope y|Action", namespaces=NAMESPACES)
+    assert xml.select_one("y|Action", namespaces=NAMESPACES)
+
+    assert not xml.select_one("x|envelope", namespaces=NAMESPACES)
+    assert not xml.select_one("x|envelope y|action", namespaces=NAMESPACES)
+    assert not xml.select_one("y|action", namespaces=NAMESPACES)
diff --git a/tests/test_level1.py b/tests/test_level1.py
@@ -43,6 +43,108 @@ def test_tag(self):
             flags=util.HTML
         )
 
+    def test_tag_html(self):
+        """Test tag for HTML."""
+
+        markup = """
+        <Tag id="1">
+        <tag id="2"></tag>
+        <TAG id="3"></TAG>
+        </Tag>
+        """
+
+        self.assert_selector(
+            markup,
+            "tag",
+            ["1", "2", "3"],
+            flags=util.HTML
+        )
+
+        self.assert_selector(
+            markup,
+            "Tag",
+            ["1", "2", "3"],
+            flags=util.HTML
+        )
+
+        self.assert_selector(
+            markup,
+            "TAG",
+            ["1", "2", "3"],
+            flags=util.HTML
+        )
+
+    def test_tag_xhtml(self):
+        """Test tag for XHTML."""
+
+        markup = """
+        <?xml version="1.0" encoding="UTF-8"?>
+        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+            "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+        <html lang="en" xmlns="http://www.w3.org/1999/xhtml">
+        <head>
+        </head>
+        <body>
+        <Tag id="1">
+        <tag id="2"></tag>
+        <TAG id="3"></TAG>
+        </Tag>
+        </body>
+        </html>
+        """
+
+        self.assert_selector(
+            markup,
+            "tag",
+            ["2"],
+            flags=util.XHTML
+        )
+
+        self.assert_selector(
+            markup,
+            "Tag",
+            ["1"],
+            flags=util.XHTML
+        )
+
+        self.assert_selector(
+            markup,
+            "TAG",
+            ["3"],
+            flags=util.XHTML
+        )
+
+    def test_tag_xml(self):
+        """Test tag for XML."""
+
+        markup = """
+        <Tag id="1">
+        <tag id="2"></tag>
+        <TAG id="3"></TAG>
+        </Tag>
+        """
+
+        self.assert_selector(
+            markup,
+            "tag",
+            ["2"],
+            flags=util.XML
+        )
+
+        self.assert_selector(
+            markup,
+            "Tag",
+            ["1"],
+            flags=util.XML
+        )
+
+        self.assert_selector(
+            markup,
+            "TAG",
+            ["3"],
+            flags=util.XML
+        )
+
     def test_tags(self):
         """Test multiple selectors."""
 

diff --git a/tests/test_level2.py b/tests/test_level2.py
@@ -357,7 +357,7 @@ def test_attribute_type(self):
         self.assert_selector(
             markup,
             '[type="test"]',
-            ["0", '2'],
+            ['2'],
             flags=util.XML
         )