isValidUtf8 function, tests (#423)

* isValidUtf8 function, tests * Cleanup, change to forAll * Fix accidentally-correct generation of wrong test cases * Increase test count, fix segfault on Clang * Try fixing Windows * Eliminate block-checking for ASCII in fallback mode * Use GHC 8.10 for CI on Windows * Namespace externally-visible C function better * Link to libgcc on Windows * Fix error accumulation in AVX2 check * Fix use of <> on older GHCs * Repair Drone config to deal with older Cabal * Ensure non-SSE2 x86 can still build * License information and contact for C code * Fix test name typo, allow more than pre-set count of tests * Stop using MagicHash for FFI wrapper * Revert change to 8.10 for Windows CI * Use existing license verbatim for C code * Explain the use of gcc and gcc_s * More concise implementation of isValidUtf8 * Document AVX2 block-checking function
haskell · Nov 3, 2021 · dad6dd1 · dad6dd1
1 parent e4d6176
commit dad6dd1
Show file tree

Hide file tree

Showing 8 changed files with 1,214 additions and 8 deletions.
diff --git a/.drone.yml b/.drone.yml
@@ -29,7 +29,8 @@ steps:
     # of coming up with these automatically, but older ones need a hint.
     # Specifically, we need to inform Cabal that:
     #
-    # * optparse-applicative doesn't use process; and
-    # * QuickCheck uses the old random package.
-    - cabal install --dependencies-only --enable-tests --constraint 'optparse-applicative -process' --constraint 'QuickCheck +old-random'
+    # * optparse-applicative doesn't use process; 
+    # * QuickCheck uses the old random package; and
+    # * tasty doesn't use unix.
+    - cabal install --dependencies-only --enable-tests --constraint 'optparse-applicative -process' --constraint 'QuickCheck +old-random' --constraint 'tasty -unix'
     - cabal test
diff --git a/Changelog.md b/Changelog.md
@@ -20,6 +20,7 @@
 * [Use `unsafeWithForeignPtr` whenever possible](https://github.com/haskell/bytestring/pull/401)
 * [Remove `integer-simple` flag](https://github.com/haskell/bytestring/pull/371)
 * [Remove misleading mentions of fusion](https://github.com/haskell/bytestring/pull/412)
+* [Add `Data.ByteString.isValidUtf8`](https://github.com/haskell/bytestring/pull/423)
 
 [0.11.2.0]: https://github.com/haskell/bytestring/compare/0.11.1.0...0.11.2.0
 

diff --git a/Data/ByteString.hs b/Data/ByteString.hs
@@ -141,6 +141,9 @@ module Data.ByteString (
         isSuffixOf,
         isInfixOf,
 
+        -- ** Encoding validation
+        isValidUtf8,
+
         -- ** Search for arbitrary substrings
         breakSubstring,
 
@@ -238,7 +241,7 @@ import Control.Exception        (IOException, catch, finally, assert, throwIO)
 import Control.Monad            (when, void)
 
 import Foreign.C.String         (CString, CStringLen)
-import Foreign.C.Types          (CSize)
+import Foreign.C.Types          (CSize (CSize), CInt (CInt))
 import Foreign.ForeignPtr       (ForeignPtr, withForeignPtr, touchForeignPtr)
 import Foreign.ForeignPtr.Unsafe(unsafeForeignPtrToPtr)
 import Foreign.Marshal.Alloc    (allocaBytes)
@@ -1527,6 +1530,17 @@ stripSuffix bs1@(BS _ l1) bs2@(BS _ l2)
 isInfixOf :: ByteString -> ByteString -> Bool
 isInfixOf p s = null p || not (null $ snd $ breakSubstring p s)
 
+-- | /O(n)/ Test whether the contents of a 'ByteString' are well-formed UTF-8.
+--
+-- @since 0.11.2.0
+isValidUtf8 :: ByteString -> Bool
+isValidUtf8 (BS fp size) =
+  accursedUnutterablePerformIO $
+    unsafeWithForeignPtr fp $ \buf ->
+      -- The C routine reports its verdict as an int; non-zero means valid.
+      (/= 0) <$> cIsValidUtf8 buf (fromIntegral size)
+
+-- FFI binding to the C validator @bytestring_is_valid_utf8@. It is given a
+-- pointer to the first byte and the length in bytes; 'isValidUtf8' treats a
+-- non-zero result as \"valid\".
+foreign import ccall unsafe "bytestring_is_valid_utf8" cIsValidUtf8
+  :: Ptr Word8 -> CSize -> IO CInt
+
 -- | Break a string on a substring, returning a pair of the part of the
 -- string prior to the match, and the rest of the string.
 --

diff --git a/LICENSE b/LICENSE
@@ -2,6 +2,7 @@ Copyright (c) Don Stewart 2005-2009
           (c) Duncan Coutts 2006-2015
           (c) David Roundy 2003-2005
           (c) Simon Meier 2010-2011
+          (c) Koz Ross 2021
 
 All rights reserved.
 

diff --git a/bytestring.cabal b/bytestring.cabal
@@ -1,5 +1,5 @@
 Name:                bytestring
-Version:             0.11.1.0
+Version:             0.11.2.0
 Synopsis:            Fast, compact, strict and lazy byte strings with a list interface
 Description:
     An efficient compact, immutable byte string type (both strict and lazy)
@@ -113,10 +113,23 @@ library
                     -fmax-simplifier-iterations=10
                     -fdicts-cheap
                     -fspec-constr-count=6
-
-  c-sources:         cbits/fpstring.c
-                     cbits/itoa.c
+
+  c-sources:        cbits/fpstring.c
+                    cbits/itoa.c
+
+  if (arch(x86_64) || arch(i386))
+    c-sources:        cbits/x86/is-valid-utf8.c
+  else
+    c-sources:        cbits/is-valid-utf8.c
+
   cc-options:        -std=c11
+
+  -- Linking against gcc_s and gcc is required on Windows, due to the following GHC issues:
+  -- * https://gitlab.haskell.org/ghc/ghc/-/issues/20525#note_385580
+  -- * https://gitlab.haskell.org/ghc/ghc/-/issues/19417
+  if os(windows)
+    extra-libraries:  gcc_s gcc
+
   include-dirs:      include
   includes:          fpstring.h
   install-includes:  fpstring.h
@@ -172,6 +185,19 @@ test-suite bytestring-th
   ghc-options:      -Wall -fwarn-tabs -threaded -rtsopts
   default-language: Haskell2010
 
+test-suite is-valid-utf8
+  type:             exitcode-stdio-1.0
+  hs-source-dirs:   tests/is-valid-utf8
+  main-is:          Main.hs
+  build-depends:    base, 
+                    bytestring, 
+                    tasty, 
+                    tasty-hunit,
+                    tasty-quickcheck, 
+                    QuickCheck
+  ghc-options:      -Wall -fwarn-tabs -threaded -rtsopts
+  default-language: Haskell2010
+
 benchmark bytestring-bench
   main-is:          BenchAll.hs
   other-modules:    BenchBoundsCheckFusion

diff --git a/cbits/is-valid-utf8.c b/cbits/is-valid-utf8.c
@@ -0,0 +1,139 @@
+/*
+Copyright (c) Koz Ross 2021
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. Neither the name of the author nor the names of his contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+*/
+#pragma GCC push_options
+#pragma GCC optimize("-O2")
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+// Reports whether 'byte' is a UTF-8 continuation byte, i.e. has the bit
+// pattern 10xxxxxx (0x80..0xBF).
+static inline bool is_continuation_byte(uint8_t const byte) {
+  return (byte & 0xC0) == 0x80;
+}
+
+// Checks whether the 'len' bytes starting at 'src' form well-formed UTF-8.
+//
+// Returns 1 if every byte belongs to a well-formed sequence (an empty input
+// is valid), and 0 otherwise. Overlong encodings, surrogate code points
+// (U+D800..U+DFFF), code points above U+10FFFF and truncated sequences are
+// all rejected.
+int bytestring_is_valid_utf8(uint8_t const *const src, size_t const len) {
+  uint8_t const *ptr = src;
+  // Track the number of unread bytes instead of comparing 'ptr + k' against
+  // an end pointer: forming a pointer more than one past the end of the
+  // buffer is undefined behaviour.
+  size_t remaining = len;
+  while (remaining > 0) {
+    uint8_t const byte = ptr[0];
+    size_t width;
+    if (byte <= 0x7F) {
+      // ASCII.
+      width = 1;
+    } else if (byte >= 0xC2 && byte <= 0xDF) {
+      // C2..DF, 80..BF
+      // (C0 and C1 could only start overlong encodings, so they are not
+      // accepted as lead bytes.)
+      if (remaining < 2 || !is_continuation_byte(ptr[1])) {
+        return 0;
+      }
+      width = 2;
+    } else if (byte >= 0xE0 && byte <= 0xEF) {
+      if (remaining < 3) {
+        return 0;
+      }
+      uint8_t const byte2 = ptr[1];
+      if (!is_continuation_byte(byte2) || !is_continuation_byte(ptr[2])) {
+        return 0;
+      }
+      // E0, A0..BF, 80..BF      (a lower second byte would be overlong)
+      // E1..EC, 80..BF, 80..BF
+      // ED, 80..9F, 80..BF      (a higher second byte would be a surrogate)
+      // EE..EF, 80..BF, 80..BF
+      if ((byte == 0xE0 && byte2 < 0xA0) || (byte == 0xED && byte2 > 0x9F)) {
+        return 0;
+      }
+      width = 3;
+    } else if (byte >= 0xF0 && byte <= 0xF4) {
+      if (remaining < 4) {
+        return 0;
+      }
+      uint8_t const byte2 = ptr[1];
+      if (!is_continuation_byte(byte2) || !is_continuation_byte(ptr[2]) ||
+          !is_continuation_byte(ptr[3])) {
+        return 0;
+      }
+      // F0, 90..BF, 80..BF, 80..BF      (a lower second byte would be overlong)
+      // F1..F3, 80..BF, 80..BF, 80..BF
+      // F4, 80..8F, 80..BF, 80..BF      (a higher second byte exceeds U+10FFFF)
+      if ((byte == 0xF0 && byte2 < 0x90) || (byte == 0xF4 && byte2 > 0x8F)) {
+        return 0;
+      }
+      width = 4;
+    } else {
+      // 80..C1 and F5..FF can never start a well-formed sequence.
+      return 0;
+    }
+    ptr += width;
+    remaining -= width;
+  }
+  // If we got this far, we're valid.
+  return 1;
+}
+#pragma GCC pop_options