Skip to content

Commit

Permalink
isValidUtf8 function, tests (#423)
Browse files Browse the repository at this point in the history
* isValidUtf8 function, tests

* Cleanup, change to forAll

* Fix accidentally-correct generation of wrong test cases

* Increase test count, fix segfault on Clang

* Try fixing Windows

* Eliminate block-checking for ASCII in fallback mode

* Use GHC 8.10 for CI on Windows

* Namespace externally-visible C function better

* Link to libgcc on Windows

* Fix error accumulation in AVX2 check

* Fix use of <> on older GHCs

* Repair Drone config to deal with older Cabal

* Ensure non-SSE2 x86 can still build

* License information and contact for C code

* Fix test name typo, allow more than pre-set count of tests

* Stop using MagicHash for FFI wrapper

* Revert change to 8.10 for Windows CI

* Use existing license verbatim for C code

* Explain the use of gcc and gcc_s

* More concise implementation of isValidUtf8

* Document AVX2 block-checking function
  • Loading branch information
kozross authored Nov 3, 2021
1 parent e4d6176 commit dad6dd1
Show file tree
Hide file tree
Showing 8 changed files with 1,214 additions and 8 deletions.
7 changes: 4 additions & 3 deletions .drone.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ steps:
# of coming up with these automatically, but older ones need a hint.
# Specifically, we need to inform Cabal that:
#
# * optparse-applicative doesn't use process; and
# * QuickCheck uses the old random package.
- cabal install --dependencies-only --enable-tests --constraint 'optparse-applicative -process' --constraint 'QuickCheck +old-random'
# * optparse-applicative doesn't use process;
# * QuickCheck uses the old random package; and
# * tasty doesn't use unix.
- cabal install --dependencies-only --enable-tests --constraint 'optparse-applicative -process' --constraint 'QuickCheck +old-random' --constraint 'tasty -unix'
- cabal test
1 change: 1 addition & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
* [Use `unsafeWithForeignPtr` whenever possible](https://github.com/haskell/bytestring/pull/401)
* [Remove `integer-simple` flag](https://github.com/haskell/bytestring/pull/371)
* [Remove misleading mentions of fusion](https://github.com/haskell/bytestring/pull/412)
* [Add `Data.ByteString.isValidUtf8`](https://github.com/haskell/bytestring/pull/423)

[0.11.2.0]: https://github.com/haskell/bytestring/compare/0.11.1.0...0.11.2.0

Expand Down
16 changes: 15 additions & 1 deletion Data/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ module Data.ByteString (
isSuffixOf,
isInfixOf,

-- ** Encoding validation
isValidUtf8,

-- ** Search for arbitrary substrings
breakSubstring,

Expand Down Expand Up @@ -238,7 +241,7 @@ import Control.Exception (IOException, catch, finally, assert, throwIO)
import Control.Monad (when, void)

import Foreign.C.String (CString, CStringLen)
import Foreign.C.Types (CSize)
import Foreign.C.Types (CSize (CSize), CInt (CInt))
import Foreign.ForeignPtr (ForeignPtr, withForeignPtr, touchForeignPtr)
import Foreign.ForeignPtr.Unsafe(unsafeForeignPtrToPtr)
import Foreign.Marshal.Alloc (allocaBytes)
Expand Down Expand Up @@ -1527,6 +1530,17 @@ stripSuffix bs1@(BS _ l1) bs2@(BS _ l2)
isInfixOf :: ByteString -> ByteString -> Bool
isInfixOf p s = null p || not (null $ snd $ breakSubstring p s)

-- | /O(n)/ Determine whether the bytes of a 'ByteString' form well-formed
-- UTF-8.
--
-- @since 0.11.2.0
isValidUtf8 :: ByteString -> Bool
isValidUtf8 (BS fp n) =
  accursedUnutterablePerformIO $
    unsafeWithForeignPtr fp $ \buf ->
      -- The C routine signals "valid" with a nonzero result.
      fmap (/= 0) (cIsValidUtf8 buf (fromIntegral n))

-- Raw FFI binding to the C routine @bytestring_is_valid_utf8@. It is handed a
-- pointer to the first byte and the number of bytes to examine; 'isValidUtf8'
-- interprets a nonzero result as \"the input is valid UTF-8\".
--
-- NOTE(review): declared @unsafe@, so the call presumably never calls back
-- into Haskell and is expected to be short-running — confirm for every C
-- implementation selected in the Cabal file.
foreign import ccall unsafe "bytestring_is_valid_utf8" cIsValidUtf8
:: Ptr Word8 -> CSize -> IO CInt

-- | Break a string on a substring, returning a pair of the part of the
-- string prior to the match, and the rest of the string.
--
Expand Down
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ Copyright (c) Don Stewart 2005-2009
(c) Duncan Coutts 2006-2015
(c) David Roundy 2003-2005
(c) Simon Meier 2010-2011
(c) Koz Ross 2021

All rights reserved.

Expand Down
34 changes: 30 additions & 4 deletions bytestring.cabal
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Name: bytestring
Version: 0.11.1.0
Version: 0.11.2.0
Synopsis: Fast, compact, strict and lazy byte strings with a list interface
Description:
An efficient compact, immutable byte string type (both strict and lazy)
Expand Down Expand Up @@ -113,10 +113,23 @@ library
-fmax-simplifier-iterations=10
-fdicts-cheap
-fspec-constr-count=6

c-sources: cbits/fpstring.c
cbits/itoa.c

c-sources: cbits/fpstring.c
cbits/itoa.c

if (arch(x86_64) || arch(i386))
c-sources: cbits/x86/is-valid-utf8.c
else
c-sources: cbits/is-valid-utf8.c

cc-options: -std=c11

-- Required, due to the following issues:
-- * https://gitlab.haskell.org/ghc/ghc/-/issues/20525#note_385580
-- * https://gitlab.haskell.org/ghc/ghc/-/issues/19417
if os(windows)
extra-libraries: gcc_s gcc

include-dirs: include
includes: fpstring.h
install-includes: fpstring.h
Expand Down Expand Up @@ -172,6 +185,19 @@ test-suite bytestring-th
ghc-options: -Wall -fwarn-tabs -threaded -rtsopts
default-language: Haskell2010

test-suite is-valid-utf8
type: exitcode-stdio-1.0
hs-source-dirs: tests/is-valid-utf8
main-is: Main.hs
build-depends: base,
bytestring,
tasty,
tasty-hunit,
tasty-quickcheck,
QuickCheck
ghc-options: -Wall -fwarn-tabs -threaded -rtsopts
default-language: Haskell2010

benchmark bytestring-bench
main-is: BenchAll.hs
other-modules: BenchBoundsCheckFusion
Expand Down
139 changes: 139 additions & 0 deletions cbits/is-valid-utf8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/*
Copyright (c) Koz Ross 2021
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the author nor the names of his contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/
#pragma GCC push_options
#pragma GCC optimize("-O2")
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// 0x80 in every 'lane'.
static uint64_t const high_bits_mask = 0x8080808080808080ULL;

// Report whether 'byte' is a UTF-8 continuation byte, i.e. lies in 0x80..0xBF
// (bit pattern 10xxxxxx).
static inline bool is_utf8_continuation(uint8_t const byte) {
  return (byte & 0xC0) == 0x80;
}

// Check whether the 'len' bytes starting at 'src' are well-formed UTF-8.
//
// Returns 1 if the whole buffer is valid (an empty buffer is valid), and 0 as
// soon as an ill-formed or truncated sequence is found. Overlong encodings,
// surrogate code points (ED A0..BF xx) and code points above U+10FFFF are
// rejected.
//
// 'src' is never dereferenced or offset when 'len' is 0, so a null pointer
// is acceptable for an empty buffer.
int bytestring_is_valid_utf8(uint8_t const *const src, size_t const len) {
  // We walk the buffer by index and compare against the number of bytes
  // remaining, rather than forming 'ptr + k' and comparing it with an end
  // pointer: near the tail of the buffer such a pointer could lie more than
  // one past the end, which is undefined behaviour in C.
  size_t pos = 0;
  while (pos < len) {
    size_t const remaining = len - pos;
    uint8_t const byte = src[pos];
    // Check if the byte is ASCII.
    if (byte <= 0x7F) {
      pos++;
    }
    // Check for a valid 2-byte sequence: C2..DF, 80..BF.
    // (C0 and C1 could only start overlong encodings, so they are excluded.)
    else if (remaining >= 2 && byte >= 0xC2 && byte <= 0xDF &&
             is_utf8_continuation(src[pos + 1])) {
      pos += 2;
    }
    // Check for a valid 3-byte sequence.
    else if (remaining >= 3 && byte >= 0xE0 && byte <= 0xEF) {
      uint8_t const byte2 = src[pos + 1];
      bool const tail_valid =
          is_utf8_continuation(byte2) && is_utf8_continuation(src[pos + 2]);
      bool const lead_valid =
          // E0, A0..BF, 80..BF (anything lower would be overlong)
          (byte == 0xE0 && byte2 >= 0xA0) ||
          // E1..EC, 80..BF, 80..BF
          (byte >= 0xE1 && byte <= 0xEC) ||
          // ED, 80..9F, 80..BF (anything higher would be a surrogate)
          (byte == 0xED && byte2 <= 0x9F) ||
          // EE..EF, 80..BF, 80..BF
          (byte >= 0xEE && byte <= 0xEF);
      if (!(tail_valid && lead_valid)) {
        return 0;
      }
      pos += 3;
    }
    // Check for a valid 4-byte sequence. Any lead byte that reaches this
    // branch without being F0..F4 fails every alternative below.
    else if (remaining >= 4) {
      uint8_t const byte2 = src[pos + 1];
      bool const tail_valid = is_utf8_continuation(byte2) &&
                              is_utf8_continuation(src[pos + 2]) &&
                              is_utf8_continuation(src[pos + 3]);
      bool const lead_valid =
          // F0, 90..BF, 80..BF, 80..BF (anything lower would be overlong)
          (byte == 0xF0 && byte2 >= 0x90) ||
          // F1..F3, 80..BF, 80..BF, 80..BF
          (byte >= 0xF1 && byte <= 0xF3) ||
          // F4, 80..8F, 80..BF, 80..BF (anything higher exceeds U+10FFFF)
          (byte == 0xF4 && byte2 <= 0x8F);
      if (!(tail_valid && lead_valid)) {
        return 0;
      }
      pos += 4;
    }
    // Otherwise, invalid: a stray continuation byte, an illegal lead byte,
    // or a multi-byte sequence cut off by the end of the buffer.
    else {
      return 0;
    }
  }
  // If we got this far, we're valid.
  return 1;
}
#pragma GCC pop_options
Loading

0 comments on commit dad6dd1

Please sign in to comment.