Implementation of RFC 8785: JSON Canonicalization Scheme

haskell · Oct 3, 2023 · 3f2a79a · 3f2a79a
1 parent 54d3c33
commit 3f2a79a
Show file tree

Hide file tree

Showing 9 changed files with 612 additions and 4 deletions.
diff --git a/.hlint.yaml b/.hlint.yaml
@@ -46,3 +46,4 @@
 - ignore: { name: "Use const" }
 - ignore: { name: "Use -" }
 - ignore: { name: "Use /=" }
+- ignore: { name: "Use uncurry" }
diff --git a/aeson.cabal b/aeson.cabal
@@ -65,6 +65,7 @@ library
     Data.Aeson.Key
     Data.Aeson.KeyMap
     Data.Aeson.QQ.Simple
+    Data.Aeson.RFC8785
     Data.Aeson.Text
     Data.Aeson.TH
     Data.Aeson.Types
@@ -108,13 +109,17 @@ library
   if !impl(ghc >=8.6)
     build-depends: contravariant >=1.4.1 && <1.6
 
+  if !impl(ghc >=9.0)
+    build-depends: integer-gmp
+
   -- Other dependencies
   build-depends:
       data-fix              >=0.3.2    && <0.4
     , dlist                 >=1.0      && <1.1
     , hashable              >=1.4.2.0  && <1.5
     , indexed-traversable   >=0.1.2    && <0.2
     , integer-conversion    >=0.1      && <0.2
+    , integer-logarithms    >=1.0.3.1  && <1.1
     , network-uri           >=2.6.4.1  && <2.7
     , OneTuple              >=0.4.1.1  && <0.5
     , primitive             >=0.8.0.0  && <0.9
@@ -146,10 +151,12 @@ test-suite aeson-tests
   main-is:          Tests.hs
   ghc-options:      -Wall -threaded -rtsopts
   other-modules:
+    CastFloat
     DataFamilies.Encoders
     DataFamilies.Instances
     DataFamilies.Properties
     DataFamilies.Types
+    DoubleToScientific
     Encoders
     ErrorMessages
     Functions
@@ -168,13 +175,15 @@ test-suite aeson-tests
     Regression.Issue571
     Regression.Issue687
     Regression.Issue967
+    RFC8785
     SerializationFormatSpec
     Types
     UnitTests
     UnitTests.FromJSONKey
     UnitTests.Hashable
     UnitTests.KeyMapInsertWith
     UnitTests.MonadFix
+    UnitTests.NoThunks
     UnitTests.NullaryConstructors
     UnitTests.OmitNothingFieldsNote
     UnitTests.OptionalFields
@@ -183,18 +192,17 @@ test-suite aeson-tests
     UnitTests.OptionalFields.Manual
     UnitTests.OptionalFields.TH
     UnitTests.UTCTime
-    UnitTests.NoThunks
 
   build-depends:
       aeson
     , base
     , base-compat
-    , deepseq
     , base-orphans          >=0.5.3  && <0.10
     , base16-bytestring
     , bytestring
     , containers
     , data-fix
+    , deepseq
     , Diff                  >=0.4    && <0.5
     , directory
     , dlist
@@ -227,6 +235,9 @@ test-suite aeson-tests
     , uuid-types
     , vector
 
+  if !impl(ghc >=9.0)
+    build-depends: integer-gmp
+
   if impl(ghc >=9.2 && <9.7)
     build-depends: nothunks >=0.1.4 && <0.2
 

diff --git a/changelog.md b/changelog.md
@@ -1,5 +1,10 @@
 For the latest version of this document, please see [https://github.com/haskell/aeson/blob/master/changelog.md](https://github.com/haskell/aeson/blob/master/changelog.md).
 
+### 2.2.1.0
+
+* Add `Data.Aeson.RFC8785`, a JSON Canonicalization Scheme implementation
+  https://datatracker.ietf.org/doc/html/rfc8785
+
 ### 2.2.0.0
 
 * Rework how `omitNothingFields` works. Add `allowOmittedFields` as a parsing counterpart.

diff --git a/src/Data/Aeson/RFC8785.hs b/src/Data/Aeson/RFC8785.hs
@@ -0,0 +1,169 @@
+{-# LANGUAGE UnboxedTuples, BangPatterns #-}
+-- | JSON Canonicalization Scheme https://datatracker.ietf.org/doc/html/rfc8785
+module Data.Aeson.RFC8785 (
+    encodeCanonical,
+) where
+
+import Data.List (sortBy)
+import Data.Ord (comparing)
+import GHC.Integer (quotRemInteger)
+import Math.NumberTheory.Logarithms (integerLog10)
+
+import Data.Aeson
+import Data.Aeson.Encoding
+import Data.Aeson.Encoding.Internal
+import Data.Aeson.Internal.Prelude
+import Data.Aeson.Internal.Word8
+
+import qualified Data.Aeson.Key as Key
+import qualified Data.Aeson.KeyMap as KM
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Builder as B
+import qualified Data.ByteString.Builder.Prim as BP
+import qualified Data.ByteString.Lazy as LBS
+import qualified Data.Scientific as Sci
+import qualified Data.Text.Encoding as TE
+import qualified Data.Vector as V
+
+-- $setup
+-- >>> import Data.Aeson
+
+-- | Encode to JSON according to RFC 8785 canonicalization scheme.
+-- https://datatracker.ietf.org/doc/html/rfc8785
+--
+-- 'encodeCanonical' uses 'toJSON' to produce intermediate 'Value',
+-- as 'toEncoding' may (and most likely) produces non-canonical JSON.
+--
+-- Note: @decode (encodeCanonical v) === Just v@ for all @v :: Value@,
+-- i.e. 'encodeCanonical' doesn't lose any information.
+--
+-- However, the example in RFC8785 /loses/ information as the intermediate
+-- number representation is 'Double', also current @toJSON :: Double -> Value@
+-- sometimes produces too precise values. For example
+--
+-- >>> toJSON (1e23 :: Double)
+-- Number 9.999999999999999e22
+--
+-- 'show' also behaves the same:
+--
+-- >>> 1e23 :: Double
+-- 9.999999999999999e22
+--
+-- Note: RFC8785 is __not the same scheme__ as used in
+-- [canonical-json](https://hackage.haskell.org/package/canonical-json) package
+-- (https://wiki.laptop.org/go/Canonical_JSON).
+-- That scheme produces /invalid/ JSON (e.g. control characters encoded as is, not escaped)
+-- and cannot encode non-integral numbers.
+--
+-- @since 2.2.1.0
+--
+encodeCanonical :: ToJSON a => a -> LBS.ByteString
+encodeCanonical = encodingToLazyByteString . toCanonical . toJSON
+
+toCanonical :: Value -> Encoding
+toCanonical Null       = null_
+toCanonical (Bool b)   = bool b
+toCanonical (Number n) = canonicalNumber n
+toCanonical (String s) = canonicalString s
+toCanonical (Array v)  = list toCanonical (V.toList v)
+toCanonical (Object m) = dict (canonicalString . Key.toText) toCanonical ifr $
+    sortBy (\(k1, _) (k2, _) -> propertyCmp k1 k2) (KM.toList m)
+
+ifr :: (k -> v -> a -> a) -> a -> [(k, v)] -> a
+ifr f z = foldr (\(k, v) -> f k v) z
+{-# INLINE ifr #-}
+
+-- Property name strings to be sorted are formatted as arrays of UTF-16 code units.
+propertyCmp :: Key -> Key -> Ordering
+propertyCmp = comparing f where
+    -- this is slow implementation, but it's obviously not wrong.
+    f :: Key -> BS.ByteString
+    f = TE.encodeUtf16BE . Key.toText
+
+-- strings are already serialized canonically.
+canonicalString :: Text -> Encoding' a
+canonicalString = text
+
+-- RFC 8785 is outsourcing number format to ECMA-262.
+-- 10th edition, 7.1.12.1 NumberToString
+-- https://262.ecma-international.org/10.0/#sec-tostring-applied-to-the-number-type
+--
+-- Note: this specification is not lossy
+-- Given 'Scientific' we can choose n,k,s uniquely: 'nks'.
+--
+-- RFC8785 Appendix D says "don't use bignums".
+canonicalNumber :: Scientific -> Encoding
+canonicalNumber m = case compare m 0 of
+    EQ -> Encoding (B.word8 W8_0)
+    LT -> Encoding (B.word8 W8_MINUS <> fromEncoding (canonicalNumber' (negate m)))
+    GT -> canonicalNumber' m
+
+-- input: Positive number
+canonicalNumber' :: Scientific -> Encoding
+canonicalNumber' m
+    | k <= n, n <= 21
+    = Encoding $
+        BP.primMapListFixed BP.word8 ds <>
+        BP.primMapListFixed BP.word8 (replicate (n - k) W8_0)
+
+    | 0 < n, n <= 21
+    , let (pfx, sfx) = splitAt n ds
+    = Encoding $
+        BP.primMapListFixed BP.word8 pfx <>
+        B.word8 W8_DOT <>
+        BP.primMapListFixed BP.word8 sfx
+
+    | -6 < n, n <= 0
+    = Encoding $
+        B.word8 W8_0 <>
+        B.word8 W8_DOT <>
+        BP.primMapListFixed BP.word8 (replicate (negate n) W8_0) <>
+        BP.primMapListFixed BP.word8 ds
+
+    | k == 1, [d] <- ds
+    = Encoding $
+        B.word8 d <>
+        B.word8 W8_e <>
+        B.word8 (if (n - 1) >= 0 then W8_PLUS else W8_MINUS) <>
+        BP.primMapListFixed BP.word8 (integerToDecimalDigits (abs (toInteger n - 1)))
+
+    | (d:ds') <- ds
+    = Encoding $
+        B.word8 d <>
+        B.word8 W8_DOT <>
+        BP.primMapListFixed BP.word8 ds' <>
+        B.word8 W8_e <>
+        B.word8 (if (n - 1) >= 0 then W8_PLUS else W8_MINUS) <>
+        BP.primMapListFixed BP.word8 (integerToDecimalDigits (abs (toInteger n - 1)))
+
+    | otherwise
+    = string "0" -- shouldn't happen, but we need a default case.
+
+  where
+    -- 5. Otherwise, let n, k, and s be integers such that
+    -- k ≥ 1, 10k - 1 ≤ s < 10k, the Number value for s × 10n - k is m,
+    -- and k is as small as possible.
+    -- Note that k is the number of digits in the decimal representation of s,
+    -- that s is not divisible by 10, and that the least significant digit of s
+    -- is not necessarily uniquely determined by these criteria.
+    (n, k, s) = nks m
+    ds = integerToDecimalDigits s
+
+-- 5. Otherwise, let n, k, and s be integers such that k ≥ 1, 10^(k - 1) ≤ s < 10^k,
+-- the Number value for s × 10^(n - k) is m, and k is as small as possible.
+-- Note that k is the number of digits in the decimal representation of s,
+-- that s is not divisible by 10, and that the least significant digit of s
+-- is not necessarily uniquely determined by these criteria.
+nks :: Scientific -> (Int, Int, Integer)
+nks m = (e + k, k, c)
+  where
+    m' = Sci.normalize m
+    c = Sci.coefficient m'
+    e = Sci.base10Exponent m'
+    k = integerLog10 c + 1
+
+integerToDecimalDigits :: Integer -> [Word8]
+integerToDecimalDigits = go [] where
+    go acc 0 = acc
+    go acc i = case quotRemInteger i 10 of
+        (# q, r #) -> go (d:acc) q where !d = fromIntegral r + W8_0
diff --git a/tests/CastFloat.hs b/tests/CastFloat.hs
@@ -0,0 +1,42 @@
+{-# LANGUAGE CPP #-}
+module CastFloat (
+    castDoubleToWord64,
+    castWord64ToDouble,
+    castFloatTests,
+) where
+
+import Test.Tasty (TestTree, testGroup)
+import Test.Tasty.HUnit (testCase, (@?=))
+import Test.Tasty.QuickCheck (testProperty, (===))
+
+import Types (UniformWord64 (..))
+
+#if MIN_VERSION_base(4,11,0)
+import GHC.Float (castDoubleToWord64, castWord64ToDouble)
+#else
+
+import Data.Word (Word64)
+import Foreign.Storable (Storable (peek, poke))
+import Foreign.Ptr (castPtr)
+import Foreign.Marshal (alloca)
+import System.IO.Unsafe (unsafeDupablePerformIO)
+
+castDoubleToWord64 :: Double -> Word64
+castDoubleToWord64 = reinterpretCast
+
+castWord64ToDouble :: Word64 -> Double
+castWord64ToDouble = reinterpretCast
+
+reinterpretCast :: (Storable a, Storable b) => a -> b
+reinterpretCast x = unsafeDupablePerformIO $ alloca $ \ptr -> do
+    poke ptr x
+    peek (castPtr ptr)
+
+#endif
+
+castFloatTests :: TestTree
+castFloatTests = testGroup "castDoubleToWord64"
+    [ testCase "5e-324" $ castDoubleToWord64 5e-324 @?= 1
+    , testProperty "roundtrip1" $ \d       -> castWord64ToDouble (castDoubleToWord64 d) === d
+    , testProperty "roundtrip2" $ \(U64 w) -> castDoubleToWord64 (castWord64ToDouble w) === w
+    ]