-
Notifications
You must be signed in to change notification settings - Fork 323
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implementation of RFC 8785: JSON Canonicalization Scheme
- Loading branch information
Showing
9 changed files
with
612 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
{-# LANGUAGE UnboxedTuples, BangPatterns #-} | ||
-- | JSON Canonicalization Scheme https://datatracker.ietf.org/doc/html/rfc8785 | ||
module Data.Aeson.RFC8785 ( | ||
encodeCanonical, | ||
) where | ||
|
||
import Data.List (sortBy) | ||
import Data.Ord (comparing) | ||
import GHC.Integer (quotRemInteger) | ||
import Math.NumberTheory.Logarithms (integerLog10) | ||
|
||
import Data.Aeson | ||
import Data.Aeson.Encoding | ||
import Data.Aeson.Encoding.Internal | ||
import Data.Aeson.Internal.Prelude | ||
import Data.Aeson.Internal.Word8 | ||
|
||
import qualified Data.Aeson.Key as Key | ||
import qualified Data.Aeson.KeyMap as KM | ||
import qualified Data.ByteString as BS | ||
import qualified Data.ByteString.Builder as B | ||
import qualified Data.ByteString.Builder.Prim as BP | ||
import qualified Data.ByteString.Lazy as LBS | ||
import qualified Data.Scientific as Sci | ||
import qualified Data.Text.Encoding as TE | ||
import qualified Data.Vector as V | ||
|
||
-- $setup | ||
-- >>> import Data.Aeson | ||
|
||
-- | Encode to JSON according to RFC 8785 canonicalization scheme. | ||
-- https://datatracker.ietf.org/doc/html/rfc8785 | ||
-- | ||
-- 'encodeCanonical' uses 'toJSON' to produce intermediate 'Value', | ||
-- as 'toEncoding' may (and most likely) produces non-canonical JSON. | ||
-- | ||
-- Note: @decode (encodeCanonical v) === Just v@ for all @v :: Value@, | ||
-- i.e. 'encodeCanonical' doesn't lose any information. | ||
-- | ||
-- However, the example in RFC8785 /loses/ information as the intermediate | ||
-- number representation is 'Double', also current @toJSON :: Double -> Value@ | ||
-- sometimes produces too precise values. For example | ||
-- | ||
-- >>> toJSON (1e23 :: Double) | ||
-- Number 9.999999999999999e22 | ||
-- | ||
-- 'show' also behaves the same: | ||
-- | ||
-- >>> 1e23 :: Double | ||
-- 9.999999999999999e22 | ||
-- | ||
-- Note: RFC8785 is __not the same scheme__ as used in | ||
-- [canonical-json](https://hackage.haskell.org/package/canonical-json) package | ||
-- (https://wiki.laptop.org/go/Canonical_JSON). | ||
-- That scheme produces /invalid/ JSON (e.g. control characters encoded as is, not escaped) | ||
-- and cannot encode non-integral numbers. | ||
-- | ||
-- @since 2.2.1.0 | ||
-- | ||
encodeCanonical :: ToJSON a => a -> LBS.ByteString | ||
encodeCanonical = encodingToLazyByteString . toCanonical . toJSON | ||
|
||
toCanonical :: Value -> Encoding | ||
toCanonical Null = null_ | ||
toCanonical (Bool b) = bool b | ||
toCanonical (Number n) = canonicalNumber n | ||
toCanonical (String s) = canonicalString s | ||
toCanonical (Array v) = list toCanonical (V.toList v) | ||
toCanonical (Object m) = dict (canonicalString . Key.toText) toCanonical ifr $ | ||
sortBy (\(k1, _) (k2, _) -> propertyCmp k1 k2) (KM.toList m) | ||
|
||
ifr :: (k -> v -> a -> a) -> a -> [(k, v)] -> a | ||
ifr f z = foldr (\(k, v) -> f k v) z | ||
{-# INLINE ifr #-} | ||
|
||
-- Property name strings to be sorted are formatted as arrays of UTF-16 code units. | ||
propertyCmp :: Key -> Key -> Ordering | ||
propertyCmp = comparing f where | ||
-- this is slow implementation, but it's obviously not wrong. | ||
f :: Key -> BS.ByteString | ||
f = TE.encodeUtf16BE . Key.toText | ||
|
||
-- strings are already serialized canonically. | ||
canonicalString :: Text -> Encoding' a | ||
canonicalString = text | ||
|
||
-- RFC 8785 is outsourcing number format to ECMA-262. | ||
-- 10th edition, 7.1.12.1 NumberToString | ||
-- https://262.ecma-international.org/10.0/#sec-tostring-applied-to-the-number-type | ||
-- | ||
-- Note: this specification is not lossy | ||
-- Given 'Scientific' we can choose n,k,s uniquely: 'nks'. | ||
-- | ||
-- RFC8785 Appendix D says "don't use bignums". | ||
canonicalNumber :: Scientific -> Encoding | ||
canonicalNumber m = case compare m 0 of | ||
EQ -> Encoding (B.word8 W8_0) | ||
LT -> Encoding (B.word8 W8_MINUS <> fromEncoding (canonicalNumber' (negate m))) | ||
GT -> canonicalNumber' m | ||
|
||
-- input: Positive number | ||
canonicalNumber' :: Scientific -> Encoding | ||
canonicalNumber' m | ||
| k <= n, n <= 21 | ||
= Encoding $ | ||
BP.primMapListFixed BP.word8 ds <> | ||
BP.primMapListFixed BP.word8 (replicate (n - k) W8_0) | ||
|
||
| 0 < n, n <= 21 | ||
, let (pfx, sfx) = splitAt n ds | ||
= Encoding $ | ||
BP.primMapListFixed BP.word8 pfx <> | ||
B.word8 W8_DOT <> | ||
BP.primMapListFixed BP.word8 sfx | ||
|
||
| -6 < n, n <= 0 | ||
= Encoding $ | ||
B.word8 W8_0 <> | ||
B.word8 W8_DOT <> | ||
BP.primMapListFixed BP.word8 (replicate (negate n) W8_0) <> | ||
BP.primMapListFixed BP.word8 ds | ||
|
||
| k == 1, [d] <- ds | ||
= Encoding $ | ||
B.word8 d <> | ||
B.word8 W8_e <> | ||
B.word8 (if (n - 1) >= 0 then W8_PLUS else W8_MINUS) <> | ||
BP.primMapListFixed BP.word8 (integerToDecimalDigits (abs (toInteger n - 1))) | ||
|
||
| (d:ds') <- ds | ||
= Encoding $ | ||
B.word8 d <> | ||
B.word8 W8_DOT <> | ||
BP.primMapListFixed BP.word8 ds' <> | ||
B.word8 W8_e <> | ||
B.word8 (if (n - 1) >= 0 then W8_PLUS else W8_MINUS) <> | ||
BP.primMapListFixed BP.word8 (integerToDecimalDigits (abs (toInteger n - 1))) | ||
|
||
| otherwise | ||
= string "0" -- shouldn't happen, but we need a default case. | ||
|
||
where | ||
-- 5. Otherwise, let n, k, and s be integers such that | ||
-- k ≥ 1, 10k - 1 ≤ s < 10k, the Number value for s × 10n - k is m, | ||
-- and k is as small as possible. | ||
-- Note that k is the number of digits in the decimal representation of s, | ||
-- that s is not divisible by 10, and that the least significant digit of s | ||
-- is not necessarily uniquely determined by these criteria. | ||
(n, k, s) = nks m | ||
ds = integerToDecimalDigits s | ||
|
||
-- 5. Otherwise, let n, k, and s be integers such that k ≥ 1, 10^(k - 1) ≤ s < 10^k, | ||
-- the Number value for s × 10^(n - k) is m, and k is as small as possible. | ||
-- Note that k is the number of digits in the decimal representation of s, | ||
-- that s is not divisible by 10, and that the least significant digit of s | ||
-- is not necessarily uniquely determined by these criteria. | ||
nks :: Scientific -> (Int, Int, Integer) | ||
nks m = (e + k, k, c) | ||
where | ||
m' = Sci.normalize m | ||
c = Sci.coefficient m' | ||
e = Sci.base10Exponent m' | ||
k = integerLog10 c + 1 | ||
|
||
integerToDecimalDigits :: Integer -> [Word8] | ||
integerToDecimalDigits = go [] where | ||
go acc 0 = acc | ||
go acc i = case quotRemInteger i 10 of | ||
(# q, r #) -> go (d:acc) q where !d = fromIntegral r + W8_0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
{-# LANGUAGE CPP #-} | ||
module CastFloat ( | ||
castDoubleToWord64, | ||
castWord64ToDouble, | ||
castFloatTests, | ||
) where | ||
|
||
import Test.Tasty (TestTree, testGroup) | ||
import Test.Tasty.HUnit (testCase, (@?=)) | ||
import Test.Tasty.QuickCheck (testProperty, (===)) | ||
|
||
import Types (UniformWord64 (..)) | ||
|
||
#if MIN_VERSION_base(4,11,0) | ||
import GHC.Float (castDoubleToWord64, castWord64ToDouble) | ||
#else | ||
|
||
import Data.Word (Word64) | ||
import Foreign.Storable (Storable (peek, poke)) | ||
import Foreign.Ptr (castPtr) | ||
import Foreign.Marshal (alloca) | ||
import System.IO.Unsafe (unsafeDupablePerformIO) | ||
|
||
castDoubleToWord64 :: Double -> Word64 | ||
castDoubleToWord64 = reinterpretCast | ||
|
||
castWord64ToDouble :: Word64 -> Double | ||
castWord64ToDouble = reinterpretCast | ||
|
||
reinterpretCast :: (Storable a, Storable b) => a -> b | ||
reinterpretCast x = unsafeDupablePerformIO $ alloca $ \ptr -> do | ||
poke ptr x | ||
peek (castPtr ptr) | ||
|
||
#endif | ||
|
||
castFloatTests :: TestTree | ||
castFloatTests = testGroup "castDoubleToWord64" | ||
[ testCase "5e-324" $ castDoubleToWord64 5e-324 @?= 1 | ||
, testProperty "roundtrip1" $ \d -> castWord64ToDouble (castDoubleToWord64 d) === d | ||
, testProperty "roundtrip2" $ \(U64 w) -> castDoubleToWord64 (castWord64ToDouble w) === w | ||
] |
Oops, something went wrong.