Skip to content

Commit

Permalink
Implementation of RFC 8785: JSON Canonicalization Scheme
Browse files Browse the repository at this point in the history
  • Loading branch information
phadej committed Oct 3, 2023
1 parent 54d3c33 commit 3f2a79a
Show file tree
Hide file tree
Showing 9 changed files with 612 additions and 4 deletions.
1 change: 1 addition & 0 deletions .hlint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,4 @@
- ignore: { name: "Use const" }
- ignore: { name: "Use -" }
- ignore: { name: "Use /=" }
- ignore: { name: "Use uncurry" }
15 changes: 13 additions & 2 deletions aeson.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ library
Data.Aeson.Key
Data.Aeson.KeyMap
Data.Aeson.QQ.Simple
Data.Aeson.RFC8785
Data.Aeson.Text
Data.Aeson.TH
Data.Aeson.Types
Expand Down Expand Up @@ -108,13 +109,17 @@ library
if !impl(ghc >=8.6)
build-depends: contravariant >=1.4.1 && <1.6

if !impl(ghc >=9.0)
build-depends: integer-gmp

-- Other dependencies
build-depends:
data-fix >=0.3.2 && <0.4
, dlist >=1.0 && <1.1
, hashable >=1.4.2.0 && <1.5
, indexed-traversable >=0.1.2 && <0.2
, integer-conversion >=0.1 && <0.2
, integer-logarithms >=1.0.3.1 && <1.1
, network-uri >=2.6.4.1 && <2.7
, OneTuple >=0.4.1.1 && <0.5
, primitive >=0.8.0.0 && <0.9
Expand Down Expand Up @@ -146,10 +151,12 @@ test-suite aeson-tests
main-is: Tests.hs
ghc-options: -Wall -threaded -rtsopts
other-modules:
CastFloat
DataFamilies.Encoders
DataFamilies.Instances
DataFamilies.Properties
DataFamilies.Types
DoubleToScientific
Encoders
ErrorMessages
Functions
Expand All @@ -168,13 +175,15 @@ test-suite aeson-tests
Regression.Issue571
Regression.Issue687
Regression.Issue967
RFC8785
SerializationFormatSpec
Types
UnitTests
UnitTests.FromJSONKey
UnitTests.Hashable
UnitTests.KeyMapInsertWith
UnitTests.MonadFix
UnitTests.NoThunks
UnitTests.NullaryConstructors
UnitTests.OmitNothingFieldsNote
UnitTests.OptionalFields
Expand All @@ -183,18 +192,17 @@ test-suite aeson-tests
UnitTests.OptionalFields.Manual
UnitTests.OptionalFields.TH
UnitTests.UTCTime
UnitTests.NoThunks

build-depends:
aeson
, base
, base-compat
, deepseq
, base-orphans >=0.5.3 && <0.10
, base16-bytestring
, bytestring
, containers
, data-fix
, deepseq
, Diff >=0.4 && <0.5
, directory
, dlist
Expand Down Expand Up @@ -227,6 +235,9 @@ test-suite aeson-tests
, uuid-types
, vector

if !impl(ghc >=9.0)
build-depends: integer-gmp

if impl(ghc >=9.2 && <9.7)
build-depends: nothunks >=0.1.4 && <0.2

Expand Down
5 changes: 5 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
For the latest version of this document, please see [https://github.com/haskell/aeson/blob/master/changelog.md](https://github.com/haskell/aeson/blob/master/changelog.md).

### 2.2.1.0

* Add `Data.Aeson.RFC8785`, a JSON Canonicalization Scheme implementation
https://datatracker.ietf.org/doc/html/rfc8785

### 2.2.0.0

* Rework how `omitNothingFields` works. Add `allowOmittedFields` as a parsing counterpart.
Expand Down
169 changes: 169 additions & 0 deletions src/Data/Aeson/RFC8785.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
{-# LANGUAGE UnboxedTuples, BangPatterns #-}
-- | JSON Canonicalization Scheme https://datatracker.ietf.org/doc/html/rfc8785
module Data.Aeson.RFC8785 (
encodeCanonical,
) where

import Data.List (sortBy)
import Data.Ord (comparing)
import GHC.Integer (quotRemInteger)
import Math.NumberTheory.Logarithms (integerLog10)

import Data.Aeson
import Data.Aeson.Encoding
import Data.Aeson.Encoding.Internal
import Data.Aeson.Internal.Prelude
import Data.Aeson.Internal.Word8

import qualified Data.Aeson.Key as Key
import qualified Data.Aeson.KeyMap as KM
import qualified Data.ByteString as BS
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Builder.Prim as BP
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Scientific as Sci
import qualified Data.Text.Encoding as TE
import qualified Data.Vector as V

-- $setup
-- >>> import Data.Aeson

-- | Encode to JSON according to RFC 8785 canonicalization scheme.
-- https://datatracker.ietf.org/doc/html/rfc8785
--
-- 'encodeCanonical' uses 'toJSON' to produce intermediate 'Value',
-- as 'toEncoding' may (and most likely) produces non-canonical JSON.
--
-- Note: @decode (encodeCanonical v) === Just v@ for all @v :: Value@,
-- i.e. 'encodeCanonical' doesn't lose any information.
--
-- However, the example in RFC8785 /loses/ information as the intermediate
-- number representation is 'Double', also current @toJSON :: Double -> Value@
-- sometimes produces too precise values. For example
--
-- >>> toJSON (1e23 :: Double)
-- Number 9.999999999999999e22
--
-- 'show' also behaves the same:
--
-- >>> 1e23 :: Double
-- 9.999999999999999e22
--
-- Note: RFC8785 is __not the same scheme__ as used in
-- [canonical-json](https://hackage.haskell.org/package/canonical-json) package
-- (https://wiki.laptop.org/go/Canonical_JSON).
-- That scheme produces /invalid/ JSON (e.g. control characters encoded as is, not escaped)
-- and cannot encode non-integral numbers.
--
-- @since 2.2.1.0
--
encodeCanonical :: ToJSON a => a -> LBS.ByteString
encodeCanonical = encodingToLazyByteString . toCanonical . toJSON

toCanonical :: Value -> Encoding
toCanonical Null = null_
toCanonical (Bool b) = bool b
toCanonical (Number n) = canonicalNumber n
toCanonical (String s) = canonicalString s
toCanonical (Array v) = list toCanonical (V.toList v)
toCanonical (Object m) = dict (canonicalString . Key.toText) toCanonical ifr $
sortBy (\(k1, _) (k2, _) -> propertyCmp k1 k2) (KM.toList m)

ifr :: (k -> v -> a -> a) -> a -> [(k, v)] -> a
ifr f z = foldr (\(k, v) -> f k v) z
{-# INLINE ifr #-}

-- Property name strings to be sorted are formatted as arrays of UTF-16 code units.
propertyCmp :: Key -> Key -> Ordering
propertyCmp = comparing f where
-- this is slow implementation, but it's obviously not wrong.
f :: Key -> BS.ByteString
f = TE.encodeUtf16BE . Key.toText

-- strings are already serialized canonically.
canonicalString :: Text -> Encoding' a
canonicalString = text

-- RFC 8785 is outsourcing number format to ECMA-262.
-- 10th edition, 7.1.12.1 NumberToString
-- https://262.ecma-international.org/10.0/#sec-tostring-applied-to-the-number-type
--
-- Note: this specification is not lossy
-- Given 'Scientific' we can choose n,k,s uniquely: 'nks'.
--
-- RFC8785 Appendix D says "don't use bignums".
canonicalNumber :: Scientific -> Encoding
canonicalNumber m = case compare m 0 of
EQ -> Encoding (B.word8 W8_0)
LT -> Encoding (B.word8 W8_MINUS <> fromEncoding (canonicalNumber' (negate m)))
GT -> canonicalNumber' m

-- input: Positive number
canonicalNumber' :: Scientific -> Encoding
canonicalNumber' m
| k <= n, n <= 21
= Encoding $
BP.primMapListFixed BP.word8 ds <>
BP.primMapListFixed BP.word8 (replicate (n - k) W8_0)

| 0 < n, n <= 21
, let (pfx, sfx) = splitAt n ds
= Encoding $
BP.primMapListFixed BP.word8 pfx <>
B.word8 W8_DOT <>
BP.primMapListFixed BP.word8 sfx

| -6 < n, n <= 0
= Encoding $
B.word8 W8_0 <>
B.word8 W8_DOT <>
BP.primMapListFixed BP.word8 (replicate (negate n) W8_0) <>
BP.primMapListFixed BP.word8 ds

| k == 1, [d] <- ds
= Encoding $
B.word8 d <>
B.word8 W8_e <>
B.word8 (if (n - 1) >= 0 then W8_PLUS else W8_MINUS) <>
BP.primMapListFixed BP.word8 (integerToDecimalDigits (abs (toInteger n - 1)))

| (d:ds') <- ds
= Encoding $
B.word8 d <>
B.word8 W8_DOT <>
BP.primMapListFixed BP.word8 ds' <>
B.word8 W8_e <>
B.word8 (if (n - 1) >= 0 then W8_PLUS else W8_MINUS) <>
BP.primMapListFixed BP.word8 (integerToDecimalDigits (abs (toInteger n - 1)))

| otherwise
= string "0" -- shouldn't happen, but we need a default case.

where
-- 5. Otherwise, let n, k, and s be integers such that
-- k ≥ 1, 10k - 1 ≤ s < 10k, the Number value for s × 10n - k is m,
-- and k is as small as possible.
-- Note that k is the number of digits in the decimal representation of s,
-- that s is not divisible by 10, and that the least significant digit of s
-- is not necessarily uniquely determined by these criteria.
(n, k, s) = nks m
ds = integerToDecimalDigits s

-- 5. Otherwise, let n, k, and s be integers such that k ≥ 1, 10^(k - 1) ≤ s < 10^k,
-- the Number value for s × 10^(n - k) is m, and k is as small as possible.
-- Note that k is the number of digits in the decimal representation of s,
-- that s is not divisible by 10, and that the least significant digit of s
-- is not necessarily uniquely determined by these criteria.
nks :: Scientific -> (Int, Int, Integer)
nks m = (e + k, k, c)
where
m' = Sci.normalize m
c = Sci.coefficient m'
e = Sci.base10Exponent m'
k = integerLog10 c + 1

integerToDecimalDigits :: Integer -> [Word8]
integerToDecimalDigits = go [] where
go acc 0 = acc
go acc i = case quotRemInteger i 10 of
(# q, r #) -> go (d:acc) q where !d = fromIntegral r + W8_0
42 changes: 42 additions & 0 deletions tests/CastFloat.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{-# LANGUAGE CPP #-}
module CastFloat (
castDoubleToWord64,
castWord64ToDouble,
castFloatTests,
) where

import Test.Tasty (TestTree, testGroup)
import Test.Tasty.HUnit (testCase, (@?=))
import Test.Tasty.QuickCheck (testProperty, (===))

import Types (UniformWord64 (..))

#if MIN_VERSION_base(4,11,0)
import GHC.Float (castDoubleToWord64, castWord64ToDouble)
#else

import Data.Word (Word64)
import Foreign.Storable (Storable (peek, poke))
import Foreign.Ptr (castPtr)
import Foreign.Marshal (alloca)
import System.IO.Unsafe (unsafeDupablePerformIO)

castDoubleToWord64 :: Double -> Word64
castDoubleToWord64 = reinterpretCast

castWord64ToDouble :: Word64 -> Double
castWord64ToDouble = reinterpretCast

reinterpretCast :: (Storable a, Storable b) => a -> b
reinterpretCast x = unsafeDupablePerformIO $ alloca $ \ptr -> do
poke ptr x
peek (castPtr ptr)

#endif

castFloatTests :: TestTree
castFloatTests = testGroup "castDoubleToWord64"
[ testCase "5e-324" $ castDoubleToWord64 5e-324 @?= 1
, testProperty "roundtrip1" $ \d -> castWord64ToDouble (castDoubleToWord64 d) === d
, testProperty "roundtrip2" $ \(U64 w) -> castDoubleToWord64 (castWord64ToDouble w) === w
]
Loading

0 comments on commit 3f2a79a

Please sign in to comment.