Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX parsing for Long in scientific notation #674

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
8 changes: 8 additions & 0 deletions ast/jvm/src/test/scala/jawn/ast/AstCheckPlatform.scala
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@ private[jawn] trait AstCheckPlatform { self: AstCheck =>
p0 && p1
}

// Parsing the canonical rendering from a String and from a CharSequence wrapping the same
// characters must produce equal results with equal hash codes.
property("string/charSequence parsing") = forAll { (value: JValue) =>
  val rendered = CanonicalRenderer.render(value)
  val fromString = JParser.parseFromString(rendered)
  val fromChars = JParser.parseFromCharSequence(java.nio.CharBuffer.wrap(rendered.toCharArray))
  val sameValue = fromString == fromChars
  // Hash codes are only compared once the values are known to be equal.
  Prop(sameValue && fromString.## == fromChars.##)
}

import AsyncParser.SingleValue

property("async parsing") = forAll { (v: JValue) =>
Expand Down
11 changes: 10 additions & 1 deletion ast/jvm/src/test/scala/jawn/ast/AstTestPlatform.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@ package org.typelevel.jawn
package ast

import org.scalacheck.Prop
import Prop.{forAll, forAllNoShrink}

import Prop.forAll
import scala.util.Try

import ArbitraryUtil.expNotationNums

private[jawn] trait AstTestPlatform { self: AstTest =>

Expand All @@ -37,4 +40,10 @@ private[jawn] trait AstTestPlatform { self: AstTest =>
)
}

// One property per literal: getDouble must agree with asDouble (wrapped in Try), and asDouble
// must equal the Double paired with the literal.
expNotationNums.foreach { case (literal, expected) =>
  property(s".asDouble $literal") = Prop {
    val viaGet = JParser.parseUnsafe(literal).getDouble
    val viaAs = Try(JParser.parseUnsafe(literal).asDouble).toOption
    viaGet == viaAs && JParser.parseUnsafe(literal).asDouble == expected
  }
}
}
17 changes: 16 additions & 1 deletion ast/native/src/test/scala/jawn/ast/AstCheckPlatform.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,19 @@
package org.typelevel.jawn
package ast

private[jawn] trait AstCheckPlatform
import org.typelevel.jawn.ast.ArbitraryUtil.arbitraryJValue

import org.scalacheck.Prop
import org.scalacheck.Prop.forAll

/** Platform-specific checks; the self-type requires this trait to be mixed into `AstCheck`. */
private[jawn] trait AstCheckPlatform { self: AstCheck =>

  // Rendering/parsing numbers on JS isn't always idempotent
  // NOTE(review): this comment mentions JS, but the file sits under ast/native — confirm it is intended.

  // Parsing the canonical rendering from a String and from a CharSequence wrapping the same
  // characters must produce equal results with equal hash codes.
  property("string/charSequence parsing") = forAll { (value: JValue) =>
    val rendered = CanonicalRenderer.render(value)
    val fromString = JParser.parseFromString(rendered)
    val fromChars = JParser.parseFromCharSequence(java.nio.CharBuffer.wrap(rendered.toCharArray))
    val sameValue = fromString == fromChars
    // Hash codes are only compared once the values are known to be equal.
    Prop(sameValue && fromString.## == fromChars.##)
  }
}
37 changes: 21 additions & 16 deletions ast/shared/src/main/scala/jawn/ast/JValue.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ package ast
import java.lang.Double.{isInfinite, isNaN}
import scala.collection.mutable
import scala.reflect.ClassTag
import scala.util.Try
import scala.util.hashing.MurmurHash3

class WrongValueException(e: String, g: String) extends Exception(s"expected $e, got $g")
Expand Down Expand Up @@ -223,28 +224,30 @@ case class DoubleNum(n: Double) extends JNum {

case class DeferLong(s: String) extends JNum {

lazy val n: Long = util.parseLongUnsafe(s)
lazy val nOpt: Option[Long] = Try(util.parseLong(s)).toOption
lazy val n: Long = nOpt.getOrElse(throw new InvalidNumException(s))

final override def getInt: Option[Int] = Some(n.toInt)
final override def getLong: Option[Long] = Some(n)
final override def getDouble: Option[Double] = Some(n.toDouble)
final override def getInt: Option[Int] = nOpt.map(_.toInt)
final override def getLong: Option[Long] = nOpt
final override def getDouble: Option[Double] = nOpt.map(_.toDouble)
final override def getBigInt: Option[BigInt] = Some(BigInt(s))
final override def getBigDecimal: Option[BigDecimal] = Some(BigDecimal(s))

final override def asInt: Int = n.toInt
final override def asLong: Long = n
final override def asDouble: Double = n.toDouble
final override def asInt: Int = nOpt.map(_.toInt).getOrElse(throw new InvalidNumException(s))
final override def asLong: Long = nOpt.getOrElse(throw new InvalidNumException(s))
final override def asDouble: Double = nOpt.map(_.toDouble).getOrElse(throw new InvalidNumException(s))
final override def asBigInt: BigInt = BigInt(s)
final override def asBigDecimal: BigDecimal = BigDecimal(s)

final override def hashCode: Int = n.##
final override def hashCode: Int = if (nOpt.isEmpty) s.## else nOpt.get.##

final override def equals(that: Any): Boolean =
that match {
case LongNum(n2) => n == n2
case DoubleNum(n2) => JNum.hybridEq(n, n2)
case jn: DeferLong => n == jn.asLong
case jn: DeferNum => JNum.hybridEq(n, jn.asDouble)
(nOpt, that) match {
case (None, _) => false
case (Some(n), LongNum(n2)) => n == n2
case (Some(n), DoubleNum(n2)) => JNum.hybridEq(n, n2)
case (Some(n), jn: DeferLong) => n == jn.asLong
case (Some(n), jn: DeferNum) => JNum.hybridEq(n, jn.asDouble)
case _ => false
}
}
Expand All @@ -254,13 +257,13 @@ case class DeferNum(s: String) extends JNum {
lazy val n: Double = java.lang.Double.parseDouble(s)

final override def getInt: Option[Int] = Some(n.toInt)
final override def getLong: Option[Long] = Some(util.parseLongUnsafe(s))
final override def getLong: Option[Long] = Some(n.toLong)
Comment on lines 259 to +260
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's strange that both of these methods return Some when the JNum is not integral, but I guess that's probably behavior best left alone at this point.

Copy link
Author

@ilmirons ilmirons Feb 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I understood: are you suggesting to leave the code as it is? I made a small demo program and as it is it produces wrong results, unless I'm missing something.

// Scala 2.12.20
// Temurin 22.0.2
// jawn-util 1.6.0

package com.example

import org.typelevel.jawn.util.{parseLong, parseLongUnsafe}

object Main {

  /**
   * Runs `parseLongUnsafe` on number literals that contain a decimal point and/or an exponent
   * and prints each result next to the Double value paired with the literal.
   *
   * The trailing comment on every entry is the output line recorded for that entry.
   */
  def main(args: Array[String]): Unit = {
    // Local import keeps the snippet self-contained.
    import scala.util.control.NonFatal

    List(
      ("2.0", 2.0), // parseLongUnsafe(2.0) = 180 (expected 2.0)
      ("2.5", 2.5), // parseLongUnsafe(2.5) = 185 (expected 2.5)
      ("2e3", 2e3), // parseLongUnsafe(2e3) = 733 (expected 2000.0)
      ("2.5e0", 2.5e0), // parseLongUnsafe(2.5e0) = 19030 (expected 2.5)
      ("2e+3", 2e+3), // parseLongUnsafe(2e+3) = 7253 (expected 2000.0)
      ("2.5e-1", 2.5e-1), // parseLongUnsafe(2.5e-1) = 190271 (expected 0.25)
      ("9.223372036854776e18", 9.223372036854776e18), // parseLongUnsafe(9.223372036854776e18) = -4010348331692976762 (expected 9.223372036854776E18)
      ("-9.223372036854776e+18", -9.223372036854776e18) // parseLongUnsafe(-9.223372036854776e+18) = 3209995169510665050 (expected -9.223372036854776E18)
    ).foreach { case (input, expected) =>
      try println(s"parseLongUnsafe($input) = ${parseLongUnsafe(input)} (expected $expected)")
      catch { // when switching to parseLong everything falls here
        // NonFatal rather than Throwable, so fatal VM errors are not swallowed by the demo.
        case NonFatal(e) => println(s"parseLongUnsafe($input) = $e")
      }
    }
  }
}

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PS: digging down the rabbit hole, it seems that the use of parseLongUnsafe in DeferLong is not correct either, as there is no check at the parser level that the parsed digit string represents a number in the Long range, just that there is no . or e.

// Parser.scala lines 166-176

if (c == '-') {
      j += 1
      c = at(j)
    }
    if (c == '0') {
      j += 1
      c = at(j)
    } else if ('1' <= c && c <= '9')
      while ({ j += 1; c = at(j); '0' <= c && c <= '9' }) () // can easily pass digit strings that are out of Long range
    else
      die(i, "expected digit")

// No further checks in the lines below

After the string is passed to jnum in JawnFacade (see code in comment above)
And in DeferLong

// JValue.scala, lines 224-226

case class DeferLong(s: String) extends JNum {

  lazy val n: Long = util.parseLongUnsafe(s) // should be parseLong

So this too can produce wrong results for numbers outside the Long range (which are valid JSON anyway, as everything is supposed to be encoded as a double). This also hints at the question of who is in charge of semantic checks: IMHO it should be the Facade, but the comment on parseLongUnsafe seems to suggest the contrary:

/**
   * Parse the given character sequence as a single Long value (64-bit signed integer) in decimal (base-10).
   *
   * For valid inputs, this method produces the same values as `parseLong`. However, by avoiding input validation it is
   * up to 50% faster.
   *
   * For inputs which `parseLong` throws an error on, `parseLongUnsafe` may (or may not) throw an error, or return a
   * bogus value. This method makes no guarantees about how it handles invalid input.
   *
   * This method should only be used on sequences which have already been parsed (e.g. by a Jawn parser). When in doubt,
   * use `parseLong(cs)`, which is still significantly faster than `java.lang.Long.parseLong(cs.toString)`.
   */
  def parseLongUnsafe(cs: CharSequence): Long = {
// ...

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to fix this too, but somehow we lose idempotency on JS CharSequence/String parsing. I was unable to investigate how this happens, but it only happens on JS. Would it be a good idea to rewire CharSequence parsing to String parsing in the JS implementation? At the end of the day, we do not have a CharSequence type in JS.

final override def getDouble: Option[Double] = Some(n)
final override def getBigInt: Option[BigInt] = Some(BigDecimal(s).toBigInt)
final override def getBigDecimal: Option[BigDecimal] = Some(BigDecimal(s))

final override def asInt: Int = n.toInt
final override def asLong: Long = util.parseLongUnsafe(s)
final override def asLong: Long = n.toLong
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checking my understanding: right now it sometimes returns an incorrect value, and you want it to throw? Changing behavior to start throwing scares me, particularly in a method not documented to throw. But silently returning bad values isn't great, either. And I guess it would align the behavior more closely with .asInt.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a util.parseLong(s), which isn't as fast as util.parseLongUnsafe(s), but about 30% faster than .toLong and does throw on invalid input. Maybe that would be a better implementation?

Copy link
Author

@ilmirons ilmirons Feb 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I would like it not to throw but to produce the correct result.

Looking at both parseLong and parseLongUnsafe I see no specific handling for exponent and also as stated in the scaladoc of the former

Stated more precisely, accepted values:

  - conform to the pattern: -?(0|([1-9][0-9]*))
  - are within [-9223372036854775808, 9223372036854775807]

So, by design, these functions cannot handle 2e3 or any other number in scientific notation or with a '.', yet these kinds of strings are still passed to DeferNum.
I see 3 solutions to this:

  1. Just use .toLong. Simple, not so performant, but correct. Also the performance loss would be only when accessing the Long value of a Double through JValue AST, not during parsing
  2. Use parseLong, which you mentioned: according to the code in JawnFacade it will throw on any input, as the strings passed to DeferNum have either a '.' or an 'e'
// from JawnFacade

final def jnum(s: CharSequence, decIndex: Int, expIndex: Int): JValue =
    if (decIndex == -1 && expIndex == -1)
      DeferLong(s.toString)
    else
      DeferNum(s.toString)

So basically we would fall in (non-)solution 3
3. State clearly that a number in scientific notation or with a decimal point can only be a Double. It's up to the developer to know how the field they are reading was written. Getting a Double as a Long is an error, hence throw immediately or return Option.empty when trying to get a Long from DeferNum. This is the (non-)solution I like the least, but it has zero impact on performance and we are talking about a very rare case. That is to say: we choose not to support Longs in scientific notation.

Edit: I initially thought it was just scientific notation and that adding a flag could solve the problem, but looking at the code again I realized that even a decimal point is not handled by parseLongUnsafe/parseLong.

final override def asDouble: Double = n
final override def asBigInt: BigInt = BigDecimal(s).toBigInt
final override def asBigDecimal: BigDecimal = BigDecimal(s)
Expand All @@ -271,7 +274,9 @@ case class DeferNum(s: String) extends JNum {
that match {
case LongNum(n2) => JNum.hybridEq(n2, n)
case DoubleNum(n2) => n == n2
case jn: DeferLong => JNum.hybridEq(jn.asLong, n)
case jn: DeferLong =>
try JNum.hybridEq(jn.asLong, n)
catch { case _: InvalidNumException => false }
case jn: DeferNum => n == jn.asDouble
case _ => false
}
Expand Down
11 changes: 11 additions & 0 deletions ast/shared/src/test/scala/jawn/ArbitraryUtil.scala
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,15 @@ object ArbitraryUtil {

implicit lazy val arbitraryJValue: Arbitrary[JValue] =
Arbitrary(jvalue())

// Valid JSON number literals written with an exponent and/or a fraction, each paired with the
// Double it denotes. Not every entry is integral (2.5e0, 2.5e-1), and 9.223372036854776e18 is
// 2^63, i.e. one above Long.MaxValue, so Long/Int conversions of that entry cannot round-trip.
// Deliberately not implicit: a List[(String, Double)] should not enter implicit scope through
// `import ArbitraryUtil._`.
lazy val expNotationNums: List[(String, Double)] = List(
  ("2e3", 2e3),
  ("2.5e0", 2.5e0),
  ("2e+3", 2e+3),
  ("2.5e-1", 2.5e-1),
  ("9.223372036854776e18", 9.223372036854776e18),
  ("-9.223372036854776e+18", -9.223372036854776e18)
)
}
8 changes: 0 additions & 8 deletions ast/shared/src/test/scala/jawn/AstCheck.scala
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,6 @@ class AstCheck extends Properties("AstCheck") with AstCheckPlatform {
)
}

// Parsing the canonical rendering from a String and from a CharSequence wrapping the same
// characters must produce equal results with equal hash codes.
property("string/charSequence parsing") = forAll { (value: JValue) =>
  val rendered = CanonicalRenderer.render(value)
  val fromString = JParser.parseFromString(rendered)
  val fromChars = JParser.parseFromCharSequence(java.nio.CharBuffer.wrap(rendered.toCharArray))
  val sameValue = fromString == fromChars
  // Hash codes are only compared once the values are known to be equal.
  Prop(sameValue && fromString.## == fromChars.##)
}

implicit val facade: Facade[JValue] = JawnFacade

val percs = List(0.0, 0.2, 0.4, 0.8, 1.0)
Expand Down
30 changes: 29 additions & 1 deletion ast/shared/src/test/scala/jawn/AstTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ package org.typelevel.jawn
package ast

import org.scalacheck.{Prop, Properties}
import scala.util.{Success, Try}

import scala.util.{Success, Try}
import ArbitraryUtil._
import Prop.forAll

Expand Down Expand Up @@ -62,20 +62,41 @@ class AstTest extends Properties("AstTest") with AstTestPlatform {
)
}

// One property per literal: getInt must agree with asInt (wrapped in Try), and asInt must
// equal the Int conversion of the Double paired with the literal.
expNotationNums.foreach { case (literal, expected) =>
  property(s".asInt $literal") = Prop {
    val viaGet = JParser.parseUnsafe(literal).getInt
    val viaAs = Try(JParser.parseUnsafe(literal).asInt).toOption
    viaGet == viaAs && JParser.parseUnsafe(literal).asInt == expected.intValue()
  }
}

// getLong must return the original Long both for a JNum built directly from it and for the
// value parsed from its decimal string.
property(".getLong") = forAll { (n: Long) =>
  val wanted: Option[Long] = Some(n)
  val direct = JNum(n).getLong == wanted
  Prop(direct && JParser.parseUnsafe(n.toString).getLong == wanted)
}

// One property per literal: getLong must agree with asLong (wrapped in Try), and asLong must
// equal the Long conversion of the Double paired with the literal.
expNotationNums.foreach { case (literal, expected) =>
  property(s".asLong $literal") = Prop {
    val viaGet = JParser.parseUnsafe(literal).getLong
    val viaAs = Try(JParser.parseUnsafe(literal).asLong).toOption
    viaGet == viaAs && JParser.parseUnsafe(literal).asLong == expected.longValue()
  }
}

// getBigInt must return the original BigInt both for a JNum built from its string form and
// for the value parsed from that same string.
property(".getBigInt") = forAll { (n: BigInt) =>
  val wanted: Option[BigInt] = Some(n)
  val direct = JNum(n.toString).getBigInt == wanted
  Prop(direct && JParser.parseUnsafe(n.toString).getBigInt == wanted)
}

// One property per literal: getBigInt must agree with asBigInt (wrapped in Try), and asBigInt
// must equal the BigInt obtained from the Double paired with the literal.
expNotationNums.foreach { case (literal, expected) =>
  property(s".asBigInt $literal") = Prop {
    val viaGet = JParser.parseUnsafe(literal).getBigInt
    val viaAs = Try(JParser.parseUnsafe(literal).asBigInt).toOption
    viaGet == viaAs && JParser.parseUnsafe(literal).asBigInt == BigDecimal(expected).toBigInt
  }
}

property(".getBigDecimal") = forAll { (n: BigDecimal) =>
if (Try(BigDecimal(n.toString)) == Success(n))
Prop(
Expand All @@ -85,4 +106,11 @@ class AstTest extends Properties("AstTest") with AstTestPlatform {
else
Prop(true)
}

// One property per literal: getBigDecimal must agree with asBigDecimal (wrapped in Try), and
// asBigDecimal must equal the BigDecimal built from the Double paired with the literal.
expNotationNums.foreach { case (literal, expected) =>
  property(s".asBigDecimal $literal") = Prop {
    val viaGet = JParser.parseUnsafe(literal).getBigDecimal
    val viaAs = Try(JParser.parseUnsafe(literal).asBigDecimal).toOption
    viaGet == viaAs && JParser.parseUnsafe(literal).asBigDecimal == BigDecimal(expected)
  }
}
}