From f76033c8d2a3d7b093a6e234b294749b875ba447 Mon Sep 17 00:00:00 2001 From: Johannes Coetzee Date: Mon, 13 Jan 2025 10:55:40 +0100 Subject: [PATCH 1/3] [php2cpg] Try multiple charsets when reading file content --- .../php2cpg/astcreation/AstCreator.scala | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/joern-cli/frontends/php2cpg/src/main/scala/io/joern/php2cpg/astcreation/AstCreator.scala b/joern-cli/frontends/php2cpg/src/main/scala/io/joern/php2cpg/astcreation/AstCreator.scala index 8892100efc75..b52996cd5755 100644 --- a/joern-cli/frontends/php2cpg/src/main/scala/io/joern/php2cpg/astcreation/AstCreator.scala +++ b/joern-cli/frontends/php2cpg/src/main/scala/io/joern/php2cpg/astcreation/AstCreator.scala @@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory import java.nio.charset.StandardCharsets import java.nio.file.{Files, Path} import scala.collection.mutable +import scala.util.{Success, Try} class AstCreator(relativeFileName: String, fileName: String, phpAst: PhpFile, disableFileContent: Boolean)(implicit withSchemaValidation: ValidationMode @@ -29,13 +30,33 @@ class AstCreator(relativeFileName: String, fileName: String, phpAst: PhpFile, di private val scope = new Scope()(() => nextClosureName()) private val tmpKeyPool = new IntervalKeyPool(first = 0, last = Long.MaxValue) private val globalNamespace = globalNamespaceBlock() - private var fileContent = Option.empty[String] + private val fileEncodings = List( + StandardCharsets.UTF_8, + StandardCharsets.UTF_16, + StandardCharsets.ISO_8859_1, + StandardCharsets.US_ASCII, + StandardCharsets.UTF_16LE, + StandardCharsets.UTF_16BE + ) + private var fileContent = Option.empty[String] private def getNewTmpName(prefix: String = "tmp"): String = s"$prefix${tmpKeyPool.next.toString}" override def createAst(): DiffGraphBuilder = { if (!disableFileContent) { fileContent = Some(Files.readString(Path.of(fileName))) + + fileContent = fileEncodings.iterator + .map { encoding => + Try(Files.readString(Path.of(fileName), encoding)) + } + .collectFirst { case Success(content) => + content + } + + if (fileContent.isEmpty) { + logger.warn(s"Could not parse file using any standard charsets. File content will be missing for $fileName") + } } val ast = astForPhpFile(phpAst) From c88451620daa965c5a70241deff17c02f8753f68 Mon Sep 17 00:00:00 2001 From: Johannes Coetzee Date: Mon, 13 Jan 2025 11:46:19 +0100 Subject: [PATCH 2/3] Use IOUtils to read file instead --- .../php2cpg/astcreation/AstCreator.scala | 28 +++---------------- .../joern/php2cpg/querying/MethodTests.scala | 2 +- 2 files changed, 5 insertions(+), 25 deletions(-) diff --git a/joern-cli/frontends/php2cpg/src/main/scala/io/joern/php2cpg/astcreation/AstCreator.scala b/joern-cli/frontends/php2cpg/src/main/scala/io/joern/php2cpg/astcreation/AstCreator.scala index b52996cd5755..c5ad8d6a9cb1 100644 --- a/joern-cli/frontends/php2cpg/src/main/scala/io/joern/php2cpg/astcreation/AstCreator.scala +++ b/joern-cli/frontends/php2cpg/src/main/scala/io/joern/php2cpg/astcreation/AstCreator.scala @@ -14,12 +14,12 @@ import io.joern.x2cpg.{Ast, AstCreatorBase, AstNodeBuilder, Defines, ValidationM import io.shiftleft.codepropertygraph.generated.* import io.shiftleft.codepropertygraph.generated.nodes.* import io.shiftleft.semanticcpg.language.types.structure.NamespaceTraversal +import io.shiftleft.utils.IOUtils import org.slf4j.LoggerFactory import java.nio.charset.StandardCharsets -import java.nio.file.{Files, Path} +import java.nio.file.Path import scala.collection.mutable -import scala.util.{Success, Try} class AstCreator(relativeFileName: String, fileName: String, phpAst: PhpFile, disableFileContent: Boolean)(implicit withSchemaValidation: ValidationMode @@ -30,33 +30,13 @@ class AstCreator(relativeFileName: String, fileName: String, phpAst: PhpFile, di private val scope = new Scope()(() => nextClosureName()) private val tmpKeyPool = new IntervalKeyPool(first = 0, last = Long.MaxValue) private val globalNamespace = globalNamespaceBlock() - private val fileEncodings = List( - StandardCharsets.UTF_8, - StandardCharsets.UTF_16, - StandardCharsets.ISO_8859_1, - StandardCharsets.US_ASCII, - StandardCharsets.UTF_16LE, - StandardCharsets.UTF_16BE - ) - private var fileContent = Option.empty[String] + private var fileContent = Option.empty[String] private def getNewTmpName(prefix: String = "tmp"): String = s"$prefix${tmpKeyPool.next.toString}" override def createAst(): DiffGraphBuilder = { if (!disableFileContent) { - fileContent = Some(Files.readString(Path.of(fileName))) - - fileContent = fileEncodings.iterator - .map { encoding => - Try(Files.readString(Path.of(fileName), encoding)) - } - .collectFirst { case Success(content) => - content - } - - if (fileContent.isEmpty) { - logger.warn(s"Could not parse file using any standard charsets. File content will be missing for $fileName") - } + fileContent = Option(IOUtils.readEntireFile(Path.of(fileName))) } val ast = astForPhpFile(phpAst) diff --git a/joern-cli/frontends/php2cpg/src/test/scala/io/joern/php2cpg/querying/MethodTests.scala b/joern-cli/frontends/php2cpg/src/test/scala/io/joern/php2cpg/querying/MethodTests.scala index ca9e187a132b..d442e8d73ebf 100644 --- a/joern-cli/frontends/php2cpg/src/test/scala/io/joern/php2cpg/querying/MethodTests.scala +++ b/joern-cli/frontends/php2cpg/src/test/scala/io/joern/php2cpg/querying/MethodTests.scala @@ -146,7 +146,7 @@ class MethodTests extends PhpCode2CpgFixture { fooMethod.file.head.content.substring(offsetStart, offsetEnd) shouldBe """function foo() { | // ⦝ - | $x = "🙂⨌🙂𐇐🙂🙂🙂🙂"; + | $x = "??⨌????????????"; |}""".stripMargin } } From 3a57833201eaddf1f0bb215a4ad51cfd78e6d48f Mon Sep 17 00:00:00 2001 From: Johannes Coetzee Date: Mon, 13 Jan 2025 16:49:45 +0100 Subject: [PATCH 3/3] Bump cpg and fix test expectations --- build.sbt | 2 +- .../src/test/scala/io/joern/php2cpg/querying/MethodTests.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 33915551a853..7b7cefe9e2fe 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ name := "joern" ThisBuild / organization := "io.joern" ThisBuild / scalaVersion := "3.5.2" -val cpgVersion = "1.7.19" +val cpgVersion = "1.7.21" lazy val joerncli = Projects.joerncli lazy val querydb = Projects.querydb diff --git a/joern-cli/frontends/php2cpg/src/test/scala/io/joern/php2cpg/querying/MethodTests.scala b/joern-cli/frontends/php2cpg/src/test/scala/io/joern/php2cpg/querying/MethodTests.scala index d442e8d73ebf..ca9e187a132b 100644 --- a/joern-cli/frontends/php2cpg/src/test/scala/io/joern/php2cpg/querying/MethodTests.scala +++ b/joern-cli/frontends/php2cpg/src/test/scala/io/joern/php2cpg/querying/MethodTests.scala @@ -146,7 +146,7 @@ class MethodTests extends PhpCode2CpgFixture { fooMethod.file.head.content.substring(offsetStart, offsetEnd) shouldBe """function foo() { | // ⦝ - | $x = "??⨌????????????"; + | $x = "🙂⨌🙂𐇐🙂🙂🙂🙂"; |}""".stripMargin } }