Skip to content

Commit

Permalink
[php2cpg] Try multiple charsets when reading file content
Browse files Browse the repository at this point in the history
  • Loading branch information
johannescoetzee committed Jan 13, 2025
1 parent d427219 commit b2a3ecb
Showing 1 changed file with 22 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Path}
import scala.collection.mutable
import scala.util.{Success, Try}

class AstCreator(relativeFileName: String, fileName: String, phpAst: PhpFile, disableFileContent: Boolean)(implicit
withSchemaValidation: ValidationMode
Expand All @@ -29,13 +30,33 @@ class AstCreator(relativeFileName: String, fileName: String, phpAst: PhpFile, di
// Scope instance for this file; it is handed a callback that delegates to nextClosureName().
private val scope = new Scope()(() => nextClosureName())
// Key pool covering 0 to Long.MaxValue; getNewTmpName draws the suffix of each temporary name from it.
private val tmpKeyPool = new IntervalKeyPool(first = 0, last = Long.MaxValue)
// Result of globalNamespaceBlock(), evaluated once when the creator is constructed.
private val globalNamespace = globalNamespaceBlock()
private var fileContent = Option.empty[String]
// Charsets tried, in order, when reading the source file; the first one that decodes without an
// error wins. Only these three entries can ever be selected:
//   - UTF_8 covers UTF-8 and, being a superset of it, plain ASCII. A separate US_ASCII attempt
//     could never succeed where UTF_8 has already failed.
//   - UTF_16 covers UTF-16 with a byte-order mark in either byte order and assumes big-endian
//     when no mark is present.
//   - ISO_8859_1 maps every possible byte to a character, so decoding with it never fails. It
//     must therefore stay last: any charset listed after it would be unreachable.
// NOTE(review): UTF_16 accepts many even-length byte sequences, so a Latin-1 file of even length
// may be decoded as UTF-16 rather than falling through to ISO_8859_1. Distinguishing the two
// needs a byte-order-mark check at the read site - confirm whether this matters for real inputs.
private val fileEncodings = List(
  StandardCharsets.UTF_8,
  StandardCharsets.UTF_16,
  StandardCharsets.ISO_8859_1
)
// Text of the source file, if it has been read. Starts empty and is assigned in createAst()
// only when disableFileContent is false.
private var fileContent = Option.empty[String]

/** Builds a fresh temporary name by appending the next key from `tmpKeyPool` to `prefix`.
  *
  * @param prefix
  *   text placed in front of the generated key; defaults to "tmp"
  * @return
  *   the prefix followed by the string form of the next key
  */
private def getNewTmpName(prefix: String = "tmp"): String = {
  val keySuffix = tmpKeyPool.next.toString
  s"$prefix$keySuffix"
}

override def createAst(): DiffGraphBuilder = {
if (!disableFileContent) {
fileContent = Some(Files.readString(Path.of(fileName)))

fileContent = fileEncodings.iterator
.map { encoding =>
Try(Files.readString(Path.of(fileName), encoding))
}
.collectFirst { case Success(content) =>
content
}

if (fileContent.isEmpty) {
logger.warn(s"Could not parse file using any standard charsets. File content will be missing for $fileName")
}
}

val ast = astForPhpFile(phpAst)
Expand Down

0 comments on commit b2a3ecb

Please sign in to comment.