Skip to content

Commit

Permalink
Populate Data-Flow Cache (#222)
Browse files Browse the repository at this point in the history
* Unchanged methods no longer have REACHABLE_BY edges regenerated/duplicated

* Saving cache to blob and deserializing working

* Dataflow context usable via cmd call

* Removes expired paths from cache

* Added to changelog

* Documented code

* Fixed import

* Updated semantics

* Test for when default semantics are inaccessible

* Testing re-use of results

* Testing re-use of results with time measurements

* Able to work everything in JSON

* Using compressed JSON

* Reduced T to manifest
  • Loading branch information
DavidBakerEffendi authored Feb 7, 2022
1 parent 7f5c7ee commit 42c4af7
Show file tree
Hide file tree
Showing 15 changed files with 527 additions and 71 deletions.
7 changes: 0 additions & 7 deletions .deepsource.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,3 @@ version = 1
[[analyzers]]
name = "scala"
enabled = true

[[analyzers]]
name = "java"
enabled = true

[analyzers.meta]
runtime_version = "11"
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/.gradle/
/build/
/bin/
*.bin

# Ignore Gradle GUI config
gradle-app.setting
Expand Down
13 changes: 12 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).

## [Unreleased] - 2022-02-02

### Added

- Data-flow paths are saved to a blob and are re-used on future runs. Only available on `OverflowDbDriver`.

### Fixed

- Unchanged methods no longer have REACHABLE_BY edges regenerated/duplicated

## [1.0.4] - 2022-01-25

### Added
Expand All @@ -14,7 +24,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/).

### Fixed

- Fixed access path issue where array index accesses were reported to be invalid ASTs. This was just a change in AST children's `order` from `(0, 1)` to `(1, 2)`
- Fixed access path issue where array index accesses were reported to be invalid ASTs. This was just a change in AST
children's `order` from `(0, 1)` to `(1, 2)`
- Fixed bug where if a single file was specified then all files in the directory were loaded

### Changed
Expand Down
44 changes: 23 additions & 21 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ inThisBuild(
)

val cpgVersion = "1.3.493"
val joernVersion = "1.1.488"
val joernVersion = "1.1.502"
val sootVersion = "4.2.1"
val tinkerGraphVersion = "3.4.8"
val neo4jVersion = "4.4.2"
val neo4jVersion = "4.4.3"
val tigerGraphVersion = "3.1.0"
val sttpVersion = "3.3.17"
val sttpVersion = "3.4.1"
val scalajHttpVersion = "2.4.2"
val lz4Version = "1.8.0"
val slf4jVersion = "1.7.33"
val slf4jVersion = "1.7.35"
val scalatestVersion = "3.2.9"
val circeVersion = "0.14.1"

Expand All @@ -36,23 +36,25 @@ trapExit := false
Test / fork := true

libraryDependencies ++= Seq(
"io.shiftleft" %% "codepropertygraph" % cpgVersion,
"io.shiftleft" %% "semanticcpg" % cpgVersion,
"io.joern" %% "dataflowengineoss" % joernVersion,
"io.shiftleft" %% "semanticcpg" % cpgVersion % Test classifier "tests",
"org.soot-oss" % "soot" % sootVersion,
"org.apache.tinkerpop" % "tinkergraph-gremlin" % tinkerGraphVersion,
"org.apache.tinkerpop" % "gremlin-driver" % tinkerGraphVersion,
"org.neo4j.driver" % "neo4j-java-driver" % neo4jVersion,
"com.tigergraph.client" % "gsql_client" % tigerGraphVersion,
"com.softwaremill.sttp.client3" %% "core" % sttpVersion,
"com.softwaremill.sttp.client3" %% "circe" % sttpVersion,
"org.scalaj" % "scalaj-http_2.13" % scalajHttpVersion,
"org.lz4" % "lz4-java" % lz4Version,
"org.slf4j" % "slf4j-api" % slf4jVersion,
"org.slf4j" % "slf4j-simple" % slf4jVersion,
"org.scala-lang" % "scala-reflect" % scalaVersion.value,
"org.scalatest" %% "scalatest" % scalatestVersion % Test
"io.shiftleft" %% "codepropertygraph" % cpgVersion,
"io.shiftleft" %% "semanticcpg" % cpgVersion,
"io.joern" %% "dataflowengineoss" % joernVersion,
"io.shiftleft" %% "semanticcpg" % cpgVersion % Test classifier "tests",
"org.soot-oss" % "soot" % sootVersion,
"org.apache.tinkerpop" % "tinkergraph-gremlin" % tinkerGraphVersion,
"org.apache.tinkerpop" % "gremlin-driver" % tinkerGraphVersion,
"org.neo4j.driver" % "neo4j-java-driver" % neo4jVersion,
"com.tigergraph.client" % "gsql_client" % tigerGraphVersion,
"com.softwaremill.sttp.client3" %% "core" % sttpVersion,
"com.softwaremill.sttp.client3" %% "circe" % sttpVersion,
"com.fasterxml.jackson.core" % "jackson-databind" % "2.13.1",
"com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.13.1",
"org.scalaj" % "scalaj-http_2.13" % scalajHttpVersion,
"org.lz4" % "lz4-java" % lz4Version,
"org.slf4j" % "slf4j-api" % slf4jVersion,
"org.slf4j" % "slf4j-simple" % slf4jVersion,
"org.scala-lang" % "scala-reflect" % scalaVersion.value,
"org.scalatest" %% "scalatest" % scalatestVersion % Test
) ++ Seq(
"io.circe" %% "circe-core",
"io.circe" %% "circe-generic",
Expand Down
1 change: 1 addition & 0 deletions driver.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# storageLocation: cpg.odb
# heapPercentageThreshold: 80
# serializationStatsEnabled: false
# maxCallDepth: 2

database: TinkerGraph
params:
Expand Down
5 changes: 2 additions & 3 deletions src/main/resources/default.semantics
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"<operator>.indexAccess" 1->-1
"<operator>.indirectComputedMemberAccess" 1->-1
"<operator>.indirectFieldAccess" 1->-1
"<operator>.indirectIndexAccess" 1->-1
"<operator>.indirectIndexAccess" 1->-1 2->-1
"<operator>.indirectMemberAccess" 1->-1
"<operator>.indirection" 1->-1
"<operator>.memberAccess" 1->-1
Expand All @@ -32,5 +32,4 @@
"<operator>.postIncrement" 1->1
"<operator>.preDecrement" 1->1
"<operator>.preIncrement" 1->1
"<operator>.sizeOf"
"<operator>.lengthOf"
"<operator>.sizeOf"
6 changes: 5 additions & 1 deletion src/main/scala/com/github/plume/oss/Jimple2Cpg.scala
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,12 @@ class Jimple2Cpg {

basePasses(cpg, driver, unchangedTypes, unchangedNamespaces).foreach(_.createAndApply(driver))
controlFlowPasses(cpg).foreach(_.createAndApply(driver))
new PlumeReachingDefPass(cpg).createAndApply(driver)
new PlumeReachingDefPass(cpg, unchangedTypes = unchangedTypes).createAndApply(driver)
new PlumeHashPass(cpg).createAndApply(driver)
driver match {
case x: OverflowDbDriver => x.removeExpiredPathsFromCache(unchangedTypes)
case _ =>
}

driver.buildInterproceduralEdges()
cpg
Expand Down
4 changes: 3 additions & 1 deletion src/main/scala/com/github/plume/oss/Plume.scala
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@ object Plume extends App {
private def createDriver(conf: DriverConfig): IDriver = {
conf match {
case _ if conf.database == "OverflowDB" =>
new OverflowDbDriver(
val d = new OverflowDbDriver(
storageLocation = Option(conf.params.getOrElse("storageLocation", "cpg.odb")),
heapPercentageThreshold = conf.params.getOrElse("heapPercentageThreshold", "80").toInt,
serializationStatsEnabled =
conf.params.getOrElse("serializationStatsEnabled", "false").toBoolean
)
d.setDataflowContext(conf.params.getOrElse("maxCallDepth", "2").toInt)
d
case _ if conf.database == "TinkerGraph" => new TinkerGraphDriver()
case _ if conf.database == "Neo4j" =>
new Neo4jDriver(
Expand Down
231 changes: 231 additions & 0 deletions src/main/scala/com/github/plume/oss/domain/package.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
package com.github.plume.oss

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.{DefaultScalaModule, ScalaObjectMapper}
import io.joern.dataflowengineoss.queryengine.{PathElement, ReachableByResult, ResultTable}
import io.shiftleft.codepropertygraph.generated.Cpg
import io.shiftleft.codepropertygraph.generated.nodes.{Call, CfgNode, StoredNode}
import org.apache.commons.codec.binary.Base64
import org.apache.commons.io.IOUtils
import org.apache.commons.io.output.ByteArrayOutputStream
import org.slf4j.LoggerFactory

import java.io.ByteArrayInputStream
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Path}
import java.util.concurrent.ConcurrentHashMap
import java.util.zip.{GZIPInputStream, GZIPOutputStream}
import scala.io.Source
import scala.jdk.CollectionConverters
import scala.util.Using

/** Contains case classes that can be used independently.
*/
package object domain {

private val logger = LoggerFactory.getLogger("com.github.plume.oss.domain")
private val objectMapper = new ObjectMapper() with ScalaObjectMapper
objectMapper.registerModule(DefaultScalaModule)

/** Serializes a given object as JSON and then returns the GZIP compressed base 64 encoded string.
  * @param o object to serialize.
  * @return the GZIP compressed base 64 encoded string. If serialization yields a null or
  *         empty string, that value is returned unchanged.
  */
def compress(o: Any): String = {
  val str = objectMapper.writeValueAsString(o)
  if (str == null || str.isEmpty) return str
  val out = new ByteArrayOutputStream()
  Using.resource(new GZIPOutputStream(out)) { gzip =>
    // Encode explicitly as UTF-8: decompress() always decodes UTF-8, whereas the
    // no-argument getBytes uses the platform default charset and would corrupt
    // non-ASCII payloads on round-trip on non-UTF-8 platforms.
    gzip.write(str.getBytes(StandardCharsets.UTF_8))
  }
  Base64.encodeBase64String(out.toByteArray)
}

/** Serializes and GZIP-compresses the given object via [[compress]], writing the
  * resulting base 64 string to the file at the given path.
  * @param o object to serialize.
  * @param p path to write serialized data to.
  */
def compressToFile(o: Any, p: Path): Unit =
  Files.write(p, compress(o).getBytes(StandardCharsets.UTF_8))

/** Decodes the given base 64 string and GZIP-decompresses the resulting bytes back
  * into the original JSON string. Inverse of [[compress]].
  * @param deflatedTxt the GZIP compressed base 64 encoded string.
  * @return the decompressed JSON string.
  */
def decompress(deflatedTxt: String): String = {
  val compressedBytes = Base64.decodeBase64(deflatedTxt)
  val gzipIn = new GZIPInputStream(new ByteArrayInputStream(compressedBytes))
  Using.resource(gzipIn)(IOUtils.toString(_, "UTF-8"))
}

/** Reads the file at the given path and deserializes its GZIP compressed base 64
  * contents via [[decompress]].
  * @param p path to read serialized data from.
  * @tparam T the type of the class to deserialize.
  * @return the deserialized object.
  */
def decompressFile[T: Manifest](p: Path): T =
  Using.resource(Source.fromFile(p.toFile)) { src =>
    objectMapper.readValue[T](decompress(src.mkString))
  }

/** Converts serialized path results to deserialized ReachableByResults. This is assumed to be called before any nodes
 * are removed from the graph since these results were serialized.
 *
 * @param serTab serialized raw results.
 * @param cpg the code property graph the serialized node IDs are resolved against.
 * @return the deserialized ReachableByResults table, or None if any referenced node is
 *         missing from the graph or deserialization fails for any other reason.
 */
def deserializeResultTable(
serTab: ConcurrentHashMap[Long, Vector[SerialReachableByResult]],
cpg: Cpg
): Option[ResultTable] = {
val resultTable = new ResultTable()

try {
CollectionConverters
.MapHasAsScala(serTab)
.asScala
.map { case (id, vec) =>
// A missing node means the cache refers to a graph that has since changed;
// abort via exception so the whole cache is discarded rather than partially applied.
if (!cpg.graph.nodes(id).hasNext)
throw new RuntimeException(
"""Current database does not contain references to previous ReachableByResults cache. Unable to re-use
|old cache.""".stripMargin
)
(
cpg.graph.nodes(id).next(),
vec.map { f: SerialReachableByResult =>
SerialReachableByResult.unapply(f, cpg, resultTable)
}
)
}
.foreach { case (k, v) => resultTable.table.put(k.asInstanceOf[StoredNode], v) }
Some(resultTable)
} catch {
// Expected failure mode: stale cache (thrown above) — log at warn only.
case e: RuntimeException =>
logger.warn(e.getMessage)
None
// Anything else is unexpected (e.g. malformed serialized data) — log with stack trace.
case e: Exception =>
logger.error("Unable to deserialize results table.", e)
None
}
}

/** A serializable version of ReachableByResult.
 * @param path a path of nodes represented by [[SerialPathElement]]s.
 * @param callSite the ID of the call site that was expanded to kick off the task. We require this to match call
 *                 sites to exclude non-realizable paths through other callers.
 * @param callDepth the call depth of this result.
 * @param partial indicate whether this result stands on its own or requires further analysis, e.g., by expanding
 * output arguments backwards into method output parameters.
 */
final case class SerialReachableByResult(
path: Vector[SerialPathElement],
callSite: Option[Long],
callDepth: Int = 0,
partial: Boolean = false
)

/** Conversion helpers between ReachableByResult and its serializable form
  * [[SerialReachableByResult]].
  */
object SerialReachableByResult {

  private val logger = LoggerFactory.getLogger(classOf[SerialReachableByResult])

  /** Creates a serializable version of ReachableByResult.
    * @param rbr the ReachableByResult class.
    * @return a serializable ReachableByResult.
    */
  def apply(
      rbr: ReachableByResult
  ): SerialReachableByResult = {
    new SerialReachableByResult(
      rbr.path.map(SerialPathElement.apply),
      // Only the node ID is stored; it is resolved back to a Call node in unapply.
      rbr.callSite.map(_.id()),
      rbr.callDepth,
      rbr.partial
    )
  }

  /** Deserializes a given [[SerialReachableByResult]].
    * @param srb the serial ReachableByResult class.
    * @param cpg the code property graph pointer.
    * @param table a pointer to the global serializable result table.
    * @return a deserialized ReachableByResult.
    */
  def unapply(srb: SerialReachableByResult, cpg: Cpg, table: ResultTable): ReachableByResult = {
    // Resolve the stored call-site ID back to a Call node; if the referenced node is no
    // longer a Call, warn and drop the call site rather than failing the whole result.
    val callSite = srb.callSite.flatMap { id =>
      cpg.graph.nodes(id).next() match {
        case node: Call => Some(node)
        case n =>
          // Original message said "serialize" but this is the deserialization path.
          logger.warn(s"Unable to deserialize call node ${n.getClass}.")
          None
      }
    }
    ReachableByResult(
      srb.path.map { sbr => SerialPathElement.unapply(sbr, cpg) },
      table,
      callSite,
      srb.callDepth,
      srb.partial
    )
  }
}

/** A serializable version of PathElement.
 * @param nodeId the ID of the node this path element represents.
 * @param visible whether this path element should be shown in the flow.
 * @param resolved whether we have resolved the method call this argument belongs to.
 * @param outEdgeLabel label of the outgoing DDG edge; empty string when there is none.
 */
final case class SerialPathElement(
nodeId: Long,
visible: Boolean = true,
resolved: Boolean = true,
outEdgeLabel: String = ""
)

/** Conversion helpers between PathElement and its serializable form
  * [[SerialPathElement]].
  */
object SerialPathElement {

  /** Creates a [[SerialPathElement]] from a given PathElement.
    * @param pe the PathElement to serialize.
    * @return a serializable version of PathElement.
    */
  def apply(pe: PathElement): SerialPathElement =
    new SerialPathElement(
      nodeId = pe.node.id(),
      visible = pe.visible,
      resolved = pe.resolved,
      outEdgeLabel = pe.outEdgeLabel
    )

  /** Deserializes the given [[SerialPathElement]].
    * @param spe the serializable version of the representative PathElement.
    * @param cpg the code property graph pointer.
    * @return the deserialized PathElement.
    */
  def unapply(spe: SerialPathElement, cpg: Cpg): PathElement = {
    val node = cpg.graph.nodes(spe.nodeId).next().asInstanceOf[CfgNode]
    PathElement(node, spe.visible, spe.resolved, spe.outEdgeLabel)
  }
}
}
Loading

0 comments on commit 42c4af7

Please sign in to comment.