Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ruby] Download builtin package dependencies #4473

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion joern-cli/frontends/rubysrc2cpg/build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ libraryDependencies ++= Seq(
"io.shiftleft" %% "codepropertygraph" % Versions.cpg,
"org.apache.commons" % "commons-compress" % "1.26.1", // For unpacking Gems with `--download-dependencies`
"org.scalatest" %% "scalatest" % Versions.scalatest % Test,
"org.antlr" % "antlr4-runtime" % Versions.antlr
"org.antlr" % "antlr4-runtime" % Versions.antlr,
"net.ruippeixotog" %% "scala-scraper" % "3.1.1",
)

enablePlugins(JavaAppPackaging, LauncherJarPlugin, Antlr4Plugin)
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ import scopt.OParser
final case class Config(
antlrCacheMemLimit: Double = 0.6d,
useDeprecatedFrontend: Boolean = false,
downloadDependencies: Boolean = false
downloadDependencies: Boolean = false,
downloadBuiltinPackages: Boolean = false
) extends X2CpgConfig[Config]
with DependencyDownloadConfig[Config]
with TypeRecoveryParserConfig[Config] {
Expand All @@ -25,6 +26,10 @@ final case class Config(
copy(useDeprecatedFrontend = value).withInheritedFields(this)
}

/** Returns a copy of this configuration whose `downloadBuiltinPackages` flag is set to `value`. The copy is passed
  * through `withInheritedFields(this)` before it is returned.
  */
def withDownloadBuiltinPackages(value: Boolean): Config = {
  val updated = copy(downloadBuiltinPackages = value)
  updated.withInheritedFields(this)
}

/** Returns a copy of this configuration whose `downloadDependencies` flag is set to `value`. The copy is passed
  * through `withInheritedFields(this)` before it is returned.
  */
override def withDownloadDependencies(value: Boolean): Config = {
  val updated = copy(downloadDependencies = value)
  updated.withInheritedFields(this)
}
Expand Down Expand Up @@ -54,6 +59,9 @@ private object Frontend {
opt[Unit]("useDeprecatedFrontend")
.action((_, c) => c.withUseDeprecatedFrontend(true))
.text("uses the original (but deprecated) Ruby frontend (default false)"),
opt[Unit]("downloadBuiltinPackages")
.action((_, c) => c.withDownloadBuiltinPackages(true))
.text("download builtin package information from RubyDocs"),
DependencyDownloadConfig.parserOptions,
XTypeRecovery.parserOptions
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import io.joern.rubysrc2cpg.deprecated.parser.DeprecatedRubyParser
import io.joern.rubysrc2cpg.deprecated.parser.DeprecatedRubyParser.*
import io.joern.rubysrc2cpg.parser.RubyParser
import io.joern.rubysrc2cpg.passes.{AstCreationPass, ConfigFileCreationPass, DependencyPass, ImportsPass}
import io.joern.rubysrc2cpg.utils.DependencyDownloader
import io.joern.rubysrc2cpg.utils.{BuiltinPackageDownloader, DependencyDownloader}
import io.joern.x2cpg.X2Cpg.withNewEmptyCpg
import io.joern.x2cpg.passes.base.AstLinkerPass
import io.joern.x2cpg.passes.callgraph.NaiveCallLinker
Expand Down Expand Up @@ -65,6 +65,10 @@ class RubySrc2Cpg extends X2CpgFrontend[Config] {
internalProgramSummary
}

if (config.downloadBuiltinPackages) {
BuiltinPackageDownloader().run()
}

val astCreationPass = new AstCreationPass(cpg, astCreators.map(_.withSummary(programSummary)))
astCreationPass.createAndApply()
val importsPass = new ImportsPass(cpg)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ import io.joern.x2cpg.Defines as XDefines
import io.joern.x2cpg.datastructures.{FieldLike, MethodLike, ProgramSummary, TypeLike}

import scala.annotation.targetName
import upickle.default.ReadWriter

type NamespaceToTypeMap = Map[String, Set[RubyType]]

class RubyProgramSummary(
initialNamespaceMap: Map[String, Set[RubyType]] = Map.empty,
initialNamespaceMap: NamespaceToTypeMap = Map.empty,
initialPathMap: Map[String, Set[RubyType]] = Map.empty
) extends ProgramSummary[RubyType] {

Expand All @@ -20,7 +23,6 @@ class RubyProgramSummary(
ProgramSummary.combine(this.pathToType, other.pathToType)
)
}

}

case class RubyMethod(
Expand All @@ -29,11 +31,12 @@ case class RubyMethod(
returnType: String,
baseTypeFullName: Option[String]
) extends MethodLike
derives ReadWriter

case class RubyField(name: String, typeName: String) extends FieldLike
case class RubyField(name: String, typeName: String) extends FieldLike derives ReadWriter

case class RubyType(name: String, methods: List[RubyMethod], fields: List[RubyField])
extends TypeLike[RubyMethod, RubyField] {
extends TypeLike[RubyMethod, RubyField] derives ReadWriter {

@targetName("add")
override def +(o: TypeLike[RubyMethod, RubyField]): TypeLike[RubyMethod, RubyField] = {
Expand All @@ -43,4 +46,5 @@ case class RubyType(name: String, methods: List[RubyMethod], fields: List[RubyFi
/** True when at least one of this type's methods is named `XDefines.ConstructorMethodName`. */
def hasConstructor: Boolean =
  methods.exists(method => method.name == XDefines.ConstructorMethodName)

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
package io.joern.rubysrc2cpg.utils

import io.joern.rubysrc2cpg.datastructures.{RubyMethod, RubyType}
import io.joern.x2cpg.Defines
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.*
import net.ruippeixotog.scalascraper.dsl.DSL.Extract.*
import net.ruippeixotog.scalascraper.model.Element
import better.files.File
import io.joern.x2cpg.utils.ConcurrentTaskUtil
import org.slf4j.{Logger, LoggerFactory}

import scala.util.{Failure, Success}

/** Scrapes https://ruby-doc.org and generates the Ruby namespace map for the builtin Ruby packages.
  *
  * The scraped types are serialised once as MessagePack (`<baseDir>.zip`) and once as JSON (`<baseDir>_json.zip`).
  *
  * @param rubyVersion
  *   \- Ruby version to fetch dependencies for
  */
class BuiltinPackageDownloader(rubyVersion: String = "3.3.0") {
  private val logger: Logger = LoggerFactory.getLogger(this.getClass)

  private val CLASS    = "class"
  private val INSTANCE = "instance"

  private val browser = JsoupBrowser()
  private val baseUrl = s"https://ruby-doc.org/$rubyVersion"

  private val baseDir = "src/main/resources/builtin_types"

  // Regex source that matches the '→' character (this string is handed to `String.split`, which expects a regex).
  // The unicode value was calculated with: println("\\u" + Integer.toHexString('→' | 0x10000).substring(1))
  // taken from: https://stackoverflow.com/questions/2220366/get-unicode-value-of-a-character
  private val arrowUnicodeValue = "\\u2192"

  /** Scrapes every gem listed on the RubyDocs index page and writes the resulting types to disk. Gems whose scraping
    * task fails are logged and skipped.
    */
  def run(): Unit = {
    val paths = generatePaths()

    val typesMap = collection.mutable.Map[String, List[RubyType]]()

    ConcurrentTaskUtil
      .runUsingThreadPool(generateRubyTypes(paths))
      .foreach {
        case Success((gemName, rubyTypes)) =>
          typesMap.addOne(gemName -> rubyTypes)
        case Failure(ex) =>
          logger.warn(s"Failed to scrape/write Ruby builtin types: $ex")
      }

    writeToFileJson(typesMap)
    writeToFile(typesMap)
  }

  /** Generates one task per gem; running a task yields a `RubyType` for each class/module page of that gem.
    * @param pathsMap
    *   \- Map[gemName -> list of page URLs]
    * @return
    *   \- lazily created tasks, each returning `(gemName, types of that gem)`
    */
  private def generateRubyTypes(
    pathsMap: collection.mutable.Map[String, List[String]]
  ): Iterator[() => (String, List[RubyType])] = {
    pathsMap.iterator.map { (gemName, paths) => () =>
      (gemName, paths.flatMap(path => scrapeRubyType(gemName, path)))
    }
  }

  /** Scrapes a single class/module page. A page that cannot be fetched or parsed is logged and skipped, so that it
    * does not discard the remaining types of the same gem.
    */
  private def scrapeRubyType(gemName: String, path: String): Option[RubyType] = {
    val attempt = scala.util.Try {
      val doc = browser.get(path)

      // Text on website is: Class/Module <some>::<module/class>::<name>
      val namespace = (doc >?> element("h1.class, h1.module"))
        .flatMap(heading => heading.text.split("\\s").lift(1))
        .map(classOrModuleName => s"$gemName.${classOrModuleName.replace("::", ".").strip}")
        .getOrElse(gemName)

      RubyType(namespace, buildRubyMethods(doc, namespace), List.empty)
    }

    attempt match {
      case Success(rubyType) => Some(rubyType)
      case Failure(ex) =>
        logger.warn(s"Failed to scrape Ruby builtin type from '$path': $ex")
        None
    }
  }

  /** Groups the given types by their enclosing namespace, i.e. the type name without its last `.`-separated segment.
    * Types without an enclosing namespace are grouped under the empty string.
    */
  private def groupByNamespace(rubyTypes: List[RubyType]): Map[String, List[RubyType]] = {
    rubyTypes.groupBy(_.name.split("\\.").dropRight(1).mkString("."))
  }

  /** Writes one MessagePack file (`<gem>.mpk`) per gem into `baseDir`, zips the directory to `<baseDir>.zip` and
    * deletes the directory afterwards.
    */
  private def writeToFile(rubyTypesMap: collection.mutable.Map[String, List[RubyType]]): Unit = {
    val dir = File(s"${baseDir}/")
    // `baseDir` is a nested relative path, so missing parent directories have to be created too
    dir.createDirectoryIfNotExists(createParents = true)

    rubyTypesMap.foreach { (gem, rubyTypes) =>
      // gem is file name
      val typesFile = File(s"${dir.pathAsString}/$gem.mpk")
      typesFile.createIfNotExists()

      val msg: upack.Msg = upickle.default.writeMsg(groupByNamespace(rubyTypes))
      typesFile.writeByteArray(upack.writeToByteArray(msg))
    }

    dir.zipTo(destination = File(s"${baseDir}.zip"))
    dir.delete()
  }

  // TODO: Remove before merging to master at a later stage
  /** Writes one JSON file (`<gem>.json`) per gem into `<baseDir>_json`, zips the directory to `<baseDir>_json.zip` and
    * deletes the directory afterwards.
    */
  private def writeToFileJson(rubyTypesMap: collection.mutable.Map[String, List[RubyType]]): Unit = {
    val dir = File(s"${baseDir}_json/")
    dir.createDirectoryIfNotExists(createParents = true)

    rubyTypesMap.foreach { (gem, rubyTypes) =>
      // gem is file name
      val typesFile = File(s"${dir.pathAsString}/$gem.json")
      typesFile.createIfNotExists()

      typesFile.write(upickle.default.write(groupByNamespace(rubyTypes), indent = 2))
    }

    dir.zipTo(destination = File(s"${baseDir}_json.zip"))
    dir.delete()
  }

  /** Scrapes the given RubyDoc page and generates a `RubyMethod` for each public class and instance method found
    * @param doc
    *   \- page to scrape
    * @param namespace
    *   \- full name of the class/module the page describes; used as prefix of every method name
    * @return
    *   \- List of RubyMethod's for the given class/module
    */
  private def buildRubyMethods(doc: browser.DocumentType, namespace: String): List[RubyMethod] = {
    def generateMethodHeadingsSelector(methodType: String): String = {
      s"#public-$methodType-5Buntitled-5D-method-details > .method-detail > .method-heading"
    }

    val methodHeadings =
      doc >> elementList(s"${generateMethodHeadingsSelector(CLASS)}, ${generateMethodHeadingsSelector(INSTANCE)}")

    // Headings without a call-seq/name element are skipped instead of failing the whole page
    val methodElements = methodHeadings.flatMap(heading => heading >?> element(".method-callseq, .method-name"))

    val funcNameRegex = "^([^{(]+)".r

    methodElements
      .flatMap { methodElement =>
        for {
          // Drop the `→ <return value>` part of the call sequence
          signature <- methodElement.text.split(arrowUnicodeValue).headOption
          rawName   <- funcNameRegex.findFirstIn(signature)
          // Some methods are `methodName == something`, which is why the split on space here is required
          methodName <- rawName.replaceAll("[!?=]", "").split("\\s+").headOption
        } yield methodName.strip
      }
      .filter(_.nonEmpty)
      .distinct
      .map(methodName => RubyMethod(s"$namespace.$methodName", List.empty, Defines.Any, Option(namespace)))
  }

  /** Generates links for all classes on the RubyDocs page
    * @return
    *   Map[gemName -> list of paths]
    */
  private def generatePaths(): collection.mutable.Map[String, List[String]] = {
    def toAbsoluteUrl(href: String): String = s"$baseUrl/${href.replaceAll("\\./", "")}"

    val doc = browser.get(baseUrl)

    val liElements = doc >> elementList("#classindex-section > .link-list > li")

    // Every item in front of the first `gemheader` item is listed under `__builtin`
    val (baseItems, itemsFromFirstGemHeader) =
      liElements.span(li => !li.hasAttr("class") || li.attr("class") != "gemheader")
    val restOfItems = itemsFromFirstGemHeader.drop(1)

    // First anchor of every remaining item, grouped by the third `/`-separated segment of its href.
    // Anchors without an href, or with an href that has fewer than three segments, are skipped.
    val links = (restOfItems >> elementList("a"))
      .flatMap(_.headOption)
      .filter(_.hasAttr("href"))
      .flatMap(anchor => anchor.attr("href").split("/").lift(2).map(_ -> anchor))
      .groupMap(_._1)(_._2)

    val baseLinks = baseItems
      .flatMap(li => li >?> element("a"))
      .filter(_.hasAttr("href"))
      .map(anchor => toAbsoluteUrl(anchor.attr("href")))

    val linksMap = collection.mutable.Map[String, List[String]]("__builtin" -> baseLinks)

    links.foreach { (extensionName, anchorElements) =>
      val anchorHrefs = anchorElements
        .map(anchorElement => toAbsoluteUrl(anchorElement.attr("href")))
        .filterNot(_.contains("table_of_contents"))

      linksMap.get(extensionName) match {
        case Some(prevHrefs) if prevHrefs.length >= anchorHrefs.length => // keep the larger, existing list
        case _ => linksMap.update(extensionName, anchorHrefs)
      }
    }

    linksMap
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ import org.scalatest.Tag
import java.io.File
import org.scalatest.Inside

trait RubyFrontend(useDeprecatedFrontend: Boolean, withDownloadDependencies: Boolean) extends LanguageFrontend {
trait RubyFrontend(
useDeprecatedFrontend: Boolean,
withDownloadDependencies: Boolean,
withDownloadBuiltinPackages: Boolean
) extends LanguageFrontend {
override val fileSuffix: String = ".rb"

implicit val config: Config =
Expand All @@ -24,6 +28,7 @@ trait RubyFrontend(useDeprecatedFrontend: Boolean, withDownloadDependencies: Boo
.getOrElse(Config().withSchemaValidation(ValidationMode.Enabled))
.withUseDeprecatedFrontend(useDeprecatedFrontend)
.withDownloadDependencies(withDownloadDependencies)
.withDownloadBuiltinPackages(withDownloadBuiltinPackages)

override def execute(sourceCodeFile: File): Cpg = {
new RubySrc2Cpg().createCpg(sourceCodeFile.getAbsolutePath).get
Expand All @@ -34,9 +39,10 @@ trait RubyFrontend(useDeprecatedFrontend: Boolean, withDownloadDependencies: Boo
class DefaultTestCpgWithRuby(
packageTable: Option[PackageTable],
useDeprecatedFrontend: Boolean,
downloadDependencies: Boolean = false
downloadDependencies: Boolean = false,
downloadBuiltinPackages: Boolean = false
) extends DefaultTestCpg
with RubyFrontend(useDeprecatedFrontend, downloadDependencies)
with RubyFrontend(useDeprecatedFrontend, downloadDependencies, downloadBuiltinPackages)
with SemanticTestCpg {

override protected def applyPasses(): Unit = {
Expand All @@ -61,9 +67,10 @@ class RubyCode2CpgFixture(
downloadDependencies: Boolean = false,
extraFlows: List[FlowSemantic] = List.empty,
packageTable: Option[PackageTable] = None,
useDeprecatedFrontend: Boolean = false
useDeprecatedFrontend: Boolean = false,
downloadBuiltinPackages: Boolean = false
) extends Code2CpgFixture(() =>
new DefaultTestCpgWithRuby(packageTable, useDeprecatedFrontend, downloadDependencies)
new DefaultTestCpgWithRuby(packageTable, useDeprecatedFrontend, downloadDependencies, downloadBuiltinPackages)
.withOssDataflow(withDataFlow)
.withExtraFlows(extraFlows)
.withPostProcessingPasses(withPostProcessing)
Expand All @@ -79,9 +86,12 @@ class RubyCode2CpgFixture(
}
}

class RubyCfgTestCpg(useDeprecatedFrontend: Boolean = true, downloadDependencies: Boolean = false)
extends CfgTestCpg
with RubyFrontend(useDeprecatedFrontend, downloadDependencies) {
class RubyCfgTestCpg(
useDeprecatedFrontend: Boolean = true,
downloadDependencies: Boolean = false,
downloadBuiltinPackages: Boolean = false
) extends CfgTestCpg
with RubyFrontend(useDeprecatedFrontend, downloadDependencies, downloadBuiltinPackages) {
override val fileSuffix: String = ".rb"

}
Expand Down