Skip to content

Commit

Permalink
[x2cpg] Completely overhauled SourceFiles (#5180)
Browse files Browse the repository at this point in the history
1) Storage of found files is now in an Array (constant-time append). A Set was never actually needed.
2) Filtering and skipping of files/folders now happens directly during the traversal instead of at the very end.
3) Proper scaladoc
  • Loading branch information
max-leuthaeuser authored Dec 12, 2024
1 parent 8fa3240 commit 498f895
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,14 @@ class ExcludeTests extends AnyWordSpec with Matchers with TableDrivenPropertyChe

"Using case sensitive excludes" should {
"exclude the given files correctly" in {
if (scala.util.Properties.isWin) {
// both are written uppercase and are ignored nevertheless
if (scala.util.Properties.isWin || scala.util.Properties.isMac) {
// both are written uppercase and are ignored nevertheless because
// the file systems are case-insensitive by default
testWithArguments(Seq("Folder", "Index.c"), "", Set("a.c", "foo.bar/d.c"))
}
if (scala.util.Properties.isMac) {
// Folder written uppercase and it is not ignored while Index.c is.
// This might be an issue within Files.isSameFile but we take it for now.
testWithArguments(Seq("Folder", "Index.c"), "", Set("a.c", "folder/b.c", "folder/c.c", "foo.bar/d.c"))
}
if (scala.util.Properties.isLinux) {
// both are written uppercase and are not ignored
// both are written uppercase and are not ignored because
// ext3/ext4 and many other Linux filesystems are case-sensitive by default
testWithArguments(
Seq("Folder", "Index.c"),
"",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package io.joern.x2cpg

import better.files.File.VisitOptions
import better.files.*
import better.files.File.VisitOptions
import org.slf4j.LoggerFactory

import java.io.FileNotFoundException
Expand All @@ -11,35 +11,63 @@ import java.nio.file.Path
import java.nio.file.Paths
import java.nio.file.attribute.BasicFileAttributes
import java.nio.file.Files
import scala.util.matching.Regex

import scala.jdk.CollectionConverters.SetHasAsJava
import scala.util.matching.Regex

object SourceFiles {

private val logger = LoggerFactory.getLogger(getClass)

/** Hack to have a FileVisitor in place that will continue iterating files even if an IOException happened during
* traversal.
/** A failsafe implementation of a [[FileVisitor]] that continues iterating through files even if an [[IOException]]
* occurs during traversal.
*
* This visitor determines during traversal whether a given file should be excluded based on several criteria, such
* as matching default ignore patterns, specific file name patterns, or explicit file paths to ignore. It does not
* descend into folders matching such ignore patterns.
*
* This class is useful in scenarios where file traversal must be resilient to errors, such as accessing files with
* restricted permissions or encountering corrupted file entries.
*
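* @param inputPath
* The root path from which the file traversal starts.
* @param sourceFileExtensions
* The set of file extensions a file must have in order to be collected during traversal.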
* @param ignoredDefaultRegex
* Optional sequence of regular expressions to filter out default ignored file patterns.
* @param ignoredFilesRegex
* Optional regular expression to filter out specific files based on their names.
* @param ignoredFilesPath
* Optional sequence of file paths to exclude from traversal explicitly.
*/
private final class FailsafeFileVisitor extends FileVisitor[Path] {
private final class FailsafeFileVisitor(
inputPath: String,
sourceFileExtensions: Set[String],
ignoredDefaultRegex: Option[Seq[Regex]] = None,
ignoredFilesRegex: Option[Regex] = None,
ignoredFilesPath: Option[Seq[String]] = None
) extends FileVisitor[Path] {

private val seenFiles = scala.collection.mutable.Set.empty[Path]
private val seenFiles = scala.collection.mutable.ArrayBuffer.empty[Path]

def files(): Set[File] = seenFiles.map(File(_)).toSet
def files(): Array[File] = seenFiles.map(File(_)).toArray

override def preVisitDirectory(dir: Path, attrs: BasicFileAttributes): FileVisitResult = {
FileVisitResult.CONTINUE
if (filterFile(dir.toString, inputPath, ignoredDefaultRegex, ignoredFilesRegex, ignoredFilesPath)) {
FileVisitResult.CONTINUE
} else {
FileVisitResult.SKIP_SUBTREE
}
}

override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
seenFiles.addOne(file)
if (
hasSourceFileExtension(file, sourceFileExtensions) &&
filterFile(file.toString, inputPath, ignoredDefaultRegex, ignoredFilesRegex, ignoredFilesPath)
) { seenFiles.addOne(file) }
FileVisitResult.CONTINUE
}

override def visitFileFailed(file: Path, exc: java.io.IOException): FileVisitResult = {
exc match {
case e: java.nio.file.FileSystemLoopException => logger.warn(s"Ignoring '$file' (cyclic symlink)")
case _: java.nio.file.FileSystemLoopException => logger.warn(s"Ignoring '$file' (cyclic symlink)")
case other => logger.warn(s"Ignoring '$file'", other)
}
FileVisitResult.CONTINUE
Expand Down Expand Up @@ -88,13 +116,23 @@ object SourceFiles {
}
}

/** Method to filter file based on the passed parameters
/** Filters a file based on the provided ignore rules.
*
* This method determines whether a given file should be excluded from processing based on several criteria, such as
* matching default ignore patterns, specific file name patterns, or explicit file paths to ignore.
*
* @param file
* The file name or path to evaluate.
* @param inputPath
* The root input path for the file traversal.
* @param ignoredDefaultRegex
* Optional sequence of regular expressions defining default file patterns to ignore.
* @param ignoredFilesRegex
* Optional regular expression defining specific file name patterns to ignore.
* @param ignoredFilesPath
* Optional sequence of file paths to explicitly exclude.
* @return
* `true` if the file is accepted, i.e., does not match any of the ignore criteria, `false` otherwise.
*/
def filterFile(
file: String,
Expand All @@ -106,6 +144,24 @@ object SourceFiles {
&& !ignoredFilesRegex.exists(isIgnoredByRegex(file, inputPath, _))
&& !ignoredFilesPath.exists(isIgnoredByFileList(file, _))

/** Filters a list of files based on the provided ignore rules.
*
* This method applies [[filterFile]] to each file in the input list, returning only those files that do not match
* any of the ignore criteria.
*
* @param files
* The list of file names or paths to evaluate.
* @param inputPath
* The root input path for the file traversal.
* @param ignoredDefaultRegex
* Optional sequence of regular expressions defining default file patterns to ignore.
* @param ignoredFilesRegex
* Optional regular expression defining specific file name patterns to ignore.
* @param ignoredFilesPath
* Optional sequence of file paths to explicitly exclude.
* @return
* A filtered list of files that do not match the ignore criteria.
*/
def filterFiles(
files: List[String],
inputPath: String,
Expand All @@ -114,8 +170,49 @@ object SourceFiles {
ignoredFilesPath: Option[Seq[String]] = None
): List[String] = files.filter(filterFile(_, inputPath, ignoredDefaultRegex, ignoredFilesRegex, ignoredFilesPath))

/** For given input paths, determine all source files by inspecting filename extensions and filter the result if
* following arguments ignoredDefaultRegex, ignoredFilesRegex and ignoredFilesPath are used
// Accepts a file only when it has an extension and that extension is listed in `sourceFileExtensions`.
private def hasSourceFileExtension(file: File, sourceFileExtensions: Set[String]): Boolean =
  file.extension match {
    case Some(fileExtension) => sourceFileExtensions.contains(fileExtension)
    case None                => false
  }

/** Determines a sorted list of file paths in a directory that match the specified criteria.
*
* @param inputPath
* The root directory to search for files.
* @param sourceFileExtensions
* A set of file extensions to include in the search.
* @param ignoredDefaultRegex
* An optional sequence of regular expressions for default files to ignore.
* @param ignoredFilesRegex
* An optional regular expression for additional files to ignore.
* @param ignoredFilesPath
* An optional sequence of specific file paths to ignore.
* @param visitOptions
* Implicit parameter defining the options for visiting the file tree. Defaults to `VisitOptions.follow`, which
* follows symbolic links.
* @return
* A sorted `List[String]` of file paths matching the criteria.
*
* This function traverses the file tree starting at the given `inputPath` and collects file paths that:
* - Have extensions specified in `sourceFileExtensions`.
* - Are not ignored based on `ignoredDefaultRegex`, `ignoredFilesRegex`, or `ignoredFilesPath`.
*
* It uses a custom `FailsafeFileVisitor` to handle the filtering logic and `Files.walkFileTree` to perform the
* traversal.
*
* Example usage:
* {{{
* val files = determine(
* inputPath = "/path/to/dir",
* sourceFileExtensions = Set(".scala", ".java"),
* ignoredDefaultRegex = Some(Seq(".*\\.tmp".r)),
* ignoredFilesRegex = Some(".*_backup\\.scala".r),
* ignoredFilesPath = Some(Seq("/path/to/dir/ignore_me.scala"))
* )
* println(files)
* }}}
* @throws java.io.FileNotFoundException
* if the `inputPath` does not exist or is not readable.
* @see
* [[FailsafeFileVisitor]] for details on the visitor used to process files.
*/
def determine(
inputPath: String,
Expand All @@ -124,62 +221,38 @@ object SourceFiles {
ignoredFilesRegex: Option[Regex] = None,
ignoredFilesPath: Option[Seq[String]] = None
)(implicit visitOptions: VisitOptions = VisitOptions.follow): List[String] = {
filterFiles(
determine(Set(inputPath), sourceFileExtensions),
inputPath,
val dir = File(inputPath)
assertExists(dir)
val visitor = new FailsafeFileVisitor(
dir.pathAsString,
sourceFileExtensions,
ignoredDefaultRegex,
ignoredFilesRegex,
ignoredFilesPath
)
Files.walkFileTree(dir.path, visitOptions.toSet.asJava, Int.MaxValue, visitor)
val matchingFiles = visitor.files().map(_.pathAsString)
matchingFiles.toList.sorted
}

/** For a given array of input paths, determine all source files by inspecting filename extensions.
*/
def determine(inputPaths: Set[String], sourceFileExtensions: Set[String])(implicit
visitOptions: VisitOptions
): List[String] = {
def hasSourceFileExtension(file: File): Boolean =
file.extension.exists(sourceFileExtensions.contains)

val inputFiles = inputPaths.map(File(_))
assertAllExist(inputFiles)

val (dirs, files) = inputFiles.partition(_.isDirectory)

val matchingFiles = files.filter(hasSourceFileExtension).map(_.toString)
val matchingFilesFromDirs = dirs
.flatMap { dir =>
val visitor = new FailsafeFileVisitor
Files.walkFileTree(dir.path, visitOptions.toSet.asJava, Int.MaxValue, visitor)
visitor.files()
}
.filter(hasSourceFileExtension)
.map(_.pathAsString)

(matchingFiles ++ matchingFilesFromDirs).toList.sorted
}

/** Attempting to analyse source paths that do not exist is a hard error. Terminate execution early to avoid
* unexpected and hard-to-debug issues in the results.
/** Asserts that a given file exists and is readable.
*
* This method validates the existence and readability of the specified file. If the file does not exist or is not
* readable, it logs an error and throws a [[FileNotFoundException]].
*
* @param file
* The file to validate.
* @throws FileNotFoundException
* if the file does not exist or is not readable.
*/
private def assertAllExist(files: Set[File]): Unit = {
val (existent, nonExistent) = files.partition(_.exists)
val nonReadable = existent.filterNot(_.isReadable)
if (nonExistent.nonEmpty || nonReadable.nonEmpty) {
logErrorWithPaths("Source input paths do not exist", nonExistent.map(_.canonicalPath))
logErrorWithPaths("Source input paths exist, but are not readable", nonReadable.map(_.canonicalPath))
throw FileNotFoundException("Invalid source paths provided")
private def assertExists(file: File): Unit = {
if (!file.exists) {
logger.error(s"Source input path does not exist: ${file.pathAsString}")
throw FileNotFoundException("Invalid source path provided!")
}
}

private def logErrorWithPaths(message: String, paths: Iterable[String]): Unit = {
val pathsArray = paths.toArray.sorted
pathsArray.lengthCompare(1) match {
case cmp if cmp < 0 => // pathsArray is empty, so don't log anything
case cmp if cmp == 0 => logger.error(s"$message: ${paths.head}")
case _ =>
val errorMessage = (message +: pathsArray.map(path => s"- $path")).mkString("\n")
logger.error(errorMessage)
if (!file.isReadable) {
logger.error(s"Source input path exists, but is not readable: ${file.pathAsString}")
throw FileNotFoundException("Invalid source path provided!")
}
}

Expand Down

0 comments on commit 498f895

Please sign in to comment.