Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow customizing semgrep configurations; correct rule matching glob #21126

Merged
merged 13 commits into from
Jul 23, 2024
3 changes: 3 additions & 0 deletions docs/notes/2.23.x.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ Fixed pulling `helm_artifact`s from OCI repositories.

Added `workspace_invalidation_sources` field to `adhoc_tool` and `shell_command` target types. This new field allows declaring that these targets depend on files without bringing those files into the execution sandbox, but that the target should still be re-executed if those files change. This is intended to work with the `workspace_environment` support where processes are executed in the workspace and not in a separate sandbox.

#### Semgrep
Semgrep now allows configuring the config directory and the ignore config file, and will recursively discover all rules in the config directory.
purajit marked this conversation as resolved.
Show resolved Hide resolved

### Plugin API changes

Fixed bug with workspace environment support where Pants used a workspace environment when it was searching for a local environment.
Expand Down
57 changes: 22 additions & 35 deletions src/python/pants/backend/tools/semgrep/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,29 +44,12 @@ class SemgrepLintRequest(LintTargetsRequest):
@dataclass(frozen=True)
class PartitionMetadata:
config_files: frozenset[PurePath]
ignore_files: Snapshot

@property
def description(self) -> str:
return ", ".join(sorted(str(path) for path in self.config_files))


_IGNORE_FILE_NAME = ".semgrepignore"

_RULES_DIR_NAME = ".semgrep"
_RULES_FILES_GLOBS = (
".semgrep.yml",
".semgrep.yaml",
f"{_RULES_DIR_NAME}/*.yml",
f"{_RULES_DIR_NAME}/*.yaml",
)


@dataclass
class SemgrepIgnoreFiles:
snapshot: Snapshot


@dataclass
class AllSemgrepConfigs:
configs_by_dir: dict[PurePath, set[PurePath]]
Expand All @@ -85,24 +68,33 @@ def ancestor_configs(self, address: Address) -> Iterable[PurePath]:
yield from self.configs_by_dir.get(ancestor, [])


def _group_by_semgrep_dir(all_paths: Paths) -> AllSemgrepConfigs:
def _group_by_semgrep_dir(config_dir: str, all_paths: Paths) -> AllSemgrepConfigs:
configs_by_dir = defaultdict(set)
for path_ in all_paths.files:
path = PurePath(path_)
# A rule like foo/bar/.semgrep/baz.yaml should behave like it's in in foo/bar, not
# foo/bar/.semgrep
parent = path.parent
config_directory = parent.parent if parent.name == _RULES_DIR_NAME else parent

# Rules like foo/bar/.semgrep/baz.yaml and foo/bar/.semgrep/baz/qux.yaml should apply to the
# project at foo/bar
config_directory = (
PurePath(*path.parts[:path.parts.index(config_dir)])
if config_dir in path.parts
else path.parent
)
configs_by_dir[config_directory].add(path)

return AllSemgrepConfigs(configs_by_dir)


@rule
async def find_all_semgrep_configs() -> AllSemgrepConfigs:
all_paths = await Get(Paths, PathGlobs([f"**/{file_glob}" for file_glob in _RULES_FILES_GLOBS]))
return _group_by_semgrep_dir(all_paths)
async def find_all_semgrep_configs(semgrep: SemgrepSubsystem) -> AllSemgrepConfigs:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this new approach a lot more! Thanks.

rules_files_globs = (
f"{semgrep.config_dir}/**/*.yml",
f"{semgrep.config_dir}/**/*.yaml",
".semgrep.yml",
".semgrep.yaml",
)

all_paths = await Get(Paths, PathGlobs([f"**/{file_glob}" for file_glob in rules_files_globs]))
return _group_by_semgrep_dir(semgrep.config_dir, all_paths)


@dataclass(frozen=True)
Expand All @@ -121,17 +113,10 @@ async def infer_relevant_semgrep_configs(
return RelevantSemgrepConfigs(all_semgrep.ancestor_configs(request.field_set.address))


@rule
async def all_semgrep_ignore_files() -> SemgrepIgnoreFiles:
snapshot = await Get(Snapshot, PathGlobs([f"**/{_IGNORE_FILE_NAME}"]))
return SemgrepIgnoreFiles(snapshot)


@rule
async def partition(
request: SemgrepLintRequest.PartitionRequest[SemgrepFieldSet],
semgrep: SemgrepSubsystem,
ignore_files: SemgrepIgnoreFiles,
) -> Partitions:
if semgrep.skip:
return Partitions()
Expand All @@ -148,7 +133,7 @@ async def partition(
by_config[configs].append(field_set)

return Partitions(
Partition(tuple(field_sets), PartitionMetadata(configs, ignore_files.snapshot))
Partition(tuple(field_sets), PartitionMetadata(configs))
for configs, field_sets in by_config.items()
)

Expand All @@ -175,14 +160,16 @@ async def lint(
Get(Digest, CreateDigest([_DEFAULT_SETTINGS])),
)

ignore_files = await Get(Snapshot, PathGlobs([semgrep.ignore_config_path]))

input_digest = await Get(
Digest,
MergeDigests(
(
input_files.snapshot.digest,
config_files.digest,
settings,
request.partition_metadata.ignore_files.digest,
ignore_files.digest,
)
),
)
Expand Down
11 changes: 10 additions & 1 deletion src/python/pants/backend/tools/semgrep/rules_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs:
configs({"foo/bar": {"foo/bar/.semgrep.yml", "foo/bar/.semgrep/baz.yml"}}),
id="both_file_and_dir",
),
pytest.param(
(
"foo/bar/.semgrep/.semgrep.yml",
"foo/bar/.semgrep/baz1.yml",
"foo/bar/.semgrep/quz/baz2.yml",
),
configs({"foo/bar": {"foo/bar/.semgrep/.semgrep.yml", "foo/bar/.semgrep/baz1.yml", "foo/bar/.semgrep/quz/baz2.yml"}}),
id="recursively_find_yamls",
),
pytest.param(
(
"foo/.semgrep/baz.yml",
Expand All @@ -66,7 +75,7 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs:
)
def test_group_by_group_by_semgrep_dir(paths: tuple[str, ...], expected: AllSemgrepConfigs):
input = Paths(files=paths, dirs=())
result = rules._group_by_semgrep_dir(input)
result = rules._group_by_semgrep_dir(".semgrep", input)
assert result == expected


Expand Down
17 changes: 16 additions & 1 deletion src/python/pants/backend/tools/semgrep/subsystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from pants.engine.rules import Rule, collect_rules
from pants.engine.target import Dependencies, FieldSet, SingleSourceField, Target
from pants.engine.unions import UnionRule
from pants.option.option_types import ArgsListOption, BoolOption, SkipOption
from pants.option.option_types import ArgsListOption, BoolOption, SkipOption, StrOption
from pants.util.strutil import softwrap


Expand Down Expand Up @@ -51,6 +51,21 @@ class SemgrepSubsystem(PythonToolBase):
register_lockfile = True
default_lockfile_resource = ("pants.backend.tools.semgrep", "semgrep.lock")

config_dir = StrOption(
default=".semgrep",
help=softwrap(
"""
The directory name with semgrep rules, which is searched recursively for YAML files, and
can be present at any level, with rules applying to all levels below it.
"""
),
)

ignore_config_path = StrOption(
default=".semgrepignore",
Copy link
Contributor

@huonw huonw Jul 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah... sorry about forcing you to re-investigate everything about these ignore files. This was known in the original addition of the backend, and cut out of scope: #18593 (comment)

I even referenced huonw#1 "so I don't forget about it", but look how far that's gone. 🙈

In any case, I think these two things are true, and still true as of this PR:

  • pants runs semgrep in the sandbox root (i.e. working directory is the root of the repository)
  • semgrep only looks for .semgrepignore in its working directory

Given this, running semgrep via pants will only ever successfully read //.semgrepignore. Thus, someone who sets ignore_config_path = "some/sub/directory/.semgrepignore" will just have semgrep not using the ignore. It'll be included in the sandbox, but not used!

(The thinking behind the globbing behaviour was "future proofing": if a new version of semgrep started reading the nested .semgrepignores, it'll automatically work with Pants, because Pants is including them in the sandboxes.)


Two questions:

  1. Are you changing the old behaviour (of including all files called .semgrepignore) because it was causing problems? Can you be specific about what the problems are?
  2. Do you have an example of how someone would customise this path (and why)?

Copy link
Contributor Author

@purajit purajit Jul 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah gotcha, I'm on my phone right now but I'll look at those PRs later.

No, it wasn't causing problems. Basically when I was adding the description for the semgrepignore customization, I initially had a note that it was hierarchical. That got me curious, so I tried to use it, and realized it did not work as expected. Probably should've done a git blame to see why it was done, but I decided to mention explicitly that it had to be at the root-based path (since this is different from the rules dir behavior), and remove the relevant code.

Shall I add it back with a note about context? Though I do think as it stands pants taking this on will be a huge burden in taking over tool responsibility stuff. The simplest way I can think to get it done would be to inject :includes into the root semgrepignore during the pants run. Is there any precedent to alter file digests like that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And yeah, just realized semgrepignore is not actually customizable, so should just remove this and go back to the hard-coded value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😂 hilarious that we ended up with nearly the same integration tests for showing this behavior

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed this configurability

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though I do think as it stands pants taking this on will be a huge burden in taking over tool responsibility stuff. The simplest way I can think to get it done would be to inject :includes into the root semgrepignore during the pants run. Is there any precedent to alter file digests like that?

Yes, I agree that Pants should try to avoid having too much "smarts" tailored to each tool.

My thinking with the continuing work related to .semgrepignore (i.e. huonw#1) was just to make it more obvious when something is ignored, to reduce confusion. Not "invent" a new file.

help="The path to the semgrepignore file",
)

args = ArgsListOption(
example="--verbose",
default=["--quiet"],
Expand Down
Loading