Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow customizing semgrep configurations; correct rule matching glob #21126

Merged
merged 13 commits into from
Jul 23, 2024
3 changes: 3 additions & 0 deletions docs/notes/2.23.x.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ Fixed pulling `helm_artifact`s from OCI repositories.

Added `workspace_invalidation_sources` field to `adhoc_tool` and `shell_command` target types. This new field allows declaring that these targets depend on files without bringing those files into the execution sandbox, but that the target should still be re-executed if those files change. This is intended to work with the `workspace_environment` support where processes are executed in the workspace and not in a separate sandbox.

#### Semgrep
Semgrep now allows configuring config file discovery via [the new `config_name` option](https://www.pantsbuild.org/2.23/reference/subsystems/semgrep#config_name). In addition, it will now recursively discover all rules within a config directory, not just the immediate children.

#### Workunit logger

The `pants.backend.experimental.tools.workunit_logger` backend will now create directory specified by [the `logdir` option](https://www.pantsbuild.org/2.23/reference/subsystems/workunit-logger#logdir) if it doesn't already exist.
Expand Down
92 changes: 50 additions & 42 deletions src/python/pants/backend/tools/semgrep/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from pants.core.util_rules.partitions import Partition, Partitions
from pants.core.util_rules.source_files import SourceFilesRequest, determine_source_files
from pants.engine.addresses import Address
from pants.engine.fs import CreateDigest, FileContent, MergeDigests, PathGlobs, Paths, Snapshot
from pants.engine.fs import CreateDigest, FileContent, MergeDigests, PathGlobs, Paths
from pants.engine.intrinsics import (
create_digest_to_digest,
digest_to_snapshot,
Expand All @@ -35,6 +35,10 @@
logger = logging.getLogger(__name__)


_SEMGREPIGNORE_FILE_NAME = ".semgrepignore"
_DEFAULT_SEMGREP_CONFIG_DIR = ".semgrep"


class SemgrepLintRequest(LintTargetsRequest):
field_set_type = SemgrepFieldSet
tool_subsystem = SemgrepSubsystem
Expand All @@ -43,29 +47,12 @@ class SemgrepLintRequest(LintTargetsRequest):
@dataclass(frozen=True)
class PartitionMetadata:
config_files: frozenset[PurePath]
ignore_files: Snapshot

@property
def description(self) -> str:
return ", ".join(sorted(str(path) for path in self.config_files))


_IGNORE_FILE_NAME = ".semgrepignore"

_RULES_DIR_NAME = ".semgrep"
_RULES_FILES_GLOBS = (
".semgrep.yml",
".semgrep.yaml",
f"{_RULES_DIR_NAME}/*.yml",
f"{_RULES_DIR_NAME}/*.yaml",
)


@dataclass
class SemgrepIgnoreFiles:
snapshot: Snapshot


@dataclass
class AllSemgrepConfigs:
configs_by_dir: dict[PurePath, set[PurePath]]
Expand All @@ -84,26 +71,53 @@ def ancestor_configs(self, address: Address) -> Iterable[PurePath]:
yield from self.configs_by_dir.get(ancestor, [])


def _group_by_semgrep_dir(all_paths: Paths) -> AllSemgrepConfigs:
configs_by_dir = defaultdict(set)
for path_ in all_paths.files:
path = PurePath(path_)
# A rule like foo/bar/.semgrep/baz.yaml should behave like it's in in foo/bar, not
# foo/bar/.semgrep
parent = path.parent
config_directory = parent.parent if parent.name == _RULES_DIR_NAME else parent

configs_by_dir[config_directory].add(path)
def _group_by_semgrep_dir(
all_config_files: Paths, all_config_dir_files: Paths, config_name: str
) -> AllSemgrepConfigs:
configs_by_dir: dict[PurePath, set[PurePath]] = {}
for config_path in all_config_files.files:
# Rules like foo/semgrep.yaml should apply to the project at foo/
path = PurePath(config_path)
configs_by_dir.setdefault(path.parent, set()).add(path)

for config_path in all_config_dir_files.files:
# Rules like foo/bar/.semgrep/baz.yaml and foo/bar/.semgrep/baz/qux.yaml should apply to the
# project at foo/bar/
path = PurePath(config_path)
config_directory = next(
parent.parent for parent in path.parents if parent.name == config_name
)
configs_by_dir.setdefault(config_directory, set()).add(path)

return AllSemgrepConfigs(configs_by_dir)


@rule
async def find_all_semgrep_configs() -> AllSemgrepConfigs:
all_paths = await path_globs_to_paths(
PathGlobs([f"**/{file_glob}" for file_glob in _RULES_FILES_GLOBS])
async def find_all_semgrep_configs(semgrep: SemgrepSubsystem) -> AllSemgrepConfigs:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this new approach a lot more! Thanks.

config_file_globs: tuple[str, ...] = ()
config_dir_globs: tuple[str, ...] = ()

if semgrep.config_name is None:
config_file_globs = ("**/.semgrep.yml", "**/.semgrep.yaml")
config_dir_globs = (
f"**/{_DEFAULT_SEMGREP_CONFIG_DIR}/**/*.yaml",
f"**/{_DEFAULT_SEMGREP_CONFIG_DIR}/**/*.yml",
)
elif semgrep.config_name.endswith((".yaml", ".yml")):
config_file_globs = (f"**/{semgrep.config_name}",)
else:
config_dir_globs = (
f"**/{semgrep.config_name}/**/*.yaml",
f"**/{semgrep.config_name}/**/*.yml",
)

all_config_files = await path_globs_to_paths(PathGlobs(config_file_globs))
all_config_dir_files = await path_globs_to_paths(PathGlobs(config_dir_globs))
return _group_by_semgrep_dir(
all_config_files,
all_config_dir_files,
(semgrep.config_name or _DEFAULT_SEMGREP_CONFIG_DIR),
)
return _group_by_semgrep_dir(all_paths)


@dataclass(frozen=True)
Expand All @@ -122,17 +136,10 @@ async def infer_relevant_semgrep_configs(
return RelevantSemgrepConfigs(all_semgrep.ancestor_configs(request.field_set.address))


@rule
async def all_semgrep_ignore_files() -> SemgrepIgnoreFiles:
snapshot = await digest_to_snapshot(**implicitly(PathGlobs([f"**/{_IGNORE_FILE_NAME}"])))
return SemgrepIgnoreFiles(snapshot)


@rule
async def partition(
request: SemgrepLintRequest.PartitionRequest[SemgrepFieldSet],
semgrep: SemgrepSubsystem,
ignore_files: SemgrepIgnoreFiles,
) -> Partitions:
if semgrep.skip:
return Partitions()
Expand All @@ -149,7 +156,7 @@ async def partition(
by_config[configs].append(field_set)

return Partitions(
Partition(tuple(field_sets), PartitionMetadata(configs, ignore_files.snapshot))
Partition(tuple(field_sets), PartitionMetadata(configs))
for configs, field_sets in by_config.items()
)

Expand All @@ -169,10 +176,11 @@ async def lint(
semgrep: SemgrepSubsystem,
global_options: GlobalOptions,
) -> LintResult:
config_files, semgrep_pex, input_files, settings = await concurrently(
config_files, ignore_files, semgrep_pex, input_files, settings = await concurrently(
digest_to_snapshot(
**implicitly(PathGlobs(str(s) for s in request.partition_metadata.config_files))
),
digest_to_snapshot(**implicitly(PathGlobs([_SEMGREPIGNORE_FILE_NAME]))),
create_venv_pex(**implicitly(semgrep.to_pex_request())),
determine_source_files(
SourceFilesRequest(field_set.source for field_set in request.elements)
Expand All @@ -186,7 +194,7 @@ async def lint(
input_files.snapshot.digest,
config_files.digest,
settings,
request.partition_metadata.ignore_files.digest,
ignore_files.digest,
)
)
)
Expand Down
61 changes: 53 additions & 8 deletions src/python/pants/backend/tools/semgrep/rules_integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ def run_semgrep(

return tuple(
rule_runner.request(
LintResult, [SemgrepLintRequest.Batch("", partition.elements, partition.metadata)]
LintResult,
[SemgrepLintRequest.Batch("", partition.elements, partition.metadata)],
)
for partition in partitions
)
Expand Down Expand Up @@ -161,11 +162,30 @@ def test_passing(rule_runner: RuleRunner, major_minor_interpreter: str) -> None:
)


def test_failing(rule_runner: RuleRunner) -> None:
rule_runner.write_files(BAD_FILE_LAYOUT)
@pytest.mark.parametrize(
"files,config_name",
[
pytest.param(
BAD_FILE_LAYOUT,
None,
),
pytest.param(
{
f"{DIR}/bad.txt": BAD_FILE,
f"{DIR}/BUILD": """file(name="f", source="bad.txt")""",
".custom_semgrep_file.yml": RULES,
},
".custom_semgrep_file.yml",
id="via custom config file",
),
],
)
def test_failing(rule_runner: RuleRunner, files: dict[str, str], config_name: str | None) -> None:
rule_runner.write_files(files)
tgt = rule_runner.get_target(Address(DIR, target_name="f"))

results = run_semgrep(rule_runner, [tgt])
extra_args = [f"--semgrep-config-name={config_name}"] if config_name else []
results = run_semgrep(rule_runner, [tgt], extra_args=extra_args)
assert len(results) == 1
result = results[0]
assert "find-bad-pattern" in result.stdout
Expand Down Expand Up @@ -204,31 +224,56 @@ def test_multiple_targets(rule_runner: RuleRunner) -> None:


@pytest.mark.parametrize(
"files",
"files,config_name",
[
pytest.param(
{
**BAD_FILE_LAYOUT,
".semgrep.yml": RULES2,
},
None,
id="via nesting",
),
pytest.param(
{
f"{DIR}/bad.txt": BAD_FILE,
f"{DIR}/BUILD": """file(name="f", source="bad.txt")""",
".semgrep/one.yml": RULES,
".semgrep/two.yml": RULES2,
".semgrep/vendored/two.yml": RULES2,
},
None,
id="via .semgrep directory",
),
pytest.param(
{
f"{DIR}/bad.txt": BAD_FILE,
f"{DIR}/BUILD": """file(name="f", source="bad.txt")""",
".semgrep/one.yml": RULES,
".semgrep/vendored/two.yml": RULES2,
},
None,
id="via recursive .semgrep directory",
),
pytest.param(
{
f"{DIR}/bad.txt": BAD_FILE,
f"{DIR}/BUILD": """file(name="f", source="bad.txt")""",
"custom_semgrep_dir/one.yml": RULES,
"custom_semgrep_dir/vendored/two.yml": RULES2,
},
"custom_semgrep_dir",
id="via custom recursive config directory",
),
],
)
def test_multiple_configs(rule_runner: RuleRunner, files: dict[str, str]) -> None:
def test_multiple_configs(
rule_runner: RuleRunner, files: dict[str, str], config_name: str | None
) -> None:
rule_runner.write_files(files)

tgt = rule_runner.get_target(Address(DIR, target_name="f"))
results = run_semgrep(rule_runner, [tgt])
extra_args = [f"--semgrep-config-name={config_name}"] if config_name else []
results = run_semgrep(rule_runner, [tgt], extra_args=extra_args)

assert len(results) == 1
result = results[0]
Expand Down
46 changes: 35 additions & 11 deletions src/python/pants/backend/tools/semgrep/rules_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,33 +20,51 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs:


@pytest.mark.parametrize(
("paths", "expected"),
("config_files", "config_dir_files", "expected"),
[
pytest.param((), configs({}), id="nothing"),
pytest.param((), (), configs({}), id="nothing"),
pytest.param(
("foo/bar/.semgrep.yml",),
(),
configs({"foo/bar": {"foo/bar/.semgrep.yml"}}),
id="semgrep_file",
),
pytest.param(
(),
("foo/bar/.semgrep/baz.yml", "foo/bar/.semgrep/qux.yml"),
configs({"foo/bar": {"foo/bar/.semgrep/baz.yml", "foo/bar/.semgrep/qux.yml"}}),
id="semgrep_dir",
),
pytest.param(
(
"foo/bar/.semgrep.yml",
"foo/bar/.semgrep/baz.yml",
),
("foo/bar/.semgrep.yml",),
("foo/bar/.semgrep/baz.yml",),
configs({"foo/bar": {"foo/bar/.semgrep.yml", "foo/bar/.semgrep/baz.yml"}}),
id="both_file_and_dir",
),
pytest.param(
(),
(
"foo/bar/.semgrep/.semgrep.yml",
"foo/bar/.semgrep/baz1.yml",
"foo/bar/.semgrep/quz/baz2.yml",
),
configs(
{
"foo/bar": {
"foo/bar/.semgrep/.semgrep.yml",
"foo/bar/.semgrep/baz1.yml",
"foo/bar/.semgrep/quz/baz2.yml",
}
}
),
id="recursively_find_yamls",
),
pytest.param(
(
"foo/.semgrep/baz.yml",
"foo/bar/.semgrep.yml",
"foo/bar/qux/.semgrep.yml",
),
("foo/.semgrep/baz.yml",),
configs(
{
"foo": {"foo/.semgrep/baz.yml"},
Expand All @@ -58,15 +76,21 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs:
),
# at the top level should be okay too
pytest.param(
(".semgrep.yml", ".semgrep/foo.yml"),
(".semgrep.yml",),
(".semgrep/foo.yml",),
configs({"": {".semgrep.yml", ".semgrep/foo.yml"}}),
id="top_level",
),
],
)
def test_group_by_group_by_semgrep_dir(paths: tuple[str, ...], expected: AllSemgrepConfigs):
input = Paths(files=paths, dirs=())
result = rules._group_by_semgrep_dir(input)
def test_group_by_group_by_semgrep_dir(
config_files: tuple[str, ...],
config_dir_files: tuple[str, ...],
expected: AllSemgrepConfigs,
):
config_file_paths = Paths(files=config_files, dirs=())
config_dir_file_paths = Paths(files=config_dir_files, dirs=())
result = rules._group_by_semgrep_dir(config_file_paths, config_dir_file_paths, ".semgrep")
assert result == expected


Expand Down
14 changes: 13 additions & 1 deletion src/python/pants/backend/tools/semgrep/subsystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from pants.engine.rules import Rule, collect_rules
from pants.engine.target import Dependencies, FieldSet, SingleSourceField, Target
from pants.engine.unions import UnionRule
from pants.option.option_types import ArgsListOption, BoolOption, SkipOption
from pants.option.option_types import ArgsListOption, BoolOption, SkipOption, StrOption
from pants.util.strutil import softwrap


Expand Down Expand Up @@ -51,6 +51,18 @@ class SemgrepSubsystem(PythonToolBase):
register_lockfile = True
default_lockfile_resource = ("pants.backend.tools.semgrep", "semgrep.lock")

config_name = StrOption(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, one more thing. I think there's no test that currently exercises this option and its new code-path. Could you expand src/python/pants/backend/tools/semgrep/rules_integration_test.py to include one? Let me know if you'd like some assistance choosing where to add it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

working on it

default=None,
help=softwrap(
"""
The name of the semgrep config file or directory, which will be discovered and used
hierarchically. If using a file, it must have the extension `.yaml` or `.yml`.

URLs and registry names are not supported.
"""
),
)

args = ArgsListOption(
example="--verbose",
default=["--quiet"],
Expand Down
Loading