From f092630ec93724c655a3c843f9ed1a92e824c704 Mon Sep 17 00:00:00 2001 From: purajit Date: Wed, 3 Jul 2024 13:17:51 -0700 Subject: [PATCH 01/10] Allow customizing semgrep configurations; correct rule matching glob --- .../pants/backend/tools/semgrep/rules.py | 57 +++++++------------ .../pants/backend/tools/semgrep/rules_test.py | 11 +++- .../pants/backend/tools/semgrep/subsystem.py | 17 +++++- 3 files changed, 48 insertions(+), 37 deletions(-) diff --git a/src/python/pants/backend/tools/semgrep/rules.py b/src/python/pants/backend/tools/semgrep/rules.py index e2e68cbe8bf..3c2259a8e81 100644 --- a/src/python/pants/backend/tools/semgrep/rules.py +++ b/src/python/pants/backend/tools/semgrep/rules.py @@ -44,29 +44,12 @@ class SemgrepLintRequest(LintTargetsRequest): @dataclass(frozen=True) class PartitionMetadata: config_files: frozenset[PurePath] - ignore_files: Snapshot @property def description(self) -> str: return ", ".join(sorted(str(path) for path in self.config_files)) -_IGNORE_FILE_NAME = ".semgrepignore" - -_RULES_DIR_NAME = ".semgrep" -_RULES_FILES_GLOBS = ( - ".semgrep.yml", - ".semgrep.yaml", - f"{_RULES_DIR_NAME}/*.yml", - f"{_RULES_DIR_NAME}/*.yaml", -) - - -@dataclass -class SemgrepIgnoreFiles: - snapshot: Snapshot - - @dataclass class AllSemgrepConfigs: configs_by_dir: dict[PurePath, set[PurePath]] @@ -85,24 +68,33 @@ def ancestor_configs(self, address: Address) -> Iterable[PurePath]: yield from self.configs_by_dir.get(ancestor, []) -def _group_by_semgrep_dir(all_paths: Paths) -> AllSemgrepConfigs: +def _group_by_semgrep_dir(config_dir: str, all_paths: Paths) -> AllSemgrepConfigs: configs_by_dir = defaultdict(set) for path_ in all_paths.files: path = PurePath(path_) - # A rule like foo/bar/.semgrep/baz.yaml should behave like it's in in foo/bar, not - # foo/bar/.semgrep - parent = path.parent - config_directory = parent.parent if parent.name == _RULES_DIR_NAME else parent - + # Rules like foo/bar/.semgrep/baz.yaml and foo/bar/.semgrep/baz/qux.yaml should apply to the + # project at foo/bar + config_directory = ( + PurePath(*path.parts[:path.parts.index(config_dir)]) + if config_dir in path.parts + else path.parent + ) configs_by_dir[config_directory].add(path) return AllSemgrepConfigs(configs_by_dir) @rule -async def find_all_semgrep_configs() -> AllSemgrepConfigs: - all_paths = await Get(Paths, PathGlobs([f"**/{file_glob}" for file_glob in _RULES_FILES_GLOBS])) - return _group_by_semgrep_dir(all_paths) +async def find_all_semgrep_configs(semgrep: SemgrepSubsystem) -> AllSemgrepConfigs: + rules_files_globs = ( + f"{semgrep.config_dir}/**/*.yml", + f"{semgrep.config_dir}/**/*.yaml", + ".semgrep.yml", + ".semgrep.yaml", + ) + + all_paths = await Get(Paths, PathGlobs([f"**/{file_glob}" for file_glob in rules_files_globs])) + return _group_by_semgrep_dir(semgrep.config_dir, all_paths) @dataclass(frozen=True) @@ -121,17 +113,10 @@ async def infer_relevant_semgrep_configs( return RelevantSemgrepConfigs(all_semgrep.ancestor_configs(request.field_set.address)) -@rule -async def all_semgrep_ignore_files() -> SemgrepIgnoreFiles: - snapshot = await Get(Snapshot, PathGlobs([f"**/{_IGNORE_FILE_NAME}"])) - return SemgrepIgnoreFiles(snapshot) - - @rule async def partition( request: SemgrepLintRequest.PartitionRequest[SemgrepFieldSet], semgrep: SemgrepSubsystem, - ignore_files: SemgrepIgnoreFiles, ) -> Partitions: if semgrep.skip: return Partitions() @@ -148,7 +133,7 @@ async def partition( by_config[configs].append(field_set) return Partitions( - Partition(tuple(field_sets), PartitionMetadata(configs, ignore_files.snapshot)) + Partition(tuple(field_sets), PartitionMetadata(configs)) for configs, field_sets in by_config.items() ) @@ -175,6 +160,8 @@ async def lint( Get(Digest, CreateDigest([_DEFAULT_SETTINGS])), ) + ignore_files = await Get(Snapshot, PathGlobs([semgrep.ignore_config_path])) + input_digest = await Get( Digest, MergeDigests( @@ -182,7 +169,7 @@ async def lint( input_files.snapshot.digest, config_files.digest, settings, - request.partition_metadata.ignore_files.digest, + ignore_files.digest, ) ), ) diff --git a/src/python/pants/backend/tools/semgrep/rules_test.py b/src/python/pants/backend/tools/semgrep/rules_test.py index 520560b8c73..e7c732be00b 100644 --- a/src/python/pants/backend/tools/semgrep/rules_test.py +++ b/src/python/pants/backend/tools/semgrep/rules_test.py @@ -41,6 +41,15 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs: configs({"foo/bar": {"foo/bar/.semgrep.yml", "foo/bar/.semgrep/baz.yml"}}), id="both_file_and_dir", ), + pytest.param( + ( + "foo/bar/.semgrep/.semgrep.yml", + "foo/bar/.semgrep/baz1.yml", + "foo/bar/.semgrep/quz/baz2.yml", + ), + configs({"foo/bar": {"foo/bar/.semgrep/.semgrep.yml", "foo/bar/.semgrep/baz1.yml", "foo/bar/.semgrep/quz/baz2.yml"}}), + id="recursively_find_yamls", + ), pytest.param( ( "foo/.semgrep/baz.yml", @@ -66,7 +75,7 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs: ) def test_group_by_group_by_semgrep_dir(paths: tuple[str, ...], expected: AllSemgrepConfigs): input = Paths(files=paths, dirs=()) - result = rules._group_by_semgrep_dir(input) + result = rules._group_by_semgrep_dir(".semgrep", input) assert result == expected diff --git a/src/python/pants/backend/tools/semgrep/subsystem.py b/src/python/pants/backend/tools/semgrep/subsystem.py index af73b460304..f7608b7b5ee 100644 --- a/src/python/pants/backend/tools/semgrep/subsystem.py +++ b/src/python/pants/backend/tools/semgrep/subsystem.py @@ -12,7 +12,7 @@ from pants.engine.rules import Rule, collect_rules from pants.engine.target import Dependencies, FieldSet, SingleSourceField, Target from pants.engine.unions import UnionRule -from pants.option.option_types import ArgsListOption, BoolOption, SkipOption +from pants.option.option_types import ArgsListOption, BoolOption, SkipOption, StrOption from pants.util.strutil import softwrap @@ -51,6 +51,21 @@ class SemgrepSubsystem(PythonToolBase): register_lockfile = True default_lockfile_resource = ("pants.backend.tools.semgrep", "semgrep.lock") + config_dir = StrOption( + default=".semgrep", + help=softwrap( + """ + The directory name with semgrep rules, which is searched recursively for YAML files, and + can be present at any level, with rules applying to all levels below it. + """ + ), + ) + + ignore_config_path = StrOption( + default=".semgrepignore", + help="The path to the semgrepignore file", + ) + args = ArgsListOption( example="--verbose", default=["--quiet"], From c55500a6fcc3bdce99820a7af3edb6da5e0867e2 Mon Sep 17 00:00:00 2001 From: purajit Date: Thu, 4 Jul 2024 00:39:11 -0700 Subject: [PATCH 02/10] release notes --- docs/notes/2.23.x.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/notes/2.23.x.md b/docs/notes/2.23.x.md index c4d24f3c50d..b7267b94ff4 100644 --- a/docs/notes/2.23.x.md +++ b/docs/notes/2.23.x.md @@ -137,6 +137,9 @@ Fixed pulling `helm_artifact`s from OCI repositories. Added `workspace_invalidation_sources` field to `adhoc_tool` and `shell_command` target types. This new field allows declaring that these targets depend on files without bringing those files into the execution sandbox, but that the target should still be re-executed if those files change. This is intended to work with the `workspace_environment` support where processes are executed in the workspace and not in a separate sandbox. +#### Semgrep +Semgrep now allows configuring the config directory, ignore config file, and will recursively discover all rules in the directory. + ### Plugin API changes Fixed bug with workspace environment support where Pants used a workspace environment when it was searching for a local environment. From 24b5d51dd92bb37849b2a6ddfdf9a6b046be7c43 Mon Sep 17 00:00:00 2001 From: purajit Date: Thu, 4 Jul 2024 01:01:07 -0700 Subject: [PATCH 03/10] fmt --- src/python/pants/backend/tools/semgrep/rules.py | 2 +- src/python/pants/backend/tools/semgrep/rules_test.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/python/pants/backend/tools/semgrep/rules.py b/src/python/pants/backend/tools/semgrep/rules.py index 3c2259a8e81..1e0f82b4cb9 100644 --- a/src/python/pants/backend/tools/semgrep/rules.py +++ b/src/python/pants/backend/tools/semgrep/rules.py @@ -75,7 +75,7 @@ def _group_by_semgrep_dir(config_dir: str, all_paths: Paths) -> AllSemgrepConfig # Rules like foo/bar/.semgrep/baz.yaml and foo/bar/.semgrep/baz/qux.yaml should apply to the # project at foo/bar config_directory = ( - PurePath(*path.parts[:path.parts.index(config_dir)]) + PurePath(*path.parts[: path.parts.index(config_dir)]) if config_dir in path.parts else path.parent ) diff --git a/src/python/pants/backend/tools/semgrep/rules_test.py b/src/python/pants/backend/tools/semgrep/rules_test.py index e7c732be00b..f4f4b1dea05 100644 --- a/src/python/pants/backend/tools/semgrep/rules_test.py +++ b/src/python/pants/backend/tools/semgrep/rules_test.py @@ -47,7 +47,15 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs: "foo/bar/.semgrep/baz1.yml", "foo/bar/.semgrep/quz/baz2.yml", ), - configs({"foo/bar": {"foo/bar/.semgrep/.semgrep.yml", "foo/bar/.semgrep/baz1.yml", "foo/bar/.semgrep/quz/baz2.yml"}}), + configs( + { + "foo/bar": { + "foo/bar/.semgrep/.semgrep.yml", + "foo/bar/.semgrep/baz1.yml", + "foo/bar/.semgrep/quz/baz2.yml", + } + } + ), id="recursively_find_yamls", ), pytest.param( From bb4842c7f2829ebb3f07f1bd67a26be8fe59bea6 Mon Sep 17 00:00:00 2001 From: purajit Date: Thu, 4 Jul 2024 01:31:47 -0700 Subject: [PATCH 04/10] change to config_paths list --- .../pants/backend/tools/semgrep/rules.py | 20 ++++++++++--------- .../tools/semgrep/rules_integration_test.py | 9 +++++++++ .../pants/backend/tools/semgrep/rules_test.py | 2 +- .../pants/backend/tools/semgrep/subsystem.py | 15 +++++++++----- 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/python/pants/backend/tools/semgrep/rules.py b/src/python/pants/backend/tools/semgrep/rules.py index 1e0f82b4cb9..d2a95ecdee4 100644 --- a/src/python/pants/backend/tools/semgrep/rules.py +++ b/src/python/pants/backend/tools/semgrep/rules.py @@ -68,17 +68,19 @@ def ancestor_configs(self, address: Address) -> Iterable[PurePath]: yield from self.configs_by_dir.get(ancestor, []) -def _group_by_semgrep_dir(config_dir: str, all_paths: Paths) -> AllSemgrepConfigs: +def _group_by_semgrep_dir(config_paths: tuple[str, ...], all_paths: Paths) -> AllSemgrepConfigs: configs_by_dir = defaultdict(set) for path_ in all_paths.files: path = PurePath(path_) # Rules like foo/bar/.semgrep/baz.yaml and foo/bar/.semgrep/baz/qux.yaml should apply to the # project at foo/bar - config_directory = ( - PurePath(*path.parts[: path.parts.index(config_dir)]) - if config_dir in path.parts - else path.parent - ) + config_directory = path.parent + for config_path in config_paths: + if config_path not in path.parts: + continue + config_directory = PurePath(*path.parts[: path.parts.index(config_path)]) + break + configs_by_dir[config_directory].add(path) return AllSemgrepConfigs(configs_by_dir) @@ -87,14 +89,14 @@ def _group_by_semgrep_dir(config_dir: str, all_paths: Paths) -> AllSemgrepConfig @rule async def find_all_semgrep_configs(semgrep: SemgrepSubsystem) -> AllSemgrepConfigs: rules_files_globs = ( - f"{semgrep.config_dir}/**/*.yml", - f"{semgrep.config_dir}/**/*.yaml", + *(f"{config_path}/**/*.yml" for config_path in semgrep.config_paths), + *(f"{config_path}/**/*.yaml" for config_path in semgrep.config_paths), ".semgrep.yml", ".semgrep.yaml", ) all_paths = await Get(Paths, PathGlobs([f"**/{file_glob}" for file_glob in rules_files_globs])) - return _group_by_semgrep_dir(semgrep.config_dir, all_paths) + return _group_by_semgrep_dir(semgrep.config_paths, all_paths) @dataclass(frozen=True) diff --git a/src/python/pants/backend/tools/semgrep/rules_integration_test.py b/src/python/pants/backend/tools/semgrep/rules_integration_test.py index b54256b4de6..0d2ec9f916c 100644 --- a/src/python/pants/backend/tools/semgrep/rules_integration_test.py +++ b/src/python/pants/backend/tools/semgrep/rules_integration_test.py @@ -222,6 +222,15 @@ def test_multiple_targets(rule_runner: RuleRunner) -> None: }, id="via .semgrep directory", ), + pytest.param( + { + f"{DIR}/bad.txt": BAD_FILE, + f"{DIR}/BUILD": """file(name="f", source="bad.txt")""", + ".semgrep/one.yml": RULES, + ".semgrep/vendored/two.yml": RULES2, + }, + id="via recursive .semgrep directory", + ), ], ) def test_multiple_configs(rule_runner: RuleRunner, files: dict[str, str]) -> None: diff --git a/src/python/pants/backend/tools/semgrep/rules_test.py b/src/python/pants/backend/tools/semgrep/rules_test.py index f4f4b1dea05..6309fd1fa97 100644 --- a/src/python/pants/backend/tools/semgrep/rules_test.py +++ b/src/python/pants/backend/tools/semgrep/rules_test.py @@ -83,7 +83,7 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs: ) def test_group_by_group_by_semgrep_dir(paths: tuple[str, ...], expected: AllSemgrepConfigs): input = Paths(files=paths, dirs=()) - result = rules._group_by_semgrep_dir(".semgrep", input) + result = rules._group_by_semgrep_dir((".semgrep",), input) assert result == expected diff --git a/src/python/pants/backend/tools/semgrep/subsystem.py b/src/python/pants/backend/tools/semgrep/subsystem.py index f7608b7b5ee..013cab96f71 100644 --- a/src/python/pants/backend/tools/semgrep/subsystem.py +++ b/src/python/pants/backend/tools/semgrep/subsystem.py @@ -12,7 +12,13 @@ from pants.engine.rules import Rule, collect_rules from pants.engine.target import Dependencies, FieldSet, SingleSourceField, Target from pants.engine.unions import UnionRule -from pants.option.option_types import ArgsListOption, BoolOption, SkipOption, StrOption +from pants.option.option_types import ( + ArgsListOption, + BoolOption, + SkipOption, + StrListOption, + StrOption, +) from pants.util.strutil import softwrap @@ -51,12 +57,11 @@ class SemgrepSubsystem(PythonToolBase): register_lockfile = True default_lockfile_resource = ("pants.backend.tools.semgrep", "semgrep.lock") - config_dir = StrOption( - default=".semgrep", + config_paths = StrListOption( + default=[".semgrep"], help=softwrap( """ - The directory name with semgrep rules, which is searched recursively for YAML files, and - can be present at any level, with rules applying to all levels below it. + The locations of semgrep configs """ ), ) From 1e1a899ad27d5cd8528ad3772d5c857d1de468c3 Mon Sep 17 00:00:00 2001 From: purajit Date: Thu, 4 Jul 2024 02:17:30 -0700 Subject: [PATCH 05/10] make release note more obvious --- docs/notes/2.23.x.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/notes/2.23.x.md b/docs/notes/2.23.x.md index b7267b94ff4..cd2154ca84d 100644 --- a/docs/notes/2.23.x.md +++ b/docs/notes/2.23.x.md @@ -138,7 +138,7 @@ Fixed pulling `helm_artifact`s from OCI repositories. Added `workspace_invalidation_sources` field to `adhoc_tool` and `shell_command` target types. This new field allows declaring that these targets depend on files without bringing those files into the execution sandbox, but that the target should still be re-executed if those files change. This is intended to work with the `workspace_environment` support where processes are executed in the workspace and not in a separate sandbox. #### Semgrep -Semgrep now allows configuring the config directory, ignore config file, and will recursively discover all rules in the directory. +Semgrep now allows configuring the config directory, the semgrepignore config file, and will recursively discover all rules in the directory. ### Plugin API changes From 71e477a83595cf96564be70fa5e40e3d50d63344 Mon Sep 17 00:00:00 2001 From: purajit Date: Tue, 9 Jul 2024 10:51:41 -0700 Subject: [PATCH 06/10] don't allow customizing semgrepignore; better management of config-to-dir matching --- .../pants/backend/tools/semgrep/rules.py | 87 +++++++++++++------ .../tools/semgrep/rules_integration_test.py | 13 ++- .../pants/backend/tools/semgrep/rules_test.py | 35 +++++--- .../pants/backend/tools/semgrep/subsystem.py | 22 ++--- 4 files changed, 99 insertions(+), 58 deletions(-) diff --git a/src/python/pants/backend/tools/semgrep/rules.py b/src/python/pants/backend/tools/semgrep/rules.py index d2a95ecdee4..4568a16ee9c 100644 --- a/src/python/pants/backend/tools/semgrep/rules.py +++ b/src/python/pants/backend/tools/semgrep/rules.py @@ -36,6 +36,10 @@ logger = logging.getLogger(__name__) +_SEMGREPIGNORE_FILE_NAME = ".semgrepignore" +_DEFAULT_SEMGREP_CONFIG_DIR = ".semgrep" + + class SemgrepLintRequest(LintTargetsRequest): field_set_type = SemgrepFieldSet tool_subsystem = SemgrepSubsystem @@ -68,36 +72,54 @@ def ancestor_configs(self, address: Address) -> Iterable[PurePath]: yield from self.configs_by_dir.get(ancestor, []) -def _group_by_semgrep_dir(config_paths: tuple[str, ...], all_paths: Paths) -> AllSemgrepConfigs: - configs_by_dir = defaultdict(set) - for path_ in all_paths.files: - path = PurePath(path_) - # Rules like foo/bar/.semgrep/baz.yaml and foo/bar/.semgrep/baz/qux.yaml should apply to the - # project at foo/bar - config_directory = path.parent - for config_path in config_paths: - if config_path not in path.parts: - continue - config_directory = PurePath(*path.parts[: path.parts.index(config_path)]) - break +def _group_by_semgrep_dir( + all_config_files: Paths, all_config_dir_files: Paths, config_name: str +) -> AllSemgrepConfigs: + configs_by_dir: dict[PurePath, set[PurePath]] = {} + for config_path in all_config_files.files: + # Rules like foo/semgrep.yaml should apply to the project at foo/ + path = PurePath(config_path) + configs_by_dir.setdefault(path.parent, set()).add(path) - configs_by_dir[config_directory].add(path) + for config_path in all_config_dir_files.files: + # Rules like foo/bar/.semgrep/baz.yaml and foo/bar/.semgrep/baz/qux.yaml should apply to the + # project at foo/bar/ + path = PurePath(config_path) + config_directory = next( + parent.parent for parent in path.parents if parent.name == config_name + ) + configs_by_dir.setdefault(config_directory, set()).add(path) return AllSemgrepConfigs(configs_by_dir) @rule async def find_all_semgrep_configs(semgrep: SemgrepSubsystem) -> AllSemgrepConfigs: - rules_files_globs = ( - *(f"{config_path}/**/*.yml" for config_path in semgrep.config_paths), - *(f"{config_path}/**/*.yaml" for config_path in semgrep.config_paths), - ".semgrep.yml", - ".semgrep.yaml", + config_file_globs: tuple[str, ...] = () + config_dir_globs: tuple[str, ...] = () + + if semgrep.config_name is None: + config_file_globs = ("**/.semgrep.yml", "**/.semgrep.yaml") + config_dir_globs = ( + f"**/{_DEFAULT_SEMGREP_CONFIG_DIR}/**/*.yaml", + f"**/{_DEFAULT_SEMGREP_CONFIG_DIR}/**/*.yml", + ) + elif semgrep.config_name.endswith((".yaml", ".yml")): + config_file_globs = (f"**/{semgrep.config_name}",) + else: + config_dir_globs = ( + f"**/{semgrep.config_name}/**/*.yaml", + f"**/{semgrep.config_name}/**/*.yml", + ) + + all_config_files = await Get(Paths, PathGlobs(config_file_globs)) + all_config_dir_files = await Get(Paths, PathGlobs(config_dir_globs)) + return _group_by_semgrep_dir( + all_config_files, + all_config_dir_files, + (semgrep.config_name or _DEFAULT_SEMGREP_CONFIG_DIR), ) - all_paths = await Get(Paths, PathGlobs([f"**/{file_glob}" for file_glob in rules_files_globs])) - return _group_by_semgrep_dir(semgrep.config_paths, all_paths) - @dataclass(frozen=True) class RelevantSemgrepConfigsRequest: @@ -112,7 +134,9 @@ class RelevantSemgrepConfigs(frozenset[PurePath]): async def infer_relevant_semgrep_configs( request: RelevantSemgrepConfigsRequest, all_semgrep: AllSemgrepConfigs ) -> RelevantSemgrepConfigs: - return RelevantSemgrepConfigs(all_semgrep.ancestor_configs(request.field_set.address)) + return RelevantSemgrepConfigs( + all_semgrep.ancestor_configs(request.field_set.address) + ) @rule @@ -156,13 +180,18 @@ async def lint( global_options: GlobalOptions, ) -> LintResult: config_files, semgrep_pex, input_files, settings = await MultiGet( - Get(Snapshot, PathGlobs(str(s) for s in request.partition_metadata.config_files)), + Get( + Snapshot, PathGlobs(str(s) for s in request.partition_metadata.config_files) + ), Get(VenvPex, PexRequest, semgrep.to_pex_request()), - Get(SourceFiles, SourceFilesRequest(field_set.source for field_set in request.elements)), + Get( + SourceFiles, + SourceFilesRequest(field_set.source for field_set in request.elements), + ), Get(Digest, CreateDigest([_DEFAULT_SETTINGS])), ) - ignore_files = await Get(Snapshot, PathGlobs([semgrep.ignore_config_path])) + ignore_files = await Get(Snapshot, PathGlobs([_SEMGREPIGNORE_FILE_NAME])) input_digest = await Get( Digest, @@ -176,7 +205,9 @@ async def lint( ), ) - cache_scope = ProcessCacheScope.PER_SESSION if semgrep.force else ProcessCacheScope.SUCCESSFUL + cache_scope = ( + ProcessCacheScope.PER_SESSION if semgrep.force else ProcessCacheScope.SUCCESSFUL + ) # TODO: https://github.com/pantsbuild/pants/issues/18430 support running this with --autofix # under the fix goal... but not all rules have fixes, so we need to be running with @@ -215,7 +246,9 @@ async def lint( ), ) - return LintResult.create(request, result, output_simplifier=global_options.output_simplifier()) + return LintResult.create( + request, result, output_simplifier=global_options.output_simplifier() + ) def rules() -> Iterable[Rule | UnionRule]: diff --git a/src/python/pants/backend/tools/semgrep/rules_integration_test.py b/src/python/pants/backend/tools/semgrep/rules_integration_test.py index 0d2ec9f916c..d7c9a3a36a2 100644 --- a/src/python/pants/backend/tools/semgrep/rules_integration_test.py +++ b/src/python/pants/backend/tools/semgrep/rules_integration_test.py @@ -128,7 +128,8 @@ def run_semgrep( return tuple( rule_runner.request( - LintResult, [SemgrepLintRequest.Batch("", partition.elements, partition.metadata)] + LintResult, + [SemgrepLintRequest.Batch("", partition.elements, partition.metadata)], ) for partition in partitions ) @@ -157,7 +158,9 @@ def test_passing(rule_runner: RuleRunner, major_minor_interpreter: str) -> None: assert_success( rule_runner, tgt, - extra_args=[f"--python-interpreter-constraints=['=={major_minor_interpreter}.*']"], + extra_args=[ + f"--python-interpreter-constraints=['=={major_minor_interpreter}.*']" + ], ) @@ -189,7 +192,9 @@ def test_multiple_targets(rule_runner: RuleRunner) -> None: } ) - tgts = [rule_runner.get_target(Address(DIR, target_name=name)) for name in ["g", "b"]] + tgts = [ + rule_runner.get_target(Address(DIR, target_name=name)) for name in ["g", "b"] + ] results = run_semgrep( rule_runner, @@ -218,7 +223,7 @@ def test_multiple_targets(rule_runner: RuleRunner) -> None: f"{DIR}/bad.txt": BAD_FILE, f"{DIR}/BUILD": """file(name="f", source="bad.txt")""", ".semgrep/one.yml": RULES, - ".semgrep/two.yml": RULES2, + ".semgrep/vendored/two.yml": RULES2, }, id="via .semgrep directory", ), diff --git a/src/python/pants/backend/tools/semgrep/rules_test.py b/src/python/pants/backend/tools/semgrep/rules_test.py index 6309fd1fa97..1bf1aa13d99 100644 --- a/src/python/pants/backend/tools/semgrep/rules_test.py +++ b/src/python/pants/backend/tools/semgrep/rules_test.py @@ -20,28 +20,31 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs: @pytest.mark.parametrize( - ("paths", "expected"), + ("config_files", "config_dir_files", "expected"), [ - pytest.param((), configs({}), id="nothing"), + pytest.param((), (), configs({}), id="nothing"), pytest.param( ("foo/bar/.semgrep.yml",), + (), configs({"foo/bar": {"foo/bar/.semgrep.yml"}}), id="semgrep_file", ), pytest.param( + (), ("foo/bar/.semgrep/baz.yml", "foo/bar/.semgrep/qux.yml"), - configs({"foo/bar": {"foo/bar/.semgrep/baz.yml", "foo/bar/.semgrep/qux.yml"}}), + configs( + {"foo/bar": {"foo/bar/.semgrep/baz.yml", "foo/bar/.semgrep/qux.yml"}} + ), id="semgrep_dir", ), pytest.param( - ( - "foo/bar/.semgrep.yml", - "foo/bar/.semgrep/baz.yml", - ), + ("foo/bar/.semgrep.yml",), + ("foo/bar/.semgrep/baz.yml",), configs({"foo/bar": {"foo/bar/.semgrep.yml", "foo/bar/.semgrep/baz.yml"}}), id="both_file_and_dir", ), pytest.param( + (), ( "foo/bar/.semgrep/.semgrep.yml", "foo/bar/.semgrep/baz1.yml", @@ -60,10 +63,10 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs: ), pytest.param( ( - "foo/.semgrep/baz.yml", "foo/bar/.semgrep.yml", "foo/bar/qux/.semgrep.yml", ), + ("foo/.semgrep/baz.yml",), configs( { "foo": {"foo/.semgrep/baz.yml"}, @@ -75,15 +78,23 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs: ), # at the top level should be okay too pytest.param( - (".semgrep.yml", ".semgrep/foo.yml"), + (".semgrep.yml",), + (".semgrep/foo.yml",), configs({"": {".semgrep.yml", ".semgrep/foo.yml"}}), id="top_level", ), ], ) -def test_group_by_group_by_semgrep_dir(paths: tuple[str, ...], expected: AllSemgrepConfigs): - input = Paths(files=paths, dirs=()) - result = rules._group_by_semgrep_dir((".semgrep",), input) +def test_group_by_group_by_semgrep_dir( + config_files: tuple[str, ...], + config_dir_files: tuple[str, ...], + expected: AllSemgrepConfigs, +): + config_file_paths = Paths(files=config_files, dirs=()) + config_dir_file_paths = Paths(files=config_dir_files, dirs=()) + result = rules._group_by_semgrep_dir( + config_file_paths, config_dir_file_paths, ".semgrep" + ) assert result == expected diff --git a/src/python/pants/backend/tools/semgrep/subsystem.py b/src/python/pants/backend/tools/semgrep/subsystem.py index 013cab96f71..36b04cc648e 100644 --- a/src/python/pants/backend/tools/semgrep/subsystem.py +++ b/src/python/pants/backend/tools/semgrep/subsystem.py @@ -12,13 +12,7 @@ from pants.engine.rules import Rule, collect_rules from pants.engine.target import Dependencies, FieldSet, SingleSourceField, Target from pants.engine.unions import UnionRule -from pants.option.option_types import ( - ArgsListOption, - BoolOption, - SkipOption, - StrListOption, - StrOption, -) +from pants.option.option_types import ArgsListOption, BoolOption, SkipOption, StrOption from pants.util.strutil import softwrap @@ -57,20 +51,18 @@ class SemgrepSubsystem(PythonToolBase): register_lockfile = True default_lockfile_resource = ("pants.backend.tools.semgrep", "semgrep.lock") - config_paths = StrListOption( - default=[".semgrep"], + config_name = StrOption( + default=None, help=softwrap( """ - The locations of semgrep configs + The name of the semgrep config file or directory, which will be discovered and used + hierarchically. If using a file, it must have the extension `.yaml` or `.yml`. + + URLs and registry names are not supported. """ ), ) - ignore_config_path = StrOption( - default=".semgrepignore", - help="The path to the semgrepignore file", - ) - args = ArgsListOption( example="--verbose", default=["--quiet"], From 3e47476b2577bf76eca2042d665c63463e5277b4 Mon Sep 17 00:00:00 2001 From: purajit Date: Tue, 9 Jul 2024 12:44:01 -0700 Subject: [PATCH 07/10] lint --- .../pants/backend/tools/semgrep/rules.py | 38 +++++-------------- .../tools/semgrep/rules_integration_test.py | 8 +--- .../pants/backend/tools/semgrep/rules_test.py | 8 +--- 3 files changed, 13 insertions(+), 41 deletions(-) diff --git a/src/python/pants/backend/tools/semgrep/rules.py b/src/python/pants/backend/tools/semgrep/rules.py index 3ae7d5089a8..ce123b66ae4 100644 --- a/src/python/pants/backend/tools/semgrep/rules.py +++ b/src/python/pants/backend/tools/semgrep/rules.py @@ -13,19 +13,9 @@ from pants.backend.python.util_rules.pex import VenvPexProcess, create_venv_pex from pants.core.goals.lint import LintResult, LintTargetsRequest from pants.core.util_rules.partitions import Partition, Partitions -from pants.core.util_rules.source_files import ( - SourceFilesRequest, - determine_source_files, -) +from pants.core.util_rules.source_files import SourceFilesRequest, determine_source_files from pants.engine.addresses import Address -from pants.engine.fs import ( - CreateDigest, - FileContent, - MergeDigests, - PathGlobs, - Paths, - Snapshot, -) +from pants.engine.fs import CreateDigest, FileContent, MergeDigests, PathGlobs, Paths, Snapshot from pants.engine.intrinsics import ( create_digest_to_digest, digest_to_snapshot, @@ -143,9 +133,7 @@ class RelevantSemgrepConfigs(frozenset[PurePath]): async def infer_relevant_semgrep_configs( request: RelevantSemgrepConfigsRequest, all_semgrep: AllSemgrepConfigs ) -> RelevantSemgrepConfigs: - return RelevantSemgrepConfigs( - all_semgrep.ancestor_configs(request.field_set.address) - ) + return RelevantSemgrepConfigs(all_semgrep.ancestor_configs(request.field_set.address)) @rule @@ -157,9 +145,7 @@ async def partition( return Partitions() all_configs = await concurrently( - infer_relevant_semgrep_configs( - RelevantSemgrepConfigsRequest(field_set), **implicitly() - ) + infer_relevant_semgrep_configs(RelevantSemgrepConfigsRequest(field_set), **implicitly()) for field_set in request.field_sets ) @@ -190,19 +176,17 @@ async def lint( semgrep: SemgrepSubsystem, global_options: GlobalOptions, ) -> LintResult: - config_files, semgrep_pex, input_files, settings = await concurrently( + config_files, ignore_files, semgrep_pex, input_files, settings = await concurrently( digest_to_snapshot( - **implicitly( - PathGlobs(str(s) for s in request.partition_metadata.config_files) - ) + **implicitly(PathGlobs(str(s) for s in request.partition_metadata.config_files)) ), + digest_to_snapshot(**implicitly(PathGlobs([_SEMGREPIGNORE_FILE_NAME]))), create_venv_pex(**implicitly(semgrep.to_pex_request())), determine_source_files( SourceFilesRequest(field_set.source for field_set in request.elements) ), create_digest_to_digest(CreateDigest([_DEFAULT_SETTINGS])), ) - ignore_files = await Get(Snapshot, PathGlobs([_SEMGREPIGNORE_FILE_NAME])) input_digest = await merge_digests_request_to_digest( MergeDigests( @@ -215,9 +199,7 @@ async def lint( ) ) - cache_scope = ( - ProcessCacheScope.PER_SESSION if semgrep.force else ProcessCacheScope.SUCCESSFUL - ) + cache_scope = ProcessCacheScope.PER_SESSION if semgrep.force else ProcessCacheScope.SUCCESSFUL # TODO: https://github.com/pantsbuild/pants/issues/18430 support running this with --autofix # under the fix goal... but not all rules have fixes, so we need to be running with @@ -257,9 +239,7 @@ async def lint( ) ) - return LintResult.create( - request, result, output_simplifier=global_options.output_simplifier() - ) + return LintResult.create(request, result, output_simplifier=global_options.output_simplifier()) def rules() -> Iterable[Rule | UnionRule]: diff --git a/src/python/pants/backend/tools/semgrep/rules_integration_test.py b/src/python/pants/backend/tools/semgrep/rules_integration_test.py index d7c9a3a36a2..84a844d9509 100644 --- a/src/python/pants/backend/tools/semgrep/rules_integration_test.py +++ b/src/python/pants/backend/tools/semgrep/rules_integration_test.py @@ -158,9 +158,7 @@ def test_passing(rule_runner: RuleRunner, major_minor_interpreter: str) -> None: assert_success( rule_runner, tgt, - extra_args=[ - f"--python-interpreter-constraints=['=={major_minor_interpreter}.*']" - ], + extra_args=[f"--python-interpreter-constraints=['=={major_minor_interpreter}.*']"], ) @@ -192,9 +190,7 @@ def test_multiple_targets(rule_runner: RuleRunner) -> None: } ) - tgts = [ - rule_runner.get_target(Address(DIR, target_name=name)) for name in ["g", "b"] - ] + tgts = [rule_runner.get_target(Address(DIR, target_name=name)) for name in ["g", "b"]] results = run_semgrep( rule_runner, diff --git a/src/python/pants/backend/tools/semgrep/rules_test.py b/src/python/pants/backend/tools/semgrep/rules_test.py index 1bf1aa13d99..ec530bce32e 100644 --- a/src/python/pants/backend/tools/semgrep/rules_test.py +++ b/src/python/pants/backend/tools/semgrep/rules_test.py @@ -32,9 +32,7 @@ def configs(strs: dict[str, set[str]]) -> AllSemgrepConfigs: pytest.param( (), ("foo/bar/.semgrep/baz.yml", "foo/bar/.semgrep/qux.yml"), - configs( - {"foo/bar": {"foo/bar/.semgrep/baz.yml", "foo/bar/.semgrep/qux.yml"}} - ), + configs({"foo/bar": {"foo/bar/.semgrep/baz.yml", "foo/bar/.semgrep/qux.yml"}}), id="semgrep_dir", ), pytest.param( @@ -92,9 +90,7 @@ def test_group_by_group_by_semgrep_dir( ): config_file_paths = Paths(files=config_files, dirs=()) config_dir_file_paths = Paths(files=config_dir_files, dirs=()) - result = rules._group_by_semgrep_dir( - config_file_paths, config_dir_file_paths, ".semgrep" - ) + result = rules._group_by_semgrep_dir(config_file_paths, config_dir_file_paths, ".semgrep") assert result == expected From 8f38d51acb971d61a3c9c60673289f485cf3790c Mon Sep 17 00:00:00 2001 From: purajit Date: Tue, 9 Jul 2024 12:47:32 -0700 Subject: [PATCH 08/10] update --- src/python/pants/backend/tools/semgrep/rules.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/pants/backend/tools/semgrep/rules.py b/src/python/pants/backend/tools/semgrep/rules.py index ce123b66ae4..7a78de2d8c8 100644 --- a/src/python/pants/backend/tools/semgrep/rules.py +++ b/src/python/pants/backend/tools/semgrep/rules.py @@ -15,7 +15,7 @@ from pants.core.util_rules.partitions import Partition, Partitions from pants.core.util_rules.source_files import SourceFilesRequest, determine_source_files from pants.engine.addresses import Address -from pants.engine.fs import CreateDigest, FileContent, MergeDigests, PathGlobs, Paths, Snapshot +from pants.engine.fs import CreateDigest, FileContent, MergeDigests, PathGlobs, Paths from pants.engine.intrinsics import ( create_digest_to_digest, digest_to_snapshot, @@ -111,8 +111,8 @@ async def find_all_semgrep_configs(semgrep: SemgrepSubsystem) -> AllSemgrepConfi f"**/{semgrep.config_name}/**/*.yml", ) - all_config_files = await path_globs_to_paths(config_file_globs) - all_config_dir_files = await path_globs_to_paths(config_dir_globs) + all_config_files = await path_globs_to_paths(PathGlobs(config_file_globs)) + all_config_dir_files = await path_globs_to_paths(PathGlobs(config_dir_globs)) return _group_by_semgrep_dir( all_config_files, all_config_dir_files, From 582cf7b90d3b27541886c7e26cf409894ea56a80 Mon Sep 17 00:00:00 2001 From: purajit Date: Wed, 10 Jul 2024 19:45:03 -0400 Subject: [PATCH 09/10] Update docs/notes/2.23.x.md Co-authored-by: Huon Wilson --- docs/notes/2.23.x.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/notes/2.23.x.md b/docs/notes/2.23.x.md index ffde68803c1..41bc16f2589 100644 --- a/docs/notes/2.23.x.md +++ b/docs/notes/2.23.x.md @@ -150,7 +150,7 @@ Fixed pulling `helm_artifact`s from OCI repositories. Added `workspace_invalidation_sources` field to `adhoc_tool` and `shell_command` target types. This new field allows declaring that these targets depend on files without bringing those files into the execution sandbox, but that the target should still be re-executed if those files change. This is intended to work with the `workspace_environment` support where processes are executed in the workspace and not in a separate sandbox. #### Semgrep -Semgrep now allows configuring the config directory, the semgrepignore config file, and will recursively discover all rules in the directory. +Semgrep now allows configuring config file discovery via [the new `config_name` option](https://www.pantsbuild.org/2.23/reference/subsystems/semgrep#config_name). In addition, it will now recursively discover all rules within a config directory, not just the immediate children. ### Plugin API changes From ed060496d43f33c8122957c162095cd8a8366d2d Mon Sep 17 00:00:00 2001 From: purajit Date: Mon, 15 Jul 2024 14:52:57 -0700 Subject: [PATCH 10/10] added config_name tests --- .../tools/semgrep/rules_integration_test.py | 47 ++++++++++++++++--- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/src/python/pants/backend/tools/semgrep/rules_integration_test.py b/src/python/pants/backend/tools/semgrep/rules_integration_test.py index 84a844d9509..cc5edf104f9 100644 --- a/src/python/pants/backend/tools/semgrep/rules_integration_test.py +++ b/src/python/pants/backend/tools/semgrep/rules_integration_test.py @@ -162,11 +162,30 @@ def test_passing(rule_runner: RuleRunner, major_minor_interpreter: str) -> None: ) -def test_failing(rule_runner: RuleRunner) -> None: - rule_runner.write_files(BAD_FILE_LAYOUT) +@pytest.mark.parametrize( + "files,config_name", + [ + pytest.param( + BAD_FILE_LAYOUT, + None, + ), + pytest.param( + { + f"{DIR}/bad.txt": BAD_FILE, + f"{DIR}/BUILD": """file(name="f", source="bad.txt")""", + ".custom_semgrep_file.yml": RULES, + }, + ".custom_semgrep_file.yml", + id="via custom config file", + ), + ], +) +def test_failing(rule_runner: RuleRunner, files: dict[str, str], config_name: str | None) -> None: + rule_runner.write_files(files) tgt = rule_runner.get_target(Address(DIR, target_name="f")) - results = run_semgrep(rule_runner, [tgt]) + extra_args = [f"--semgrep-config-name={config_name}"] if config_name else [] + results = run_semgrep(rule_runner, [tgt], extra_args=extra_args) assert len(results) == 1 result = results[0] assert "find-bad-pattern" in result.stdout @@ -205,13 +224,14 @@ def test_multiple_targets(rule_runner: RuleRunner) -> None: @pytest.mark.parametrize( - "files", + "files,config_name", [ pytest.param( { **BAD_FILE_LAYOUT, ".semgrep.yml": RULES2, }, + None, id="via nesting", ), pytest.param( @@ -221,6 +241,7 @@ def test_multiple_targets(rule_runner: RuleRunner) -> None: ".semgrep/one.yml": RULES, ".semgrep/vendored/two.yml": RULES2, }, + None, id="via .semgrep directory", ), pytest.param( @@ -230,15 +251,29 @@ def test_multiple_targets(rule_runner: RuleRunner) -> None: ".semgrep/one.yml": RULES, ".semgrep/vendored/two.yml": RULES2, }, + None, id="via recursive .semgrep directory", ), + pytest.param( + { + f"{DIR}/bad.txt": BAD_FILE, + f"{DIR}/BUILD": """file(name="f", source="bad.txt")""", + "custom_semgrep_dir/one.yml": RULES, + "custom_semgrep_dir/vendored/two.yml": RULES2, + }, + "custom_semgrep_dir", + id="via custom recursive config directory", + ), ], ) -def test_multiple_configs(rule_runner: RuleRunner, files: dict[str, str]) -> None: +def test_multiple_configs( + rule_runner: RuleRunner, files: dict[str, str], config_name: str | None +) -> None: rule_runner.write_files(files) tgt = rule_runner.get_target(Address(DIR, target_name="f")) - results = run_semgrep(rule_runner, [tgt]) + extra_args = [f"--semgrep-config-name={config_name}"] if config_name else [] + results = run_semgrep(rule_runner, [tgt], extra_args=extra_args) assert len(results) == 1 result = results[0]