From c16e63cf933ae919119926e1bab8dcea6ea3a1bc Mon Sep 17 00:00:00 2001
From: David Wertenteil
Date: Wed, 29 May 2024 09:10:35 +0300
Subject: [PATCH] support SARIF format

---
 .github/workflows/main.yml    |  2 +-
 .github/workflows/test.yml    |  1 -
 README.md                     | 57 ++++++++++++++++++++++-
 tests/test_client_check.py    | 52 +++++++++++++++++++++
 tests/test_core_check.py      | 74 ++++++++++++++++++++++++++++-
 urlchecker/client/__init__.py |  9 +++-
 urlchecker/client/check.py    |  3 ++
 urlchecker/core/check.py      | 87 ++++++++++++++++++++++++++++++++---
 8 files changed, 271 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 80d962a..26a2ae1 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -2,7 +2,7 @@ name: Build and Deploy containers
 
 on:
   # Always test on pull request
-  pull_request: []
+  pull_request:
 
   # Deploy on merge to main
   push:
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 268dcff..37bcc10 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -5,7 +5,6 @@ on:
     branches:
       - master
   pull_request:
-    branches_ignore: []
 
 jobs:
   formatting:
diff --git a/README.md b/README.md
index 145851a..76055ab 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ $ urlchecker check --help
 usage: urlchecker check [-h] [-b BRANCH] [--subfolder SUBFOLDER] [--cleanup] [--serial] [--no-check-certs] [--force-pass]
                         [--no-print] [--verbose] [--file-types FILE_TYPES] [--files FILES] [--exclude-urls EXCLUDE_URLS] [--exclude-patterns EXCLUDE_PATTERNS]
-                        [--exclude-files EXCLUDE_FILES] [--save SAVE] [--retry-count RETRY_COUNT] [--timeout TIMEOUT]
+                        [--exclude-files EXCLUDE_FILES] [--save SAVE] [--format FORMAT] [--retry-count RETRY_COUNT] [--timeout TIMEOUT]
                         path
 
 positional arguments:
@@ -89,6 +89,7 @@ options:
   --exclude-files EXCLUDE_FILES
                         comma separated list of files and patterns to exclude (no spaces)
-  --save SAVE           Path to a csv file to save results to.
+  --save SAVE           Path to a file to save results to.
+  --format FORMAT       format to save results to (csv or sarif), defaults to csv.
   --retry-count RETRY_COUNT
                         retry count upon failure (defaults to 2, one retry).
   --timeout TIMEOUT     timeout (seconds) to provide to the requests library (defaults to 5)
@@ -266,7 +267,9 @@ $ urlchecker check --exclude-files=README.md,_config.yml
 
 ### Save Results
 
-If you want to save your results to file, perhaps for some kind of record or
+#### Save results in CSV format
+
+If you want to save your results to a file in CSV format, perhaps for some kind of record or
 other data analysis, you can provide the `--save` argument:
 
 ```bash
@@ -313,6 +316,56 @@ https://github.com/SuperKogito/URLs-checker/issues/1,failed
 https://github.com/SuperKogito/URLs-checker/issues/4,failed
 ```
 
+#### Save results in SARIF format
+
+To save results in SARIF format, you can provide the `--format` argument with `sarif`:
+
+```bash
+$ urlchecker check --save results.sarif --format sarif .
+```
+
+This will produce a SARIF file with detailed information about each failed URL, including the exact line in the code where the URL was found, which is useful for integrating with tools that support SARIF for static analysis.
+Because the output pinpoints issues directly in the code, broken links are faster to locate and fix. An abbreviated example of the output (additional results elided with `...`):
+
+```json
+{
+  "version": "2.1.0",
+  "runs": [
+    {
+      "tool": {
+        "driver": {
+          "name": "UrlChecker",
+          "informationUri": "https://github.com/urlstechie/urlchecker-python",
+          "rules": [
+            {
+              "id": "URL001",
+              "name": "Invalid/Unreachable URL",
+              "shortDescription": { "text": "This URL is invalid or unreachable." },
+              "fullDescription": { "text": "This URL is invalid or unreachable." },
+              "helpUri": "https://www.rfc-editor.org/rfc/rfc3986"
+            }
+          ]
+        }
+      },
+      "results": [
+        {
+          "ruleId": "URL001",
+          "message": { "text": "URL https://github.com/SuperKogito/URLs-checker/README.md is invalid or unreachable." },
+          "locations": [
+            {
+              "physicalLocation": {
+                "artifactLocation": { "uri": "example_file.py" },
+                "region": { "startLine": 10 }
+              }
+            }
+          ]
+        },
+        ...
+      ]
+    }
+  ]
+}
+```
 
 ### Usage from Python
 
diff --git a/tests/test_client_check.py b/tests/test_client_check.py
index 3abc86d..f3ccead 100644
--- a/tests/test_client_check.py
+++ b/tests/test_client_check.py
@@ -219,3 +219,55 @@ def test_client_save(save):
     if save:
         if not os.path.exists(output_csv.name):
             raise AssertionError
+
+@pytest.mark.parametrize("save, output_format", [(True, ""), (True, "csv"), (True, "sarif")])
+def test_client_save_format(save, output_format):
+
+    # init config parser
+    config = configparser.ConfigParser()
+    config.read("./tests/_local_test_config.conf")
+
+    # init env variables
+    path = config["DEFAULT"]["git_path_test_value"]
+    file_types = config["DEFAULT"]["file_types_test_values"]
+    exclude_urls = config["DEFAULT"]["exclude_test_urls"]
+    exclude_patterns = config["DEFAULT"]["exclude_test_patterns"]
+
+    # Generate command
+    cmd = [
+        "urlchecker",
+        "check",
+        "--subfolder",
+        "test_files",
+        "--file-types",
+        file_types,
+        "--exclude-files",
+        "conf.py",
+        "--exclude-urls",
+        exclude_urls,
+        "--exclude-patterns",
+        exclude_patterns,
+    ]
+
+    # Map each --format value to its expected suffix ("" means the csv default)
+    suffix = {
+        "csv": ".csv",
+        "sarif": ".sarif",
+        "": ".csv",
+    }
+    # Write to file
+    if save:
+        output_file = tempfile.NamedTemporaryFile(suffix=suffix[output_format], prefix="urlchecker-")
+        cmd += ["--save", output_file.name]
+        if output_format:
+            cmd += ["--format", output_format]
+
+    # Add final path
+    cmd.append(path)
+
+    print(" ".join(cmd))
+    # execute script
+    pipe = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if save:
+        if not os.path.exists(output_file.name):
+            raise AssertionError
\ No newline at end of file
diff --git a/tests/test_core_check.py b/tests/test_core_check.py
index 6399952..1032504 100644
--- a/tests/test_core_check.py
+++ b/tests/test_core_check.py
@@ -2,6 +2,7 @@ import os
 import re
 import sys
 import pytest
+import json
 import configparser
 from urlchecker.core.fileproc import get_file_paths
 from urlchecker.main.github import clone_repo
@@ -95,9 +96,8 @@ def test_locally(local_folder_path, config_fname):
     )
     print("Done.")
 
-
 @pytest.mark.parametrize("retry_count", [1, 3])
-def test_check_run_save(tmp_path, retry_count):
+def test_check_run_save_csv(tmp_path, retry_count):
 
     # init vars
     git_path = "https://github.com/urlstechie/urlchecker-test-repo"
@@ -161,3 +161,73 @@ def test_check_run_save(tmp_path, retry_count):
     for line in lines[1:]:
         url, result, filename = line.split(",")
         assert not filename.startswith(root)
+
+@pytest.mark.parametrize("retry_count", [1, 3])
+def test_check_run_save_sarif(tmp_path, retry_count):
+
+    # init vars
+    git_path = "https://github.com/urlstechie/urlchecker-test-repo"
+    file_types = [".py", ".md"]
+    print_all = True
+    exclude_urls = [
+        "https://superkogito.github.io/figures/fig2.html",
"https://superkogito.github.io/figures/fig4.html", + ] + exclude_patterns = ["https://superkogito.github.io/tables"] + timeout = 1 + force_pass = False + + # clone repo + base_path = clone_repo(git_path) + + # get all file paths in subfolder specified + base_path = os.path.join(base_path, "test_files") + file_paths = get_file_paths(base_path, file_types) + + # check repo urls + checker = UrlChecker(print_all=print_all, save_results_format="sarif") + check_results = checker.run( + file_paths=file_paths, + exclude_urls=exclude_urls, + exclude_patterns=exclude_patterns, + retry_count=retry_count, + timeout=timeout, + ) + + # Test saving to file + output_file = os.path.join(str(tmp_path), "results.sarif") + assert not os.path.exists(output_file) + saved_file = checker.save_results(output_file) + assert os.path.exists(output_file) + + # Read in output file + with open(saved_file, "r") as file: + sarif_output = json.load(file) + + # Verify SARIF output structure + assert "version" in sarif_output + assert sarif_output["version"] == "2.1.0" + assert "runs" in sarif_output + assert len(sarif_output["runs"]) > 0 + assert "tool" in sarif_output["runs"][0] + assert "driver" in sarif_output["runs"][0]["tool"] + assert "name" in sarif_output["runs"][0]["tool"]["driver"] + assert sarif_output["runs"][0]["tool"]["driver"]["name"] == "UrlChecker" + assert "results" in sarif_output["runs"][0] + + # Verify at least one result entry + assert len(sarif_output["runs"][0]["results"]) > 0 + + # Verify the structure of a result entry + result_entry = sarif_output["runs"][0]["results"][0] + assert "ruleId" in result_entry + assert result_entry["ruleId"] == "URL001" + assert "message" in result_entry + assert "text" in result_entry["message"] + assert "locations" in result_entry + assert len(result_entry["locations"]) > 0 + assert "physicalLocation" in result_entry["locations"][0] + assert "artifactLocation" in result_entry["locations"][0]["physicalLocation"] + assert "uri" in result_entry["locations"][0]["physicalLocation"]["artifactLocation"] + assert "region" in result_entry["locations"][0]["physicalLocation"] + assert "startLine" in result_entry["locations"][0]["physicalLocation"]["region"] diff --git a/urlchecker/client/__init__.py b/urlchecker/client/__init__.py index f8abeb5..316b76a 100755 --- a/urlchecker/client/__init__.py +++ b/urlchecker/client/__init__.py @@ -142,9 +142,16 @@ def get_parser(): check.add_argument( "--save", - help="Path to a csv file to save results to.", + help="Path to file to save results to.", default=None, ) + check.add_argument( + "--format", + help="File format to save results to.", + default="csv", + choices=["csv", "sarif"], + type=str, + ) # Timeouts diff --git a/urlchecker/client/check.py b/urlchecker/client/check.py index f1108f9..eaafab6 100644 --- a/urlchecker/client/check.py +++ b/urlchecker/client/check.py @@ -76,6 +76,7 @@ def main(args, extra): print(" force pass: %s" % args.force_pass) print(" retry count: %s" % args.retry_count) print(" save: %s" % args.save) + print(" format: %s" % args.format) print(" timeout: %s" % args.timeout) # Instantiate a new checker with provided arguments @@ -86,6 +87,8 @@ def main(args, extra): exclude_files=exclude_files, print_all=not args.no_print, serial=args.serial, + save_results_format=args.format, + ) check_results = checker.run( exclude_urls=exclude_urls, diff --git a/urlchecker/core/check.py b/urlchecker/core/check.py index 1d7eb43..ab5102d 100644 --- a/urlchecker/core/check.py +++ b/urlchecker/core/check.py @@ -8,10 +8,12 @@ 
""" import csv +import copy import os import random import re import sys +import json from typing import Optional, Dict, List from urlchecker.core import fileproc @@ -33,6 +35,7 @@ def __init__( print_all: bool = True, include_patterns: Optional[List[str]] = None, serial: bool = False, + save_results_format: str = "csv", ): """ initiate a url checker. At init we take in preferences for @@ -40,12 +43,13 @@ def __init__( parameters to run a url check. Args: - - path (str) : full path to the root folder to check. If not defined, no file_paths are parsed. - - file_types (list) : types of files to scan for links. - - print_all (bool) : control var for whether to print all checked file names or only the ones with urls. - - exclude_files (list) : list of excluded files and patterns for flies. - - include_patterns (list) : list of files and patterns to check. - - serial (bool) : do checks in serial (no multiprocessing) + - path (str) : full path to the root folder to check. If not defined, no file_paths are parsed. + - file_types (list) : types of files to scan for links. + - print_all (bool) : control var for whether to print all checked file names or only the ones with urls. + - exclude_files (list) : list of excluded files and patterns for flies. + - include_patterns (list) : list of files and patterns to check. + - serial (bool) : do checks in serial (no multiprocessing) + - save_results_format (bool) : format to save results (csv or sarif) """ # Initiate results object, and checks lookup (holds UrlCheck) for each file self.results = { @@ -65,6 +69,17 @@ def __init__( self.file_types = file_types or [".py", ".md"] self.file_paths = [] self.serial = serial + + # Mapping save results formats to their respective methods + save_methods = { + "csv": self.save_results_as_csv, + "sarif": self.save_results_as_sarif + } + if save_results_format in save_methods: + self.save_results = save_methods[save_results_format] + else: + sys.exit(f"{save_results_format} is an invalid format to save results.") + # get all file paths if a path is defined if path: @@ -92,7 +107,7 @@ def __str__(self) -> str: def __repr__(self) -> str: return self.__str__() - def save_results( + def save_results_as_csv( self, file_path: str, sep: str = ",", @@ -163,6 +178,56 @@ def save_results( return file_path + def save_results_as_sarif(self, file_path: str) -> str: + sarif_log = { + "version": "2.1.0", + "runs": [ + { + "tool": { + "driver": { + "name": "UrlChecker", + "informationUri": "https://github.com/urlstechie/urlchecker-python", + "rules": [ + { + "id": "RFC3986", + "name": "Invalid/Unreachable URL", + "shortDescription": {"text": "This URL is invalid or unreachable."}, + "fullDescription": {"text": "This URL is invalid or unreachable."}, + "helpUri": "https://www.rfc-editor.org/rfc/rfc3986" + } + ] + } + }, + "results": [] + } + ] + } + + for file_name, result in self.checks.items(): + failed_urls = copy.deepcopy(result["failed"]) + unique_failed_urls = set(failed_urls) + for url in unique_failed_urls: + line_numbers = find_url_lines(file_name, url) + if not line_numbers: + line_numbers = [1] # Default to 1 if not found + + for line_number in line_numbers: + sarif_log["runs"][0]["results"].append({ + "ruleId": "URL001", + "message": {"text": f"URL {url} is invalid or unreachable."}, + "locations": [{ + "physicalLocation": { + "artifactLocation": {"uri": file_name}, + "region": {"startLine": line_number} + } + }] + }) + + with open(file_path, 'w') as file: + json.dump(sarif_log, file, indent=2) + + return file_path + def 
     def run(
         self,
         file_paths: Optional[List[str]] = None,
@@ -273,3 +338,13 @@ def check_task(*args, **kwargs):
         "passed": checker.passed,
         "excluded": checker.excluded,
     }
+
+def find_url_lines(file_name: str, url: str) -> List[int]:
+    """Return the 1-indexed line numbers in file_name that contain url."""
+    line_numbers = []
+    with open(file_name, 'r') as file:
+        # enumerate from 1 so the numbers match SARIF's 1-based startLine
+        for i, line in enumerate(file, 1):
+            if url in line:
+                line_numbers.append(i)
+    return line_numbers
\ No newline at end of file
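
For reviewers, a minimal sketch of how the new option is exercised from Python once this patch is applied. The repository path and file types below are placeholders; `UrlChecker`, `get_file_paths`, `run`, and `save_results` are the names introduced or reused in the diff above, and `save_results` dispatches to `save_results_as_sarif` when the checker is constructed with `save_results_format="sarif"`.

```python
from urlchecker.core.check import UrlChecker
from urlchecker.core.fileproc import get_file_paths

# Gather the files to scan (placeholder path and file types)
file_paths = get_file_paths("/path/to/repo", [".py", ".md"])

# save_results_format="sarif" binds checker.save_results to
# save_results_as_sarif; any value other than "csv" or "sarif" exits
checker = UrlChecker(print_all=True, save_results_format="sarif")
checker.run(file_paths=file_paths, retry_count=2, timeout=5)

# Writes a SARIF 2.1.0 log with one result per failed URL occurrence,
# locating each URL by line via find_url_lines (startLine falls back to 1)
checker.save_results("results.sarif")
```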