From c16e63cf933ae919119926e1bab8dcea6ea3a1bc Mon Sep 17 00:00:00 2001
From: David Wertenteil
Date: Wed, 29 May 2024 09:10:35 +0300
Subject: [PATCH] support SARIF format

---
 .github/workflows/main.yml    |  2 +-
 .github/workflows/test.yml    |  1 -
 README.md                     | 57 ++++++++++++++++++++++-
 tests/test_client_check.py    | 52 +++++++++++++++++++++
 tests/test_core_check.py      | 74 ++++++++++++++++++++++++++++-
 urlchecker/client/__init__.py |  9 +++-
 urlchecker/client/check.py    |  3 ++
 urlchecker/core/check.py      | 87 ++++++++++++++++++++++++++++++++---
 8 files changed, 271 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 80d962a..26a2ae1 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -2,7 +2,7 @@ name: Build and Deploy containers
 
 on:
   # Always test on pull request
-  pull_request: []
+  pull_request:
 
   # Deploy on merge to main
   push:
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 268dcff..37bcc10 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -5,7 +5,6 @@ on:
     branches:
       - master
   pull_request:
-    branches_ignore: []
 
 jobs:
   formatting:
diff --git a/README.md b/README.md
index 145851a..76055ab 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ $ urlchecker check --help
 usage: urlchecker check [-h] [-b BRANCH] [--subfolder SUBFOLDER] [--cleanup] [--serial] [--no-check-certs] [--force-pass]
                         [--no-print] [--verbose] [--file-types FILE_TYPES] [--files FILES] [--exclude-urls EXCLUDE_URLS] [--exclude-patterns EXCLUDE_PATTERNS]
-                        [--exclude-files EXCLUDE_FILES] [--save SAVE] [--retry-count RETRY_COUNT] [--timeout TIMEOUT]
+                        [--exclude-files EXCLUDE_FILES] [--save SAVE] [--format FORMAT] [--retry-count RETRY_COUNT] [--timeout TIMEOUT]
                         path
 
 positional arguments:
@@ -89,6 +89,7 @@ options:
   --exclude-files EXCLUDE_FILES
                         comma separated list of files and patterns to exclude (no spaces)
-  --save SAVE           Path to a csv file to save results to.
+  --save SAVE           Path to a file to save results to.
+  --format FORMAT       format to save results to (csv or sarif), defaults to csv.
   --retry-count RETRY_COUNT
                         retry count upon failure (defaults to 2, one retry).
   --timeout TIMEOUT     timeout (seconds) to provide to the requests library (defaults to 5)
@@ -266,7 +267,9 @@ $ urlchecker check --exclude-files=README.md,_config.yml
 
 ### Save Results
 
-If you want to save your results to file, perhaps for some kind of record or
+#### Save results in CSV format
+
+If you want to save your results to a file in CSV format, perhaps for some kind of record or
 other data analysis, you can provide the `--save` argument:
 
 ```bash
@@ -313,6 +316,56 @@ https://github.com/SuperKogito/URLs-checker/issues/1,failed
 https://github.com/SuperKogito/URLs-checker/issues/4,failed
 ```
 
+#### Save results in SARIF format
+
+To save results in SARIF format, you can provide the `--format` argument with `sarif`:
+
+```bash
+$ urlchecker check --save results.sarif --format sarif .
+```
+
+This will produce a SARIF file with detailed information about each failed URL, including the exact line in the code where the URL was found, which is useful for integrating with tools that support SARIF for static analysis.
+Because the output pinpoints issues directly in the code, broken links are faster to locate and fix. An abbreviated example of the output (additional results elided with `...`):
+
+```json
+{
+  "version": "2.1.0",
+  "runs": [
+    {
+      "tool": {
+        "driver": {
+          "name": "UrlChecker",
+          "informationUri": "https://github.com/urlstechie/urlchecker-python",
+          "rules": [
+            {
+              "id": "URL001",
+              "name": "Invalid/Unreachable URL",
+              "shortDescription": { "text": "This URL is invalid or unreachable." },
+              "fullDescription": { "text": "This URL is invalid or unreachable." },
+              "helpUri": "https://www.rfc-editor.org/rfc/rfc3986"
+            }
+          ]
+        }
+      },
+      "results": [
+        {
+          "ruleId": "URL001",
+          "message": { "text": "URL https://github.com/SuperKogito/URLs-checker/README.md is invalid or unreachable." },
+          "locations": [
+            {
+              "physicalLocation": {
+                "artifactLocation": { "uri": "example_file.py" },
+                "region": { "startLine": 10 }
+              }
+            }
+          ]
+        },
+        ...
+      ]
+    }
+  ]
+}
+```
 
 ### Usage from Python
 
diff --git a/tests/test_client_check.py b/tests/test_client_check.py
index 3abc86d..f3ccead 100644
--- a/tests/test_client_check.py
+++ b/tests/test_client_check.py
@@ -219,3 +219,55 @@ def test_client_save(save):
     if save:
         if not os.path.exists(output_csv.name):
             raise AssertionError
+
+@pytest.mark.parametrize("save, output_format", [(True, ""), (True, "csv"), (True, "sarif")])
+def test_client_save_format(save, output_format):
+
+    # init config parser
+    config = configparser.ConfigParser()
+    config.read("./tests/_local_test_config.conf")
+
+    # init env variables
+    path = config["DEFAULT"]["git_path_test_value"]
+    file_types = config["DEFAULT"]["file_types_test_values"]
+    exclude_urls = config["DEFAULT"]["exclude_test_urls"]
+    exclude_patterns = config["DEFAULT"]["exclude_test_patterns"]
+
+    # Generate command
+    cmd = [
+        "urlchecker",
+        "check",
+        "--subfolder",
+        "test_files",
+        "--file-types",
+        file_types,
+        "--exclude-files",
+        "conf.py",
+        "--exclude-urls",
+        exclude_urls,
+        "--exclude-patterns",
+        exclude_patterns,
+    ]
+
+    # Map each --format value to its expected suffix ("" means the csv default)
+    suffix = {
+        "csv": ".csv",
+        "sarif": ".sarif",
+        "": ".csv",
+    }
+    # Write to file
+    if save:
+        output_file = tempfile.NamedTemporaryFile(suffix=suffix[output_format], prefix="urlchecker-")
+        cmd += ["--save", output_file.name]
+        if output_format:
+            cmd += ["--format", output_format]
+
+    # Add final path
+    cmd.append(path)
+
+    print(" ".join(cmd))
+    # execute script
+    pipe = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if save:
+        if not os.path.exists(output_file.name):
+            raise AssertionError
\ No newline at end of file
diff --git a/tests/test_core_check.py b/tests/test_core_check.py
index 6399952..1032504 100644
--- a/tests/test_core_check.py
+++ b/tests/test_core_check.py
@@ -2,6 +2,7 @@ import os
 import re
 import sys
 import pytest
+import json
 import configparser
 from urlchecker.core.fileproc import get_file_paths
 from urlchecker.main.github import clone_repo
@@ -95,9 +96,8 @@ def test_locally(local_folder_path, config_fname):
     )
     print("Done.")
 
-
 @pytest.mark.parametrize("retry_count", [1, 3])
-def test_check_run_save(tmp_path, retry_count):
+def test_check_run_save_csv(tmp_path, retry_count):
 
     # init vars
     git_path = "https://github.com/urlstechie/urlchecker-test-repo"
@@ -161,3 +161,73 @@ def test_check_run_save(tmp_path, retry_count):
     for line in lines[1:]:
         url, result, filename = line.split(",")
         assert not filename.startswith(root)
+
+@pytest.mark.parametrize("retry_count", [1, 3])
+def test_check_run_save_sarif(tmp_path, retry_count):
+
+    # init vars
+    git_path = "https://github.com/urlstechie/urlchecker-test-repo"
+    file_types = [".py", ".md"]
+    print_all = True
+    exclude_urls = [
+        "https://superkogito.github.io/figures/fig2.html",
"https://superkogito.github.io/figures/fig4.html", + ] + exclude_patterns = ["https://superkogito.github.io/tables"] + timeout = 1 + force_pass = False + + # clone repo + base_path = clone_repo(git_path) + + # get all file paths in subfolder specified + base_path = os.path.join(base_path, "test_files") + file_paths = get_file_paths(base_path, file_types) + + # check repo urls + checker = UrlChecker(print_all=print_all, save_results_format="sarif") + check_results = checker.run( + file_paths=file_paths, + exclude_urls=exclude_urls, + exclude_patterns=exclude_patterns, + retry_count=retry_count, + timeout=timeout, + ) + + # Test saving to file + output_file = os.path.join(str(tmp_path), "results.sarif") + assert not os.path.exists(output_file) + saved_file = checker.save_results(output_file) + assert os.path.exists(output_file) + + # Read in output file + with open(saved_file, "r") as file: + sarif_output = json.load(file) + + # Verify SARIF output structure + assert "version" in sarif_output + assert sarif_output["version"] == "2.1.0" + assert "runs" in sarif_output + assert len(sarif_output["runs"]) > 0 + assert "tool" in sarif_output["runs"][0] + assert "driver" in sarif_output["runs"][0]["tool"] + assert "name" in sarif_output["runs"][0]["tool"]["driver"] + assert sarif_output["runs"][0]["tool"]["driver"]["name"] == "UrlChecker" + assert "results" in sarif_output["runs"][0] + + # Verify at least one result entry + assert len(sarif_output["runs"][0]["results"]) > 0 + + # Verify the structure of a result entry + result_entry = sarif_output["runs"][0]["results"][0] + assert "ruleId" in result_entry + assert result_entry["ruleId"] == "URL001" + assert "message" in result_entry + assert "text" in result_entry["message"] + assert "locations" in result_entry + assert len(result_entry["locations"]) > 0 + assert "physicalLocation" in result_entry["locations"][0] + assert "artifactLocation" in result_entry["locations"][0]["physicalLocation"] + assert "uri" in result_entry["locations"][0]["physicalLocation"]["artifactLocation"] + assert "region" in result_entry["locations"][0]["physicalLocation"] + assert "startLine" in result_entry["locations"][0]["physicalLocation"]["region"] diff --git a/urlchecker/client/__init__.py b/urlchecker/client/__init__.py index f8abeb5..316b76a 100755 --- a/urlchecker/client/__init__.py +++ b/urlchecker/client/__init__.py @@ -142,9 +142,16 @@ def get_parser(): check.add_argument( "--save", - help="Path to a csv file to save results to.", + help="Path to file to save results to.", default=None, ) + check.add_argument( + "--format", + help="File format to save results to.", + default="csv", + choices=["csv", "sarif"], + type=str, + ) # Timeouts diff --git a/urlchecker/client/check.py b/urlchecker/client/check.py index f1108f9..eaafab6 100644 --- a/urlchecker/client/check.py +++ b/urlchecker/client/check.py @@ -76,6 +76,7 @@ def main(args, extra): print(" force pass: %s" % args.force_pass) print(" retry count: %s" % args.retry_count) print(" save: %s" % args.save) + print(" format: %s" % args.format) print(" timeout: %s" % args.timeout) # Instantiate a new checker with provided arguments @@ -86,6 +87,8 @@ def main(args, extra): exclude_files=exclude_files, print_all=not args.no_print, serial=args.serial, + save_results_format=args.format, + ) check_results = checker.run( exclude_urls=exclude_urls, diff --git a/urlchecker/core/check.py b/urlchecker/core/check.py index 1d7eb43..ab5102d 100644 --- a/urlchecker/core/check.py +++ b/urlchecker/core/check.py @@ -8,10 +8,12 @@ 
""" import csv +import copy import os import random import re import sys +import json from typing import Optional, Dict, List from urlchecker.core import fileproc @@ -33,6 +35,7 @@ def __init__( print_all: bool = True, include_patterns: Optional[List[str]] = None, serial: bool = False, + save_results_format: str = "csv", ): """ initiate a url checker. At init we take in preferences for @@ -40,12 +43,13 @@ def __init__( parameters to run a url check. Args: - - path (str) : full path to the root folder to check. If not defined, no file_paths are parsed. - - file_types (list) : types of files to scan for links. - - print_all (bool) : control var for whether to print all checked file names or only the ones with urls. - - exclude_files (list) : list of excluded files and patterns for flies. - - include_patterns (list) : list of files and patterns to check. - - serial (bool) : do checks in serial (no multiprocessing) + - path (str) : full path to the root folder to check. If not defined, no file_paths are parsed. + - file_types (list) : types of files to scan for links. + - print_all (bool) : control var for whether to print all checked file names or only the ones with urls. + - exclude_files (list) : list of excluded files and patterns for flies. + - include_patterns (list) : list of files and patterns to check. + - serial (bool) : do checks in serial (no multiprocessing) + - save_results_format (bool) : format to save results (csv or sarif) """ # Initiate results object, and checks lookup (holds UrlCheck) for each file self.results = { @@ -65,6 +69,17 @@ def __init__( self.file_types = file_types or [".py", ".md"] self.file_paths = [] self.serial = serial + + # Mapping save results formats to their respective methods + save_methods = { + "csv": self.save_results_as_csv, + "sarif": self.save_results_as_sarif + } + if save_results_format in save_methods: + self.save_results = save_methods[save_results_format] + else: + sys.exit(f"{save_results_format} is an invalid format to save results.") + # get all file paths if a path is defined if path: @@ -92,7 +107,7 @@ def __str__(self) -> str: def __repr__(self) -> str: return self.__str__() - def save_results( + def save_results_as_csv( self, file_path: str, sep: str = ",", @@ -163,6 +178,56 @@ def save_results( return file_path + def save_results_as_sarif(self, file_path: str) -> str: + sarif_log = { + "version": "2.1.0", + "runs": [ + { + "tool": { + "driver": { + "name": "UrlChecker", + "informationUri": "https://github.com/urlstechie/urlchecker-python", + "rules": [ + { + "id": "RFC3986", + "name": "Invalid/Unreachable URL", + "shortDescription": {"text": "This URL is invalid or unreachable."}, + "fullDescription": {"text": "This URL is invalid or unreachable."}, + "helpUri": "https://www.rfc-editor.org/rfc/rfc3986" + } + ] + } + }, + "results": [] + } + ] + } + + for file_name, result in self.checks.items(): + failed_urls = copy.deepcopy(result["failed"]) + unique_failed_urls = set(failed_urls) + for url in unique_failed_urls: + line_numbers = find_url_lines(file_name, url) + if not line_numbers: + line_numbers = [1] # Default to 1 if not found + + for line_number in line_numbers: + sarif_log["runs"][0]["results"].append({ + "ruleId": "URL001", + "message": {"text": f"URL {url} is invalid or unreachable."}, + "locations": [{ + "physicalLocation": { + "artifactLocation": {"uri": file_name}, + "region": {"startLine": line_number} + } + }] + }) + + with open(file_path, 'w') as file: + json.dump(sarif_log, file, indent=2) + + return file_path + def 
     def run(
         self,
         file_paths: Optional[List[str]] = None,
@@ -273,3 +338,13 @@ def check_task(*args, **kwargs):
         "passed": checker.passed,
         "excluded": checker.excluded,
     }
+
+def find_url_lines(file_name: str, url: str) -> List[int]:
+    """Return the 1-indexed line numbers in file_name that contain url."""
+    line_numbers = []
+    with open(file_name, 'r') as file:
+        # enumerate from 1 so the numbers match SARIF's 1-based startLine
+        for i, line in enumerate(file, 1):
+            if url in line:
+                line_numbers.append(i)
+    return line_numbers
\ No newline at end of file
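
For reviewers, a minimal sketch of how the new option is exercised from Python once this patch is applied. The repository path and file types below are placeholders; `UrlChecker`, `get_file_paths`, `run`, and `save_results` are the names introduced or reused in the diff above, and `save_results` dispatches to `save_results_as_sarif` when the checker is constructed with `save_results_format="sarif"`.

```python
from urlchecker.core.check import UrlChecker
from urlchecker.core.fileproc import get_file_paths

# Gather the files to scan (placeholder path and file types)
file_paths = get_file_paths("/path/to/repo", [".py", ".md"])

# save_results_format="sarif" binds checker.save_results to
# save_results_as_sarif; any value other than "csv" or "sarif" exits
checker = UrlChecker(print_all=True, save_results_format="sarif")
checker.run(file_paths=file_paths, retry_count=2, timeout=5)

# Writes a SARIF 2.1.0 log with one result per failed URL occurrence,
# locating each URL by line via find_url_lines (startLine falls back to 1)
checker.save_results("results.sarif")
```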