SQL Password pattern (#653)

* sql-password * ML docs updated
Samsung · Jan 24, 2025 · add6156 · add6156
1 parent 75df2ab
commit add6156
Show file tree

Hide file tree

Showing 30 changed files with 10,078 additions and 11,805 deletions.
diff --git a/.ci/benchmark.txt b/.ci/benchmark.txt
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -441,8 +441,7 @@ jobs:
           # check whether credsweeper is available as module
           python -m credsweeper --banner
           # use only 2 epochs for the test
-          sed -i 's/max_epochs = .*/max_epochs = 2/' main.py
-          python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
+          python main.py --data ${{ github.workspace }}/CredData --jobs $(( 2 * $(nproc) )) --epochs 2
           # dbg
           git diff
           # crc32 should be changed

diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -40,8 +40,8 @@ jobs:
     - name: Check ml_config.json and ml_model.onnx integrity
       if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
       run: |
-        md5sum --binary credsweeper/ml_model/ml_config.json | grep 092a588d5bebdac5136c4d01c87abf27
-        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep a707745d781517556fd58890cb2812be
+        md5sum --binary credsweeper/ml_model/ml_config.json | grep 3a4bfcd6f3ea74461b158d4ec073cc06
+        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 9725b166e07e60f94929fea986f84ae2
 
     # # # line ending
 

diff --git a/credsweeper/ml_model/ml_config.json b/credsweeper/ml_model/ml_config.json
@@ -1,5 +1,5 @@
 {
-    "char_set": " \t\n0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+    "char_set": "\u001b\t\n\r !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~",
     "thresholds": {
         "lowest": 0.22917,
         "low": 0.35739,
@@ -54,6 +54,22 @@
                 "attribute": "value"
             }
         },
+        {
+            "type": "SearchInAttribute",
+            "comment": "Repeated symbol",
+            "kwargs": {
+                "pattern": ".*(?:(\\S)(\\S))((\\1.)|(.\\2)){7,}",
+                "attribute": "value"
+            }
+        },
+        {
+            "type": "SearchInAttribute",
+            "comment": "SHA marker",
+            "kwargs": {
+                "pattern": ".*(?i:sha)[_-]?(224|256|384|512)",
+                "attribute": "value"
+            }
+        },
         {
             "type": "SearchInAttribute",
             "comment": "VariableNotAllowedPatternCheck",
@@ -239,6 +255,7 @@
                     ".bat",
                     ".bats",
                     ".bazel",
+                    ".build",
                     ".bundle",
                     ".bzl",
                     ".c",
@@ -295,6 +312,7 @@
                     ".jsx",
                     ".ks",
                     ".kt",
+                    ".kts",
                     ".las",
                     ".ldif",
                     ".ldml",
@@ -360,6 +378,7 @@
                     ".sql",
                     ".storyboard",
                     ".strings",
+                    ".sty",
                     ".t",
                     ".td",
                     ".tdf",
@@ -403,6 +422,7 @@
                     "Key",
                     "Nonce",
                     "Password",
+                    "SQL Password",
                     "Salt",
                     "Secret",
                     "Token",

diff --git a/credsweeper/ml_model/ml_model.onnx b/credsweeper/ml_model/ml_model.onnx
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
@@ -144,6 +144,24 @@
   target:
     - doc
 
+- name: SQL Password
+  severity: medium
+  confidence: weak
+  type: pattern
+  values:
+    - (\\[nrt]|\b)(?i:(?P<variable>(CREATE|ALTER|SET\s{1,8}PASSWORD|INSERT(\s{1,8}IGNORE)?|UPDATE\s{1,8}[^\s;]{1,80})\s{1,8}(LOGIN|USER|ROLE|FOR|INTO|SET)\s{1,8}([^\s;]{1,80}\s{1,8}|VALUES\s*\(){1,8}(IDENTIFIED((\s{1,8}WITH\s{1,8}\S{1,80})?\s{1,8}(BY|AS))|(=|WITH)?\s*PASSWORD\b(\s*=)?)))\s*(?P<wrap>[(]\s*)?(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4})?(?P<value>(?(value_leftquote)((?!(?P=value_leftquote))(?(esq)((?!(?P=esq)([`'\"]|&(quot|apos);)).)|((?!(?P=value_leftquote)).)))|(?!&(quot|apos);)(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])){3,80})(?(value_leftquote)(?P<value_rightquote>(?<!\\)(?P=value_leftquote))|(?(wrap)[)]|[\s`'\",;]))
+  filter_type:
+    - ValueAllowlistCheck
+    - ValuePatternCheck(4)
+  min_line_len: 8
+  required_substrings:
+    - password
+    - identified
+  target:
+    - doc
+    - code
+  use_ml: true
+
 - name: API
   severity: medium
   confidence: moderate
@@ -1209,7 +1227,7 @@
   type: pattern
   values:
     - (?:(?<![0-9A-Za-z_/+-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?=[0-9A-Za-z]{64})(?P<value>[0-9A-Za-z]{10,12}[B-Za-z0-9]A{10,12}[B-Za-z0-9][0-9A-Za-z]{40,44})(?![=0-9A-Za-z_/+-])
-  filter_type: []
+  filter_type: [ ]
   min_line_len: 43
   required_substrings:
     - AAAAAAAAAA

diff --git a/docs/images/Model_with_features.png b/docs/images/Model_with_features.png
diff --git a/docs/source/overall_architecture.rst b/docs/source/overall_architecture.rst
@@ -82,13 +82,18 @@ Each Rule_ is dedicated to detect a specific type of credential, imported from `
 
     ...
     - name: API
-    severity: medium
-    type: keyword
-    values:
-    - api
-    filter_type: GeneralKeyword
-    use_ml: true
-    - name: AWS Client ID
+      severity: medium
+      confidence: moderate
+      type: keyword
+      values:
+        - api
+      filter_type: GeneralKeyword
+      use_ml: true
+      min_line_len: 11
+      required_substrings:
+        - api
+      target:
+        - code
     ...
 
 **Rule Attributes** 
@@ -140,6 +145,13 @@ Each Rule_ is dedicated to detect a specific type of credential, imported from `
    - The type of the Filter_ group you want to apply. Filter_ groups implemented are as follows: `GeneralKeyword <credsweeper.filters.group.html#module-credsweeper.filters.group.general_keyword>`_, `GeneralPattern <credsweeper.filters.group.html#module-credsweeper.filters.group.general_pattern>`_, `PasswordKeyword <credsweeper.filters.group.html#module-credsweeper.filters.group.password_keyword>`_, and `UrlCredentials <credsweeper.filters.group.html#module-credsweeper.filters.group.url_credentials_group>`_.
 - use_ml
    - The attribute to set whether to perform ML validation. If true, ML validation will be performed. If false - ml_probability will be set to None in report.
+- min_line_len
+   - Lines whose stripped length is shorter than this value are dropped before the text search, to improve performance.
+- required_substrings
+   - At least one of these substrings must be found in a line before the regex search is run, to improve performance.
+- target
+   - code : The rule will be applied without --doc option
+   - doc  : The rule will be applied with --doc option
 
 Filter
 ------
@@ -168,24 +180,23 @@ And ML can be fully disable by setting ``--ml_threshold 0``
     python -m credsweeper --ml_threshold 0 ...
 
 Our ML model architecture is a combination of Bidirectional LSTM with additional handcrafted features.
-It uses last 50 characters from the potential credential and 91 handcrafted features to decide if it's a real credential or not.
+It uses the first 80 characters of the potential credential value and of the variable (if available), 160 characters of the line around the value, and configurable handcrafted features to decide whether it is a real credential.
 
-Example:
+Example (file leaked_cred.py):
 
-.. code-block:: text
+.. code-block:: python
 
-    leaked_cred.py:
     my_db_password = "NUU423cds"
 
 Steps:
 
 1. Regular expression extracts ```NUU423cds``` as a secret value, ```my_db_password``` as a variable, and ```my_db_password = "NUU423cds"``` as whole line
 2. Handcrafted feature classes instantiated from classes in `features.py <https://github.com/Samsung/CredSweeper/blob/main/credsweeper/ml_model/features.py>`_ using `model_config.json <https://github.com/Samsung/CredSweeper/blob/6a2e575987448dd20895a8e72efb3b09fdcbecc2/credsweeper/ml_model/model_config.json#L10>`_. Instantiation process can be checked at `ml_validator.py#L46 <https://github.com/Samsung/CredSweeper/blob/main/credsweeper/ml_model/ml_validator.py#L46>`_. Features include: ``` ``` character in line: yes/no, ```(``` character in line: yes/no, file extension is ```.c```: yes/no, etc.
 3. Handcrafted features from step 2 used on line, value, variable, and filename to get feature vector of length 91
-4. ```NUU423cds``` lowercased and right padded with special padding characters to the length 50. Last 50 characters selected if longer. Only 70 symbols used: 68 ASCII characters + 1 padding character + 1 special character for all other symbols: `ml_validator.py#L29 <https://github.com/Samsung/CredSweeper/blob/6a2e575987448dd20895a8e72efb3b09fdcbecc2/credsweeper/ml_model/ml_validator.py#L29>`_. Padded line than `one-hot encoded <https://en.wikipedia.org/wiki/One-hot>`_. Link to corresponding code: `ml_validator.py#L63 <https://github.com/Samsung/CredSweeper/blob/6a2e575987448dd20895a8e72efb3b09fdcbecc2/credsweeper/ml_model/ml_validator.py#L63>`_
-5. Padded line from step 4 inputted to Bidirectional LSTM. LSTM produce single vector of length 60 as output
-6. LSTM output and handcrafted features concatenated into a single vector of length 151
-7. Vector from step 6 feed into the two last Dense layers
+4. ```NUU423cds``` is encoded with a configurable character set plus 1 padding character and 1 special character for all other symbols. The padded line is then `one-hot encoded <https://en.wikipedia.org/wiki/One-hot>`_. Link to corresponding code: `MlValidator.encode <https://github.com/Samsung/CredSweeper/blob/75df2ab8fc660df19523e939c538cdb0bbd7ce52/credsweeper/ml_model/ml_validator.py#L102>`_
+5. The padded line from step 4 is fed into the Bidirectional LSTM for the value. The same encoding is performed for the variable and the line. The LSTM layers produce 3 separate vectors of lengths 80, 80, and 160 as outputs
+6. The LSTM outputs and handcrafted features are concatenated into a single vector
+7. The vector from step 6 is fed into a stack of two sequential Dense layers, each with the number of output units equal to the number of input units.
 8. Last layer outputs float value in range 0-1 with estimated probability of line being a real credential
 9. Predicted probability compared to the threshold (see `--ml_threshold` CLI option) and credential reported if predicted probability is greater
 

diff --git a/experiment/main.py b/experiment/main.py
@@ -53,7 +53,13 @@ def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray
               f"F1:{f1:0.6f}")
 
 
-def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
+def main(cred_data_location: str,
+         jobs: int,
+         epochs: int,
+         batch_size: int,
+         patience: int,
+         doc_target: bool,
+         use_tuner: bool = False) -> str:
     print(f"Memory at start: {LogCallback.get_memory_info()}")
 
     current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -62,7 +68,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
     os.makedirs(dir_path, exist_ok=True)
 
     print(f"Train model on data from {cred_data_location}")
-    prepare_train_data(_cred_data_location, jobs)
+    prepare_train_data(cred_data_location, jobs, doc_target)
 
     # detected data means which data is passed to ML validator of credsweeper after filters with RuleName
     cred_data_location_path = pathlib.Path(cred_data_location) / "data"
@@ -82,7 +88,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
     for i in range(3):
         # there are 2 times possible fails due ml config was updated
         try:
-            thresholds = model_config_preprocess(df_all)
+            thresholds = model_config_preprocess(df_all, doc_target)
             break
         except RuntimeError as exc:
             if "RESTART:" in str(exc):
@@ -136,12 +142,6 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
 
     print(f"Memory before search / compile: {LogCallback.get_memory_info()}")
 
-    max_epochs = 100
-    # ^^^ the line is patched in GitHub action to speed-up test train
-    batch_size = 256
-    patience = 5
-    #return
-
     log_callback = LogCallback()
     if use_tuner:
         tuner = kt.GridSearch(
@@ -158,7 +158,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
         tuner.search(
             x=[x_train_line, x_train_variable, x_train_value, x_train_features],
             y=y_train,
-            epochs=max_epochs,
+            epochs=epochs,
             batch_size=batch_size,
             callbacks=[search_early_stopping, log_callback],
             validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test),
@@ -189,7 +189,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
     fit_history = keras_model.fit(x=[x_train_line, x_train_variable, x_train_value, x_train_features],
                                   y=y_train,
                                   batch_size=batch_size,
-                                  epochs=max_epochs,
+                                  epochs=epochs,
                                   verbose=2,
                                   validation_data=([x_test_line, x_test_variable, x_test_value,
                                                     x_test_features], y_test),
@@ -259,7 +259,8 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
 
 if __name__ == "__main__":
     parser = ArgumentParser()
-    parser.add_argument("--data",
+    parser.add_argument("-d",
+                        "--data",
                         nargs="?",
                         help="CredData location",
                         dest="cred_data_location",
@@ -271,25 +272,46 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
                         default=4,
                         dest="jobs",
                         metavar="POSITIVE_INT")
-    parser.add_argument("-t", "--tuner", help="use keras tuner", dest="use_tuner", action="store_true")
+    parser.add_argument("-e",
+                        "--epochs",
+                        help="maximal epochs to train (default: 100)",
+                        default=100,
+                        dest="epochs",
+                        metavar="POSITIVE_INT")
+    parser.add_argument("-b",
+                        "--batch_size",
+                        help="batch size (default: 256)",
+                        default=256,
+                        dest="batch_size",
+                        metavar="POSITIVE_INT")
+    parser.add_argument("-p",
+                        "--patience",
+                        help="early stopping patience (default: 5)",
+                        default=5,
+                        dest="patience",
+                        metavar="POSITIVE_INT")
+    parser.add_argument("--doc", help="use doc target", dest="doc_target", action="store_true")
+    parser.add_argument("--tuner", help="use keras tuner", dest="use_tuner", action="store_true")
     args = parser.parse_args()
 
-    fixed_seed = 20241126  # int(datetime.now().timestamp())
-    # print(f"Random seed:{fixed_seed}")
-    if fixed_seed is not None:
-        tf.random.set_seed(fixed_seed)
-        np.random.seed(fixed_seed)
-        random.seed(fixed_seed)
-
-    _cred_data_location = args.cred_data_location
-    _jobs = int(args.jobs)
+    fixed_seed = 20250117
+    print(f"Fixed seed:{fixed_seed}")
+    tf.random.set_seed(fixed_seed)
+    np.random.seed(fixed_seed)
+    random.seed(fixed_seed)
 
     # to keep the hash in log and verify
     command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_config.json"
     subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
     command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx"
     subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
 
-    _model_file_name = main(_cred_data_location, _jobs, args.use_tuner)
+    _model_file_name = main(cred_data_location=args.cred_data_location,
+                            jobs=int(args.jobs),
+                            epochs=int(args.epochs),
+                            batch_size=int(args.batch_size),
+                            patience=int(args.patience),
+                            doc_target=bool(args.doc_target),
+                            use_tuner=bool(args.use_tuner))
     # print in last line the name
     print(f"\nYou can find your model in:\n{_model_file_name}")
diff --git a/experiment/main.sh b/experiment/main.sh
@@ -2,25 +2,57 @@
 
 set -ex
 
-CREDSWEEPER_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." > /dev/null 2>&1 && pwd )"
-export PYTHONPATH=${CREDSWEEPER_DIR}:$PYTHONPATH
-echo $PYTHONPATH
-${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --banner
+START_TIME=$(date +%s)
+NOW=$(date +%Y%m%d_%H%M%S)
+echo ">>> START ${BASH_SOURCE[0]} in $(pwd) at ${NOW}"
 
-now=$(date +%Y%m%d_%H%M%S)
+# set the path environment variables without a trailing /
 
-RESULT_DIR=${CREDSWEEPER_DIR}/experiment/results
-mkdir -vp ${RESULT_DIR}
+echo "CREDSWEEPER_DIR='${CREDSWEEPER_DIR}'"
+if [ -z "${CREDSWEEPER_DIR}" ] || [ ! -d "${CREDSWEEPER_DIR}" ]; then
+    echo "CREDSWEEPER_DIR environment is empty or does not exist"
+    exit 1
+fi
+
+echo "CREDDATA_DIR='${CREDDATA_DIR}'"
+if [ -z "${CREDDATA_DIR}" ] || [ ! -d "${CREDDATA_DIR}" ]; then
+    echo "CREDDATA_DIR environment is empty or does not exist"
+    exit 1
+fi
+
+echo "JOBS='${JOBS}'"  # report the caller-supplied value (may be empty), not the CPU count
+if [ -z "${JOBS}" ]; then
+    JOBS=$(nproc)  # default: one job per available processing unit
+    echo "Used JOBS=${JOBS} for multiple process"
+elif ! [ 0 -lt "${JOBS}" ]; then  # also rejects non-integer JOBS, where [ itself fails with status 2
+    echo "Unappropriated JOBS=${JOBS}"
+    exit 1
+fi
+
+export PYTHONPATH="${CREDSWEEPER_DIR}":$PYTHONPATH
+
+# check whether current version
+"${CREDSWEEPER_DIR}"/.venv/bin/python -m credsweeper --banner
+
+WORK_DIR="${CREDSWEEPER_DIR}/experiment"
+cd "${WORK_DIR}"
+RESULT_DIR="${WORK_DIR}/results"
+mkdir -vp "${RESULT_DIR}"
 
 # set env TUNER to use keras-tuner
 #TUNER=--tuner
-${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/q/DataCred/main --jobs $(nproc) ${TUNER} | tee ${RESULT_DIR}/${now}.train.log
+# set env DOC to apply doc dataset
+#DOC=--doc
+"${CREDSWEEPER_DIR}"/.venv/bin/python main.py --data "${CREDDATA_DIR}" --jobs ${JOBS} ${TUNER} ${DOC} | tee "${RESULT_DIR}/${NOW}.train.log"
 error_code=${PIPESTATUS}
 if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi
 
-cd ${CREDSWEEPER_DIR}
-report_file=${RESULT_DIR}/${now}.json
-${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/main/data --log info --job $(nproc) --subtext --save-json ${report_file}
+cd "${CREDSWEEPER_DIR}"
+report_file=${RESULT_DIR}/${NOW}.json
+${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS}  --subtext --save-json ${report_file}
 
-cd ~/q/DataCred/main
+cd "${CREDDATA_DIR}"
 .venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${CREDSWEEPER_DIR}/.ci/benchmark.txt
+
+SPENT_TIME=$(date -ud "@$(( $(date +%s) - ${START_TIME} ))" +"%H:%M:%S")
+echo "<<< DONE ${BASH_SOURCE[0]} in $(pwd) at $(date) elapsed ${SPENT_TIME}"