Skip to content

Commit

Permalink
SQL Password pattern (#653)
Browse files Browse the repository at this point in the history
* sql-password

* ML docs updated
  • Loading branch information
babenek authored Jan 24, 2025
1 parent 75df2ab commit add6156
Show file tree
Hide file tree
Showing 30 changed files with 10,078 additions and 11,805 deletions.
65 changes: 33 additions & 32 deletions .ci/benchmark.txt

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -441,8 +441,7 @@ jobs:
# check whether credsweeper is available as module
python -m credsweeper --banner
# use only 2 epochs for the test
sed -i 's/max_epochs = .*/max_epochs = 2/' main.py
python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
python main.py --data ${{ github.workspace }}/CredData --jobs $(( 2 * $(nproc) )) --epochs 2
# dbg
git diff
# crc32 should be changed
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ jobs:
- name: Check ml_config.json and ml_model.onnx integrity
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
run: |
md5sum --binary credsweeper/ml_model/ml_config.json | grep 092a588d5bebdac5136c4d01c87abf27
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep a707745d781517556fd58890cb2812be
md5sum --binary credsweeper/ml_model/ml_config.json | grep 3a4bfcd6f3ea74461b158d4ec073cc06
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 9725b166e07e60f94929fea986f84ae2
# # # line ending

Expand Down
22 changes: 21 additions & 1 deletion credsweeper/ml_model/ml_config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"char_set": " \t\n0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
"char_set": "\u001b\t\n\r !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~",
"thresholds": {
"lowest": 0.22917,
"low": 0.35739,
Expand Down Expand Up @@ -54,6 +54,22 @@
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "Repeated symbol",
"kwargs": {
"pattern": ".*(?:(\\S)(\\S))((\\1.)|(.\\2)){7,}",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "SHA marker",
"kwargs": {
"pattern": ".*(?i:sha)[_-]?(224|256|384|512)",
"attribute": "value"
}
},
{
"type": "SearchInAttribute",
"comment": "VariableNotAllowedPatternCheck",
Expand Down Expand Up @@ -239,6 +255,7 @@
".bat",
".bats",
".bazel",
".build",
".bundle",
".bzl",
".c",
Expand Down Expand Up @@ -295,6 +312,7 @@
".jsx",
".ks",
".kt",
".kts",
".las",
".ldif",
".ldml",
Expand Down Expand Up @@ -360,6 +378,7 @@
".sql",
".storyboard",
".strings",
".sty",
".t",
".td",
".tdf",
Expand Down Expand Up @@ -403,6 +422,7 @@
"Key",
"Nonce",
"Password",
"SQL Password",
"Salt",
"Secret",
"Token",
Expand Down
Binary file modified credsweeper/ml_model/ml_model.onnx
Binary file not shown.
20 changes: 19 additions & 1 deletion credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,24 @@
target:
- doc

- name: SQL Password
severity: medium
confidence: weak
type: pattern
values:
- (\\[nrt]|\b)(?i:(?P<variable>(CREATE|ALTER|SET\s{1,8}PASSWORD|INSERT(\s{1,8}IGNORE)?|UPDATE\s{1,8}[^\s;]{1,80})\s{1,8}(LOGIN|USER|ROLE|FOR|INTO|SET)\s{1,8}([^\s;]{1,80}\s{1,8}|VALUES\s*\(){1,8}(IDENTIFIED((\s{1,8}WITH\s{1,8}\S{1,80})?\s{1,8}(BY|AS))|(=|WITH)?\s*PASSWORD\b(\s*=)?)))\s*(?P<wrap>[(]\s*)?(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4})?(?P<value>(?(value_leftquote)((?!(?P=value_leftquote))(?(esq)((?!(?P=esq)([`'\"]|&(quot|apos);)).)|((?!(?P=value_leftquote)).)))|(?!&(quot|apos);)(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])){3,80})(?(value_leftquote)(?P<value_rightquote>(?<!\\)(?P=value_leftquote))|(?(wrap)[)]|[\s`'\",;]))
filter_type:
- ValueAllowlistCheck
- ValuePatternCheck(4)
min_line_len: 8
required_substrings:
- password
- identified
target:
- doc
- code
use_ml: true

- name: API
severity: medium
confidence: moderate
Expand Down Expand Up @@ -1209,7 +1227,7 @@
type: pattern
values:
- (?:(?<![0-9A-Za-z_/+-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?=[0-9A-Za-z]{64})(?P<value>[0-9A-Za-z]{10,12}[B-Za-z0-9]A{10,12}[B-Za-z0-9][0-9A-Za-z]{40,44})(?![=0-9A-Za-z_/+-])
filter_type: []
filter_type: [ ]
min_line_len: 43
required_substrings:
- AAAAAAAAAA
Expand Down
Binary file modified docs/images/Model_with_features.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
41 changes: 26 additions & 15 deletions docs/source/overall_architecture.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,18 @@ Each Rule_ is dedicated to detect a specific type of credential, imported from `
...
- name: API
severity: medium
type: keyword
values:
- api
filter_type: GeneralKeyword
use_ml: true
- name: AWS Client ID
severity: medium
confidence: moderate
type: keyword
values:
- api
filter_type: GeneralKeyword
use_ml: true
min_line_len: 11
required_substrings:
- api
target:
- code
...
**Rule Attributes**
Expand Down Expand Up @@ -140,6 +145,13 @@ Each Rule_ is dedicated to detect a specific type of credential, imported from `
- The type of the Filter_ group you want to apply. Filter_ groups implemented are as follows: `GeneralKeyword <credsweeper.filters.group.html#module-credsweeper.filters.group.general_keyword>`_, `GeneralPattern <credsweeper.filters.group.html#module-credsweeper.filters.group.general_pattern>`_, `PasswordKeyword <credsweeper.filters.group.html#module-credsweeper.filters.group.password_keyword>`_, and `UrlCredentials <credsweeper.filters.group.html#module-credsweeper.filters.group.url_credentials_group>`_.
- use_ml
- The attribute to set whether to perform ML validation. If true, ML validation will be performed. If false - ml_probability will be set to None in report.
- min_line_len
- Minimum length of a stripped line; shorter lines are dropped before the text search to improve performance
- required_substrings
- At least one of these substrings has to be found in a line before the regex search is run, to improve performance
- target
- code : The rule will be applied without --doc option
- doc : The rule will be applied with --doc option

Filter
------
Expand Down Expand Up @@ -168,24 +180,23 @@ And ML can be fully disable by setting ``--ml_threshold 0``
python -m credsweeper --ml_threshold 0 ...
Our ML model architecture is a combination of Bidirectional LSTM with additional handcrafted features.
It uses last 50 characters from the potential credential and 91 handcrafted features to decide if it's a real credential or not.
It uses the first 80 characters of the potential credential value and of the variable (if available), 160 characters of the line around the value, and configurable handcrafted features to decide whether it is a real credential or not.

Example:
Example (file leaked_cred.py):

.. code-block:: text
.. code-block:: python
leaked_cred.py:
my_db_password = "NUU423cds"
Steps:

1. Regular expression extracts ```NUU423cds``` as a secret value, ```my_db_password``` as a variable, and ```my_db_password = "NUU423cds"``` as whole line
2. Handcrafted feature classes instantiated from classes in `features.py <https://github.com/Samsung/CredSweeper/blob/main/credsweeper/ml_model/features.py>`_ using `model_config.json <https://github.com/Samsung/CredSweeper/blob/6a2e575987448dd20895a8e72efb3b09fdcbecc2/credsweeper/ml_model/model_config.json#L10>`_. Instantiation process can be checked at `ml_validator.py#L46 <https://github.com/Samsung/CredSweeper/blob/main/credsweeper/ml_model/ml_validator.py#L46>`_. Features include: ``` ``` character in line: yes/no, ```(``` character in line: yes/no, file extension is ```.c```: yes/no, etc.
3. Handcrafted features from step 2 used on line, value, variable, and filename to get feature vector of length 91
4. ```NUU423cds``` lowercased and right padded with special padding characters to the length 50. Last 50 characters selected if longer. Only 70 symbols used: 68 ASCII characters + 1 padding character + 1 special character for all other symbols: `ml_validator.py#L29 <https://github.com/Samsung/CredSweeper/blob/6a2e575987448dd20895a8e72efb3b09fdcbecc2/credsweeper/ml_model/ml_validator.py#L29>`_. Padded line than `one-hot encoded <https://en.wikipedia.org/wiki/One-hot>`_. Link to corresponding code: `ml_validator.py#L63 <https://github.com/Samsung/CredSweeper/blob/6a2e575987448dd20895a8e72efb3b09fdcbecc2/credsweeper/ml_model/ml_validator.py#L63>`_
5. Padded line from step 4 inputted to Bidirectional LSTM. LSTM produce single vector of length 60 as output
6. LSTM output and handcrafted features concatenated into a single vector of length 151
7. Vector from step 6 feed into the two last Dense layers
4. ```NUU423cds``` is encoded with the configurable character set, plus 1 padding character and 1 special character for all other symbols. The padded line is then `one-hot encoded <https://en.wikipedia.org/wiki/One-hot>`_. Link to corresponding code: `MlValidator.encode <https://github.com/Samsung/CredSweeper/blob/75df2ab8fc660df19523e939c538cdb0bbd7ce52/credsweeper/ml_model/ml_validator.py#L102>`_
5. The padded line from step 4 is fed into the Bidirectional LSTM for the value. The same encoding is performed for the variable and the line. The LSTMs produce 3 vectors of lengths 80, 80, and 160 as outputs
6. The LSTM outputs and the handcrafted features are concatenated into a single vector
7. The vector from step 6 is fed into a stack of two sequential Dense layers, each with the number of output units equal to the number of input units.
8. The last layer outputs a float value in the range 0-1: the estimated probability of the line being a real credential
9. The predicted probability is compared to the threshold (see `--ml_threshold` CLI option), and the credential is reported if the predicted probability is greater than the threshold

Expand Down
68 changes: 45 additions & 23 deletions experiment/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,13 @@ def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray
f"F1:{f1:0.6f}")


def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
def main(cred_data_location: str,
jobs: int,
epochs: int,
batch_size: int,
patience: int,
doc_target: bool,
use_tuner: bool = False) -> str:
print(f"Memory at start: {LogCallback.get_memory_info()}")

current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
Expand All @@ -62,7 +68,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
os.makedirs(dir_path, exist_ok=True)

print(f"Train model on data from {cred_data_location}")
prepare_train_data(_cred_data_location, jobs)
prepare_train_data(cred_data_location, jobs, doc_target)

# detected data means which data is passed to ML validator of credsweeper after filters with RuleName
cred_data_location_path = pathlib.Path(cred_data_location) / "data"
Expand All @@ -82,7 +88,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
for i in range(3):
# there are 2 times possible fails due ml config was updated
try:
thresholds = model_config_preprocess(df_all)
thresholds = model_config_preprocess(df_all, doc_target)
break
except RuntimeError as exc:
if "RESTART:" in str(exc):
Expand Down Expand Up @@ -136,12 +142,6 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:

print(f"Memory before search / compile: {LogCallback.get_memory_info()}")

max_epochs = 100
# ^^^ the line is patched in GitHub action to speed-up test train
batch_size = 256
patience = 5
#return

log_callback = LogCallback()
if use_tuner:
tuner = kt.GridSearch(
Expand All @@ -158,7 +158,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
tuner.search(
x=[x_train_line, x_train_variable, x_train_value, x_train_features],
y=y_train,
epochs=max_epochs,
epochs=epochs,
batch_size=batch_size,
callbacks=[search_early_stopping, log_callback],
validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test),
Expand Down Expand Up @@ -189,7 +189,7 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
fit_history = keras_model.fit(x=[x_train_line, x_train_variable, x_train_value, x_train_features],
y=y_train,
batch_size=batch_size,
epochs=max_epochs,
epochs=epochs,
verbose=2,
validation_data=([x_test_line, x_test_variable, x_test_value,
x_test_features], y_test),
Expand Down Expand Up @@ -259,7 +259,8 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:

if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--data",
parser.add_argument("-d",
"--data",
nargs="?",
help="CredData location",
dest="cred_data_location",
Expand All @@ -271,25 +272,46 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
default=4,
dest="jobs",
metavar="POSITIVE_INT")
parser.add_argument("-t", "--tuner", help="use keras tuner", dest="use_tuner", action="store_true")
parser.add_argument("-e",
"--epochs",
help="maximal epochs to train (default: 100)",
default=100,
dest="epochs",
metavar="POSITIVE_INT")
parser.add_argument("-b",
"--batch_size",
help="batch size (default: 256)",
default=256,
dest="batch_size",
metavar="POSITIVE_INT")
parser.add_argument("-p",
"--patience",
help="early stopping patience (default: 5)",
default=5,
dest="patience",
metavar="POSITIVE_INT")
parser.add_argument("--doc", help="use doc target", dest="doc_target", action="store_true")
parser.add_argument("--tuner", help="use keras tuner", dest="use_tuner", action="store_true")
args = parser.parse_args()

fixed_seed = 20241126 # int(datetime.now().timestamp())
# print(f"Random seed:{fixed_seed}")
if fixed_seed is not None:
tf.random.set_seed(fixed_seed)
np.random.seed(fixed_seed)
random.seed(fixed_seed)

_cred_data_location = args.cred_data_location
_jobs = int(args.jobs)
fixed_seed = 20250117
print(f"Fixed seed:{fixed_seed}")
tf.random.set_seed(fixed_seed)
np.random.seed(fixed_seed)
random.seed(fixed_seed)

# to keep the hash in log and verify
command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_config.json"
subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx"
subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)

_model_file_name = main(_cred_data_location, _jobs, args.use_tuner)
_model_file_name = main(cred_data_location=args.cred_data_location,
jobs=int(args.jobs),
epochs=int(args.epochs),
batch_size=int(args.batch_size),
patience=int(args.patience),
doc_target=bool(args.doc_target),
use_tuner=bool(args.use_tuner))
# print in last line the name
print(f"\nYou can find your model in:\n{_model_file_name}")
56 changes: 44 additions & 12 deletions experiment/main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,57 @@

set -ex

CREDSWEEPER_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." > /dev/null 2>&1 && pwd )"
export PYTHONPATH=${CREDSWEEPER_DIR}:$PYTHONPATH
echo $PYTHONPATH
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --banner
START_TIME=$(date +%s)
NOW=$(date +%Y%m%d_%H%M%S)
echo ">>> START ${BASH_SOURCE[0]} in $(pwd) at ${NOW}"

now=$(date +%Y%m%d_%H%M%S)
# use the path environments without / at end

RESULT_DIR=${CREDSWEEPER_DIR}/experiment/results
mkdir -vp ${RESULT_DIR}
echo "CREDSWEEPER_DIR='${CREDSWEEPER_DIR}'"
if [ -z "${CREDSWEEPER_DIR}" ] || [ ! -d "${CREDSWEEPER_DIR}" ]; then
echo "CREDSWEEPER_DIR environment is empty or does not exist"
exit 1
fi

echo "CREDDATA_DIR='${CREDDATA_DIR}'"
if [ -z "${CREDDATA_DIR}" ] || [ ! -d "${CREDDATA_DIR}" ]; then
echo "CREDDATA_DIR environment is empty or does not exist"
exit 1
fi

echo "JOBS=$(nproc)"
if [ -z "${JOBS}" ]; then
JOBS=$(nproc)
echo "Used JOBS=${JOBS} for multiple process"
elif [ ! 0 -lt ${JOBS} ]; then
echo "Unappropriated JOBS=${JOBS}"
exit 1
fi

export PYTHONPATH="${CREDSWEEPER_DIR}":$PYTHONPATH

# check whether current version
"${CREDSWEEPER_DIR}"/.venv/bin/python -m credsweeper --banner

WORK_DIR="${CREDSWEEPER_DIR}/experiment"
cd "${WORK_DIR}"
RESULT_DIR="${WORK_DIR}/results"
mkdir -vp "${RESULT_DIR}"

# set env TUNER to use keras-tuner
#TUNER=--tuner
${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/q/DataCred/main --jobs $(nproc) ${TUNER} | tee ${RESULT_DIR}/${now}.train.log
# set env DOC to apply doc dataset
#DOC=--doc
"${CREDSWEEPER_DIR}"/.venv/bin/python main.py --data "${CREDDATA_DIR}" --jobs ${JOBS} ${TUNER} ${DOC} | tee "${RESULT_DIR}/${NOW}.train.log"
error_code=${PIPESTATUS}
if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

cd ${CREDSWEEPER_DIR}
report_file=${RESULT_DIR}/${now}.json
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/main/data --log info --job $(nproc) --subtext --save-json ${report_file}
cd "${CREDSWEEPER_DIR}"
report_file=${RESULT_DIR}/${NOW}.json
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper ${DOC} --sort --path "${CREDDATA_DIR}/data" --log info --jobs ${JOBS} --subtext --save-json ${report_file}

cd ~/q/DataCred/main
cd "${CREDDATA_DIR}"
.venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${CREDSWEEPER_DIR}/.ci/benchmark.txt

SPENT_TIME=$(date -ud "@$(( $(date +%s) - ${START_TIME} ))" +"%H:%M:%S")
echo "<<< DONE ${BASH_SOURCE[0]} in $(pwd) at $(date) elapsed ${SPENT_TIME}"
Loading

0 comments on commit add6156

Please sign in to comment.