ML model updated and re-trained (#502)

* Update training model experiment * worflow to test ml learning * missed file added * 300 is a limit * ML model updated * updated ml * [skip actions] [mltrain] 2024-02-13T18:20:18+02:00 * style * update benchmark scores * packages versions were updated * small rollback * 2 epochs for test only * packages versions were updated * ml train workflow test * Rollback BM workflow and req...txt * ml integrity check * fix suggestions --------- Co-authored-by: yuliia.t <[email protected]>
Samsung · Feb 14, 2024 · b51a9db · b51a9db
1 parent ec69baa
commit b51a9db
Show file tree

Hide file tree

Showing 28 changed files with 3,859 additions and 1,880 deletions.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -324,4 +324,95 @@ jobs:
 
         exit ${exit_code}
 
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+  experiment:
+    # The ML train test is kept in this workflow so it can use the cached
+    # data set; it waits for the download_data job.
+    needs:
+      - download_data
+
+    runs-on: ubuntu-latest
+
+    steps:
+
+      # Fetch the Samsung/CredData repository into the workspace root.
+      - name: Checkout CredData
+        uses: actions/checkout@v3
+        with:
+          repository: Samsung/CredData
+
+      # Restore the data directory; the key is derived from snapshot.yaml.
+      - name: Cache data
+        id: cache-data
+        uses: actions/cache@v3
+        with:
+          path: data
+          key: cred-data-${{ hashFiles('snapshot.yaml') }}
+
+      # Stop the job when the cache step did not report a hit.
+      - name: Failure in case when cache missed
+        if: ${{ steps.cache-data.outputs.cache-hit != 'true' }}
+        run: exit 1
+
+      - name: Exclude some sets for speed-up
+        run: |
+          # remove every data/meta entry whose name starts with 2, 8 or b
+          for set_dir in data meta; do
+            rm -rf ${set_dir}/2* ${set_dir}/8* ${set_dir}/b*
+          done
+          # move what is left under <workspace>/CredData
+          mkdir -vp ${{ github.workspace }}/CredData
+          for set_dir in data meta; do
+            mv ${set_dir} ${{ github.workspace }}/CredData/
+          done
+
+      # Installed unconditionally: later steps call `python` directly.
+      # A `cache-hit != 'true'` guard must not be used here, because the job
+      # has already failed by this point when the cache was missed, so such a
+      # guard would skip this step on every run.
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.8"
+
+      - name: Update PIP
+        run: python -m pip install -U pip
+
+      # Check out the pull request head commit next to the data set.
+      - name: Checkout current CredSweeper
+        uses: actions/checkout@v3
+        with:
+          path: CredSweeper.head
+          ref: ${{ github.event.pull_request.head.sha }}
+
+      - name: Install development packages
+        run: python -m pip install -r CredSweeper.head/requirements.txt
+
+      # Installed after the development packages:
+      # some versions will be changed for compatibility.
+      - name: Install experimental packages
+        run: python -m pip install -r CredSweeper.head/experiment/requirements.txt
+
+      # Debug output: print the workspace path and its content.
+      - name: dbg
+        run: |
+          echo ${{ github.workspace }}
+          ls -al ${{ github.workspace }}
+          tree ${{ github.workspace }}
+
+      - name: Lighten split.json
+        run: |
+          # Keep the original file as split.json.bak and write a reduced
+          # split.json without the lines that contain a quoted id starting
+          # with 2, 8 or b (the same prefixes that are removed from data/meta).
+          SPLIT_JSON="${{ github.workspace }}/CredSweeper.head/experiment/src/split.json"
+          mv -vf "${SPLIT_JSON}" "${SPLIT_JSON}.bak"
+          cat "${SPLIT_JSON}.bak"
+          grep -v '"[28b][0-9a-f]\+' "${SPLIT_JSON}.bak" >"${SPLIT_JSON}"
+          cat "${SPLIT_JSON}"
+
+      - name: Run the experiment
+        run: |
+          cd CredSweeper.head
+          ls -al #dbg
+          pwd #dbg
+          # make the checked-out sources importable; the previous PYTHONPATH is
+          # appended only when it is set, so no empty path entry is produced
+          export PYTHONPATH="$(pwd)${PYTHONPATH:+:$PYTHONPATH}"
+          cd experiment
+          # check whether credsweeper is available as module
+          python -m credsweeper --banner
+          # use only 2 epochs for the test
+          sed -i 's/epochs=42,/epochs=2,/' main.py
+          python main.py --data "${{ github.workspace }}/CredData" -j $(( 2 * $(nproc) ))
+          ls -al results #dbg
+          # resolve the model directory in a separate assignment so that a
+          # failing find stops the step instead of being silently ignored
+          SAVED_MODEL_DIR="$(find results -mindepth 1 -maxdepth 1 -type d)"
+          python -m tf2onnx.convert --saved-model "${SAVED_MODEL_DIR}" --output ../credsweeper/ml_model/ml_model.onnx --verbose
+          # dbg
+          git diff
+          # crc32 should be changed
+          python -m credsweeper --banner
+          # run quick scan
+          python -m credsweeper --log debug --path ../tests/samples --save-json
+          NEW_MODEL_FOUND_SAMPLES="$(jq '.|length' output.json)"
+          # the new model must report at least 100 items in output.json
+          if [ "${NEW_MODEL_FOUND_SAMPLES}" -lt 100 ]; then
+            echo "Failure: found ${NEW_MODEL_FOUND_SAMPLES} credentials"
+            exit 1
+          fi
+
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -58,7 +58,7 @@ jobs:
     - name: Check ml_model.onnx integrity
       if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
       run: |
-        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 8cb870a200d7bc07893aacec38f54033
+        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 4774fdce802e940023316c32f14a68df
 
     # # # Python setup
 

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
@@ -232,16 +232,16 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .zsh                        7          1109                        13
 .zsh-theme                  1           121                         1
 TOTAL:                  10188      19071512         5188        53756        5484
-Detected Credentials: 6082
-credsweeper result_cnt : 5232, lost_cnt : 0, true_cnt : 4388, false_cnt : 844
+Detected Credentials: 5792
+credsweeper result_cnt : 5038, lost_cnt : 0, true_cnt : 4606, false_cnt : 432
 Category                      TP    FP        TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 --------------------------  ----  ----  --------  ----  --------  --------  --------  --------  --------  --------
-Authentication Credentials    76    78      2607    15  0.029050  0.164835  0.966499  0.493506  0.835165  0.620408
-Cryptographic Primitives      47    14       158     7  0.081395  0.129630  0.907080  0.770492  0.870370  0.817391
-Generic Secret              1061    71     30139   143  0.002350  0.118771  0.993188  0.937279  0.881229  0.908390
-Generic Token                295    33      4242    41  0.007719  0.122024  0.983951  0.899390  0.877976  0.888554
-Other                        496   329      3417   188  0.087827  0.274854  0.883296  0.601212  0.725146  0.657389
-Password                    1049   248     11123   381  0.021810  0.266434  0.950863  0.808790  0.733566  0.769344
-Predefined Pattern           353    71      5233    25  0.013386  0.066138  0.983105  0.832547  0.933862  0.880299
+Authentication Credentials    74    27      2658    17  0.010056  0.186813  0.984150  0.732673  0.813187  0.770833
+Cryptographic Primitives      43     3       169    11  0.017442  0.203704  0.938053  0.934783  0.796296  0.860000
+Generic Secret              1112    25     30185    92  0.000828  0.076412  0.996276  0.978012  0.923588  0.950021
+Generic Token                304     7      4268    32  0.001637  0.095238  0.991542  0.977492  0.904762  0.939722
+Other                        503   330      3416   181  0.088094  0.264620  0.884650  0.603842  0.735380  0.663151
+Password                    1200    34     11337   230  0.002990  0.160839  0.979377  0.972447  0.839161  0.900901
+Predefined Pattern           359     6      5298    19  0.001131  0.050265  0.995600  0.983562  0.949735  0.966353
 Private Key                 1011     0      1477     0                      1.000000  1.000000  1.000000  1.000000
-                            4388   844  19065480   800  0.000044  0.154202  0.999914  0.838685  0.845798  0.842226
+                            4606   432  19065892   582  0.000023  0.112182  0.999947  0.914252  0.887818  0.900841
diff --git a/credsweeper/ml_model/features.py b/credsweeper/ml_model/features.py
@@ -49,6 +49,26 @@ def any_word_in_(self, lower_case_line: str) -> bool:
         return False
 
 
+class WordInVariable(Feature):
+    """Feature returns true if candidate value contains at least one word from predefined list."""
+
+    def __init__(self, words: List[str]) -> None:
+        """Feature is true if candidate value contains at least one predefined word.
+
+        Args:
+            words: list of predefined words - MUST BE IN LOWER CASE
+
+        """
+        super().__init__()
+        self.words = words
+
+    def extract(self, candidate: Candidate) -> bool:
+        """Returns true if any words in first line"""
+        if candidate.line_data_list[0].variable:
+            return self.any_word_in_(candidate.line_data_list[0].variable.lower())
+        return False
+
+
 class WordInSecret(Feature):
     """Feature returns true if candidate value contains at least one word from predefined list."""
 

diff --git a/credsweeper/ml_model/ml_model.onnx b/credsweeper/ml_model/ml_model.onnx
diff --git a/credsweeper/ml_model/ml_validator.py b/credsweeper/ml_model/ml_validator.py
@@ -44,7 +44,7 @@ def __init__(self, threshold: Union[float, ThresholdPreset], azure: bool = False
             self.threshold = model_details["thresholds"][threshold.value]
         else:
             self.threshold = 0.5
-        self.maxlen = int(model_details.get("max_len", 50))
+        self.maxlen = int(model_details.get("max_len", 160))
         self.common_feature_list = []
         self.unique_feature_list = []
         logger.info("Init ML validator, model file path: %s", model_file_path)