Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Increase max line length to 8000 #474

Merged
merged 15 commits into from
Dec 13, 2023
6 changes: 4 additions & 2 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v3
with:
repository: Samsung/CredData
ref: longline
repository: babenek/CredData

- name: Cache data
id: cache-data
Expand Down Expand Up @@ -62,7 +63,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v3
with:
repository: Samsung/CredData
ref: longline
repository: babenek/CredData

- name: Cache data
id: cache-data
Expand Down
30 changes: 15 additions & 15 deletions cicd/benchmark.txt
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
DATA: 19434458 valid lines. MARKUP: 74402 items
DATA: 19434458 valid lines. MARKUP: 74416 items
Category Positives Negatives Template
-------------------------- ----------- ----------- ----------
Authentication Key & Token 70 1 31
Authentication Key & Token 77 1 31
Generic Secret 1056 15 203
Generic Token 333 45 558
Other 839 63510 635
Password 1405 110 4170
Predefined Pattern 326 2 40
Private Key 1001 1 3
Other 850 63503 634
Password 1406 110 4170
Predefined Pattern 330 2 40
Private Key 1001 1 2
Seed, Salt, Nonce 40 4 4
TOTAL: 5070 63688 5644
Detected Credentials: 5730
credsweeper result_cnt : 5102, lost_cnt : 0, true_cnt : 4207, false_cnt : 895
TOTAL: 5093 63681 5642
Detected Credentials: 5840
credsweeper result_cnt : 5131, lost_cnt : 0, true_cnt : 4227, false_cnt : 904
Category TP FP TN FN FPR FNR ACC PRC RCL F1
-------------------------- ---- ---- -------- ---- --------- --------- -------- -------- -------- --------
Authentication Key & Token 54 4 28 16 0.125 0.228571 0.803922 0.931034 0.771429 0.84375
Authentication Key & Token 60 4 28 17 0.125 0.220779 0.807339 0.9375 0.779221 0.851064
Generic Secret 973 3 215 83 0.0137615 0.0785985 0.932496 0.996926 0.921402 0.957677
Generic Token 289 7 596 44 0.0116086 0.132132 0.945513 0.976351 0.867868 0.918919
Other 584 747 63398 255 0.0116455 0.303933 0.984581 0.438768 0.696067 0.538249
Password 995 130 4150 410 0.0303738 0.291815 0.905013 0.884444 0.708185 0.786561
Predefined Pattern 309 2 40 17 0.0476191 0.0521472 0.94837 0.993569 0.947853 0.970173
Private Key 967 0 4 34 0.033966 0.966169 1 0.966034 0.982724
Other 594 756 63381 256 0.0117873 0.301176 0.984428 0.44 0.698824 0.54
Password 996 130 4150 410 0.0303738 0.291607 0.90503 0.884547 0.708393 0.78673
Predefined Pattern 312 2 40 18 0.0476191 0.0545455 0.946237 0.993631 0.945455 0.968944
Private Key 967 0 3 34 0.033966 0.966135 1 0.966034 0.982724
Seed, Salt, Nonce 36 2 6 4 0.25 0.1 0.875 0.947368 0.9 0.923077
4207 895 19428493 863 4.606e-05 0.170217 0.99991 0.824579 0.829783 0.827173
4227 904 19428461 866 4.653e-05 0.170037 0.999909 0.823816 0.829963 0.826878
2 changes: 1 addition & 1 deletion credsweeper/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ class DiffRowType(Enum):
MIN_VARIABLE_LENGTH = 1
MIN_SEPARATOR_LENGTH = 1
MIN_VALUE_LENGTH = 4
MAX_LINE_LENGTH = 2000
MAX_LINE_LENGTH = 8000
""" values according to https://docs.python.org/3/library/codecs.html """
UTF_8 = "utf_8"
UTF_16 = "utf_16"
Expand Down
4 changes: 2 additions & 2 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path

# total number of files in test samples
SAMPLES_FILES_COUNT: int = 121
SAMPLES_FILES_COUNT: int = 120

# credentials count after scan
SAMPLES_CRED_COUNT: int = 380
Expand All @@ -16,7 +16,7 @@
# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 22
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 16
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 3
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1

# well known string with all latin letters
AZ_DATA = b"The quick brown fox jumps over the lazy dog"
Expand Down
58 changes: 5 additions & 53 deletions tests/data/depth_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -6530,54 +6530,6 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "AWS Client ID",
"severity": "high",
"line_data_list": [
{
"line": "securelogin?user=admin,password=AKIAAPH9BLMHUAJIE6SN",
"line_num": 1,
"path": "tests/samples/oversize",
"info": "tests/samples/oversize|STRUCT|STRUCT:0|STRING:link_rel|RAW",
"value": "AKIAAPH9BLMHUAJIE6SN",
"value_start": 32,
"value_end": 52,
"variable": null,
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 3.721928094887362,
"valid": false
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "Password",
"severity": "medium",
"line_data_list": [
{
"line": "securelogin?user=admin,password=AKIAAPH9BLMHUAJIE6SN",
"line_num": 1,
"path": "tests/samples/oversize",
"info": "tests/samples/oversize|STRUCT|STRUCT:0|STRING:link_rel|RAW",
"value": "AKIAAPH9BLMHUAJIE6SN",
"value_start": 32,
"value_end": 52,
"variable": "admin,password",
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 3.721928094887362,
"valid": false
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
Expand Down Expand Up @@ -8263,13 +8215,13 @@
"severity": "high",
"line_data_list": [
{
"line": "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t : ghs_00000000000000000000000000000004WZ4EQ",
"line_num": 1,
"line": "<w:document xmlns:o=\"urn:schemas-microsoft-com:office:office\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\" xmlns:v=\"urn:schemas-microsoft-com:vml\" xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" xmlns:w10=\"urn:schemas-microsoft-com:office:word\" xmlns:wp=\"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing\" xmlns:wps=\"http://schemas.microsoft.com/office/word/2010/wordprocessingShape\" xmlns:wpg=\"http://schemas.microsoft.com/office/word/2010/wordprocessingGroup\" xmlns:mc=\"http://schemas.openxmlformats.org/markup-compatibility/2006\" xmlns:wp14=\"http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing\" xmlns:w14=\"http://schemas.microsoft.com/office/word/2010/wordml\" xmlns:w15=\"http://schemas.microsoft.com/office/word/2012/wordml\" mc:Ignorable=\"w14 wp14 w15\"><w:body><w:p><w:pPr><w:pStyle w:val=\"Normal\"/><w:bidi w:val=\"0\"/><w:jc w:val=\"left\"/><w:rPr><w:rFonts w:ascii=\"sans-serif\" w:hAnsi=\"sans-serif\"/><w:b w:val=\"false\"/><w:i/><w:i/><w:color w:val=\"202122\"/><w:spacing w:val=\"0\"/><w:sz w:val=\"21\"/></w:rPr></w:pPr><w:r><w:rPr></w:rPr><w:t>Password = WeR15tr0n6</w:t></w:r></w:p><w:p><w:pPr><w:pStyle w:val=\"Normal\"/><w:pBdr></w:pBdr><w:shd w:fill=\"2B2B2B\"/><w:rPr><w:rFonts w:ascii=\"JetBrains Mono\" w:hAnsi=\"JetBrains Mono\"/><w:b w:val=\"false\"/><w:i w:val=\"false\"/><w:i w:val=\"false\"/><w:color w:val=\"6A8759\"/><w:sz w:val=\"20\"/></w:rPr></w:pPr><w:r><w:rPr><w:rFonts w:ascii=\"JetBrains Mono\" w:hAnsi=\"JetBrains Mono\"/><w:b w:val=\"false\"/><w:i w:val=\"false\"/><w:color w:val=\"6A8759\"/><w:spacing w:val=\"0\"/><w:sz w:val=\"20\"/></w:rPr><w:t>ghs_00000000000000000000000000000004WZ4EQ</w:t></w:r></w:p><w:p><w:pPr><w:pStyle w:val=\"Normal\"/><w:bidi w:val=\"0\"/><w:jc w:val=\"left\"/><w:rPr><w:rFonts w:ascii=\"sans-serif\" w:hAnsi=\"sans-serif\"/><w:b w:val=\"false\"/><w:i/><w:i/><w:color w:val=\"202122\"/><w:spacing 
w:val=\"0\"/><w:sz w:val=\"21\"/></w:rPr></w:pPr><w:r><w:rPr></w:rPr></w:r></w:p><w:sectPr><w:type w:val=\"nextPage\"/><w:pgSz w:w=\"11906\" w:h=\"16838\"/><w:pgMar w:left=\"1134\" w:right=\"1134\" w:gutter=\"0\" w:header=\"0\" w:top=\"1134\" w:footer=\"0\" w:bottom=\"1134\"/><w:pgNumType w:fmt=\"decimal\"/><w:formProt w:val=\"false\"/><w:textDirection w:val=\"lrTb\"/><w:docGrid w:type=\"default\" w:linePitch=\"100\" w:charSpace=\"0\"/></w:sectPr></w:body></w:document>",
"line_num": 2,
"path": "tests/samples/sample.docx",
"info": "tests/samples/sample.docx|ZIP|word/document.xml|XML",
"info": "tests/samples/sample.docx|ZIP|word/document.xml|RAW",
"value": "ghs_00000000000000000000000000000004WZ4EQ",
"value_start": 66,
"value_end": 107,
"value_start": 1628,
"value_end": 1669,
"variable": null,
"entropy_validation": {
"iterator": "BASE64_CHARS",
Expand Down
2 changes: 0 additions & 2 deletions tests/samples/oversize

This file was deleted.