Samsung · babenek · Dec 14, 2023 · Dec 6, 2023 · Dec 6, 2023 · Dec 6, 2023
@@ -1,25 +1,25 @@
-DATA: 19434458 valid lines. MARKUP: 74416 items
+DATA: 19419621 valid lines. MARKUP: 74230 items
 Category                      Positives    Negatives    Template
 --------------------------  -----------  -----------  ----------
 Authentication Key & Token           77            1          31
 Generic Secret                     1056           15         203
 Generic Token                       333           45         558
-Other                               850        63503         634
-Password                           1406          110        4170
+Other                               844        63294         635
+Password                           1402          110        4164
 Predefined Pattern                  330            2          40
-Private Key                        1001            1           2
+Private Key                        1008            2          32
 Seed, Salt, Nonce                    40            4           4
-TOTAL:                             5093        63681        5642
-Detected Credentials: 5840
-credsweeper result_cnt : 5131, lost_cnt : 0, true_cnt : 4227, false_cnt : 904
+TOTAL:                             5090        63473        5667
+Detected Credentials: 5864
+credsweeper result_cnt : 5161, lost_cnt : 0, true_cnt : 4263, false_cnt : 898
 Category                      TP    FP        TN    FN        FPR        FNR       ACC       PRC       RCL        F1
 --------------------------  ----  ----  --------  ----  ---------  ---------  --------  --------  --------  --------
 Authentication Key & Token    60     4        28    17  0.125      0.220779   0.807339  0.9375    0.779221  0.851064
 Generic Secret               973     3       215    83  0.0137615  0.0785985  0.932496  0.996926  0.921402  0.957677
 Generic Token                289     7       596    44  0.0116086  0.132132   0.945513  0.976351  0.867868  0.918919
-Other                        594   756     63381   256  0.0117873  0.301176   0.984428  0.44      0.698824  0.54
-Password                     996   130      4150   410  0.0303738  0.291607   0.90503   0.884547  0.708393  0.78673
+Other                        589   749     63180   255  0.0117161  0.302133   0.9845    0.440209  0.697867  0.539872
+Password                     996   129      4145   406  0.0301825  0.289586   0.905743  0.885333  0.710414  0.788287
 Predefined Pattern           312     2        40    18  0.0476191  0.0545455  0.946237  0.993631  0.945455  0.968944
-Private Key                  967     0         3    34             0.033966   0.966135  1         0.966034  0.982724
+Private Key                 1008     2        32     0  0.0588235             0.998081  0.99802   1         0.999009
 Seed, Salt, Nonce             36     2         6     4  0.25       0.1        0.875     0.947368  0.9       0.923077
-                            4227   904  19428461   866  4.653e-05  0.170037   0.999909  0.823816  0.829963  0.826878
+                            4263   898  19413633   827  4.625e-05  0.162475   0.999911  0.826003  0.837525  0.831724
@@ -29,7 +29,6 @@
 from credsweeper.filters.value_number_check import ValueNumberCheck
 from credsweeper.filters.value_pattern_check import ValuePatternCheck
 from credsweeper.filters.value_pattern_length_check import ValuePatternLengthCheck
-from credsweeper.filters.value_pem_pattern_check import ValuePemPatternCheck
 from credsweeper.filters.value_similarity_check import ValueSimilarityCheck
 from credsweeper.filters.value_split_keyword_check import ValueSplitKeywordCheck
 from credsweeper.filters.value_string_type_check import ValueStringTypeCheck

@@ -329,9 +329,7 @@
   severity: high
   type: pem_key
   values:
-    - (?P<value>-----BEGIN\s(?!ENCRYPTED|EC)[^-]*PRIVATE[^-]*KEY[^-]*-----(.+-----END[^-]+-----)?)
-  filter_type:
-    - LineSpecificKeyCheck
+    - (?P<value>-----BEGIN\s(?!ENCRYPTED)[^-]*PRIVATE[^-]*KEY[^-]*-----(.+-----END[^-]+-----)?)
   min_line_len: 27
 
 - name: Picatic API Key

@@ -39,10 +39,10 @@ class Rule:
     SEVERITY = "severity"
     TYPE = "type"
     VALUES = "values"
-    FILTER_TYPE = "filter_type"
     MIN_LINE_LEN = "min_line_len"
 
     # auxiliary fields
+    FILTER_TYPE = "filter_type"
     USE_ML = "use_ml"
     REQUIRED_SUBSTRINGS = "required_substrings"
     REQUIRED_REGEX = "required_regex"
@@ -65,7 +65,7 @@ def __init__(self, config: Config, rule_dict: Dict) -> None:
             self._malformed_rule_error(rule_dict, Rule.TYPE)
         self.__patterns = self._init_patterns(rule_dict[Rule.VALUES])
         # auxiliary fields
-        self.__filters = self._init_filters(rule_dict.get(Rule.FILTER_TYPE))
+        self.__filters = self._init_filters(rule_dict.get(Rule.FILTER_TYPE, []))
         self.__use_ml = bool(rule_dict.get(Rule.USE_ML))
         self.__validations = self._init_validations(rule_dict.get(Rule.VALIDATIONS))
         self.__required_substrings = set(i.strip().lower() for i in rule_dict.get(Rule.REQUIRED_SUBSTRINGS, []))
@@ -214,7 +214,7 @@ def _assert_rule_mandatory_fields(rule_template: Dict) -> None:
             ValueError if any mandatory field is missing
 
         """
-        mandatory_fields = [Rule.NAME, Rule.SEVERITY, Rule.TYPE, Rule.VALUES, Rule.FILTER_TYPE, Rule.MIN_LINE_LEN]
+        mandatory_fields = [Rule.NAME, Rule.SEVERITY, Rule.TYPE, Rule.VALUES, Rule.MIN_LINE_LEN]
         missing_fields = [field for field in mandatory_fields if field not in rule_template]
         if len(missing_fields) > 0:
             raise ValueError(f"Malformed rule config file. Contain rule with missing fields: {missing_fields}.")

@@ -1,15 +1,16 @@
+import contextlib
 import logging
 import re
 import string
-from typing import Optional, List
+from typing import List
 
-from credsweeper.common.constants import Chars, PEM_BEGIN_PATTERN, PEM_END_PATTERN, RuleType
+from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN, RuleType, Chars
 from credsweeper.config import Config
 from credsweeper.credentials import Candidate, LineData
 from credsweeper.file_handler.analysis_target import AnalysisTarget
-from credsweeper.filters import ValuePatternCheck, ValuePemPatternCheck
 from credsweeper.rules import Rule
 from credsweeper.scanner.scan_type import ScanType
+from credsweeper.utils import Util
 from credsweeper.utils.entropy_validator import EntropyValidator
 
 logger = logging.getLogger(__name__)
@@ -24,12 +25,11 @@ class PemKeyPattern(ScanType):
         remove_characters: These characters are stripped from PEM lines before the entropy check
 
     """
+    base64set = set(string.ascii_uppercase) | set(string.ascii_lowercase) | set(string.digits) | {'+', '/', '='}
 
     ignore_starts = [PEM_BEGIN_PATTERN, "Proc-Type", "Version", "DEK-Info"]
-    wrap_characters = "\\'\";,[]#*"
+    wrap_characters = "\\'\";,[]#*!"
     remove_characters = string.whitespace + wrap_characters
-    remove_characters_plus = remove_characters + '+'
-    pem_pattern_check: Optional[ValuePatternCheck] = None
     # last line contains 4 symbols, at least
     re_value_pem = re.compile(r"(?P<value>([^-]*" + PEM_END_PATTERN +
                               r"[^-]+-----)|(([a-zA-Z0-9/+=]{64}.*)?[a-zA-Z0-9/+=]{4})+)")
@@ -50,8 +50,6 @@ def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[Candida
         """
         assert rule.rule_type == RuleType.PEM_KEY, \
             "Rules provided to PemKeyPattern.run should have pattern_type equal to PEM_KEY_PATTERN"
-        if not cls.pem_pattern_check:
-            cls.pem_pattern_check = ValuePemPatternCheck(config)
         if candidates := cls._get_candidates(config, rule, target):
             candidate = candidates[0]
             if pem_lines := cls.detect_pem_key(config, rule, target):
@@ -83,34 +81,51 @@ def detect_pem_key(cls, config: Config, rule: Rule, target: AnalysisTarget) -> L
         # protection check for case when first line starts from 0
         start_pos = target.line_pos if 0 <= target.line_pos else 0
         finish_pos = min(start_pos + 200, target.lines_len)
+        begin_pattern_not_passed = True
         for line_pos in range(start_pos, finish_pos):
             line = target.lines[line_pos]
             if target.line_pos != line_pos:
                 _line = LineData(config, line, line_pos, target.line_nums[line_pos], target.file_path, target.file_type,
                                  target.info, cls.re_value_pem)
                 line_data.append(_line)
             # replace escaped line endings with real ones and process them - PEM does not contain the '\' sign
+            while "\\\\" in line:
+                line = line.replace("\\\\", "\\")
             sublines = line.replace("\\r", '\n').replace("\\n", '\n').splitlines()
             for subline in sublines:
-                if cls.is_leading_config_line(subline):
+                if begin_pattern_not_passed or cls.is_leading_config_line(subline):
+                    if PEM_BEGIN_PATTERN in subline:
+                        begin_pattern_not_passed = False
                     continue
                 elif PEM_END_PATTERN in subline:
-                    # Check if entropy is high enough for base64 set with padding sign
-                    entropy_validator = EntropyValidator(key_data, Chars.BASE64_CHARS)
-                    if not entropy_validator.valid:
+                    if "PGP" in target.line_strip:
+                        # Check if entropy is high enough for base64 set with padding sign
+                        entropy_validator = EntropyValidator(key_data, Chars.BASE64_CHARS)
+                        if entropy_validator.valid:
+                            return line_data
                         logger.debug("Filtered with entropy %f '%s'", entropy_validator.entropy, key_data)
-                        return []
-                    # OPENSSH format has multiple AAAAA pattern
-                    if "OPENSSH" not in target.line_strip and cls.pem_pattern_check.equal_pattern_check(key_data):
-                        logger.debug("Filtered with ValuePemPatternCheck %s", target)
-                        return []
-                    # all OK - return line data with all lines which include PEM
-                    return line_data
+                    if "OPENSSH" in target.line_strip:
+                        # Check whether the key is encrypted
+                        with contextlib.suppress(Exception):
+                            decoded = Util.decode_base64(key_data, urlsafe_detect=True)
+                            if b"bcrypt" not in decoded:
+                                # all OK - the key is not encrypted in this top level
+                                return line_data
+                        logger.debug("Filtered with non asn1 '%s'", key_data)
+                    else:
+                        with contextlib.suppress(Exception):
+                            decoded = Util.decode_base64(key_data, urlsafe_detect=True)
+                            if Util.is_asn1(decoded):
+                                # all OK - the key is not encrypted in this top level
+                                return line_data
+                        logger.debug("Filtered with non asn1 '%s'", key_data)
+                    return []
                 else:
                     sanitized_line = cls.sanitize_line(subline)
                     # PEM key line must contain only base64 characters (so no spaces, dots or "...")
-                    if ' ' in sanitized_line or "..." in sanitized_line:
-                        return []
+                    for i in sanitized_line:
+                        if i not in cls.base64set:
+                            return []
                     key_data += sanitized_line
         return []
 
@@ -146,11 +161,19 @@ def sanitize_line(cls, line: str, recurse_level: int = 5) -> str:
             line = line[2:]
         if line.endswith("*/"):
             line = line[:-2]
-        if '"' in line or "'" in line:
-            # remove concatenation only when quotes present
-            line = line.strip(cls.remove_characters_plus)
-        else:
-            line = line.strip(cls.remove_characters)
+        if line.endswith("\\"):
+            # trailing backslash is a line continuation in many languages
+            line = line[:-1]
+
+        # remove concatenation carefully only when it is not part of base64
+        if line.startswith('+'):
+            if line[1] not in cls.base64set:
+                line = line[1:]
+        if line.endswith('+'):
+            if line[-2] not in cls.base64set:
+                line = line[:-1]
+
+        line = line.strip(cls.remove_characters)
         # check whether a new iteration is required
         for x in string.whitespace:
             if line.startswith(x) or line.endswith(x):

@@ -236,14 +236,6 @@ credsweeper.filters.value\_pattern\_length\_check module
    :undoc-members:
    :show-inheritance:
 
-credsweeper.filters.value\_pem\_pattern\_check module
------------------------------------------------------
-
-.. automodule:: credsweeper.filters.value_pem_pattern_check
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
 credsweeper.filters.value\_similarity\_check module
 ---------------------------------------------------
 

@@ -4,14 +4,14 @@
 SAMPLES_FILES_COUNT: int = 122
 
 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 382
-SAMPLES_CRED_LINE_COUNT: int = 394
+SAMPLES_CRED_COUNT: int = 383
+SAMPLES_CRED_LINE_COUNT: int = 399
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 301
+SAMPLES_POST_CRED_COUNT: int = 302
 
 # with option --doc
-SAMPLES_IN_DOC = 393
+SAMPLES_IN_DOC = 394
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 22