Performance optimization (Samsung#304)

* optimization
* style fix
* suggested text read style
* suggestions
* Update credsweeper/__main__.py

Co-authored-by: ShinHyung Choi <[email protected]>

---------

Co-authored-by: ShinHyung Choi <[email protected]>
VladSamsung · May 16, 2023 · 0dd01d8 · 0dd01d8
1 parent 9179915
commit 0dd01d8
Show file tree

Hide file tree

Showing 28 changed files with 223 additions and 114 deletions.
diff --git a/.flake8 b/.flake8
@@ -1,4 +1,4 @@
 [flake8]
 max-line-length = 120
-extend-ignore = E203,E303,E131
+extend-ignore = E203,E303,E131,E402
 per-file-ignores = __init__.py:F401
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -142,7 +142,7 @@ jobs:
     - name: Analysing the code with pylint for NEW missed docstrings of classes or functions
       if: ${{ always() && steps.setup_credsweeper.conclusion == 'success' }}
       run: |
-        pylint --disable=E,R,W,C0114,C0103,C0412,C0415,C0200,C0201,C0325 --verbose credsweeper 2>/dev/null | grep '^credsweeper/' | LC_ALL=C sort -g | diff cicd/missed_docstrings.txt -
+        pylint --disable=E,R,W,C0114,C0103,C0412,C0413,C0415,C0200,C0201,C0325 --verbose credsweeper 2>/dev/null | grep '^credsweeper/' | LC_ALL=C sort -g | diff cicd/missed_docstrings.txt -
 
     # # # yapf
 

diff --git a/credsweeper/__main__.py b/credsweeper/__main__.py
@@ -7,8 +7,7 @@
 from typing import Any, Union, Optional, Dict
 
 from credsweeper import __version__
-from credsweeper.app import CredSweeper
-from credsweeper.app_path import APP_PATH
+from credsweeper.app import APP_PATH, CredSweeper
 from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType
 from credsweeper.file_handler.files_provider import FilesProvider
 from credsweeper.file_handler.patch_provider import PatchProvider

diff --git a/credsweeper/app.py b/credsweeper/app.py
@@ -8,7 +8,9 @@
 
 import pandas as pd
 
-from credsweeper.app_path import APP_PATH
+# Directory of credsweeper sources MUST be placed before imports to avoid circular import error
+APP_PATH = Path(__file__).resolve().parent
+
 from credsweeper.common.constants import KeyValidationOption, Severity, ThresholdPreset
 from credsweeper.config import Config
 from credsweeper.credentials import Candidate, CredentialManager
@@ -207,7 +209,7 @@ def run(self, content_provider: FilesProvider) -> int:
         _empty_list: List[Union[DiffContentProvider, TextContentProvider]] = []
         file_extractors: List[Union[DiffContentProvider, TextContentProvider]] = \
             content_provider.get_scannable_files(self.config) if content_provider else _empty_list
-        logger.info("Start Scanner")
+        logger.info(f"Start Scanner for {len(file_extractors)} providers")
         self.scan(file_extractors)
         self.post_processing()
         self.export_results()
@@ -308,19 +310,22 @@ def file_scan(self, content_provider: Union[DiffContentProvider, TextContentProv
     def post_processing(self) -> None:
         """Machine learning validation for received credential candidates."""
         if self._use_ml_validation():
-            logger.info("Run ML Validation")
+            logger.info(f"Run ML Validation for {len(self.credential_manager.candidates)} candidates")
             new_cred_list = []
             cred_groups = self.credential_manager.group_credentials()
             ml_cred_groups = []
             for group_key, group_candidates in cred_groups.items():
                 # Analyze with ML if all candidates in group require ML
-                if all(candidate.use_ml for candidate in group_candidates):
+                for candidate in group_candidates:
+                    if not candidate.use_ml:
+                        break
+                else:
                     ml_cred_groups.append((group_key.value, group_candidates))
+                    continue
                 # If at least one of credentials in the group do not require ML - automatically report to user
-                else:
-                    for candidate in group_candidates:
-                        candidate.ml_validation = KeyValidationOption.NOT_AVAILABLE
-                    new_cred_list += group_candidates
+                for candidate in group_candidates:
+                    candidate.ml_validation = KeyValidationOption.NOT_AVAILABLE
+                new_cred_list += group_candidates
 
             is_cred, probability = self.ml_validator.validate_groups(ml_cred_groups, self.ml_batch_size)
             for i, (_, group_candidates) in enumerate(ml_cred_groups):

diff --git a/credsweeper/app_path.py b/credsweeper/app_path.py
diff --git a/credsweeper/common/__init__.py b/credsweeper/common/__init__.py
@@ -1 +1,4 @@
 from credsweeper.common.keyword_checklist import KeywordChecklist
+
+# use the variable to avoid singleton creation and make testing easier
+static_keyword_checklist = KeywordChecklist()
diff --git a/credsweeper/common/keyword_checklist.py b/credsweeper/common/keyword_checklist.py
@@ -1,36 +1,32 @@
-import os
-from typing import List, Set
+from functools import cached_property
+from typing import Set
 
-from credsweeper.utils import Util
+from credsweeper.app import APP_PATH
 
 
 class KeywordChecklist:
     """KeywordsChecklist contains words 3 or more letters length"""
-    __keyword_list: List[str] = []
+    __keyword_set: Set[str]
+    __morpheme_set: Set[str]
+    KEYWORD_PATH = APP_PATH / "common" / "keyword_checklist.txt"
+    MORPHEME_PATH = APP_PATH / "common" / "morpheme_checklist.txt"
 
     def __init__(self) -> None:
-        dir_path = os.path.dirname(os.path.realpath(__file__))
-        file_path = os.path.join(dir_path, "keyword_checklist.txt")
-        self.set_list(Util.read_file(file_path))
+        # suggested text read style: split() is preferred because it strips the 0x0A at the end of the file
+        with open(self.KEYWORD_PATH, 'r') as f:
+            self.__keyword_set = set(f.read().split())
 
-    def get_list(self) -> List[str]:
-        """Get list with keywords.
+    @cached_property
+    def keyword_set(self) -> Set[str]:
+        """Get set with keywords.
 
         Return:
-            List of strings
+            Set of strings
 
         """
-        return self.__keyword_list
+        return self.__keyword_set
 
-    def set_list(self, keyword_list: List[str]) -> None:
-        """Remove old keywords and setup new one.
-
-        Args:
-            keyword_list: list of keywords to be added
-
-        """
-        keyword_set: Set[str] = set()
-        for i in keyword_list:
-            if 3 <= len(i):
-                keyword_set.add(i)
-        self.__keyword_list = list(keyword_set)
+    @cached_property
+    def keyword_len(self) -> int:
+        """Length of keyword_set"""
+        return len(self.__keyword_set)
diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py
@@ -257,8 +257,10 @@ def is_comment(self) -> bool:
 
         """
         cleaned_line = self.line.strip()
-        starts_from_comment = any(cleaned_line.startswith(comment_start) for comment_start in self.comment_starts)
-        return starts_from_comment
+        for comment_start in self.comment_starts:
+            if cleaned_line.startswith(comment_start):
+                return True
+        return False
 
     def is_source_file(self) -> bool:
         """Check if file with credential is a source code file or not (data, log, plain text).

diff --git a/credsweeper/file_handler/data_content_provider.py b/credsweeper/file_handler/data_content_provider.py
@@ -147,8 +147,12 @@ def represent_as_html(self) -> bool:
         """
         try:
             text = self.data.decode(encoding=DEFAULT_ENCODING)
-            if any(tag in text for tag in ["</html>", "</body>", "</head>", "</div>", "</table>"]):
-                html = BeautifulSoup(text, features="html.parser")
+            html = None
+            for tag in ["</html>", "</body>", "</head>", "</div>", "</table>"]:
+                if tag in text:
+                    html = BeautifulSoup(text, features="html.parser")
+                    break
+            if html:
                 # simple parse as it is displayed to user
                 for line_number, line in enumerate(html.text.splitlines()):
                     if line and line.strip():

diff --git a/credsweeper/file_handler/file_path_extractor.py b/credsweeper/file_handler/file_path_extractor.py
@@ -130,10 +130,12 @@ def check_exclude_file(config: Config, path: str) -> bool:
         path = path.replace('\\', '/').lower()
         if config.not_allowed_path_pattern.match(path):
             return True
-        if any(exclude_pattern.match(path) for exclude_pattern in config.exclude_patterns):
-            return True
-        if any(exclude_path in path for exclude_path in config.exclude_paths):
-            return True
+        for exclude_pattern in config.exclude_patterns:
+            if exclude_pattern.match(path):
+                return True
+        for exclude_path in config.exclude_paths:
+            if exclude_path in path:
+                return True
         file_extension = Util.get_extension(path, lower=False)
         if file_extension in config.exclude_extensions:
             return True

diff --git a/credsweeper/filters/value_dictionary_keyword_check.py b/credsweeper/filters/value_dictionary_keyword_check.py
@@ -1,14 +1,11 @@
-from credsweeper.common import KeywordChecklist
+from credsweeper.common import static_keyword_checklist
 from credsweeper.credentials import LineData
 from credsweeper.filters import Filter
 
 
 class ValueDictionaryKeywordCheck(Filter):
     """Check that no word from dictionary present in the candidate value."""
 
-    def __init__(self) -> None:
-        self.keyword_checklist = KeywordChecklist()
-
     def run(self, line_data: LineData) -> bool:
         """Run filter checks on received credential candidate data 'line_data'.
 
@@ -21,6 +18,8 @@ def run(self, line_data: LineData) -> bool:
         """
         if not line_data.value:
             return True
-        if any(keyword in line_data.value.lower() for keyword in self.keyword_checklist.get_list()):
-            return True
+        line_data_value_lower = line_data.value.lower()
+        for keyword in static_keyword_checklist.keyword_set:
+            if keyword in line_data_value_lower:
+                return True
         return False
diff --git a/credsweeper/filters/value_file_path_check.py b/credsweeper/filters/value_file_path_check.py
@@ -23,7 +23,11 @@ def run(self, line_data: LineData) -> bool:
             return True
         contains_unix_separator = '/' in line_data.value
         contains_windows_separator = ':\\' in line_data.value
-        contains_special_characters = any(c in line_data.value for c in " !$`&*()+")
+        contains_special_characters = False
+        for i in " !$`&*()+":
+            if i in line_data.value:
+                contains_special_characters = True
+                break
         if (contains_unix_separator ^ contains_windows_separator) and not contains_special_characters:
             return True
         return False
diff --git a/credsweeper/filters/value_split_keyword_check.py b/credsweeper/filters/value_split_keyword_check.py
@@ -1,15 +1,13 @@
-from credsweeper.common import KeywordChecklist
+from typing import Union
+
+from credsweeper.common import static_keyword_checklist
 from credsweeper.credentials import LineData
 from credsweeper.filters import Filter
 
 
 class ValueSplitKeywordCheck(Filter):
     """Check value by splitting with standard whitespace separators and any word is not matched in checklist."""
 
-    def __init__(self) -> None:
-        """ValueSplitKeywordCheck constructor"""
-        self.keyword_checklist = KeywordChecklist()
-
     def run(self, line_data: LineData) -> bool:
         """Run filter checks on received credential candidate data 'line_data'.
 
@@ -22,7 +20,16 @@ def run(self, line_data: LineData) -> bool:
         """
         if not line_data.value:
             return True
-        words = line_data.value.lower().split()
-        if any(keyword in words for keyword in self.keyword_checklist.get_list()):
-            return True
+        words: Union[set, list] = line_data.value.lower().split()
+        if static_keyword_checklist.keyword_len < len(words):
+            words = set(words)
+        keyword_set = static_keyword_checklist.keyword_set
+        if static_keyword_checklist.keyword_len < len(words):
+            for keyword in keyword_set:
+                if keyword in words:
+                    return True
+        else:
+            for word in words:
+                if word in keyword_set:
+                    return True
         return False
diff --git a/credsweeper/logger/logger.py b/credsweeper/logger/logger.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 from typing import Optional
 
-from credsweeper.app_path import APP_PATH
+from credsweeper.app import APP_PATH
 from credsweeper.utils import Util