Skip to content

Commit

Permalink
Performance optimization (Samsung#304)
Browse files Browse the repository at this point in the history
* optimization

* style fix

* suggested text read style

* suggestions

* Update credsweeper/__main__.py

Co-authored-by: ShinHyung Choi <[email protected]>

---------

Co-authored-by: ShinHyung Choi <[email protected]>
  • Loading branch information
babenek and csh519 authored May 16, 2023
1 parent 9179915 commit 0dd01d8
Show file tree
Hide file tree
Showing 28 changed files with 223 additions and 114 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[flake8]
max-line-length = 120
extend-ignore = E203,E303,E131
extend-ignore = E203,E303,E131,E402
per-file-ignores = __init__.py:F401
2 changes: 1 addition & 1 deletion .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ jobs:
- name: Analysing the code with pylint for NEW missed docstrings of classes or functions
if: ${{ always() && steps.setup_credsweeper.conclusion == 'success' }}
run: |
pylint --disable=E,R,W,C0114,C0103,C0412,C0415,C0200,C0201,C0325 --verbose credsweeper 2>/dev/null | grep '^credsweeper/' | LC_ALL=C sort -g | diff cicd/missed_docstrings.txt -
pylint --disable=E,R,W,C0114,C0103,C0412,C0413,C0415,C0200,C0201,C0325 --verbose credsweeper 2>/dev/null | grep '^credsweeper/' | LC_ALL=C sort -g | diff cicd/missed_docstrings.txt -
# # # yapf

Expand Down
3 changes: 1 addition & 2 deletions credsweeper/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
from typing import Any, Union, Optional, Dict

from credsweeper import __version__
from credsweeper.app import CredSweeper
from credsweeper.app_path import APP_PATH
from credsweeper.app import APP_PATH, CredSweeper
from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType
from credsweeper.file_handler.files_provider import FilesProvider
from credsweeper.file_handler.patch_provider import PatchProvider
Expand Down
21 changes: 13 additions & 8 deletions credsweeper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

import pandas as pd

from credsweeper.app_path import APP_PATH
# Directory of credsweeper sources MUST be placed before imports to avoid circular import error
APP_PATH = Path(__file__).resolve().parent

from credsweeper.common.constants import KeyValidationOption, Severity, ThresholdPreset
from credsweeper.config import Config
from credsweeper.credentials import Candidate, CredentialManager
Expand Down Expand Up @@ -207,7 +209,7 @@ def run(self, content_provider: FilesProvider) -> int:
_empty_list: List[Union[DiffContentProvider, TextContentProvider]] = []
file_extractors: List[Union[DiffContentProvider, TextContentProvider]] = \
content_provider.get_scannable_files(self.config) if content_provider else _empty_list
logger.info("Start Scanner")
logger.info(f"Start Scanner for {len(file_extractors)} providers")
self.scan(file_extractors)
self.post_processing()
self.export_results()
Expand Down Expand Up @@ -308,19 +310,22 @@ def file_scan(self, content_provider: Union[DiffContentProvider, TextContentProv
def post_processing(self) -> None:
"""Machine learning validation for received credential candidates."""
if self._use_ml_validation():
logger.info("Run ML Validation")
logger.info(f"Run ML Validation for {len(self.credential_manager.candidates)} candidates")
new_cred_list = []
cred_groups = self.credential_manager.group_credentials()
ml_cred_groups = []
for group_key, group_candidates in cred_groups.items():
# Analyze with ML if all candidates in group require ML
if all(candidate.use_ml for candidate in group_candidates):
for candidate in group_candidates:
if not candidate.use_ml:
break
else:
ml_cred_groups.append((group_key.value, group_candidates))
continue
# If at least one of the credentials in the group does not require ML - automatically report to user
else:
for candidate in group_candidates:
candidate.ml_validation = KeyValidationOption.NOT_AVAILABLE
new_cred_list += group_candidates
for candidate in group_candidates:
candidate.ml_validation = KeyValidationOption.NOT_AVAILABLE
new_cred_list += group_candidates

is_cred, probability = self.ml_validator.validate_groups(ml_cred_groups, self.ml_batch_size)
for i, (_, group_candidates) in enumerate(ml_cred_groups):
Expand Down
4 changes: 0 additions & 4 deletions credsweeper/app_path.py

This file was deleted.

3 changes: 3 additions & 0 deletions credsweeper/common/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
from credsweeper.common.keyword_checklist import KeywordChecklist

# use the variable to avoid singleton creation and make testing easier
static_keyword_checklist = KeywordChecklist()
42 changes: 19 additions & 23 deletions credsweeper/common/keyword_checklist.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,32 @@
import os
from typing import List, Set
from functools import cached_property
from typing import Set

from credsweeper.utils import Util
from credsweeper.app import APP_PATH


class KeywordChecklist:
"""KeywordsChecklist contains words 3 or more letters length"""
__keyword_list: List[str] = []
__keyword_set: Set[str]
__morpheme_set: Set[str]
KEYWORD_PATH = APP_PATH / "common" / "keyword_checklist.txt"
MORPHEME_PATH = APP_PATH / "common" / "morpheme_checklist.txt"

def __init__(self) -> None:
dir_path = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(dir_path, "keyword_checklist.txt")
self.set_list(Util.read_file(file_path))
# Uses the suggested text read style. split() is preferred because it strips the trailing 0x0A at the end of the file
with open(self.KEYWORD_PATH, 'r') as f:
self.__keyword_set = set(f.read().split())

def get_list(self) -> List[str]:
"""Get list with keywords.
@cached_property
def keyword_set(self) -> Set[str]:
"""Get set with keywords.
Return:
List of strings
Set of strings
"""
return self.__keyword_list
return self.__keyword_set

def set_list(self, keyword_list: List[str]) -> None:
"""Remove old keywords and setup new one.
Args:
keyword_list: list of keywords to be added
"""
keyword_set: Set[str] = set()
for i in keyword_list:
if 3 <= len(i):
keyword_set.add(i)
self.__keyword_list = list(keyword_set)
@cached_property
def keyword_len(self) -> int:
"""Length of keyword_set"""
return len(self.__keyword_set)
6 changes: 4 additions & 2 deletions credsweeper/credentials/line_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,10 @@ def is_comment(self) -> bool:
"""
cleaned_line = self.line.strip()
starts_from_comment = any(cleaned_line.startswith(comment_start) for comment_start in self.comment_starts)
return starts_from_comment
for comment_start in self.comment_starts:
if cleaned_line.startswith(comment_start):
return True
return False

def is_source_file(self) -> bool:
"""Check if file with credential is a source code file or not (data, log, plain text).
Expand Down
8 changes: 6 additions & 2 deletions credsweeper/file_handler/data_content_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,12 @@ def represent_as_html(self) -> bool:
"""
try:
text = self.data.decode(encoding=DEFAULT_ENCODING)
if any(tag in text for tag in ["</html>", "</body>", "</head>", "</div>", "</table>"]):
html = BeautifulSoup(text, features="html.parser")
html = None
for tag in ["</html>", "</body>", "</head>", "</div>", "</table>"]:
if tag in text:
html = BeautifulSoup(text, features="html.parser")
break
if html:
# simple parse as it is displayed to user
for line_number, line in enumerate(html.text.splitlines()):
if line and line.strip():
Expand Down
10 changes: 6 additions & 4 deletions credsweeper/file_handler/file_path_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,12 @@ def check_exclude_file(config: Config, path: str) -> bool:
path = path.replace('\\', '/').lower()
if config.not_allowed_path_pattern.match(path):
return True
if any(exclude_pattern.match(path) for exclude_pattern in config.exclude_patterns):
return True
if any(exclude_path in path for exclude_path in config.exclude_paths):
return True
for exclude_pattern in config.exclude_patterns:
if exclude_pattern.match(path):
return True
for exclude_path in config.exclude_paths:
if exclude_path in path:
return True
file_extension = Util.get_extension(path, lower=False)
if file_extension in config.exclude_extensions:
return True
Expand Down
11 changes: 5 additions & 6 deletions credsweeper/filters/value_dictionary_keyword_check.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
from credsweeper.common import KeywordChecklist
from credsweeper.common import static_keyword_checklist
from credsweeper.credentials import LineData
from credsweeper.filters import Filter


class ValueDictionaryKeywordCheck(Filter):
"""Check that no word from dictionary present in the candidate value."""

def __init__(self) -> None:
self.keyword_checklist = KeywordChecklist()

def run(self, line_data: LineData) -> bool:
"""Run filter checks on received credential candidate data 'line_data'.
Expand All @@ -21,6 +18,8 @@ def run(self, line_data: LineData) -> bool:
"""
if not line_data.value:
return True
if any(keyword in line_data.value.lower() for keyword in self.keyword_checklist.get_list()):
return True
line_data_value_lower = line_data.value.lower()
for keyword in static_keyword_checklist.keyword_set:
if keyword in line_data_value_lower:
return True
return False
6 changes: 5 additions & 1 deletion credsweeper/filters/value_file_path_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@ def run(self, line_data: LineData) -> bool:
return True
contains_unix_separator = '/' in line_data.value
contains_windows_separator = ':\\' in line_data.value
contains_special_characters = any(c in line_data.value for c in " !$`&*()+")
contains_special_characters = False
for i in " !$`&*()+":
if i in line_data.value:
contains_special_characters = True
break
if (contains_unix_separator ^ contains_windows_separator) and not contains_special_characters:
return True
return False
23 changes: 15 additions & 8 deletions credsweeper/filters/value_split_keyword_check.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
from credsweeper.common import KeywordChecklist
from typing import Union

from credsweeper.common import static_keyword_checklist
from credsweeper.credentials import LineData
from credsweeper.filters import Filter


class ValueSplitKeywordCheck(Filter):
"""Check value by splitting with standard whitespace separators and any word is not matched in checklist."""

def __init__(self) -> None:
"""ValueSplitKeywordCheck constructor"""
self.keyword_checklist = KeywordChecklist()

def run(self, line_data: LineData) -> bool:
"""Run filter checks on received credential candidate data 'line_data'.
Expand All @@ -22,7 +20,16 @@ def run(self, line_data: LineData) -> bool:
"""
if not line_data.value:
return True
words = line_data.value.lower().split()
if any(keyword in words for keyword in self.keyword_checklist.get_list()):
return True
words: Union[set, list] = line_data.value.lower().split()
if static_keyword_checklist.keyword_len < len(words):
words = set(words)
keyword_set = static_keyword_checklist.keyword_set
if static_keyword_checklist.keyword_len < len(words):
for keyword in keyword_set:
if keyword in words:
return True
else:
for word in words:
if word in keyword_set:
return True
return False
2 changes: 1 addition & 1 deletion credsweeper/logger/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from typing import Optional

from credsweeper.app_path import APP_PATH
from credsweeper.app import APP_PATH
from credsweeper.utils import Util


Expand Down
Loading

0 comments on commit 0dd01d8

Please sign in to comment.