From 67963ee21cf4f4f970ee1d501085b6ee23142936 Mon Sep 17 00:00:00 2001 From: Quincey James <47681353+QuinceyJames@users.noreply.github.com> Date: Mon, 12 Dec 2022 04:19:32 -0500 Subject: [PATCH] Fix false positives and duplicate errors in the typosquatting algorithm (#108) * Fixed bug in typosquatting algorithm that may duplicate package name if found to be malicious * Exiting early from typosquatting algorithm if the requested package is one of the top packages to avoid expensive computation * Using canonicalize_name() in typosquatting algorithm to adhere to PEP and avoid false positives * Refactored for better encapsulation, coding conventions, typos, and added constructor call * Created tests for issue 71 --- guarddog/analyzer/metadata/typosquatting.py | 54 ++++++++----------- requirements.txt | 2 + tests/analyzer/metadata/test_typosquatting.py | 19 +++++++ 3 files changed, 43 insertions(+), 32 deletions(-) diff --git a/guarddog/analyzer/metadata/typosquatting.py b/guarddog/analyzer/metadata/typosquatting.py index 5fe87e7f..ced96aa5 100644 --- a/guarddog/analyzer/metadata/typosquatting.py +++ b/guarddog/analyzer/metadata/typosquatting.py @@ -3,7 +3,7 @@ from datetime import datetime, timedelta from itertools import permutations from typing import Optional - +from packaging.utils import canonicalize_name import requests from guarddog.analyzer.metadata.detector import Detector @@ -20,18 +20,8 @@ class TyposquatDetector(Detector): """ def __init__(self) -> None: - # Find top PyPI packages - top_packages_information = self._get_top_packages() - - # Get list of popular packages - self.popular_packages = [] - - for package in top_packages_information: - name = package["project"] - normalized_name = name.lower().replace("_", "-") - self.popular_packages.append(normalized_name) - - super() + self.popular_packages = self._get_top_packages() # Find top PyPI packages + super().__init__() # Call constructor def _get_top_packages(self) -> list: """ @@ -52,12 +42,12 @@ def _get_top_packages(self) -> list: popular_packages_url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json" top_packages_filename = "top_pypi_packages.json" - resourcesdir = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources")) - top_packages_path = os.path.join(resourcesdir, top_packages_filename) + resources_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources")) + top_packages_path = os.path.join(resources_dir, top_packages_filename) top_packages_information = None - if top_packages_filename in os.listdir(resourcesdir): + if top_packages_filename in os.listdir(resources_dir): update_time = datetime.fromtimestamp(os.path.getmtime(top_packages_path)) if datetime.now() - update_time <= timedelta(days=30): @@ -71,7 +61,10 @@ def _get_top_packages(self) -> list: top_packages_information = response["rows"] - return top_packages_information + def get_safe_name(package): + return canonicalize_name(package["project"]) + + return list(map(get_safe_name, top_packages_information)) def _is_distance_one_Levenshtein(self, name1, name2) -> bool: """ @@ -216,29 +209,26 @@ def get_typosquatted_package(self, package_name) -> list[str]: typosquatting from """ - typosquatted = [] - # Get permuted typosquats for normalized and confused names - normalized_name = package_name.lower().replace("_", "-") + normalized_name = canonicalize_name(package_name) + + if normalized_name in self.popular_packages: + return [] # Go through popular packages and find length one edit typosquats + typosquatted = set() for popular_package in self.popular_packages: - normalized_popular_package = popular_package.lower().replace("_", "-") - - if normalized_name == popular_package: - return [] - - if self._is_length_one_edit_away(normalized_name, normalized_popular_package): - typosquatted.append(popular_package) + if self._is_length_one_edit_away(normalized_name, popular_package): + typosquatted.add(popular_package) - alternate_popular_names = self._get_confused_forms(normalized_popular_package) - swapped_popular_names = self._generate_permutations(normalized_popular_package) + alternate_popular_names = self._get_confused_forms(popular_package) + swapped_popular_names = self._generate_permutations(popular_package) for name in alternate_popular_names + swapped_popular_names: if self._is_length_one_edit_away(normalized_name, name): - typosquatted.append(normalized_popular_package) + typosquatted.add(popular_package) - return typosquatted + return list(typosquatted) def detect(self, package_info) -> tuple[bool, Optional[str]]: """ @@ -255,7 +245,7 @@ def detect(self, package_info) -> tuple[bool, Optional[str]]: """ similar_package_names = self.get_typosquatted_package(package_info["info"]["name"]) if len(similar_package_names) > 0: - return True, "This package closely ressembles the following package names, and might be a typosquatting " \ + return True, "This package closely resembles the following package names, and might be a typosquatting " \ "attempt: " + ", ".join(similar_package_names) return False, None diff --git a/requirements.txt b/requirements.txt index c79e915f..3342e216 100644 --- a/requirements.txt +++ b/requirements.txt @@ -54,3 +54,5 @@ ujson==5.4.0 ; python_version >= "3.10" and python_version < "4" urllib3==1.26.11 ; python_version >= "3.10" and python_version < "4" wcmatch==8.4 ; python_version >= "3.10" and python_version < "4" websocket-client==1.3.3 ; python_version >= "3.10" and python_version < "4" + +packaging~=21.3 \ No newline at end of file diff --git a/tests/analyzer/metadata/test_typosquatting.py b/tests/analyzer/metadata/test_typosquatting.py index 47c22374..3e0ab401 100644 --- a/tests/analyzer/metadata/test_typosquatting.py +++ b/tests/analyzer/metadata/test_typosquatting.py @@ -42,3 +42,22 @@ def test_nontyposquats(self, name): project_info = generate_project_info("name", name) matches, _ = self.detector.detect(project_info) assert not matches + + def test_no_duplicate_errors(self): + """ + Verify that a package with a typo in the name only reports 1 error + + Regression test for https://github.com/DataDog/guarddog/issues/71 + """ + result = self.detector.get_typosquatted_package("pdfminer.sid") + assert len(result) == 1 + + def test_normalize_names(self): + """ + Verify that a package with 1 or more dots(.), hyphens(-) or underscore(_) gets normalized + to avoid false positives + + Regression test for https://github.com/DataDog/guarddog/issues/71 + """ + result = self.detector.get_typosquatted_package("pdfminer...---___six") + assert len(result) == 0