From 67963ee21cf4f4f970ee1d501085b6ee23142936 Mon Sep 17 00:00:00 2001
From: Quincey James <47681353+QuinceyJames@users.noreply.github.com>
Date: Mon, 12 Dec 2022 04:19:32 -0500
Subject: [PATCH] Fix false positives and duplicate errors in the typosquatting
 algorithm (#108)

* Fixed a bug in the typosquatting algorithm that could report the same popular package name more than once for a single suspected typosquat

* Exit early from the typosquatting algorithm when the requested package is itself one of the top packages, skipping the expensive comparison against every popular package

* Use canonicalize_name() in the typosquatting algorithm so package names follow PEP 503 normalization, avoiding false positives

* Refactored for better encapsulation and coding conventions, fixed typos, and added the missing super().__init__() constructor call

* Added regression tests for issue #71
---
 guarddog/analyzer/metadata/typosquatting.py   | 54 ++++++++-----------
 requirements.txt                              |  2 +
 tests/analyzer/metadata/test_typosquatting.py | 19 +++++++
 3 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/guarddog/analyzer/metadata/typosquatting.py b/guarddog/analyzer/metadata/typosquatting.py
index 5fe87e7f..ced96aa5 100644
--- a/guarddog/analyzer/metadata/typosquatting.py
+++ b/guarddog/analyzer/metadata/typosquatting.py
@@ -3,7 +3,7 @@
 from datetime import datetime, timedelta
 from itertools import permutations
 from typing import Optional
-
+from packaging.utils import canonicalize_name
 import requests
 
 from guarddog.analyzer.metadata.detector import Detector
@@ -20,18 +20,8 @@ class TyposquatDetector(Detector):
     """
 
     def __init__(self) -> None:
-        # Find top PyPI packages
-        top_packages_information = self._get_top_packages()
-
-        # Get list of popular packages
-        self.popular_packages = []
-
-        for package in top_packages_information:
-            name = package["project"]
-            normalized_name = name.lower().replace("_", "-")
-            self.popular_packages.append(normalized_name)
-
-        super()
+        self.popular_packages = self._get_top_packages()  # Find top PyPI packages
+        super().__init__()  # Call constructor
 
     def _get_top_packages(self) -> list:
         """
@@ -52,12 +42,12 @@ def _get_top_packages(self) -> list:
         popular_packages_url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
 
         top_packages_filename = "top_pypi_packages.json"
-        resourcesdir = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources"))
-        top_packages_path = os.path.join(resourcesdir, top_packages_filename)
+        resources_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources"))
+        top_packages_path = os.path.join(resources_dir, top_packages_filename)
 
         top_packages_information = None
 
-        if top_packages_filename in os.listdir(resourcesdir):
+        if top_packages_filename in os.listdir(resources_dir):
             update_time = datetime.fromtimestamp(os.path.getmtime(top_packages_path))
 
             if datetime.now() - update_time <= timedelta(days=30):
@@ -71,7 +61,10 @@ def _get_top_packages(self) -> list:
 
             top_packages_information = response["rows"]
 
-        return top_packages_information
+        def get_safe_name(package):
+            return canonicalize_name(package["project"])
+
+        return list(map(get_safe_name, top_packages_information))
 
     def _is_distance_one_Levenshtein(self, name1, name2) -> bool:
         """
@@ -216,29 +209,26 @@ def get_typosquatted_package(self, package_name) -> list[str]:
             typosquatting from
         """
 
-        typosquatted = []
-
         # Get permuted typosquats for normalized and confused names
-        normalized_name = package_name.lower().replace("_", "-")
+        normalized_name = canonicalize_name(package_name)
+
+        if normalized_name in self.popular_packages:
+            return []
 
         # Go through popular packages and find length one edit typosquats
+        typosquatted = set()
         for popular_package in self.popular_packages:
-            normalized_popular_package = popular_package.lower().replace("_", "-")
-
-            if normalized_name == popular_package:
-                return []
-
-            if self._is_length_one_edit_away(normalized_name, normalized_popular_package):
-                typosquatted.append(popular_package)
+            if self._is_length_one_edit_away(normalized_name, popular_package):
+                typosquatted.add(popular_package)
 
-            alternate_popular_names = self._get_confused_forms(normalized_popular_package)
-            swapped_popular_names = self._generate_permutations(normalized_popular_package)
+            alternate_popular_names = self._get_confused_forms(popular_package)
+            swapped_popular_names = self._generate_permutations(popular_package)
 
             for name in alternate_popular_names + swapped_popular_names:
                 if self._is_length_one_edit_away(normalized_name, name):
-                    typosquatted.append(normalized_popular_package)
+                    typosquatted.add(popular_package)
 
-        return typosquatted
+        return list(typosquatted)
 
     def detect(self, package_info) -> tuple[bool, Optional[str]]:
         """
@@ -255,7 +245,7 @@ def detect(self, package_info) -> tuple[bool, Optional[str]]:
         """
         similar_package_names = self.get_typosquatted_package(package_info["info"]["name"])
         if len(similar_package_names) > 0:
-            return True, "This package closely ressembles the following package names, and might be a typosquatting " \
+            return True, "This package closely resembles the following package names, and might be a typosquatting " \
                          "attempt: " + ", ".join(similar_package_names)
 
         return False, None
diff --git a/requirements.txt b/requirements.txt
index c79e915f..3342e216 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -54,3 +54,5 @@ ujson==5.4.0 ; python_version >= "3.10" and python_version < "4"
 urllib3==1.26.11 ; python_version >= "3.10" and python_version < "4"
 wcmatch==8.4 ; python_version >= "3.10" and python_version < "4"
 websocket-client==1.3.3 ; python_version >= "3.10" and python_version < "4"
+
+packaging~=21.3
\ No newline at end of file
diff --git a/tests/analyzer/metadata/test_typosquatting.py b/tests/analyzer/metadata/test_typosquatting.py
index 47c22374..3e0ab401 100644
--- a/tests/analyzer/metadata/test_typosquatting.py
+++ b/tests/analyzer/metadata/test_typosquatting.py
@@ -42,3 +42,22 @@ def test_nontyposquats(self, name):
         project_info = generate_project_info("name", name)
         matches, _ = self.detector.detect(project_info)
         assert not matches
+
+    def test_no_duplicate_errors(self):
+        """
+        Verify that a package with a typo in the name only reports 1 error
+
+        Regression test for https://github.com/DataDog/guarddog/issues/71
+        """
+        result = self.detector.get_typosquatted_package("pdfminer.sid")
+        assert len(result) == 1
+
+    def test_normalize_names(self):
+        """
+        Verify that a package with 1 or more dots(.), hyphens(-) or underscore(_) gets normalized
+        to avoid false positives
+
+        Regression test for https://github.com/DataDog/guarddog/issues/71
+        """
+        result = self.detector.get_typosquatted_package("pdfminer...---___six")
+        assert len(result) == 0