Skip to content

Commit

Permalink
Fix false positives and duplicate errors in the typosquatting algorithm (#108)
Browse files Browse the repository at this point in the history

* Fixed bug in typosquatting algorithm that may duplicate package name if found to be malicious

* Exiting early from typosquatting algorithm if the requested package is one of the top packages to avoid expensive computation

* Using canonicalize_name() in typosquatting algorithm to adhere to PEP and avoid false positives

* Refactored for better encapsulation and coding conventions, fixed typos, and added the missing superclass constructor call

* Created tests for issue 71
  • Loading branch information
QuinceyJames authored Dec 12, 2022
1 parent e2e05c7 commit 67963ee
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 32 deletions.
54 changes: 22 additions & 32 deletions guarddog/analyzer/metadata/typosquatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from datetime import datetime, timedelta
from itertools import permutations
from typing import Optional

from packaging.utils import canonicalize_name
import requests

from guarddog.analyzer.metadata.detector import Detector
Expand All @@ -20,18 +20,8 @@ class TyposquatDetector(Detector):
"""

def __init__(self) -> None:
# Find top PyPI packages
top_packages_information = self._get_top_packages()

# Get list of popular packages
self.popular_packages = []

for package in top_packages_information:
name = package["project"]
normalized_name = name.lower().replace("_", "-")
self.popular_packages.append(normalized_name)

super()
self.popular_packages = self._get_top_packages() # Find top PyPI packages
super().__init__() # Call constructor

def _get_top_packages(self) -> list:
"""
Expand All @@ -52,12 +42,12 @@ def _get_top_packages(self) -> list:
popular_packages_url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"

top_packages_filename = "top_pypi_packages.json"
resourcesdir = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources"))
top_packages_path = os.path.join(resourcesdir, top_packages_filename)
resources_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "resources"))
top_packages_path = os.path.join(resources_dir, top_packages_filename)

top_packages_information = None

if top_packages_filename in os.listdir(resourcesdir):
if top_packages_filename in os.listdir(resources_dir):
update_time = datetime.fromtimestamp(os.path.getmtime(top_packages_path))

if datetime.now() - update_time <= timedelta(days=30):
Expand All @@ -71,7 +61,10 @@ def _get_top_packages(self) -> list:

top_packages_information = response["rows"]

return top_packages_information
def get_safe_name(package):
    # Each entry of the downloaded list carries its name under "project";
    # canonicalizing it here lets later comparisons use a single spelling.
    project_name = package["project"]
    return canonicalize_name(project_name)

return list(map(get_safe_name, top_packages_information))

def _is_distance_one_Levenshtein(self, name1, name2) -> bool:
"""
Expand Down Expand Up @@ -216,29 +209,26 @@ def get_typosquatted_package(self, package_name) -> list[str]:
typosquatting from
"""

typosquatted = []

# Get permuted typosquats for normalized and confused names
normalized_name = package_name.lower().replace("_", "-")
normalized_name = canonicalize_name(package_name)

if normalized_name in self.popular_packages:
return []

# Go through popular packages and find length one edit typosquats
typosquatted = set()
for popular_package in self.popular_packages:
normalized_popular_package = popular_package.lower().replace("_", "-")

if normalized_name == popular_package:
return []

if self._is_length_one_edit_away(normalized_name, normalized_popular_package):
typosquatted.append(popular_package)
if self._is_length_one_edit_away(normalized_name, popular_package):
typosquatted.add(popular_package)

alternate_popular_names = self._get_confused_forms(normalized_popular_package)
swapped_popular_names = self._generate_permutations(normalized_popular_package)
alternate_popular_names = self._get_confused_forms(popular_package)
swapped_popular_names = self._generate_permutations(popular_package)

for name in alternate_popular_names + swapped_popular_names:
if self._is_length_one_edit_away(normalized_name, name):
typosquatted.append(normalized_popular_package)
typosquatted.add(popular_package)

return typosquatted
return list(typosquatted)

def detect(self, package_info) -> tuple[bool, Optional[str]]:
"""
Expand All @@ -255,7 +245,7 @@ def detect(self, package_info) -> tuple[bool, Optional[str]]:
"""
similar_package_names = self.get_typosquatted_package(package_info["info"]["name"])
if len(similar_package_names) > 0:
return True, "This package closely ressembles the following package names, and might be a typosquatting " \
return True, "This package closely resembles the following package names, and might be a typosquatting " \
"attempt: " + ", ".join(similar_package_names)

return False, None
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,5 @@ ujson==5.4.0 ; python_version >= "3.10" and python_version < "4"
urllib3==1.26.11 ; python_version >= "3.10" and python_version < "4"
wcmatch==8.4 ; python_version >= "3.10" and python_version < "4"
websocket-client==1.3.3 ; python_version >= "3.10" and python_version < "4"

packaging~=21.3
19 changes: 19 additions & 0 deletions tests/analyzer/metadata/test_typosquatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,22 @@ def test_nontyposquats(self, name):
project_info = generate_project_info("name", name)
matches, _ = self.detector.detect(project_info)
assert not matches

def test_no_duplicate_errors(self):
    """
    A package name containing a single typo must be reported exactly once.

    Regression test for https://github.com/DataDog/guarddog/issues/71
    """
    misspelled_name = "pdfminer.sid"
    matches = self.detector.get_typosquatted_package(misspelled_name)
    assert len(matches) == 1

def test_normalize_names(self):
    """
    Runs of dots (.), hyphens (-) and underscores (_) in a package name
    must be normalized before comparison, so that an alternate spelling of
    a legitimate package is not flagged as a typosquat (false positive).

    Regression test for https://github.com/DataDog/guarddog/issues/71
    """
    # NOTE(review): presumably this canonicalizes to a name present in the
    # top-packages list (pdfminer-six) — confirm against the resource file.
    separator_heavy_name = "pdfminer...---___six"
    matches = self.detector.get_typosquatted_package(separator_heavy_name)
    assert len(matches) == 0

0 comments on commit 67963ee

Please sign in to comment.