Senopiece · eivankin · Jul 4, 2024 · Jul 4, 2024 · Jul 4, 2024 · Jul 4, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,178 @@
+name: CI/CD Workflow
+on: push
+jobs:
+  # Runs the ruff action with `format --check`, so formatting is verified
+  # without rewriting any files.
+  ruff-format:
+    name: Check formatting
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: chartboost/ruff-action@v1
+        with:
+          args: format --check
+
+  # Runs the ruff action with its default arguments
+  # (presumably `ruff check`; confirm against the action's documentation).
+  ruff-check:
+    name: Run linter
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: chartboost/ruff-action@v1
+
+  # Installs the project with Poetry, fetches the built-in datasets, runs the
+  # test script against both of them, then builds the distributions and stores
+  # them as an artifact for the publishing jobs below.
+  test-and-build:
+    name: Test & build Python package
+    runs-on: ubuntu-latest
+    needs: [ruff-check, ruff-format]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      # Install Python with pip dependency caching enabled.
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+
+      - name: Install poetry
+        uses: abatilo/actions-poetry@v2
+
+      # Keep the virtualenv inside the project so the cache step can find it.
+      - name: Configure poetry
+        working-directory: ./pwdkek-python
+        run: |
+          poetry config virtualenvs.create true --local
+          poetry config virtualenvs.in-project true --local
+
+      - uses: actions/cache@v4
+        name: Enable dependencies cache
+        with:
+          path: ./pwdkek-python/.venv
+          key: venv-${{ hashFiles('pwdkek-python/poetry.lock') }}
+
+      - name: Install dependencies
+        working-directory: ./pwdkek-python
+        run: poetry install
+
+      # The key follows the dataset definitions, so the datasets are fetched
+      # again only when builtin_datasets.py changes.
+      - uses: actions/cache@v4
+        name: Enable datasets cache
+        with:
+          path: ./datasets
+          key: datasets-${{ hashFiles('pwdkek-python/pwdkek_python/builtin_datasets.py') }}
+
+      - name: Download datasets
+        run: |
+          cd pwdkek-python
+          export PYTHONPATH=$(pwd):$PYTHONPATH
+          python pwdkek_python/builtin_datasets.py
+
+      - name: Run tests (small dataset)
+        run: |
+          cd pwdkek-python
+          export PYTHONPATH=$(pwd):$PYTHONPATH
+          python tests/test.py --dataset small
+
+      - name: Run tests (big dataset)
+        run: |
+          cd pwdkek-python
+          export PYTHONPATH=$(pwd):$PYTHONPATH
+          python tests/test.py --dataset big
+
+      - name: Build package
+        working-directory: ./pwdkek-python
+        run: poetry build
+
+      # upload-artifact/download-artifact must share the same major version:
+      # artifacts written by v4 cannot be read by v3 and vice versa.
+      - name: Store the distribution packages
+        uses: actions/upload-artifact@v4
+        with:
+          name: python-package-distributions
+          path: ./pwdkek-python/dist/
+          if-no-files-found: error
+
+  # Uploads every successful build to TestPyPI via trusted publishing.
+  publish-to-testpypi:
+    name: Publish to TestPyPI
+    needs:
+      - test-and-build
+    runs-on: ubuntu-latest
+
+    environment:
+      name: testpypi
+      url: https://test.pypi.org/p/pwdkek-python
+
+    permissions:
+      id-token: write  # IMPORTANT: mandatory for trusted publishing
+
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: ./pwdkek-python/dist/
+      - name: Publish distribution 📦 to TestPyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          repository-url: https://test.pypi.org/legacy/
+          packages-dir: ./pwdkek-python/dist/
+          # This job runs on every push; without this the upload fails as
+          # soon as the current version already exists on TestPyPI.
+          skip-existing: true
+
+  # Uploads the build to PyPI, but only for tag pushes.
+  publish-to-pypi:
+    name: Publish to PyPI
+    if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
+    needs:
+      - test-and-build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/pwdkek-python
+    permissions:
+      id-token: write  # IMPORTANT: mandatory for trusted publishing
+
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: ./pwdkek-python/dist/
+      - name: Publish distribution 📦 to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          packages-dir: ./pwdkek-python/dist/
+
+  # Signs the distributions and attaches them to a GitHub Release for the tag.
+  # Runs only after publish-to-pypi, so it is skipped on non-tag pushes too.
+  github-release:
+    name: Sign and upload package to GitHub Release
+    needs:
+      - publish-to-pypi
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write  # IMPORTANT: mandatory for making GitHub Releases
+      id-token: write  # IMPORTANT: mandatory for sigstore
+
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: ./pwdkek-python/dist/
+      - name: Sign the dists with Sigstore
+        uses: sigstore/gh-action-sigstore-python@v3.0.0
+        with:
+          inputs: >-
+            ./pwdkek-python/dist/*.tar.gz
+            ./pwdkek-python/dist/*.whl
+      - name: Create GitHub Release
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: >-
+          gh release create
+          '${{ github.ref_name }}'
+          --repo '${{ github.repository }}'
+          --notes ""
+      - name: Upload artifact signatures to GitHub Release
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        # Upload to GitHub Release using the `gh` CLI.
+        # `dist/` contains the built packages together with the files
+        # produced by the Sigstore signing step.
+        run: >-
+          gh release upload
+          '${{ github.ref_name }}' ./pwdkek-python/dist/**
+          --repo '${{ github.repository }}'
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
diff --git a/datasets/.gitkeep b/datasets/.gitkeep
diff --git a/datasets/crackstation-human-only-utf8-filtered-sorted.txt.gz b/datasets/crackstation-human-only-utf8-filtered-sorted.txt.gz
diff --git a/datasets/rockyou-utf8-filtered-sorted.txt.gz b/datasets/rockyou-utf8-filtered-sorted.txt.gz
diff --git a/prepare_dataset.py b/prepare_dataset.py
diff --git a/pwdkek-python/README.md b/pwdkek-python/README.md
@@ -0,0 +1,3 @@
+## Pwd Kek
+
+Check how strong your password is
diff --git a/pwdkek-python/poetry.lock b/pwdkek-python/poetry.lock
diff --git a/pwdkek-python/pwdkek_python/__init__.py b/pwdkek-python/pwdkek_python/__init__.py
@@ -0,0 +1,11 @@
+from pwdkek_python.complexity_estimator import (
+    PasswordComplexityEstimator,
+    PasswordComplexityTiers,
+    PasswordComplexityEstimate,
+)
+
+__all__ = [
+    "PasswordComplexityEstimator",
+    "PasswordComplexityTiers",
+    "PasswordComplexityEstimate",
+]
diff --git a/pwdkek-python/pwdkek_python/__main__.py b/pwdkek-python/pwdkek_python/__main__.py
@@ -0,0 +1,60 @@
+import argparse
+from datetime import timedelta
+
+from pwdkek_python.complexity_estimator import PasswordComplexityEstimator
+from pwdkek_python.builtin_datasets import BuiltInDataset
+
+
+def main():
+    """Run the interactive password-strength prompt.
+
+    Parses ``--dataset`` (a built-in dataset name or a path to a dataset
+    file), builds a ``PasswordComplexityEstimator`` from it and then keeps
+    asking for passwords, printing the entropy, the estimated time to decode
+    at 1Gh/s and the tier of each one. The loop ends on Ctrl-C or when the
+    input stream is closed.
+    """
+    parser = argparse.ArgumentParser(description="Password Complexity Estimator")
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        help="Built-in dataset name or path to the dataset file",
+        default="small",
+    )
+    args = parser.parse_args()
+
+    print("Loading...")
+    try:
+        if args.dataset in BuiltInDataset.names():
+            dataset = BuiltInDataset[args.dataset.upper()]
+        else:
+            dataset = args.dataset
+        estimator = PasswordComplexityEstimator(dataset)
+    except (ValueError, OSError) as e:
+        # OSError covers a missing or unreadable dataset file and failed
+        # downloads, which would otherwise end in a traceback.
+        print("Error:", e)
+        return
+
+    try:
+        while True:
+            print()
+            try:
+                estimate = estimator.estimate(input("Enter a password: "))
+            except ValueError as e:
+                # Report the problem and ask for another password.
+                print(e)
+                continue
+
+            print("Password entropy:", estimate.entropy)
+
+            print("Time to decode with 1Gh/s: ", end="")
+            if estimate.ttd == timedelta.max:
+                print("Uncountable number of years")
+            else:
+                # Break the timedelta down by hand; a year is counted as 365 days.
+                ttd = estimate.ttd
+                years = ttd.days // 365
+                days = ttd.days % 365
+                hours, remainder = divmod(ttd.seconds, 3600)
+                minutes, seconds = divmod(remainder, 60)
+                print(
+                    f"{years} years {days} days {hours} hours {minutes} minutes {seconds} seconds",
+                )
+
+            print("Tier:", estimate.tier.value)
+
+    except (KeyboardInterrupt, EOFError):
+        # Ctrl-C or end of input (Ctrl-D / closed stdin) ends the session quietly.
+        pass
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pwdkek-python/pwdkek_python/builtin_datasets.py b/pwdkek-python/pwdkek_python/builtin_datasets.py
@@ -0,0 +1,74 @@
+import gzip
+from enum import Enum
+from io import BytesIO
+from tempfile import NamedTemporaryFile
+from typing import NamedTuple
+from pathlib import Path
+from urllib.request import urlopen, Request
+
+
+DATASET_ROOT = Path(__file__).parent.parent.parent / "datasets"
+
+
+class BuiltinDatasetInfo(NamedTuple):
+    """Local file name and download URL of one built-in dataset."""
+
+    # Name of the prepared dataset file inside DATASET_ROOT.
+    file_name: str
+    # URL the gzip-compressed source word list is downloaded from.
+    source_url: str
+
+    @property
+    def path(self) -> Path:
+        """Return the local dataset path, downloading the dataset first if it is missing."""
+        file_path = DATASET_ROOT / self.file_name
+        if not file_path.exists():
+            self.download()
+        return file_path
+
+    @staticmethod
+    def _remove_non_utf8(file_path: str) -> None:
+        """Rewrite ``file_path`` in place, dropping bytes that are not valid UTF-8."""
+        with open(file_path, "rb") as file:
+            data = file.read()
+        with open(file_path, "wb") as file:
+            file.write(data.decode("utf-8", errors="ignore").encode("utf-8"))
+
+    @staticmethod
+    def _unpack_tar(data: bytes) -> bytes:
+        """Return the contents of the first regular file in the tar archive ``data``.
+
+        Raises:
+            ValueError: if the archive holds no regular file.
+        """
+        import tarfile
+
+        with tarfile.open(fileobj=BytesIO(data), mode="r:") as archive:
+            for member in archive:
+                if member.isfile():
+                    return archive.extractfile(member).read()
+        raise ValueError("Archive contains no regular file")
+
+    def download(self) -> None:
+        """Download the source word list and write the prepared dataset file.
+
+        The payload is gunzipped (and unpacked when the URL names a tar
+        archive), stripped of invalid UTF-8 and passed to ``prepare_dataset``.
+        The result is written to a ``.part`` file and renamed at the end, so
+        an interrupted run never leaves a half-written dataset that ``path``
+        would accept as complete.
+        """
+        # Imported lazily: prepare_dataset imports complexity_estimator, which
+        # in turn imports this module.
+        from pwdkek_python.prepare_dataset import prepare_dataset
+
+        print("Downloading", self.source_url)
+        request = Request(self.source_url, headers={"User-Agent": "curl/8.3.0"})
+        with urlopen(request) as response:
+            compressed = response.read()
+        with gzip.open(BytesIO(compressed), "rb") as file:
+            data = file.read()
+        if self.source_url.endswith((".tar.gz", ".tgz")):
+            # Gunzipping a tarball yields a tar stream, not the word list itself.
+            data = self._unpack_tar(data)
+
+        DATASET_ROOT.mkdir(parents=True, exist_ok=True)
+        target_path = DATASET_ROOT / self.file_name
+        partial_path = target_path.with_name(target_path.name + ".part")
+
+        # delete=False because the file is reopened by name by the helpers below.
+        decompressed_file = NamedTemporaryFile(delete=False)
+        try:
+            with decompressed_file as tmp_file:
+                tmp_file.write(data)
+
+            print("Fixing encoding...")
+            self._remove_non_utf8(decompressed_file.name)
+
+            print("Preparing dataset...")
+            prepare_dataset(decompressed_file.name, partial_path)
+            partial_path.replace(target_path)
+        finally:
+            # Always clean up the scratch files, also when a step above failed.
+            Path(decompressed_file.name).unlink(missing_ok=True)
+            partial_path.unlink(missing_ok=True)
+        print("Done!")
+
+
+class BuiltInDataset(Enum):
+    """Datasets that can be referred to by name; each value is a ``BuiltinDatasetInfo``."""
+
+    SMALL = BuiltinDatasetInfo(
+        "rockyou-utf8-filtered-sorted.txt.gz",
+        "https://raw.githubusercontent.com/zacheller/rockyou/master/rockyou.txt.tar.gz",
+    )
+    BIG = BuiltinDatasetInfo(
+        "crackstation-human-only-utf8-filtered-sorted.txt.gz",
+        "http://download.g0tmi1k.com/wordlists/large/crackstation-human-only.txt.gz",
+    )
+
+    @classmethod
+    def names(cls):
+        """Return the lower-cased member names in definition order."""
+        # __members__ is the documented name -> member mapping; scanning
+        # vars(cls) depends on how the enum implementation stores members.
+        return [name.lower() for name in cls.__members__]
+
+
+if __name__ == "__main__":
+    # Touching ``path`` downloads any dataset that is not present yet. This is
+    # done outside ``assert`` so it still happens under ``python -O``.
+    print("Available datasets:", BuiltInDataset.names())
+    for dataset in (BuiltInDataset.SMALL, BuiltInDataset.BIG):
+        if not dataset.value.path.exists():
+            raise SystemExit(f"Dataset {dataset.name} is missing after download")
diff --git a/estimator.py → ...hon/pwdkek_python/complexity_estimator.py b/estimator.py → ...hon/pwdkek_python/complexity_estimator.py
@@ -1,11 +1,13 @@
-import argparse
 import bisect
 from dataclasses import dataclass
 from datetime import timedelta
 from enum import Enum
 import gzip
 from math import log2
 import string
+from pathlib import Path
+
+from pwdkek_python.builtin_datasets import BuiltInDataset
 
 PASSWORD_ALLOWED_CHARS = str(
     string.ascii_lowercase
@@ -33,8 +35,11 @@ class PasswordComplexityEstimate:
 class PasswordComplexityEstimator:
     def __init__(
         self,
-        dataset_path: str,
+        dataset_path: str | Path | BuiltInDataset,
     ):
+        if isinstance(dataset_path, BuiltInDataset):
+            dataset_path = dataset_path.value.path
+
         with gzip.open(dataset_path) as file:
             self._passwords = [line.decode() for line in file.readlines()]
 
@@ -111,54 +116,3 @@ def estimate(self, password: str):
             ttd,
             tier,
         )
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Password Complexity Estimator")
-    parser.add_argument(
-        "--dataset_path",
-        type=str,
-        help="Path to the dataset file",
-        default="datasets/rockyou-utf8-filtered-sorted.txt.gz",
-    )
-    args = parser.parse_args()
-
-    print("Loading...")
-    try:
-        estimator = PasswordComplexityEstimator(args.dataset_path)
-    except ValueError as e:
-        print("Error:", e)
-        return
-
-    try:
-        while True:
-            print()
-            try:
-                estimate = estimator.estimate(input("Enter a password: "))
-            except ValueError as e:
-                print(e)
-                continue
-
-            print("Password entropy:", estimate.entropy)
-
-            print("Time to decode with 1Gh/s: ", end="")
-            if estimate.ttd == timedelta.max:
-                print("Uncountable number of years")
-            else:
-                ttd = estimate.ttd
-                years = ttd.days // 365
-                days = ttd.days % 365
-                hours, remainder = divmod(ttd.seconds, 3600)
-                minutes, seconds = divmod(remainder, 60)
-                print(
-                    f"{years} years {days} days {hours} hours {minutes} minutes {seconds} seconds",
-                )
-
-            print("Tier:", estimate.tier.value)
-
-    except KeyboardInterrupt:
-        pass
-
-
-if __name__ == "__main__":
-    main()
diff --git a/pwdkek-python/pwdkek_python/prepare_dataset.py b/pwdkek-python/pwdkek_python/prepare_dataset.py
@@ -0,0 +1,25 @@
+import gzip
+import sys
+from pathlib import Path
+
+from pwdkek_python.complexity_estimator import PASSWORD_ALLOWED_CHARS
+
+
+def prepare_dataset(input_path: str | Path, output_path: str | Path) -> None:
+    """Filter, sort and gzip a word list.
+
+    Every line of ``input_path`` is reduced to the characters found in
+    ``PASSWORD_ALLOWED_CHARS``; lines that end up empty are dropped. The
+    remaining passwords are sorted and written to ``output_path`` as
+    gzip-compressed text, one password per line.
+
+    Args:
+        input_path: UTF-8 encoded text file with one password per line.
+        output_path: destination of the gzip-compressed result.
+    """
+    allowed_chars = set(PASSWORD_ALLOWED_CHARS)
+
+    # Read explicitly as UTF-8 instead of relying on the locale encoding.
+    with open(input_path, "r", encoding="utf-8") as file:
+        passwords = [
+            "".join(ch for ch in line if ch in allowed_chars) for line in file
+        ]
+
+    passwords = sorted(pwd for pwd in passwords if len(pwd) != 0)
+
+    # Written as UTF-8 so it matches how the estimator decodes the dataset.
+    with gzip.open(output_path, "wt", encoding="utf-8") as file:
+        file.writelines(password + "\n" for password in passwords)
+
+
+if __name__ == "__main__":
+    # NOTE: convert input to utf-8 beforehand
+    prepare_dataset(sys.argv[1], sys.argv[2])
diff --git a/pwdkek-python/pyproject.toml b/pwdkek-python/pyproject.toml
@@ -0,0 +1,23 @@
+[tool.poetry]
+name = "pwdkek-python"
+version = "0.1.0"
+description = ""
+authors = [
+    "Vitaly Mahonin",
+    "Evgenij Ivankin",
+    "Vladislav Kolpachev",
+    "Nikolay Kurichev"
+]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.11"
+
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.2.2"
+ruff = "^0.5.0"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/pwdkek-python/tests/__init__.py b/pwdkek-python/tests/__init__.py
diff --git a/test.py → pwdkek-python/tests/test.py b/test.py → pwdkek-python/tests/test.py
@@ -1,43 +1,59 @@
 from datetime import timedelta
-from estimator import *
+
+from pwdkek_python.builtin_datasets import BuiltInDataset
+from pwdkek_python.complexity_estimator import (
+    PasswordComplexityTiers,
+    PasswordComplexityEstimator,
+)
 import json
 import argparse
 
+
 # Функция для загрузки паролей и их тиров из JSON файла
 def load_passwords(filename):
-    with open(filename, 'r', encoding='utf-8') as file:
+    with open(filename, "r", encoding="utf-8") as file:
         data = json.load(file)
     return data
 
+
 # Функция для тестирования паролей
 def test_passwords(filename):
-    parser = argparse.ArgumentParser(description="Password Complexity Estimator Testing")
+    parser = argparse.ArgumentParser(
+        description="Password Complexity Estimator Testing"
+    )
     parser.add_argument(
-        "--dataset_path",
+        "--dataset",
         type=str,
-        help="Path to the dataset file",
-        default="datasets/rockyou-utf8-filtered-sorted.txt.gz",
+        help="Built-in dataset name or path to the dataset file",
+        default="small",
     )
     args = parser.parse_args()
 
     password_data = load_passwords(filename)
-    estimator = PasswordComplexityEstimator(args.dataset_path)
+
+    if args.dataset in BuiltInDataset.names():
+        dataset = BuiltInDataset[args.dataset.upper()]
+    else:
+        dataset = args.dataset
+
+    estimator = PasswordComplexityEstimator(dataset)
 
     # Количество совпадений по тирам
-    correct_predictions = {tier_name.value : 0 for tier_name in PasswordComplexityTiers}
-    
+    correct_predictions = {tier_name.value: 0 for tier_name in PasswordComplexityTiers}
+
     # cумма предсказаний для самых плохих паролей
     sum_pathetic_pred = 0
-    tier_to_num = {tier_name.value : i for i, tier_name in enumerate(PasswordComplexityTiers)}
+    tier_to_num = {
+        tier_name.value: i for i, tier_name in enumerate(PasswordComplexityTiers)
+    }
     pathetic_tier = [tier_name.value for tier_name in PasswordComplexityTiers][0]
-    
+
     # число правильно определeнных самых плохих паролей
     cnt_correct_pathetic_pred = 0
 
     for expected_tier, passwords in password_data.items():
         print(f"Testing {expected_tier} passwords:")
         for password in passwords:
-
             try:
                 estimate = estimator.estimate(password)
             except ValueError as e:
@@ -49,24 +65,30 @@ def test_passwords(filename):
             if ttd == timedelta.max:
                 ttd = "Uncountable number of years"
 
-            print(f"Password: {password}, Predicted Tier: {predicted_tier}, Entropy: {round(entropy, 3)}, TTD: {ttd}")
+            print(
+                f"Password: {password}, Predicted Tier: {predicted_tier}, Entropy: {round(entropy, 3)}, TTD: {ttd}"
+            )
 
             if expected_tier == predicted_tier:
                 correct_predictions[predicted_tier] += 1
             if expected_tier == pathetic_tier:
                 sum_pathetic_pred += tier_to_num[predicted_tier]
                 if expected_tier == predicted_tier:
                     cnt_correct_pathetic_pred += 1
-                
-        print('-' * 100)
+
+        print("-" * 100)
     print("\nResults:")
     for tier_name, correct_count in correct_predictions.items():
         if tier_name in password_data:
-            print(f"Correct predictions for {tier_name} tier: {correct_count / len(password_data[tier_name])}")
-
-    print(f"\nAverage predictions for {pathetic_tier} passwords: {sum_pathetic_pred / len(password_data[pathetic_tier])}. must be {tier_to_num[pathetic_tier]}")
+            print(
+                f"Correct predictions for {tier_name} tier: {correct_count / len(password_data[tier_name])}"
+            )
+
+    print(
+        f"\nAverage predictions for {pathetic_tier} passwords: {sum_pathetic_pred / len(password_data[pathetic_tier])}. must be {tier_to_num[pathetic_tier]}"
+    )
     print("Recall:", cnt_correct_pathetic_pred / len(password_data[pathetic_tier]))
-
+
+
 if __name__ == "__main__":
-    test_passwords('test_passwords.json')
-
+    test_passwords("tests/test_files/test_passwords.json")
diff --git a/test_passwords.json → ...thon/tests/test_files/test_passwords.json b/test_passwords.json → ...thon/tests/test_files/test_passwords.json