hassansaei · hassansaei · Jan 13, 2025 · Jan 11, 2025 · Jan 11, 2025 · Jan 11, 2025
diff --git a/tests/test_data_config.json b/tests/test_data_config.json
@@ -208,7 +208,7 @@
             "value": 0.05869074492099323,
             "tolerance_percentage": 5
           },
-          "Confidence": "High_Precision"
+          "Confidence": "High_Precision*"
         },
         "check_igv_report": true
       },
@@ -248,7 +248,7 @@
             "value": 0.02168057579370336,
             "tolerance_percentage": 5
           },
-          "Confidence": "High_Precision"
+          "Confidence": "High_Precision*"
         },
         "check_igv_report": true
       },
@@ -288,7 +288,7 @@
             "value": 0.010855245823892079,
             "tolerance_percentage": 5
           },
-          "Confidence": "High_Precision"
+          "Confidence": "High_Precision*"
         },
         "check_igv_report": false
       }

diff --git a/tests/unit/test_scoring.py b/tests/unit/test_scoring.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+# tests/unit/test_scoring.py
+
+"""
+Unit tests for the scoring functionality in vntyper/scripts/scoring.py.
+Validates frame-score calculations, depth splitting, and frameshift extraction.
+"""
+
+import pytest
+import pandas as pd
+import numpy as np
+
+from vntyper.scripts.scoring import (
+    split_depth_and_calculate_frame_score,
+    split_frame_score,
+    extract_frameshifts,
+)
+
+
+@pytest.mark.parametrize("df_input,expected_len", [
+    (pd.DataFrame(), 0),  # single case: no columns, no rows
+])
+def test_split_depth_and_calculate_frame_score_empty_df(df_input, expected_len):
+    """
+    Check that a DataFrame with no rows and no columns comes back with length 0.
+    """
+    out = split_depth_and_calculate_frame_score(df_input)
+    assert len(out) == expected_len, (
+        "Empty input should yield empty output after split_depth_and_calculate_frame_score."
+    )
+
+
+def test_split_depth_and_calculate_frame_score_no_frameshift():
+    """
+    Check that a variant whose length difference (ALT - REF) is a multiple
+    of 3 is absent from the output, i.e. it is treated as a non-frameshift.
+    """
+    df = pd.DataFrame({
+        "Sample": ["Del:10:100"],   # colon-separated label:number:number; the leading 'Del' label is reportedly unused (verify in scoring.py)
+        "REF": ["ATG"],            # length 3
+        "ALT": ["ATGATG"],         # length 6  -> difference = 3 -> multiple of 3
+        "Motifs": ["mock_motif"],
+        "Variant": ["mock_variant"],
+        "POS": [123],
+        "Motif_sequence": ["mock_sequence"]
+    })
+    out = split_depth_and_calculate_frame_score(df)
+    # A multiple-of-3 difference is expected to give is_frameshift == False, so the row is filtered out
+    assert out.empty, (
+        "Variants with multiple-of-3 difference should be filtered out as non-frameshift."
+    )
+
+
+def test_split_depth_and_calculate_frame_score_frameshift():
+    """
+    Check that a variant whose length difference (ALT - REF) is not a multiple
+    of 3 is kept, with 'Frame_Score' and an all-True 'is_frameshift' column.
+    """
+    df = pd.DataFrame({
+        "Sample": ["Del:50:500"],
+        "REF": ["ATG"],            # length 3
+        "ALT": ["ATGA"],           # length 4 -> difference = 1 -> frameshift
+        "Motifs": ["mock_motif"],
+        "Variant": ["mock_variant"],
+        "POS": [456],
+        "Motif_sequence": ["mock_sequence"]
+    })
+    out = split_depth_and_calculate_frame_score(df)
+    assert not out.empty, "Expected to retain a frameshift variant (difference not multiple of 3)."
+    assert "Frame_Score" in out.columns, "Output should have a 'Frame_Score' column."
+    # The frameshift flag must be present and True for every surviving row
+    assert "is_frameshift" in out.columns, "Output should have 'is_frameshift' marking frameshift or not."
+    assert all(out["is_frameshift"]), "All retained rows should be frameshift variants."
+
+
+def test_split_frame_score_empty_df():
+    """
+    Check that split_frame_score returns an empty result for a DataFrame with no rows and no columns.
+    """
+    df = pd.DataFrame()
+    out = split_frame_score(df)
+    assert out.empty, "Empty input should yield empty output after split_frame_score."
+
+
+def test_split_frame_score_basic():
+    """
+    Check that split_frame_score yields the expected 'direction' and 'frameshift_amount' for one insertion and one deletion.
+    """
+    df = pd.DataFrame({
+        "Frame_Score": [1.0, -2.0],  # placeholder values; expectations below derive from the length columns (assumption - confirm in scoring.py)
+        "ref_len": [3, 6],
+        "alt_len": [4, 4],          # alt_len - ref_len => [1, -2]
+        "is_frameshift": [True, True]  # both rows are marked as frameshifts, as an earlier pipeline step would presumably do
+    })
+    out = split_frame_score(df)
+
+    # split_frame_score is expected to drop the helper columns 'is_frameshift', 'ref_len', 'alt_len' (not asserted here).
+    # Only the presence of 'Frame_Score', 'direction' and 'frameshift_amount' is checked.
+    expected_columns = {"Frame_Score", "direction", "frameshift_amount"}
+    assert expected_columns.issubset(set(out.columns)), (
+        f"Output must contain at least: {expected_columns}"
+    )
+
+    # Semantics the assertions below rely on:
+    #   direction = sign(alt_len - ref_len); frameshift_amount = abs(alt_len - ref_len) % 3
+    #   row 0: alt_len - ref_len =  1 => direction =  1, frameshift_amount = 1
+    #   row 1: alt_len - ref_len = -2 => direction = -1, frameshift_amount = 2
+    assert out.loc[0, "direction"] == 1, "Expected direction=1 for alt_len-ref_len=1."
+    assert out.loc[0, "frameshift_amount"] == 1, "Expected frameshift_amount=1 for difference=1."
+
+    assert out.loc[1, "direction"] == -1, "Expected direction=-1 for alt_len-ref_len=-2."
+    assert out.loc[1, "frameshift_amount"] == 2, "Expected frameshift_amount=2 for difference=-2."
+
+
+def test_extract_frameshifts_empty_df():
+    """
+    Check that extract_frameshifts returns an empty result for a DataFrame with no rows and no columns.
+    """
+    df = pd.DataFrame()
+    out = extract_frameshifts(df)
+    assert out.empty, "Empty input should yield empty output after extract_frameshifts."
+
+
+def test_extract_frameshifts_mixed():
+    """
+    Check that extract_frameshifts keeps only insertions (direction > 0) with
+    frameshift_amount 1 and deletions (direction < 0) with frameshift_amount 2.
+    """
+    df = pd.DataFrame({
+        "direction": [1, 1, -1, -1, 1],
+        "frameshift_amount": [1, 2, 2, 1, 1],
+        "Frame_Score": [0.33, 0.66, -0.66, -0.33, 0.33],
+        "Variant": ["ins_ok", "ins_wrong", "del_ok", "del_wrong", "ins_ok2"]
+    })
+    # accepted insertion => direction > 0 and frameshift_amount == 1
+    # accepted deletion  => direction < 0 and frameshift_amount == 2
+    out = extract_frameshifts(df)
+    # Rows 0 (ins_ok), 2 (del_ok) and 4 (ins_ok2) should survive;
+    # rows 1 (ins_wrong) and 3 (del_wrong) should be dropped.
+    assert len(out) == 3, "Expected to keep 3 rows that match frameshift patterns."
+    assert sorted(out["Variant"].tolist()) == sorted(["ins_ok", "del_ok", "ins_ok2"]), (
+        "Wrong set of frameshift variants retained."
+    )
diff --git a/tests/unit/test_variant_parsing.py b/tests/unit/test_variant_parsing.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# tests/unit/test_variant_parsing.py
+
+"""
+Unit tests for variant_parsing.py, focusing on:
+  filter_by_alt_values_and_finalize()
+
+Ensures that ALT-based filtering rules are correctly applied:
+  - rows with ALT 'GG' are kept only if they meet a minimum Depth_Score.
+  - rows whose ALT is listed in 'exclude_alts' are removed from the final DataFrame.
+  - the 'left' and 'right' columns are dropped from the final DataFrame.
+"""
+
+import pytest
+import pandas as pd
+from vntyper.scripts.variant_parsing import filter_by_alt_values_and_finalize
+
+
+@pytest.fixture
+def kestrel_config_mock():
+    """
+    Return a minimal kestrel_config dict holding only the 'alt_filtering' section
+    that these tests pass to filter_by_alt_values_and_finalize().
+    """
+    return {
+        "alt_filtering": {
+            "gg_alt_value": "GG",
+            "gg_depth_score_threshold": 0.02,  # tests expect 'GG' rows below this Depth_Score to be removed
+            "exclude_alts": ["BAD_ALT", "ZZZ"]  # tests expect rows with these ALT values to be removed
+        }
+    }
+
+
+def test_filter_by_alt_values_empty_df(kestrel_config_mock):
+    """
+    Check that a DataFrame with no rows and no columns is returned empty, without raising.
+    """
+    df = pd.DataFrame()
+    out = filter_by_alt_values_and_finalize(df, kestrel_config_mock)
+    assert out.empty, "Empty input should yield empty output."
+
+
+def test_filter_by_alt_values_missing_columns(kestrel_config_mock):
+    """
+    Check that a frame lacking the 'Depth_Score' column raises a KeyError mentioning "Missing required columns".
+    """
+    df = pd.DataFrame({
+        "ALT": ["GG", "ABC"],
+        # 'Depth_Score' is deliberately omitted; the missing-'ALT' case is not exercised by this test
+    })
+
+    with pytest.raises(KeyError) as exc_info:
+        filter_by_alt_values_and_finalize(df, kestrel_config_mock)
+
+    assert "Missing required columns" in str(exc_info.value), (
+        "Expected KeyError due to missing required 'Depth_Score' column."
+    )
+
+
+def test_filter_by_alt_values_gg_filter_below_threshold(kestrel_config_mock):
+    """
+    Check that an ALT='GG' row is removed when its Depth_Score is below the configured threshold (0.02) and kept at the threshold.
+    """
+    df = pd.DataFrame({
+        "ALT": ["GG", "GG", "XYZ"],
+        "Depth_Score": [0.019, 0.02, 0.5]  # 0.019 < threshold => remove, 0.02 >= threshold => keep
+    })
+
+    out = filter_by_alt_values_and_finalize(df, kestrel_config_mock)
+    # Row 0: GG with Depth_Score=0.019 (< 0.02)      => removed
+    # Row 1: GG with Depth_Score=0.02 (at threshold) => kept
+    # Row 2: ALT=XYZ is not subject to the GG filter => kept
+    assert len(out) == 2, (
+        "Expected only 2 rows to remain after removing GG with insufficient Depth_Score."
+    )
+    # Restrict the check to 'GG' rows so it matches the rule (and the message) under test
+    assert (out.loc[out["ALT"] == "GG", "Depth_Score"] < 0.02).sum() == 0, "No row should have Depth_Score < 0.02 for 'GG' alt."
+
+
+def test_filter_by_alt_values_exclude_alts(kestrel_config_mock):
+    """
+    Check that rows whose ALT is listed in the config's 'exclude_alts' are absent from the output.
+    """
+    df = pd.DataFrame({
+        "ALT": ["GG", "BAD_ALT", "OK_ALT", "ZZZ", "ANOTHER"],
+        "Depth_Score": [0.5, 0.1, 0.3, 0.2, 0.6]  # the single 'GG' row (0.5) is above the 0.02 threshold used by the fixture
+    })
+
+    out = filter_by_alt_values_and_finalize(df, kestrel_config_mock)
+    # Excluded by config: "BAD_ALT" and "ZZZ"
+    # Expected to remain: "GG", "OK_ALT", "ANOTHER"
+    kept_alts = out["ALT"].tolist()
+    assert len(out) == 3, "Expected 3 ALTs after excluding 'BAD_ALT' and 'ZZZ'."
+    assert "BAD_ALT" not in kept_alts, "'BAD_ALT' should be removed."
+    assert "ZZZ" not in kept_alts, "'ZZZ' should be removed."
+
+
+def test_filter_by_alt_values_drop_left_right(kestrel_config_mock):
+    """
+    Check that the 'left' and 'right' columns are absent from the output while both input rows are kept.
+    """
+    df = pd.DataFrame({
+        "ALT": ["GG", "ABC"],
+        "Depth_Score": [0.05, 0.02],
+        "left": ["some_left_data", "some_left_data"],
+        "right": ["some_right_data", "some_right_data"]
+    })
+    out = filter_by_alt_values_and_finalize(df, kestrel_config_mock)
+
+    # Both rows should survive: 'GG' has Depth_Score=0.05 (above the 0.02 threshold),
+    # and "ABC" is not listed in exclude_alts.
+    assert len(out) == 2, "Expected 2 rows total."
+    assert "left" not in out.columns and "right" not in out.columns, (
+        "Expected the 'left' and 'right' columns to be dropped."
+    )