Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix missing columns in kestrel_result.tsv and implement detailed screening summary #78

Merged
merged 10 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions tests/test_data_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@
"value": 0.05869074492099323,
"tolerance_percentage": 5
},
- "Confidence": "High_Precision"
+ "Confidence": "High_Precision*"
},
"check_igv_report": true
},
Expand Down Expand Up @@ -248,7 +248,7 @@
"value": 0.02168057579370336,
"tolerance_percentage": 5
},
- "Confidence": "High_Precision"
+ "Confidence": "High_Precision*"
},
"check_igv_report": true
},
Expand Down Expand Up @@ -288,7 +288,7 @@
"value": 0.010855245823892079,
"tolerance_percentage": 5
},
- "Confidence": "High_Precision"
+ "Confidence": "High_Precision*"
},
"check_igv_report": false
}
Expand Down
143 changes: 143 additions & 0 deletions tests/unit/test_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python3
# tests/unit/test_scoring.py

"""
Unit tests for the scoring functionality in vntyper/scripts/scoring.py.
Validates frame-score calculations, depth splitting, and frameshift extraction.
"""

import pytest
import pandas as pd
import numpy as np

from vntyper.scripts.scoring import (
split_depth_and_calculate_frame_score,
split_frame_score,
extract_frameshifts,
)


@pytest.mark.parametrize(
    "df_input,expected_len",
    [
        (pd.DataFrame(), 0),
    ],
)
def test_split_depth_and_calculate_frame_score_empty_df(df_input, expected_len):
    """
    An empty DataFrame fed to split_depth_and_calculate_frame_score must come
    back with the expected (zero) number of rows.
    """
    result = split_depth_and_calculate_frame_score(df_input)
    row_count = len(result)
    assert row_count == expected_len, (
        "Empty input should yield empty output after split_depth_and_calculate_frame_score."
    )


def test_split_depth_and_calculate_frame_score_no_frameshift():
    """
    A variant whose length change (ALT length - REF length) is divisible by 3
    is expected to be discarded as a non-frameshift.
    """
    # REF has 3 bases and ALT has 6, so the length difference is 3 (a multiple of 3).
    # "Sample" is a colon-separated value; how it is parsed is decided inside the
    # function under test and is not checked here.
    single_variant = {
        "Sample": ["Del:10:100"],
        "REF": ["ATG"],
        "ALT": ["ATGATG"],
        "Motifs": ["mock_motif"],
        "Variant": ["mock_variant"],
        "POS": [123],
        "Motif_sequence": ["mock_sequence"],
    }
    result = split_depth_and_calculate_frame_score(pd.DataFrame(single_variant))

    # The only input row is an in-frame change, so nothing should survive.
    assert result.empty, (
        "Variants with multiple-of-3 difference should be filtered out as non-frameshift."
    )


def test_split_depth_and_calculate_frame_score_frameshift():
    """
    A variant whose length change (ALT length - REF length) is not divisible
    by 3 is expected to be kept, with 'Frame_Score' and 'is_frameshift'
    columns present in the output.
    """
    # REF has 3 bases and ALT has 4, so the length difference is 1 (a frameshift).
    single_variant = {
        "Sample": ["Del:50:500"],
        "REF": ["ATG"],
        "ALT": ["ATGA"],
        "Motifs": ["mock_motif"],
        "Variant": ["mock_variant"],
        "POS": [456],
        "Motif_sequence": ["mock_sequence"],
    }
    result = split_depth_and_calculate_frame_score(pd.DataFrame(single_variant))

    assert not result.empty, "Expected to retain a frameshift variant (difference not multiple of 3)."

    produced_columns = result.columns
    assert "Frame_Score" in produced_columns, "Output should have a 'Frame_Score' column."
    assert "is_frameshift" in produced_columns, "Output should have 'is_frameshift' marking frameshift or not."

    # Every surviving row must be flagged as a frameshift.
    assert result["is_frameshift"].all(), "All retained rows should be frameshift variants."


def test_split_frame_score_empty_df():
    """
    split_frame_score given an empty DataFrame must return an empty result.
    """
    result = split_frame_score(pd.DataFrame())
    assert result.empty, "Empty input should yield empty output after split_frame_score."


def test_split_frame_score_basic():
    """
    Check that split_frame_score derives 'direction' and 'frameshift_amount'
    for one insertion row and one deletion row.
    """
    # alt_len - ref_len is +1 for row 0 and -2 for row 1.
    # Frame_Score values are placeholders; both rows are pre-marked as frameshifts.
    frame = pd.DataFrame(
        {
            "Frame_Score": [1.0, -2.0],
            "ref_len": [3, 6],
            "alt_len": [4, 4],
            "is_frameshift": [True, True],
        }
    )
    result = split_frame_score(frame)

    # Only the presence of these columns is asserted; extra columns are allowed.
    required = {"Frame_Score", "direction", "frameshift_amount"}
    assert required <= set(result.columns), (
        f"Output must contain at least: {required}"
    )

    # Expected values per the test's model of the function:
    #   direction         = sign(alt_len - ref_len)
    #   frameshift_amount = abs(alt_len - ref_len) % 3
    checks = [
        (0, "direction", 1, "Expected direction=1 for alt_len-ref_len=1."),
        (0, "frameshift_amount", 1, "Expected frameshift_amount=1 for difference=1."),
        (1, "direction", -1, "Expected direction=-1 for alt_len-ref_len=-2."),
        (1, "frameshift_amount", 2, "Expected frameshift_amount=2 for difference=-2."),
    ]
    for row_label, column, expected, message in checks:
        assert result.loc[row_label, column] == expected, message


def test_extract_frameshifts_empty_df():
    """
    extract_frameshifts given an empty DataFrame must return an empty result.
    """
    result = extract_frameshifts(pd.DataFrame())
    assert result.empty, "Empty input should yield empty output after extract_frameshifts."


def test_extract_frameshifts_mixed():
    """
    From a mix of rows, only insertions (direction > 0) with
    frameshift_amount == 1 and deletions (direction < 0) with
    frameshift_amount == 2 are expected to be kept.
    """
    # Rows 0, 2 and 4 match the accepted patterns; rows 1 and 3 do not.
    candidates = pd.DataFrame(
        {
            "direction": [1, 1, -1, -1, 1],
            "frameshift_amount": [1, 2, 2, 1, 1],
            "Frame_Score": [0.33, 0.66, -0.66, -0.33, 0.33],
            "Variant": ["ins_ok", "ins_wrong", "del_ok", "del_wrong", "ins_ok2"],
        }
    )
    result = extract_frameshifts(candidates)

    assert len(result) == 3, "Expected to keep 3 rows that match frameshift patterns."

    # Compare as sorted lists so the output row order does not matter.
    retained = sorted(result["Variant"].tolist())
    wanted = sorted(["ins_ok", "del_ok", "ins_ok2"])
    assert retained == wanted, (
        "Wrong set of frameshift variants retained."
    )
115 changes: 115 additions & 0 deletions tests/unit/test_variant_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python3
# tests/unit/test_variant_parsing.py

"""
Unit tests for variant_parsing.py, focusing on:
filter_by_alt_values_and_finalize()

Ensures that ALT-based filtering rules are correctly applied:
- 'GG' alt requires a minimum Depth_Score.
- exclude_alts removed from final DataFrame.
- left/right columns dropped at the end.
"""

import pytest
import pandas as pd
from vntyper.scripts.variant_parsing import filter_by_alt_values_and_finalize


@pytest.fixture
def kestrel_config_mock():
    """
    Minimal stand-in for the kestrel configuration used by the ALT filtering
    tests. Only the 'alt_filtering' section is supplied.
    """
    alt_filtering = {
        "gg_alt_value": "GG",
        "gg_depth_score_threshold": 0.02,
        "exclude_alts": ["BAD_ALT", "ZZZ"],
    }
    return {"alt_filtering": alt_filtering}


def test_filter_by_alt_values_empty_df(kestrel_config_mock):
    """
    An empty DataFrame must pass through filter_by_alt_values_and_finalize
    without raising and come back empty.
    """
    result = filter_by_alt_values_and_finalize(pd.DataFrame(), kestrel_config_mock)
    assert result.empty, "Empty input should yield empty output."


def test_filter_by_alt_values_missing_columns(kestrel_config_mock):
    """
    A non-empty DataFrame lacking a required column ('Depth_Score' here) is
    expected to raise KeyError with a "Missing required columns" message.
    """
    # Only 'ALT' is supplied; 'Depth_Score' is left out on purpose.
    incomplete = pd.DataFrame({"ALT": ["GG", "ABC"]})

    with pytest.raises(KeyError) as exc_info:
        filter_by_alt_values_and_finalize(incomplete, kestrel_config_mock)

    error_text = str(exc_info.value)
    assert "Missing required columns" in error_text, (
        "Expected KeyError due to missing required 'Depth_Score' column."
    )


def test_filter_by_alt_values_gg_filter_below_threshold(kestrel_config_mock):
    """
    Rows with ALT == 'GG' and a Depth_Score under the configured threshold
    (0.02 in the fixture) are expected to be removed.
    """
    # Row 0: GG at 0.019 -> below threshold, expected to be removed.
    # Row 1: GG at 0.02  -> exactly at threshold, expected to be kept.
    # Row 2: XYZ at 0.5  -> not a GG row, expected to be kept.
    variants = pd.DataFrame(
        {
            "ALT": ["GG", "GG", "XYZ"],
            "Depth_Score": [0.019, 0.02, 0.5],
        }
    )

    result = filter_by_alt_values_and_finalize(variants, kestrel_config_mock)

    assert len(result) == 2, (
        "Expected only 2 rows to remain after removing GG with insufficient Depth_Score."
    )
    # No surviving row (of any ALT) may sit below 0.02 for this data set.
    below_threshold = result["Depth_Score"] < 0.02
    assert not below_threshold.any(), "No row should have Depth_Score < 0.02 for 'GG' alt."


def test_filter_by_alt_values_exclude_alts(kestrel_config_mock):
    """
    Rows whose ALT appears in the fixture's 'exclude_alts' list
    ("BAD_ALT", "ZZZ") are expected to be dropped.
    """
    # Depth_Score values are placeholders, all above the GG threshold.
    variants = pd.DataFrame(
        {
            "ALT": ["GG", "BAD_ALT", "OK_ALT", "ZZZ", "ANOTHER"],
            "Depth_Score": [0.5, 0.1, 0.3, 0.2, 0.6],
        }
    )

    result = filter_by_alt_values_and_finalize(variants, kestrel_config_mock)

    # Expected survivors: "GG", "OK_ALT" and "ANOTHER".
    remaining_alts = result["ALT"].tolist()
    assert len(result) == 3, "Expected 3 ALTs after excluding 'BAD_ALT' and 'ZZZ'."
    assert "BAD_ALT" not in remaining_alts, "'BAD_ALT' should be removed."
    assert "ZZZ" not in remaining_alts, "'ZZZ' should be removed."


def test_filter_by_alt_values_drop_left_right(kestrel_config_mock):
    """
    When the input carries 'left' and 'right' columns, the output is expected
    to have neither of them.
    """
    # Both rows should survive filtering: GG at 0.05 clears the 0.02 threshold
    # and "ABC" is not listed in exclude_alts.
    variants = pd.DataFrame(
        {
            "ALT": ["GG", "ABC"],
            "Depth_Score": [0.05, 0.02],
            "left": ["some_left_data", "some_left_data"],
            "right": ["some_right_data", "some_right_data"],
        }
    )
    result = filter_by_alt_values_and_finalize(variants, kestrel_config_mock)

    assert len(result) == 2, "Expected 2 rows total."

    output_columns = result.columns
    assert not ("left" in output_columns or "right" in output_columns), (
        "Expected the 'left' and 'right' columns to be dropped."
    )
Loading