Skip to content

Commit

Permalink
Fix #457
Browse files Browse the repository at this point in the history
  • Loading branch information
DimaMolod committed Nov 15, 2024
1 parent dd10f45 commit 89e076e
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 25 deletions.
1 change: 1 addition & 0 deletions .github/workflows/github_actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ jobs:
pytest -s test/test_modelcif.py
pytest -s test/test_features_with_templates.py
pytest -s test/test_post_prediction.py
pytest -s test/test_parse_fold.py
#export PYTHONPATH=$PWD/alphapulldown/analysis_pipeline:$PYTHONPATH
## Test analysis pipeline
#conda install -c bioconda biopandas
Expand Down
54 changes: 29 additions & 25 deletions alphapulldown/utils/modelling_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
logging.set_verbosity(logging.INFO)


def _parse_regions(region_specs):
    """Convert ``["1-10", "14-30"]`` into ``[(1, 10), (14, 30)]``.

    Raises:
        ValueError: If a spec is not exactly two integers joined by "-".
    """
    regions = []
    for spec in region_specs:
        bounds = tuple(int(x) for x in spec.split("-"))
        if len(bounds) != 2:
            # e.g. "1-10-20": every piece is an int, but it is not a start-end pair.
            raise ValueError(f"Region '{spec}' is not of the form start-end")
        regions.append(bounds)
    return regions


def parse_fold(input_list, features_directory, protein_delimiter):
    """
    Parses a list of protein fold specifications and returns structured folding jobs.

    Each element of ``input_list`` describes one job. Within a job, proteins are
    separated by ``protein_delimiter`` and every protein is written in one of
    these colon-separated forms:

        protein_name
        protein_name:copy_number
        protein_name:1-10:14-30            (regions only, a single copy)
        protein_name:copy_number:1-10:14-30

    Args:
        input_list: Iterable of job specification strings.
        features_directory: Iterable of directories that are searched for
            ``<protein_name>.pkl`` or ``<protein_name>.pkl.xz``.
        protein_delimiter: String separating the proteins inside one job.

    Returns:
        A list with one entry per job. Each entry is a list of single-key dicts
        ``{protein_name: region}``, repeated ``copy_number`` times, where
        ``region`` is either the string "all" or a list of ``(start, end)``
        integer tuples.

    Raises:
        FileNotFoundError: If any required protein features are missing. All
            missing names across all jobs are reported together, sorted.
        SystemExit: With exit status 1 if a specification is malformed.
    """
    all_folding_jobs = []
    # A set, so a protein that is missing in several jobs is reported once.
    missing_features = set()

    for job in input_list:
        formatted_folds = []
        for protein_fold in (x.split(":") for x in job.split(protein_delimiter)):
            name, number, region = protein_fold[0], 1, "all"

            if len(protein_fold) > 1:
                try:
                    if "-" in protein_fold[1]:
                        # Format: [protein_name:1-10:14-30:40-100:etc]
                        region = _parse_regions(protein_fold[1:])
                    else:
                        # Format: [protein_name:copy_number:1-10:14-30:40-100:etc]
                        number = int(protein_fold[1])
                        if len(protein_fold) > 2:
                            region = _parse_regions(protein_fold[2:])
                except ValueError:
                    logging.error(f"Your format: {job} is wrong. The program will terminate.")
                    # Non-zero status so shells and pipelines see the failure.
                    sys.exit(1)

            # Check for missing features: either file extension in any directory counts.
            if not any(
                exists(join(monomer_dir, f"{name}{ext}"))
                for monomer_dir in features_directory
                for ext in (".pkl", ".pkl.xz")
            ):
                missing_features.add(name)

            formatted_folds.extend([{name: region} for _ in range(number)])
        all_folding_jobs.append(formatted_folds)

    if missing_features:
        raise FileNotFoundError(
            f"{sorted(missing_features)} not found in {features_directory}"
        )
    return all_folding_jobs

def pad_input_features(feature_dict: dict,
Expand Down
130 changes: 130 additions & 0 deletions test/test_parse_fold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import logging
from absl.testing import parameterized
from unittest import mock
from alphapulldown.utils.modelling_setup import parse_fold

"""
Test parse_fold function with different scenarios
"""

class TestParseFold(parameterized.TestCase):
    """Table-driven tests for ``parse_fold``.

    File-system lookups are replaced by a mock of ``exists`` inside
    ``alphapulldown.utils.modelling_setup``; each case supplies a mapping from
    path to the boolean that the mock should return (unlisted paths are False).
    """

    def setUp(self) -> None:
        super().setUp()
        # Set logging level to INFO
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    @parameterized.named_parameters(
        {
            'testcase_name': 'single_protein_no_copy',
            'input_list': ['protein1'],
            'features_directory': ['dir1'],
            'protein_delimiter': '_',
            'mock_side_effect': {
                'dir1/protein1.pkl': True,
                'dir1/protein1.pkl.xz': False,
            },
            'expected_result': [[{'protein1': 'all'}]],
        },
        {
            'testcase_name': 'single_protein_with_copy_number',
            'input_list': ['protein1:2'],
            'features_directory': ['dir1'],
            'protein_delimiter': '_',
            'mock_side_effect': {
                'dir1/protein1.pkl': True,
                'dir1/protein1.pkl.xz': False,
            },
            'expected_result': [[{'protein1': 'all'}, {'protein1': 'all'}]],
        },
        {
            'testcase_name': 'single_protein_with_region',
            'input_list': ['protein1:1-10'],
            'features_directory': ['dir1'],
            'protein_delimiter': '_',
            'mock_side_effect': {
                'dir1/protein1.pkl': True,
                'dir1/protein1.pkl.xz': False,
            },
            'expected_result': [[{'protein1': [(1, 10)]}]],
        },
        {
            'testcase_name': 'single_protein_with_copy_and_regions',
            'input_list': ['protein1:2:1-10:20-30'],
            'features_directory': ['dir1'],
            'protein_delimiter': '_',
            'mock_side_effect': {
                'dir1/protein1.pkl': True,
                'dir1/protein1.pkl.xz': False,
            },
            'expected_result': [[{'protein1': [(1, 10), (20, 30)]}, {'protein1': [(1, 10), (20, 30)]}]],
        },
        {
            'testcase_name': 'multiple_proteins',
            'input_list': ['protein1:2_protein2:1-50'],
            'features_directory': ['dir1'],
            'protein_delimiter': '_',
            'mock_side_effect': {
                'dir1/protein1.pkl': True,
                'dir1/protein1.pkl.xz': False,
                'dir1/protein2.pkl': True,
                'dir1/protein2.pkl.xz': False,
            },
            'expected_result': [[{'protein1': 'all'}, {'protein1': 'all'}, {'protein2': [(1, 50)]}]],
        },
        {
            'testcase_name': 'missing_features',
            'input_list': ['protein1', 'protein2'],
            'features_directory': ['dir1'],
            'protein_delimiter': '_',
            'mock_side_effect': {
                'dir1/protein1.pkl': False,
                'dir1/protein1.pkl.xz': False,
                'dir1/protein2.pkl': False,
                'dir1/protein2.pkl.xz': False,
            },
            'expected_exception': FileNotFoundError,
            'expected_exception_message': "['protein1', 'protein2'] not found in ['dir1']",
        },
        {
            'testcase_name': 'invalid_format',
            'input_list': ['protein1::1-10'],
            'features_directory': ['dir1'],
            'protein_delimiter': '_',
            'mock_side_effect': {},
            'expected_exception': SystemExit,
        },
        {
            'testcase_name': 'feature_exists_in_multiple_dirs',
            'input_list': ['protein1'],
            'features_directory': ['dir1', 'dir2'],
            'protein_delimiter': '_',
            'mock_side_effect': {
                'dir1/protein1.pkl': False,
                'dir1/protein1.pkl.xz': False,
                'dir2/protein1.pkl': True,
                'dir2/protein1.pkl.xz': False,
            },
            'expected_result': [[{'protein1': 'all'}]],
        },
    )
    def test_parse_fold(self, input_list, features_directory, protein_delimiter, mock_side_effect,
                        expected_result=None, expected_exception=None, expected_exception_message=None):
        """Test parse_fold with different input scenarios"""
        # sys.exit is deliberately NOT patched: the real one already raises
        # SystemExit, which assertRaises catches.
        with mock.patch('alphapulldown.utils.modelling_setup.exists') as mock_exists:
            mock_exists.side_effect = lambda path: mock_side_effect.get(path, False)
            logging.info(f"Testing with input: {input_list}, features_directory: {features_directory}, "
                         f"protein_delimiter: '{protein_delimiter}'")
            logging.info(f"Mock side effects: {mock_side_effect}")
            if expected_exception:
                with self.assertRaises(expected_exception) as context:
                    parse_fold(input_list, features_directory, protein_delimiter)
                # Must sit after the ``with`` block: statements following the
                # raising call inside it would never execute.
                if expected_exception_message:
                    self.assertEqual(str(context.exception), expected_exception_message)
            else:
                result = parse_fold(input_list, features_directory, protein_delimiter)
                logging.info(f"Result: {result}, Expected: {expected_result}")
                self.assertEqual(result, expected_result)

1 comment on commit 89e076e

@jkosinski
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, will test!

Please sign in to comment.