From 89e076edbd55400af1113eb6ac8da942d80a159e Mon Sep 17 00:00:00 2001 From: Dima Molodenskiy Date: Fri, 15 Nov 2024 16:54:07 +0100 Subject: [PATCH] Fix #457 --- .github/workflows/github_actions.yml | 1 + alphapulldown/utils/modelling_setup.py | 54 +++++----- test/test_parse_fold.py | 130 +++++++++++++++++++++++++ 3 files changed, 160 insertions(+), 25 deletions(-) create mode 100644 test/test_parse_fold.py diff --git a/.github/workflows/github_actions.yml b/.github/workflows/github_actions.yml index 1c5d4a45..6404c856 100644 --- a/.github/workflows/github_actions.yml +++ b/.github/workflows/github_actions.yml @@ -68,6 +68,7 @@ jobs: pytest -s test/test_modelcif.py pytest -s test/test_features_with_templates.py pytest -s test/test_post_prediction.py + pytest -s test/test_parse_fold.py #export PYTHONPATH=$PWD/alphapulldown/analysis_pipeline:$PYTHONPATH ## Test analysis pipeline #conda install -c bioconda biopandas diff --git a/alphapulldown/utils/modelling_setup.py b/alphapulldown/utils/modelling_setup.py index 1bbdcddc..b2aa325d 100644 --- a/alphapulldown/utils/modelling_setup.py +++ b/alphapulldown/utils/modelling_setup.py @@ -21,7 +21,7 @@ logging.set_verbosity(logging.INFO) -def parse_fold(input, features_directory, protein_delimiter): +def parse_fold(input_list, features_directory, protein_delimiter): """ Parses a list of protein fold specifications and returns structured folding jobs. @@ -37,50 +37,54 @@ def parse_fold(input, features_directory, protein_delimiter): FileNotFoundError: If any required protein features are missing. """ all_folding_jobs = [] - for i in input: - formatted_folds, missing_features, unique_features = [], [], [] + missing_features = set() # Initialize as a set to collect unique missing features + for i in input_list: + formatted_folds = [] protein_folds = [x.split(":") for x in i.split(protein_delimiter)] for protein_fold in protein_folds: name, number, region = None, 1, "all" - if len(protein_fold) ==1: - # protein_fold is in this format: [protein_name] + if len(protein_fold) == 1: + # Format: [protein_name] name = protein_fold[0] elif len(protein_fold) > 1: - name, number= protein_fold[0], protein_fold[1] - if ("-") in protein_fold[1]: - # protein_fold is in this format: [protein_name:1-10:14-30:40-100:etc] + name = protein_fold[0] + if "-" in protein_fold[1]: + # Format: [protein_name:1-10:14-30:40-100:etc] try: number = 1 region = protein_fold[1:] region = [tuple(int(x) for x in r.split("-")) for r in region] - except Exception as e: - logging.error(f"Your format: {i} is wrong. The programme will terminate.") + except Exception: + logging.error(f"Your format: {i} is wrong. The program will terminate.") sys.exit() else: - # protein_fold is in this format: [protein_name:copy_number:1-10:14-30:40-100:etc] + # Format: [protein_name:copy_number:1-10:14-30:40-100:etc] try: - number = protein_fold[1] - if len(protein_fold[2:]) > 0: + number = int(protein_fold[1]) + if len(protein_fold) > 2: region = protein_fold[2:] region = [tuple(int(x) for x in r.split("-")) for r in region] - except Exception as e: - logging.error(f"Your format: {i} is wrong. The programme will terminate.") + except Exception: + logging.error(f"Your format: {i} is wrong. The program will terminate.") sys.exit() - + number = int(number) - unique_features.append(name) - if not any([exists(join(monomer_dir, f"{name}.pkl")) or exists(join(monomer_dir, f"{name}.pkl.xz")) for - monomer_dir in features_directory]): - missing_features.append(name) + # Check for missing features + if not any( + exists(join(monomer_dir, f"{name}{ext}")) + for monomer_dir in features_directory + for ext in [".pkl", ".pkl.xz"] + ): + missing_features.add(name) # Use .add() since missing_features is a set formatted_folds.extend([{name: region} for _ in range(number)]) all_folding_jobs.append(formatted_folds) - missing_features = set(missing_features) - if len(missing_features): - raise FileNotFoundError( - f"{missing_features} not found in {features_directory}" - ) + + if missing_features: + raise FileNotFoundError( + f"{sorted(missing_features)} not found in {features_directory}" + ) return all_folding_jobs def pad_input_features(feature_dict: dict, diff --git a/test/test_parse_fold.py b/test/test_parse_fold.py new file mode 100644 index 00000000..da88ac03 --- /dev/null +++ b/test/test_parse_fold.py @@ -0,0 +1,130 @@ +import logging +from absl.testing import parameterized +from unittest import mock +from alphapulldown.utils.modelling_setup import parse_fold + +""" +Test parse_fold function with different scenarios +""" + +class TestParseFold(parameterized.TestCase): + + def setUp(self) -> None: + super().setUp() + # Set logging level to INFO + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + + @parameterized.named_parameters( + { + 'testcase_name': 'single_protein_no_copy', + 'input': ['protein1'], + 'features_directory': ['dir1'], + 'protein_delimiter': '_', + 'mock_side_effect': { + 'dir1/protein1.pkl': True, + 'dir1/protein1.pkl.xz': False, + }, + 'expected_result': [[{'protein1': 'all'}]], + }, + { + 'testcase_name': 'single_protein_with_copy_number', + 'input': ['protein1:2'], + 'features_directory': ['dir1'], + 'protein_delimiter': '_', + 'mock_side_effect': { + 'dir1/protein1.pkl': True, + 'dir1/protein1.pkl.xz': False, + }, + 'expected_result': [[{'protein1': 'all'}, {'protein1': 'all'}]], + }, + { + 'testcase_name': 'single_protein_with_region', + 'input': ['protein1:1-10'], + 'features_directory': ['dir1'], + 'protein_delimiter': '_', + 'mock_side_effect': { + 'dir1/protein1.pkl': True, + 'dir1/protein1.pkl.xz': False, + }, + 'expected_result': [[{'protein1': [(1, 10)]}]], + }, + { + 'testcase_name': 'single_protein_with_copy_and_regions', + 'input': ['protein1:2:1-10:20-30'], + 'features_directory': ['dir1'], + 'protein_delimiter': '_', + 'mock_side_effect': { + 'dir1/protein1.pkl': True, + 'dir1/protein1.pkl.xz': False, + }, + 'expected_result': [[{'protein1': [(1, 10), (20, 30)]}, {'protein1': [(1, 10), (20, 30)]}]], + }, + { + 'testcase_name': 'multiple_proteins', + 'input': ['protein1:2_protein2:1-50'], + 'features_directory': ['dir1'], + 'protein_delimiter': '_', + 'mock_side_effect': { + 'dir1/protein1.pkl': True, + 'dir1/protein1.pkl.xz': False, + 'dir1/protein2.pkl': True, + 'dir1/protein2.pkl.xz': False, + }, + 'expected_result': [[{'protein1': 'all'}, {'protein1': 'all'}, {'protein2': [(1, 50)]}]], + }, + { + 'testcase_name': 'missing_features', + 'input': ['protein1', 'protein2'], + 'features_directory': ['dir1'], + 'protein_delimiter': '_', + 'mock_side_effect': { + 'dir1/protein1.pkl': False, + 'dir1/protein1.pkl.xz': False, + 'dir1/protein2.pkl': False, + 'dir1/protein2.pkl.xz': False, + }, + 'expected_exception': FileNotFoundError, + 'expected_exception_message': "['protein1', 'protein2'] not found in ['dir1']", + }, + { + 'testcase_name': 'invalid_format', + 'input': ['protein1::1-10'], + 'features_directory': ['dir1'], + 'protein_delimiter': '_', + 'mock_side_effect': {}, + 'expected_exception': SystemExit, + }, + { + 'testcase_name': 'feature_exists_in_multiple_dirs', + 'input': ['protein1'], + 'features_directory': ['dir1', 'dir2'], + 'protein_delimiter': '_', + 'mock_side_effect': { + 'dir1/protein1.pkl': False, + 'dir1/protein1.pkl.xz': False, + 'dir2/protein1.pkl': True, + 'dir2/protein1.pkl.xz': False, + }, + 'expected_result': [[{'protein1': 'all'}]], + }, + ) + def test_parse_fold(self, input, features_directory, protein_delimiter, mock_side_effect, + expected_result=None, expected_exception=None, expected_exception_message=None): + """Test parse_fold with different input scenarios""" + with mock.patch('alphapulldown.utils.modelling_setup.exists') as mock_exists, \ + mock.patch('sys.exit') as mock_exit: + mock_exists.side_effect = lambda path: mock_side_effect.get(path, False) + # Mock sys.exit to raise SystemExit exception + mock_exit.side_effect = SystemExit + logging.info(f"Testing with input: {input}, features_directory: {features_directory}, " + f"protein_delimiter: '{protein_delimiter}'") + logging.info(f"Mock side effects: {mock_side_effect}") + if expected_exception: + with self.assertRaises(expected_exception) as context: + result = parse_fold(input, features_directory, protein_delimiter) + if expected_exception_message: + self.assertEqual(str(context.exception), expected_exception_message) + else: + result = parse_fold(input, features_directory, protein_delimiter) + logging.info(f"Result: {result}, Expected: {expected_result}") + self.assertEqual(result, expected_result)