From 111bece91d8531a885b3acb613eccdf7daf68c8b Mon Sep 17 00:00:00 2001 From: Andreas Eisenbarth Date: Mon, 11 Mar 2024 11:46:59 +0100 Subject: [PATCH 1/4] Add failing test case for neutral loss --- metaspace_converter/tests/to_anndata_test.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/metaspace_converter/tests/to_anndata_test.py b/metaspace_converter/tests/to_anndata_test.py index cc6aac9..cb4bbfe 100644 --- a/metaspace_converter/tests/to_anndata_test.py +++ b/metaspace_converter/tests/to_anndata_test.py @@ -36,9 +36,14 @@ def sm(metaspace_credentials) -> SMInstance: @pytest.mark.parametrize( ("dataset_id", "database", "fdr", "metadata_as_obs", "add_optical_image"), [ + # Just downloading, metadata as uns ("2021-09-03_11h43m13s", ("CoreMetabolome", "v3"), 0.1, False, False), + # Metadata as obs ("2021-09-03_11h43m13s", ("CoreMetabolome", "v3"), 0.1, True, False), + # Add optical image for SquidPy ("2021-09-03_11h43m13s", ("CoreMetabolome", "v3"), 0.1, False, True), + # Dataset with custom database, neutral losses + ("2022-11-18_16h40m47s", ("AE_spacem_tests", "v1"), 0.5, False, False), ], ) def test_metaspace_to_anndata( From d45129b5bbb0896af6ca19d8fb5daad337e883b2 Mon Sep 17 00:00:00 2001 From: Andreas Eisenbarth Date: Fri, 8 Mar 2024 16:48:01 +0100 Subject: [PATCH 2/4] Make annotations and ion images matching more robust --- metaspace_converter/tests/to_anndata_test.py | 2 + metaspace_converter/to_anndata.py | 49 ++++++++++++++------ 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/metaspace_converter/tests/to_anndata_test.py b/metaspace_converter/tests/to_anndata_test.py index cb4bbfe..a8c58f5 100644 --- a/metaspace_converter/tests/to_anndata_test.py +++ b/metaspace_converter/tests/to_anndata_test.py @@ -61,6 +61,8 @@ def test_metaspace_to_anndata( dataset = sm.dataset(id=dataset_id) assert actual.n_obs == np.prod(get_ion_image_shape(dataset)) assert actual.n_vars == len(dataset.annotations(fdr=fdr, database=database)) + assert actual.obs_names.is_unique + assert actual.var_names.is_unique assert { COL.ion_image_shape_y, COL.ion_image_shape_x, diff --git a/metaspace_converter/to_anndata.py b/metaspace_converter/to_anndata.py index e2fcb87..f72cbda 100644 --- a/metaspace_converter/to_anndata.py +++ b/metaspace_converter/to_anndata.py @@ -93,7 +93,6 @@ def metaspace_to_anndata( # Download annotations annotations = dataset.results(database=database, fdr=fdr, **annotation_filter) - annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME) annotations = _normalize_annotations_for_serialization(annotations) # Download ion images @@ -109,10 +108,24 @@ def metaspace_to_anndata( f"No isotope images available for dataset {dataset.id} and database " f"{database[0]} – {database[1]}. Was the database selected for processing on METASPACE?" ) - assert len(annotations) == len(isotope_images) + # Isotope images are also specific to neutral loss and chemical modification (if any) + # whereas annotations only include formula and adduct. Thus there can be a mismatch. + # Since we want to keep all isotope images, we add missing rows to annotations + isotope_images_index = pd.DataFrame( + [(img.formula, img.adduct, img.chem_mod, img.neutral_loss) for img in isotope_images], + columns=["formula", "adduct", "chem_mod", "neutral_loss"], + ) + annotations = pd.merge( + annotations, + isotope_images_index, + how="inner", + left_on=("formula", "adduct"), + right_on=("formula", "adduct"), + ) + annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME) - # Sort them matching the annotations. - isotope_images = _sort_isotope_images_like(isotope_images, annotations.index) + # Sort isotope images to match the annotations. + isotope_images = _sort_isotope_images_like(isotope_images, annotations) # Create X matrix (all ion pixels flattened to primary axis) shape = get_ion_image_shape(dataset) @@ -146,13 +159,20 @@ def metaspace_to_anndata( return adata -def create_annotation_id(formula: str, adduct: str) -> str: - return f"{formula}{adduct}" +def create_annotation_id( + formula: str, adduct: str, chem_mod: str = "", neutral_loss: str = "" +) -> str: + return f"{formula}{adduct}{chem_mod}{neutral_loss}" def _add_annotations_index(df: pd.DataFrame, index_name: str = VAR_INDEX_NAME) -> pd.DataFrame: df = df.reset_index() - df[index_name] = df.apply(lambda row: create_annotation_id(row.formula, row.adduct), axis=1) + df[index_name] = df.apply( + lambda row: create_annotation_id( + row.formula, row.adduct, getattr(row, "chemMod", ""), getattr(row, "neutralLoss", "") + ), + axis=1, + ) return df.set_index(index_name) @@ -201,14 +221,17 @@ def get_ion_image_shape( def _sort_isotope_images_like( - isotope_images: list[IsotopeImages], index: pd.Index + isotope_images: list[IsotopeImages], df: pd.DataFrame ) -> list[IsotopeImages]: - images_dict = {} - for isotope_image in isotope_images: - annotation_id = create_annotation_id(isotope_image.formula, isotope_image.adduct) - images_dict[annotation_id] = isotope_image + images_dict = { + (img.formula, img.adduct, img.chem_mod, img.neutral_loss): img for img in isotope_images + } # Return them in the requested order. - return [images_dict[key] for key in index] + # Note: pd.DataFrame.itertuples yields NamedTuple and is faster than iterrows. + return [ + images_dict[(row.formula, row.adduct, row.chem_mod, row.neutral_loss)] + for row in df.itertuples(index=False) + ] def _create_anndata_x(isotope_images: list[IsotopeImages], shape: Shape2d) -> np.ndarray: From 8b9f865b54a00b12845cd4aba3912e8994a39c8a Mon Sep 17 00:00:00 2001 From: Andreas Eisenbarth Date: Mon, 11 Mar 2024 11:34:06 +0100 Subject: [PATCH 3/4] Fix dataframe/images mismatch for neutral loss --- metaspace_converter/to_anndata.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/metaspace_converter/to_anndata.py b/metaspace_converter/to_anndata.py index f72cbda..33fd152 100644 --- a/metaspace_converter/to_anndata.py +++ b/metaspace_converter/to_anndata.py @@ -92,7 +92,14 @@ def metaspace_to_anndata( database = DEFAULT_DATABASE # Download annotations - annotations = dataset.results(database=database, fdr=fdr, **annotation_filter) + annotations = dataset.results( + database=database, + fdr=fdr, + include_chem_mods=True, + include_neutral_losses=True, + **annotation_filter, + ) + annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME) annotations = _normalize_annotations_for_serialization(annotations) # Download ion images @@ -108,21 +115,7 @@ def metaspace_to_anndata( f"No isotope images available for dataset {dataset.id} and database " f"{database[0]} – {database[1]}. Was the database selected for processing on METASPACE?" ) - # Isotope images are also specific to neutral loss and chemical modification (if any) - # whereas annotations only include formula and adduct. Thus there can be a mismatch. - # Since we want to keep all isotope images, we add missing rows to annotations - isotope_images_index = pd.DataFrame( - [(img.formula, img.adduct, img.chem_mod, img.neutral_loss) for img in isotope_images], - columns=["formula", "adduct", "chem_mod", "neutral_loss"], - ) - annotations = pd.merge( - annotations, - isotope_images_index, - how="inner", - left_on=("formula", "adduct"), - right_on=("formula", "adduct"), - ) - annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME) + assert len(annotations) == len(isotope_images) # Sort isotope images to match the annotations. isotope_images = _sort_isotope_images_like(isotope_images, annotations) @@ -229,7 +222,7 @@ def _sort_isotope_images_like( # Return them in the requested order. # Note: pd.DataFrame.itertuples yields NamedTuple and is faster than iterrows. return [ - images_dict[(row.formula, row.adduct, row.chem_mod, row.neutral_loss)] + images_dict[(row.formula, row.adduct, row.chemMod, row.neutralLoss)] for row in df.itertuples(index=False) ] From cb3b91e1680e63c41b807c22d0a0d4f730ff2ab2 Mon Sep 17 00:00:00 2001 From: Andreas Eisenbarth Date: Fri, 15 Mar 2024 18:13:14 +0100 Subject: [PATCH 4/4] Bump version to v1.1.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 860d151..3a6f63f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "metaspace-converter" -version = "1.1.0" +version = "1.1.1" authors = [ {name = "Tim Daniel Rose", email = "tim.rose@embl.de"}, {name = "Andreas Eisenbarth", email = "andreas.eisenbarth@embl.de"},