Update to Python 3.12 (#18)

vanallenlab · May 13, 2024 · 40fd946 · 40fd946
1 parent 37c7b04
commit 40fd946
Show file tree

Hide file tree

Showing 18 changed files with 350 additions and 158 deletions.
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ You can view additional documentation, including [descriptions of inputs](docs/d
 The codebase is available for download through this GitHub repository, [Dockerhub](https://hub.docker.com/r/vanallenlab/moalmanac/), and [Terra](https://portal.firecloud.org/#methods/vanallenlab/moalmanac/2). The method can also be run on Terra, without interacting with Terra directly, through [our portal](https://portal.moalmanac.org/). **Accessing Molecular Oncology Almanac through GitHub requires building some of the [datasources](moalmanac/datasources/) yourself; the same datasources are already included in the Docker container**.
 
 ### Installation
-Molecular Oncology Almanac is a Python application using Python 3.11. This application, datasources, and all dependencies are packaged on Docker and can be downloaded with the command
+Molecular Oncology Almanac is a Python application using Python 3.12. This application, datasources, and all dependencies are packaged on Docker and can be downloaded with the command
  ```bash
 docker pull vanallenlab/moalmanac
 ```
@@ -30,7 +30,7 @@ git clone https://github.com/vanallenlab/moalmanac.git
 
 We recommend using a [virtual environment](https://docs.python.org/3/tutorial/venv.html) and running Python with either [Anaconda](https://www.anaconda.com/download/) or [Miniconda](https://conda.io/miniconda.html). After installing Anaconda or Miniconda, you can set up the environment by running
 ```bash
-conda create -n moalmanac python=3.11 -y
+conda create -n moalmanac python=3.12 -y
 source activate moalmanac
 pip install -r requirements.txt
 ```

diff --git a/moalmanac/annotator.py b/moalmanac/annotator.py
@@ -13,7 +13,7 @@
 EXAC_CONFIG = CONFIG['exac']
 
 
-class Annotator(object):
+class Annotator:
     """
     Annotates variants using datasources
     """
@@ -126,6 +126,12 @@ def create_id_series(series, columns):
                 idx += '_' + str(series[col])
         return idx
 
+    @classmethod
+    def fill_na(cls, dataframe, column, fill_value, fill_data_type):
+        if column not in dataframe.columns:
+            dataframe = cls.preallocate_empty_columns(dataframe, [column])
+        return dataframe.loc[dataframe.index, column].astype(fill_data_type).fillna(fill_value)
+
     @classmethod
     def match_ds(cls, df, ds, bin_column, compare_columns):
         df[bin_column] = cls.preallocate_bin(bin_column, df.index)
@@ -147,7 +153,7 @@ def preallocate_empty_columns(df, columns):
         return df
 
 
-class ACMG(object):
+class ACMG:
     gene = datasources.ACMG.gene
 
     bin_name = Annotator.acmg_bin
@@ -406,7 +412,15 @@ def annotate(cls, df, dbs, ontology):
         for feature_type, group in df.groupby(cls.feature_type):
             feature_type_records = cls.subset_records(ds, cls.feature_type, feature_type)
             table = pd.DataFrame(feature_type_records)
-            table[cls.implication_map] = table[cls.implication].replace(cls.predictive_implication_map)
+
+            # this is required for python 3.12 and pandas 2.2.2 to opt into future behavior for type downcasting
+            with pd.option_context("future.no_silent_downcasting", True):
+                table[cls.implication_map] = (
+                    table[cls.implication]
+                    .astype(str)
+                    .replace(cls.predictive_implication_map)
+                    .astype(float)
+                )
 
             if feature_type in [cls.somatic_variant, cls.germline_variant, cls.copynumber_variant, cls.fusion]:
                 idx = group[cls.feature].isin(list_genes)
@@ -415,7 +429,8 @@ def annotate(cls, df, dbs, ontology):
 
             for index in group.index:
                 annotation_function = annotation_function_dict[feature_type]
-                df.loc[index, :] = annotation_function(sliced_series=df.loc[index, :], ontology=ontology, table=table)
+                new_series = annotation_function(sliced_series=df.loc[index, :], ontology=ontology, table=table)
+                df.loc[index, new_series.index] = new_series
 
         return df
 
@@ -554,7 +569,7 @@ def annotate_copy_number(cls, sliced_series, ontology, table):
 
     @classmethod
     def annotate_fusion(cls, sliced_series, ontology, table):
-        series = sliced_series.fillna('').copy(deep=True)
+        series = sliced_series.fillna(pd.NA).copy(deep=True)
         feature = series.loc[cls.feature]
         alt_type = series.loc[cls.alt_type]
         alt = series.loc[cls.alt]
@@ -896,7 +911,7 @@ def annotate(cls, df, dbs):
         return Annotator.annotate(df, dbs, datasources.CancerHotspots, cls.bin_name, cls.comparison_columns)
 
 
-class CancerHotspots3D(object):
+class CancerHotspots3D:
     gene = datasources.CancerHotspots3D.gene
     alteration = datasources.CancerHotspots3D.alt
 
@@ -908,7 +923,7 @@ def annotate(cls, df, dbs):
         return Annotator.annotate(df, dbs, datasources.CancerHotspots3D, cls.bin_name, cls.comparison_columns)
 
 
-class CancerGeneCensus(object):
+class CancerGeneCensus:
     gene = datasources.CancerGeneCensus.gene
 
     bin_name = Annotator.cgc_bin
@@ -919,7 +934,7 @@ def annotate(cls, df, dbs):
         return Annotator.annotate(df, dbs, datasources.CancerGeneCensus, cls.bin_name, cls.comparison_columns)
 
 
-class ClinVar(object):
+class ClinVar:
     chr = datasources.ClinVar.chr
     start = datasources.ClinVar.start
     end = datasources.ClinVar.end
@@ -946,7 +961,7 @@ def annotate(cls, df, dbs):
         return features.Features.preallocate_missing_columns(df)
 
 
-class Cosmic(object):
+class Cosmic:
     gene = datasources.Cosmic.gene
     alteration = datasources.Cosmic.alt
 
@@ -978,17 +993,28 @@ class ExAC:
     @classmethod
     def append_exac_af(cls, df, ds, ds_columns):
         variants, not_variants = cls.subset_for_variants(df)
-        #ds = ds.loc[:, [cls.chr, cls.start, cls.ref, cls.alt, cls.af]]
         ds = ds.loc[:, ds_columns]
 
         for column, data_type in [(cls.str_columns, str), (cls.int_columns, float), (cls.int_columns, int)]:
-            variants.loc[variants.index, column] = cls.format_columns(variants, column, data_type)
-            ds.loc[ds.index, column] = cls.format_columns(ds, column, data_type)
+            variants[column] = variants[column].astype(data_type)
+            ds[column] = ds[column].astype(data_type)
 
         merged = variants.merge(ds, how='left')
-        merged.loc[merged.index, cls.af] = cls.fill_na(merged, cls.af, 0.0, float, 6)
-        not_variants.loc[not_variants.index, cls.af] = cls.fill_na(not_variants, cls.af, 0.0, float, 6)
-        return pd.concat([merged, not_variants]).sort_index()
+        merged.loc[merged.index, cls.af] = Annotator.fill_na(
+            dataframe=merged,
+            column=cls.af,
+            fill_value=0.0,
+            fill_data_type=float
+        )
+        not_variants.loc[not_variants.index, cls.af] = Annotator.fill_na(
+            dataframe=not_variants,
+            column=cls.af,
+            fill_value=0.0,
+            fill_data_type=float
+        )
+        result = pd.concat([merged, not_variants])
+        result[cls.af] = result[cls.af].astype(float).round(6)
+        return result
 
     @classmethod
     def annotate(cls, df, dbs):
@@ -1018,12 +1044,6 @@ def annotate_common_af(cls, series_exac_af):
     def drop_existing_columns(cls, dataframe):
         return dataframe.drop(dataframe.columns[dataframe.columns.str.contains('exac')], axis=1)
 
-    @classmethod
-    def fill_na(cls, dataframe, column, fill_value, fill_data_type, round_places):
-        if column not in dataframe.columns:
-            dataframe = Annotator.preallocate_empty_columns(dataframe, [column])
-        return dataframe.loc[dataframe.index, column].fillna(fill_value).astype(fill_data_type).round(round_places)
-
     @classmethod
     def format_columns(cls, dataframe, column, data_type):
         return dataframe.loc[dataframe.index, column].astype(data_type)
@@ -1071,7 +1091,7 @@ def annotate(cls, df, dbs):
         return features.Features.preallocate_missing_columns(df_annotated)
 
 
-class GSEACancerModules(object):
+class GSEACancerModules:
     gene = datasources.GSEACancerModules.gene
 
     bin_name = Annotator.gsea_module_bin
@@ -1082,7 +1102,7 @@ def annotate(cls, df, dbs):
         return Annotator.annotate(df, dbs, datasources.GSEACancerModules, cls.bin_name, cls.comparison_columns)
 
 
-class GSEACancerPathways(object):
+class GSEACancerPathways:
     gene = datasources.GSEACancerPathways.gene
 
     bin_name = Annotator.gsea_pathway_bin
@@ -1093,7 +1113,7 @@ def annotate(cls, df, dbs):
         return Annotator.annotate(df, dbs, datasources.GSEACancerPathways, cls.bin_name, cls.comparison_columns)
 
 
-class Hereditary(object):
+class Hereditary:
     gene = datasources.Hereditary.gene
 
     bin_name = Annotator.hereditary_bin
@@ -1104,7 +1124,7 @@ def annotate(cls, df, dbs):
         return Annotator.annotate(df, dbs, datasources.Hereditary, cls.bin_name, cls.comparison_columns)
 
 
-class MSI(object):
+class MSI:
     gene = datasources.Datasources.feature
 
     bin_name = Annotator.msi_bin
@@ -1124,7 +1144,7 @@ def annotate(cls, df):
         return df
 
 
-class OverlapValidation(object):
+class OverlapValidation:
     section = 'validation_sequencing'
     gene = COLNAMES[section]['gene']
     feature_type = COLNAMES[section]['feature_type']
@@ -1146,7 +1166,13 @@ def append_validation(cls, primary, validation):
         df = cls.drop_validation_columns(primary)
         df = cls.merge_data_frames(df, validation, cls.merge_cols)
         idx = cls.get_mutation_index(df)
-        df.loc[idx, cls.fill_cols] = df.loc[idx, cls.fill_cols].fillna(0.0)
+        for column in cls.fill_cols:
+            df.loc[idx, column] = Annotator.fill_na(
+                dataframe=df.loc[idx, :],
+                column=column,
+                fill_value=0.0,
+                fill_data_type=float
+            )
 
         df.loc[idx, cls.validation_detection_power] = cls.calculate_validation_detection_power(
             df.loc[idx, cls.tumor_f].astype(float),
@@ -1185,7 +1211,7 @@ def round_series(series, n):
         return series.round(n)
 
 
-class OverlapSomaticGermline(object):
+class OverlapSomaticGermline:
     section = 'overlap_somatic_germline'
     gene = COLNAMES[section]['gene']
     alt_type = COLNAMES[section]['alt_type']
@@ -1229,7 +1255,7 @@ def annotate(cls, actionable, efficacy, dictionary, append_lookup=True):
             dataframe = efficacy[efficacy[cls.feature_display].eq(feature)]
             efficacy_observed = cls.search_for_significance(dataframe[cls.pvalue])
             actionable.loc[index, cls.efficacy] = efficacy_observed
-        actionable[cls.efficacy].fillna(pd.NA, inplace=True)
+        actionable[cls.efficacy] = actionable[cls.efficacy].fillna(pd.NA)
         idx = actionable.index
         if append_lookup:
             actionable.loc[idx, cls.lookup] = cls.create_lookup(idx, series_features.index, dictionary)
@@ -1369,7 +1395,14 @@ def annotate_fusions(cls, df, dbs):
             group3.rename(columns={cls.evidence_map_str: cls.group3})[cls.group3],
             group4.rename(columns={cls.evidence_map_str: cls.group4})[cls.group4],
         ], axis=1)
-        values = values.fillna(-1).idxmax(axis=1)
+
+        # this is required for python 3.12 and pandas 2.2.2 to opt into future behavior for type downcasting
+        with pd.option_context("future.no_silent_downcasting", True):
+            values = (
+                values
+                .fillna(-1.0)
+                .idxmax(axis=1)
+            )
 
         idx_group1 = values[values.eq(cls.group1)].index
         idx_group2 = values[values.eq(cls.group2)].index
@@ -1399,7 +1432,14 @@ def annotate_fusions(cls, df, dbs):
             group1.rename(columns={cls.evidence_map_str: cls.group1})[cls.group1],
             group2.rename(columns={cls.evidence_map_str: cls.group2})[cls.group2],
         ], axis=1)
-        values = values.fillna(-1).idxmax(axis=1)
+
+        # this is required for python 3.12 and pandas 2.2.2 to opt into future behavior for type downcasting
+        with pd.option_context("future.no_silent_downcasting", True):
+            values = (
+                values
+                .fillna(-1.0)
+                .idxmax(axis=1)
+            )
 
         idx_group1 = values[values.eq(cls.group1)].index
         idx_group2 = values[values.eq(cls.group2)].index
@@ -1421,7 +1461,14 @@ def annotate_fusions(cls, df, dbs):
             group3.rename(columns={cls.evidence_map_str: cls.group3})[cls.group3],
             group4.rename(columns={cls.evidence_map_str: cls.group4})[cls.group4],
         ], axis=1)
-        values = values.fillna(-1).idxmax(axis=1)
+
+        # this is required for python 3.12 and pandas 2.2.2 to opt into future behavior for type downcasting
+        with pd.option_context("future.no_silent_downcasting", True):
+            values = (
+                values
+                .fillna(-1.0)
+                .idxmax(axis=1)
+            )
 
         idx_group3 = values[values.eq(cls.group3)].index
         idx_group4 = values[values.eq(cls.group4)].index
@@ -1463,7 +1510,9 @@ def annotate_somatic_variants(cls, df, dbs):
         df = df[df[cls.feature_type].eq(cls.somatic_variant)]
         db = Almanac.subset_records(almanac['content'], cls.feature_type, cls.somatic_variant)
         db = pd.DataFrame(db)
-        db[cls.variant_annotation].replace({'Oncogenic Mutations': '', 'Activating mutation': ''}, inplace=True)
+
+        replacement_dictionary = {'Oncogenic Mutations': '', 'Activating mutation': ''}
+        db[cls.variant_annotation] = db[cls.variant_annotation].replace(replacement_dictionary)
 
         column_map = {cls.gene: cls.feature,
                       cls.variant_annotation: cls.alteration_type,
@@ -1556,7 +1605,15 @@ def format_db(cls, feature_columns, column_map, db):
         column_map[cls.predictive_implication] = cls.evidence
         db = db.loc[:, columns].drop_duplicates()
         db.rename(columns=column_map, inplace=True)
-        db[cls.evidence_map_str] = db[cls.evidence].replace(cls.evidence_map)
+
+        # this is required for python 3.12 and pandas 2.2.2 to opt into future behavior for type downcasting
+        with pd.option_context("future.no_silent_downcasting", True):
+            db[cls.evidence_map_str] = (
+                db[cls.evidence]
+                .astype(str)
+                .replace(cls.evidence_map)
+                .astype(int)
+            )
         db.sort_values([cls.evidence_map_str, cls.feature_display], ascending=[False, True], inplace=True)
         db[cls.merged] = 1
         return db

diff --git a/moalmanac/config.ini b/moalmanac/config.ini
@@ -8,7 +8,7 @@ include_preclinical_efficacy_in_actionability_report = on
 plot_preclinical_efficacy = on
 
 [versions]
-interpreter = 0.5.0
+interpreter = 0.6.0
 database = v.2023-11-09
 
 [exac]

diff --git a/moalmanac/evaluator.py b/moalmanac/evaluator.py
@@ -141,8 +141,8 @@ def map_almanac_bins(cls, series):
         return series.map(cls.almanac_bin_map)
 
     @staticmethod
-    def remap_almanac_bins(series, old_values, new_values):
-        return series.astype(int).replace(to_replace=old_values, value=new_values)
+    def remap_almanac_bins(series, old_value, new_value):
+        return series.astype(int).replace(to_replace=old_value, value=new_value)
 
     @classmethod
     def remove_low_allele_fraction_variants(cls, df):
@@ -177,7 +177,7 @@ def split_camelcase(string):
 
     @classmethod
     def subset_almanac_bin(cls, df):
-        return df[df[cls.almanac_bin].fillna(0.0).astype(float) != 0.0]
+        return df[df[cls.almanac_bin].astype(float).fillna(0.0) != 0.0]
 
 
 class Actionable:
@@ -255,10 +255,10 @@ def evaluate(cls, somatic, germline, ms_variants, ms_status, burden, signatures,
         actionable_list = []
         for dataframe in [somatic, germline, ms_variants_summary, ms_status, burden, signatures, wgd]:
             actionable_list.append(Evaluator.subset_almanac_bin(dataframe))
-        df = pd.concat(actionable_list, ignore_index=True)
+        df = features.Features.concat_list_of_dataframes(list_of_dataframes=actionable_list)
 
         df[Evaluator.feature_display] = cls.format_feature_display(
-            df.fillna(''), Evaluator.feature_display,
+            df, Evaluator.feature_display,
             Evaluator.feature_type, Evaluator.feature,
             Evaluator.alt_type, Evaluator.alt)
         return df.sort_values(cls.sort_columns, ascending=False)
@@ -434,7 +434,7 @@ def evaluate_status(cls, df, variants):
         columns = [Evaluator.almanac_bin, Evaluator.sensitive_bin, Evaluator.resistance_bin, Evaluator.prognostic_bin]
         if variants.empty:
             for bin_column in columns:
-                df[bin_column] = Evaluator.remap_almanac_bins(df[bin_column].fillna(0), [3], [2])
+                df[bin_column] = Evaluator.remap_almanac_bins(series=df[bin_column], old_value=3, new_value=2)
         return Evaluator.evaluate_almanac(df)
 
     @classmethod
@@ -449,7 +449,8 @@ def evaluate_variants(cls, somatic, germline):
 
         msi_somatic = cls.return_msi_variants(somatic)
         msi_germline = cls.return_msi_variants(germline)
-        return pd.concat([msi_somatic, msi_germline], axis=0, ignore_index=True)
+        msi_variants = [msi_somatic, msi_germline]
+        return features.Features.concat_list_of_dataframes(list_of_dataframes=msi_variants)
 
 
 class Strategies: