Skip to content

Commit

Permalink
Update to Python 3.12 (#18)
Browse files Browse the repository at this point in the history
  • Loading branch information
brendanreardon authored May 13, 2024
1 parent 37c7b04 commit 40fd946
Show file tree
Hide file tree
Showing 18 changed files with 350 additions and 158 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ You can view additional documentation, including [descriptions of inputs](docs/d
The codebase is available for download through this GitHub repository, [Docker Hub](https://hub.docker.com/r/vanallenlab/moalmanac/), and [Terra](https://portal.firecloud.org/#methods/vanallenlab/moalmanac/2). The method can also be run on Terra without interacting with Terra directly, by using [our portal](https://portal.moalmanac.org/). **Accessing Molecular Oncology Almanac through GitHub will require building some of the [datasources](moalmanac/datasources/), but they are also contained in the Docker container**.

### Installation
Molecular Oncology Almanac is a Python application using Python 3.11. This application, datasources, and all dependencies are packaged on Docker and can be downloaded with the command
Molecular Oncology Almanac is a Python application using Python 3.12. This application, datasources, and all dependencies are packaged on Docker and can be downloaded with the command
```bash
docker pull vanallenlab/moalmanac
```
Expand All @@ -30,7 +30,7 @@ git clone https://github.com/vanallenlab/moalmanac.git

We recommend using a [virtual environment](https://docs.python.org/3/tutorial/venv.html) and running Python with either [Anaconda](https://www.anaconda.com/download/) or [Miniconda](https://conda.io/miniconda.html). After installing Anaconda or Miniconda, you can set up the environment by running
```bash
conda create -n moalmanac python=3.11 -y
conda create -n moalmanac python=3.12 -y
source activate moalmanac
pip install -r requirements.txt
```
Expand Down
125 changes: 91 additions & 34 deletions moalmanac/annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
EXAC_CONFIG = CONFIG['exac']


class Annotator(object):
class Annotator:
"""
Annotates variants using datasources
"""
Expand Down Expand Up @@ -126,6 +126,12 @@ def create_id_series(series, columns):
idx += '_' + str(series[col])
return idx

@classmethod
def fill_na(cls, dataframe, column, fill_value, fill_data_type):
if column not in dataframe.columns:
dataframe = cls.preallocate_empty_columns(dataframe, [column])
return dataframe.loc[dataframe.index, column].astype(fill_data_type).fillna(fill_value)

@classmethod
def match_ds(cls, df, ds, bin_column, compare_columns):
df[bin_column] = cls.preallocate_bin(bin_column, df.index)
Expand All @@ -147,7 +153,7 @@ def preallocate_empty_columns(df, columns):
return df


class ACMG(object):
class ACMG:
gene = datasources.ACMG.gene

bin_name = Annotator.acmg_bin
Expand Down Expand Up @@ -406,7 +412,15 @@ def annotate(cls, df, dbs, ontology):
for feature_type, group in df.groupby(cls.feature_type):
feature_type_records = cls.subset_records(ds, cls.feature_type, feature_type)
table = pd.DataFrame(feature_type_records)
table[cls.implication_map] = table[cls.implication].replace(cls.predictive_implication_map)

# this is required for python 3.12 and pandas 2.2.2 to opt into future behavior for type downcasting
with pd.option_context("future.no_silent_downcasting", True):
table[cls.implication_map] = (
table[cls.implication]
.astype(str)
.replace(cls.predictive_implication_map)
.astype(float)
)

if feature_type in [cls.somatic_variant, cls.germline_variant, cls.copynumber_variant, cls.fusion]:
idx = group[cls.feature].isin(list_genes)
Expand All @@ -415,7 +429,8 @@ def annotate(cls, df, dbs, ontology):

for index in group.index:
annotation_function = annotation_function_dict[feature_type]
df.loc[index, :] = annotation_function(sliced_series=df.loc[index, :], ontology=ontology, table=table)
new_series = annotation_function(sliced_series=df.loc[index, :], ontology=ontology, table=table)
df.loc[index, new_series.index] = new_series

return df

Expand Down Expand Up @@ -554,7 +569,7 @@ def annotate_copy_number(cls, sliced_series, ontology, table):

@classmethod
def annotate_fusion(cls, sliced_series, ontology, table):
series = sliced_series.fillna('').copy(deep=True)
series = sliced_series.fillna(pd.NA).copy(deep=True)
feature = series.loc[cls.feature]
alt_type = series.loc[cls.alt_type]
alt = series.loc[cls.alt]
Expand Down Expand Up @@ -896,7 +911,7 @@ def annotate(cls, df, dbs):
return Annotator.annotate(df, dbs, datasources.CancerHotspots, cls.bin_name, cls.comparison_columns)


class CancerHotspots3D(object):
class CancerHotspots3D:
gene = datasources.CancerHotspots3D.gene
alteration = datasources.CancerHotspots3D.alt

Expand All @@ -908,7 +923,7 @@ def annotate(cls, df, dbs):
return Annotator.annotate(df, dbs, datasources.CancerHotspots3D, cls.bin_name, cls.comparison_columns)


class CancerGeneCensus(object):
class CancerGeneCensus:
gene = datasources.CancerGeneCensus.gene

bin_name = Annotator.cgc_bin
Expand All @@ -919,7 +934,7 @@ def annotate(cls, df, dbs):
return Annotator.annotate(df, dbs, datasources.CancerGeneCensus, cls.bin_name, cls.comparison_columns)


class ClinVar(object):
class ClinVar:
chr = datasources.ClinVar.chr
start = datasources.ClinVar.start
end = datasources.ClinVar.end
Expand All @@ -946,7 +961,7 @@ def annotate(cls, df, dbs):
return features.Features.preallocate_missing_columns(df)


class Cosmic(object):
class Cosmic:
gene = datasources.Cosmic.gene
alteration = datasources.Cosmic.alt

Expand Down Expand Up @@ -978,17 +993,28 @@ class ExAC:
@classmethod
def append_exac_af(cls, df, ds, ds_columns):
variants, not_variants = cls.subset_for_variants(df)
#ds = ds.loc[:, [cls.chr, cls.start, cls.ref, cls.alt, cls.af]]
ds = ds.loc[:, ds_columns]

for column, data_type in [(cls.str_columns, str), (cls.int_columns, float), (cls.int_columns, int)]:
variants.loc[variants.index, column] = cls.format_columns(variants, column, data_type)
ds.loc[ds.index, column] = cls.format_columns(ds, column, data_type)
variants[column] = variants[column].astype(data_type)
ds[column] = ds[column].astype(data_type)

merged = variants.merge(ds, how='left')
merged.loc[merged.index, cls.af] = cls.fill_na(merged, cls.af, 0.0, float, 6)
not_variants.loc[not_variants.index, cls.af] = cls.fill_na(not_variants, cls.af, 0.0, float, 6)
return pd.concat([merged, not_variants]).sort_index()
merged.loc[merged.index, cls.af] = Annotator.fill_na(
dataframe=merged,
column=cls.af,
fill_value=0.0,
fill_data_type=float
)
not_variants.loc[not_variants.index, cls.af] = Annotator.fill_na(
dataframe=not_variants,
column=cls.af,
fill_value=0.0,
fill_data_type=float
)
result = pd.concat([merged, not_variants])
result[cls.af] = result[cls.af].astype(float).round(6)
return result

@classmethod
def annotate(cls, df, dbs):
Expand Down Expand Up @@ -1018,12 +1044,6 @@ def annotate_common_af(cls, series_exac_af):
def drop_existing_columns(cls, dataframe):
return dataframe.drop(dataframe.columns[dataframe.columns.str.contains('exac')], axis=1)

@classmethod
def fill_na(cls, dataframe, column, fill_value, fill_data_type, round_places):
if column not in dataframe.columns:
dataframe = Annotator.preallocate_empty_columns(dataframe, [column])
return dataframe.loc[dataframe.index, column].fillna(fill_value).astype(fill_data_type).round(round_places)

@classmethod
def format_columns(cls, dataframe, column, data_type):
return dataframe.loc[dataframe.index, column].astype(data_type)
Expand Down Expand Up @@ -1071,7 +1091,7 @@ def annotate(cls, df, dbs):
return features.Features.preallocate_missing_columns(df_annotated)


class GSEACancerModules(object):
class GSEACancerModules:
gene = datasources.GSEACancerModules.gene

bin_name = Annotator.gsea_module_bin
Expand All @@ -1082,7 +1102,7 @@ def annotate(cls, df, dbs):
return Annotator.annotate(df, dbs, datasources.GSEACancerModules, cls.bin_name, cls.comparison_columns)


class GSEACancerPathways(object):
class GSEACancerPathways:
gene = datasources.GSEACancerPathways.gene

bin_name = Annotator.gsea_pathway_bin
Expand All @@ -1093,7 +1113,7 @@ def annotate(cls, df, dbs):
return Annotator.annotate(df, dbs, datasources.GSEACancerPathways, cls.bin_name, cls.comparison_columns)


class Hereditary(object):
class Hereditary:
gene = datasources.Hereditary.gene

bin_name = Annotator.hereditary_bin
Expand All @@ -1104,7 +1124,7 @@ def annotate(cls, df, dbs):
return Annotator.annotate(df, dbs, datasources.Hereditary, cls.bin_name, cls.comparison_columns)


class MSI(object):
class MSI:
gene = datasources.Datasources.feature

bin_name = Annotator.msi_bin
Expand All @@ -1124,7 +1144,7 @@ def annotate(cls, df):
return df


class OverlapValidation(object):
class OverlapValidation:
section = 'validation_sequencing'
gene = COLNAMES[section]['gene']
feature_type = COLNAMES[section]['feature_type']
Expand All @@ -1146,7 +1166,13 @@ def append_validation(cls, primary, validation):
df = cls.drop_validation_columns(primary)
df = cls.merge_data_frames(df, validation, cls.merge_cols)
idx = cls.get_mutation_index(df)
df.loc[idx, cls.fill_cols] = df.loc[idx, cls.fill_cols].fillna(0.0)
for column in cls.fill_cols:
df.loc[idx, column] = Annotator.fill_na(
dataframe=df.loc[idx, :],
column=column,
fill_value=0.0,
fill_data_type=float
)

df.loc[idx, cls.validation_detection_power] = cls.calculate_validation_detection_power(
df.loc[idx, cls.tumor_f].astype(float),
Expand Down Expand Up @@ -1185,7 +1211,7 @@ def round_series(series, n):
return series.round(n)


class OverlapSomaticGermline(object):
class OverlapSomaticGermline:
section = 'overlap_somatic_germline'
gene = COLNAMES[section]['gene']
alt_type = COLNAMES[section]['alt_type']
Expand Down Expand Up @@ -1229,7 +1255,7 @@ def annotate(cls, actionable, efficacy, dictionary, append_lookup=True):
dataframe = efficacy[efficacy[cls.feature_display].eq(feature)]
efficacy_observed = cls.search_for_significance(dataframe[cls.pvalue])
actionable.loc[index, cls.efficacy] = efficacy_observed
actionable[cls.efficacy].fillna(pd.NA, inplace=True)
actionable[cls.efficacy] = actionable[cls.efficacy].fillna(pd.NA)
idx = actionable.index
if append_lookup:
actionable.loc[idx, cls.lookup] = cls.create_lookup(idx, series_features.index, dictionary)
Expand Down Expand Up @@ -1369,7 +1395,14 @@ def annotate_fusions(cls, df, dbs):
group3.rename(columns={cls.evidence_map_str: cls.group3})[cls.group3],
group4.rename(columns={cls.evidence_map_str: cls.group4})[cls.group4],
], axis=1)
values = values.fillna(-1).idxmax(axis=1)

# this is required for python 3.12 and pandas 2.2.2 to opt into future behavior for type downcasting
with pd.option_context("future.no_silent_downcasting", True):
values = (
values
.fillna(-1.0)
.idxmax(axis=1)
)

idx_group1 = values[values.eq(cls.group1)].index
idx_group2 = values[values.eq(cls.group2)].index
Expand Down Expand Up @@ -1399,7 +1432,14 @@ def annotate_fusions(cls, df, dbs):
group1.rename(columns={cls.evidence_map_str: cls.group1})[cls.group1],
group2.rename(columns={cls.evidence_map_str: cls.group2})[cls.group2],
], axis=1)
values = values.fillna(-1).idxmax(axis=1)

# this is required for python 3.12 and pandas 2.2.2 to opt into future behavior for type downcasting
with pd.option_context("future.no_silent_downcasting", True):
values = (
values
.fillna(-1.0)
.idxmax(axis=1)
)

idx_group1 = values[values.eq(cls.group1)].index
idx_group2 = values[values.eq(cls.group2)].index
Expand All @@ -1421,7 +1461,14 @@ def annotate_fusions(cls, df, dbs):
group3.rename(columns={cls.evidence_map_str: cls.group3})[cls.group3],
group4.rename(columns={cls.evidence_map_str: cls.group4})[cls.group4],
], axis=1)
values = values.fillna(-1).idxmax(axis=1)

# this is required for python 3.12 and pandas 2.2.2 to opt into future behavior for type downcasting
with pd.option_context("future.no_silent_downcasting", True):
values = (
values
.fillna(-1.0)
.idxmax(axis=1)
)

idx_group3 = values[values.eq(cls.group3)].index
idx_group4 = values[values.eq(cls.group4)].index
Expand Down Expand Up @@ -1463,7 +1510,9 @@ def annotate_somatic_variants(cls, df, dbs):
df = df[df[cls.feature_type].eq(cls.somatic_variant)]
db = Almanac.subset_records(almanac['content'], cls.feature_type, cls.somatic_variant)
db = pd.DataFrame(db)
db[cls.variant_annotation].replace({'Oncogenic Mutations': '', 'Activating mutation': ''}, inplace=True)

replacement_dictionary = {'Oncogenic Mutations': '', 'Activating mutation': ''}
db[cls.variant_annotation] = db[cls.variant_annotation].replace(replacement_dictionary)

column_map = {cls.gene: cls.feature,
cls.variant_annotation: cls.alteration_type,
Expand Down Expand Up @@ -1556,7 +1605,15 @@ def format_db(cls, feature_columns, column_map, db):
column_map[cls.predictive_implication] = cls.evidence
db = db.loc[:, columns].drop_duplicates()
db.rename(columns=column_map, inplace=True)
db[cls.evidence_map_str] = db[cls.evidence].replace(cls.evidence_map)

# this is required for python 3.12 and pandas 2.2.2 to opt into future behavior for type downcasting
with pd.option_context("future.no_silent_downcasting", True):
db[cls.evidence_map_str] = (
db[cls.evidence]
.astype(str)
.replace(cls.evidence_map)
.astype(int)
)
db.sort_values([cls.evidence_map_str, cls.feature_display], ascending=[False, True], inplace=True)
db[cls.merged] = 1
return db
Expand Down
2 changes: 1 addition & 1 deletion moalmanac/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ include_preclinical_efficacy_in_actionability_report = on
plot_preclinical_efficacy = on

[versions]
interpreter = 0.5.0
interpreter = 0.6.0
database = v.2023-11-09

[exac]
Expand Down
15 changes: 8 additions & 7 deletions moalmanac/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ def map_almanac_bins(cls, series):
return series.map(cls.almanac_bin_map)

@staticmethod
def remap_almanac_bins(series, old_values, new_values):
return series.astype(int).replace(to_replace=old_values, value=new_values)
def remap_almanac_bins(series, old_value, new_value):
return series.astype(int).replace(to_replace=old_value, value=new_value)

@classmethod
def remove_low_allele_fraction_variants(cls, df):
Expand Down Expand Up @@ -177,7 +177,7 @@ def split_camelcase(string):

@classmethod
def subset_almanac_bin(cls, df):
return df[df[cls.almanac_bin].fillna(0.0).astype(float) != 0.0]
return df[df[cls.almanac_bin].astype(float).fillna(0.0) != 0.0]


class Actionable:
Expand Down Expand Up @@ -255,10 +255,10 @@ def evaluate(cls, somatic, germline, ms_variants, ms_status, burden, signatures,
actionable_list = []
for dataframe in [somatic, germline, ms_variants_summary, ms_status, burden, signatures, wgd]:
actionable_list.append(Evaluator.subset_almanac_bin(dataframe))
df = pd.concat(actionable_list, ignore_index=True)
df = features.Features.concat_list_of_dataframes(list_of_dataframes=actionable_list)

df[Evaluator.feature_display] = cls.format_feature_display(
df.fillna(''), Evaluator.feature_display,
df, Evaluator.feature_display,
Evaluator.feature_type, Evaluator.feature,
Evaluator.alt_type, Evaluator.alt)
return df.sort_values(cls.sort_columns, ascending=False)
Expand Down Expand Up @@ -434,7 +434,7 @@ def evaluate_status(cls, df, variants):
columns = [Evaluator.almanac_bin, Evaluator.sensitive_bin, Evaluator.resistance_bin, Evaluator.prognostic_bin]
if variants.empty:
for bin_column in columns:
df[bin_column] = Evaluator.remap_almanac_bins(df[bin_column].fillna(0), [3], [2])
df[bin_column] = Evaluator.remap_almanac_bins(series=df[bin_column], old_value=3, new_value=2)
return Evaluator.evaluate_almanac(df)

@classmethod
Expand All @@ -449,7 +449,8 @@ def evaluate_variants(cls, somatic, germline):

msi_somatic = cls.return_msi_variants(somatic)
msi_germline = cls.return_msi_variants(germline)
return pd.concat([msi_somatic, msi_germline], axis=0, ignore_index=True)
msi_variants = [msi_somatic, msi_germline]
return features.Features.concat_list_of_dataframes(list_of_dataframes=msi_variants)


class Strategies:
Expand Down
Loading

0 comments on commit 40fd946

Please sign in to comment.