diff --git a/multidms/data.py b/multidms/data.py index de697cb..5259026 100644 --- a/multidms/data.py +++ b/multidms/data.py @@ -599,7 +599,6 @@ def targets(self) -> dict: """The functional scores for each variant in the training data.""" return self._training_data["y"] - # TODO, rename mutparser @property def mutparser(self) -> MutationParser: """ @@ -608,7 +607,6 @@ def mutparser(self) -> MutationParser: """ return self._mutparser - # TODO, rename @property def parse_mut(self) -> MutationParser: """ @@ -618,7 +616,6 @@ def parse_mut(self) -> MutationParser: """ return self.mutparser.parse_mut - # TODO, document rename issue @property def parse_muts(self) -> partial: """ @@ -628,11 +625,6 @@ def parse_muts(self) -> partial: """ return self._parse_muts - # TODO should this be cached? how does caching interact with the way in - # which we applying this function in parallel? - # although, unless the variants are un-collapsed, this cache will be - # pretty useless. - # although it could be useful for the Model.add_phenotypes_to_df method. def convert_subs_wrt_ref_seq(self, condition, aa_subs): """ Covert amino acid substitutions to be with respect to the reference sequence. diff --git a/multidms/model.py b/multidms/model.py index 291588a..717dfbe 100644 --- a/multidms/model.py +++ b/multidms/model.py @@ -209,7 +209,7 @@ def __init__( epistatic_model=multidms.biophysical.sigmoidal_global_epistasis, output_activation=multidms.biophysical.identity_activation, conditional_shifts=True, - alpha_d=False, # TODO raise issue to be squashed in this PR + alpha_d=False, gamma_corrected=False, PRNGKey=0, init_beta_naught=0.0, @@ -375,6 +375,25 @@ def loss(self) -> float: data = (self.data.training_data["X"], self.data.training_data["y"]) return jax.jit(self.model_components["objective"])(self.params, data, **kwargs) + @property + def conditional_loss(self) -> float: + """Compute loss individually for each condition.""" + kwargs = { + "scale_coeff_ridge_beta": 0.0, + "scale_coeff_ridge_shift": 0.0, + "scale_coeff_ridge_gamma": 0.0, + "scale_ridge_alpha_d": 0.0, + } + + X, y = self.data.training_data["X"], self.data.training_data["y"] + loss_fxn = jax.jit(self.model_components["objective"]) + ret = {} + for condition in self.data.conditions: + condition_data = ({condition: X[condition]}, {condition: y[condition]}) + ret[condition] = float(loss_fxn(self.params, condition_data, **kwargs)) + ret["total"] = sum(ret.values()) + return ret + @property def variants_df(self): """ @@ -546,7 +565,7 @@ def get_mutations_df( return mutations_df[col_order] - def get_df_loss(self, df, error_if_unknown=False, verbose=False): + def get_df_loss(self, df, error_if_unknown=False, verbose=False, conditional=False): """ Get the loss of the model on a given data frame. @@ -563,10 +582,13 @@ def get_df_loss(self, df, error_if_unknown=False, verbose=False): in the loss calculation. If `True`, raise an error. verbose : bool If True, print the number of valid and invalid variants. + conditional : bool + If True, return the loss for each condition as a dictionary. + If False, return the total loss. Returns ------- - float + float or dict The loss of the model on the given data frame. """ substitutions_col = "aa_substitutions" @@ -579,8 +601,11 @@ def get_df_loss(self, df, error_if_unknown=False, verbose=False): if condition_col not in df.columns: raise ValueError("`df` lacks `condition_col` " f"{condition_col}") - X, y = {}, {} + loss_fxn = jax.jit(self.model_components["objective"]) + + ret = {} for condition, condition_df in df.groupby(condition_col): + X, y = {}, {} variant_subs = condition_df[substitutions_col] if condition not in self.data.reference_sequence_conditions: variant_subs = condition_df.apply( @@ -592,14 +617,23 @@ def get_df_loss(self, df, error_if_unknown=False, verbose=False): # build binary variants as csr matrix, make prediction, and append valid, invalid = 0, 0 # row indices of elements that are one - binary_variants = [] + # binary_variants = [] variant_targets = [] + row_ind = [] # row indices of elements that are one + col_ind = [] # column indices of elements that are one for subs, target in zip(variant_subs, condition_df[func_score_col]): try: - binary_variants.append(ref_bmap.sub_str_to_binary(subs)) + # binary_variants.append(ref_bmap.sub_str_to_binary(subs)) + # variant_targets.append(target) + # valid += 1 + + for isub in ref_bmap.sub_str_to_indices(subs): + row_ind.append(valid) + col_ind.append(isub) variant_targets.append(target) valid += 1 + except ValueError: if error_if_unknown: raise ValueError( @@ -615,12 +649,26 @@ def get_df_loss(self, df, error_if_unknown=False, verbose=False): f"{valid}, n invalid variants: {invalid}" ) + # X[condition] = sparse.BCOO.from_scipy_sparse( + # scipy.sparse.csr_matrix(onp.vstack(binary_variants)) + # ) X[condition] = sparse.BCOO.from_scipy_sparse( - scipy.sparse.csr_matrix(onp.vstack(binary_variants)) + scipy.sparse.csr_matrix( + (onp.ones(len(row_ind), dtype="int8"), (row_ind, col_ind)), + shape=(valid, ref_bmap.binarylength), + dtype="int8", + ) ) + y[condition] = jnp.array(variant_targets) - return self.model_components["objective"](self.params, (X, y)) + ret[condition] = float(loss_fxn(self.params, (X, y))) + + ret["total"] = sum(ret.values()) + + if not conditional: + return ret["total"] + return ret def add_phenotypes_to_df( self, diff --git a/multidms/model_collection.py b/multidms/model_collection.py index 678b544..5f2248d 100644 --- a/multidms/model_collection.py +++ b/multidms/model_collection.py @@ -396,8 +396,19 @@ def __init__(self, fit_models): ) all_mutations = set.union(all_mutations, set(fit.data.mutations)) - # add the final training loss to the fit_models dataframe - fit_models["training_loss"] = fit_models.step_loss.apply(lambda x: x[-1]) + # initialize empty columns for conditional loss + fit_models.assign( + **{ + f"{condition}_loss_training": onp.nan + for condition in first_dataset.conditions + }, + total_loss=onp.nan, + ) + # assign coditional loss columns + for idx, fit in fit_models.iterrows(): + conditional_loss = fit.model.conditional_loss + for condition, loss in conditional_loss.items(): + fit_models.loc[idx, f"{condition}_loss_training"] = loss self._site_map_union = site_map_union self._conditions = first_dataset.conditions @@ -432,7 +443,6 @@ def all_mutations(self) -> tuple: """The mutations shared by each fitting dataset.""" return self._all_mutations - # TODO remove verbose everywhere @lru_cache(maxsize=10) def split_apply_combine_muts( self, @@ -482,32 +492,52 @@ def split_apply_combine_muts( A dataframe containing the aggregated mutational parameter values """ print("cache miss - this could take a moment") + queried_fits = ( + self.fit_models.query(query) if query is not None else self.fit_models + ) + if len(queried_fits) == 0: + raise ValueError("invalid query, no fits returned") + if groupby is None: - groupby = tuple( - set(self.fit_models.columns) - - set(["model", "data", "step_loss", "verbose"]) + # groupby = tuple( + # set(queried_fits.columns) + # - set( + # ["model", "dataset_name", "verbose"] + # + [col for col in queried_fits.columns if "loss" in col] + # ) + # ) + ret = ( + pd.concat( + [ + fit["model"].get_mutations_df(return_split=False, **kwargs) + for _, fit in queried_fits.iterrows() + ], + join="inner", # the columns will always match based on class req. + ) + .query( + f"mutation.isin({list(self.shared_mutations)})" + if inner_merge_dataset_muts + else "mutation.notna()" + ) + .groupby("mutation") + .aggregate(aggregate_func) ) + return ret elif isinstance(groupby, str): groupby = tuple([groupby]) elif isinstance(groupby, tuple): - if not all(feature in self.fit_models.columns for feature in groupby): + if not all(feature in queried_fits.columns for feature in groupby): raise ValueError( f"invalid groupby, values must be in {self.fit_models.columns}" ) else: raise ValueError( "invalid groupby, must be tuple with values " - f"in {self.fit_models.columns}" + f"in {queried_fits.columns}" ) - queried_fits = ( - self.fit_models.query(query) if query is not None else self.fit_models - ) - if len(queried_fits) == 0: - raise ValueError("invalid query, no fits returned") - ret = pd.concat( [ pd.concat( @@ -566,20 +596,69 @@ def add_validation_loss(self, test_data, overwrite=False): # check there's a testing dataframe for each unique dataset_name assert set(test_data.keys()) == set(self.fit_models["dataset_name"].unique()) - if "validation_loss" in self.fit_models.columns and not overwrite: + validation_cols_exist = onp.any( + [ + f"{condition}_loss_validation" in self.fit_models.columns + for condition in self.conditions + ] + ) + if validation_cols_exist and not overwrite: raise ValueError( "validation_loss already exists in self.fit_models, set overwrite=True " "to overwrite" ) - self.fit_models["validation_loss"] = onp.nan + self.fit_models = self.fit_models.assign( + **{ + f"{condition}_loss_validation": onp.nan for condition in self.conditions + }, + total_loss_validation=onp.nan, + ) + for idx, fit in self.fit_models.iterrows(): - self.fit_models.loc[idx, "validation_loss"] = fit["model"].get_df_loss( - test_data[fit["dataset_name"]] + condional_df_loss = fit.model.get_df_loss( + test_data[fit["dataset_name"]], conditional=True ) + for condition, loss in condional_df_loss.items(): + self.fit_models.loc[idx, f"{condition}_loss_validation"] = loss return None + def get_conditional_loss_df(self, query=None): + """ + return a long form dataframe with columns + "dataset_name", "scale_coeff_lasso_shift", + "split" ("training" or "validation"), + "loss" (actual value), and "condition". + + Parameters + ---------- + query : str, optional + The query to apply to the fit_models dataframe + before formatting the loss dataframe. The default is None. + """ + if query is not None: + queried_fits = self.fit_models.query(query) + else: + queried_fits = self.fit_models + if len(queried_fits) == 0: + raise ValueError("invalid query, no fits returned") + + id_vars = ["dataset_name", "scale_coeff_lasso_shift"] + value_vars = [ + c for c in queried_fits.columns if "loss" in c and c != "step_loss" + ] + loss_df = queried_fits.melt( + id_vars=id_vars, + value_vars=value_vars, + var_name="condition", + value_name="loss", + ).assign( + split=lambda x: x.condition.str.split("_").str.get(-1), + condition=lambda x: x.condition.str.split("_").str[:-2].str.join("_"), + ) + return loss_df + def mut_param_heatmap( self, query=None, @@ -652,7 +731,11 @@ def mut_param_heatmap( if len(queried_fits) == 0: raise ValueError("invalid query, no fits returned") shouldbe_uniform = list( - set(queried_fits.columns) - set(["model", "dataset_name", "step_loss"]) + set(queried_fits.columns) + - set( + ["model", "dataset_name"] + + [col for col in queried_fits.columns if "loss" in col] + ) ) if len(queried_fits.groupby(list(shouldbe_uniform)).groups) > 1: raise ValueError( @@ -921,9 +1004,6 @@ def mut_type(mut): return "stop" if mut.endswith("*") else "nonsynonymous" # apply, drop, and melt - # TODO This throws deprecation warning - # because of the include_groups argument ... - # set to False, and lose the drop call after ... sparsity_df = ( df.drop(columns=to_throw) .assign(mut_type=lambda x: x.mutation.apply(mut_type)) diff --git a/notebooks/simulation_validation.ipynb b/notebooks/simulation_validation.ipynb index 3c7ea87..81f02f4 100644 --- a/notebooks/simulation_validation.ipynb +++ b/notebooks/simulation_validation.ipynb @@ -3,7 +3,16 @@ { "cell_type": "markdown", "id": "8799e8a1", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.01964, + "end_time": "2024-02-29T00:34:31.527624", + "exception": false, + "start_time": "2024-02-29T00:34:31.507984", + "status": "completed" + }, + "tags": [] + }, "source": [ "# Simulation validation" ] @@ -11,7 +20,16 @@ { "cell_type": "markdown", "id": "04132d8b", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.01352, + "end_time": "2024-02-29T00:34:31.575811", + "exception": false, + "start_time": "2024-02-29T00:34:31.562291", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Overview" ] @@ -19,7 +37,16 @@ { "cell_type": "markdown", "id": "8e60a7c7", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.016545, + "end_time": "2024-02-29T00:34:31.610566", + "exception": false, + "start_time": "2024-02-29T00:34:31.594021", + "status": "completed" + }, + "tags": [] + }, "source": [ "The bloom lab had developed a [pipeline to simulate DMS data](https://jbloomlab.github.io/dms_variants/codonvariant_sim_data.html) to test `dms_variants` global epistasis fitting. Taking inspriation from that, we build upon the pipeline by simulating data for _two_ homologs - introducing shifts in mutational effects at a subset of sites with the goal validating the `multidms` joint-fitting approach.\n", "\n", @@ -36,7 +63,16 @@ { "cell_type": "markdown", "id": "57667722", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.019253, + "end_time": "2024-02-29T00:34:31.644417", + "exception": false, + "start_time": "2024-02-29T00:34:31.625164", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Import `Python` modules" ] @@ -44,7 +80,16 @@ { "cell_type": "markdown", "id": "f45340e3", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.013267, + "end_time": "2024-02-29T00:34:31.673469", + "exception": false, + "start_time": "2024-02-29T00:34:31.660202", + "status": "completed" + }, + "tags": [] + }, "source": [ "\n", "For this analysis, we'll be using the `dms_variants` package for simulation. All dependencies needed can be installed by cloning the `multidms` repo and executing the command:\n", @@ -58,7 +103,22 @@ "cell_type": "code", "execution_count": 1, "id": "945088a4", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:31.706759Z", + "iopub.status.busy": "2024-02-29T00:34:31.706295Z", + "iopub.status.idle": "2024-02-29T00:34:33.798722Z", + "shell.execute_reply": "2024-02-29T00:34:33.798279Z" + }, + "papermill": { + "duration": 2.110239, + "end_time": "2024-02-29T00:34:33.800778", + "exception": false, + "start_time": "2024-02-29T00:34:31.690539", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "import warnings\n", @@ -86,7 +146,16 @@ { "cell_type": "markdown", "id": "713489f3", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.018664, + "end_time": "2024-02-29T00:34:33.840419", + "exception": false, + "start_time": "2024-02-29T00:34:33.821755", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Define notebook parameters" ] @@ -94,7 +163,16 @@ { "cell_type": "markdown", "id": "b8287fde", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.021961, + "end_time": "2024-02-29T00:34:33.884623", + "exception": false, + "start_time": "2024-02-29T00:34:33.862662", + "status": "completed" + }, + "tags": [] + }, "source": [ "Set parameters that define this notebook's behavior. These can all be modifed (using `papermill`) to reasonable values and the notebook should execute as expected." ] @@ -104,6 +182,19 @@ "execution_count": 2, "id": "b7f2abe0", "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:33.914842Z", + "iopub.status.busy": "2024-02-29T00:34:33.913996Z", + "iopub.status.idle": "2024-02-29T00:34:33.923184Z", + "shell.execute_reply": "2024-02-29T00:34:33.922372Z" + }, + "papermill": { + "duration": 0.024895, + "end_time": "2024-02-29T00:34:33.924990", + "exception": false, + "start_time": "2024-02-29T00:34:33.900095", + "status": "completed" + }, "tags": [ "parameters" ] @@ -155,19 +246,69 @@ "csv_output_dir = 'output' # the directory to save the output csv files" ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f34e96af", + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:33.961809Z", + "iopub.status.busy": "2024-02-29T00:34:33.961507Z", + "iopub.status.idle": "2024-02-29T00:34:33.963900Z", + "shell.execute_reply": "2024-02-29T00:34:33.963637Z" + }, + "papermill": { + "duration": 0.019976, + "end_time": "2024-02-29T00:34:33.964906", + "exception": false, + "start_time": "2024-02-29T00:34:33.944930", + "status": "completed" + }, + "tags": [ + "injected-parameters" + ] + }, + "outputs": [], + "source": [ + "# Parameters\n", + "csv_output_dir = \"output/default\"\n" + ] + }, { "cell_type": "markdown", "id": "17c59457", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.016047, + "end_time": "2024-02-29T00:34:34.000845", + "exception": false, + "start_time": "2024-02-29T00:34:33.984798", + "status": "completed" + }, + "tags": [] + }, "source": [ "Set some configurations and a few other more experimental parameters which could break things if not set correctly:" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "034af31c", "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:34.038093Z", + "iopub.status.busy": "2024-02-29T00:34:34.037827Z", + "iopub.status.idle": "2024-02-29T00:34:34.053918Z", + "shell.execute_reply": "2024-02-29T00:34:34.053446Z" + }, + "papermill": { + "duration": 0.036704, + "end_time": "2024-02-29T00:34:34.055604", + "exception": false, + "start_time": "2024-02-29T00:34:34.018900", + "status": "completed" + }, "tags": [] }, "outputs": [], @@ -185,7 +326,16 @@ { "cell_type": "markdown", "id": "cbc66da4", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.017388, + "end_time": "2024-02-29T00:34:34.095292", + "exception": false, + "start_time": "2024-02-29T00:34:34.077904", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Simulate a reference sequence" ] @@ -193,7 +343,16 @@ { "cell_type": "markdown", "id": "7ed89b9e", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.016817, + "end_time": "2024-02-29T00:34:34.130874", + "exception": false, + "start_time": "2024-02-29T00:34:34.114057", + "status": "completed" + }, + "tags": [] + }, "source": [ "\n", "Start by simulating a sequence of nucleotides, then translate it to amino acids. We'll use this sequence as the reference sequence for the simulated datasets." @@ -201,9 +360,24 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "9e58bfbf", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:34.164537Z", + "iopub.status.busy": "2024-02-29T00:34:34.164210Z", + "iopub.status.idle": "2024-02-29T00:34:34.169773Z", + "shell.execute_reply": "2024-02-29T00:34:34.169082Z" + }, + "papermill": { + "duration": 0.023153, + "end_time": "2024-02-29T00:34:34.171065", + "exception": false, + "start_time": "2024-02-29T00:34:34.147912", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -226,7 +400,16 @@ { "cell_type": "markdown", "id": "e11375e7", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.015366, + "end_time": "2024-02-29T00:34:34.207080", + "exception": false, + "start_time": "2024-02-29T00:34:34.191714", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Mutational effects" ] @@ -234,16 +417,40 @@ { "cell_type": "markdown", "id": "9e1589d2", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.016547, + "end_time": "2024-02-29T00:34:34.242433", + "exception": false, + "start_time": "2024-02-29T00:34:34.225886", + "status": "completed" + }, + "tags": [] + }, "source": [ "Simulate latent mutational effects, $\\beta_m$, storing data in a `SigmoidPhenotypeSimulator` object. Also create an identical object for the second homolog. Later, we'll update this object to include shifted mutational effects." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "79557b66", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:34.276235Z", + "iopub.status.busy": "2024-02-29T00:34:34.275941Z", + "iopub.status.idle": "2024-02-29T00:34:34.291830Z", + "shell.execute_reply": "2024-02-29T00:34:34.291565Z" + }, + "papermill": { + "duration": 0.033563, + "end_time": "2024-02-29T00:34:34.293304", + "exception": false, + "start_time": "2024-02-29T00:34:34.259741", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "mut_pheno_args = {\n", @@ -261,16 +468,40 @@ { "cell_type": "markdown", "id": "f5f2e6aa", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.016868, + "end_time": "2024-02-29T00:34:34.332694", + "exception": false, + "start_time": "2024-02-29T00:34:34.315826", + "status": "completed" + }, + "tags": [] + }, "source": [ "Plot the distribution of mutation effects for the first homolog." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "5203a90f", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:34.362927Z", + "iopub.status.busy": "2024-02-29T00:34:34.362671Z", + "iopub.status.idle": "2024-02-29T00:34:34.575359Z", + "shell.execute_reply": "2024-02-29T00:34:34.574976Z" + }, + "papermill": { + "duration": 0.228677, + "end_time": "2024-02-29T00:34:34.577379", + "exception": false, + "start_time": "2024-02-29T00:34:34.348702", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -304,7 +535,16 @@ { "cell_type": "markdown", "id": "9c980f6d", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.018979, + "end_time": "2024-02-29T00:34:34.618714", + "exception": false, + "start_time": "2024-02-29T00:34:34.599735", + "status": "completed" + }, + "tags": [] + }, "source": [ "As expected, our mutational effects are weighted in the negative direction, as we expect the majority of mutations to be deleterious to a protein." ] @@ -312,7 +552,16 @@ { "cell_type": "markdown", "id": "6251707b", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.018351, + "end_time": "2024-02-29T00:34:34.655949", + "exception": false, + "start_time": "2024-02-29T00:34:34.637598", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Non-reference homolog sequence" ] @@ -320,7 +569,16 @@ { "cell_type": "markdown", "id": "607f79be", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.018, + "end_time": "2024-02-29T00:34:34.693018", + "exception": false, + "start_time": "2024-02-29T00:34:34.675018", + "status": "completed" + }, + "tags": [] + }, "source": [ "\n", "Next, we'll simulate the DNA/protein sequence of second homolog by making a defined number of random amino-acid mutations to the first homolog. When choosing the \"bundle\" of mutation which separate the two homologs, we avoid mutations that decrease or increase the latent phenotype by more than `min_muteffect_in_bundle` and `max_muteffect_in_bundle` respectively. This is to ensure that the latent phenotype of the second homolog is not too different from the first homolog, an important assumption for the `multidms` model." @@ -328,9 +586,24 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "14df6b6f", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:34.726794Z", + "iopub.status.busy": "2024-02-29T00:34:34.726517Z", + "iopub.status.idle": "2024-02-29T00:34:34.736592Z", + "shell.execute_reply": "2024-02-29T00:34:34.736099Z" + }, + "papermill": { + "duration": 0.027396, + "end_time": "2024-02-29T00:34:34.737744", + "exception": false, + "start_time": "2024-02-29T00:34:34.710348", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -393,7 +666,16 @@ { "cell_type": "markdown", "id": "01a30920", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.016097, + "end_time": "2024-02-29T00:34:34.775201", + "exception": false, + "start_time": "2024-02-29T00:34:34.759104", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Shifted mutational effects" ] @@ -401,16 +683,40 @@ { "cell_type": "markdown", "id": "e77b42ce", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.016245, + "end_time": "2024-02-29T00:34:34.812209", + "exception": false, + "start_time": "2024-02-29T00:34:34.795964", + "status": "completed" + }, + "tags": [] + }, "source": [ "Next, randomly choose a subset of (identical and non-identical) sites that will have shifted mutational effects. Do this independently for sites that are identical and non-identical between homologs, so that we are sure to have shifted sites in each category." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "0acc8bcb", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:34.844036Z", + "iopub.status.busy": "2024-02-29T00:34:34.843784Z", + "iopub.status.idle": "2024-02-29T00:34:34.849415Z", + "shell.execute_reply": "2024-02-29T00:34:34.848692Z" + }, + "papermill": { + "duration": 0.024576, + "end_time": "2024-02-29T00:34:34.850891", + "exception": false, + "start_time": "2024-02-29T00:34:34.826315", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -446,16 +752,40 @@ { "cell_type": "markdown", "id": "8b993fde", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.013714, + "end_time": "2024-02-29T00:34:34.882214", + "exception": false, + "start_time": "2024-02-29T00:34:34.868500", + "status": "completed" + }, + "tags": [] + }, "source": [ "Next, we'll create a dataframe for all mutational effects and shifts, simulating shifts at each of the above sites by randomly simulate a shift in the effect of each mutation by drawing shifts from a Gaussian distribution:" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "294fd875", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:34.917501Z", + "iopub.status.busy": "2024-02-29T00:34:34.917212Z", + "iopub.status.idle": "2024-02-29T00:34:34.945845Z", + "shell.execute_reply": "2024-02-29T00:34:34.945592Z" + }, + "papermill": { + "duration": 0.047524, + "end_time": "2024-02-29T00:34:34.947590", + "exception": false, + "start_time": "2024-02-29T00:34:34.900066", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -645,7 +975,7 @@ "[200 rows x 8 columns]" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -697,16 +1027,40 @@ { "cell_type": "markdown", "id": "417ea658", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.019569, + "end_time": "2024-02-29T00:34:34.991841", + "exception": false, + "start_time": "2024-02-29T00:34:34.972272", + "status": "completed" + }, + "tags": [] + }, "source": [ "Plot the distribution of all simulated shifts, excluding mutations to stop codons and sites that have no shifted effects." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "a5f6ac40", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:35.026470Z", + "iopub.status.busy": "2024-02-29T00:34:35.026306Z", + "iopub.status.idle": "2024-02-29T00:34:35.153983Z", + "shell.execute_reply": "2024-02-29T00:34:35.153689Z" + }, + "papermill": { + "duration": 0.146479, + "end_time": "2024-02-29T00:34:35.155659", + "exception": false, + "start_time": "2024-02-29T00:34:35.009180", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -745,16 +1099,40 @@ { "cell_type": "markdown", "id": "b9f1e450", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.017814, + "end_time": "2024-02-29T00:34:35.193144", + "exception": false, + "start_time": "2024-02-29T00:34:35.175330", + "status": "completed" + }, + "tags": [] + }, "source": [ "We can also plot the shifts as a heatmap, and mark the wildtype amino acid at each site (`x` for reference, `o` for non-reference):" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "77d40b6b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:35.233727Z", + "iopub.status.busy": "2024-02-29T00:34:35.233424Z", + "iopub.status.idle": "2024-02-29T00:34:35.676299Z", + "shell.execute_reply": "2024-02-29T00:34:35.675960Z" + }, + "papermill": { + "duration": 0.468511, + "end_time": "2024-02-29T00:34:35.678533", + "exception": false, + "start_time": "2024-02-29T00:34:35.210022", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -830,16 +1208,40 @@ { "cell_type": "markdown", "id": "4cd2316d", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.020643, + "end_time": "2024-02-29T00:34:35.720846", + "exception": false, + "start_time": "2024-02-29T00:34:35.700203", + "status": "completed" + }, + "tags": [] + }, "source": [ "Recal that we have already created a `SigmoidPhenotypeSimulator` object for the second homolog by making a copy of the one for the first homolog. This object stores the homolog's wildtype latent phenotype and the latent effects of individual mutations. Below, we update both of these traits based on the simulated shifts from above. To update the wildtype latent phenotype of the second homolog, we add the effects of all mutations that separate the homologs. In computing this sum, we will use $\\beta$ parameters for the second homolog, which already include shifted effects." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "69e499bf", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:35.761637Z", + "iopub.status.busy": "2024-02-29T00:34:35.761329Z", + "iopub.status.idle": "2024-02-29T00:34:35.955349Z", + "shell.execute_reply": "2024-02-29T00:34:35.955003Z" + }, + "papermill": { + "duration": 0.21662, + "end_time": "2024-02-29T00:34:35.957432", + "exception": false, + "start_time": "2024-02-29T00:34:35.740812", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -1059,7 +1461,7 @@ "976 L V True " ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1086,7 +1488,16 @@ { "cell_type": "markdown", "id": "1a626b8c", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.019498, + "end_time": "2024-02-29T00:34:35.996809", + "exception": false, + "start_time": "2024-02-29T00:34:35.977311", + "status": "completed" + }, + "tags": [] + }, "source": [ "At this point, all mutations in the second homolog's `SigmoidPhenotypeSimulator` object are defined relative to the wildtype sequence of the first homolog, which we will call the \"reference\" homolog. However, to compute phenotypes for the second homolog, we still need to assign the effects of mutations at non-identical sites in the \"non-reference\" homolog. For instance, if the wildtype amino acid at site 30 is an A in the reference homolog, but is a Y in a non-reference homolog, then the effect of a Y30G is absent from the second homolog's simulator object. \n", "\n", @@ -1111,16 +1522,40 @@ { "cell_type": "markdown", "id": "c9aa946c", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.020013, + "end_time": "2024-02-29T00:34:36.037929", + "exception": false, + "start_time": "2024-02-29T00:34:36.017916", + "status": "completed" + }, + "tags": [] + }, "source": [ "The below cell adds mutational effects for missing entries using the above strategy:" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "b309408d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:36.078327Z", + "iopub.status.busy": "2024-02-29T00:34:36.077965Z", + "iopub.status.idle": "2024-02-29T00:34:36.119848Z", + "shell.execute_reply": "2024-02-29T00:34:36.119548Z" + }, + "papermill": { + "duration": 0.064156, + "end_time": "2024-02-29T00:34:36.121268", + "exception": false, + "start_time": "2024-02-29T00:34:36.057112", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "# iterate over \"bundle\" of mutations which disguish the homologs\n", @@ -1154,9 +1589,24 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "e395986e", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:36.165360Z", + "iopub.status.busy": "2024-02-29T00:34:36.164994Z", + "iopub.status.idle": "2024-02-29T00:34:36.182367Z", + "shell.execute_reply": "2024-02-29T00:34:36.182101Z" + }, + "papermill": { + "duration": 0.039501, + "end_time": "2024-02-29T00:34:36.183903", + "exception": false, + "start_time": "2024-02-29T00:34:36.144402", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -1283,7 +1733,7 @@ "4 G G False " ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1296,7 +1746,16 @@ { "cell_type": "markdown", "id": "c12e0498", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.019956, + "end_time": "2024-02-29T00:34:36.227486", + "exception": false, + "start_time": "2024-02-29T00:34:36.207530", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Variant libraries" ] @@ -1304,7 +1763,16 @@ { "cell_type": "markdown", "id": "bd558e2b", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.021956, + "end_time": "2024-02-29T00:34:36.270990", + "exception": false, + "start_time": "2024-02-29T00:34:36.249034", + "status": "completed" + }, + "tags": [] + }, "source": [ "\n", "Simulate a set of variant libraries that one might use in an actual experiment for each homolog, each with two replicate libraries." @@ -1312,9 +1780,24 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "761b1f99", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:36.313412Z", + "iopub.status.busy": "2024-02-29T00:34:36.312949Z", + "iopub.status.idle": "2024-02-29T00:34:38.616563Z", + "shell.execute_reply": "2024-02-29T00:34:38.616133Z" + }, + "papermill": { + "duration": 2.327804, + "end_time": "2024-02-29T00:34:38.618392", + "exception": false, + "start_time": "2024-02-29T00:34:36.290588", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "libs = [\"lib_1\", \"lib_2\"] # distinct libraries of gene\n", @@ -1340,9 +1823,24 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "b31c079b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:38.664028Z", + "iopub.status.busy": "2024-02-29T00:34:38.663736Z", + "iopub.status.idle": "2024-02-29T00:34:38.758277Z", + "shell.execute_reply": "2024-02-29T00:34:38.757601Z" + }, + "papermill": { + "duration": 0.118092, + "end_time": "2024-02-29T00:34:38.759804", + "exception": false, + "start_time": "2024-02-29T00:34:38.641712", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -1400,7 +1898,7 @@ "2 all libraries barcoded variants 50000" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1412,16 +1910,40 @@ { "cell_type": "markdown", "id": "9b238384", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.021013, + "end_time": "2024-02-29T00:34:38.803023", + "exception": false, + "start_time": "2024-02-29T00:34:38.782010", + "status": "completed" + }, + "tags": [] + }, "source": [ "plot the number of variant support sequences as a histogram" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "2b6906d2", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:38.847375Z", + "iopub.status.busy": "2024-02-29T00:34:38.847070Z", + "iopub.status.idle": "2024-02-29T00:34:39.209162Z", + "shell.execute_reply": "2024-02-29T00:34:39.208707Z" + }, + "papermill": { + "duration": 0.385653, + "end_time": "2024-02-29T00:34:39.210796", + "exception": false, + "start_time": "2024-02-29T00:34:38.825143", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -1451,9 +1973,24 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "fd6df84d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:39.254329Z", + "iopub.status.busy": "2024-02-29T00:34:39.254187Z", + "iopub.status.idle": "2024-02-29T00:34:39.344295Z", + "shell.execute_reply": "2024-02-29T00:34:39.343724Z" + }, + "papermill": { + "duration": 0.112759, + "end_time": "2024-02-29T00:34:39.345974", + "exception": false, + "start_time": "2024-02-29T00:34:39.233215", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -1511,7 +2048,7 @@ "2 all libraries barcoded variants 50000" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1522,9 +2059,24 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "a277a3b6", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:39.394173Z", + "iopub.status.busy": "2024-02-29T00:34:39.393554Z", + "iopub.status.idle": "2024-02-29T00:34:39.746048Z", + "shell.execute_reply": "2024-02-29T00:34:39.745636Z" + }, + "papermill": { + "duration": 0.377287, + "end_time": "2024-02-29T00:34:39.746980", + "exception": false, + "start_time": "2024-02-29T00:34:39.369693", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -1555,7 +2107,16 @@ { "cell_type": "markdown", "id": "b7e46507", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.020609, + "end_time": "2024-02-29T00:34:39.788453", + "exception": false, + "start_time": "2024-02-29T00:34:39.767844", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Variant phenotypes and enrichments" ] @@ -1563,7 +2124,16 @@ { "cell_type": "markdown", "id": "314bacc6", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.02437, + "end_time": "2024-02-29T00:34:39.836671", + "exception": false, + "start_time": "2024-02-29T00:34:39.812301", + "status": "completed" + }, + "tags": [] + }, "source": [ "Now that we have simulated libraries of variants $v_i \\in V$, we'll assign their respective ground truth phenotypes and enrichments. \n", "\n", @@ -1572,9 +2142,24 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "852d78df", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:39.884471Z", + "iopub.status.busy": "2024-02-29T00:34:39.884339Z", + "iopub.status.idle": "2024-02-29T00:34:39.964207Z", + "shell.execute_reply": "2024-02-29T00:34:39.963714Z" + }, + "papermill": { + "duration": 0.105259, + "end_time": "2024-02-29T00:34:39.965624", + "exception": false, + "start_time": "2024-02-29T00:34:39.860365", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -1646,7 +2231,7 @@ "4 lib_1 5.000000" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1671,7 +2256,16 @@ { "cell_type": "markdown", "id": "cf40ef6d", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.022011, + "end_time": "2024-02-29T00:34:40.015755", + "exception": false, + "start_time": "2024-02-29T00:34:39.993744", + "status": "completed" + }, + "tags": [] + }, "source": [ "Next, we can compute an _observed phenotype_, $p(v, d)$ = $g_{\\theta}(\\phi(v, d))$, where $g_{\\theta}$ is the global epistasis function implimented as a flexible sigmoid with two free parameters, $\\theta_{s}$ and $\\theta_{b}$, that serve as a scale and bias, respectively. Concretely, $g_{\\theta}(z) = \\theta_{b} + \\frac{\\theta_{s}}{1 + e^{-z}}$. For the purposes of this analysis, we compute a fixed value for the $\\theta_{b}$ parameter, based upon the specified parameters for $\\theta_{s}$ (`sigmoid_phenotype_scale`) and $\\phi(v_{WT})$ (`wt_latent`) such that the reference wildtype phenotype is exactly 0. \n", "\n", @@ -1680,9 +2274,24 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "63735ecc", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:40.060380Z", + "iopub.status.busy": "2024-02-29T00:34:40.060259Z", + "iopub.status.idle": "2024-02-29T00:34:40.063323Z", + "shell.execute_reply": "2024-02-29T00:34:40.062801Z" + }, + "papermill": { + "duration": 0.027502, + "end_time": "2024-02-29T00:34:40.064409", + "exception": false, + "start_time": "2024-02-29T00:34:40.036907", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "# define numpy wrapper for multidms native jax sigmoid fxn\n", @@ -1696,16 +2305,40 @@ { "cell_type": "markdown", "id": "2da19ec8", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.023566, + "end_time": "2024-02-29T00:34:40.114000", + "exception": false, + "start_time": "2024-02-29T00:34:40.090434", + "status": "completed" + }, + "tags": [] + }, "source": [ "Visualize the sigmoidal function that we'll be using to map latent phenotypes to observed phenotypes. Place vertical line at the reference (\"h1\") wildtype latent phenotype, as well as a dashed line for the non-reference (\"h2\") phenotype:" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "a0ff09be", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:40.166908Z", + "iopub.status.busy": "2024-02-29T00:34:40.166584Z", + "iopub.status.idle": "2024-02-29T00:34:40.303966Z", + "shell.execute_reply": "2024-02-29T00:34:40.303330Z" + }, + "papermill": { + "duration": 0.163939, + "end_time": "2024-02-29T00:34:40.305255", + "exception": false, + "start_time": "2024-02-29T00:34:40.141316", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -1747,16 +2380,40 @@ { "cell_type": "markdown", "id": "f0ad9343", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.02341, + "end_time": "2024-02-29T00:34:40.350718", + "exception": false, + "start_time": "2024-02-29T00:34:40.327308", + "status": "completed" + }, + "tags": [] + }, "source": [ "Next, assign each barcoded variant an _observed phenotype_, $p(v, d)$, as well as an _observed enrichment_ , $2^{p(v, d)}$. " ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "1f5eba40", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:40.397368Z", + "iopub.status.busy": "2024-02-29T00:34:40.397248Z", + "iopub.status.idle": "2024-02-29T00:34:40.892669Z", + "shell.execute_reply": "2024-02-29T00:34:40.891992Z" + }, + "papermill": { + "duration": 0.521204, + "end_time": "2024-02-29T00:34:40.894389", + "exception": false, + "start_time": "2024-02-29T00:34:40.373185", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -1847,7 +2504,7 @@ "4 0.000000 1.000000 h1 " ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1885,16 +2542,40 @@ { "cell_type": "markdown", "id": "f15c56b0", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.024508, + "end_time": "2024-02-29T00:34:40.943840", + "exception": false, + "start_time": "2024-02-29T00:34:40.919332", + "status": "completed" + }, + "tags": [] + }, "source": [ "Note that The enrichment score transforms phenotypes to be between 0 and 1, and can thus be used to compute the number of counts for each variant in the post-selection library using a multinomial distribution where the probability of observing a variant is proportional to its enrichment. Below, we plot each variant's latent phenotype against its observed enrichment:" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "id": "1515dc7e", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:40.993815Z", + "iopub.status.busy": "2024-02-29T00:34:40.993563Z", + "iopub.status.idle": "2024-02-29T00:34:42.946872Z", + "shell.execute_reply": "2024-02-29T00:34:42.946186Z" + }, + "papermill": { + "duration": 1.978901, + "end_time": "2024-02-29T00:34:42.948622", + "exception": false, + "start_time": "2024-02-29T00:34:40.969721", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -1944,7 +2625,16 @@ { "cell_type": "markdown", "id": "277a1047", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.023193, + "end_time": "2024-02-29T00:34:42.996279", + "exception": false, + "start_time": "2024-02-29T00:34:42.973086", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Pre and post-selection variant read counts" ] @@ -1952,16 +2642,40 @@ { "cell_type": "markdown", "id": "5d55dc2a", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.024516, + "end_time": "2024-02-29T00:34:43.045381", + "exception": false, + "start_time": "2024-02-29T00:34:43.020865", + "status": "completed" + }, + "tags": [] + }, "source": [ "Next, we will simulate both pre and post-selection counts of each variant subjected to an experimental selection step. Post-selection counts will be simulated by applying the specified selection bottleneck(s) to the pre-selection counts. To do this, we will use the functionality provided by [dms_variants.simulate.simulateSampleCounts](https://jbloomlab.github.io/dms_variants/dms_variants.simulate.html#dms_variants.simulate.simulateSampleCounts) and feed it our custom phenotype function." ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "id": "0096b542", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:43.094942Z", + "iopub.status.busy": "2024-02-29T00:34:43.094662Z", + "iopub.status.idle": "2024-02-29T00:34:44.129174Z", + "shell.execute_reply": "2024-02-29T00:34:44.128408Z" + }, + "papermill": { + "duration": 1.061395, + "end_time": "2024-02-29T00:34:44.131065", + "exception": false, + "start_time": "2024-02-29T00:34:43.069670", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "bottlenecks = {\n", @@ -2001,16 +2715,40 @@ { "cell_type": "markdown", "id": "3666f515", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.022819, + "end_time": "2024-02-29T00:34:44.177798", + "exception": false, + "start_time": "2024-02-29T00:34:44.154979", + "status": "completed" + }, + "tags": [] + }, "source": [ "Plot the number of counts for each variant in each sample. The horizontal dashed line shows the total number of variants. The plot shows that all variants are well-sampled in the pre-selection libraries, but that post- selection some variants are sampled more or less. This is expected since selection will decrease and increase the frequency of variants:" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "id": "4d7a084e", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:44.225390Z", + "iopub.status.busy": "2024-02-29T00:34:44.225220Z", + "iopub.status.idle": "2024-02-29T00:34:46.194709Z", + "shell.execute_reply": "2024-02-29T00:34:46.193998Z" + }, + "papermill": { + "duration": 1.995058, + "end_time": "2024-02-29T00:34:46.196079", + "exception": false, + "start_time": "2024-02-29T00:34:44.201021", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2058,16 +2796,40 @@ { "cell_type": "markdown", "id": "25b05ba7", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.029105, + "end_time": "2024-02-29T00:34:46.255449", + "exception": false, + "start_time": "2024-02-29T00:34:46.226344", + "status": "completed" + }, + "tags": [] + }, "source": [ "Distribution of the number of amino-acid mutations per variant in each sample. As expected, mutations go down after selection:" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "id": "313299ff", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:46.311928Z", + "iopub.status.busy": "2024-02-29T00:34:46.311669Z", + "iopub.status.idle": "2024-02-29T00:34:47.747319Z", + "shell.execute_reply": "2024-02-29T00:34:47.746616Z" + }, + "papermill": { + "duration": 1.467702, + "end_time": "2024-02-29T00:34:47.748975", + "exception": false, + "start_time": "2024-02-29T00:34:46.281273", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2112,16 +2874,40 @@ { "cell_type": "markdown", "id": "4892952d", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.03045, + "end_time": "2024-02-29T00:34:47.808058", + "exception": false, + "start_time": "2024-02-29T00:34:47.777608", + "status": "completed" + }, + "tags": [] + }, "source": [ "Plot how thoroughly amino-acid mutations are sampled. The plots below show that the stop mutations are sampled very poorly post-selection because they are eliminated during selection:" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "id": "ad6573dd", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:47.868586Z", + "iopub.status.busy": "2024-02-29T00:34:47.868451Z", + "iopub.status.idle": "2024-02-29T00:34:50.454388Z", + "shell.execute_reply": "2024-02-29T00:34:50.453659Z" + }, + "papermill": { + "duration": 2.62257, + "end_time": "2024-02-29T00:34:50.456781", + "exception": false, + "start_time": "2024-02-29T00:34:47.834211", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2167,7 +2953,16 @@ { "cell_type": "markdown", "id": "22a8c780", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.032857, + "end_time": "2024-02-29T00:34:50.526797", + "exception": false, + "start_time": "2024-02-29T00:34:50.493940", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Prep training data" ] @@ -2175,7 +2970,16 @@ { "cell_type": "markdown", "id": "5128374f", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.034289, + "end_time": "2024-02-29T00:34:50.595913", + "exception": false, + "start_time": "2024-02-29T00:34:50.561624", + "status": "completed" + }, + "tags": [] + }, "source": [ "Prepare the training dataframes for fitting our `multidms` models. \n", "\n", @@ -2191,16 +2995,40 @@ { "cell_type": "markdown", "id": "8b3b29a4", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.032182, + "end_time": "2024-02-29T00:34:50.662153", + "exception": false, + "start_time": "2024-02-29T00:34:50.629971", + "status": "completed" + }, + "tags": [] + }, "source": [ "Start by creating training data with ground truth phenotype target. Because the barcode replicates share ground truth phenotypes we can can collapse the counts accross replicates by simple dropping duplicates." ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "id": "68ca3401", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:50.731814Z", + "iopub.status.busy": "2024-02-29T00:34:50.731492Z", + "iopub.status.idle": "2024-02-29T00:34:50.777719Z", + "shell.execute_reply": "2024-02-29T00:34:50.777037Z" + }, + "papermill": { + "duration": 0.082835, + "end_time": "2024-02-29T00:34:50.779277", + "exception": false, + "start_time": "2024-02-29T00:34:50.696442", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2291,7 +3119,7 @@ "4 observed_phenotype 0.00 " ] }, - "execution_count": 29, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -2322,16 +3150,40 @@ { "cell_type": "markdown", "id": "a02331a7", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.03401, + "end_time": "2024-02-29T00:34:50.845702", + "exception": false, + "start_time": "2024-02-29T00:34:50.811692", + "status": "completed" + }, + "tags": [] + }, "source": [ "Next, compute functional scores from pre-post counts in each bottleneck _after_ aggregating the barcode replicate counts for unique variants. The [dms_variants.CodonVariantTable.func_scores](https://jbloomlab.github.io/dms_variants/dms_variants.codonvarianttable.html#dms_variants.codonvarianttable.CodonVariantTable.func_scores) method provides the ability to compute functional scores from pre and post-selection counts. We'll use this to compute functional scores for each of the three targets:" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "id": "0cbf0498", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:50.918795Z", + "iopub.status.busy": "2024-02-29T00:34:50.918493Z", + "iopub.status.idle": "2024-02-29T00:34:51.288193Z", + "shell.execute_reply": "2024-02-29T00:34:51.287481Z" + }, + "papermill": { + "duration": 0.410818, + "end_time": "2024-02-29T00:34:51.289337", + "exception": false, + "start_time": "2024-02-29T00:34:50.878519", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2415,7 +3267,7 @@ "4 lib_1 h1 F18T Q21E C44T loose_bottle -5.313711" ] }, - "execution_count": 30, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -2440,16 +3292,40 @@ { "cell_type": "markdown", "id": "da591a0c", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.031903, + "end_time": "2024-02-29T00:34:51.354206", + "exception": false, + "start_time": "2024-02-29T00:34:51.322303", + "status": "completed" + }, + "tags": [] + }, "source": [ "Finally, combine the two dataframes computed above and classify the variants based on the number of amino acid substitutions." ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "id": "139d5de1", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:51.427369Z", + "iopub.status.busy": "2024-02-29T00:34:51.427097Z", + "iopub.status.idle": "2024-02-29T00:34:51.607260Z", + "shell.execute_reply": "2024-02-29T00:34:51.606549Z" + }, + "papermill": { + "duration": 0.22054, + "end_time": "2024-02-29T00:34:51.608854", + "exception": false, + "start_time": "2024-02-29T00:34:51.388314", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2627,7 +3503,7 @@ "[181368 rows x 7 columns]" ] }, - "execution_count": 31, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -2668,16 +3544,40 @@ { "cell_type": "markdown", "id": "afa1c65b", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.037371, + "end_time": "2024-02-29T00:34:51.679191", + "exception": false, + "start_time": "2024-02-29T00:34:51.641820", + "status": "completed" + }, + "tags": [] + }, "source": [ "Let's plot the functional scores as a function of the latent phenotype." ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "id": "3decc114", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:51.751797Z", + "iopub.status.busy": "2024-02-29T00:34:51.751491Z", + "iopub.status.idle": "2024-02-29T00:34:55.740728Z", + "shell.execute_reply": "2024-02-29T00:34:55.740123Z" + }, + "papermill": { + "duration": 4.028516, + "end_time": "2024-02-29T00:34:55.741750", + "exception": false, + "start_time": "2024-02-29T00:34:51.713234", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2709,16 +3609,40 @@ { "cell_type": "markdown", "id": "ec130fc0", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.037233, + "end_time": "2024-02-29T00:34:55.815845", + "exception": false, + "start_time": "2024-02-29T00:34:55.778612", + "status": "completed" + }, + "tags": [] + }, "source": [ "Plot a pairplot to see how targets compare." ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "id": "fe7a3a85", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:55.889730Z", + "iopub.status.busy": "2024-02-29T00:34:55.889351Z", + "iopub.status.idle": "2024-02-29T00:34:57.289238Z", + "shell.execute_reply": "2024-02-29T00:34:57.288482Z" + }, + "papermill": { + "duration": 1.4391, + "end_time": "2024-02-29T00:34:57.290703", + "exception": false, + "start_time": "2024-02-29T00:34:55.851603", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2760,7 +3684,16 @@ { "cell_type": "markdown", "id": "72ffeb89", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.038115, + "end_time": "2024-02-29T00:34:57.371219", + "exception": false, + "start_time": "2024-02-29T00:34:57.333104", + "status": "completed" + }, + "tags": [] + }, "source": [ "We can see here that the tight bottleneck indeed introduces more noise in the data, as the functional scores are less correlated with the ground truth phenotype. " ] @@ -2768,16 +3701,40 @@ { "cell_type": "markdown", "id": "1654b229", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.040508, + "end_time": "2024-02-29T00:34:57.451729", + "exception": false, + "start_time": "2024-02-29T00:34:57.411221", + "status": "completed" + }, + "tags": [] + }, "source": [ "Plot the functional scores distributions of the bottleneck counts-computed functional scores. Recall that we collapsed barcode replicates so there is only a single wildtype phenotype in each of the datasets, thus we will not plot them below." ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "id": "a3ee3b92", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:34:57.534454Z", + "iopub.status.busy": "2024-02-29T00:34:57.533890Z", + "iopub.status.idle": "2024-02-29T00:35:01.155312Z", + "shell.execute_reply": "2024-02-29T00:35:01.154930Z" + }, + "papermill": { + "duration": 3.666169, + "end_time": "2024-02-29T00:35:01.158538", + "exception": false, + "start_time": "2024-02-29T00:34:57.492369", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2818,9 +3775,24 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "id": "04b83d84", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:35:01.237511Z", + "iopub.status.busy": "2024-02-29T00:35:01.237192Z", + "iopub.status.idle": "2024-02-29T00:35:01.679453Z", + "shell.execute_reply": "2024-02-29T00:35:01.679140Z" + }, + "papermill": { + "duration": 0.486032, + "end_time": "2024-02-29T00:35:01.681451", + "exception": false, + "start_time": "2024-02-29T00:35:01.195419", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2923,7 +3895,7 @@ "4 observed_phenotype 0.00 wildtype 5.00 " ] }, - "execution_count": 35, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2936,7 +3908,16 @@ { "cell_type": "markdown", "id": "deba1ea6", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.046178, + "end_time": "2024-02-29T00:35:01.774821", + "exception": false, + "start_time": "2024-02-29T00:35:01.728643", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Create `Data` objects for each library replicate, and bottleneck" ] @@ -2944,16 +3925,40 @@ { "cell_type": "markdown", "id": "36343df0", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.046487, + "end_time": "2024-02-29T00:35:01.866196", + "exception": false, + "start_time": "2024-02-29T00:35:01.819709", + "status": "completed" + }, + "tags": [] + }, "source": [ "`multidms` model fitting is generally a two-step process: (1) Create `Data` objects from functional score dataframes which, among other things, encode the variant data into one-hot encoded matrices, and (2) fit a collection of models across a grid of `Data` objects and hyperparameters. Here, we'll create `Data` objects for each library replicate / fitting target combination:" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "id": "1aa7536f", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:35:01.958269Z", + "iopub.status.busy": "2024-02-29T00:35:01.957940Z", + "iopub.status.idle": "2024-02-29T00:35:50.017741Z", + "shell.execute_reply": "2024-02-29T00:35:50.016984Z" + }, + "papermill": { + "duration": 48.156805, + "end_time": "2024-02-29T00:35:50.064907", + "exception": false, + "start_time": "2024-02-29T00:35:01.908102", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -2966,7 +3971,7 @@ " Data(lib_2_tight_bottle_func_score)]" ] }, - "execution_count": 36, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -2989,7 +3994,16 @@ { "cell_type": "markdown", "id": "9fe48f02", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.046998, + "end_time": "2024-02-29T00:35:50.155194", + "exception": false, + "start_time": "2024-02-29T00:35:50.108196", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Fit models to training data (lasso sweep)" ] @@ -2997,16 +4011,40 @@ { "cell_type": "markdown", "id": "861fad04", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.042824, + "end_time": "2024-02-29T00:35:50.242728", + "exception": false, + "start_time": "2024-02-29T00:35:50.199904", + "status": "completed" + }, + "tags": [] + }, "source": [ "Next, we'll fit a set of models to each of the datasets defined above using the `multidms.fit_models` [function](https://matsengrp.github.io/multidms/multidms.model_collection.html#multidms.model_collection.fit_models). For each dataset, we independently fit models with a number of different lasso penalty coefficients ($\\lambda_{L1}$) as defined at the top of this notebook. " ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "id": "12c0c274", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:35:50.336699Z", + "iopub.status.busy": "2024-02-29T00:35:50.336373Z", + "iopub.status.idle": "2024-02-29T00:35:50.342749Z", + "shell.execute_reply": "2024-02-29T00:35:50.341951Z" + }, + "papermill": { + "duration": 0.053019, + "end_time": "2024-02-29T00:35:50.344603", + "exception": false, + "start_time": "2024-02-29T00:35:50.291584", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -3050,16 +4088,40 @@ { "cell_type": "markdown", "id": "61f4b640", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.045571, + "end_time": "2024-02-29T00:35:50.433647", + "exception": false, + "start_time": "2024-02-29T00:35:50.388076", + "status": "completed" + }, + "tags": [] + }, "source": [ "Fit the models:" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "id": "c013b7c0", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:35:50.524016Z", + "iopub.status.busy": "2024-02-29T00:35:50.523724Z", + "iopub.status.idle": "2024-02-29T00:49:16.116645Z", + "shell.execute_reply": "2024-02-29T00:49:16.116038Z" + }, + "papermill": { + "duration": 805.642944, + "end_time": "2024-02-29T00:49:16.118829", + "exception": false, + "start_time": "2024-02-29T00:35:50.475885", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "_, _, fit_collection_df = multidms.model_collection.fit_models(fitting_params)" @@ -3068,7 +4130,16 @@ { "cell_type": "markdown", "id": "9eca7abf", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.043496, + "end_time": "2024-02-29T00:49:16.210350", + "exception": false, + "start_time": "2024-02-29T00:49:16.166854", + "status": "completed" + }, + "tags": [] + }, "source": [ "\n", "Note that the return type of [multidms.fit_models](https://matsengrp.github.io/multidms/multidms.html#multidms.fit_models) is a `tuple(int, int, pd.DataFrame)` where the first two values are the number of fits that were successful and the number that failed, respectively, and the third value is a dataframe where each row contains a fit `Model` object and the hyperparameters used to fit it." @@ -3076,9 +4147,24 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "id": "2592665c", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:16.296274Z", + "iopub.status.busy": "2024-02-29T00:49:16.295943Z", + "iopub.status.idle": "2024-02-29T00:49:16.306566Z", + "shell.execute_reply": "2024-02-29T00:49:16.306068Z" + }, + "papermill": { + "duration": 0.055081, + "end_time": "2024-02-29T00:49:16.308174", + "exception": false, + "start_time": "2024-02-29T00:49:16.253093", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -3150,7 +4236,7 @@ "4 Model(Model-0) lib_1_observed_phenotype_func_score 0.00004" ] }, - "execution_count": 39, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -3162,16 +4248,40 @@ { "cell_type": "markdown", "id": "3a1338c2", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.043755, + "end_time": "2024-02-29T00:49:16.392736", + "exception": false, + "start_time": "2024-02-29T00:49:16.348981", + "status": "completed" + }, + "tags": [] + }, "source": [ "Add a few helpful features to this dataframe for plotting by splitting the \"dataset_name\" (name of the Data Object that was used for fitting) into more understandable columns:" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "id": "e2dd2e22", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:16.481880Z", + "iopub.status.busy": "2024-02-29T00:49:16.481623Z", + "iopub.status.idle": "2024-02-29T00:49:16.490200Z", + "shell.execute_reply": "2024-02-29T00:49:16.489717Z" + }, + "papermill": { + "duration": 0.055211, + "end_time": "2024-02-29T00:49:16.491884", + "exception": false, + "start_time": "2024-02-29T00:49:16.436673", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -3249,7 +4359,7 @@ "4 Model(Model-0) observed_phenotype lib_1 0.00004" ] }, - "execution_count": 40, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -3273,7 +4383,16 @@ { "cell_type": "markdown", "id": "0210ef48", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.043084, + "end_time": "2024-02-29T00:49:16.577660", + "exception": false, + "start_time": "2024-02-29T00:49:16.534576", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Model vs. truth mutational effects" ] @@ -3281,16 +4400,40 @@ { "cell_type": "markdown", "id": "225dbcd5", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.042994, + "end_time": "2024-02-29T00:49:16.665056", + "exception": false, + "start_time": "2024-02-29T00:49:16.622062", + "status": "completed" + }, + "tags": [] + }, "source": [ "The `multidms.ModelCollection` takes a a dataframe such as the one created above and provides a few helpful methods for model selection and aggregation. We'll use these methods to evaluate the fits and select the best model for each dataset." ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "id": "230f40d8", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:16.751780Z", + "iopub.status.busy": "2024-02-29T00:49:16.751533Z", + "iopub.status.idle": "2024-02-29T00:49:16.757923Z", + "shell.execute_reply": "2024-02-29T00:49:16.757432Z" + }, + "papermill": { + "duration": 0.050271, + "end_time": "2024-02-29T00:49:16.759548", + "exception": false, + "start_time": "2024-02-29T00:49:16.709277", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "model_collection = multidms.model_collection.ModelCollection(fit_collection_df)" @@ -3299,16 +4442,40 @@ { "cell_type": "markdown", "id": "b7b072da", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.042849, + "end_time": "2024-02-29T00:49:16.844277", + "exception": false, + "start_time": "2024-02-29T00:49:16.801428", + "status": "completed" + }, + "tags": [] + }, "source": [ "Get the mutational parameters for each of the models using `model_collection.split_apply_combine_muts()` [method](https://matsengrp.github.io/multidms/multidms.model_collection.html#multidms.model_collection.ModelCollection.split_apply_combine_muts), and merge them with the simulated ground truth parameters:" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "id": "7ab1830e", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:16.937391Z", + "iopub.status.busy": "2024-02-29T00:49:16.937106Z", + "iopub.status.idle": "2024-02-29T00:49:24.067185Z", + "shell.execute_reply": "2024-02-29T00:49:24.066474Z" + }, + "papermill": { + "duration": 7.179912, + "end_time": "2024-02-29T00:49:24.068926", + "exception": false, + "start_time": "2024-02-29T00:49:16.889014", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -3412,7 +4579,7 @@ "4 0.000498 0.0 " ] }, - "execution_count": 42, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -3445,22 +4612,47 @@ " )\n", ")\n", "assert collection_muts_df.shape[0] == len(mut_effects_df) * len(fit_collection_df)\n", + "collection_muts_df.to_csv(f\"{csv_output_dir}/collection_muts.csv\", index=False)\n", "collection_muts_df[[\"mutation\", \"library\", \"measurement_type\", \"scale_coeff_lasso_shift\", \"predicted_shift_h2\", \"true_shift\"]].head()" ] }, { "cell_type": "markdown", "id": "366e5adc", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.040963, + "end_time": "2024-02-29T00:49:24.152801", + "exception": false, + "start_time": "2024-02-29T00:49:24.111838", + "status": "completed" + }, + "tags": [] + }, "source": [ "To compare model fits parameter values to the simulated ground truth values, add mean squared error and pearsonr metrics to the `ModelCollection.fit_models` attribute:" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 44, "id": "86d6e7d8", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:24.243704Z", + "iopub.status.busy": "2024-02-29T00:49:24.243393Z", + "iopub.status.idle": "2024-02-29T00:49:24.291209Z", + "shell.execute_reply": "2024-02-29T00:49:24.290553Z" + }, + "papermill": { + "duration": 0.093765, + "end_time": "2024-02-29T00:49:24.292844", + "exception": false, + "start_time": "2024-02-29T00:49:24.199079", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -3563,7 +4755,7 @@ "4 0.978463 0.000831 " ] }, - "execution_count": 43, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -3604,16 +4796,40 @@ { "cell_type": "markdown", "id": "617da09d", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.040379, + "end_time": "2024-02-29T00:49:24.375825", + "exception": false, + "start_time": "2024-02-29T00:49:24.335446", + "status": "completed" + }, + "tags": [] + }, "source": [ "Next, we'll make some summary plots with the predicted vs. simulated ground MSE truth parameters across model fits" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "id": "abc9b066", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:24.462845Z", + "iopub.status.busy": "2024-02-29T00:49:24.462549Z", + "iopub.status.idle": "2024-02-29T00:49:25.125744Z", + "shell.execute_reply": "2024-02-29T00:49:25.124817Z" + }, + "papermill": { + "duration": 0.711374, + "end_time": "2024-02-29T00:49:25.127331", + "exception": false, + "start_time": "2024-02-29T00:49:24.415957", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -3708,9 +4924,24 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "id": "3c620246", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:25.221533Z", + "iopub.status.busy": "2024-02-29T00:49:25.221183Z", + "iopub.status.idle": "2024-02-29T00:49:25.230651Z", + "shell.execute_reply": "2024-02-29T00:49:25.230190Z" + }, + "papermill": { + "duration": 0.055111, + "end_time": "2024-02-29T00:49:25.231558", + "exception": false, + "start_time": "2024-02-29T00:49:25.176447", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -3807,7 +5038,7 @@ "4 observed_phenotype lib_1 beta 0.85 " ] }, - "execution_count": 45, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -3820,7 +5051,16 @@ { "cell_type": "markdown", "id": "38a147f0", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.045748, + "end_time": "2024-02-29T00:49:25.326574", + "exception": false, + "start_time": "2024-02-29T00:49:25.280826", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Shift sparsity" ] @@ -3828,16 +5068,40 @@ { "cell_type": "markdown", "id": "0ffe5061", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.045085, + "end_time": "2024-02-29T00:49:25.418621", + "exception": false, + "start_time": "2024-02-29T00:49:25.373536", + "status": "completed" + }, + "tags": [] + }, "source": [ "Another way we might evaluate the fits is by looking at the _sparsity_ of the models by computing the percentage of shift parameters that are equal to zero among the models. We look at this metric separately for mutations to stop codons and mutations to non-stop codons because we expect the stop codons to be equally deleterious in both homologs, and thus we expect all the shift parameters associated with stop codon mutations to be zero in a \"good\" fit." ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "id": "5e61b979", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:25.519290Z", + "iopub.status.busy": "2024-02-29T00:49:25.518464Z", + "iopub.status.idle": "2024-02-29T00:49:27.815133Z", + "shell.execute_reply": "2024-02-29T00:49:27.814412Z" + }, + "papermill": { + "duration": 2.347922, + "end_time": "2024-02-29T00:49:27.816911", + "exception": false, + "start_time": "2024-02-29T00:49:25.468989", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -3935,7 +5199,7 @@ "4 shift_h2 0.150526 " ] }, - "execution_count": 46, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -3947,9 +5211,24 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "id": "12cb640d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:27.913197Z", + "iopub.status.busy": "2024-02-29T00:49:27.912893Z", + "iopub.status.idle": "2024-02-29T00:49:28.298501Z", + "shell.execute_reply": "2024-02-29T00:49:28.297799Z" + }, + "papermill": { + "duration": 0.434367, + "end_time": "2024-02-29T00:49:28.300661", + "exception": false, + "start_time": "2024-02-29T00:49:27.866294", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -4022,9 +5301,24 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "id": "ad2e1eb9", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:28.396381Z", + "iopub.status.busy": "2024-02-29T00:49:28.396123Z", + "iopub.status.idle": "2024-02-29T00:49:28.440373Z", + "shell.execute_reply": "2024-02-29T00:49:28.439683Z" + }, + "papermill": { + "duration": 0.095451, + "end_time": "2024-02-29T00:49:28.442010", + "exception": false, + "start_time": "2024-02-29T00:49:28.346559", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -4146,7 +5440,7 @@ "4 " ] }, - "execution_count": 48, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -4159,7 +5453,16 @@ { "cell_type": "markdown", "id": "8eef4165", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.051739, + "end_time": "2024-02-29T00:49:28.542508", + "exception": false, + "start_time": "2024-02-29T00:49:28.490769", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Replicate correlations" ] @@ -4167,16 +5470,40 @@ { "cell_type": "markdown", "id": "0a72ff12", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.049117, + "end_time": "2024-02-29T00:49:28.640374", + "exception": false, + "start_time": "2024-02-29T00:49:28.591257", + "status": "completed" + }, + "tags": [] + }, "source": [ "Another common metric to look at when evaluating models is to look at the correlation of the inferred mutational effects between the two libraries. We can do this by computing the pearson correlation coefficient between the inferred mutational effects for each mutation in the two libraries using the `multidms.ModelCollection.mut_param_dataset_correlation` [method](https://matsengrp.github.io/multidms/multidms.model_collection.html#multidms.model_collection.ModelCollection.mut_param_dataset_correlation)." ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "id": "21b98a31", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:28.740835Z", + "iopub.status.busy": "2024-02-29T00:49:28.740531Z", + "iopub.status.idle": "2024-02-29T00:49:29.087860Z", + "shell.execute_reply": "2024-02-29T00:49:29.086992Z" + }, + "papermill": { + "duration": 0.399283, + "end_time": "2024-02-29T00:49:29.089633", + "exception": false, + "start_time": "2024-02-29T00:49:28.690350", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -4261,7 +5588,7 @@ "0 0.000040 " ] }, - "execution_count": 49, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -4274,16 +5601,40 @@ { "cell_type": "markdown", "id": "a0be631b", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.04511, + "end_time": "2024-02-29T00:49:29.184451", + "exception": false, + "start_time": "2024-02-29T00:49:29.139341", + "status": "completed" + }, + "tags": [] + }, "source": [ "By default, the method returns a dataframe giving parameter between each unique pair of datasets, meaning it includes correlations between models fit on different measurement types. We'll only be comparing models trained the same fitting target combination." ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "id": "5bdd3daf", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:29.300265Z", + "iopub.status.busy": "2024-02-29T00:49:29.300004Z", + "iopub.status.idle": "2024-02-29T00:49:29.317828Z", + "shell.execute_reply": "2024-02-29T00:49:29.317348Z" + }, + "papermill": { + "duration": 0.0872, + "end_time": "2024-02-29T00:49:29.319548", + "exception": false, + "start_time": "2024-02-29T00:49:29.232348", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -4401,7 +5752,7 @@ "0 shift 0.005385 0.000000 loose_bottle" ] }, - "execution_count": 50, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -4431,9 +5782,24 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 52, "id": "6647a07b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:29.420961Z", + "iopub.status.busy": "2024-02-29T00:49:29.420664Z", + "iopub.status.idle": "2024-02-29T00:49:29.992914Z", + "shell.execute_reply": "2024-02-29T00:49:29.992215Z" + }, + "papermill": { + "duration": 0.626478, + "end_time": "2024-02-29T00:49:29.995319", + "exception": false, + "start_time": "2024-02-29T00:49:29.368841", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -4519,9 +5885,24 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 53, "id": "f7dea999", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:30.104661Z", + "iopub.status.busy": "2024-02-29T00:49:30.104275Z", + "iopub.status.idle": "2024-02-29T00:49:30.117749Z", + "shell.execute_reply": "2024-02-29T00:49:30.117253Z" + }, + "papermill": { + "duration": 0.067458, + "end_time": "2024-02-29T00:49:30.119474", + "exception": false, + "start_time": "2024-02-29T00:49:30.052016", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -4599,7 +5980,7 @@ "0 beta 0.67 0.00004 loose_bottle" ] }, - "execution_count": 52, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -4612,7 +5993,16 @@ { "cell_type": "markdown", "id": "e1e5084f", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.050407, + "end_time": "2024-02-29T00:49:30.219237", + "exception": false, + "start_time": "2024-02-29T00:49:30.168830", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Model vs. truth variant phenotypes" ] @@ -4620,7 +6010,16 @@ { "cell_type": "markdown", "id": "51fa3b22", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.051806, + "end_time": "2024-02-29T00:49:30.324155", + "exception": false, + "start_time": "2024-02-29T00:49:30.272349", + "status": "completed" + }, + "tags": [] + }, "source": [ "Let's take a look at how the lasso constraint affects the models' ability to predict the true latent and observed phenotypes. \n", "\n", @@ -4629,9 +6028,24 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 54, "id": "4051d61d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:30.428289Z", + "iopub.status.busy": "2024-02-29T00:49:30.427983Z", + "iopub.status.idle": "2024-02-29T00:49:37.683480Z", + "shell.execute_reply": "2024-02-29T00:49:37.682789Z" + }, + "papermill": { + "duration": 7.309785, + "end_time": "2024-02-29T00:49:37.685307", + "exception": false, + "start_time": "2024-02-29T00:49:30.375522", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -4778,7 +6192,7 @@ "4 0.999803 1.000000 0 " ] }, - "execution_count": 53, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -4815,16 +6229,40 @@ { "cell_type": "markdown", "id": "21d85524", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.051853, + "end_time": "2024-02-29T00:49:37.791006", + "exception": false, + "start_time": "2024-02-29T00:49:37.739153", + "status": "completed" + }, + "tags": [] + }, "source": [ "Next, add the simulated ground truth phenotypes:" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 55, "id": "93dc0b7e", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:37.899286Z", + "iopub.status.busy": "2024-02-29T00:49:37.898932Z", + "iopub.status.idle": "2024-02-29T00:49:47.439528Z", + "shell.execute_reply": "2024-02-29T00:49:47.438794Z" + }, + "papermill": { + "duration": 9.598734, + "end_time": "2024-02-29T00:49:47.441611", + "exception": false, + "start_time": "2024-02-29T00:49:37.842877", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "variants_df = pd.concat(\n", @@ -4856,16 +6294,40 @@ { "cell_type": "markdown", "id": "c868aa25", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.05196, + "end_time": "2024-02-29T00:49:47.548125", + "exception": false, + "start_time": "2024-02-29T00:49:47.496165", + "status": "completed" + }, + "tags": [] + }, "source": [ "For clarity, let's define and arrange the columns we now have:" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 56, "id": "dd3f2191", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:47.656194Z", + "iopub.status.busy": "2024-02-29T00:49:47.655906Z", + "iopub.status.idle": "2024-02-29T00:49:47.659421Z", + "shell.execute_reply": "2024-02-29T00:49:47.658761Z" + }, + "papermill": { + "duration": 0.059182, + "end_time": "2024-02-29T00:49:47.661154", + "exception": false, + "start_time": "2024-02-29T00:49:47.601972", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "cols = [\n", @@ -4896,16 +6358,40 @@ { "cell_type": "markdown", "id": "0c066aa1", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.050837, + "end_time": "2024-02-29T00:49:47.763017", + "exception": false, + "start_time": "2024-02-29T00:49:47.712180", + "status": "completed" + }, + "tags": [] + }, "source": [ "Add correlation of ground truth targets / predicted phenotypes to the fit collection dataframe and plot them:" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 57, "id": "fad2f16d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:47.870991Z", + "iopub.status.busy": "2024-02-29T00:49:47.870412Z", + "iopub.status.idle": "2024-02-29T00:49:48.016441Z", + "shell.execute_reply": "2024-02-29T00:49:48.016021Z" + }, + "papermill": { + "duration": 0.199094, + "end_time": "2024-02-29T00:49:48.017329", + "exception": false, + "start_time": "2024-02-29T00:49:47.818235", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -4990,7 +6476,7 @@ "4 0.999954 -9.617601e-07 " ] }, - "execution_count": 56, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -5008,9 +6494,24 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 58, "id": "29b9faff", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:48.121051Z", + "iopub.status.busy": "2024-02-29T00:49:48.120687Z", + "iopub.status.idle": "2024-02-29T00:49:48.461982Z", + "shell.execute_reply": "2024-02-29T00:49:48.461259Z" + }, + "papermill": { + "duration": 0.394924, + "end_time": "2024-02-29T00:49:48.463520", + "exception": false, + "start_time": "2024-02-29T00:49:48.068596", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -5073,16 +6574,40 @@ { "cell_type": "markdown", "id": "eba4daaf", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.05686, + "end_time": "2024-02-29T00:49:48.573207", + "exception": false, + "start_time": "2024-02-29T00:49:48.516347", + "status": "completed" + }, + "tags": [] + }, "source": [ "As expected, the lasso constraint has a negative effect on the models' ability to predict the true latent and observed phenotypes. This is because a model with more freedom to choose parameters is likely to overfit to the training data. \n" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 59, "id": "28637817", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:48.683057Z", + "iopub.status.busy": "2024-02-29T00:49:48.682702Z", + "iopub.status.idle": "2024-02-29T00:49:48.694143Z", + "shell.execute_reply": "2024-02-29T00:49:48.693734Z" + }, + "papermill": { + "duration": 0.07002, + "end_time": "2024-02-29T00:49:48.695849", + "exception": false, + "start_time": "2024-02-29T00:49:48.625829", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -5186,7 +6711,7 @@ "4 -0.0 " ] }, - "execution_count": 58, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -5199,7 +6724,16 @@ { "cell_type": "markdown", "id": "bfb4289a", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.057606, + "end_time": "2024-02-29T00:49:48.807177", + "exception": false, + "start_time": "2024-02-29T00:49:48.749571", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Cross validation\n" ] @@ -5207,16 +6741,40 @@ { "cell_type": "markdown", "id": "f91a5611", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.055511, + "end_time": "2024-02-29T00:49:48.918062", + "exception": false, + "start_time": "2024-02-29T00:49:48.862551", + "status": "completed" + }, + "tags": [] + }, "source": [ "Above, we saw evidence that the lasso negatively impacts the model performance on the training data. To test for overfitting, we can perform cross validation to test that the model is actually more accurate on unseen data." ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "id": "bf5868db", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:49:49.029654Z", + "iopub.status.busy": "2024-02-29T00:49:49.029366Z", + "iopub.status.idle": "2024-02-29T00:50:31.194425Z", + "shell.execute_reply": "2024-02-29T00:50:31.193528Z" + }, + "papermill": { + "duration": 42.223798, + "end_time": "2024-02-29T00:50:31.196584", + "exception": false, + "start_time": "2024-02-29T00:49:48.972786", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "train, test = [], {}\n", @@ -5244,9 +6802,24 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 61, "id": "3bf0974a", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T00:50:31.312848Z", + "iopub.status.busy": "2024-02-29T00:50:31.312517Z", + "iopub.status.idle": "2024-02-29T01:02:14.823727Z", + "shell.execute_reply": "2024-02-29T01:02:14.823183Z" + }, + "papermill": { + "duration": 703.572337, + "end_time": "2024-02-29T01:02:14.825681", + "exception": false, + "start_time": "2024-02-29T00:50:31.253344", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "fitting_params[\"dataset\"] = train \n", @@ -5256,16 +6829,40 @@ { "cell_type": "markdown", "id": "34667c06", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.056304, + "end_time": "2024-02-29T01:02:14.944461", + "exception": false, + "start_time": "2024-02-29T01:02:14.888157", + "status": "completed" + }, + "tags": [] + }, "source": [ "The `multidms.ModelCollection.add_validation_loss` method takes in unseen data, computes model loss on that data, and appends it to the `ModelCollection.fit_models` dataframe." ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 62, "id": "53256ee3", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T01:02:15.054802Z", + "iopub.status.busy": "2024-02-29T01:02:15.054077Z", + "iopub.status.idle": "2024-02-29T01:05:00.544980Z", + "shell.execute_reply": "2024-02-29T01:05:00.544279Z" + }, + "papermill": { + "duration": 165.607936, + "end_time": "2024-02-29T01:05:00.606632", + "exception": false, + "start_time": "2024-02-29T01:02:14.998696", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -5350,7 +6947,7 @@ "4 0.001680 " ] }, - "execution_count": 61, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -5363,9 +6960,24 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "id": "75525d3d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T01:05:00.718980Z", + "iopub.status.busy": "2024-02-29T01:05:00.718596Z", + "iopub.status.idle": "2024-02-29T01:05:01.093692Z", + "shell.execute_reply": "2024-02-29T01:05:01.093308Z" + }, + "papermill": { + "duration": 0.436654, + "end_time": "2024-02-29T01:05:01.095479", + "exception": false, + "start_time": "2024-02-29T01:05:00.658825", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -5445,16 +7057,40 @@ { "cell_type": "markdown", "id": "d56d5111", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.055636, + "end_time": "2024-02-29T01:05:01.208428", + "exception": false, + "start_time": "2024-02-29T01:05:01.152792", + "status": "completed" + }, + "tags": [] + }, "source": [ "Above we can see that indeed for both the loose and tight bottleneck datasets, the lasso constraint provides a more robust model to unseen data. We don't however see this effect for the models trained on ground truth observed phenotypes, as there is no noise for the models to overfit to." ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 64, "id": "ba808bc3", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T01:05:01.329695Z", + "iopub.status.busy": "2024-02-29T01:05:01.329108Z", + "iopub.status.idle": "2024-02-29T01:05:01.344607Z", + "shell.execute_reply": "2024-02-29T01:05:01.344076Z" + }, + "papermill": { + "duration": 0.081709, + "end_time": "2024-02-29T01:05:01.346254", + "exception": false, + "start_time": "2024-02-29T01:05:01.264545", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -5551,7 +7187,7 @@ "4 lib_1 training " ] }, - "execution_count": 63, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -5564,7 +7200,16 @@ { "cell_type": "markdown", "id": "64bb8eb4", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.0578, + "end_time": "2024-02-29T01:05:01.460945", + "exception": false, + "start_time": "2024-02-29T01:05:01.403145", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Final Selection" ] @@ -5572,7 +7217,16 @@ { "cell_type": "markdown", "id": "96592bfc", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.063147, + "end_time": "2024-02-29T01:05:01.580154", + "exception": false, + "start_time": "2024-02-29T01:05:01.517007", + "status": "completed" + }, + "tags": [] + }, "source": [ "This analysis above provides a number of clues to inform our choice of model. With empricial data we obviously won't have comparisons to the ground truth values, and thus the choice of a lasso penalty will laregely depend on the shift parameter sparsity, correlation of inferred mutational effects between replicate libraries, and cross validation performance. Focusing on the _loose bottleneck_ training dataset (a fairly realistic level of noise that we often observe in real experiments), it seems that a lasso penalty of $1e-4$ provides a false positive rate of nearly zero (i.e. the stop codon sparsity $\\approx 100\\%$), a good correlation of inferred mutational effects between replicate libraries (>0.95 pearsonr), and a relatively low loss on validation when compared to the other models. " ] @@ -5580,16 +7234,40 @@ { "cell_type": "markdown", "id": "787a5e9b", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.058956, + "end_time": "2024-02-29T01:05:01.702397", + "exception": false, + "start_time": "2024-02-29T01:05:01.643441", + "status": "completed" + }, + "tags": [] + }, "source": [ "To validate the ability to correctly infer the shape of epistasis, we'll plot the inferred global epistasis function for each of the models, and compare it to the true global epistasis function." ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "id": "a6f9e348", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T01:05:01.820425Z", + "iopub.status.busy": "2024-02-29T01:05:01.819543Z", + "iopub.status.idle": "2024-02-29T01:05:20.157589Z", + "shell.execute_reply": "2024-02-29T01:05:20.156856Z" + }, + "papermill": { + "duration": 18.400595, + "end_time": "2024-02-29T01:05:20.160404", + "exception": false, + "start_time": "2024-02-29T01:05:01.759809", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -5682,16 +7360,40 @@ { "cell_type": "markdown", "id": "e29a25c1", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.061802, + "end_time": "2024-02-29T01:05:20.289399", + "exception": false, + "start_time": "2024-02-29T01:05:20.227597", + "status": "completed" + }, + "tags": [] + }, "source": [ "Plot model vs. measured (functional score) vs. ground truth enrichments." ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 66, "id": "a7b2ddc3", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T01:05:20.418405Z", + "iopub.status.busy": "2024-02-29T01:05:20.418274Z", + "iopub.status.idle": "2024-02-29T01:05:27.465728Z", + "shell.execute_reply": "2024-02-29T01:05:27.465273Z" + }, + "papermill": { + "duration": 7.114521, + "end_time": "2024-02-29T01:05:27.469731", + "exception": false, + "start_time": "2024-02-29T01:05:20.355210", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -5742,7 +7444,16 @@ { "cell_type": "markdown", "id": "a84b60d4", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.072737, + "end_time": "2024-02-29T01:05:27.607642", + "exception": false, + "start_time": "2024-02-29T01:05:27.534905", + "status": "completed" + }, + "tags": [] + }, "source": [ "Amazingly, the model does better at predicting true enrichments than even the counts based functional scores!" ] @@ -5750,16 +7461,40 @@ { "cell_type": "markdown", "id": "5bb61ac8", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.06761, + "end_time": "2024-02-29T01:05:27.745219", + "exception": false, + "start_time": "2024-02-29T01:05:27.677609", + "status": "completed" + }, + "tags": [] + }, "source": [ "Finally, let's plot the relationship between the model's inferred shifts, and the ground truth shifts. " ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 67, "id": "c9438f7c", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-29T01:05:27.879775Z", + "iopub.status.busy": "2024-02-29T01:05:27.879462Z", + "iopub.status.idle": "2024-02-29T01:05:28.440871Z", + "shell.execute_reply": "2024-02-29T01:05:28.440429Z" + }, + "papermill": { + "duration": 0.631333, + "end_time": "2024-02-29T01:05:28.442390", + "exception": false, + "start_time": "2024-02-29T01:05:27.811057", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -5831,8 +7566,22 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" + }, + "papermill": { + "default_parameters": {}, + "duration": 1861.05853, + "end_time": "2024-02-29T01:05:31.233360", + "environment_variables": {}, + "exception": null, + "input_path": "simulation_validation.ipynb", + "output_path": "simulation_validation.ipynb", + "parameters": { + "csv_output_dir": "output/default" + }, + "start_time": "2024-02-29T00:34:30.174830", + "version": "2.5.0" } }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/tests/test_data.py b/tests/test_data.py index 4d2b195..3852936 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -43,6 +43,7 @@ alphabet=multidms.AAS_WITHSTOP, reference="a", assert_site_integrity=True, + name="test_data", ) model = multidms.Model(data, PRNGKey=23) @@ -449,3 +450,69 @@ def test_model_get_df_loss(): loss = model.loss df_loss = model.get_df_loss(TEST_FUNC_SCORES) assert loss == df_loss + + # also test that is it's the same if we add an unknown variant to training + test_with_unknown = TEST_FUNC_SCORES.copy() + test_with_unknown.loc[len(test_with_unknown)] = ["a", "E100T MIE", 0.2] + df_loss = model.get_df_loss(test_with_unknown) + assert loss == df_loss + + +def test_model_get_df_loss_conditional(): + """ + Test that the loss is correctly calculated + across each condition, by summing the conditions to be sure + they match the total loss. + """ + model = multidms.Model(data, PRNGKey=23) + model.fit(maxiter=2) + loss = model.loss + df_loss = model.get_df_loss(TEST_FUNC_SCORES, conditional=True) + # remove full and compare sum of the rest + df_loss.pop("total") + assert loss == sum(df_loss.values()) + + +def test_conditional_loss(): + """ + Test that the conditional loss is correctly calculated + by comparing the result of model.conditional_loss() + to the results of model.get_df_loss() + when given the training dataframe. + """ + model = multidms.Model(data, PRNGKey=23) + model.fit(maxiter=2) + loss = model.conditional_loss + df_loss = model.get_df_loss(TEST_FUNC_SCORES, conditional=True) + assert loss == df_loss + + +def test_ModelCollection_get_conditional_loss_df(): + """ + Test that correctness of the conditional loss df + format and values by comparing the results of + ModelCollection.get_conditional_loss_df to the + results of Model.conditional_loss. + """ + params = { + "dataset": [data], + "iterations_per_step": [2], + "scale_coeff_lasso_shift": [0.0, 1e-5], + } + _, _, fit_models_df = multidms.model_collection.fit_models( + params, + n_threads=-1, + ) + mc = multidms.model_collection.ModelCollection(fit_models_df) + df_loss = mc.get_conditional_loss_df() + # without validation loss, we expect the loss dataframe + # to have a row for each model-condition pair + total loss + n_expected_training_loss_rows = len(mc.fit_models) * (len(data.conditions) + 1) + assert df_loss.shape[0] == n_expected_training_loss_rows + + mc.add_validation_loss(TEST_FUNC_SCORES) + df_loss = mc.get_conditional_loss_df() + # with validation loss, we expect the loss dataframe + # to have a row for each model-condition-split (training/validation) pair + # + total loss + assert df_loss.shape[0] == n_expected_training_loss_rows * 2