From da3f204c2224d2bf012c64091489b0bad26a24c4 Mon Sep 17 00:00:00 2001 From: abearab Date: Fri, 9 Aug 2024 02:31:34 -0700 Subject: [PATCH 01/23] set `minimum_reads=0` --- screenpro/assays/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/screenpro/assays/__init__.py b/screenpro/assays/__init__.py index 61ed3c2..260eb96 100644 --- a/screenpro/assays/__init__.py +++ b/screenpro/assays/__init__.py @@ -110,7 +110,7 @@ def _auto_run_name(self): ) return run_name - def filterLowCounts(self, filter_type='all', minimum_reads=50): + def filterLowCounts(self, filter_type='all', minimum_reads=0): """ Filter low counts in adata.X """ From 0ee04340974e1b1a279b29ced3da3393694b6216 Mon Sep 17 00:00:00 2001 From: abearab Date: Fri, 9 Aug 2024 02:32:34 -0700 Subject: [PATCH 02/23] mend --- screenpro/phenoscore/delta.py | 1 - 1 file changed, 1 deletion(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 15aa2b6..2bf4a63 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -215,7 +215,6 @@ def getBestTargetByTSS(score_df,target_col,pvalue_col): """ collapse the gene-transcript indices into a single score for a gene by best p-value """ - #TODO: implement this function, see #90 return score_df.groupby(target_col).apply( lambda x: x.loc[x[pvalue_col].idxmin()] ) From af6c59c8f808557f14d2302b4ec283fc52c91983 Mon Sep 17 00:00:00 2001 From: abearab Date: Fri, 9 Aug 2024 20:30:42 -0700 Subject: [PATCH 03/23] add `applyNAtoLowCounts` function related to #16 --- screenpro/phenoscore/delta.py | 67 ++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 2bf4a63..81640a5 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -12,7 +12,7 @@ ### Key functions for calculating delta phenotype score -def compareByReplicates(adata, df_cond_ref, df_cond_test, var_names='target', test='ttest', ctrl_label='negative_control', growth_rate=1): +def compareByReplicates(adata, df_cond_ref, df_cond_test, var_names='target', test='ttest', ctrl_label='negative_control', growth_rate=1, filter_type='mean', filter_threshold=40): """Calculate phenotype score and p-values comparing `cond_test` vs `cond_ref`. Args: @@ -29,6 +29,13 @@ def compareByReplicates(adata, df_cond_ref, df_cond_test, var_names='target', te """ adat = adata.copy() + # apply NA to low counts + df_cond_ref, df_cond_test = applyNAtoLowCounts( + df_cond_ref=df_cond_ref, df_cond_test=df_cond_test, + filter_type=filter_type, + filter_threshold=filter_threshold + ) + # convert to numpy arrays x = df_cond_ref.to_numpy() y = df_cond_test.to_numpy() @@ -69,10 +76,17 @@ def compareByReplicates(adata, df_cond_ref, df_cond_test, var_names='target', te return result -def compareByTargetGroup(adata, df_cond_ref, df_cond_test, keep_top_n, var_names='target', test='ttest', ctrl_label='negative_control', growth_rate=1): +def compareByTargetGroup(adata, df_cond_ref, df_cond_test, keep_top_n, var_names='target', test='ttest', ctrl_label='negative_control', growth_rate=1, filter_type='mean', filter_threshold=40): adat = adata.copy() + # apply NA to low counts + df_cond_ref, df_cond_test = applyNAtoLowCounts( + df_cond_ref=df_cond_ref, df_cond_test=df_cond_test, + filter_type=filter_type, + filter_threshold=filter_threshold + ) + # get control values x_ctrl = df_cond_ref[adat.var.targetType.eq(ctrl_label)].to_numpy() y_ctrl = df_cond_test[adat.var.targetType.eq(ctrl_label)].to_numpy() @@ -185,18 +199,22 @@ def calculateDelta(x, y, x_ctrl, y_ctrl, growth_rate): Returns: np.array: array of scores """ - # calculate control median and std - ctrl_median = np.median( - calculateLog2e(x=x_ctrl, y=y_ctrl), - axis=0 # for each individual sample (i.e. replicate) - ) + if (x == np.nan or y == np.nan) or (len(x) == 0 or len(y) == 0): + delta = np.nan + + else: + # calculate control median and std + ctrl_median = np.median( + calculateLog2e(x=x_ctrl, y=y_ctrl), + axis=0 # for each individual sample (i.e. replicate) + ) - # calculate log2e (i.e. log2 fold change enrichment y / x) - log2e = calculateLog2e(x=x, y=y) + # calculate log2e (i.e. log2 fold change enrichment y / x) + log2e = calculateLog2e(x=x, y=y) - # calculate delta score normalized by control median and growth rate - delta = (log2e - ctrl_median) / growth_rate - + # calculate delta score normalized by control median and growth rate + delta = (log2e - ctrl_median) / growth_rate + return delta @@ -299,3 +317,28 @@ def generatePseudoGeneAnnData(adata, num_pseudogenes='auto', pseudogene_size='au out.obs = adata_ctrl.obs.copy() return out + + +def applyNAtoLowCounts(df_cond_ref, df_cond_test, filter_type, filter_threshold): + # more flexible read filtering by adding np.nan scores and/or pvalues to low count rows + # keep row if either both/all columns are above threshold, or if either/any column is + # in other words, mask if any column is below threshold or only if all columns are below + # source https://github.com/mhorlbeck/ScreenProcessing/blob/master/process_experiments.py#L464C1-L478C1 + + df = pd.concat({'ref':df_cond_ref, 'test':df_cond_test},axis=1) + + if filter_type == 'mean': + failFilterColumn = df.apply( + lambda row: np.mean(row) < filter_threshold, axis=1) + elif filter_type == 'both' or filter_type == 'all': + failFilterColumn = df.apply( + lambda row: min(row) < filter_threshold, axis=1) + elif filter_type == 'either' or filter_type == 'any': + failFilterColumn = df.apply( + lambda row: max(row) < filter_threshold, axis=1) + else: + raise ValueError('filter type not recognized or not implemented') + + df.loc[failFilterColumn, :] = np.nan + + return df['ref'], df['test'] \ No newline at end of file From d0135edb8a6718e6223fdda538d8e4eaf1ace132 Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 09:34:46 -0700 Subject: [PATCH 04/23] mend --- screenpro/phenoscore/delta.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 81640a5..32cb17f 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -23,6 +23,8 @@ def compareByReplicates(adata, df_cond_ref, df_cond_test, var_names='target', te test (str): test to use for calculating p-value ('MW': Mann-Whitney U rank; 'ttest' : t-test) ctrl_label (str): control label, default is 'negative_control' growth_rate (int): growth rate + filter_type (str): filter type to apply to low counts ('mean', 'both', 'either') + filter_threshold (int): filter threshold for low counts (default is 40) Returns: pd.DataFrame: result dataframe @@ -328,17 +330,17 @@ def applyNAtoLowCounts(df_cond_ref, df_cond_test, filter_type, filter_threshold) df = pd.concat({'ref':df_cond_ref, 'test':df_cond_test},axis=1) if filter_type == 'mean': - failFilterColumn = df.apply( + filter = df.apply( lambda row: np.mean(row) < filter_threshold, axis=1) elif filter_type == 'both' or filter_type == 'all': - failFilterColumn = df.apply( + filter = df.apply( lambda row: min(row) < filter_threshold, axis=1) elif filter_type == 'either' or filter_type == 'any': - failFilterColumn = df.apply( + filter = df.apply( lambda row: max(row) < filter_threshold, axis=1) else: raise ValueError('filter type not recognized or not implemented') - df.loc[failFilterColumn, :] = np.nan + df.loc[filter, :] = np.nan return df['ref'], df['test'] \ No newline at end of file From 877fa2c9fc5d858e8faa896ddb15161d14955a10 Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 09:50:02 -0700 Subject: [PATCH 05/23] add `number_of_guide_elements` column to result table --- screenpro/phenoscore/delta.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 32cb17f..f281484 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -96,12 +96,13 @@ def compareByTargetGroup(adata, df_cond_ref, df_cond_test, keep_top_n, var_names targets = [] scores = [] p_values = [] + target_sizes = [] # group by target genes or pseudogenes to aggregate counts for score calculation for target_name, target_group in adat.var.groupby(var_names): # calculate phenotype scores and p-values for each target group - target_scores, target_p_values = scoreTargetGroup( + target_score, target_p_value, target_size = scoreTargetGroup( target_group=target_group, df_cond_ref=df_cond_ref, df_cond_test=df_cond_test, @@ -110,9 +111,10 @@ def compareByTargetGroup(adata, df_cond_ref, df_cond_test, keep_top_n, var_names keep_top_n=keep_top_n ) - scores.append(target_scores) - p_values.append(target_p_values) + scores.append(target_score) + p_values.append(target_p_value) targets.append(target_name) + target_sizes.append(target_size) # average scores across replicates scores = [np.mean(s) for s in scores] @@ -131,6 +133,7 @@ def compareByTargetGroup(adata, df_cond_ref, df_cond_test, keep_top_n, var_names pd.Series(scores, name='score'), pd.Series(p_values, name=f'{test} pvalue'), pd.Series(adj_p_values, name='BH adj_pvalue'), + pd.Series(target_sizes, name='number_of_guide_elements'), ], axis=1) # add targets information @@ -242,8 +245,8 @@ def getBestTargetByTSS(score_df,target_col,pvalue_col): def scoreTargetGroup(target_group, df_cond_ref, df_cond_test, x_ctrl, y_ctrl, test='ttest', growth_rate=1, keep_top_n=None): # select target group and convert to numpy arrays - x = df_cond_ref.loc[target_group.index,:].to_numpy() - y = df_cond_test.loc[target_group.index,:].to_numpy() + x = df_cond_ref.loc[target_group.index,:].dropna().to_numpy() + y = df_cond_test.loc[target_group.index,:].dropna().to_numpy() # calculate phenotype scores target_scores = calculateDelta( @@ -251,22 +254,26 @@ def scoreTargetGroup(target_group, df_cond_ref, df_cond_test, x_ctrl, y_ctrl, te x_ctrl = x_ctrl, y_ctrl = y_ctrl, growth_rate = growth_rate, ) + + # get target size + target_size = target_scores.shape[0] # number of guide elements in the target group - if keep_top_n is None or keep_top_n is False: + if keep_top_n is None or keep_top_n is False or target_size <= keep_top_n: # average scores across guides - target_scores = np.mean(target_scores, axis=0) + target_score = np.mean(target_scores, axis=0) elif keep_top_n > 0: # get top n scores per target - np.apply_along_axis(averageBestN, axis=0, arr=target_scores, numToAverage=keep_top_n) + target_score = np.apply_along_axis(averageBestN, axis=0, arr=target_scores, numToAverage=keep_top_n) + target_size = keep_top_n # update target size to keep_top_n else: raise ValueError(f'Invalid value for keep_top_n: {keep_top_n}') # compute p-value - target_p_values = matrixStat(x, y, test=test, level='all') + target_p_value = matrixStat(x, y, test=test, level='all') - return target_scores, target_p_values + return target_score, target_p_value, target_size def generatePseudoGeneAnnData(adata, num_pseudogenes='auto', pseudogene_size='auto', ctrl_label='negative_control'): From b113ca55450f2be0469840420fbe325ac30eb37e Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 09:53:25 -0700 Subject: [PATCH 06/23] mend --- screenpro/phenoscore/delta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index f281484..094c0be 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -204,7 +204,7 @@ def calculateDelta(x, y, x_ctrl, y_ctrl, growth_rate): Returns: np.array: array of scores """ - if (x == np.nan or y == np.nan) or (len(x) == 0 or len(y) == 0): + if (x.all() == np.nan or y.all() == np.nan) or (len(x) == 0 or len(y) == 0): delta = np.nan else: From 27330d4b47702b74cd5b4e5495ac502210a19dd3 Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 10:03:51 -0700 Subject: [PATCH 07/23] mend --- screenpro/phenoscore/delta.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 094c0be..019cdca 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -204,7 +204,8 @@ def calculateDelta(x, y, x_ctrl, y_ctrl, growth_rate): Returns: np.array: array of scores """ - if (x.all() == np.nan or y.all() == np.nan) or (len(x) == 0 or len(y) == 0): + # check if x and y are empty + if len(x) == 0 or len(y) == 0: delta = np.nan else: From 4508704bd8327d5942cdc5077e54a39544192d82 Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 10:20:09 -0700 Subject: [PATCH 08/23] mend --- screenpro/phenoscore/delta.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 019cdca..0154ba5 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -204,23 +204,18 @@ def calculateDelta(x, y, x_ctrl, y_ctrl, growth_rate): Returns: np.array: array of scores """ - # check if x and y are empty - if len(x) == 0 or len(y) == 0: - delta = np.nan - - else: - # calculate control median and std - ctrl_median = np.median( - calculateLog2e(x=x_ctrl, y=y_ctrl), - axis=0 # for each individual sample (i.e. replicate) - ) + # calculate control median and std + ctrl_median = np.median( + calculateLog2e(x=x_ctrl, y=y_ctrl), + axis=0 # for each individual sample (i.e. replicate) + ) - # calculate log2e (i.e. log2 fold change enrichment y / x) - log2e = calculateLog2e(x=x, y=y) + # calculate log2e (i.e. log2 fold change enrichment y / x) + log2e = calculateLog2e(x=x, y=y) - # calculate delta score normalized by control median and growth rate - delta = (log2e - ctrl_median) / growth_rate - + # calculate delta score normalized by control median and growth rate + delta = (log2e - ctrl_median) / growth_rate + return delta From 400c6f1591926c1eda3748a28acb84f8b740899c Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 10:43:29 -0700 Subject: [PATCH 09/23] mend --- screenpro/phenoscore/delta.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 0154ba5..38b2c75 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -254,11 +254,11 @@ def scoreTargetGroup(target_group, df_cond_ref, df_cond_test, x_ctrl, y_ctrl, te # get target size target_size = target_scores.shape[0] # number of guide elements in the target group - if keep_top_n is None or keep_top_n is False or target_size <= keep_top_n: + if (keep_top_n is None or keep_top_n is False) and target_size <= keep_top_n: # average scores across guides target_score = np.mean(target_scores, axis=0) - elif keep_top_n > 0: + elif keep_top_n > 0 and target_size > keep_top_n: # get top n scores per target target_score = np.apply_along_axis(averageBestN, axis=0, arr=target_scores, numToAverage=keep_top_n) target_size = keep_top_n # update target size to keep_top_n From b4d18114f6e8daeb2242c0b20b7bf2b01a2a03df Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 10:44:23 -0700 Subject: [PATCH 10/23] temp commit --- screenpro/phenoscore/__init__.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/screenpro/phenoscore/__init__.py b/screenpro/phenoscore/__init__.py index 0be500d..397f0f2 100644 --- a/screenpro/phenoscore/__init__.py +++ b/screenpro/phenoscore/__init__.py @@ -116,15 +116,15 @@ def runPhenoScore(adata, cond_ref, cond_test, score_level, var_names='target', t growth_rate=growth_rate, ) - # get best best transcript as lowest p-value for each target - if collapse_var not in [False, None]: - if collapse_var not in result.columns: - raise ValueError(f'collapse_var "{collapse_var}" not found in result columns.') - else: - result = getBestTargetByTSS( - score_df=result, target_col=collapse_var, pvalue_col=f'{test} pvalue' - ) - result.index.name = None + # # get best best transcript as lowest p-value for each target + # if collapse_var not in [False, None]: + # if collapse_var not in result.columns: + # raise ValueError(f'collapse_var "{collapse_var}" not found in result columns.') + # else: + # result = getBestTargetByTSS( + # score_df=result, target_col=collapse_var, pvalue_col=f'{test} pvalue' + # ) + # result.index.name = None # change target name to control label if it is a pseudo gene result['target'] = result['target'].apply(lambda x: ctrl_label if 'pseudo' in x else x).to_list() From 5f7268a98a92ab800c57c4848cbe03d3ce6e258c Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 10:48:08 -0700 Subject: [PATCH 11/23] mend --- screenpro/phenoscore/delta.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 38b2c75..56eb8ac 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -254,11 +254,11 @@ def scoreTargetGroup(target_group, df_cond_ref, df_cond_test, x_ctrl, y_ctrl, te # get target size target_size = target_scores.shape[0] # number of guide elements in the target group - if (keep_top_n is None or keep_top_n is False) and target_size <= keep_top_n: + if (keep_top_n is None or keep_top_n is False) or target_size <= keep_top_n: # average scores across guides target_score = np.mean(target_scores, axis=0) - elif keep_top_n > 0 and target_size > keep_top_n: + elif keep_top_n > 0 or target_size > keep_top_n: # get top n scores per target target_score = np.apply_along_axis(averageBestN, axis=0, arr=target_scores, numToAverage=keep_top_n) target_size = keep_top_n # update target size to keep_top_n From f8621c6821a1a3c628089b51a06ccae17d3395ba Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 22:58:56 -0700 Subject: [PATCH 12/23] mend --- screenpro/phenoscore/delta.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 56eb8ac..18bbea2 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -90,8 +90,8 @@ def compareByTargetGroup(adata, df_cond_ref, df_cond_test, keep_top_n, var_names ) # get control values - x_ctrl = df_cond_ref[adat.var.targetType.eq(ctrl_label)].to_numpy() - y_ctrl = df_cond_test[adat.var.targetType.eq(ctrl_label)].to_numpy() + x_ctrl = df_cond_ref[adat.var.targetType.eq(ctrl_label)].dropna().to_numpy() + y_ctrl = df_cond_test[adat.var.targetType.eq(ctrl_label)].dropna().to_numpy() targets = [] scores = [] @@ -116,8 +116,8 @@ def compareByTargetGroup(adata, df_cond_ref, df_cond_test, keep_top_n, var_names targets.append(target_name) target_sizes.append(target_size) - # average scores across replicates - scores = [np.mean(s) for s in scores] + # # average scores across replicates + # scores = [np.mean(s) for s in scores] # get adjusted p-values adj_p_values = multipleTestsCorrection(np.array(p_values)) @@ -254,7 +254,10 @@ def scoreTargetGroup(target_group, df_cond_ref, df_cond_test, x_ctrl, y_ctrl, te # get target size target_size = target_scores.shape[0] # number of guide elements in the target group - if (keep_top_n is None or keep_top_n is False) or target_size <= keep_top_n: + if target_size == 0: + target_score = np.full(target_scores.shape[1], np.nan) + + elif (keep_top_n is None or keep_top_n is False) or target_size <= keep_top_n: # average scores across guides target_score = np.mean(target_scores, axis=0) From 8760bb81b35a55323ac04e9b4a4d471c909e1aab Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 23:01:07 -0700 Subject: [PATCH 13/23] mend --- screenpro/phenoscore/delta.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 18bbea2..4daa193 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -116,8 +116,8 @@ def compareByTargetGroup(adata, df_cond_ref, df_cond_test, keep_top_n, var_names targets.append(target_name) target_sizes.append(target_size) - # # average scores across replicates - # scores = [np.mean(s) for s in scores] + # average scores across replicates + scores = [np.mean(s) for s in scores] # get adjusted p-values adj_p_values = multipleTestsCorrection(np.array(p_values)) From 4d1a02734f3e0dbf95f94fe70f9be0db2b1ae2df Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 23:23:15 -0700 Subject: [PATCH 14/23] fix collapse --- screenpro/phenoscore/__init__.py | 18 +++++++++--------- screenpro/phenoscore/delta.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/screenpro/phenoscore/__init__.py b/screenpro/phenoscore/__init__.py index 397f0f2..0be500d 100644 --- a/screenpro/phenoscore/__init__.py +++ b/screenpro/phenoscore/__init__.py @@ -116,15 +116,15 @@ def runPhenoScore(adata, cond_ref, cond_test, score_level, var_names='target', t growth_rate=growth_rate, ) - # # get best best transcript as lowest p-value for each target - # if collapse_var not in [False, None]: - # if collapse_var not in result.columns: - # raise ValueError(f'collapse_var "{collapse_var}" not found in result columns.') - # else: - # result = getBestTargetByTSS( - # score_df=result, target_col=collapse_var, pvalue_col=f'{test} pvalue' - # ) - # result.index.name = None + # get best best transcript as lowest p-value for each target + if collapse_var not in [False, None]: + if collapse_var not in result.columns: + raise ValueError(f'collapse_var "{collapse_var}" not found in result columns.') + else: + result = getBestTargetByTSS( + score_df=result, target_col=collapse_var, pvalue_col=f'{test} pvalue' + ) + result.index.name = None # change target name to control label if it is a pseudo gene result['target'] = result['target'].apply(lambda x: ctrl_label if 'pseudo' in x else x).to_list() diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 4daa193..16fd455 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -234,7 +234,7 @@ def getBestTargetByTSS(score_df,target_col,pvalue_col): """ collapse the gene-transcript indices into a single score for a gene by best p-value """ - return score_df.groupby(target_col).apply( + return score_df.dropna().groupby(target_col).apply( lambda x: x.loc[x[pvalue_col].idxmin()] ) From 86fc23d13aa9b0f4a346b0fd54097937524bd2ab Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 11 Aug 2024 23:23:50 -0700 Subject: [PATCH 15/23] mend --- screenpro/phenoscore/delta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py index 16fd455..0df8922 100644 --- a/screenpro/phenoscore/delta.py +++ b/screenpro/phenoscore/delta.py @@ -349,4 +349,4 @@ def applyNAtoLowCounts(df_cond_ref, df_cond_test, filter_type, filter_threshold) df.loc[filter, :] = np.nan - return df['ref'], df['test'] \ No newline at end of file + return df['ref'], df['test'] From 638631c77a09047433caf2d2c2b5c493ddc0a3ab Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 12 Aug 2024 09:44:51 -0700 Subject: [PATCH 16/23] add count filter args --- screenpro/phenoscore/__init__.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/screenpro/phenoscore/__init__.py b/screenpro/phenoscore/__init__.py index 0be500d..a9ae7bc 100644 --- a/screenpro/phenoscore/__init__.py +++ b/screenpro/phenoscore/__init__.py @@ -28,7 +28,9 @@ def runPhenoScore(adata, cond_ref, cond_test, score_level, var_names='target', test='ttest', growth_rate=1, n_reps='auto', keep_top_n = None, collapse_var=False, num_pseudogenes='auto', pseudogene_size='auto', - count_layer=None, ctrl_label='negative_control'): + count_layer=None, count_filter_type='mean', count_filter_threshold=40, + ctrl_label='negative_control' + ): """Calculate phenotype score and p-values when comparing `cond_test` vs `cond_ref`. Args: @@ -44,6 +46,8 @@ def runPhenoScore(adata, cond_ref, cond_test, score_level, var_names='target', t num_pseudogenes (int): number of pseudogenes to generate pseudogene_size (int): number of sgRNA elements in each pseudogene count_layer (str): count layer to use for calculating score, default is None (use default count layer in adata.X) + count_filter_type (str): filter type for counts, default is 'mean' + count_filter_threshold (int): filter threshold for counts, default is 40 ctrl_label (str): control label, default is 'negative_control' Returns: @@ -85,7 +89,9 @@ def runPhenoScore(adata, cond_ref, cond_test, score_level, var_names='target', t var_names=var_names, test=test, ctrl_label=ctrl_label, - growth_rate=growth_rate + growth_rate=growth_rate, + filter_type=count_filter_type, + filter_threshold=count_filter_threshold ) elif score_level in ['compare_guides']: @@ -114,6 +120,8 @@ def runPhenoScore(adata, cond_ref, cond_test, score_level, var_names='target', t test=test, ctrl_label=ctrl_label, growth_rate=growth_rate, + filter_type=count_filter_type, + filter_threshold=count_filter_threshold ) # get best best transcript as lowest p-value for each target From 81305dff4d30c3522f15211274067ec9a70e254c Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 12 Aug 2024 09:46:56 -0700 Subject: [PATCH 17/23] mend --- screenpro/assays/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/screenpro/assays/__init__.py b/screenpro/assays/__init__.py index 260eb96..7cd9c2d 100644 --- a/screenpro/assays/__init__.py +++ b/screenpro/assays/__init__.py @@ -110,7 +110,7 @@ def _auto_run_name(self): ) return run_name - def filterLowCounts(self, filter_type='all', minimum_reads=0): + def filterLowCounts(self, filter_type='all', minimum_reads=1): """ Filter low counts in adata.X """ From 9e640e118cc02b0aad98c0d83c78d99cb6828213 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 12 Aug 2024 12:18:24 -0700 Subject: [PATCH 18/23] update README --- README.md | 54 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 9f5471a..b6e5e78 100644 --- a/README.md +++ b/README.md @@ -36,28 +36,6 @@ ___ ___ -
- Benchmarking -
- - Benchmarking ScreenPro2 with other CRISPR screen analysis tools - - ### More thoughtful NGS read trimming recovers more sgRNA counts - - ### ScreenPro2 statistical analysis is more accurate than ScreenProcessing - - ### ScreenPro2 is more flexible than ScreenProcessing - - Not only does ScreenPro2 have more features than ScreenProcessing, but it is also more flexible. ScreenPro2 can process data from diverse CRISPR screen platforms and is designed to be modular to enable easy extension to custom CRISPR screen platforms or other commonly used platforms in addition to the ones currently implemented. - - ### ScreenPro2 is faster than ScreenProcessing - - Last but not least, ScreenPro2 runs faster than ScreenProcessing (thanks to [biobear](https://github.com/wheretrue/biobear)) for processing FASTQ files. - -
- -___ - ## Installation ScreenPro2 is available on [PyPI](https://pypi.org/project/ScreenPro2/) and can be installed with pip: ```bash @@ -97,8 +75,6 @@ Data analysis for CRISPR screens with NGS readouts can be broken down into three The first step in analyzing CRISPR screens with deep sequencing readouts is to process the FASTQ files and generate counts for each guide RNA element in the library. ScreenPro2 has built-in functionalities to process FASTQ files and generate counts for different types of CRISPR screens platforms (see [Supported CRISPR Screen Platforms](#supported-crispr-screen-platforms)). -___ -
Command Line Interface (CLI)
@@ -133,11 +109,10 @@ ___ -o --write-count-matrix ``` + ___
-___ -
Python Package Usage
@@ -205,6 +180,8 @@ ___ ```python adata = counter.build_counts_anndata() ``` + + ___
@@ -285,6 +262,31 @@ screen.calculateFlowBasedScreen( ) ``` +___ + +
+ Benchmarking +
+ + Coming soon... + +
+ + + + ### Step 3: Data visualization Once the phenotypes are calculated, you can extract and explore the results using the `.phenotypes` attribute of the `PooledScreens` object. Currently, there are very limited functionalities built-in to visualize the results, but we are working on adding more features to make it easier for users. However, you can easily extract the results and use other libraries like `seaborn` and `matplotlib` in Python or `ggplot2` in R to visualize the results. From 88492281866b3e1a07e672b103c0e0ef4f59c94b Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 12 Aug 2024 12:20:58 -0700 Subject: [PATCH 19/23] update README --- README.md | 131 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 68 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index b6e5e78..87ecffd 100644 --- a/README.md +++ b/README.md @@ -189,80 +189,85 @@ The first step in analyzing CRISPR screens with deep sequencing readouts is to p Once you have the counts, you can use ScreenPro2 `phenoscore` and `phenostats` modules to calculate the phenotype scores and statistics between screen arms. -#### Load Data -First, load your data into an `AnnData` object (see [anndata](https://anndata.readthedocs.io/en/latest/index.html) for more information). - -The `AnnData` object must have the following contents: -- `adata.X` – counts matrix (samples x targets) where each value represents the sequencing count from NGS data. -- `adata.obs` – a pandas dataframe of samples metadata including "condition" and "replicate" columns. - - "condition": the condition for each sample in the experiment. - - "replicate": the replicate number for each sample in the experiment. -- `adata.var` – a pandas dataframe of targets in sgRNA library including "target" and "targetType" columns. - - "target": the target for each entry in reference sgRNA library. For single sgRNA libraries, this column can be - used to store gene names. For dual or multiple targeting sgRNA libraries, this column can be used to store gene pairs - or any other relevant information about the target. - - "targetType": the type of target for each entry in reference sgRNA library. Note that this column is used to - distinguish between different types of sgRNAs in the library and negative control sgRNAs can be defined as `"targetType" == "negative_control"`. - This is important for the phenotype calculation step. - - -ScreenPro2 has a built-in class for different types of CRISPR screen assays. Currently, there is a class called `PooledScreens` -that can be used to process data from pooled CRISPR screens. To create a `PooledScreens` object from an `AnnData` object, -you can use the following example code: +
+ Python Package Usage +
-```python -import pandas as pd -import anndata as ad -from screenpro.assays import PooledScreens + #### Load Data + First, load your data into an `AnnData` object (see [anndata](https://anndata.readthedocs.io/en/latest/index.html) for more information). -adata = ad.AnnData( - X = counts_df, # pandas dataframe of counts (samples x targets) - obs = meta_df, # pandas dataframe of samples metadata including "condition" and "replicate" columns - var = target_df # pandas dataframe of targets metadata including "target" and "targetType" columns -) + The `AnnData` object must have the following contents: + - `adata.X` – counts matrix (samples x targets) where each value represents the sequencing count from NGS data. + - `adata.obs` – a pandas dataframe of samples metadata including "condition" and "replicate" columns. + - "condition": the condition for each sample in the experiment. + - "replicate": the replicate number for each sample in the experiment. + - `adata.var` – a pandas dataframe of targets in sgRNA library including "target" and "targetType" columns. + - "target": the target for each entry in reference sgRNA library. For single sgRNA libraries, this column can be + used to store gene names. For dual or multiple targeting sgRNA libraries, this column can be used to store gene pairs + or any other relevant information about the target. + - "targetType": the type of target for each entry in reference sgRNA library. Note that this column is used to + distinguish between different types of sgRNAs in the library and negative control sgRNAs can be defined as `"targetType" == "negative_control"`. + This is important for the phenotype calculation step. -screen = PooledScreens(adata) -``` -image + ScreenPro2 has a built-in class for different types of CRISPR screen assays. Currently, there is a class called `PooledScreens` + that can be used to process data from pooled CRISPR screens. To create a `PooledScreens` object from an `AnnData` object, + you can use the following example code: -#### Perform Screen Processing Analysis -Once the screen object is created, you can use several available workflows to calculate the phenotype scores and statisitics by comparing each entry in reference sgRNA library between screen arms. Then, these scores and statistics are used to nominate hits. + ```python + import pandas as pd + import anndata as ad + from screenpro.assays import PooledScreens + + adata = ad.AnnData( + X = counts_df, # pandas dataframe of counts (samples x targets) + obs = meta_df, # pandas dataframe of samples metadata including "condition" and "replicate" columns + var = target_df # pandas dataframe of targets metadata including "target" and "targetType" columns + ) -##### Drug Screen Workflow: calculate `gamma`, `rho`, and `tau` scores -`.calculateDrugScreen` method can be used to calculate the enrichment of each gene between screen arms for a drug -screen experiment. This method calculates `gamma`, `rho`, and `tau` scores for each gene and adds them to the -`.phenotypes` attribute of the `PooledScreens` object. + screen = PooledScreens(adata) + ``` -Here is an example for running the workflow on a [CRISPRi-dual-sgRNA-screens](#dcas9-crisprai-dual-sgrna-screens) dataset: + image -```python -# Run the ScreenPro2 workflow for CRISPRi-dual-sgRNA-screens -screen.calculateDrugScreen( - t0='T0', - untreated='DMSO', # replace with the label for untreated condition - treated='Drug', # replace with the label for treated condition - score_level='compare_reps' -) -``` -___ -For example, in a Decitabine CRISPRi drug screen (see Figure 1B-C in [this bioRxiv paper](https://www.biorxiv.org/content/10.1101/2022.12.14.518457v2.full)), each phenotype score represents a comparison between different arms of the screen and `rho` scores shows the main drug phenotype as illustrated here: -image + #### Perform Screen Processing Analysis + Once the screen object is created, you can use several available workflows to calculate the phenotype scores and statisitics by comparing each entry in reference sgRNA library between screen arms. Then, these scores and statistics are used to nominate hits. -##### Flow cytometry based screen workflow: calculate phenotype score to compare high and low bins -`.calculateFlowBasedScreen` method can be used to calculate the enrichment of each target between high bin vs. low bin -of a flow cytometry-based screen experiment. This method calculates `PhenoScore` for each target and adds them to the -`.phenotypes` attribute of the `PooledScreens` object. + ##### Drug Screen Workflow: calculate `gamma`, `rho`, and `tau` scores + `.calculateDrugScreen` method can be used to calculate the enrichment of each gene between screen arms for a drug + screen experiment. This method calculates `gamma`, `rho`, and `tau` scores for each gene and adds them to the + `.phenotypes` attribute of the `PooledScreens` object. -```python -# Run the ScreenPro2 workflow for CRISPRi-dual-sgRNA-screens -screen.calculateFlowBasedScreen( - low_bin='low_bin', high_bin='high_bin', - score_level='compare_reps' -) -``` + Here is an example for running the workflow on a [CRISPRi-dual-sgRNA-screens](#dcas9-crisprai-dual-sgrna-screens) dataset: -___ + ```python + # Run the ScreenPro2 workflow for CRISPRi-dual-sgRNA-screens + screen.calculateDrugScreen( + t0='T0', + untreated='DMSO', # replace with the label for untreated condition + treated='Drug', # replace with the label for treated condition + score_level='compare_reps' + ) + ``` + ___ + For example, in a Decitabine CRISPRi drug screen (see Figure 1B-C in [this bioRxiv paper](https://www.biorxiv.org/content/10.1101/2022.12.14.518457v2.full)), each phenotype score represents a comparison between different arms of the screen and `rho` scores shows the main drug phenotype as illustrated here: + image + + ##### Flow cytometry based screen workflow: calculate phenotype score to compare high and low bins + `.calculateFlowBasedScreen` method can be used to calculate the enrichment of each target between high bin vs. low bin + of a flow cytometry-based screen experiment. This method calculates `PhenoScore` for each target and adds them to the + `.phenotypes` attribute of the `PooledScreens` object. + + ```python + # Run the ScreenPro2 workflow for CRISPRi-dual-sgRNA-screens + screen.calculateFlowBasedScreen( + low_bin='low_bin', high_bin='high_bin', + score_level='compare_reps' + ) + ``` + ___ + +
Benchmarking From e952c8e013369fe6a8b400d1604441ba6fc3ebdc Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 12 Aug 2024 12:24:35 -0700 Subject: [PATCH 20/23] update README --- README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 87ecffd..3bfd440 100644 --- a/README.md +++ b/README.md @@ -190,10 +190,9 @@ The first step in analyzing CRISPR screens with deep sequencing readouts is to p Once you have the counts, you can use ScreenPro2 `phenoscore` and `phenostats` modules to calculate the phenotype scores and statistics between screen arms.
- Python Package Usage + Load Data
- #### Load Data First, load your data into an `AnnData` object (see [anndata](https://anndata.readthedocs.io/en/latest/index.html) for more information). The `AnnData` object must have the following contents: @@ -230,7 +229,13 @@ Once you have the counts, you can use ScreenPro2 `phenoscore` and `phenostats` m image - #### Perform Screen Processing Analysis + ___ + +
+ +
+ Perform Screen Processing Analysis + Once the screen object is created, you can use several available workflows to calculate the phenotype scores and statisitics by comparing each entry in reference sgRNA library between screen arms. Then, these scores and statistics are used to nominate hits. ##### Drug Screen Workflow: calculate `gamma`, `rho`, and `tau` scores @@ -270,7 +275,7 @@ Once you have the counts, you can use ScreenPro2 `phenoscore` and `phenostats` m
- Benchmarking + Benchmarking ScreenPro2 vs other CRISPR screen processing tools
Coming soon... From e44860c5af5118cde4c2004ad395a2ade3e85491 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 12 Aug 2024 12:33:41 -0700 Subject: [PATCH 21/23] mend --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3bfd440..a9c2bbb 100644 --- a/README.md +++ b/README.md @@ -235,6 +235,7 @@ Once you have the counts, you can use ScreenPro2 `phenoscore` and `phenostats` m
Perform Screen Processing Analysis +
Once the screen object is created, you can use several available workflows to calculate the phenotype scores and statisitics by comparing each entry in reference sgRNA library between screen arms. Then, these scores and statistics are used to nominate hits. From 7b4736892adc7df20076c167d06feff45266c728 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 12 Aug 2024 12:35:54 -0700 Subject: [PATCH 22/23] mend --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a9c2bbb..8a3d78f 100644 --- a/README.md +++ b/README.md @@ -234,7 +234,7 @@ Once you have the counts, you can use ScreenPro2 `phenoscore` and `phenostats` m
- Perform Screen Processing Analysis + Run workflows
Once the screen object is created, you can use several available workflows to calculate the phenotype scores and statisitics by comparing each entry in reference sgRNA library between screen arms. Then, these scores and statistics are used to nominate hits. From 88ed081b5e51c91fbf352529b4f84cd501f214e7 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 12 Aug 2024 13:12:02 -0700 Subject: [PATCH 23/23] bump version 0.4.11 --- screenpro/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/screenpro/__init__.py b/screenpro/__init__.py index e838233..8e44134 100644 --- a/screenpro/__init__.py +++ b/screenpro/__init__.py @@ -31,6 +31,6 @@ from .dashboard import DrugScreenDashboard -__version__ = "0.4.10" +__version__ = "0.4.11" __author__ = "Abe Arab" __email__ = 'abea@arcinstitute.org' # "abarbiology@gmail.com"