diff --git a/gget/gget_mutate.py b/gget/gget_mutate.py index 920bd3ac..5f8d00d6 100644 --- a/gget/gget_mutate.py +++ b/gget/gget_mutate.py @@ -22,12 +22,6 @@ mutation_pattern = r"(?:c|g)\.([0-9_\-\+\*]+)([a-zA-Z>]+)" # more complex: r'c\.([0-9_\-\+\*\(\)\?]+)([a-zA-Z>\(\)0-9]+)' -def reverse_complement(seq): - if pd.isna(seq): # Check if the sequence is NaN - return np.nan - complement = str.maketrans("ATCGNatcgn.*", "TAGCNtagcn.*") - return seq.translate(complement)[::-1] - # Get complement complement = { "A": "T", @@ -124,43 +118,74 @@ def convert_chromosome_value_to_int_when_possible(val): return val -def merge_gtf_transcript_locations_into_cosmic_csv(mutations, gtf_path, gtf_transcript_id_column): - gtf_df = pd.read_csv(gtf_path, sep='\t', comment='#', header=None, names=[ - 'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']) +def merge_gtf_transcript_locations_into_cosmic_csv( + mutations, gtf_path, gtf_transcript_id_column +): + gtf_df = pd.read_csv( + gtf_path, + sep="\t", + comment="#", + header=None, + names=[ + "seqname", + "source", + "feature", + "start", + "end", + "score", + "strand", + "frame", + "attribute", + ], + ) - if 'strand' in mutations.columns: - mutations.rename(columns={'strand': 'strand_original'}, inplace=True) + if "strand" in mutations.columns: + mutations.rename(columns={"strand": "strand_original"}, inplace=True) - gtf_df = gtf_df[gtf_df['feature'] == 'transcript'] + gtf_df = gtf_df[gtf_df["feature"] == "transcript"] - gtf_df['transcript_id'] = gtf_df['attribute'].str.extract('transcript_id "([^"]+)"') + gtf_df["transcript_id"] = gtf_df["attribute"].str.extract('transcript_id "([^"]+)"') - assert len(gtf_df['transcript_id']) == len(set(gtf_df['transcript_id'])), "Duplicate transcript_id values found!" + assert len(gtf_df["transcript_id"]) == len( + set(gtf_df["transcript_id"]) + ), "Duplicate transcript_id values found!" # Filter out rows where transcript_id is NaN - gtf_df = gtf_df.dropna(subset=['transcript_id']) + gtf_df = gtf_df.dropna(subset=["transcript_id"]) + + gtf_df = gtf_df[["transcript_id", "start", "end", "strand"]].rename( + columns={ + "transcript_id": gtf_transcript_id_column, + "start": "start_transcript_position", + "end": "end_transcript_position", + } + ) - gtf_df = gtf_df[['transcript_id', 'start', 'end', 'strand']].rename( - columns={'transcript_id': gtf_transcript_id_column, 'start': 'start_transcript_position', 'end': 'end_transcript_position'}) - - merged_df = pd.merge(mutations, gtf_df, on=gtf_transcript_id_column, how='left') + merged_df = pd.merge(mutations, gtf_df, on=gtf_transcript_id_column, how="left") # Fill NaN values - merged_df['start_transcript_position'] = merged_df['start_transcript_position'].fillna(0) - merged_df['end_transcript_position'] = merged_df['end_transcript_position'].fillna(9999999) - merged_df['strand'] = merged_df['strand'].fillna('.') + merged_df["start_transcript_position"] = merged_df[ + "start_transcript_position" + ].fillna(0) + merged_df["end_transcript_position"] = merged_df["end_transcript_position"].fillna( + 9999999 + ) + merged_df["strand"] = merged_df["strand"].fillna(".") return merged_df + def get_sequence_length(seq_id, seq_dict): return len(seq_dict.get(seq_id, "")) + def get_nucleotide_at_position(seq_id, pos, seq_dict): full_seq = seq_dict.get(seq_id, "") if pos < len(full_seq): return full_seq[pos] return None + def translate_sequence(sequence, start, end): amino_acid_sequence = "" for i in range(start, end, 3): @@ -176,14 +201,17 @@ def translate_sequence(sequence, start, end): # def remove_all_but_first_gt(line): # return line[:1] + line[1:].replace(">", "") + def remove_gt_after_semicolon(line): - parts = line.split(';') + parts = line.split(";") # Remove '>' from the beginning of each part except the first part - parts = [parts[0]] + [part.lstrip('>') for part in parts[1:]] - return ';'.join(parts) + parts = [parts[0]] + [part.lstrip(">") for part in parts[1:]] + return ";".join(parts) -def wt_fragment_and_mutant_fragment_share_kmer(mutated_fragment: str, wildtype_fragment: str, k: int) -> bool: +def wt_fragment_and_mutant_fragment_share_kmer( + mutated_fragment: str, wildtype_fragment: str, k: int +) -> bool: if len(mutated_fragment) <= k: if mutated_fragment in wildtype_fragment: return True @@ -191,7 +219,7 @@ def wt_fragment_and_mutant_fragment_share_kmer(mutated_fragment: str, wildtype_f return False else: for mutant_position in range(len(mutated_fragment) - k): - mutant_kmer = mutated_fragment[mutant_position:mutant_position + k] + mutant_kmer = mutated_fragment[mutant_position : mutant_position + k] if mutant_kmer in wildtype_fragment: # wt_position = wildtype_fragment.find(mutant_kmer) return True @@ -233,9 +261,9 @@ def add_mutation_type(mutations, mut_column): return mutations -def extract_sequence(row, seq_dict, seq_id_column = "seq_ID"): +def extract_sequence(row, seq_dict, seq_id_column="seq_ID"): if pd.isna(row["start_mutation_position"]) or pd.isna(row["end_mutation_position"]): - return None + return None seq = seq_dict[row[seq_id_column]][ int(row["start_mutation_position"]) : int(row["end_mutation_position"]) + 1 ] @@ -328,10 +356,22 @@ def mutate( mut_column: str = "mutation", seq_id_column: str = "seq_ID", mut_id_column: Optional[str] = None, - k: Optional[int] = None, + gtf: Optional[str] = None, + gtf_transcript_id_column: Optional[str] = None, + k: int = 30, + min_seq_len: Optional[int] = None, + optimize_flanking_regions: bool = False, + remove_seqs_with_wt_kmers: bool = False, + max_ambiguous: Optional[int] = None, + merge_identical: bool = True, + update_df: bool = False, + update_df_out: Optional[str] = None, + store_full_sequences: bool = False, + translate: bool = False, + translate_start: Union[int, str, None] = None, + translate_end: Union[int, str, None] = None, out: Optional[str] = None, verbose: bool = True, - **kwargs, ): """ Takes in nucleotide sequences and mutations (in standard mutation annotation - see below) @@ -353,7 +393,7 @@ def mutate( NOTE: Only the letters until the first space or dot will be used as sequence identifiers - Version numbers of Ensembl IDs will be ignored. NOTE: When 'sequences' input is a genome, also see 'gtf' argument below. - + - mutations Path to csv or tsv file (str) (e.g., 'mutations.csv') or data frame (DataFrame object) containing information about the mutations in the following format: @@ -371,73 +411,69 @@ def mutate( Alternatively: Input mutation(s) as a string or list, e.g., 'c.2C>T' or ['c.2C>T', 'c.1A>C']. If a list is provided, the number of mutations must equal the number of input sequences. - + For more information on the standard mutation annotation, see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1867422/. - + Additional input arguments: - mut_column (str) Name of the column containing the mutations to be performed in 'mutations'. Default: 'mutation'. - seq_id_column (str) Name of the column containing the IDs of the sequences to be mutated in 'mutations'. Default: 'seq_ID'. - mut_id_column (str) Name of the column containing the IDs of each mutation in 'mutations'. Default: Will use mut_column. - - gtf (str) Path to .gtf file. When providing a genome fasta file as input for 'sequences', you can provide a .gtf file here + - gtf (str) Path to .gtf file. When providing a genome fasta file as input for 'sequences', you can provide a .gtf file here and the input sequences will be defined according to the transcript boundaries. Default: None - gtf_transcript_id_column (str) Column name in the input 'mutations' file containing the transcript ID. In this case, column seq_id_column should contain the chromosome number. Required when 'gtf' is provided. Default: None Mutant sequence generation/filtering options: - - k (int) Length of sequences flanking the mutation. Default: None (take entire sequence). + - k (int) Length of sequences flanking the mutation. Default: 30. If k > total length of the sequence, the entire sequence will be kept. - - min_seq_len (int) Minimum length of the mutant output sequence. Mutant sequences smaller than this will be dropped. + - min_seq_len (int) Minimum length of the mutant output sequence. Mutant sequences smaller than this will be dropped. Default: None - - optimize_flanking_regions (True/False) Whether to remove nucleotides from either end of the mutant sequence to ensure (when possible) + - optimize_flanking_regions (True/False) Whether to remove nucleotides from either end of the mutant sequence to ensure (when possible) that the mutant sequence does not contain any k-mers also found in the wildtype/input sequence. Default: False - - remove_seqs_with_wt_kmers (True/False) Removes output sequences where at least one (k+1)-mer is also present in the wildtype/input sequence in the same region. - If optimize_flanking_regions=True, only sequences for which a wildtpye kmer is still present after optimization will be removed. + - remove_seqs_with_wt_kmers (True/False) Removes output sequences where at least one (k+1)-mer is also present in the wildtype/input sequence in the same region. + If optimize_flanking_regions=True, only sequences for which a wildtype kmer is still present after optimization will be removed. Default: False - max_ambiguous (int) Maximum number of 'N' characters allowed in the output sequence. Default: None (no 'N' filter will be applied) - - merge_identical (True/False) Whether to merge identical mutant sequences in the output (identical sequences will be merged by concatenating the sequence + - merge_identical (True/False) Whether to merge identical mutant sequences in the output (identical sequences will be merged by concatenating the sequence headers for all identical sequences). Default: True - - merge_identical_rc (True/False) Whether to merge identical sequences and their reverse complements in the output. Only effective when merge_identical is also True. Default: True # Optional arguments to generate additional output stored in a copy of the 'mutations' DataFrame - - update_df (True/False) Whether to update the input 'mutations' DataFrame to include additional columns with the mutation type, + - update_df (True/False) Whether to update the input 'mutations' DataFrame to include additional columns with the mutation type, wildtype nucleotide sequence, and mutant nucleotide sequence (only valid if 'mutations' is a csv or tsv file). Default: False - update_df_out (str) Path to output csv file containing the updated DataFrame. Only valid if update_df=True. Default: None -> the new DataFrame will be saved in the same directory as the 'mutations' DataFrame with appendix '_updated' - - store_full_sequences (True/False) Whether to also include the complete wildtype and mutant sequences in the updated 'mutations' DataFrame (not just the sub-sequence with + - store_full_sequences (True/False) Whether to also include the complete wildtype and mutant sequences in the updated 'mutations' DataFrame (not just the sub-sequence with k-length flanks). Only valid if update_df=True. Default: False - - translate (True/False) Add additional columns to the 'mutations' DataFrame containing the wildtype and mutant amino acid sequences. + - translate (True/False) Add additional columns to the 'mutations' DataFrame containing the wildtype and mutant amino acid sequences. Only valid if store_full_sequences=True. Default: False - - translate_start (int | str | None) The position in the input nucleotide sequence to start translating. If a string is provided, it should correspond + - translate_start (int | str | None) The position in the input nucleotide sequence to start translating. If a string is provided, it should correspond to a column name in 'mutations' containing the open reading frame start positions for each sequence/mutation. Only valid if translate=True. Default: None (translate from the beginning of the sequence) - - translate_end (int | str | None) The position in the input nucleotide sequence to end translating. If a string is provided, it should correspond + - translate_end (int | str | None) The position in the input nucleotide sequence to end translating. If a string is provided, it should correspond to a column name in 'mutations' containing the open reading frame end positions for each sequence/mutation. Only valid if translate=True. Default: None (translate from to the end of the sequence) - + # General arguments: - out (str) Path to output fasta file containing the mutated sequences, e.g., 'path/to/output_fasta.fa'. Default: None -> returns a list of the mutated sequences to standard out. The identifiers (following the '>') of the mutated sequences in the output fasta will be '>[seq_ID]_[mut_ID]'. - verbose (True/False) whether to print progress information. Default: True - Saves mutated sequences in fasta format (or returns a list containing the mutated sequences if out=None). + Saves mutated sequences in fasta format (or, if out=None: when update_df is True, returns the mutation dataframe, otherwise returns a list containing the mutated sequences). """ - if kwargs.get("gtf") or kwargs.get("gtf_transcript_id_column") or kwargs.get("optimize_flanking_regions") or kwargs.get("remove_seqs_with_wt_kmers") or kwargs.get("min_seq_len") or kwargs.get("max_ambiguous") or kwargs.get("merge_identical") or kwargs.get("merge_identical_rc") or kwargs.get("update_df") or kwargs.get("update_df_out") or kwargs.get("store_full_sequences") or kwargs.get("translate") or kwargs.get("translate_start") or kwargs.get("translate_end"): - # print a log message and raise exception - logger.critical( - """ - It appears that you are passing in arguments that are not supported anymore in gget mutate. For use of these arguments, please check out https://github.com/pachterlab/kvar. - """ - ) - raise NotImplementedError - global intronic_mutations, posttranslational_region_mutations, unknown_mutations, uncertain_mutations, ambiguous_position_mutations, cosmic_incorrect_wt_base, mut_idx_outside_seq - columns_to_keep = ["header", seq_id_column, mut_column, "mutation_type", "wt_sequence", "mutant_sequence"] - - if k is None: - k = 999999999 # take entire sequence by default + columns_to_keep = [ + "header", + seq_id_column, + mut_column, + "mutation_type", + "wt_sequence", + "mutant_sequence", + "start_mutation_position", + "end_mutation_position" + ] # Load input sequences and their identifiers from fasta file if "." in sequences: @@ -472,14 +508,18 @@ def mutate( mutations = pd.read_csv(mutations) for col in mutations.columns: if col not in columns_to_keep: - columns_to_keep.append(col) # append "mutation_aa", "gene_name", "mutation_id" + columns_to_keep.append( + col + ) # append "mutation_aa", "gene_name", "mutation_id" elif isinstance(mutations, str) and mutations.endswith(".tsv"): mutations_path = mutations mutations = pd.read_csv(mutations, sep="\t") for col in mutations.columns: if col not in columns_to_keep: - columns_to_keep.append(col) # append "mutation_aa", "gene_name", "mutation_id" + columns_to_keep.append( + col + ) # append "mutation_aa", "gene_name", "mutation_id" # Handle mutations passed as a list elif isinstance(mutations, list): @@ -560,10 +600,16 @@ def mutate( mutations = mutations.dropna(subset=[seq_id_column]) # ensure seq_ID column is string type, and chromosome numbers don't have decimals - mutations[seq_id_column] = mutations[seq_id_column].apply(convert_chromosome_value_to_int_when_possible) + mutations[seq_id_column] = mutations[seq_id_column].apply( + convert_chromosome_value_to_int_when_possible + ) mutations = add_mutation_type(mutations, mut_column) + # Link sequences to their mutations using the sequence identifiers + if store_full_sequences: + mutations["wt_sequence_full"] = mutations[seq_id_column].map(seq_dict) + # Handle sequences that were not found based on their sequence IDs seqs_not_found = mutations[~mutations[seq_id_column].isin(seq_dict.keys())] if 0 < len(seqs_not_found) < 20: @@ -594,16 +640,14 @@ def mutate( ) total_mutations = mutations.shape[0] - mutations["mutant_sequence"] = "" - if mut_id_column is not None: - mutations["header"] = ( - ">" + mutations[mut_id_column] - ) - else: - mutations["header"] = ( - ">" + mutations[seq_id_column] + ":" + mutations[mut_column] - ) + if mut_id_column is None: + mut_id_column = mut_column + + mutations["mutant_sequence"] = "" + mutations["header"] = ( + ">" + mutations[seq_id_column] + ":" + mutations[mut_id_column] + ) # Calculate number of bad mutations uncertain_mutations = mutations[mut_column].str.contains(r"\?").sum() @@ -630,7 +674,7 @@ def mutate( if mutations.empty: logger.warning("No valid mutations found in the input.") - return [] + return mutations if update_df else [] # Split nucleotide positions into start and end positions split_positions = mutations["nucleotide_positions"].str.split("_", expand=True) @@ -656,7 +700,9 @@ def mutate( mutations["end_mutation_position"] -= 1 # don't forget to increment by 1 later # Calculate sequence length - mutations["sequence_length"] = mutations[seq_id_column].apply(lambda x: get_sequence_length(x, seq_dict)) + mutations["sequence_length"] = mutations[seq_id_column].apply( + lambda x: get_sequence_length(x, seq_dict) + ) # Filter out mutations with positions outside the sequence index_error_mask = ( @@ -669,7 +715,7 @@ def mutate( if mutations.empty: logger.warning("No valid mutations found in the input.") - return [] + return mutations if update_df else [] # Create masks for each type of mutation mutations["wt_nucleotides_ensembl"] = None @@ -680,6 +726,31 @@ def mutate( duplication_mask = mutations["mutation_type"] == "duplication" inversion_mask = mutations["mutation_type"] == "inversion" + if remove_seqs_with_wt_kmers: + long_duplications = ( + (duplication_mask) + & ( + ( + mutations["end_mutation_position"] + - mutations["start_mutation_position"] + ) + >= k + ) + ).sum() + logger.info(f"Removing {long_duplications} duplications > k") + mutations = mutations[ + ~( + (duplication_mask) + & ( + ( + mutations["end_mutation_position"] + - mutations["start_mutation_position"] + ) + >= k + ) + ) + ] + # Create a mask for all non-substitution mutations non_substitution_mask = ( deletion_mask | delins_mask | insertion_mask | duplication_mask | inversion_mask @@ -687,12 +758,16 @@ def mutate( # Extract the WT nucleotides for the substitution rows from reference fasta (i.e., Ensembl) start_positions = mutations.loc[substitution_mask, "start_mutation_position"].values - + # Get the nucleotides at the start positions - wt_nucleotides_substitution = np.array([ - get_nucleotide_at_position(seq_id, pos, seq_dict) - for seq_id, pos in zip(mutations.loc[substitution_mask, seq_id_column], start_positions) - ]) + wt_nucleotides_substitution = np.array( + [ + get_nucleotide_at_position(seq_id, pos, seq_dict) + for seq_id, pos in zip( + mutations.loc[substitution_mask, seq_id_column], start_positions + ) + ] + ) mutations.loc[substitution_mask, "wt_nucleotides_ensembl"] = ( wt_nucleotides_substitution @@ -705,8 +780,9 @@ def mutate( ].str[0] congruent_wt_bases_mask = ( - (mutations["wt_nucleotides_cosmic"] == mutations["wt_nucleotides_ensembl"]) | - mutations[["wt_nucleotides_cosmic", "wt_nucleotides_ensembl"]].isna().any(axis=1) + mutations["wt_nucleotides_cosmic"] == mutations["wt_nucleotides_ensembl"] + ) | mutations[["wt_nucleotides_cosmic", "wt_nucleotides_ensembl"]].isna().any( + axis=1 ) cosmic_incorrect_wt_base = (~congruent_wt_bases_mask).sum() @@ -715,7 +791,7 @@ def mutate( if mutations.empty: logger.warning("No valid mutations found in the input.") - return [] + return mutations if update_df else [] # Adjust the start and end positions for insertions mutations.loc[ @@ -775,7 +851,62 @@ def mutate( axis=1 ) # don't forget to increment by 1 later on - mut_apply = (lambda *args, **kwargs: mutations.progress_apply(*args, **kwargs)) if verbose else mutations.apply + if gtf is not None: + assert mutations_path.endswith(".csv") or mutations_path.endswith( + ".tsv" + ), "Mutations must be a CSV or TSV file" + if ( + "start_transcript_position" not in mutations.columns + and "end_transcript_position" not in mutations.columns + ): # * currently hard-coded column names, but optionally can be changed to arguments later + mutations = merge_gtf_transcript_locations_into_cosmic_csv( + mutations, gtf, gtf_transcript_id_column=gtf_transcript_id_column + ) + columns_to_keep.extend( + ["start_transcript_position", "end_transcript_position", "strand"] + ) + else: + logger.warning( + "Transcript positions already present in the input mutations file. Skipping GTF file merging." + ) + + # adjust start_transcript_position to be 0-index + mutations["start_transcript_position"] -= 1 + + mutations["start_kmer_position"] = mutations[ + ["start_kmer_position", "start_transcript_position"] + ].max(axis=1) + mutations["end_kmer_position"] = mutations[ + ["end_kmer_position", "end_transcript_position"] + ].min(axis=1) + + mut_apply = ( + (lambda *args, **kwargs: mutations.progress_apply(*args, **kwargs)) + if verbose + else mutations.apply + ) + + if update_df and store_full_sequences: + # Extract flank sequences + if verbose: + tqdm.pandas(desc="Extracting full left flank sequences") + + mutations["left_flank_region_full"] = mut_apply( + lambda row: seq_dict[row[seq_id_column]][ + 0 : row["start_mutation_position"] + ], + axis=1, + ) # ? vectorize + + if verbose: + tqdm.pandas(desc="Extracting full right flank sequences") + + mutations["right_flank_region_full"] = mut_apply( + lambda row: seq_dict[row[seq_id_column]][ + row["end_mutation_position"] + 1 : row["sequence_length"] + ], + axis=1, + ) # ? vectorize if verbose: tqdm.pandas(desc="Extracting k-mer left flank sequences") @@ -812,8 +943,52 @@ def mutate( # To what extend the beginning of i overlaps with the beginning of d --> shave up to that many nucleotides off the beginning of r1 until k - len(r1) ≥ extent of overlap # To what extend the end of i overlaps with the beginning of d --> shave up to that many nucleotides off the end of r2 until k - len(r2) ≥ extent of overlap - mutations["updated_left_flank_start"] = 0 - mutations["updated_right_flank_end"] = 0 + if optimize_flanking_regions: + # Apply the function for beginning of mut_nucleotides with right_flank_region + mutations.loc[ + non_substitution_mask, "beginning_mutation_overlap_with_right_flank" + ] = mutations.loc[non_substitution_mask].apply( + calculate_beginning_mutation_overlap_with_right_flank, axis=1 + ) + + # Apply the function for end of mut_nucleotides with left_flank_region + mutations.loc[non_substitution_mask, "end_mutation_overlap_with_left_flank"] = ( + mutations.loc[non_substitution_mask].apply( + calculate_end_mutation_overlap_with_left_flank, axis=1 + ) + ) + + # Calculate k-len(flank) (see above instructions) + mutations.loc[non_substitution_mask, "k_minus_left_flank_length"] = ( + k - mutations.loc[non_substitution_mask, "left_flank_region"].apply(len) + ) + mutations.loc[non_substitution_mask, "k_minus_right_flank_length"] = ( + k - mutations.loc[non_substitution_mask, "right_flank_region"].apply(len) + ) + + mutations.loc[non_substitution_mask, "updated_left_flank_start"] = np.maximum( + mutations.loc[ + non_substitution_mask, "beginning_mutation_overlap_with_right_flank" + ] + - mutations.loc[non_substitution_mask, "k_minus_left_flank_length"], + 0, + ) + mutations.loc[non_substitution_mask, "updated_right_flank_end"] = np.maximum( + mutations.loc[non_substitution_mask, "end_mutation_overlap_with_left_flank"] + - mutations.loc[non_substitution_mask, "k_minus_right_flank_length"], + 0, + ) + + mutations["updated_left_flank_start"] = ( + mutations["updated_left_flank_start"].fillna(0).astype(int) + ) + mutations["updated_right_flank_end"] = ( + mutations["updated_right_flank_end"].fillna(0).astype(int) + ) + + else: + mutations["updated_left_flank_start"] = 0 + mutations["updated_right_flank_end"] = 0 # Create WT substitution k-mer sequences mutations.loc[substitution_mask, "wt_sequence"] = ( @@ -828,7 +1003,9 @@ def mutate( ].apply( lambda row: row["left_flank_region"][row["updated_left_flank_start"] :] + row["wt_nucleotides_ensembl"] - + row["right_flank_region"][: len(row['right_flank_region']) - row["updated_right_flank_end"]], + + row["right_flank_region"][ + : len(row["right_flank_region"]) - row["updated_right_flank_end"] + ], axis=1, ) @@ -845,10 +1022,43 @@ def mutate( ].apply( lambda row: row["left_flank_region"][row["updated_left_flank_start"] :] + row["mut_nucleotides"] - + row["right_flank_region"][: len(row['right_flank_region']) - row["updated_right_flank_end"]], + + row["right_flank_region"][ + : len(row["right_flank_region"]) - row["updated_right_flank_end"] + ], axis=1, ) + if remove_seqs_with_wt_kmers: + if verbose: + tqdm.pandas( + desc="Removing mutant fragments that share a kmer with wt fragments" + ) + + mutations["wt_fragment_and_mutant_fragment_share_kmer"] = mut_apply( + lambda row: wt_fragment_and_mutant_fragment_share_kmer( + mutated_fragment=row["mutant_sequence"], + wildtype_fragment=row["wt_sequence"], + k=k + 1, + ), + axis=1, + ) + + mutations_overlapping_with_wt = mutations[ + "wt_fragment_and_mutant_fragment_share_kmer" + ].sum() + + mutations = mutations[~mutations["wt_fragment_and_mutant_fragment_share_kmer"]] + + if update_df and store_full_sequences: + columns_to_keep.extend(["wt_sequence_full", "mutant_sequence_full"]) + + # Create full sequences (substitution and non-substitution) + mutations["mutant_sequence_full"] = ( + mutations["left_flank_region_full"] + + mutations["mut_nucleotides"] + + mutations["right_flank_region_full"] + ) + # Calculate k-mer lengths and report the distribution mutations["mutant_sequence_kmer_length"] = mutations["mutant_sequence"].apply( lambda x: len(x) if pd.notna(x) else 0 @@ -856,6 +1066,49 @@ def mutate( max_length = mutations["mutant_sequence_kmer_length"].max() + if min_seq_len: + rows_less_than_minimum = ( + mutations["mutant_sequence_kmer_length"] < min_seq_len + ).sum() + + mutations = mutations[mutations["mutant_sequence_kmer_length"] >= min_seq_len] + + if verbose: + logger.info( + f"Removed {rows_less_than_minimum} mutant kmers with length less than {min_seq_len}..." + ) + + if max_ambiguous is not None: + # Get number of 'N' or 'n' occuring in the sequence + mutations["num_N"] = mutations["mutant_sequence"].str.lower().str.count("n") + num_rows_with_N = (mutations["num_N"] > max_ambiguous).sum() + mutations = mutations[mutations["num_N"] <= max_ambiguous] + + if verbose: + logger.info( + f"Removed {num_rows_with_N} mutant kmers containing more than {max_ambiguous} 'N's..." + ) + + # Drop the 'num_N' column after filtering + mutations = mutations.drop(columns=["num_N"]) + + try: + # Create bins of width 5 from 0 to max_length + bins = range(0, max_length + 6, 5) + + # Bin the lengths and count the number of elements in each bin + binned_lengths = pd.cut( + mutations["mutant_sequence_kmer_length"], bins=bins, right=False + ) + bin_counts = binned_lengths.value_counts().sort_index() + + # Display the report + if verbose: + logger.debug("Report of the number of elements in each bin of width 5:") + logger.debug(bin_counts) + except Exception as e: + pass + # split_cols = mutations[mut_id_column].str.split("_", n=1, expand=True) # if split_cols.shape[1] == 1: @@ -872,7 +1125,7 @@ def mutate( # if remove_seqs_with_wt_kmers: # good_mutations = good_mutations - long_duplications - mutations_overlapping_with_wt - + # if min_seq_len: # good_mutations = good_mutations - rows_less_than_minimum @@ -890,15 +1143,158 @@ def mutate( {mut_idx_outside_seq} mutations with indices outside of the sequence length found ({mut_idx_outside_seq/total_mutations*100:.2f}%) """ + if remove_seqs_with_wt_kmers: + report += f"""{long_duplications} duplications longer than k found ({long_duplications/total_mutations*100:.2f}%) + {mutations_overlapping_with_wt} mutations with overlapping kmers found ({mutations_overlapping_with_wt/total_mutations*100:.2f}%) + """ + + if min_seq_len: + report += f"""{rows_less_than_minimum} mutations with fragment length < k found ({rows_less_than_minimum/total_mutations*100:.2f}%) + """ + + if max_ambiguous is not None: + report += f"""{num_rows_with_N} mutations with Ns found ({num_rows_with_N/total_mutations*100:.2f}%) + """ + if good_mutations != total_mutations: logger.warning(report) else: logger.info("All mutations correctly recorded") + if translate and update_df and store_full_sequences: + columns_to_keep.extend(["wt_sequence_aa_full", "mutant_sequence_aa_full"]) + + if not mutations_path: + assert ( + type(translate_start) != str and type(translate_end) != str + ), "translate_start and translate_end must be integers when translating sequences (or default None)." + if translate_start is None: + translate_start = 0 + if translate_end is None: + translate_end = mutations["sequence_length"][0] + + # combined_df['ORF'] = combined_df[translate_start] % 3 + + if verbose: + tqdm.pandas(desc="Translating WT amino acid sequences") + mutations["wt_sequence_aa_full"] = mutations[ + "wt_sequence_full" + ].progress_apply( + lambda x: translate_sequence( + x, start=translate_start, end=translate_end + ) + ) + else: + mutations["wt_sequence_aa_full"] = mutations["wt_sequence_full"].apply( + lambda x: translate_sequence( + x, start=translate_start, end=translate_end + ) + ) + + if verbose: + tqdm.pandas(desc="Translating mutant amino acid sequences") + + mutations["mutant_sequence_aa_full"] = mutations[ + "mutant_sequence_full" + ].progress_apply( + lambda x: translate_sequence( + x, start=translate_start, end=translate_end + ) + ) + + else: + mutations["mutant_sequence_aa_full"] = mutations[ + "mutant_sequence_full" + ].apply( + lambda x: translate_sequence( + x, start=translate_start, end=translate_end + ) + ) + + print(f"Translated mutated sequences: {mutations['wt_sequence_aa_full']}") + else: + if not translate_start: + translate_start = "translate_start" + + if not translate_end: + translate_end = "translate_end" + + if translate_start not in mutations.columns: + mutations["translate_start"] = 0 + + if translate_end not in mutations.columns: + mutations["translate_end"] = mutations["sequence_length"] + + if verbose: + tqdm.pandas(desc="Translating WT amino acid sequences") + + mutations["wt_sequence_aa_full"] = mut_apply( + lambda row: translate_sequence( + row["wt_sequence_full"], row[translate_start], row[translate_end] + ), + axis=1, + ) + + if verbose: + tqdm.pandas(desc="Translating mutant amino acid sequences") + + mutations["mutant_sequence_aa_full"] = mut_apply( + lambda row: translate_sequence( + row["mutant_sequence_full"], + row[translate_start], + row[translate_end], + ), + axis=1, + ) + mutations = mutations[columns_to_keep] + if merge_identical: + logger.info("Merging identical mutated sequences") + if update_df: + logger.warning( + "Merging identical mutated sequences can take a while if update_df=True since it will concatenate all MCRSs too)" + ) + mutations = ( + mutations.groupby("mutant_sequence", sort=False) + .agg( + lambda x: ";".join(x.astype(str)) + ) # Concatenate values with semicolons + .reset_index() + ) + + else: + mutations = ( + mutations.groupby("mutant_sequence", sort=False, group_keys=False)[ + "header" + ] + .apply(";".join) + .reset_index() + ) + + # apply remove_gt_after_semicolon to mutant_sequence + mutations["header"] = mutations["header"].apply(remove_gt_after_semicolon) + + # Calculate the number of semicolons in each entry + mutations["semicolon_count"] = mutations["header"].str.count(";") + + mutations["semicolon_count"] += 1 + + # Convert all 1 values to NaN + mutations["semicolon_count"] = mutations["semicolon_count"].replace(1, np.nan) + + # Take the sum across all rows of the new column + total_semicolons = int(mutations["semicolon_count"].sum()) + + mutations = mutations.drop(columns=["semicolon_count"]) + + if verbose: + logger.info( + f"{total_semicolons} identical mutated sequences were merged (headers were combined and separated using a semicolon (;). Occurences of identical mutated sequences may be reduced by increasing k." + ) + empty_kmer_count = (mutations["mutant_sequence"] == "").sum() - + if empty_kmer_count > 0 and verbose: logger.warning( f"{empty_kmer_count} mutated sequences were empty and were not included in the output." @@ -906,7 +1302,26 @@ def mutate( mutations = mutations[mutations["mutant_sequence"] != ""] - mutations['header'] = mutations['header'].str[1:] # remove the > character + mutations["header"] = mutations["header"].str[1:] # remove the > character + + if update_df: + logger.info("Saving dataframe with updated mutation info...") + saved_updated_df = True + logger.warning( + "File size can be very large if the number of mutations is large." + ) + if not update_df_out: + if not mutations_path: + # logger.warning( + # "mutations_path must be provided if update_df is True and update_df_out is not provided." + # ) + saved_updated_df = False + else: + base_name, ext = os.path.splitext(mutations_path) + update_df_out = f"{base_name}_updated{ext}" + if saved_updated_df: + mutations.to_csv(update_df_out, index=False) + print(f"Updated mutation info has been saved to {update_df_out}") mutations["fasta_format"] = ( ">" + mutations["header"] + "\n" + mutations["mutant_sequence"] + "\n" @@ -922,13 +1337,17 @@ def mutate( # When out=None, return list of mutated seqs else: - all_mut_seqs = [] - all_mut_seqs.extend(mutations["mutant_sequence"].values) - - # Remove empty strings from final list of mutated sequences - # (these are introduced when unknown mutations are encountered) - while "" in all_mut_seqs: - all_mut_seqs.remove("") - - if len(all_mut_seqs) > 0: - return all_mut_seqs + if update_df: + return mutations[columns_to_keep] + else: + all_mut_seqs = [] + all_mut_seqs.extend(mutations["mutant_sequence"].values) + + # Remove empty strings from final list of mutated sequences + # (these are introduced when unknown mutations are encountered) + while "" in all_mut_seqs: + all_mut_seqs.remove("") + + if len(all_mut_seqs) > 0: + return all_mut_seqs + return []