diff --git a/gget/gget_mutate.py b/gget/gget_mutate.py
index 920bd3ac..5f8d00d6 100644
--- a/gget/gget_mutate.py
+++ b/gget/gget_mutate.py
@@ -22,12 +22,6 @@
 
 mutation_pattern = r"(?:c|g)\.([0-9_\-\+\*]+)([a-zA-Z>]+)"  # more complex: r'c\.([0-9_\-\+\*\(\)\?]+)([a-zA-Z>\(\)0-9]+)'
 
-def reverse_complement(seq):
-    if pd.isna(seq):  # Check if the sequence is NaN
-        return np.nan
-    complement = str.maketrans("ATCGNatcgn.*", "TAGCNtagcn.*")
-    return seq.translate(complement)[::-1]
-
 # Get complement
 complement = {
     "A": "T",
@@ -124,43 +118,74 @@ def convert_chromosome_value_to_int_when_possible(val):
         return val
 
 
-def merge_gtf_transcript_locations_into_cosmic_csv(mutations, gtf_path, gtf_transcript_id_column):
-    gtf_df = pd.read_csv(gtf_path, sep='\t', comment='#', header=None, names=[
-    'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'])
+def merge_gtf_transcript_locations_into_cosmic_csv(
+    mutations, gtf_path, gtf_transcript_id_column
+):
+    gtf_df = pd.read_csv(
+        gtf_path,
+        sep="\t",
+        comment="#",
+        header=None,
+        names=[
+            "seqname",
+            "source",
+            "feature",
+            "start",
+            "end",
+            "score",
+            "strand",
+            "frame",
+            "attribute",
+        ],
+    )
 
-    if 'strand' in mutations.columns:
-        mutations.rename(columns={'strand': 'strand_original'}, inplace=True)
+    if "strand" in mutations.columns:
+        mutations.rename(columns={"strand": "strand_original"}, inplace=True)
 
-    gtf_df = gtf_df[gtf_df['feature'] == 'transcript']
+    gtf_df = gtf_df[gtf_df["feature"] == "transcript"]
 
-    gtf_df['transcript_id'] = gtf_df['attribute'].str.extract('transcript_id "([^"]+)"')
+    gtf_df["transcript_id"] = gtf_df["attribute"].str.extract('transcript_id "([^"]+)"')
 
-    assert len(gtf_df['transcript_id']) == len(set(gtf_df['transcript_id'])), "Duplicate transcript_id values found!"
+    assert len(gtf_df["transcript_id"]) == len(
+        set(gtf_df["transcript_id"])
+    ), "Duplicate transcript_id values found!"
 
     # Filter out rows where transcript_id is NaN
-    gtf_df = gtf_df.dropna(subset=['transcript_id'])
+    gtf_df = gtf_df.dropna(subset=["transcript_id"])
+
+    gtf_df = gtf_df[["transcript_id", "start", "end", "strand"]].rename(
+        columns={
+            "transcript_id": gtf_transcript_id_column,
+            "start": "start_transcript_position",
+            "end": "end_transcript_position",
+        }
+    )
 
-    gtf_df = gtf_df[['transcript_id', 'start', 'end', 'strand']].rename(
-        columns={'transcript_id': gtf_transcript_id_column, 'start': 'start_transcript_position', 'end': 'end_transcript_position'})
-    
-    merged_df = pd.merge(mutations, gtf_df, on=gtf_transcript_id_column, how='left')
+    merged_df = pd.merge(mutations, gtf_df, on=gtf_transcript_id_column, how="left")
 
     # Fill NaN values
-    merged_df['start_transcript_position'] = merged_df['start_transcript_position'].fillna(0)
-    merged_df['end_transcript_position'] = merged_df['end_transcript_position'].fillna(9999999)
-    merged_df['strand'] = merged_df['strand'].fillna('.')
+    merged_df["start_transcript_position"] = merged_df[
+        "start_transcript_position"
+    ].fillna(0)
+    merged_df["end_transcript_position"] = merged_df["end_transcript_position"].fillna(
+        9999999
+    )
+    merged_df["strand"] = merged_df["strand"].fillna(".")
 
     return merged_df
 
+
 def get_sequence_length(seq_id, seq_dict):
     return len(seq_dict.get(seq_id, ""))
 
+
 def get_nucleotide_at_position(seq_id, pos, seq_dict):
     full_seq = seq_dict.get(seq_id, "")
     if pos < len(full_seq):
         return full_seq[pos]
     return None
 
+
 def translate_sequence(sequence, start, end):
     amino_acid_sequence = ""
     for i in range(start, end, 3):
@@ -176,14 +201,17 @@ def translate_sequence(sequence, start, end):
 # def remove_all_but_first_gt(line):
 #     return line[:1] + line[1:].replace(">", "")
 
+
 def remove_gt_after_semicolon(line):
-    parts = line.split(';')
+    parts = line.split(";")
     # Remove '>' from the beginning of each part except the first part
-    parts = [parts[0]] + [part.lstrip('>') for part in parts[1:]]
-    return ';'.join(parts)
+    parts = [parts[0]] + [part.lstrip(">") for part in parts[1:]]
+    return ";".join(parts)
 
 
-def wt_fragment_and_mutant_fragment_share_kmer(mutated_fragment: str, wildtype_fragment: str, k: int) -> bool:
+def wt_fragment_and_mutant_fragment_share_kmer(
+    mutated_fragment: str, wildtype_fragment: str, k: int
+) -> bool:
     if len(mutated_fragment) <= k:
         if mutated_fragment in wildtype_fragment:
             return True
@@ -191,7 +219,7 @@ def wt_fragment_and_mutant_fragment_share_kmer(mutated_fragment: str, wildtype_f
             return False
     else:
         for mutant_position in range(len(mutated_fragment) - k):
-            mutant_kmer = mutated_fragment[mutant_position:mutant_position + k]
+            mutant_kmer = mutated_fragment[mutant_position : mutant_position + k]
             if mutant_kmer in wildtype_fragment:
                 # wt_position = wildtype_fragment.find(mutant_kmer)
                 return True
@@ -233,9 +261,9 @@ def add_mutation_type(mutations, mut_column):
     return mutations
 
 
-def extract_sequence(row, seq_dict, seq_id_column = "seq_ID"):
+def extract_sequence(row, seq_dict, seq_id_column="seq_ID"):
     if pd.isna(row["start_mutation_position"]) or pd.isna(row["end_mutation_position"]):
-        return None    
+        return None
     seq = seq_dict[row[seq_id_column]][
         int(row["start_mutation_position"]) : int(row["end_mutation_position"]) + 1
     ]
@@ -328,10 +356,22 @@ def mutate(
     mut_column: str = "mutation",
     seq_id_column: str = "seq_ID",
     mut_id_column: Optional[str] = None,
-    k: Optional[int] = None,
+    gtf: Optional[str] = None,
+    gtf_transcript_id_column: Optional[str] = None,
+    k: int = 30,
+    min_seq_len: Optional[int] = None,
+    optimize_flanking_regions: bool = False,
+    remove_seqs_with_wt_kmers: bool = False,
+    max_ambiguous: Optional[int] = None,
+    merge_identical: bool = True,
+    update_df: bool = False,
+    update_df_out: Optional[str] = None,
+    store_full_sequences: bool = False,
+    translate: bool = False,
+    translate_start: Union[int, str, None] = None,
+    translate_end: Union[int, str, None] = None,
     out: Optional[str] = None,
     verbose: bool = True,
-    **kwargs,
 ):
     """
     Takes in nucleotide sequences and mutations (in standard mutation annotation - see below)
@@ -353,7 +393,7 @@ def mutate(
                     NOTE: Only the letters until the first space or dot will be used as sequence identifiers
                     - Version numbers of Ensembl IDs will be ignored.
                     NOTE: When 'sequences' input is a genome, also see 'gtf' argument below.
-                    
+
     - mutations     Path to csv or tsv file (str) (e.g., 'mutations.csv') or data frame (DataFrame object)
                     containing information about the mutations in the following format:
 
@@ -371,73 +411,69 @@ def mutate(
 
                     Alternatively: Input mutation(s) as a string or list, e.g., 'c.2C>T' or ['c.2C>T', 'c.1A>C'].
                     If a list is provided, the number of mutations must equal the number of input sequences.
-                    
+
                     For more information on the standard mutation annotation, see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1867422/.
-   
+
     Additional input arguments:
     - mut_column                   (str) Name of the column containing the mutations to be performed in 'mutations'. Default: 'mutation'.
     - seq_id_column                (str) Name of the column containing the IDs of the sequences to be mutated in 'mutations'. Default: 'seq_ID'.
     - mut_id_column                (str) Name of the column containing the IDs of each mutation in 'mutations'. Default: Will use mut_column.
-    - gtf                          (str) Path to .gtf file. When providing a genome fasta file as input for 'sequences', you can provide a .gtf file here 
+    - gtf                          (str) Path to .gtf file. When providing a genome fasta file as input for 'sequences', you can provide a .gtf file here
                                    and the input sequences will be defined according to the transcript boundaries. Default: None
     - gtf_transcript_id_column     (str) Column name in the input 'mutations' file containing the transcript ID. In this case, column seq_id_column should contain the chromosome number.
                                    Required when 'gtf' is provided. Default: None
 
     Mutant sequence generation/filtering options:
-    - k                            (int) Length of sequences flanking the mutation. Default: None (take entire sequence).
+    - k                            (int) Length of sequences flanking the mutation. Default: 30.
                                    If k > total length of the sequence, the entire sequence will be kept.
-    - min_seq_len                  (int) Minimum length of the mutant output sequence. Mutant sequences smaller than this will be dropped. 
+    - min_seq_len                  (int) Minimum length of the mutant output sequence. Mutant sequences smaller than this will be dropped.
                                    Default: None
-    - optimize_flanking_regions    (True/False) Whether to remove nucleotides from either end of the mutant sequence to ensure (when possible) 
+    - optimize_flanking_regions    (True/False) Whether to remove nucleotides from either end of the mutant sequence to ensure (when possible)
                                    that the mutant sequence does not contain any k-mers also found in the wildtype/input sequence. Default: False
-    - remove_seqs_with_wt_kmers    (True/False) Removes output sequences where at least one (k+1)-mer is also present in the wildtype/input sequence in the same region. 
-                                   If optimize_flanking_regions=True, only sequences for which a wildtpye kmer is still present after optimization will be removed.
+    - remove_seqs_with_wt_kmers    (True/False) Removes output sequences where at least one (k+1)-mer is also present in the wildtype/input sequence in the same region.
+                                   If optimize_flanking_regions=True, only sequences for which a wildtype kmer is still present after optimization will be removed.
                                    Default: False
     - max_ambiguous                (int) Maximum number of 'N' characters allowed in the output sequence. Default: None (no 'N' filter will be applied)
-    - merge_identical              (True/False) Whether to merge identical mutant sequences in the output (identical sequences will be merged by concatenating the sequence 
+    - merge_identical              (True/False) Whether to merge identical mutant sequences in the output (identical sequences will be merged by concatenating the sequence
                                    headers for all identical sequences). Default: True
-    - merge_identical_rc           (True/False) Whether to merge identical sequences and their reverse complements in the output. Only effective when merge_identical is also True. Default: True
 
     # Optional arguments to generate additional output stored in a copy of the 'mutations' DataFrame
-    - update_df                    (True/False) Whether to update the input 'mutations' DataFrame to include additional columns with the mutation type, 
+    - update_df                    (True/False) Whether to update the input 'mutations' DataFrame to include additional columns with the mutation type,
                                    wildtype nucleotide sequence, and mutant nucleotide sequence (only valid if 'mutations' is a csv or tsv file). Default: False
     - update_df_out                (str) Path to output csv file containing the updated DataFrame. Only valid if update_df=True.
                                    Default: None -> the new DataFrame will be saved in the same directory as the 'mutations' DataFrame with appendix '_updated'
-    - store_full_sequences         (True/False) Whether to also include the complete wildtype and mutant sequences in the updated 'mutations' DataFrame (not just the sub-sequence with 
+    - store_full_sequences         (True/False) Whether to also include the complete wildtype and mutant sequences in the updated 'mutations' DataFrame (not just the sub-sequence with
                                    k-length flanks). Only valid if update_df=True. Default: False
-    - translate                    (True/False) Add additional columns to the 'mutations' DataFrame containing the wildtype and mutant amino acid sequences. 
+    - translate                    (True/False) Add additional columns to the 'mutations' DataFrame containing the wildtype and mutant amino acid sequences.
                                    Only valid if store_full_sequences=True. Default: False
-    - translate_start              (int | str | None) The position in the input nucleotide sequence to start translating. If a string is provided, it should correspond 
+    - translate_start              (int | str | None) The position in the input nucleotide sequence to start translating. If a string is provided, it should correspond
                                    to a column name in 'mutations' containing the open reading frame start positions for each sequence/mutation.
                                    Only valid if translate=True. Default: None (translate from the beginning of the sequence)
-    - translate_end                (int | str | None) The position in the input nucleotide sequence to end translating. If a string is provided, it should correspond 
+    - translate_end                (int | str | None) The position in the input nucleotide sequence to end translating. If a string is provided, it should correspond
                                    to a column name in 'mutations' containing the open reading frame end positions for each sequence/mutation.
                                    Only valid if translate=True. Default: None (translate from to the end of the sequence)
-    
+
     # General arguments:
     - out                          (str) Path to output fasta file containing the mutated sequences, e.g., 'path/to/output_fasta.fa'.
                                    Default: None -> returns a list of the mutated sequences to standard out.
                                    The identifiers (following the '>') of the mutated sequences in the output fasta will be '>[seq_ID]_[mut_ID]'.
     - verbose                      (True/False) whether to print progress information. Default: True
 
-    Saves mutated sequences in fasta format (or returns a list containing the mutated sequences if out=None).
+    Saves mutated sequences in fasta format (or, if out=None: when update_df is True, returns the mutation dataframe, otherwise returns a list containing the mutated sequences).
     """
 
-    if kwargs.get("gtf") or kwargs.get("gtf_transcript_id_column") or kwargs.get("optimize_flanking_regions") or kwargs.get("remove_seqs_with_wt_kmers") or kwargs.get("min_seq_len") or kwargs.get("max_ambiguous") or kwargs.get("merge_identical") or kwargs.get("merge_identical_rc") or kwargs.get("update_df") or kwargs.get("update_df_out") or kwargs.get("store_full_sequences") or kwargs.get("translate") or kwargs.get("translate_start") or kwargs.get("translate_end"):
-        # print a log message and raise exception
-        logger.critical(
-            """
-            It appears that you are passing in arguments that are not supported anymore in gget mutate. For use of these arguments, please check out https://github.com/pachterlab/kvar.
-            """
-        )
-        raise NotImplementedError
- 
     global intronic_mutations, posttranslational_region_mutations, unknown_mutations, uncertain_mutations, ambiguous_position_mutations, cosmic_incorrect_wt_base, mut_idx_outside_seq
 
-    columns_to_keep = ["header", seq_id_column, mut_column, "mutation_type", "wt_sequence", "mutant_sequence"]
-
-    if k is None:
-        k = 999999999  # take entire sequence by default
+    columns_to_keep = [
+        "header",
+        seq_id_column,
+        mut_column,
+        "mutation_type",
+        "wt_sequence",
+        "mutant_sequence",
+        "start_mutation_position",
+        "end_mutation_position"
+    ]
 
     # Load input sequences and their identifiers from fasta file
     if "." in sequences:
@@ -472,14 +508,18 @@ def mutate(
         mutations = pd.read_csv(mutations)
         for col in mutations.columns:
             if col not in columns_to_keep:
-                columns_to_keep.append(col)  # append "mutation_aa", "gene_name", "mutation_id"
+                columns_to_keep.append(
+                    col
+                )  # append "mutation_aa", "gene_name", "mutation_id"
 
     elif isinstance(mutations, str) and mutations.endswith(".tsv"):
         mutations_path = mutations
         mutations = pd.read_csv(mutations, sep="\t")
         for col in mutations.columns:
             if col not in columns_to_keep:
-                columns_to_keep.append(col)  # append "mutation_aa", "gene_name", "mutation_id"
+                columns_to_keep.append(
+                    col
+                )  # append "mutation_aa", "gene_name", "mutation_id"
 
     # Handle mutations passed as a list
     elif isinstance(mutations, list):
@@ -560,10 +600,16 @@ def mutate(
         mutations = mutations.dropna(subset=[seq_id_column])
 
     # ensure seq_ID column is string type, and chromosome numbers don't have decimals
-    mutations[seq_id_column] = mutations[seq_id_column].apply(convert_chromosome_value_to_int_when_possible)
+    mutations[seq_id_column] = mutations[seq_id_column].apply(
+        convert_chromosome_value_to_int_when_possible
+    )
 
     mutations = add_mutation_type(mutations, mut_column)
 
+    # Link sequences to their mutations using the sequence identifiers
+    if store_full_sequences:
+        mutations["wt_sequence_full"] = mutations[seq_id_column].map(seq_dict)
+
     # Handle sequences that were not found based on their sequence IDs
     seqs_not_found = mutations[~mutations[seq_id_column].isin(seq_dict.keys())]
     if 0 < len(seqs_not_found) < 20:
@@ -594,16 +640,14 @@ def mutate(
         )
 
     total_mutations = mutations.shape[0]
-    mutations["mutant_sequence"] = ""
 
-    if mut_id_column is not None:
-        mutations["header"] = (
-            ">" + mutations[mut_id_column]
-        )
-    else:
-        mutations["header"] = (
-            ">" + mutations[seq_id_column] + ":" + mutations[mut_column]
-        )
+    if mut_id_column is None:
+        mut_id_column = mut_column
+
+    mutations["mutant_sequence"] = ""
+    mutations["header"] = (
+        ">" + mutations[seq_id_column] + ":" + mutations[mut_id_column]
+    )
 
     # Calculate number of bad mutations
     uncertain_mutations = mutations[mut_column].str.contains(r"\?").sum()
@@ -630,7 +674,7 @@ def mutate(
 
     if mutations.empty:
         logger.warning("No valid mutations found in the input.")
-        return []
+        return mutations if update_df else []
 
     # Split nucleotide positions into start and end positions
     split_positions = mutations["nucleotide_positions"].str.split("_", expand=True)
@@ -656,7 +700,9 @@ def mutate(
     mutations["end_mutation_position"] -= 1  # don't forget to increment by 1 later
 
     # Calculate sequence length
-    mutations["sequence_length"] = mutations[seq_id_column].apply(lambda x: get_sequence_length(x, seq_dict))
+    mutations["sequence_length"] = mutations[seq_id_column].apply(
+        lambda x: get_sequence_length(x, seq_dict)
+    )
 
     # Filter out mutations with positions outside the sequence
     index_error_mask = (
@@ -669,7 +715,7 @@ def mutate(
 
     if mutations.empty:
         logger.warning("No valid mutations found in the input.")
-        return []
+        return mutations if update_df else []
 
     # Create masks for each type of mutation
     mutations["wt_nucleotides_ensembl"] = None
@@ -680,6 +726,31 @@ def mutate(
     duplication_mask = mutations["mutation_type"] == "duplication"
     inversion_mask = mutations["mutation_type"] == "inversion"
 
+    if remove_seqs_with_wt_kmers:
+        long_duplications = (
+            (duplication_mask)
+            & (
+                (
+                    mutations["end_mutation_position"]
+                    - mutations["start_mutation_position"]
+                )
+                >= k
+            )
+        ).sum()
+        logger.info(f"Removing {long_duplications} duplications > k")
+        mutations = mutations[
+            ~(
+                (duplication_mask)
+                & (
+                    (
+                        mutations["end_mutation_position"]
+                        - mutations["start_mutation_position"]
+                    )
+                    >= k
+                )
+            )
+        ]
+
     # Create a mask for all non-substitution mutations
     non_substitution_mask = (
         deletion_mask | delins_mask | insertion_mask | duplication_mask | inversion_mask
@@ -687,12 +758,16 @@ def mutate(
 
     # Extract the WT nucleotides for the substitution rows from reference fasta (i.e., Ensembl)
     start_positions = mutations.loc[substitution_mask, "start_mutation_position"].values
-    
+
     # Get the nucleotides at the start positions
-    wt_nucleotides_substitution = np.array([
-        get_nucleotide_at_position(seq_id, pos, seq_dict)
-        for seq_id, pos in zip(mutations.loc[substitution_mask, seq_id_column], start_positions)
-    ])
+    wt_nucleotides_substitution = np.array(
+        [
+            get_nucleotide_at_position(seq_id, pos, seq_dict)
+            for seq_id, pos in zip(
+                mutations.loc[substitution_mask, seq_id_column], start_positions
+            )
+        ]
+    )
 
     mutations.loc[substitution_mask, "wt_nucleotides_ensembl"] = (
         wt_nucleotides_substitution
@@ -705,8 +780,9 @@ def mutate(
     ].str[0]
 
     congruent_wt_bases_mask = (
-        (mutations["wt_nucleotides_cosmic"] == mutations["wt_nucleotides_ensembl"]) |
-        mutations[["wt_nucleotides_cosmic", "wt_nucleotides_ensembl"]].isna().any(axis=1)
+        mutations["wt_nucleotides_cosmic"] == mutations["wt_nucleotides_ensembl"]
+    ) | mutations[["wt_nucleotides_cosmic", "wt_nucleotides_ensembl"]].isna().any(
+        axis=1
     )
 
     cosmic_incorrect_wt_base = (~congruent_wt_bases_mask).sum()
@@ -715,7 +791,7 @@ def mutate(
 
     if mutations.empty:
         logger.warning("No valid mutations found in the input.")
-        return []
+        return mutations if update_df else []
 
     # Adjust the start and end positions for insertions
     mutations.loc[
@@ -775,7 +851,62 @@ def mutate(
         axis=1
     )  # don't forget to increment by 1 later on
 
-    mut_apply = (lambda *args, **kwargs: mutations.progress_apply(*args, **kwargs)) if verbose else mutations.apply
+    if gtf is not None:
+        assert mutations_path.endswith(".csv") or mutations_path.endswith(
+            ".tsv"
+        ), "Mutations must be a CSV or TSV file"
+        if (
+            "start_transcript_position" not in mutations.columns
+            and "end_transcript_position" not in mutations.columns
+        ):  # * currently hard-coded column names, but optionally can be changed to arguments later
+            mutations = merge_gtf_transcript_locations_into_cosmic_csv(
+                mutations, gtf, gtf_transcript_id_column=gtf_transcript_id_column
+            )
+            columns_to_keep.extend(
+                ["start_transcript_position", "end_transcript_position", "strand"]
+            )
+        else:
+            logger.warning(
+                "Transcript positions already present in the input mutations file. Skipping GTF file merging."
+            )
+
+        # adjust start_transcript_position to be 0-index
+        mutations["start_transcript_position"] -= 1
+
+        mutations["start_kmer_position"] = mutations[
+            ["start_kmer_position", "start_transcript_position"]
+        ].max(axis=1)
+        mutations["end_kmer_position"] = mutations[
+            ["end_kmer_position", "end_transcript_position"]
+        ].min(axis=1)
+
+    mut_apply = (
+        (lambda *args, **kwargs: mutations.progress_apply(*args, **kwargs))
+        if verbose
+        else mutations.apply
+    )
+
+    if update_df and store_full_sequences:
+        # Extract flank sequences
+        if verbose:
+            tqdm.pandas(desc="Extracting full left flank sequences")
+
+        mutations["left_flank_region_full"] = mut_apply(
+            lambda row: seq_dict[row[seq_id_column]][
+                0 : row["start_mutation_position"]
+            ],
+            axis=1,
+        )  # ? vectorize
+
+        if verbose:
+            tqdm.pandas(desc="Extracting full right flank sequences")
+
+        mutations["right_flank_region_full"] = mut_apply(
+            lambda row: seq_dict[row[seq_id_column]][
+                row["end_mutation_position"] + 1 : row["sequence_length"]
+            ],
+            axis=1,
+        )  # ? vectorize
 
     if verbose:
         tqdm.pandas(desc="Extracting k-mer left flank sequences")
@@ -812,8 +943,52 @@ def mutate(
     # To what extend the beginning of i overlaps with the beginning of d --> shave up to that many nucleotides off the beginning of r1 until k - len(r1) ≥ extent of overlap
     # To what extend the end of i overlaps with the beginning of d --> shave up to that many nucleotides off the end of r2 until k - len(r2) ≥ extent of overlap
 
-    mutations["updated_left_flank_start"] = 0
-    mutations["updated_right_flank_end"] = 0
+    if optimize_flanking_regions:
+        # Apply the function for beginning of mut_nucleotides with right_flank_region
+        mutations.loc[
+            non_substitution_mask, "beginning_mutation_overlap_with_right_flank"
+        ] = mutations.loc[non_substitution_mask].apply(
+            calculate_beginning_mutation_overlap_with_right_flank, axis=1
+        )
+
+        # Apply the function for end of mut_nucleotides with left_flank_region
+        mutations.loc[non_substitution_mask, "end_mutation_overlap_with_left_flank"] = (
+            mutations.loc[non_substitution_mask].apply(
+                calculate_end_mutation_overlap_with_left_flank, axis=1
+            )
+        )
+
+        # Calculate k-len(flank) (see above instructions)
+        mutations.loc[non_substitution_mask, "k_minus_left_flank_length"] = (
+            k - mutations.loc[non_substitution_mask, "left_flank_region"].apply(len)
+        )
+        mutations.loc[non_substitution_mask, "k_minus_right_flank_length"] = (
+            k - mutations.loc[non_substitution_mask, "right_flank_region"].apply(len)
+        )
+
+        mutations.loc[non_substitution_mask, "updated_left_flank_start"] = np.maximum(
+            mutations.loc[
+                non_substitution_mask, "beginning_mutation_overlap_with_right_flank"
+            ]
+            - mutations.loc[non_substitution_mask, "k_minus_left_flank_length"],
+            0,
+        )
+        mutations.loc[non_substitution_mask, "updated_right_flank_end"] = np.maximum(
+            mutations.loc[non_substitution_mask, "end_mutation_overlap_with_left_flank"]
+            - mutations.loc[non_substitution_mask, "k_minus_right_flank_length"],
+            0,
+        )
+
+        mutations["updated_left_flank_start"] = (
+            mutations["updated_left_flank_start"].fillna(0).astype(int)
+        )
+        mutations["updated_right_flank_end"] = (
+            mutations["updated_right_flank_end"].fillna(0).astype(int)
+        )
+
+    else:
+        mutations["updated_left_flank_start"] = 0
+        mutations["updated_right_flank_end"] = 0
 
     # Create WT substitution k-mer sequences
     mutations.loc[substitution_mask, "wt_sequence"] = (
@@ -828,7 +1003,9 @@ def mutate(
     ].apply(
         lambda row: row["left_flank_region"][row["updated_left_flank_start"] :]
         + row["wt_nucleotides_ensembl"]
-        + row["right_flank_region"][: len(row['right_flank_region']) - row["updated_right_flank_end"]],
+        + row["right_flank_region"][
+            : len(row["right_flank_region"]) - row["updated_right_flank_end"]
+        ],
         axis=1,
     )
 
@@ -845,10 +1022,43 @@ def mutate(
     ].apply(
         lambda row: row["left_flank_region"][row["updated_left_flank_start"] :]
         + row["mut_nucleotides"]
-        + row["right_flank_region"][: len(row['right_flank_region']) - row["updated_right_flank_end"]],
+        + row["right_flank_region"][
+            : len(row["right_flank_region"]) - row["updated_right_flank_end"]
+        ],
         axis=1,
     )
 
+    if remove_seqs_with_wt_kmers:
+        if verbose:
+            tqdm.pandas(
+                desc="Removing mutant fragments that share a kmer with wt fragments"
+            )
+
+        mutations["wt_fragment_and_mutant_fragment_share_kmer"] = mut_apply(
+            lambda row: wt_fragment_and_mutant_fragment_share_kmer(
+                mutated_fragment=row["mutant_sequence"],
+                wildtype_fragment=row["wt_sequence"],
+                k=k + 1,
+            ),
+            axis=1,
+        )
+
+        mutations_overlapping_with_wt = mutations[
+            "wt_fragment_and_mutant_fragment_share_kmer"
+        ].sum()
+
+        mutations = mutations[~mutations["wt_fragment_and_mutant_fragment_share_kmer"]]
+
+    if update_df and store_full_sequences:
+        columns_to_keep.extend(["wt_sequence_full", "mutant_sequence_full"])
+
+        # Create full sequences (substitution and non-substitution)
+        mutations["mutant_sequence_full"] = (
+            mutations["left_flank_region_full"]
+            + mutations["mut_nucleotides"]
+            + mutations["right_flank_region_full"]
+        )
+
     # Calculate k-mer lengths and report the distribution
     mutations["mutant_sequence_kmer_length"] = mutations["mutant_sequence"].apply(
         lambda x: len(x) if pd.notna(x) else 0
@@ -856,6 +1066,49 @@ def mutate(
 
     max_length = mutations["mutant_sequence_kmer_length"].max()
 
+    if min_seq_len:
+        rows_less_than_minimum = (
+            mutations["mutant_sequence_kmer_length"] < min_seq_len
+        ).sum()
+
+        mutations = mutations[mutations["mutant_sequence_kmer_length"] >= min_seq_len]
+
+        if verbose:
+            logger.info(
+                f"Removed {rows_less_than_minimum} mutant kmers with length less than {min_seq_len}..."
+            )
+
+    if max_ambiguous is not None:
+        # Get number of 'N' or 'n' occuring in the sequence
+        mutations["num_N"] = mutations["mutant_sequence"].str.lower().str.count("n")
+        num_rows_with_N = (mutations["num_N"] > max_ambiguous).sum()
+        mutations = mutations[mutations["num_N"] <= max_ambiguous]
+
+        if verbose:
+            logger.info(
+                f"Removed {num_rows_with_N} mutant kmers containing more than {max_ambiguous} 'N's..."
+            )
+
+        # Drop the 'num_N' column after filtering
+        mutations = mutations.drop(columns=["num_N"])
+
+    try:
+        # Create bins of width 5 from 0 to max_length
+        bins = range(0, max_length + 6, 5)
+
+        # Bin the lengths and count the number of elements in each bin
+        binned_lengths = pd.cut(
+            mutations["mutant_sequence_kmer_length"], bins=bins, right=False
+        )
+        bin_counts = binned_lengths.value_counts().sort_index()
+
+        # Display the report
+        if verbose:
+            logger.debug("Report of the number of elements in each bin of width 5:")
+            logger.debug(bin_counts)
+    except Exception as e:
+        pass
+
     # split_cols = mutations[mut_id_column].str.split("_", n=1, expand=True)
 
     # if split_cols.shape[1] == 1:
@@ -872,7 +1125,7 @@ def mutate(
 
     # if remove_seqs_with_wt_kmers:
     #     good_mutations = good_mutations - long_duplications - mutations_overlapping_with_wt
-        
+
     # if min_seq_len:
     #     good_mutations = good_mutations - rows_less_than_minimum
 
@@ -890,15 +1143,158 @@ def mutate(
         {mut_idx_outside_seq} mutations with indices outside of the sequence length found ({mut_idx_outside_seq/total_mutations*100:.2f}%)
         """
 
+    if remove_seqs_with_wt_kmers:
+        report += f"""{long_duplications} duplications longer than k found ({long_duplications/total_mutations*100:.2f}%)
+        {mutations_overlapping_with_wt} mutations with overlapping kmers found ({mutations_overlapping_with_wt/total_mutations*100:.2f}%)
+        """
+
+    if min_seq_len:
+        report += f"""{rows_less_than_minimum} mutations with fragment length < k found ({rows_less_than_minimum/total_mutations*100:.2f}%)
+        """
+
+    if max_ambiguous is not None:
+        report += f"""{num_rows_with_N} mutations with Ns found ({num_rows_with_N/total_mutations*100:.2f}%)
+        """
+
     if good_mutations != total_mutations:
         logger.warning(report)
     else:
         logger.info("All mutations correctly recorded")
 
+    if translate and update_df and store_full_sequences:
+        columns_to_keep.extend(["wt_sequence_aa_full", "mutant_sequence_aa_full"])
+
+        if not mutations_path:
+            assert (
+                type(translate_start) != str and type(translate_end) != str
+            ), "translate_start and translate_end must be integers when translating sequences (or default None)."
+            if translate_start is None:
+                translate_start = 0
+            if translate_end is None:
+                translate_end = mutations["sequence_length"][0]
+
+            # combined_df['ORF'] = combined_df[translate_start] % 3
+
+            if verbose:
+                tqdm.pandas(desc="Translating WT amino acid sequences")
+                mutations["wt_sequence_aa_full"] = mutations[
+                    "wt_sequence_full"
+                ].progress_apply(
+                    lambda x: translate_sequence(
+                        x, start=translate_start, end=translate_end
+                    )
+                )
+            else:
+                mutations["wt_sequence_aa_full"] = mutations["wt_sequence_full"].apply(
+                    lambda x: translate_sequence(
+                        x, start=translate_start, end=translate_end
+                    )
+                )
+
+            if verbose:
+                tqdm.pandas(desc="Translating mutant amino acid sequences")
+
+                mutations["mutant_sequence_aa_full"] = mutations[
+                    "mutant_sequence_full"
+                ].progress_apply(
+                    lambda x: translate_sequence(
+                        x, start=translate_start, end=translate_end
+                    )
+                )
+
+            else:
+                mutations["mutant_sequence_aa_full"] = mutations[
+                    "mutant_sequence_full"
+                ].apply(
+                    lambda x: translate_sequence(
+                        x, start=translate_start, end=translate_end
+                    )
+                )
+
+            print(f"Translated mutated sequences: {mutations['wt_sequence_aa_full']}")
+        else:
+            if not translate_start:
+                translate_start = "translate_start"
+
+            if not translate_end:
+                translate_end = "translate_end"
+
+            if translate_start not in mutations.columns:
+                mutations["translate_start"] = 0
+
+            if translate_end not in mutations.columns:
+                mutations["translate_end"] = mutations["sequence_length"]
+
+            if verbose:
+                tqdm.pandas(desc="Translating WT amino acid sequences")
+
+            mutations["wt_sequence_aa_full"] = mut_apply(
+                lambda row: translate_sequence(
+                    row["wt_sequence_full"], row[translate_start], row[translate_end]
+                ),
+                axis=1,
+            )
+
+            if verbose:
+                tqdm.pandas(desc="Translating mutant amino acid sequences")
+
+            mutations["mutant_sequence_aa_full"] = mut_apply(
+                lambda row: translate_sequence(
+                    row["mutant_sequence_full"],
+                    row[translate_start],
+                    row[translate_end],
+                ),
+                axis=1,
+            )
+
     mutations = mutations[columns_to_keep]
 
+    if merge_identical:
+        logger.info("Merging identical mutated sequences")
+        if update_df:
+            logger.warning(
+                "Merging identical mutated sequences can take a while if update_df=True since it will concatenate all MCRSs too)"
+            )
+            mutations = (
+                mutations.groupby("mutant_sequence", sort=False)
+                .agg(
+                    lambda x: ";".join(x.astype(str))
+                )  # Concatenate values with semicolons
+                .reset_index()
+            )
+
+        else:
+            mutations = (
+                mutations.groupby("mutant_sequence", sort=False, group_keys=False)[
+                    "header"
+                ]
+                .apply(";".join)
+                .reset_index()
+            )
+
+        # apply remove_gt_after_semicolon to mutant_sequence
+        mutations["header"] = mutations["header"].apply(remove_gt_after_semicolon)
+
+        # Calculate the number of semicolons in each entry
+        mutations["semicolon_count"] = mutations["header"].str.count(";")
+
+        mutations["semicolon_count"] += 1
+
+        # Convert all 1 values to NaN
+        mutations["semicolon_count"] = mutations["semicolon_count"].replace(1, np.nan)
+
+        # Take the sum across all rows of the new column
+        total_semicolons = int(mutations["semicolon_count"].sum())
+
+        mutations = mutations.drop(columns=["semicolon_count"])
+
+        if verbose:
+            logger.info(
+                f"{total_semicolons} identical mutated sequences were merged (headers were combined and separated using a semicolon (;). Occurences of identical mutated sequences may be reduced by increasing k."
+            )
+
     empty_kmer_count = (mutations["mutant_sequence"] == "").sum()
-    
+
     if empty_kmer_count > 0 and verbose:
         logger.warning(
             f"{empty_kmer_count} mutated sequences were empty and were not included in the output."
@@ -906,7 +1302,26 @@ def mutate(
 
     mutations = mutations[mutations["mutant_sequence"] != ""]
 
-    mutations['header'] = mutations['header'].str[1:]  # remove the > character
+    mutations["header"] = mutations["header"].str[1:]  # remove the > character
+
+    if update_df:
+        logger.info("Saving dataframe with updated mutation info...")
+        saved_updated_df = True
+        logger.warning(
+            "File size can be very large if the number of mutations is large."
+        )
+        if not update_df_out:
+            if not mutations_path:
+                # logger.warning(
+                #     "mutations_path must be provided if update_df is True and update_df_out is not provided."
+                # )
+                saved_updated_df = False
+            else:
+                base_name, ext = os.path.splitext(mutations_path)
+                update_df_out = f"{base_name}_updated{ext}"
+        if saved_updated_df:
+            mutations.to_csv(update_df_out, index=False)
+            print(f"Updated mutation info has been saved to {update_df_out}")
 
     mutations["fasta_format"] = (
         ">" + mutations["header"] + "\n" + mutations["mutant_sequence"] + "\n"
@@ -922,13 +1337,17 @@ def mutate(
 
     # When out=None, return list of mutated seqs
     else:
-        all_mut_seqs = []
-        all_mut_seqs.extend(mutations["mutant_sequence"].values)
-
-        # Remove empty strings from final list of mutated sequences
-        # (these are introduced when unknown mutations are encountered)
-        while "" in all_mut_seqs:
-            all_mut_seqs.remove("")
-
-        if len(all_mut_seqs) > 0:
-            return all_mut_seqs
+        if update_df:
+            return mutations[columns_to_keep]
+        else:
+            all_mut_seqs = []
+            all_mut_seqs.extend(mutations["mutant_sequence"].values)
+    
+            # Remove empty strings from final list of mutated sequences
+            # (these are introduced when unknown mutations are encountered)
+            while "" in all_mut_seqs:
+                all_mut_seqs.remove("")
+    
+            if len(all_mut_seqs) > 0:
+                return all_mut_seqs
+            return []