a sketch

matsengrp · Dec 13, 2024 · e400070 · e400070
1 parent 22c8873
commit e400070
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 1 deletion.
diff --git a/netam/framework.py b/netam/framework.py
@@ -347,7 +347,26 @@ def trimmed_shm_model_outputs_of_crepe(crepe, parents):
     return trimmed_rates, trimmed_csps
 
 
-def load_pcp_df(pcp_df_path_gz, sample_count=None, chosen_v_families=None):
+def join_chains(pcp_df):
+    """Join the per-chain parent and child columns of pcp_df, modifying pcp_df in place.
+
+    If parent_h, parent_l, child_h and child_l all exist, set parent = parent_h + "^^^" + parent_l
+    (same for child) and drop those four columns; otherwise append "^^^" to parent and child.
+    TODO update for case of just parent and child
+    """
+    if "parent_h" in pcp_df.columns and "parent_l" in pcp_df.columns and "child_h" in pcp_df.columns and "child_l" in pcp_df.columns:
+        pcp_df["parent"] = pcp_df["parent_h"] + "^^^" + pcp_df["parent_l"]
+        pcp_df["child"] = pcp_df["child_h"] + "^^^" + pcp_df["child_l"]
+        pcp_df.drop(columns=["parent_h", "parent_l", "child_h", "child_l"], inplace=True)
+    else:
+        # Fallback: the four per-chain columns are not all present, so the existing parent and child columns get "^^^" appended.
+        # TODO: data sets that are light chain only would need the "^^^" padding on the left instead. Suggested plan: require
+        # such columns to be named parent_l and child_l and check for them here; allow parent_h/child_h alone as well.
+        pcp_df["parent"] += "^^^"
+        pcp_df["child"] += "^^^"
+    return pcp_df
+
+def load_pcp_df(pcp_df_path_gz, sample_count=None, chosen_v_families=None, joined_mode=False):
     """Load a PCP dataframe from a gzipped CSV file.
 
     `orig_pcp_idx` is the index column from the original file, even if we subset by
@@ -358,6 +377,9 @@ def load_pcp_df(pcp_df_path_gz, sample_count=None, chosen_v_families=None):
         .reset_index()
         .rename(columns={"index": "orig_pcp_idx"})
     )
+    if joined_mode:
+        pcp_df = join_chains(pcp_df)
+    # TODO assert that we have parent and child columns
     pcp_df["v_family"] = pcp_df["v_gene"].str.split("-").str[0]
     if chosen_v_families is not None:
         chosen_v_families = set(chosen_v_families)

diff --git a/netam/sequences.py b/netam/sequences.py
@@ -9,6 +9,7 @@
 from Bio.Seq import Seq
 
 AA_STR_SORTED = "ACDEFGHIKLMNPQRSTVWY"
+TOKEN_STR_SORTED = "ACDEFGHIKLMNPQRSTVWY^"
 NT_STR_SORTED = "ACGT"
 CODONS = [
     "".join(codon_list)