Skip to content

Commit

Permalink
a sketch
Browse files Browse the repository at this point in the history
  • Loading branch information
matsen authored and willdumm committed Dec 13, 2024
1 parent 22c8873 commit e400070
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 1 deletion.
24 changes: 23 additions & 1 deletion netam/framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,26 @@ def trimmed_shm_model_outputs_of_crepe(crepe, parents):
return trimmed_rates, trimmed_csps


def load_pcp_df(pcp_df_path_gz, sample_count=None, chosen_v_families=None):
def join_chains(pcp_df):
    """Join heavy- and light-chain columns into single `parent` and `child` columns.

    The two chains are concatenated around a "^^^" separator, heavy chain on
    the left and light chain on the right. The first matching layout wins:

    1. `parent_h`, `parent_l`, `child_h` and `child_l` are all present:
       `parent` becomes `parent_h + "^^^" + parent_l` (same for `child`) and
       the four per-chain columns are dropped.
    2. `parent` and `child` are already present: the separator is appended,
       i.e. the existing sequence is placed on the left of the separator.
    3. Only `parent_l` and `child_l` are present (light-chain-only data): the
       separator is prepended and the per-chain columns are dropped.
    4. Only `parent_h` and `child_h` are present (heavy-chain-only data): the
       separator is appended and the per-chain columns are dropped.

    Args:
        pcp_df: DataFrame holding the sequence columns described above. It is
            modified in place.

    Returns:
        The same DataFrame object, now with joined `parent` and `child`
        columns.

    Raises:
        KeyError: If none of the supported column layouts is present. The
            check happens before any column is modified, so the frame is left
            untouched in that case.
    """
    separator = "^^^"
    heavy_cols = ["parent_h", "child_h"]
    light_cols = ["parent_l", "child_l"]

    present = set(pcp_df.columns)
    has_heavy = present.issuperset(heavy_cols)
    has_light = present.issuperset(light_cols)

    if has_heavy and has_light:
        pcp_df["parent"] = pcp_df["parent_h"] + separator + pcp_df["parent_l"]
        pcp_df["child"] = pcp_df["child_h"] + separator + pcp_df["child_l"]
        pcp_df.drop(columns=heavy_cols + light_cols, inplace=True)
    elif present.issuperset(["parent", "child"]):
        # Checked before the single-chain layouts so that frames which already
        # carry `parent`/`child` keep their existing behavior.
        pcp_df["parent"] += separator
        pcp_df["child"] += separator
    elif has_light:
        # Light chain only: pad on the left so the sequence sits after the
        # separator, where the light chain lives in the paired layout.
        pcp_df["parent"] = separator + pcp_df["parent_l"]
        pcp_df["child"] = separator + pcp_df["child_l"]
        pcp_df.drop(columns=light_cols, inplace=True)
    elif has_heavy:
        pcp_df["parent"] = pcp_df["parent_h"] + separator
        pcp_df["child"] = pcp_df["child_h"] + separator
        pcp_df.drop(columns=heavy_cols, inplace=True)
    else:
        raise KeyError(
            "pcp_df must contain `parent` and `child` columns, or "
            "`parent_h`/`child_h` and/or `parent_l`/`child_l` columns; "
            f"found columns: {sorted(map(str, present))}"
        )
    return pcp_df

def load_pcp_df(pcp_df_path_gz, sample_count=None, chosen_v_families=None, joined_mode=False):
"""Load a PCP dataframe from a gzipped CSV file.
`orig_pcp_idx` is the index column from the original file, even if we subset by
Expand All @@ -358,6 +377,9 @@ def load_pcp_df(pcp_df_path_gz, sample_count=None, chosen_v_families=None):
.reset_index()
.rename(columns={"index": "orig_pcp_idx"})
)
if joined_mode:
pcp_df = join_chains(pcp_df)
# TODO assert that we have parent and child columns
pcp_df["v_family"] = pcp_df["v_gene"].str.split("-").str[0]
if chosen_v_families is not None:
chosen_v_families = set(chosen_v_families)
Expand Down
1 change: 1 addition & 0 deletions netam/sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from Bio.Seq import Seq

# The 20 one-letter amino acid codes, in alphabetical order.
AA_STR_SORTED = "ACDEFGHIKLMNPQRSTVWY"
# Amino acid alphabet followed by the "^" character.
# NOTE(review): presumably this lets the "^^^" chain separator be tokenized;
# confirm against the code that consumes this constant.
TOKEN_STR_SORTED = AA_STR_SORTED + "^"
# The four nucleotide letters, in alphabetical order.
NT_STR_SORTED = "ACGT"
CODONS = [
"".join(codon_list)
Expand Down

0 comments on commit e400070

Please sign in to comment.