Feat/sasa (#9) sub-commands contacts, sasa, and seq

* feat(sasa): add method to calculate the per-atom SASA * fix(sasa): ignore waters when calculating SASA * refactor(cli): BREAKING CHANGE: move contacts and sasa to separate sub-commands * style(cli): sort imports * refactor(cli): hardcode output path by contacts * feat(cli): separate contacts and seq sub-commands * docs: version bump to v0.3.0
y1zhou · Aug 14, 2024 · 2dca67f · 2dca67f
1 parent 4146bf7
commit 2dca67f
Show file tree

Hide file tree

Showing 10 changed files with 411 additions and 199 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,3 +15,6 @@ Cargo.lock
 
 # macOS files
 .DS_Store
+
+# Test generated files
+*.csv
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,7 +22,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Removed
 
 - PLACEHOLDER
-- 
+
+## [0.3.0] - 2024-08-14
+
+### Added
+
+- `sasa` command to calculate the atom level SASA
+- `seq` command to extract protein sequences from PDB files
+
+### Fixed
+
+- Only report chains that are part of the ligand or receptor for `contacts`
+
+### Changed
+
+- Moved previous top-level command to `contacts` sub-command
+- Better path parsing
+
 ## [0.2.0] - 2024-08-08
 
 ### Added
@@ -46,6 +62,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Initial release
 - Detection of common protein-protein interactions in a PDB or mmCIF file
 
-[unreleased]: https://github.com/y1zhou/arpeggia/compare/v0.2.0...HEAD
-[0.1.0]: https://github.com/y1zhou/arpeggia/releases/tag/v0.1.0
+[unreleased]: https://github.com/y1zhou/arpeggia/compare/v0.3.0...HEAD
+[0.2.0]: https://github.com/y1zhou/arpeggia/releases/tag/v0.3.0
 [0.2.0]: https://github.com/y1zhou/arpeggia/releases/tag/v0.2.0
+[0.1.0]: https://github.com/y1zhou/arpeggia/releases/tag/v0.1.0
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,8 +1,9 @@
 [package]
 name = "arpeggia"
-version = "0.2.0"
+version = "0.3.0"
 description = "A port of the Arpeggio library for Rust"
 edition = "2021"
+authors = ["Yi Zhou <[email protected]>"]
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
@@ -13,5 +14,6 @@ pdbtbx = { version = "0.11.0", features = ["rayon", "rstar"], git = "https://git
 polars = { version = "0.39.2", features = ["lazy"] }
 rayon = "1.10.0"
 rstar = "0.12.0"
+rust-sasa = "0.2.2"
 tracing = "0.1.40"
 tracing-subscriber = "0.3.18"
diff --git a/src/cli/contacts.rs b/src/cli/contacts.rs
@@ -0,0 +1,172 @@
+use crate::interactions::{InteractionComplex, Interactions, ResultEntry};
+use crate::utils::{load_model, write_df_to_csv};
+use clap::Parser;
+use pdbtbx::*;
+use polars::prelude::*;
+use std::path::{Path, PathBuf};
+use tracing::{debug, error, info, trace, warn};
+
+#[derive(Parser, Debug, Clone)]
+#[command(version, about)]
+pub(crate) struct Args {
+    /// Path to the PDB or mmCIF file to be analyzed
+    #[arg(short, long)]
+    input: PathBuf,
+
+    /// Output directory
+    #[arg(short, long)]
+    output: PathBuf,
+
+    /// Group chains for interactions:
+    /// e.g. A,B/C,D
+    /// where chains A and B are the "ligand" and C and D are the "receptor"
+    #[arg(short, long)]
+    groups: String,
+
+    /// Compensation factor for VdW radii dependent interaction types
+    #[arg(short = 'c', long = "vdw-comp", default_value_t = 0.1)]
+    vdw_comp: f64,
+
+    /// Distance cutoff when searching for neighboring atoms
+    #[arg(short, long, default_value_t = 4.5)]
+    dist_cutoff: f64,
+
+    /// Number of threads to use for parallel processing
+    #[arg(short = 'j', long = "num-threads", default_value_t = 0)]
+    num_threads: usize,
+}
+
+pub(crate) fn run(args: &Args) {
+    trace!("{args:?}");
+
+    // Create Rayon thread pool
+    rayon::ThreadPoolBuilder::new()
+        .num_threads(args.num_threads)
+        .build_global()
+        .unwrap();
+    debug!("Using {} thread(s)", rayon::current_num_threads());
+
+    // Make sure `input` exists
+    let input_path = Path::new(&args.input).canonicalize().unwrap();
+    let input_file: String = input_path.to_str().unwrap().parse().unwrap();
+
+    // Load file as complex structure
+    let (pdb, pdb_warnings) = load_model(&input_file);
+    if !pdb_warnings.is_empty() {
+        pdb_warnings.iter().for_each(|e| match e.level() {
+            pdbtbx::ErrorLevel::BreakingError => error!("{e}"),
+            pdbtbx::ErrorLevel::InvalidatingError => error!("{e}"),
+            _ => warn!("{e}"),
+        });
+    }
+
+    let (df_atomic, df_ring, i_complex) =
+        get_contacts(pdb, args.groups.as_str(), args.vdw_comp, args.dist_cutoff);
+
+    // Information on the sequence of the chains in the model
+    let num_chains = i_complex.ligand.len() + i_complex.receptor.len();
+    info!(
+        "Loaded {} {}",
+        num_chains,
+        match num_chains {
+            1 => "chain",
+            _ => "chains",
+        }
+    );
+
+    debug!(
+        "Parsed ligand chains {lig:?}; receptor chains {receptor:?}",
+        lig = i_complex.ligand,
+        receptor = i_complex.receptor
+    );
+
+    // Prepare output directory
+    let output_path = Path::new(&args.output).canonicalize().unwrap();
+    let _ = std::fs::create_dir_all(output_path.clone());
+    let output_file = output_path.join("contacts.csv");
+
+    let output_file_str = output_file.to_str().unwrap();
+    debug!("Results will be saved to {output_file_str}");
+
+    // Save results and log the identified interactions
+    info!(
+        "Found {} atom-atom contacts\n{}",
+        df_atomic.shape().0,
+        df_atomic
+    );
+    let df_clash = df_atomic
+        .clone()
+        .lazy()
+        .filter(col("interaction").eq(lit("StericClash")))
+        .collect()
+        .unwrap();
+    if df_clash.height() > 0 {
+        warn!("Found {} steric clashes\n{}", df_clash.shape().0, df_clash);
+    }
+    info!("Found {} ring contacts\n{}", df_ring.shape().0, df_ring);
+
+    // Concate dataframes for saving to CSV
+    let mut df_contacts = concat(
+        [df_atomic.clone().lazy(), df_ring.clone().lazy()],
+        UnionArgs::default(),
+    )
+    .unwrap()
+    .collect()
+    .unwrap();
+
+    // Save res to CSV files
+    write_df_to_csv(&mut df_contacts, output_file);
+}
+
+pub fn get_contacts(
+    pdb: PDB,
+    groups: &str,
+    vdw_comp: f64,
+    dist_cutoff: f64,
+) -> (DataFrame, DataFrame, InteractionComplex) {
+    let i_complex = InteractionComplex::new(pdb, groups, vdw_comp, dist_cutoff);
+
+    // Find interactions
+    let atomic_contacts = i_complex.get_atomic_contacts();
+    let df_atomic = results_to_df(&atomic_contacts);
+
+    let mut ring_contacts: Vec<ResultEntry> = Vec::new();
+    ring_contacts.extend(i_complex.get_ring_atom_contacts());
+    ring_contacts.extend(i_complex.get_ring_ring_contacts());
+    let df_ring = results_to_df(&ring_contacts)
+        // .drop_many(&["from_atomn", "from_atomi", "to_atomn", "to_atomi"])
+        .sort(
+            [
+                "from_chain",
+                "from_resi",
+                "from_altloc",
+                "to_chain",
+                "to_resi",
+                "to_altloc",
+            ],
+            Default::default(),
+        )
+        .unwrap();
+
+    (df_atomic, df_ring, i_complex)
+}
+
+fn results_to_df(res: &[ResultEntry]) -> DataFrame {
+    df!(
+        "interaction" => res.iter().map(|x| x.interaction.to_string()).collect::<Vec<String>>(),
+        "distance" => res.iter().map(|x| x.distance).collect::<Vec<f64>>(),
+        "from_chain" => res.iter().map(|x| x.ligand.chain.to_owned()).collect::<Vec<String>>(),
+        "from_resn" => res.iter().map(|x| x.ligand.resn.to_owned()).collect::<Vec<String>>(),
+        "from_resi" => res.iter().map(|x| x.ligand.resi as i64).collect::<Vec<i64>>(),
+        "from_altloc" => res.iter().map(|x| x.ligand.altloc.to_owned()).collect::<Vec<String>>(),
+        "from_atomn" => res.iter().map(|x| x.ligand.atomn.to_owned()).collect::<Vec<String>>(),
+        "from_atomi" => res.iter().map(|x| x.ligand.atomi as i64).collect::<Vec<i64>>(),
+        "to_chain" => res.iter().map(|x| x.receptor.chain.to_owned()).collect::<Vec<String>>(),
+        "to_resn" => res.iter().map(|x| x.receptor.resn.to_owned()).collect::<Vec<String>>(),
+        "to_resi" => res.iter().map(|x| x.receptor.resi as i64).collect::<Vec<i64>>(),
+        "to_altloc" => res.iter().map(|x| x.receptor.altloc.to_owned()).collect::<Vec<String>>(),
+        "to_atomn" => res.iter().map(|x| x.receptor.atomn.to_owned()).collect::<Vec<String>>(),
+        "to_atomi" => res.iter().map(|x| x.receptor.atomi as i64).collect::<Vec<i64>>(),
+    )
+    .unwrap()
+}
diff --git a/src/cli/mod.rs b/src/cli/mod.rs
@@ -0,0 +1,3 @@
+pub mod contacts;
+pub mod pdb2seq;
+pub mod sasa;
diff --git a/src/cli/pdb2seq.rs b/src/cli/pdb2seq.rs
@@ -0,0 +1,29 @@
+use crate::chains::ChainExt;
+use clap::Parser;
+use std::path::PathBuf;
+
+use crate::utils::load_model;
+use std::path::Path;
+
+#[derive(Parser, Debug, Clone)]
+#[command(version, about)]
+pub(crate) struct Args {
+    /// Path to the PDB or mmCIF file to be analyzed
+    input: Vec<PathBuf>,
+}
+
+pub(crate) fn run(args: &Args) {
+    for f in &args.input {
+        // Make sure `input` exists
+        let input_path = Path::new(f).canonicalize().unwrap();
+        let input_file: String = input_path.to_str().unwrap().parse().unwrap();
+
+        // Load file and print sequences
+        let (pdb, _) = load_model(&input_file);
+        println!("File: {}", input_file);
+        for chain in pdb.chains() {
+            println!("{}: {}", chain.id(), chain.pdb_seq().join(""));
+        }
+        println!();
+    }
+}