Skip to content

Commit

Permalink
Merge pull request #16 from anergictcell/release/0.2.0
Browse files Browse the repository at this point in the history
Release/0.2.0

- Add GeneticCode to modify the translation table based on the applied genetic code. This change impacts some public functions of the QC-check module.
- Allow generic `Read + Seek` objects for FastaReader. This enables reading directly from S3 or other remote sources.
- Allow `FastaWriter` to write to different files (only one at a time). This means you don't have to instantiate a new `FastaWriter` for every output file, but can reuse an existing instance and simply change the output writer.
  • Loading branch information
anergictcell authored Oct 8, 2022
2 parents 02640ef + b7313a2 commit 7f909a7
Show file tree
Hide file tree
Showing 20 changed files with 1,206 additions and 269 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/target
.DS_Store
Cargo.toml.bck
Cargo.lock
tests/data/hg19.fasta*
tests/data/prepare_example_files.sh
tests/data/hg19.ncbiRefSeq.gtf
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

# 0.2
- Add GeneticCode to modify the translation table based on the applied genetic code. This change impacts some public functions of the QC-check module.
- Allow generic `Read + Seek` objects for FastaReader. This enables reading directly from S3 or other remote sources.
- Allow `FastaWriter` to write to different files (only one at a time). This means you don't have to instantiate a new `FastaWriter` for every output file, but can reuse an existing instance and simply change the output writer.

# 0.1.3
- Add QC-check module to check if transcripts make sense with a given reference genome.

Expand Down
81 changes: 0 additions & 81 deletions Cargo.lock

This file was deleted.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "atglib"
version = "0.1.4"
version = "0.2.0"
edition = "2021"
authors = ["Jonas Marcello <[email protected]>"]
description = "A library to handle transcripts for genomics and transcriptomics"
Expand Down
32 changes: 32 additions & 0 deletions JUSTFILE
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
set positional-arguments

new version:
#!/usr/bin/env sh
if git rev-parse --quiet --verify release/{{version}} > /dev/null; then
echo "Release branch exists already"
else
echo "Creating release branch release/{{version}}"
git checkout main && \
git pull && \
git checkout -b release/{{version}} && \
sed -i .bck "s/^version =.*$/version = \"{{version}}\"/" ./Cargo.toml && \
cargo check && \
git commit -am "Prepare release branch {{version}}" && \
git push -u origin release/{{version}}
fi
@check version:
git checkout release/{{version}} && git pull
echo "Running linter and unittests"
cargo clippy && cargo fmt && cargo test -q && cargo doc

@release version:
git tag {{version}}
git push --tags
cargo publish

test:
#!/usr/bin/env zsh
echo -ne "Checking formatting and doc generation"
(cargo clippy && cargo fmt --check && cargo test -q && cargo doc && \
echo " \e[32m\e[1mOK\e[0m") || echo "\e[31m\e[1mERROR\e[0m"
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ When it comes to functional correctness, I try my best to test the functionality
- [ ] Parallelize input parsing
- [ ] Check if exons can be stored in smaller vec
- [ ] Use std::mem::replace to move out of attributes, e.g. in TranscriptBuilder and remove Copy/Clone traits <https://stackoverflow.com/questions/31307680/how-to-move-one-field-out-of-a-struct-that-implements-drop-trait>
- [ ] Change `Codon` to `GenomicCodon`
- [ ] Update error handling and streamline error types

## Known issues
### GTF parsing
Expand Down
149 changes: 95 additions & 54 deletions src/fasta/reader.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::collections::HashMap;
use std::convert::TryFrom;
use std::fs::{read_to_string, File};
use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom};

use std::path::Path;
Expand Down Expand Up @@ -75,20 +75,18 @@ struct FastaIndex {
}

impl FastaIndex {
/// Creates a new [`FastaIndex`] by parsing the fai file
pub fn new<P: AsRef<Path> + std::fmt::Display>(filename: P) -> FastaResult<Self> {
/// Creates a new [`FastaIndex`] from a `Reader`
pub fn from_reader<R: std::io::Read>(mut reader: R) -> FastaResult<Self> {
let mut content = String::new();
reader.read_to_string(&mut content)?;
Self::from_str(&content)
}

fn from_str(content: &str) -> FastaResult<Self> {
let mut idx = Self {
chromosomes: HashMap::new(),
};
let content = match read_to_string(&filename) {
Ok(x) => x,
Err(err) => {
return Err(FastaError::new(format!(
"Unable to read from fasta index file {}: {}",
filename, err
)))
}
};

for line in content.lines() {
let chrom = ChromosomeIndex::new(line)?;
idx.chromosomes.insert(chrom.name().to_string(), chrom);
Expand Down Expand Up @@ -164,9 +162,84 @@ impl FastaReader<File> {
/// ```
pub fn from_file<P: AsRef<Path> + std::fmt::Display>(path: P) -> FastaResult<Self> {
let fai_path = format!("{}.fai", path);
Self::new(path, fai_path)
FastaReader::new(path, fai_path)
}

/// Builds a `FastaReader` from explicit paths to the fasta file and its index
///
/// Reach for this constructor when the fasta-index file (fai) is not located
/// at `<fasta path>.fai`. For the conventional layout,
/// [`from_file`](`FastaReader::from_file`) is the more convenient choice.
///
/// # Errors
///
/// Returns an error if either file cannot be opened, or if constructing the
/// reader from the two opened files fails.
///
/// # Examples
///
/// ```rust
/// use atglib;
/// use atglib::fasta::FastaReader;
/// let mut reader = FastaReader::new("tests/data/small.fasta", "tests/data/small.fasta.fai").unwrap();
/// let seq = reader.read_sequence("chr1", 1, 10).unwrap();
/// assert_eq!(&seq.to_string(), "GCCTCAGAGG");
/// ```
pub fn new<P: AsRef<Path> + std::fmt::Display, P2: AsRef<Path> + std::fmt::Display>(
    fasta_path: P,
    fai_path: P2,
) -> FastaResult<Self> {
    // The fasta file is opened first, so a missing fasta file is reported
    // before a missing index file.
    let fasta_handle = File::open(fasta_path.as_ref()).map_err(|io_err| {
        FastaError::new(format!(
            "unable to open fasta file {}: {}",
            fasta_path, io_err
        ))
    })?;

    let fai_handle = File::open(fai_path.as_ref()).map_err(|io_err| {
        FastaError::new(format!(
            "unable to open fasta index file {}: {}",
            fai_path, io_err
        ))
    })?;

    FastaReader::from_reader(fasta_handle, fai_handle)
}
}

impl<R: std::io::Read> FastaReader<R> {
    /// Creates a `FastaReader` from `Reader` instances for the Fasta file and the Fasta index
    ///
    /// This constructor is meant for sources that are not plain files on the
    /// file system, e.g. HTTP streams. For regular files,
    /// [`from_file`](`FastaReader::from_file`) is the simpler option.
    ///
    /// The fasta reader is wrapped in a `BufReader`, while the index reader is
    /// handed to the index parser right away.
    ///
    /// # Errors
    ///
    /// Returns an error if the fasta index cannot be built from `fai_reader`.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use std::fs::File;
    /// use atglib;
    /// use atglib::fasta::FastaReader;
    /// let mut fasta = File::open("tests/data/small.fasta").unwrap();
    /// let mut index = File::open("tests/data/small.fasta.fai").unwrap();
    ///
    /// let mut reader = FastaReader::from_reader(fasta, index).unwrap();
    /// let seq = reader.read_sequence("chr1", 1, 10).unwrap();
    /// assert_eq!(&seq.to_string(), "GCCTCAGAGG");
    /// ```
    pub fn from_reader<R2: std::io::Read>(
        fasta_reader: R,
        fai_reader: R2,
    ) -> FastaResult<FastaReader<R>> {
        // Same order as the struct fields: buffer the fasta source, then parse the index.
        let inner = BufReader::new(fasta_reader);
        let idx = FastaIndex::from_reader(fai_reader)?;
        Ok(FastaReader { inner, idx })
    }
}

impl<R: std::io::Read + std::io::Seek> FastaReader<R> {
/// Returns the raw-bytes of the Fasta file for the genomic range
///
/// Reads from the FastaReader and returns the raw bytes
Expand Down Expand Up @@ -201,7 +274,7 @@ impl FastaReader<File> {
/// let mut reader = FastaReader::from_file("tests/data/small.fasta").unwrap();
///
/// // read the nucleotide at position 150 of chromosome M
/// let seq = reader.read_sequence("chr5", 150, 150).unwrap();
/// let seq = reader.read_sequence("chrM", 150, 150).unwrap();
/// assert_eq!(&seq.to_string(), "G");
///
/// // read the first 10 nucleotides of chromosome 1
Expand All @@ -216,48 +289,15 @@ impl FastaReader<File> {
let length = usize::try_from(end - start)?;
Ok(Sequence::from_raw_bytes(&raw_bytes, length)?)
}

/// Creates a `FastaReader` by specifying both fasta and fai file
///
/// Use this method if the fasta-index file (fai) does not follow standard
/// naming conventions. In most cases, you want to use
/// [`from_file`](`FastaReader::from_file`) instead.
///
/// # Examples
///
/// ```rust
/// use atglib;
/// use atglib::fasta::FastaReader;
/// let mut reader = FastaReader::new("tests/data/small.fasta", "tests/data/small.fasta.fai").unwrap();
/// let seq = reader.read_sequence("chr1", 1, 10).unwrap();
/// assert_eq!(&seq.to_string(), "GCCTCAGAGG");
/// ```
pub fn new<P: AsRef<Path> + std::fmt::Display, P2: AsRef<Path> + std::fmt::Display>(
fasta_path: P,
fai_path: P2,
) -> FastaResult<Self> {
let reader = match File::open(fasta_path.as_ref()) {
Ok(x) => x,
Err(err) => {
return Err(FastaError::new(format!(
"unable to open fasta file {}: {}",
fasta_path, err
)))
}
};
Ok(FastaReader {
inner: BufReader::new(reader),
idx: FastaIndex::new(fai_path)?,
})
}
}

#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fai_reading() {
let fai = FastaIndex::new("tests/data/small.fasta.fai").unwrap();
let fai =
FastaIndex::from_reader(File::open("tests/data/small.fasta.fai").unwrap()).unwrap();
assert_eq!(fai.offset("chr1", 1).unwrap(), 6);
assert_eq!(fai.offset("chr1", 50).unwrap(), 55);
assert_eq!(fai.offset("chr1", 51).unwrap(), 57);
Expand All @@ -271,7 +311,8 @@ mod tests {

#[test]
fn test_fai_errors() {
let fai = FastaIndex::new("tests/data/small.fasta.fai").unwrap();
let fai =
FastaIndex::from_reader(File::open("tests/data/small.fasta.fai").unwrap()).unwrap();
assert_eq!(
fai.offset("chr6", 1).unwrap_err().to_string(),
"index for chr6 does not exist".to_string()
Expand All @@ -298,7 +339,7 @@ mod tests {
);

assert_eq!(
fai.offset("chr5", 151).unwrap_err().to_string(),
fai.offset("chrM", 151).unwrap_err().to_string(),
"position 151 is greater than chromome length 150".to_string()
);
}
Expand Down Expand Up @@ -327,16 +368,16 @@ mod tests {
let seq = fasta.read_sequence("chr4", 148, 149).unwrap();
assert_eq!(&seq.to_string(), "TA");

let seq = fasta.read_sequence("chr5", 101, 150).unwrap();
let seq = fasta.read_sequence("chrM", 101, 150).unwrap();
assert_eq!(
&seq.to_string(),
"TGACCTGCAGGGTCGAGGAGTTGACGGTGCTGAGTTCCCTGCACTCTCAG"
);

let seq = fasta.read_sequence("chr5", 150, 150).unwrap();
let seq = fasta.read_sequence("chrM", 150, 150).unwrap();
assert_eq!(&seq.to_string(), "G");

let seq = fasta.read_sequence("chr5", 1, 150).unwrap();
let seq = fasta.read_sequence("chrM", 1, 150).unwrap();
assert_eq!(seq.len(), 150);
}
}
Loading

0 comments on commit 7f909a7

Please sign in to comment.