Skip to content

Commit

Permalink
feat: add sequence reader trait
Browse files Browse the repository at this point in the history
  • Loading branch information
suchapalaver committed Dec 29, 2022
1 parent 1ddda78 commit ef34f0f
Show file tree
Hide file tree
Showing 9 changed files with 198 additions and 28 deletions.
118 changes: 118 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ bytes = "1.3.0"
custom_error = "1.9.2"
dashmap = "5.4.0"
fxhash = "0.2.1"
needletail = "0.4.1"
rayon = "*"

[dev-dependencies]
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@

`krust` is a [k-mer](https://en.wikipedia.org/wiki/K-mer) counter--a bioinformatics 101 tool for counting the frequency of substrings of length `k` within strings of DNA data. It's written in Rust and run from the command line. It takes a fasta file of DNA sequences and will output all canonical k-mers (the double helix means each k-mer has a [reverse complement](https://en.wikipedia.org/wiki/Complementarity_(molecular_biology)#DNA_and_RNA_base_pair_complementarity)) and their frequency across all records in the given fasta file.

Run `krust` to count *5*-mers like this:
`krust` supports either `rust-bio`, by default, or `needletail`, with **any** additional command line argument, for FASTA reading.

Run `krust` with `rust-bio`'s FASTA reader to count *5*-mers like this:

```bash
cargo run --release 5 your/local/path/to/fasta_data.fa > output.tsv
```

or, searching for *21*-mers:
or, searching for *21*-mers with `needletail` as the FASTA reader like this:

```bash
cargo run --release 21 your/local/path/to/fasta_data.fa > output.tsv
cargo run --release 21 your/local/path/to/fasta_data.fa . > output.tsv
```

`krust` prints to `stdout`, writing, on alternate lines:
Expand All @@ -23,5 +25,3 @@ cargo run --release 21 your/local/path/to/fasta_data.fa > output.tsv
{canonical k-mer}
...
```

`krust` uses [`rust-bio`](https://docs.rs/bio/0.38.0/bio/), [`rayon`](https://docs.rs/rayon/1.5.1/rayon/), and [`dashmap`](https://docs.rs/crate/dashmap/4.0.2).
5 changes: 4 additions & 1 deletion src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use std::{env, error::Error, path::PathBuf};
pub struct Config {
pub k: usize,
pub path: PathBuf,
pub reader: bool,
}

impl Config {
Expand All @@ -22,6 +23,8 @@ impl Config {
None => return Err("filepath argument needed".into()),
};

Ok(Config { k, path })
let reader = args.next().is_some();

Ok(Config { k, path, reader })
}
}
2 changes: 1 addition & 1 deletion src/kmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ impl Kmer {
pub(crate) fn pack(&mut self) {
let iter = match self.bytes.is_empty() {
true => &self.reverse_complement,
false => &self.bytes
false => &self.bytes,
};
for elem in iter.iter() {
self.packed_bits <<= 2;
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@
pub mod config;
pub(crate) mod kmer;
pub(crate) mod reader;
pub mod startup;
2 changes: 1 addition & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ fn main() {
process::exit(1);
});

if let Err(e) = startup::run(config.path, config.k) {
if let Err(e) = startup::run(config.path, config.k, config.reader) {
eprintln!("Application error: {}", e);
drop(e);
process::exit(1);
Expand Down
41 changes: 41 additions & 0 deletions src/reader.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
use std::{error::Error, fmt::Debug, path::Path, vec::IntoIter};

use bio::io::fasta::Reader;
use bytes::Bytes;
use needletail::parse_fastx_file;
use rayon::prelude::{ParallelBridge, ParallelIterator};

pub(crate) trait SequenceReader {
fn sequence_reader<P: AsRef<Path> + Debug>(path: P) -> Result<IntoIter<Bytes>, Box<dyn Error>>;
}

pub(crate) struct RustBio;

impl SequenceReader for RustBio {
fn sequence_reader<P: AsRef<Path> + Debug>(path: P) -> Result<IntoIter<Bytes>, Box<dyn Error>> {
Ok(Reader::from_file(path)?
.records()
.into_iter()
.par_bridge()
.map(|read| read.expect("Error reading fasta record."))
.map(|record| Bytes::copy_from_slice(record.seq()))
.collect::<Vec<Bytes>>()
.into_iter())
}
}

pub(crate) struct Needletail;

impl SequenceReader for Needletail {
fn sequence_reader<P: AsRef<Path> + Debug>(path: P) -> Result<IntoIter<Bytes>, Box<dyn Error>> {
let mut reader = parse_fastx_file(path)?;
let mut v = Vec::new();
while let Some(record) = reader.next() {
let record = record.expect("invalid record");
let seq = record.seq();
let seq = Bytes::copy_from_slice(&seq);
v.push(seq);
}
Ok(v.into_iter())
}
}
46 changes: 26 additions & 20 deletions src/startup.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use super::kmer::Kmer;
use bio::io::fasta::Reader;
use super::{
kmer::Kmer,
reader::{Needletail, RustBio, SequenceReader},
};
use bytes::Bytes;
use dashmap::DashMap;
use fxhash::FxHasher;
Expand All @@ -18,11 +20,20 @@ custom_error::custom_error! { pub ProcessError
WriteError{source: IoError} = "Unable to write output",
}

pub fn run<P>(path: P, k: usize) -> Result<(), ProcessError>
pub fn run<P>(path: P, k: usize, reader: bool) -> Result<(), ProcessError>
where
P: AsRef<Path> + Debug,
{
DashFx::new().build(path, k)?.output(k)?;
let (reader, name) = match reader {
true => (Needletail::sequence_reader(path), "needletail"),
false => (RustBio::sequence_reader(path), "rust-bio")
};

println!("\nReading fasta with {} ...", name);

DashFx::new()
.build(reader?, k)?
.output(k)?;

Ok(())
}
Expand All @@ -35,7 +46,11 @@ type DashFx = DashMap<u64, i32, BuildHasherDefault<FxHasher>>;

trait KmerMap {
fn new() -> Self;
fn build<P: AsRef<Path> + Debug>(self, path: P, k: usize) -> Result<Self, Box<dyn Error>>
fn build(
self,
sequences: impl Iterator<Item = Bytes>,
k: usize,
) -> Result<Self, Box<dyn Error>>
where
Self: Sized;
fn process_sequence(&self, seq: &Bytes, k: &usize);
Expand All @@ -53,8 +68,12 @@ impl KmerMap for DashFx {
/// using a customized [`dashmap`](https://docs.rs/dashmap/4.0.2/dashmap/struct.DashMap.html)
/// with [`FxHasher`](https://docs.rs/fxhash/0.2.1/fxhash/struct.FxHasher.html) to update in parallel a
/// hashmap of canonical k-mers (keys) and their frequency in the data (values)
fn build<P: AsRef<Path> + Debug>(self, path: P, k: usize) -> Result<Self, Box<dyn Error>> {
for seq in sequence_reader(path)? {
fn build(
self,
sequences: impl Iterator<Item = Bytes>,
k: usize,
) -> Result<Self, Box<dyn Error>> {
for seq in sequences {
self.process_sequence(&seq, &k)
}

Expand Down Expand Up @@ -143,16 +162,3 @@ impl KmerMap for DashFx {
Ok(())
}
}

fn sequence_reader<P: AsRef<Path> + Debug>(
path: P,
) -> Result<impl Iterator<Item = Bytes>, Box<dyn Error>> {
Ok(Reader::from_file(path)?
.records()
.into_iter()
.par_bridge()
.map(|read| read.expect("Error reading fasta record."))
.map(|record| Bytes::copy_from_slice(record.seq()))
.collect::<Vec<Bytes>>()
.into_iter())
}

0 comments on commit ef34f0f

Please sign in to comment.