Skip to content

Commit

Permalink
feat: use cargo features to select reader
Browse files Browse the repository at this point in the history
  • Loading branch information
suchapalaver committed Jan 15, 2023
1 parent a6a1845 commit 1e21228
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 82 deletions.
9 changes: 8 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,11 @@ needletail = "0.4.1"
rayon = "*"

[dev-dependencies]
insta = "1.14.1"
insta = "1.14.1"

[features]
default = ["rust-bio"]
# Use needletail fasta reader
needletail = []
# Use rust-bio fasta reader
rust-bio = []
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## Counts k-mers, written in rust

`krust` is a [k-mer](https://en.wikipedia.org/wiki/K-mer) counter - a bioinformatics 101 tool for counting the frequency of substrings of length `k` within strings of DNA data. `krust` is written in Rust and run from the command line. It takes a fasta file of DNA sequences and will output all canonical k-mers (the double helix means each k-mer has a [reverse complement](https://en.wikipedia.org/wiki/Complementarity_(molecular_biology)#DNA_and_RNA_base_pair_complementarity)) and their frequency across all records in the given data. `krust` is tested for accuracy against [jellyfish](https://github.com/gmarcais/Jellyfish).

```bash
Usage: krust <k> <path> [reader]

Expand All @@ -15,20 +17,18 @@ Options:
-V, --version Print version information
```
`krust` is a [k-mer](https://en.wikipedia.org/wiki/K-mer) counter - a bioinformatics 101 tool for counting the frequency of substrings of length `k` within strings of DNA data. `krust` is written in Rust and run from the command line. It takes a fasta file of DNA sequences and will output all canonical k-mers (the double helix means each k-mer has a [reverse complement](https://en.wikipedia.org/wiki/Complementarity_(molecular_biology)#DNA_and_RNA_base_pair_complementarity)) and their frequency across all records in the given data. `krust` is tested for accuracy against [jellyfish](https://github.com/gmarcais/Jellyfish).
`krust` supports either `rust-bio` or `needletail` to read fasta records.
`krust` supports either `rust-bio` or `needletail` to read fasta records. Use the `--features` flag to select which reader is compiled in.
Run `krust` with `rust-bio`'s fasta reader to count *5*-mers like this:
```bash
cargo run --release 5 your/local/path/to/fasta_data.fa
cargo run --release --features rust-bio -- 5 your/local/path/to/fasta_data.fa
```
or, searching for *21*-mers with `needletail` as the fasta reader like this:
or, searching for *21*-mers with `needletail` as the fasta reader, like this:
```bash
cargo run --release 21 your/local/path/to/fasta_data.fa needletail
cargo run --release --features needletail -- 21 your/local/path/to/fasta_data.fa
```
`krust` prints to `stdout`, writing, on alternate lines:
Expand Down
11 changes: 2 additions & 9 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@ use colored::Colorize;
pub struct Config {
pub k: usize,
pub path: PathBuf,
pub reader: bool,
}

impl Config {
pub fn new(k: &str, path: &str, reader: &str) -> Result<Config, Box<dyn Error>> {
pub fn new(k: &str, path: &str) -> Result<Config, Box<dyn Error>> {
let k: usize = match k.parse::<usize>() {
Ok(k) if k > 0 && k < 33 => k,
Ok(_) => return Err("k-mer length needs to be larger than zero and, for krust currently, no more than 32".into()),
Expand All @@ -21,12 +20,6 @@ impl Config {
Err(e) => return Err(format!("Issue with file path: {}", e.to_string().bold()).into()),
};

let reader = match reader {
reader if matches!(reader, "needletail") => true,
reader if matches!(reader, "rust-bio") => false,
_ => return Err(format!("Invalid reader argument: \"{}\"", reader.bold()).into()),
};

Ok(Config { k, path, reader })
Ok(Config { k, path })
}
}
38 changes: 20 additions & 18 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,44 +20,46 @@ fn main() {
.help("path to a FASTA file, e.g. /home/lisa/bio/cerevisiae.pan.fa")
.required(true),
)
.arg(
Arg::new("reader")
.help("select *rust-bio* or *needletail* as FASTA reader")
.required(false)
.default_value("rust-bio"),
)
.get_matches();

let k = matches.get_one::<String>("k").expect("required");
let path = matches.get_one::<String>("path").expect("required");
let reader = matches.get_one::<String>("reader").unwrap();

eprintln!();
println!();

let config = Config::new(k, path, reader).unwrap_or_else(|e| {
eprintln!(
let config = Config::new(k, path).unwrap_or_else(|e| {
println!(
"{}\n {}",
"Problem parsing arguments:".blue().bold(),
e.to_string().blue()
);
eprintln!();
eprintln!(
println!();
println!(
"{}\n {}\n {}\n {}",
"Help menu:".blue().bold(),
"$ cargo run -- --help".bold(),
"or".underline(),
"$ krust --help".bold()
);
eprintln!();
println!();
process::exit(1);
});

eprintln!("{}: {}", "k-length".bold(), k.blue().bold());
eprintln!("{}: {}", "data".bold(), path.underline().bold().blue());
eprintln!("{}: {}", "reader".bold(), reader.blue().bold());
eprintln!();
println!("{}: {}", "k-length".bold(), k.blue().bold());
println!("{}: {}", "data".bold(), path.underline().bold().blue());
println!(
"{}: {}",
"reader".bold(),
match cfg!(feature = "needletail") {
true => "needletail",
_ => "rust-bio",
}
.blue()
.bold()
);
println!();

if let Err(e) = run::run(config.path, config.k, config.reader) {
if let Err(e) = run::run(config.path, config.k) {
eprintln!(
"{}\n {}",
"Application error:".blue().bold(),
Expand Down
55 changes: 20 additions & 35 deletions src/reader.rs
Original file line number Diff line number Diff line change
@@ -1,43 +1,28 @@
use std::{error::Error, fmt::Debug, path::Path};

use bio::io::fasta::Reader;
use bytes::Bytes;
use needletail::parse_fastx_file;
use rayon::{
prelude::IntoParallelIterator,
vec::IntoIter,
};
use rayon::{prelude::IntoParallelIterator, vec::IntoIter};

pub(crate) trait SequenceReader {
fn sequence_reader<P: AsRef<Path> + Debug>(path: P) -> Result<IntoIter<Bytes>, Box<dyn Error>>;
/// Reads every sequence from the FASTA file at `path` using the `bio` crate's
/// FASTA reader and returns them as a rayon parallel iterator over [`Bytes`].
///
/// This variant is compiled only when the `needletail` feature is disabled.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, or if any record cannot be
/// read. A bad record is reported to the caller instead of aborting the
/// process with a panic.
#[cfg(not(feature = "needletail"))]
pub(crate) fn read<P: AsRef<Path> + Debug>(path: P) -> Result<IntoIter<Bytes>, Box<dyn Error>> {
    let sequences = bio::io::fasta::Reader::from_file(path)?
        .records()
        .map(|record| {
            record
                // Each sequence is copied into an owned `Bytes` buffer.
                .map(|record| Bytes::copy_from_slice(record.seq()))
                // Keep the original diagnostic text and append the underlying cause.
                .map_err(|e| format!("Error reading fasta record: {}", e))
        })
        // Collecting into `Result` stops at the first failing record.
        .collect::<Result<Vec<Bytes>, String>>()?;

    Ok(sequences.into_par_iter())
}

pub(crate) struct RustBio;

impl SequenceReader for RustBio {
fn sequence_reader<P: AsRef<Path> + Debug>(path: P) -> Result<IntoIter<Bytes>, Box<dyn Error>> {
Ok(Reader::from_file(path)?
.records()
.into_iter()
.map(|read| read.expect("Error reading fasta record."))
.map(|record| Bytes::copy_from_slice(record.seq()))
.collect::<Vec<Bytes>>()
.into_par_iter())
}
}

pub(crate) struct Needletail;

impl SequenceReader for Needletail {
fn sequence_reader<P: AsRef<Path> + Debug>(path: P) -> Result<IntoIter<Bytes>, Box<dyn Error>> {
let mut reader = parse_fastx_file(path)?;
let mut v = Vec::new();
while let Some(record) = reader.next() {
let record = record.expect("invalid record");
let seq = record.seq();
let seq = Bytes::copy_from_slice(&seq);
v.push(seq);
}
Ok(v.into_par_iter())
/// Reads every sequence from the file at `path` using
/// `needletail::parse_fastx_file` and returns them as a rayon parallel
/// iterator over [`Bytes`].
///
/// This variant is compiled only when the `needletail` feature is enabled.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or parsed, or if any record
/// is invalid. An invalid record is reported to the caller instead of
/// aborting the process with a panic.
#[cfg(feature = "needletail")]
pub(crate) fn read<P: AsRef<Path> + Debug>(path: P) -> Result<IntoIter<Bytes>, Box<dyn Error>> {
    let mut reader = needletail::parse_fastx_file(path)?;
    let mut sequences = Vec::new();

    // NOTE(review): `reader.next()` looks like an inherent method rather than
    // `Iterator::next`, so the `while let` loop is kept — confirm before
    // converting this to a `for` loop.
    while let Some(record) = reader.next() {
        // Propagate a bad record as an error rather than panicking.
        let record = record?;
        // Each sequence is copied into an owned `Bytes` buffer.
        sequences.push(Bytes::copy_from_slice(&record.seq()));
    }

    Ok(sequences.into_par_iter())
}
21 changes: 8 additions & 13 deletions src/run.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
use super::{
kmer::Kmer,
reader::{Needletail, RustBio, SequenceReader},
};
use super::{kmer::Kmer, reader::read};
use bytes::Bytes;
use dashmap::DashMap;
use fxhash::FxHasher;
Expand All @@ -20,16 +17,11 @@ custom_error::custom_error! { pub ProcessError
WriteError{source: IoError} = "Unable to write output: {source}",
}

pub fn run<P>(path: P, k: usize, reader: bool) -> Result<(), ProcessError>
pub fn run<P>(path: P, k: usize) -> Result<(), ProcessError>
where
P: AsRef<Path> + Debug,
{
let reader = match reader {
true => Needletail::sequence_reader(path),
false => RustBio::sequence_reader(path),
};

KmerMap::new().build(reader?, k)?.output(k)?;
KmerMap::new().build(read(path)?, k)?.output(k)?;

Ok(())
}
Expand All @@ -44,7 +36,9 @@ struct KmerMap(DashFx);

impl KmerMap {
fn new() -> Self {
Self(DashMap::with_hasher(BuildHasherDefault::<FxHasher>::default()))
Self(DashMap::with_hasher(
BuildHasherDefault::<FxHasher>::default(),
))
}

/// Reads sequences from fasta records in parallel using [`rayon`](https://docs.rs/rayon/1.5.1/rayon/),
Expand Down Expand Up @@ -119,7 +113,8 @@ impl KmerMap {
}

fn stream(self, k: usize) -> IntoIter<String, i32> {
self.0.into_iter()
self.0
.into_iter()
.par_bridge()
.map(|(packed_bits, count)| Kmer {
packed_bits,
Expand Down

0 comments on commit 1e21228

Please sign in to comment.