Skip to content

Commit

Permalink
support for graph generation
Browse files Browse the repository at this point in the history
  • Loading branch information
gsc74 committed Jul 21, 2022
1 parent 5916e54 commit da91420
Show file tree
Hide file tree
Showing 9 changed files with 88 additions and 82 deletions.
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ git clone https://github.com/gsc74/minichain
cd minichain && make
# Map sequence to graph
./minichain -cx lr test/MT.gfa test/MT-orangA.fa > out.gaf
# Generate graph
./minichain -t32 -cxggs test/MT-human.fa test/MT-orangA.fa test/MT-orangA.fa -l500 -d500 > out.gaf
# Call per-sample path in each bubble/variation
./minichain -t32 -cxasm -l10k --call test/MT.gfa test/MT-orangA.fa > orangA.call.bed
```

## Table of Contents
Expand All @@ -14,12 +18,13 @@ cd minichain && make
- [Users' Guide](#uguide)
- [Installation](#install)
- [Sequence mapping](#map)
- [Graph generation](#graph_gen)
- [Limitations](#limit)
- [Credits](#credit)

## <a name="intro"></a>Introduction

minichain is a sequence-to-graph mapper.
minichain is a sequence-to-graph mapper and graph generation tool.

## <a name="uguide"></a>Users' Guide

Expand All @@ -39,11 +44,18 @@ minichain can be used for sequence-to-sequence mapping as well as sequence-to-gr
./minichain -t32 -cx lr test/MT.gfa test/MT-orangA.fa > out.gaf
```

### <a name="graph_gen"></a>Graph generation
minichain can be used for incremental graph generation, currently minichain only supports event insertions, inversions are not yet supported.
```sh
# Generate graph
./minichain -t32 -cxggs test/MT-huma.fa test/MT-orangA.fa test/MT-orangA.fa -l500 -d500 > out.gaf
```

## <a name="limit"></a>Limitations

* Current version(v1.0) only supports acyclic [rGFA][rgfa] and [GFA][gfa1] for sequence-to-graph mapping.

* Graph Generation is not supported.
* Inversions are not yet supported in graph generation.

## <a name="credit"></a>Credits
minichain utilises code base of [minigraph][minigraph], which is released under MIT License.
Expand Down
5 changes: 3 additions & 2 deletions ggsimple.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "sys.h"
#include "ggen.h"
#include "kvec-km.h"
#include <stdbool.h>

int32_t mg_gc_index(void *km, int min_mapq, int min_map_len, int min_depth_len, const gfa_t *g, int32_t n_seq, mg_gchains_t *const* gcs,
double *a_dens, int32_t **soff_, int32_t **qoff_, mg_intv_t **sintv_, mg_intv_t **qintv_)
Expand Down Expand Up @@ -255,7 +256,7 @@ void mg_ggsimple(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq, const
} else score = -1, mlen = 0, blen = pd > qd? pd : qd;
fprintf(stderr, "\nIS\t%d==%d\tnwcmp:%d\tmlen:%d\tblen:%d\n", pd, l_pseq, score, mlen, blen);
}
if (is_inv) { // turn one inversion to two events
if (is_inv && false) { // turn one inversion to two events
gfa_ins_t I_inv[2];
I_inv[0].ctg = I_inv[1].ctg = I.ctg;
// the first event
Expand Down Expand Up @@ -516,7 +517,7 @@ void mg_ggsimple_cigar(void *km, const mg_ggopt_t *opt, gfa_t *g, int32_t n_seq,
fprintf(stderr, "\nIS\t%d==%d\tnwcmp:%d\tmlen:%d\tblen:%d\n", pd, l_pseq, score, mlen, blen);
//if (I.voff[0] == 2305301) { for (k = st; k < en; ++k) fprintf(stderr, "%d%c", intv[k].len, "MIDNSHP=XB"[intv[k].op]); fprintf(stderr, "\n"); }
}
if (is_inv) { // turn one inversion to two events
if (is_inv && false) { // turn one inversion to two events
gfa_ins_t I_inv[2];
I_inv[0].ctg = I_inv[1].ctg = I.ctg;
// the first event
Expand Down
6 changes: 3 additions & 3 deletions graphUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ int graphUtils::is_cyclic() // Check cyclicity of component and convert to acycl
q.clear();
in_degree.clear();
out_degree.clear();
std::cerr << "[Connected components : " << num_cid << ", components with cycle : " << cycle_count << "]"<< std::endl;
// std::cerr << "[Connected components : " << num_cid << ", components with cycle : " << cycle_count << "]"<< std::endl;
return cycle_count;
}

Expand Down Expand Up @@ -761,7 +761,7 @@ bool compare_dups(const Tuples &a, const Tuples &b){
return std::tie( a.top_v , a.pos, a.task, a.d, a.path) == std::tie( b.top_v , b.pos, b.task, b.d, b.path);
};

std::vector<mg128_t> graphUtils::Chaining(std::vector<mg128_t> anchors, float c)
std::vector<mg128_t> graphUtils::Chaining(std::vector<mg128_t> anchors)
{
if (param_z)
{
Expand Down Expand Up @@ -861,7 +861,7 @@ std::vector<mg128_t> graphUtils::Chaining(std::vector<mg128_t> anchors, float c)
for (auto t:T) // in Topological Order of their nodes
{
int64_t c_ = 1;
int64_t d_ = c;
int64_t d_ = scale_factor;
if(t.task == 0)
{
int64_t val_1 = c_*( M[cid][t.anchor].c - 1 + M[cid][t.anchor].x - 1 + dist2begin[cid][t.path][t.v] + Distance[cid][t.path][t.w]);
Expand Down
5 changes: 2 additions & 3 deletions graphUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,8 @@ class graphUtils
/* Map Top_Sort */
std::vector<std::vector<int>> map_top_sort;
float scale_factor;
int param_z;
bool param_z;
int lin_ref = 0;
std::string seq_name;


graphUtils(gfa_t *g); // This is constructor
Expand All @@ -123,7 +122,7 @@ class graphUtils

void MPC_index();

std::vector<mg128_t> Chaining(std::vector<mg128_t> anchors, float c);
std::vector<mg128_t> Chaining(std::vector<mg128_t> anchors);

};

Expand Down
32 changes: 32 additions & 0 deletions index.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "kthread.h"
#include "kvec-km.h"
#include "sys.h"
#include "graphUtils.h"

#define idx_hash(a) ((a)>>1)
#define idx_eq(a, b) ((a)>>1 == (b)>>1)
Expand Down Expand Up @@ -208,6 +209,15 @@ mg_idx_t *mg_index_core(gfa_t *g, int k, int w, int b, int n_threads)
return gi;
}

/* Pass parameters */
params* par;
void pass_par(bool &param_z, float &scale_factor)
{
par = new params();
par->param_z = param_z;
par->scale_factor = scale_factor;
}

mg_idx_t *mg_index(gfa_t *g, const mg_idxopt_t *io, int n_threads, mg_mapopt_t *mo)
{
int32_t i, j;
Expand All @@ -222,6 +232,28 @@ mg_idx_t *mg_index(gfa_t *g, const mg_idxopt_t *io, int n_threads, mg_mapopt_t *
if (gi == 0) return 0;
gi->es = gfa_edseq_init(gi->g);
gi->n_seg = g->n_seg;
/* Indexing */
graphUtils *graphOp = new graphUtils(g);
graphOp->param_z = par->param_z;
graphOp->read_graph();
omp_set_dynamic(0);
omp_set_num_threads(n_threads);
graphOp->scale_factor = par->scale_factor;
// graphOp->print_graph();
graphOp->Connected_components();
int cycle_count = graphOp->is_cyclic();
if (cycle_count == 0)
{
graphOp->topologicat_sort();
graphOp->MPC();
graphOp->MPC_index();
}else
{
std::cerr << "[Please provide acyclic rGFA]" << std::endl;
exit(0);
}
get_Op(graphOp); // pass pointer to map-algo.c

if (mg_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] indexed the graph\n", __func__,
realtime() - mg_realtime0, cputime() / (realtime() - mg_realtime0));
Expand Down
59 changes: 17 additions & 42 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "ketopt.h"
#include "graphUtils.h"
#include <chrono>
#include <stdbool.h>

#ifdef __linux__
#include <sys/resource.h>
Expand Down Expand Up @@ -104,7 +105,7 @@ int main(int argc, char *argv[])
mg_ggopt_t gpt;
int i, c, ret, n_threads = 4;
float scale_factor = 200;
int z = 0;
bool z = false;
char *s;
FILE *fp_help = stderr;
gfa_t *g;
Expand Down Expand Up @@ -250,30 +251,34 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio);
fprintf(fp_help, " -N INT retain at most INT secondary mappings [%d]\n", opt.best_n);
fprintf(fp_help, " -D skip self diagonal matches\n");
// fprintf(fp_help, " Graph generation:\n");
// fprintf(fp_help, " --ggen perform incremental graph generation\n");
// fprintf(fp_help, " -q INT min mapping quality [%d]\n", gpt.min_mapq);
// fprintf(fp_help, " -l NUM min alignment length [%d]\n", gpt.min_map_len);
// fprintf(fp_help, " -d NUM min alignment length for depth calculation [%d]\n", gpt.min_depth_len);
// fprintf(fp_help, " -L INT min variant length [%d]\n", gpt.min_var_len);
// fprintf(fp_help, " --call call the graph path in each bubble and output BED\n");
fprintf(fp_help, " Graph generation:\n");
fprintf(fp_help, " --ggen perform incremental graph generation\n");
fprintf(fp_help, " -q INT min mapping quality [%d]\n", gpt.min_mapq);
fprintf(fp_help, " -l NUM min alignment length [%d]\n", gpt.min_map_len);
fprintf(fp_help, " -d NUM min alignment length for depth calculation [%d]\n", gpt.min_depth_len);
fprintf(fp_help, " -L INT min variant length [%d]\n", gpt.min_var_len);
fprintf(fp_help, " --call call the graph path in each bubble and output BED\n");
fprintf(fp_help, " Input/output:\n");
fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads);
fprintf(fp_help, " -s float scale factor [%f]\n", scale_factor);
fprintf(fp_help, " -z int debug_chain [%d]\n", z);
fprintf(fp_help, " -z bool debug_chain [%d]\n", z);
fprintf(fp_help, " -o FILE output mappings to FILE [stdout]\n");
fprintf(fp_help, " -K NUM minibatch size for mapping [500M]\n");
fprintf(fp_help, " -S output linear chains in * sName sLen nMz div sStart sEnd qStart qEnd\n");
fprintf(fp_help, " --vc output in the vertex coordinate\n");
fprintf(fp_help, " Preset:\n");
fprintf(fp_help, " -x STR preset []\n");
fprintf(fp_help, " - lr: noisy long read mapping (the default)\n");
// fprintf(fp_help, " - asm: asm-to-ref mapping\n");
// fprintf(fp_help, " - sr: short reads\n");
// fprintf(fp_help, " - ggs: incremental graph generation\n");
fprintf(fp_help, " - asm: asm-to-ref mapping\n");
fprintf(fp_help, " - sr: short reads\n");
fprintf(fp_help, " - ggs: incremental graph generation\n");
return fp_help == stdout? 0 : 1;
}

// Pass parameters to index.c
// std::cerr << " Param_z : " << z << " Scale_factor : " << scale_factor << std::endl;
pass_par(z,scale_factor);

g = gfa_read(argv[o.ind]);
if (g == 0) {
fprintf(stderr, "[ERROR] failed to load the graph from file '%s'\n", argv[o.ind]);
Expand All @@ -283,39 +288,9 @@ int main(int argc, char *argv[])
}

if (gpt.algo == MG_G_NONE && !(gpt.flag & MG_G_CALL)) {

// std::cerr << "[Mapping Reads]" << std::endl;
/* Indexing */
std::chrono::time_point<std::chrono::system_clock> start, end;
start = std::chrono::system_clock::now(); // start time
graphUtils *graphOp = new graphUtils(g);
graphOp->param_z = z;
graphOp->read_graph();
omp_set_dynamic(0);
omp_set_num_threads(n_threads);
graphOp->scale_factor = scale_factor;
// graphOp->print_graph();
graphOp->Connected_components();
int cycle_count = graphOp->is_cyclic();
if (cycle_count == 0)
{
graphOp->topologicat_sort();
graphOp->MPC();
graphOp->MPC_index();
end = std::chrono::system_clock::now(); //end time
std::chrono::duration<double> elapsed_seconds = end - start;
std::cerr << "[Indexing time for Graph Chaining : " << elapsed_seconds.count() << "(s)]"<< std::endl;
}else
{
std::cerr << "[Please provide acyclic rGFA]" << std::endl;
exit(0);
}
get_Op(graphOp); // pass pointer to map-algo.c
ret = mg_map_files(g, argc - (o.ind + 1), (const char**)&argv[o.ind + 1], &ipt, &opt, n_threads);
} else {
if (gpt.flag & MG_G_CALL) gfa_sort_ref_arc(g);
std::cerr << "[Graph Generation is not supported.]" << std::endl;
exit(0);
ret = mg_ggen(g, argc - (o.ind + 1), (const char**)&argv[o.ind + 1], &ipt, &opt, &gpt, n_threads);
}

Expand Down
13 changes: 1 addition & 12 deletions map-algo.c
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,6 @@ void mg_map_frag(const mg_idx_t *gi, int n_segs, const int *qlens, const char **
*/

// Integrate Chain


mg128_t *anchor_;
anchor_ = a;
std::vector<mg128_t> anchor;
Expand All @@ -402,24 +400,15 @@ void mg_map_frag(const mg_idx_t *gi, int n_segs, const int *qlens, const char **
{
anchor[i] = anchor_[i];
}

/* Chaining */
std::chrono::time_point<std::chrono::system_clock> start, end;
start = std::chrono::system_clock::now(); // start time
graphOp->seq_name = qname;
std::vector<mg128_t> best = graphOp->Chaining(anchor ,graphOp->scale_factor);
end = std::chrono::system_clock::now(); //end time
std::chrono::duration<double> elapsed_seconds = end - start;
elapsed_seconds = end - start;
// std::cerr << " Chaining Time : " << elapsed_seconds.count() << " (s) "<< std::endl;
std::vector<mg128_t> best = graphOp->Chaining(anchor);
kfree(b->km, a);
KMALLOC(b->km, a, best.size());
for (size_t i = 0; i < best.size(); i++)
{
a[i] = best[i];
}
n_a = best.size();

/* clear anchors */
anchor.clear();

Expand Down
8 changes: 8 additions & 0 deletions mgpriv.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,14 @@ void radix_sort_128x(mg128_t *beg, mg128_t *end);
void radix_sort_gfa64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);

void pass_par(bool &param_z, float &scale_factor);
struct params
{
float scale_factor;
bool param_z;
};


#ifdef __cplusplus
}
#endif
Expand Down
Loading

0 comments on commit da91420

Please sign in to comment.