diff --git a/Dockerfile b/Dockerfile index b04c38d..a779675 100644 --- a/Dockerfile +++ b/Dockerfile @@ -56,7 +56,7 @@ RUN git clone --recursive https://github.com/ekg/seqwish \ RUN git clone --recursive https://github.com/ekg/smoothxg \ && cd smoothxg \ && git pull \ - && git checkout e39380d \ + && git checkout e1d5548 \ && git submodule update --init --recursive \ && sed -i 's/-march=native/-march=haswell/g' deps/abPOA/CMakeLists.txt \ && sed -i 's/-mcx16 //g' deps/WFA/CMakeLists.txt \ diff --git a/README.md b/README.md index 9ca33ce..60750a7 100644 --- a/README.md +++ b/README.md @@ -21,11 +21,12 @@ By tracing the paths of the input sequences through the graph, it produces a var 3. _[smoothxg](https://github.com/ekg/smoothxg)_: (*normalization*) -- The graph is then sorted with a form of multi-dimensional scaling in 1D, groomed, and topologically ordered locally. The 1D order is then broken into "blocks" which are "smoothed" using the partial order alignment (POA) algorithm implemented in [abPOA](https://github.com/yangao07/abPOA) or [spoa](https://github.com/rvaser/spoa). -This normalizes their mutual alignment and remove artifacts of the edit-distance based alignment. +This normalizes their mutual alignment and removes artifacts resulting from transitive ambiguities in the pairwise alignments. It ensures that the graph always has local partial order, which is essential for many applications and matches our prior expectations about small-scale variation in genomes. This step yields a rebuilt graph, a consensus subgraph, and a whole genome alignment in [MAF](http://www.bx.psu.edu/~dcking/man/maf.xhtml) format. -Optional post-processing steps provide 1D and 2D diagnostic visualizations of the graph. +Optional post-processing steps provide 1D and 2D diagnostic visualizations of the graph, basic graph metrics. +The output graph (`*.smooth.gfa`) is suitable as input to `vg deconstruct` to obtain a VCF file relative to any set of reference sequences used in the construction, or for read mapping in `vg` or with `GraphAligner`. ## general usage diff --git a/pggb b/pggb index 1f32f87..33bd7c5 100755 --- a/pggb +++ b/pggb @@ -29,6 +29,7 @@ poa_params="1,19,39,3,81,1" do_viz=false do_layout=false threads=1 +poa_threads=0 mapper=wfmash no_merge_segments=false do_stats=false @@ -47,7 +48,7 @@ fi # read the options cmd=$0" "$@ -TEMP=`getopt -o i:o:p:n:s:l:K:k:B:w:j:P:Fe:t:vLhMSY:G:C:Q:d:I:R:NrmZ --long input-fasta:,output-dir:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,min-match-length:,transclose-batch:,block-weight-max:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,do-viz,do-layout,help,no-merge-segments,do-stats,exclude-delim:,poa-length-target:,poa-params:,write-maf,consensus-spec:,consensus-prefix:,split-min-depth:,block-id-min:,block-ratio-min:,no-splits,resume,multiqc,pigz-compress -n 'pggb' -- "$@"` +TEMP=`getopt -o i:o:p:n:s:l:K:k:B:w:j:P:Fe:t:T:vLhMSY:G:C:Q:d:I:R:NrmZ --long input-fasta:,output-dir:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,min-match-length:,transclose-batch:,block-weight-max:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,poa-threads:,do-viz,do-layout,help,no-merge-segments,do-stats,exclude-delim:,poa-length-target:,poa-params:,write-maf,consensus-spec:,consensus-prefix:,split-min-depth:,block-id-min:,block-ratio-min:,no-splits,resume,multiqc,pigz-compress -n 'pggb' -- "$@"` eval set -- "$TEMP" # extract options and their arguments into variables. @@ -77,6 +78,7 @@ while true ; do -C|--consensus-spec) consensus_spec=$2 ; shift 2 ;; -Q|--consensus-prefix) consensus_prefix=$2 ; shift 2 ;; -t|--threads) threads=$2 ; shift 2 ;; + -T|--poa-threads) poa_threads=$2 ; shift 2 ;; -v|--do-viz) do_viz=true ; shift ;; -L|--do-layout) do_layout=true ; shift ;; -S|--do-stats) do_stats=true ; shift ;; @@ -101,6 +103,11 @@ then >&2 echo "Mandatory arguments -i, -s, -n, -p" fi +if [[ $poa_threads == 0 ]]; +then + poa_threads=$threads +fi + if [[ $block_length == false ]]; then block_length=$(echo $segment_length '*' 3 | bc) @@ -159,6 +166,7 @@ then echo " -r, --resume PATH do not overwrite existing output from wfmash, seqwish, smoothxg in given directory" echo " [default: start pipeline from scratch in a new directory]" echo " -t, --threads N number of compute threads to use in parallel steps" + echo " -T, --poa-threads N number of compute threads to use during POA (set lower if you OOM during smoothing)" echo " -Z, --pigz-compress compress alignment (.paf), graph (.gfa, .og), and MSA (.maf) outputs with pigz" echo " -h, --help this text" echo @@ -251,6 +259,7 @@ smoothxg: split-min-depth: $split_min_depth block-id-min: $block_id_min block-ratio-min: $block_ratio_min + poa_threads: $poa_threads odgi: viz: $do_viz layout: $do_layout @@ -314,6 +323,7 @@ do if [[ ! -s $prefix_smoothed.smooth.$i.gfa || $resume == false ]]; then $timer -f "$fmt" smoothxg \ -t $threads \ + -T $poa_threads \ -g "$input_gfa" \ -w $max_block_weight \ -K \ @@ -333,6 +343,7 @@ do if [[ ! -s $prefix_smoothed.smooth.gfa || $resume == false ]]; then $timer -f "$fmt" smoothxg \ -t $threads \ + -T $poa_threads \ -g "$input_gfa" \ -w $max_block_weight \ -K \