Skip to content

Commit

Permalink
PHI v1.0 code-base
Browse files Browse the repository at this point in the history
  • Loading branch information
gsc74 committed Oct 26, 2024
0 parents commit 0ef5375
Show file tree
Hide file tree
Showing 126 changed files with 10,820 additions and 0 deletions.
120 changes: 120 additions & 0 deletions Installdeps
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/bash

source $HOME/.bashrc
conda create -n PHI -c conda-forge gxx_linux-64=13.2.0 make libstdcxx-ng cmake zlib bzip2 xz lz4 zstd -y
conda activate PHI

rm -rf ../extra
rm -rf temp_bin
mkdir -p temp_bin
cd temp_bin
mkdir -p ../extra
mkdir -p ../extra/plugins
mkdir -p ../extra/lib
mkdir -p ../extra/bin
mkdir -p ../extra/include
git clone https://github.com/jltsiren/gbwtgraph.git
git clone https://github.com/vgteam/sdsl-lite.git
git clone https://github.com/vgteam/libhandlegraph.git
git clone https://github.com/jltsiren/gbwt.git
git clone https://github.com/samtools/htslib.git
git clone https://github.com/samtools/samtools.git
git clone https://github.com/samtools/bcftools.git

# htslib
cd htslib
git submodule update --init --recursive
autoreconf -i
./configure --prefix=$(pwd)/../../extra
make -j4
make install

# samtools
cd ../samtools
autoheader
autoconf -Wno-syntax
./configure --prefix=$(pwd)/../../extra
make -j4
make install
cd ..

# bcftools
cd bcftools
make -j4
cp bcftools ../../extra/bin/
cp -r plugins/* ../../extra/plugins/
cd ..

wget --no-check-certificate https://packages.gurobi.com/11.0/gurobi11.0.2_linux64.tar.gz
tar -xvf gurobi11.0.2_linux64.tar.gz
cd gurobi1102/linux64
cp lib/* ../../../extra/lib/
cp include/* ../../../extra/include/
cd ../../

# zlib
wget --no-check-certificate https://zlib.net/current/zlib.tar.gz
tar -xvf zlib.tar.gz
cd zlib-*
./configure --prefix=$(pwd)/../../extra
make -j4
make install
cd ..

cd sdsl-lite
cd build
cmake .. && make -j4
cd ../..

cd libhandlegraph
mkdir -p build
cd build
cmake .. && make -j4 CPPFLAGS="-I../sdsl-lite/include" LDFLAGS="-L../sdsl-lite/build/lib/ -L../sdsl-lite/build/external/libdivsufsort/lib/"
cd ../..

cd gbwt
make -j4 CPPFLAGS="-I../sdsl-lite/include -I../libhandlegraph/src/include/" LDFLAGS="-L../sdsl-lite/build/lib/ -L../sdsl-lite/build/external/libdivsufsort/lib/ -L../libhandlegraph/build/"
cd ..

cd gbwtgraph
make CPPFLAGS="-I../sdsl-lite/include -I../gbwt/include -I../libhandlegraph/src/include/" \
LDFLAGS="-L../gbwt/lib -L../sdsl-lite/build/lib/ -L../sdsl-lite/build/external/libdivsufsort/lib/ -L../libhandlegraph/build/" -j4

cp bin/gfa2gbwt ../../extra/bin/
cp lib/* ../../extra/lib/
cd ..
cp gbwt/lib/* ../extra/lib/
cp sdsl-lite/build/lib/* ../extra/lib/
cp sdsl-lite/build/external/libdivsufsort/lib/* ../extra/lib/
cp libhandlegraph/build/libhandlegraph.a ../extra/lib/
cp libhandlegraph/build/libhandlegraph.so ../extra/lib/


# get vg
wget https://github.com/vgteam/vg/releases/download/v1.60.0/vg -O ../extra/bin/vg
chmod +x ../extra/bin/vg
cd ..
rm -rf temp_bin


# # Define the paths
# BIN_PATH="$(pwd)/extra/bin"
# LIB_PATH="$(pwd)/extra/lib"

# # Check if BIN_PATH is already in .bashrc
# if ! grep -q "$BIN_PATH" ~/.bashrc; then
# echo "Adding $BIN_PATH to PATH in .bashrc"
# echo "export PATH=\"$BIN_PATH:\$PATH\"" >> ~/.bashrc
# else
# echo "$BIN_PATH is already in PATH"
# fi

# # Check if LIB_PATH is already in .bashrc
# if ! grep -q "$LIB_PATH" ~/.bashrc; then
# echo "Adding $LIB_PATH to LD_LIBRARY_PATH in .bashrc"
# echo "export LD_LIBRARY_PATH=\"$LIB_PATH:\$LD_LIBRARY_PATH\"" >> ~/.bashrc
# else
# echo "$LIB_PATH is already in LD_LIBRARY_PATH"
# fi

# echo "Done! Paths are now updated and applied. Please source ~/.bashrc to apply the changes."
62 changes: 62 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
This software constitutes a joint work and the contributions of individual
authors are subject to different licenses. Contributions and licenses are
listed in the applicable source files, with specific details on each
individual contribution captured in the revision control system.

--
For all code, except as indicated otherwise:

PUBLIC DOMAIN NOTICE

This software is freely available to the public for use
without a copyright notice. Restrictions cannot be placed on its present or
future use.

--
For code used from minigraph:

URL: https://lh3.github.io/minigraph

The MIT License

Copyright (c) 2019- Dana-Farber Cancer Institute

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--
For code used for Murmurhash3

Apache License, Version 2.0

Austin Appleby

--
For vg toolkit

The MIT License (MIT)

Copyright (c) 2014 Erik Garrison


--
For GBWTgraph

Copyright (c) 2019, 2020, 2021, 2022, 2023, 2024 Jouni Siren and other authors
23 changes: 23 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
CXX = g++ -std=c++11
CXXFLAGS = -fopenmp -pthread -march=native -mtune=native -O3 -lgurobi_c++ -lgurobi110 -lm -lz -lpthread -ldl
GUROBI_HOME = /home/ghanshyam/opt/gurobi1101/linux64
INLCLUDES = -Iextra/include
LIBS = -Lextra/lib

all: PHI

src_dir := src

OBJS = $(src_dir)/main.o $(src_dir)/gfa-io.o $(src_dir)/gfa-base.o \
$(src_dir)/options.o $(src_dir)/kalloc.o \
$(src_dir)/misc.o $(src_dir)/sys.o $(src_dir)/ILP_index.o \
$(src_dir)/MurmurHash3.o

PHI: $(OBJS)
$(CXX) $^ -o $@ $(INLCLUDES) $(LIBS) $(CXXFLAGS)

$(src_dir)/%.o: $(src_dir)/%.cpp
$(CXX) -c $< -o $@ $(INLCLUDES) $(LIBS) $(CXXFLAGS)

clean:
rm -f $(src_dir)/*.o PHI
86 changes: 86 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
<div align="center">
<img src="test/logo/logo_phi.png" alt="PHI Logo" width="200">
</div>

## <div align="center"><span style="color:red;"><b>PHI</b></span> (<span style="color:red;"><b>P</b></span>angenome-based <span style="color:red;"><b>H</b></span>aplotype <span style="color:red;"><b>I</b></span>nference)</div>


## <a name="started"></a>Getting Started

### Prerequisites

Before using PHI, please ensure that **Miniforge** is installed: [Miniforge Installation Guide](https://github.com/conda-forge/miniforge). This package installer is used for installing a few dependencies such as VG and samtools. To run PHI, you also need a Gurobi license. You can get a free academic license [here](https://www.gurobi.com/academia/academic-program-and-licenses/). You should download and save `gurobi.lic` file in your home directory.

## <a name="get_phi"></a>Get PHI

```bash
git clone https://github.com/at-cg/PHI
cd PHI
# Install dependencies (Miniforge is required)
./Installdeps
export PATH="$(pwd)/extra/bin:$PATH"
export LD_LIBRARY_PATH="$(pwd)/extra/lib:$LD_LIBRARY_PATH"
make

# test run
./PHI -t32 -g test/MHC_4.gfa.gz -r test/CHM13_reads.fq.gz -o CHM13.fa

# test run with VCF file as input
./vcf2gfa.py -v test/MHC_4.vcf.gz -r test/MHC-CHM13.0.fa.gz | bgzip > test/MHC_4_vcf.gfa.gz
./PHI -t32 -g test/MHC_4_vcf.gfa.gz -r test/CHM13_reads.fq.gz -o CHM13.fa
```

#### Adding Binary and Library Paths to `.bashrc`
To ensure that the `extra/bin` and `extra/lib` directories are automatically loaded for every terminal session, you can export them to your `~/.bashrc`. This will make sure the required binaries and libraries for `PHI` are available.

```bash
# Add extra/bin and extra/lib to .bashrc
echo 'export PATH="$(pwd)/extra/bin:$PATH"' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH="$(pwd)/extra/lib:$LD_LIBRARY_PATH"' >> ~/.bashrc
source ~/.bashrc
```

## Table of Contents

- [Getting Started](#started)
- [Get PHI](#get_phi)
- [Introduction](#intro)
- [Results](#results)
- [Future work](#future)
- [Publications](#pub)

## <a name="intro"></a>Introduction
PHI is a pangenome-based genotyping method. It estimates complete haplotype sequence from low-coverage sequencing data (short-reads or long-reads of a haploid genome). Users should provide a pangenome graph reference in either:
- Graph Format ([GFA v1.1](http://gfa-spec.github.io/GFA-spec/GFA1.html#gfa-11)): A sequence graph-based representation of the pangenome graph. Graph should be acyclic.
- Variant Call Format ([VCF](https://samtools.github.io/hts-specs/VCFv4.2.pdf)): A list of multi-sample, multi-allelic phased variants along with a reference genome.

Output of PHI is the haplotype sequence (FASTA) associated with the optimal inferred path from the graph. It identifies a path in the pangenome graph that maximizes the matches between the path and read k-mers while minimizing recombination events (haplotype switches) along the path. We implemented integer programming to compute an optimal solution. The integer program is solved optimally using the [Gurobi optimizer](https://www.gurobi.com). Details of these formulations are described in our [paper](#publications).


## <a name="results"></a>Results
We benchmarked PHI (v1.0) using short-read datasets sampled from MHC sequences of five haplotypes (APD, DBB, MANN, QBL, and SSTO). This data was generated by [Houwaart et al. (2022)](https://doi.org/10.1111/tan.15020). These datasets were downsampled to various coverages ranging from 0.1x to 10x. We built a pangenome graph using [Minigraph-Cactus](https://github.com/ComparativeGenomicsToolkit/cactus/tree/master), comprising 49 complete [MHC sequences](https://doi.org/10.5281/zenodo.6617246). To assess the accuracy of PHI, we evaluated the edit distance between the inferred haplotype sequences and the MHC sequences from Houwaart et al. that were determined using de novo assembly and curation.

<p align="center" id="F1-score">
<img src="data/edit_distances.jpg" width="700" alt="F1-score"/>
</p>

> Edit distance between ground-truth haplotype sequences and the sequences estimated by different tools (PHI, VG, and PanGenie). Lower edit distance implies higher accuracy. PHI provides advangate over existing methods on low-coverage inputs.
In PHI, we have implemented two integer programs (referred to as ILP and IQP respectively). They both solve the same problem, but differ in terms of their runtime and memory-usage. IQP is generally faster but it requires more memory. Users can select between the two using command line argument (see `./PHI -h`).

<p align="center" id="F1-score">
<img src="data/phi_vs_phi_ilp.jpg" width="700" alt="F1-score"/>
</p>

> Performance comparison between ILP and IQP.
The scripts to reproduce the results are available [here](data).


## <a name="future"></a>Future Work
- Add support for diploid haplotype estimation.
- Scale to pangenome graphs having larger number of genomes.


## <a name="pub"></a>Publications
- **Ghanshyam Chandra, Md Helal Hossen, Stephan Scholz, Alexander T Dilthey, Daniel Gibney and Chirag Jain**. "[Integer programming framework for pangenome-based genome inference](https://www.biorxiv.org/)". *bioRxiv* 2024.
Binary file added data/Ground_truth/APD.fasta.gz
Binary file not shown.
Binary file added data/Ground_truth/COX.fasta.gz
Binary file not shown.
Binary file added data/Ground_truth/DBB.fasta.gz
Binary file not shown.
Binary file added data/Ground_truth/KAS116.fasta.gz
Binary file not shown.
Binary file added data/Ground_truth/MANN.fasta.gz
Binary file not shown.
Binary file added data/Ground_truth/PGF.fasta.gz
Binary file not shown.
Binary file added data/Ground_truth/QBL.fasta.gz
Binary file not shown.
Binary file added data/Ground_truth/SSTO.fasta.gz
Binary file not shown.
49 changes: 49 additions & 0 deletions data/MHC.seqfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
CHM13.0 data/hprc_haps/MHC-CHM13.0.fa
HG002.1 data/hprc_haps/MHC-HG002.1.fa
HG002.2 data/hprc_haps/MHC-HG002.2.fa
HG00438.1 data/hprc_haps/MHC-HG00438.1.fa
HG00438.2 data/hprc_haps/MHC-HG00438.2.fa
HG005.1 data/hprc_haps/MHC-HG005.1.fa
HG005.2 data/hprc_haps/MHC-HG005.2.fa
HG00621.1 data/hprc_haps/MHC-HG00621.1.fa
HG00621.2 data/hprc_haps/MHC-HG00621.2.fa
HG00741.1 data/hprc_haps/MHC-HG00741.1.fa
HG00741.2 data/hprc_haps/MHC-HG00741.2.fa
HG01106.1 data/hprc_haps/MHC-HG01106.1.fa
HG01106.2 data/hprc_haps/MHC-HG01106.2.fa
HG01109.1 data/hprc_haps/MHC-HG01109.1.fa
HG01109.2 data/hprc_haps/MHC-HG01109.2.fa
HG01123.1 data/hprc_haps/MHC-HG01123.1.fa
HG01123.2 data/hprc_haps/MHC-HG01123.2.fa
HG01258.1 data/hprc_haps/MHC-HG01258.1.fa
HG01258.2 data/hprc_haps/MHC-HG01258.2.fa
HG01358.1 data/hprc_haps/MHC-HG01358.1.fa
HG01358.2 data/hprc_haps/MHC-HG01358.2.fa
HG01361.1 data/hprc_haps/MHC-HG01361.1.fa
HG01361.2 data/hprc_haps/MHC-HG01361.2.fa
HG01891.1 data/hprc_haps/MHC-HG01891.1.fa
HG01891.2 data/hprc_haps/MHC-HG01891.2.fa
HG01928.1 data/hprc_haps/MHC-HG01928.1.fa
HG01928.2 data/hprc_haps/MHC-HG01928.2.fa
HG01952.1 data/hprc_haps/MHC-HG01952.1.fa
HG01952.2 data/hprc_haps/MHC-HG01952.2.fa
HG01978.1 data/hprc_haps/MHC-HG01978.1.fa
HG01978.2 data/hprc_haps/MHC-HG01978.2.fa
HG02080.1 data/hprc_haps/MHC-HG02080.1.fa
HG02080.2 data/hprc_haps/MHC-HG02080.2.fa
HG02257.1 data/hprc_haps/MHC-HG02257.1.fa
HG02257.2 data/hprc_haps/MHC-HG02257.2.fa
HG02486.1 data/hprc_haps/MHC-HG02486.1.fa
HG02486.2 data/hprc_haps/MHC-HG02486.2.fa
HG02559.1 data/hprc_haps/MHC-HG02559.1.fa
HG02559.2 data/hprc_haps/MHC-HG02559.2.fa
HG02622.1 data/hprc_haps/MHC-HG02622.1.fa
HG02622.2 data/hprc_haps/MHC-HG02622.2.fa
HG02717.1 data/hprc_haps/MHC-HG02717.1.fa
HG02717.2 data/hprc_haps/MHC-HG02717.2.fa
HG02886.1 data/hprc_haps/MHC-HG02886.1.fa
HG02886.2 data/hprc_haps/MHC-HG02886.2.fa
HG03540.1 data/hprc_haps/MHC-HG03540.1.fa
HG03540.2 data/hprc_haps/MHC-HG03540.2.fa
NA18906.1 data/hprc_haps/MHC-NA18906.1.fa
NA18906.2 data/hprc_haps/MHC-NA18906.2.fa
Loading

0 comments on commit 0ef5375

Please sign in to comment.