
Comparing changes

base repository: at-cg/minichain (base: v1.3)
head repository: at-cg/minichain (compare: master)
  • 5 commits
  • 15 files changed
  • 2 contributors

Commits on Feb 15, 2024

  1. added zenodo link for datasets

     Authored and committed by Ghanshyam Chandra on Feb 15, 2024 (SHA: 781a944)
  2. Update README.md

     Authored by gsc74 on Feb 15, 2024 (SHA: a1d2e48)
  3. Update README.md

     Authored by gsc74 on Feb 15, 2024 (SHA: 9ab3d0f)

Commits on May 20, 2024

  1. 74e16ad
  2. added plots

     Committed by gsc74 on May 20, 2024 (SHA: d21e694)
2 changes: 2 additions & 0 deletions README.md
@@ -110,6 +110,8 @@ count.

> Box plots show the levels of consistency between the haplotype recombination pairs in Minichain's output chain and the ground truth. We used different substitution rates and recombination penalties. Median values are highlighted with light green lines.
Datasets for benchmarking are available at [Zenodo](https://zenodo.org/records/10665350).

### v1.2 and earlier versions

We compared Minichain (v1.2) with existing sequence-to-graph aligners to demonstrate its gains in scalability and accuracy. Our experiments used human pangenome DAGs built from subsets of [94 high-quality haplotype assemblies](https://github.com/human-pangenomics/HPP_Year1_Assemblies) provided by the Human Pangenome Reference Consortium, and the [CHM13 human genome assembly](https://www.ncbi.nlm.nih.gov/assembly/GCA_009914755.4) provided by the Telomere-to-Telomere Consortium. Using a simulated long-read dataset with 0.5x coverage and DAGs of three different sizes, we observed superior read-mapping precision ([as shown in the figure](#Plot)). For the largest DAG, constructed from all 95 haplotypes, Minichain used 10 minutes and 25 GB of RAM with 32 threads. The scripts to reproduce this benchmark are available [here](data/v1.0).
Binary file removed data/v1.3/Graphs/MHC-CHM13.0.gfa.gz
Binary file not shown.
6 changes: 4 additions & 2 deletions data/v1.3/Map_Graph.py
@@ -197,6 +197,8 @@ def count_switches(walk):
max_ = re.findall(r'\bMax: (\d+)\b', out[5])
mean_ = re.findall(r'\bMean: (\d+)\b', out[5])
accuracy_ = re.findall(r'\bAccuracy: (\d+\.\d+)\b', out[5])
precision_ = re.findall(r'\bprecision: (\d+\.\d+)\b', out[5])
recall_ = re.findall(r'\brecall: (\d+\.\d+)\b', out[5])
recomb_ = 0
frac_seq = 0.0
identity = 0.0
@@ -213,8 +215,8 @@ def count_switches(walk):

# avoid thread collision
with open(align_dir + "/R" + r + "_" + h.split('/')[1].split('.fa')[0] + ".txt", 'w') as f:
f.write("File\tMin\tMax\tMean\tRecomb\tFracSeq\tIdentity\tTrueRecomb\tAccuracy\n")
f.write(h.split('/')[1].split('.fa')[0] + "\t" + str(min_[0]) + "\t" + str(max_[0]) + "\t" + str(mean_[0]) + "\t" + str(recomb_) + "\t" + str(frac_seq) + "\t" + str(identity) + "\t" + str(true_recomb_) + "\t" + str(accuracy_[0]) + "\n")
f.write("File\tMin\tMax\tMean\tRecomb\tFracSeq\tIdentity\tTrueRecomb\tAccuracy\tprecision\trecall\n")
f.write(h.split('/')[1].split('.fa')[0] + "\t" + str(min_[0]) + "\t" + str(max_[0]) + "\t" + str(mean_[0]) + "\t" + str(recomb_) + "\t" + str(frac_seq) + "\t" + str(identity) + "\t" + str(true_recomb_) + "\t" + str(accuracy_[0]) + "\t" + str(precision_[0]) + "\t" + str(recall_[0]) + "\n")
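The precision_ and recall_ values written here are extracted earlier in the script from the aligner's textual summary using word-boundary regexes, the same pattern as the existing Min/Max/Mean/Accuracy fields. A minimal sketch of that parsing pattern; the sample summary string below is a hypothetical stand-in for the real out[5] line:

import re

# Hypothetical summary line standing in for out[5] in Map_Graph.py.
summary = "Min: 3 Max: 57 Mean: 21 Accuracy: 0.9812 precision: 0.9731 recall: 0.9644"

# \b anchors keep "precision:" from matching inside a longer token;
# re.findall returns a list of captured strings, hence the [0] indexing
# used in the script (which assumes the field is always present).
precision_ = re.findall(r'\bprecision: (\d+\.\d+)\b', summary)
recall_ = re.findall(r'\brecall: (\d+\.\d+)\b', summary)

print(precision_[0], recall_[0])  # -> 0.9731 0.9644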



47 changes: 42 additions & 5 deletions data/v1.3/Plot.py
@@ -45,7 +45,9 @@
count_recomb[r][align].append(float(field[5]))
count_recomb[r][align].append(float(field[6]))
count_recomb[r][align].append(int(field[7]))
count_recomb[r][align].append(float(field[8].split('\n')[0]))
count_recomb[r][align].append(float(field[8]))
count_recomb[r][align].append(float(field[9]))
count_recomb[r][align].append(float(field[10].split('\n')[0]))

print("============================ Substitution rate : " + m + " ============================")
print("=======================================================================================")
@@ -59,6 +61,8 @@
accuracy_data = []
pearson_corr = []
kendall_corr = []
precision_data = []
recall_data = []

# Initialize x-tick labels
xtick_labels = []
@@ -72,6 +76,8 @@
chain_data_ = []
groud_truth_ = []
accuracy_ = []
precision_ = []
recall_ = []
mean_rec = 0.0
mean_acc = 0.0
count = 0
@@ -88,6 +94,8 @@
identity_.append(count_recomb[r][align][5])
groud_truth_.append(count_recomb[r][align][6])
accuracy_.append(count_recomb[r][align][7])
precision_.append(count_recomb[r][align][8])
recall_.append(count_recomb[r][align][9])
mean_acc += count_recomb[r][align][7]
chain_data.append(chain_data_)
recomb_data.append(recomb_)
@@ -99,16 +107,16 @@
len_ = str(len(recomb_))
if r == '2000000000':
xtick_labels.append("$\infty$")
xtick_labels_.append("Recombination penalty = $\infty$")
xtick_labels_.append("$\\rho = \infty$")
elif r == '0':
xtick_labels.append("0")
xtick_labels_.append("Recombination penalty = $0$")
xtick_labels_.append("$\\rho = 0$")
elif r == '1000':
xtick_labels.append("$10^3$")
xtick_labels_.append("$\\rho = 10^3$")
elif r == '10000':
xtick_labels.append("$10^4$")
xtick_labels_.append("Recombination penalty = $10^4$")
xtick_labels_.append("$\\rho = 10^4$")
elif r == '100000':
xtick_labels.append("$10^5$")
xtick_labels_.append("$\\rho = 10^5$")
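An aside on the label strings above (an observation, not part of the diff): in a regular Python string, `\r` inside `\rho` would be a carriage return, which is why the code doubles it as `\\rho`, while `\i` in `\infty` is an invalid escape that recent Python versions flag with a warning. Raw strings avoid both issues:

# Equivalent labels using raw strings; no doubled backslash and no
# invalid-escape warning for "\i" on recent Python versions.
xtick_labels.append(r"$\infty$")
xtick_labels_.append(r"$\rho = \infty$")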
@@ -235,4 +243,33 @@
# ax10.set_title("F1 Score vs Recombination Penalty")
plt.tight_layout()
plt.savefig(folder + "/F1_Score_vs_R.pdf", bbox_inches='tight', dpi=1200, format='pdf')
# plt.show()
# plt.show()


fig11, ax11 = plt.subplots(figsize=(4, 3))
ax11.boxplot(precision_data, showfliers=False, showmeans=False, meanline=False, medianprops={'color':'lime', 'linewidth':2.2})
# add data points
for i in range(len(precision_data)):
y = precision_data[i]
x = np.random.normal(i + 1, 0.04, size=len(y))
ax11.plot(x, y, 'r.', alpha=0.3, markersize=10)
ax11.set_xticks(np.arange(1, len(R) + 1))
ax11.set_xticklabels(xtick_labels, rotation=45)
ax11.set_xlabel("Recombination penalty")
ax11.set_ylabel("Precision")
plt.tight_layout()
plt.savefig(folder + "/Precision_vs_R.pdf", bbox_inches='tight', dpi=1200, format='pdf')

fig12, ax12 = plt.subplots(figsize=(4, 3))
ax12.boxplot(recall_data, showfliers=False, showmeans=False, meanline=False, medianprops={'color':'lime', 'linewidth':2.2})
# add data points
for i in range(len(recall_data)):
y = recall_data[i]
x = np.random.normal(i + 1, 0.04, size=len(y))
ax12.plot(x, y, 'r.', alpha=0.3, markersize=10)
ax12.set_xticks(np.arange(1, len(R) + 1))
ax12.set_xticklabels(xtick_labels, rotation=45)
ax12.set_xlabel("Recombination penalty")
ax12.set_ylabel("Recall")
plt.tight_layout()
plt.savefig(folder + "/Recall_vs_R.pdf", bbox_inches='tight', dpi=1200, format='pdf')
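The precision and recall figures above differ only in their input data, y-axis label, and output file, so the duplicated block could be collapsed into one helper. A hedged sketch of such a refactor; the function name and the standalone imports are mine, not part of the diff:

import numpy as np
import matplotlib.pyplot as plt

def jittered_boxplot(data, xtick_labels, ylabel, path):
    # Same styling as fig11/fig12: box plot with lime medians plus
    # horizontally jittered raw points so overlapping values stay visible.
    fig, ax = plt.subplots(figsize=(4, 3))
    ax.boxplot(data, showfliers=False, showmeans=False, meanline=False,
               medianprops={'color': 'lime', 'linewidth': 2.2})
    for i, y in enumerate(data):
        x = np.random.normal(i + 1, 0.04, size=len(y))
        ax.plot(x, y, 'r.', alpha=0.3, markersize=10)
    ax.set_xticks(np.arange(1, len(data) + 1))
    ax.set_xticklabels(xtick_labels, rotation=45)
    ax.set_xlabel("Recombination penalty")
    ax.set_ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(path, bbox_inches='tight', dpi=1200, format='pdf')

# jittered_boxplot(precision_data, xtick_labels, "Precision", folder + "/Precision_vs_R.pdf")
# jittered_boxplot(recall_data, xtick_labels, "Recall", folder + "/Recall_vs_R.pdf")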
Binary file removed data/v1.3/Reads/MHC_CHM13_ONT_filt.part_001.fq.gz
Binary file not shown.
Binary file removed data/v1.3/Reads/MHC_CHM13_ONT_filt.part_002.fq.gz
Binary file not shown.
Binary file removed data/v1.3/Reads/MHC_CHM13_PacBio_filt.fq.gz
Binary file not shown.
Binary file not shown.
Binary file removed data/v1.3/Reads/MHC_CHM13_PacBio_filt.part_002.fq.gz
Binary file not shown.
25 changes: 23 additions & 2 deletions data/v1.3/Reproduce.py
@@ -86,9 +86,30 @@

map_threads = 6
# Generate the graph
os.system("python3 Gen_Graph.py -t " + str(threads))
# os.system("python3 Gen_Graph.py -t " + str(threads))
# Simulate queries
os.system("source ~/.bashrc && conda activate MC && python3 Simulate_query.py -t " + str(threads))
# os.system("source ~/.bashrc && conda activate MC && python3 Simulate_query.py -t " + str(threads))

os.system("mkdir -p Graphs")
os.system("mkdir -p Reads")
os.system("curl https://zenodo.org/api/records/10665350/files-archive -o Data.zip")
os.system("unzip Data.zip")
os.system("mv *.gfa.gz Graphs/")
os.system("mv *.fq.gz Reads/")
os.system("rm -rf Query_0.1")
os.system("rm -rf Query_1")
os.system("rm -rf Query_5")
os.system("tar -xvf Query_0.1.tar.gz")
os.system("tar -xvf Query_1.tar.gz")
os.system("tar -xvf Query_5.tar.gz")
os.system("cd Query_0.1 && gunzip *")
os.system("cd Query_1 && gunzip *")
os.system("cd Query_5 && gunzip *")
os.system("rm Query_0.1.tar.gz")
os.system("rm Query_1.tar.gz")
os.system("rm Query_5.tar.gz")
os.system("echo $(pwd)")

# Map the queries
os.system("source ~/.bashrc && conda activate MC && python3 Map_Graph.py -t " + str(threads))
# Map the reads
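The download-and-unpack sequence above chains many `os.system` calls, which silently ignore non-zero exit codes. A hedged alternative sketch using `subprocess.run` with `check=True`, so a failed download or extraction stops the pipeline; the helper name, curl's -L flag, and unzip's -o flag are my additions, not what the diff ships:

import subprocess

def sh(cmd):
    # Run a shell command and raise CalledProcessError on failure,
    # instead of silently continuing as os.system does.
    subprocess.run(cmd, shell=True, check=True)

sh("mkdir -p Graphs Reads")
sh("curl -L https://zenodo.org/api/records/10665350/files-archive -o Data.zip")
sh("unzip -o Data.zip")
sh("mv *.gfa.gz Graphs/ && mv *.fq.gz Reads/")
for q in ("Query_0.1", "Query_1", "Query_5"):
    sh(f"rm -rf {q} && tar -xf {q}.tar.gz && gunzip {q}/* && rm {q}.tar.gz")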