
Comparing changes

base repository: at-cg/minichain (base: v1.3)
head repository: at-cg/minichain (compare: master)
  • 5 commits
  • 15 files changed
  • 2 contributors

Commits on Feb 15, 2024

  1. added zenodo link for datasets

     Authored and committed by Ghanshyam Chandra on Feb 15, 2024 (SHA: 781a944)
  2. Update README.md

     Authored by gsc74 on Feb 15, 2024 (SHA: a1d2e48)
  3. Update README.md

     Authored by gsc74 on Feb 15, 2024 (SHA: 9ab3d0f)

Commits on May 20, 2024

  1. 74e16ad
  2. added plots

     Committed by gsc74 on May 20, 2024 (SHA: d21e694)
2 changes: 2 additions & 0 deletions README.md
@@ -110,6 +110,8 @@ count.

> Box plots show the levels of consistency between the haplotype recombination pairs in Minichain's output chain and the ground truth. We used different substitution rates and recombination penalties. Median values are highlighted with light green lines.
Datasets for benchmarking are available at [Zenodo](https://zenodo.org/records/10665350).

### v1.2 and earlier versions

We compared Minichain (v1.2) with existing sequence-to-graph aligners to demonstrate its gains in scalability and accuracy. Our experiments used human pangenome DAGs built from subsets of [94 high-quality haplotype assemblies](https://github.com/human-pangenomics/HPP_Year1_Assemblies) provided by the Human Pangenome Reference Consortium, and the [CHM13 human genome assembly](https://www.ncbi.nlm.nih.gov/assembly/GCA_009914755.4) provided by the Telomere-to-Telomere Consortium. Using a simulated long-read dataset with 0.5x coverage and DAGs of three different sizes, we observed superior read-mapping precision ([as shown in the figure](#Plot)). For the largest DAG, constructed from all 95 haplotypes, Minichain used 10 minutes and 25 GB of RAM with 32 threads. The scripts to reproduce this benchmark are available [here](data/v1.0).
Binary file removed data/v1.3/Graphs/MHC-CHM13.0.gfa.gz
Binary file not shown.
6 changes: 4 additions & 2 deletions data/v1.3/Map_Graph.py
@@ -197,6 +197,8 @@ def count_switches(walk):
max_ = re.findall(r'\bMax: (\d+)\b', out[5])
mean_ = re.findall(r'\bMean: (\d+)\b', out[5])
accuracy_ = re.findall(r'\bAccuracy: (\d+\.\d+)\b', out[5])
precision_ = re.findall(r'\bprecision: (\d+\.\d+)\b', out[5])
recall_ = re.findall(r'\brecall: (\d+\.\d+)\b', out[5])
recomb_ = 0
frac_seq = 0.0
identity = 0.0
@@ -213,8 +215,8 @@ def count_switches(walk):

# avoid thread collision
with open(align_dir + "/R" + r + "_" + h.split('/')[1].split('.fa')[0] + ".txt", 'w') as f:
f.write("File\tMin\tMax\tMean\tRecomb\tFracSeq\tIdentity\tTrueRecomb\tAccuracy\n")
f.write(h.split('/')[1].split('.fa')[0] + "\t" + str(min_[0]) + "\t" + str(max_[0]) + "\t" + str(mean_[0]) + "\t" + str(recomb_) + "\t" + str(frac_seq) + "\t" + str(identity) + "\t" + str(true_recomb_) + "\t" + str(accuracy_[0]) + "\n")
f.write("File\tMin\tMax\tMean\tRecomb\tFracSeq\tIdentity\tTrueRecomb\tAccuracy\tprecision\trecall\n")
f.write(h.split('/')[1].split('.fa')[0] + "\t" + str(min_[0]) + "\t" + str(max_[0]) + "\t" + str(mean_[0]) + "\t" + str(recomb_) + "\t" + str(frac_seq) + "\t" + str(identity) + "\t" + str(true_recomb_) + "\t" + str(accuracy_[0]) + "\t" + str(precision_[0]) + "\t" + str(recall_[0]) + "\n")
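The precision_ and recall_ values written here are extracted earlier in the script from the aligner's textual summary using word-boundary regexes, the same pattern as the existing Min/Max/Mean/Accuracy fields. A minimal sketch of that parsing pattern; the sample summary string below is a hypothetical stand-in for the real out[5] line:

import re

# Hypothetical summary line standing in for out[5] in Map_Graph.py.
summary = "Min: 3 Max: 57 Mean: 21 Accuracy: 0.9812 precision: 0.9731 recall: 0.9644"

# \b anchors keep "precision:" from matching inside a longer token;
# re.findall returns a list of captured strings, hence the [0] indexing
# used in the script (which assumes the field is always present).
precision_ = re.findall(r'\bprecision: (\d+\.\d+)\b', summary)
recall_ = re.findall(r'\brecall: (\d+\.\d+)\b', summary)

print(precision_[0], recall_[0])  # -> 0.9731 0.9644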



47 changes: 42 additions & 5 deletions data/v1.3/Plot.py
@@ -45,7 +45,9 @@
count_recomb[r][align].append(float(field[5]))
count_recomb[r][align].append(float(field[6]))
count_recomb[r][align].append(int(field[7]))
count_recomb[r][align].append(float(field[8].split('\n')[0]))
count_recomb[r][align].append(float(field[8]))
count_recomb[r][align].append(float(field[9]))
count_recomb[r][align].append(float(field[10].split('\n')[0]))

print("============================ Substitution rate : " + m + " ============================")
print("=======================================================================================")
@@ -59,6 +61,8 @@
accuracy_data = []
pearson_corr = []
kendall_corr = []
precision_data = []
recall_data = []

# Initialize x-tick labels
xtick_labels = []
@@ -72,6 +76,8 @@
chain_data_ = []
groud_truth_ = []
accuracy_ = []
precision_ = []
recall_ = []
mean_rec = 0.0
mean_acc = 0.0
count = 0
@@ -88,6 +94,8 @@
identity_.append(count_recomb[r][align][5])
groud_truth_.append(count_recomb[r][align][6])
accuracy_.append(count_recomb[r][align][7])
precision_.append(count_recomb[r][align][8])
recall_.append(count_recomb[r][align][9])
mean_acc += count_recomb[r][align][7]
chain_data.append(chain_data_)
recomb_data.append(recomb_)
@@ -99,16 +107,16 @@
len_ = str(len(recomb_))
if r == '2000000000':
xtick_labels.append("$\infty$")
xtick_labels_.append("Recombination penalty = $\infty$")
xtick_labels_.append("$\\rho = \infty$")
elif r == '0':
xtick_labels.append("0")
xtick_labels_.append("Recombination penalty = $0$")
xtick_labels_.append("$\\rho = 0$")
elif r == '1000':
xtick_labels.append("$10^3$")
xtick_labels_.append("$\\rho = 10^3$")
elif r == '10000':
xtick_labels.append("$10^4$")
xtick_labels_.append("Recombination penalty = $10^4$")
xtick_labels_.append("$\\rho = 10^4$")
elif r == '100000':
xtick_labels.append("$10^5$")
xtick_labels_.append("$\\rho = 10^5$")
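An aside on the label strings above (an observation, not part of the diff): in a regular Python string, `\r` inside `\rho` would be a carriage return, which is why the code doubles it as `\\rho`, while `\i` in `\infty` is an invalid escape that recent Python versions flag with a warning. Raw strings avoid both issues:

# Equivalent labels using raw strings; no doubled backslash and no
# invalid-escape warning for "\i" on recent Python versions.
xtick_labels.append(r"$\infty$")
xtick_labels_.append(r"$\rho = \infty$")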
@@ -235,4 +243,33 @@
# ax10.set_title("F1 Score vs Recombination Penalty")
plt.tight_layout()
plt.savefig(folder + "/F1_Score_vs_R.pdf", bbox_inches='tight', dpi=1200, format='pdf')
# plt.show()
# plt.show()


fig11, ax11 = plt.subplots(figsize=(4, 3))
ax11.boxplot(precision_data, showfliers=False, showmeans=False, meanline=False, medianprops={'color':'lime', 'linewidth':2.2})
# add data points
for i in range(len(precision_data)):
y = precision_data[i]
x = np.random.normal(i + 1, 0.04, size=len(y))
ax11.plot(x, y, 'r.', alpha=0.3, markersize=10)
ax11.set_xticks(np.arange(1, len(R) + 1))
ax11.set_xticklabels(xtick_labels, rotation=45)
ax11.set_xlabel("Recombination penalty")
ax11.set_ylabel("Precision")
plt.tight_layout()
plt.savefig(folder + "/Precision_vs_R.pdf", bbox_inches='tight', dpi=1200, format='pdf')

fig12, ax12 = plt.subplots(figsize=(4, 3))
ax12.boxplot(recall_data, showfliers=False, showmeans=False, meanline=False, medianprops={'color':'lime', 'linewidth':2.2})
# add data points
for i in range(len(recall_data)):
y = recall_data[i]
x = np.random.normal(i + 1, 0.04, size=len(y))
ax12.plot(x, y, 'r.', alpha=0.3, markersize=10)
ax12.set_xticks(np.arange(1, len(R) + 1))
ax12.set_xticklabels(xtick_labels, rotation=45)
ax12.set_xlabel("Recombination penalty")
ax12.set_ylabel("Recall")
plt.tight_layout()
plt.savefig(folder + "/Recall_vs_R.pdf", bbox_inches='tight', dpi=1200, format='pdf')
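The precision and recall figures above differ only in their input data, y-axis label, and output file, so the duplicated block could be collapsed into one helper. A hedged sketch of such a refactor; the function name and the standalone imports are mine, not part of the diff:

import numpy as np
import matplotlib.pyplot as plt

def jittered_boxplot(data, xtick_labels, ylabel, path):
    # Same styling as fig11/fig12: box plot with lime medians plus
    # horizontally jittered raw points so overlapping values stay visible.
    fig, ax = plt.subplots(figsize=(4, 3))
    ax.boxplot(data, showfliers=False, showmeans=False, meanline=False,
               medianprops={'color': 'lime', 'linewidth': 2.2})
    for i, y in enumerate(data):
        x = np.random.normal(i + 1, 0.04, size=len(y))
        ax.plot(x, y, 'r.', alpha=0.3, markersize=10)
    ax.set_xticks(np.arange(1, len(data) + 1))
    ax.set_xticklabels(xtick_labels, rotation=45)
    ax.set_xlabel("Recombination penalty")
    ax.set_ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(path, bbox_inches='tight', dpi=1200, format='pdf')

# jittered_boxplot(precision_data, xtick_labels, "Precision", folder + "/Precision_vs_R.pdf")
# jittered_boxplot(recall_data, xtick_labels, "Recall", folder + "/Recall_vs_R.pdf")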
Binary file removed data/v1.3/Reads/MHC_CHM13_ONT_filt.part_001.fq.gz
Binary file not shown.
Binary file removed data/v1.3/Reads/MHC_CHM13_ONT_filt.part_002.fq.gz
Binary file not shown.
Binary file removed data/v1.3/Reads/MHC_CHM13_PacBio_filt.fq.gz
Binary file not shown.
Binary file not shown.
Binary file removed data/v1.3/Reads/MHC_CHM13_PacBio_filt.part_002.fq.gz
Binary file not shown.
25 changes: 23 additions & 2 deletions data/v1.3/Reproduce.py
@@ -86,9 +86,30 @@

map_threads = 6
# Generate the graph
os.system("python3 Gen_Graph.py -t " + str(threads))
# os.system("python3 Gen_Graph.py -t " + str(threads))
# Simulate queries
os.system("source ~/.bashrc && conda activate MC && python3 Simulate_query.py -t " + str(threads))
# os.system("source ~/.bashrc && conda activate MC && python3 Simulate_query.py -t " + str(threads))

os.system("mkdir -p Graphs")
os.system("mkdir -p Reads")
os.system("curl https://zenodo.org/api/records/10665350/files-archive -o Data.zip")
os.system("unzip Data.zip")
os.system("mv *.gfa.gz Graphs/")
os.system("mv *.fq.gz Reads/")
os.system("rm -rf Query_0.1")
os.system("rm -rf Query_1")
os.system("rm -rf Query_5")
os.system("tar -xvf Query_0.1.tar.gz")
os.system("tar -xvf Query_1.tar.gz")
os.system("tar -xvf Query_5.tar.gz")
os.system("cd Query_0.1 && gunzip *")
os.system("cd Query_1 && gunzip *")
os.system("cd Query_5 && gunzip *")
os.system("rm Query_0.1.tar.gz")
os.system("rm Query_1.tar.gz")
os.system("rm Query_5.tar.gz")
os.system("echo $(pwd)")

# Map the queries
os.system("source ~/.bashrc && conda activate MC && python3 Map_Graph.py -t " + str(threads))
# Map the reads
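The download-and-unpack sequence above chains many `os.system` calls, which silently ignore non-zero exit codes. A hedged alternative sketch using `subprocess.run` with `check=True`, so a failed download or extraction stops the pipeline; the helper name, curl's -L flag, and unzip's -o flag are my additions, not what the diff ships:

import subprocess

def sh(cmd):
    # Run a shell command and raise CalledProcessError on failure,
    # instead of silently continuing as os.system does.
    subprocess.run(cmd, shell=True, check=True)

sh("mkdir -p Graphs Reads")
sh("curl -L https://zenodo.org/api/records/10665350/files-archive -o Data.zip")
sh("unzip -o Data.zip")
sh("mv *.gfa.gz Graphs/ && mv *.fq.gz Reads/")
for q in ("Query_0.1", "Query_1", "Query_5"):
    sh(f"rm -rf {q} && tar -xf {q}.tar.gz && gunzip {q}/* && rm {q}.tar.gz")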