From 4207933d49988f1c17ed400ebc713c3eb38cf74d Mon Sep 17 00:00:00 2001 From: Martin Sosic Date: Thu, 1 Dec 2016 23:32:06 +0100 Subject: [PATCH] Fixed bug with band being reduced to aggresively. --- edlib/src/edlib.cpp | 57 +++++++++++++++++++++++++++------------------ test/runTests.cpp | 54 ++++++++++++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 38 deletions(-) diff --git a/edlib/src/edlib.cpp b/edlib/src/edlib.cpp index 8c52204..0a25395 100644 --- a/edlib/src/edlib.cpp +++ b/edlib/src/edlib.cpp @@ -684,7 +684,7 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks, k = min(k, bl->score + max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1) + (lastBlock == maxNumBlocks - 1 ? W : 0)); - + //---------- Adjust number of blocks according to Ukkonen ----------// //--- Adjust last block ---// // If block is not beneath band, calculate next block. Only next because others are certainly beneath band. @@ -700,12 +700,14 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks, hout = newHout; } - // While block is out of band, move one block up. - This is optimal now, by my formula. - // NOTICE: I added + W, and now it works! This has to be added because query is padded with W cells. + // While block is out of band, move one block up. + // NOTE: Condition used here is more loose than the one from the article, since I simplified the max() part of it. + // I could consider adding that max part, for optimal performance. while (lastBlock >= firstBlock && (bl->score >= k + WORD_SIZE - || ((lastBlock + 1) * WORD_SIZE - 1 > - k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + W))) { + || ((lastBlock + 1) * WORD_SIZE - 1 > + // TODO: Does not work if do not put +1! Why??? + k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) { lastBlock--; bl--; } //-------------------------// @@ -720,19 +722,20 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks, } //--------------------------/ - + // TODO: consider if this part is useful, it does not seem to help much if (c % STRONG_REDUCE_NUM == 0) { // Every some columns do more expensive but more efficient reduction while (lastBlock >= firstBlock) { // If all cells outside of band, remove block vector scores = getBlockCellValues(*bl); - int r = (lastBlock + 1) * WORD_SIZE - 1; + int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE; + int r = lastBlock * WORD_SIZE + numCells - 1; bool reduce = true; - for (int i = 0; i < WORD_SIZE; i++) { + for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) { // TODO: Does not work if do not put +1! Why??? - if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + W + 1) { + if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + 1) { reduce = false; - break; + break; } r--; } @@ -743,9 +746,10 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks, while (firstBlock <= lastBlock) { // If all cells outside of band, remove block vector scores = getBlockCellValues(blocks[firstBlock]); - int r = (firstBlock + 1) * WORD_SIZE - 1; + int numCells = firstBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE; + int r = firstBlock * WORD_SIZE + numCells - 1; bool reduce = true; - for (int i = 0; i < WORD_SIZE; i++) { + for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) { if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) { reduce = false; break; @@ -780,7 +784,6 @@ static int myersCalcEditDistanceNW(Word* Peq, int W, int maxNumBlocks, } } //----------------------------------------------------------// - //---- If this is stop column, save it and finish ----// if (c == targetStopPosition) { for (int b = firstBlock; b <= lastBlock; b++) { @@ -1125,23 +1128,31 @@ static int obtainAlignmentHirschberg( // Calculate left half. AlignmentData* alignDataLeftHalf = NULL; - myersCalcEditDistanceNW(Peq, W, maxNumBlocks, - query, queryLength, - target, targetLength, - alphabetLength, bestScore, - &score_, &endLocation_, false, &alignDataLeftHalf, leftHalfWidth - 1); + int leftHalfCalcStatus = myersCalcEditDistanceNW( + Peq, W, maxNumBlocks, + query, queryLength, + target, targetLength, + alphabetLength, bestScore, + &score_, &endLocation_, false, &alignDataLeftHalf, leftHalfWidth - 1); // Calculate right half. AlignmentData* alignDataRightHalf = NULL; - myersCalcEditDistanceNW(rPeq, W, maxNumBlocks, - rQuery, queryLength, - rTarget, targetLength, - alphabetLength, bestScore, - &score_, &endLocation_, false, &alignDataRightHalf, rightHalfWidth - 1); + int rightHalfCalcStatus = myersCalcEditDistanceNW( + rPeq, W, maxNumBlocks, + rQuery, queryLength, + rTarget, targetLength, + alphabetLength, bestScore, + &score_, &endLocation_, false, &alignDataRightHalf, rightHalfWidth - 1); delete[] Peq; delete[] rPeq; + if (leftHalfCalcStatus == EDLIB_STATUS_ERROR || rightHalfCalcStatus == EDLIB_STATUS_ERROR) { + if (alignDataLeftHalf) delete alignDataLeftHalf; + if (alignDataRightHalf) delete alignDataRightHalf; + return EDLIB_STATUS_ERROR; + } + // Unwrap the left half. int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0]; int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0]; diff --git a/test/runTests.cpp b/test/runTests.cpp index a14a3d1..d78fe9c 100644 --- a/test/runTests.cpp +++ b/test/runTests.cpp @@ -322,19 +322,19 @@ bool test6() { int targetLength = 420; char query[13] = {1,3,0,1,1,1,3,0,1,3,1,3,3}; char target[420] = {0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3, - 3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3, - 1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1, - 3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3, - 3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0, - 1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2, - 3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3, - 0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3, - 2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1, - 3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2, - 3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0, - 1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2, - 2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1, - 0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1}; + 3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3, + 1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1, + 3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3, + 3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0, + 1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2, + 3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3, + 0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3, + 2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1, + 3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2, + 3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0, + 1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2, + 2,3,2,3,3,1,0,1,1,1,0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1,0,1,1,1, + 0,1,3,0,1,3,3,3,1,3,2,2,3,2,3,3,1}; bool r = executeTest(query, queryLength, target, targetLength, EDLIB_MODE_HW); r = r && executeTest(query, queryLength, target, targetLength, EDLIB_MODE_NW); @@ -367,6 +367,28 @@ bool test8() { return r; } +bool test9() { + int queryLength = 64; + int targetLength = 393; + char query[64] = {9,5,5,9,9,4,6,0,1,1,5,4,6,0,6,5,5,6,5,2,2,0,6,0,8,3,7,0,6,6,4,8,3,1,9,4,5,5,5,7,8,2, + 3,6,4,1,1,2,7,7,6,0,9,2,0,9,6,9,9,4,6,5,2,9}; + char target[393] = {7,1,6,2,9,1,1,7,5,5,4,9,6,7,3,4,6,9,4,5,2,6,6,0,7,8,4,3,3,9,5,2,0,1,7,1,4,0,9,9,7, + 5,0,6,2,4,0,9,3,6,6,7,4,3,9,3,3,4,7,8,5,4,1,7,7,0,9,3,0,8,4,0,3,4,6,7,0,8,6,6,6,5, + 5,2,0,5,5,3,1,4,1,6,8,4,3,7,6,2,0,9,0,4,9,5,1,5,3,1,3,1,9,9,6,5,1,8,0,6,1,1,1,5,9, + 1,1,2,1,8,5,1,7,7,8,6,5,9,1,0,2,4,1,2,5,0,9,6,8,1,4,2,4,5,9,3,9,0,5,0,8,0,3,7,0,1, + 3,5,0,6,5,5,2,8,9,7,0,8,5,1,9,0,3,3,7,2,6,6,4,3,8,5,6,2,2,6,5,8,3,8,4,0,3,7,8,2,6, + 9,0,2,0,1,2,5,6,1,9,4,8,3,7,8,8,5,2,3,1,8,1,6,6,7,6,9,6,5,3,3,6,5,7,8,6,1,3,4,2,4, + 0,0,7,7,1,8,5,3,3,6,1,4,5,7,3,1,8,0,8,1,5,6,6,2,4,4,3,9,8,7,3,8,0,3,8,1,3,3,4,6,1, + 8,2,6,7,5,8,6,7,8,7,4,5,6,6,9,0,1,1,1,9,4,9,1,9,9,2,2,4,8,0,6,6,4,4,4,2,2,2,9,3,1, + 6,8,7,2,9,8,6,0,1,7,7,2,8,6,2,2,1,6,0,3,4,9,8,9,3,2,3,5,3,6,6,9,6,6,2,6,6,0,8,7,9, + 5,9,7,4,3,1,7,2,1,0,6,0,0,7,5,2,1,2,6,9,1,5,6,7}; + + bool r = executeTest(query, queryLength, target, targetLength, EDLIB_MODE_HW); + r = r && executeTest(query, queryLength, target, targetLength, EDLIB_MODE_NW); + r = r && executeTest(query, queryLength, target, targetLength, EDLIB_MODE_SHW); + return r; +} + bool testCigar() { unsigned char alignment[] = {EDLIB_EDOP_MATCH, EDLIB_EDOP_MATCH, EDLIB_EDOP_INSERT, EDLIB_EDOP_INSERT, EDLIB_EDOP_INSERT, EDLIB_EDOP_DELETE, EDLIB_EDOP_INSERT, EDLIB_EDOP_INSERT, @@ -378,6 +400,7 @@ bool testCigar() { pass = false; printf("Expected %s, got %s\n", expected, cigar); } + printf("Cigar extended: "); printf(pass ? "\x1B[32m""OK""\x1B[0m\n" : "\x1B[31m""FAIL""\x1B[0m\n"); if (cigar) free(cigar); @@ -388,6 +411,7 @@ bool testCigar() { pass = false; printf("Expected %s, got %s\n", expected2, cigar); } + printf("Cigar standard: "); printf(pass ? "\x1B[32m""OK""\x1B[0m\n" : "\x1B[31m""FAIL""\x1B[0m\n"); if (cigar) free(cigar); @@ -396,9 +420,9 @@ bool testCigar() { bool runTests() { // TODO: make this global vector where tests have to add themselves. - int numTests = 9; + int numTests = 10; bool (* tests [])() = {test1, test2, test3, test4, test5, test6, - test7, test8, testCigar}; + test7, test8, test9, testCigar}; bool allTestsPassed = true; for (int i = 0; i < numTests; i++) {