Merge pull request #16 from hundun000/master

update DFA：可返回匹配文本对应的startIndex
noahzark · Jul 14, 2022 · a1198c9 · a1198c9
2 parents 37b3b34 + 7bbb2a7
commit a1198c9
Show file tree

Hide file tree

Showing 2 changed files with 137 additions and 12 deletions.
diff --git a/src/main/java/com/centaurstech/algorithm/DFAFilter.java b/src/main/java/com/centaurstech/algorithm/DFAFilter.java
@@ -12,6 +12,22 @@
  */
 public class DFAFilter {
 
+    public static class DfaResultNode {
+        private final String text;
+        private final int startIndex;
+
+        public DfaResultNode(String text, int startIndex) {
+            super();
+            this.text = text;
+            this.startIndex = startIndex;
+        }
+        public String getText() {
+            return text;
+        }
+        public int getStartIndex() {
+            return startIndex;
+        }
+    }
 
     public enum MatchType {
         /**
@@ -224,38 +240,51 @@ public List<String> getSensitiveWordList(String txt, MatchType matchType) {
     }
 
     /**
+     * @deprecated use {@link #getDfaResult(String, MatchType, Integer, Integer)}
      * 计算txt中的所有敏感词。结果排序与该敏感词在原文中的起始位置一致。
      * @param lengthFilterFloor include; 不限制则传null;
      * @param lengthFilterCeiling exclude; 不限制则传null;
      */
+    @Deprecated
     public List<String> getSensitiveWordList(String txt, MatchType matchType, Integer lengthFilterFloor, Integer lengthFilterCeiling) {
-
-        List<String> result = new ArrayList<>();
+        List<DfaResultNode> dfaResultNodes = getDfaResult(txt, matchType, lengthFilterFloor, lengthFilterCeiling);
+        return dfaResultNodes.stream().map(node -> node.getText()).collect(Collectors.toList());
+    }
+
+    /**
+     * 计算txt中的所有敏感词。结果包含其在原串的起始位置。同一个敏感词出现于不同位置时得到多个结果。
+     * @param lengthFilterFloor include; 不限制则传null;
+     * @param lengthFilterCeiling exclude; 不限制则传null;
+     * @return not null
+     */
+    public List<DfaResultNode> getDfaResult(String txt, MatchType matchType, Integer lengthFilterFloor, Integer lengthFilterCeiling) {
+        List<DfaResultNode> result = new ArrayList<>();
 
         for (int i = 0; i < txt.length(); i++) {
 
-            List<String> wordsFromBeginIndex = getSensitiveWordsFromBeginIndex(txt, i);
+            List<DfaResultNode> wordsFromBeginIndex = getDfaResultNodesFromBeginIndex(txt, i);
 
-            wordsFromBeginIndex.removeIf(word -> {
+            wordsFromBeginIndex.removeIf(node -> {
+                String word = node.getText();
                 boolean floorMatch = lengthFilterFloor == null || word.length() >= lengthFilterFloor;
                 boolean ceilingMatch = lengthFilterCeiling == null || word.length() < lengthFilterCeiling;
                 return !floorMatch || !ceilingMatch;
             });
 
 
             if (wordsFromBeginIndex.size() > 0) {
-                wordsFromBeginIndex.sort(Comparator.comparingInt(String::length));
+                wordsFromBeginIndex.sort(Comparator.comparingInt(node -> node.getText().length()));
 
                 switch (matchType) {
                     case MIN:
                         result.add(wordsFromBeginIndex.get(0));
                         // skip some length
-                        i = i + wordsFromBeginIndex.get(0).length() - 1;
+                        i = i + wordsFromBeginIndex.get(0).getText().length() - 1;
                         break;
                     case MAX:
                         result.add(wordsFromBeginIndex.get(wordsFromBeginIndex.size() - 1));
                         // skip some length
-                        i = i + wordsFromBeginIndex.get(wordsFromBeginIndex.size() - 1).length() - 1;
+                        i = i + wordsFromBeginIndex.get(wordsFromBeginIndex.size() - 1).getText().length() - 1;
                         break;
                     case ALL:    
                         result.addAll(wordsFromBeginIndex);
@@ -339,7 +368,7 @@ public String replaceSensitiveWord(String txt, String replaceStr) {
     /**
      * 计算input中以beginIndex为起点的所有敏感词
      */
-    private List<String> getSensitiveWordsFromBeginIndex(String txt, int beginIndex) {
+    private List<DfaResultNode> getDfaResultNodesFromBeginIndex(String input, int beginIndex) {
 
         List<Integer> matchedLengthList = new ArrayList<>(1);
 
@@ -348,8 +377,8 @@ private List<String> getSensitiveWordsFromBeginIndex(String txt, int beginIndex)
         Character checkingChar;
         DFANode currentNode = sensitiveWordMapRoot;
 
-        for (int i = beginIndex; i < txt.length(); i++) {
-            checkingChar = txt.charAt(i);
+        for (int i = beginIndex; i < input.length(); i++) {
+            checkingChar = input.charAt(i);
             currentNode = currentNode.get(checkingChar);
             if (currentNode != null) {
                 matchedLength++;
@@ -360,8 +389,11 @@ private List<String> getSensitiveWordsFromBeginIndex(String txt, int beginIndex)
                 break;
             }
         }
-        List<String> result = matchedLengthList.stream()
-                .map(length -> txt.substring(beginIndex, beginIndex + length))
+        List<DfaResultNode> result = matchedLengthList.stream()
+                .map(length -> {
+                    String text = input.substring(beginIndex, beginIndex + length);
+                    return new DfaResultNode(text, beginIndex);
+                })
                 .collect(Collectors.toList());
 
         return result;

diff --git a/src/test/java/com/centaurstech/algorithm/DFAFilterTest.java b/src/test/java/com/centaurstech/algorithm/DFAFilterTest.java
@@ -3,6 +3,7 @@
 import org.junit.Test;
 
 import com.centaurstech.algorithm.DFAFilter.MatchType;
+import com.centaurstech.algorithm.DFAFilter.DfaResultNode;
 
 import static org.junit.Assert.assertEquals;
 
@@ -15,6 +16,98 @@ public class DFAFilterTest {
 
     @Test
     public void testDFAFilter() {
+        Set<String> sensitiveWordSet = new HashSet<>();
+        sensitiveWordSet.add("foo");
+        sensitiveWordSet.add("工程师foo");
+        sensitiveWordSet.add("bar");
+        //初始化敏感词库
+
+        DFAFilter dfaFilter = DFAFilter.fromWordSet(sensitiveWordSet);
+
+        String txt = "测试工程师foo来到bar";
+        String notMatchedTxt = "测试工程师boo来到far";
+
+        List<DfaResultNode> assertList;
+
+        // ------ MAX ------
+        // without length filter
+        assertList = Arrays.asList(
+                new DfaResultNode("工程师foo", 2), 
+                new DfaResultNode("bar", 10)
+                );
+        assertEquals(true, dfaFilter.contains(txt, MatchType.MAX, null, null));
+        assertEquals(false, dfaFilter.contains(notMatchedTxt, MatchType.MAX, null, null));
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MAX, null, null));
+        // with length filter
+        assertList = Arrays.asList(
+                new DfaResultNode("工程师foo", 2), 
+                new DfaResultNode("bar", 10)
+                );
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MAX, 4, null));
+        assertList = Arrays.asList(
+                new DfaResultNode("foo", 5), 
+                new DfaResultNode("bar", 10)
+                );
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MAX, null, 4));
+        assertList = Arrays.asList(
+                new DfaResultNode("工程师foo", 2), 
+                new DfaResultNode("bar", 10)
+                );
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MAX, 4, 6));
+
+        // ------ MIN ------
+        // without length filter
+        assertList = Arrays.asList(
+                new DfaResultNode("foo", 5), 
+                new DfaResultNode("bar", 10)
+                );
+        assertEquals(true, dfaFilter.contains(txt, MatchType.MIN, null, null));
+        assertEquals(false, dfaFilter.contains(notMatchedTxt, MatchType.MIN, null, null));
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MIN, null, null));
+        // with length filter
+        assertList = Arrays.asList();
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MIN, 4, null));
+        assertList = Arrays.asList(
+                new DfaResultNode("foo", 5)
+                , new DfaResultNode("bar", 10)
+                );
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MIN, null, 4));
+        assertList = Arrays.asList();
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MIN, 4, 6));
+
+        // ------ ALL ------
+        // without length filter
+        assertList = Arrays.asList(
+                new DfaResultNode("工程师foo", 2), 
+                new DfaResultNode("foo", 5), 
+                new DfaResultNode("bar", 10)
+                );
+        assertEquals(true, dfaFilter.contains(txt, MatchType.ALL, null, null));
+        assertEquals(false, dfaFilter.contains(notMatchedTxt, MatchType.ALL, null, null));
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.ALL, null, null));
+        // with length filter
+        assertList = Arrays.asList(
+                new DfaResultNode("工程师foo", 2)
+                );
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.ALL, 4, null));
+        assertList = Arrays.asList(
+                new DfaResultNode("foo", 5), 
+                new DfaResultNode("bar", 10)
+                );
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.ALL, null, 4));
+        assertList = Arrays.asList(
+                new DfaResultNode("工程师foo", 2)
+                );
+        assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.ALL, 4, 6));
+    }
+
+    private static <T> boolean assertListContentToStringEquals(List<T> list1, List<T> list2) {
+        return list1.toString().equals(list2.toString());
+    }
+
+    @SuppressWarnings("deprecation")
+    @Test
+    public void testDFAFilterLegacy() {
         Set<String> sensitiveWordSet = new HashSet<>();
         sensitiveWordSet.add("white");
         sensitiveWordSet.add("album");