Skip to content

Commit

Permalink
Merge pull request #16 from hundun000/master
Browse files Browse the repository at this point in the history
update DFA:可返回匹配文本对应的startIndex
  • Loading branch information
noahzark authored Jul 14, 2022
2 parents 37b3b34 + 7bbb2a7 commit a1198c9
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 12 deletions.
56 changes: 44 additions & 12 deletions src/main/java/com/centaurstech/algorithm/DFAFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,22 @@
*/
public class DFAFilter {

/**
 * Immutable value describing a single match: the matched text and the index in the
 * scanned input at which that text starts.
 *
 * <p>Two nodes are equal when both their text and their start index are equal, so
 * result lists can be compared with {@code equals} and printed readably.
 */
public static class DfaResultNode {
    private final String text;
    private final int startIndex;

    /**
     * @param text the matched text
     * @param startIndex index of the first character of {@code text} in the scanned input
     */
    public DfaResultNode(String text, int startIndex) {
        this.text = text;
        this.startIndex = startIndex;
    }

    /** @return the matched text */
    public String getText() {
        return text;
    }

    /** @return index of the first character of the match in the scanned input */
    public int getStartIndex() {
        return startIndex;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof DfaResultNode)) {
            return false;
        }
        DfaResultNode other = (DfaResultNode) obj;
        // text may be null because the constructor does not reject it
        boolean sameText = text == null ? other.text == null : text.equals(other.text);
        return sameText && startIndex == other.startIndex;
    }

    @Override
    public int hashCode() {
        return 31 * (text == null ? 0 : text.hashCode()) + startIndex;
    }

    @Override
    public String toString() {
        return "DfaResultNode{text='" + text + "', startIndex=" + startIndex + "}";
    }
}

public enum MatchType {
/**
Expand Down Expand Up @@ -224,38 +240,51 @@ public List<String> getSensitiveWordList(String txt, MatchType matchType) {
}

/**
 * Finds the sensitive words in {@code txt} and returns only their text. The results are
 * ordered by the position at which each match starts in {@code txt}.
 *
 * @param txt the text to scan
 * @param matchType which of the matches found at a given start position are kept
 * @param lengthFilterFloor minimum word length, inclusive; pass {@code null} for no lower bound
 * @param lengthFilterCeiling maximum word length, exclusive; pass {@code null} for no upper bound
 * @return the text of every match, in the order produced by
 *     {@link #getDfaResult(String, MatchType, Integer, Integer)}
 * @deprecated use {@link #getDfaResult(String, MatchType, Integer, Integer)}, which also
 *     reports the start index of each match
 */
@Deprecated
public List<String> getSensitiveWordList(String txt, MatchType matchType, Integer lengthFilterFloor, Integer lengthFilterCeiling) {
    // Delegate to the index-aware API and drop the start index from each result.
    return getDfaResult(txt, matchType, lengthFilterFloor, lengthFilterCeiling).stream()
            .map(DfaResultNode::getText)
            .collect(Collectors.toList());
}

/**
* 计算txt中的所有敏感词。结果包含其在原串的起始位置。同一个敏感词出现于不同位置时得到多个结果。
* @param lengthFilterFloor include; 不限制则传null;
* @param lengthFilterCeiling exclude; 不限制则传null;
* @return not null
*/
public List<DfaResultNode> getDfaResult(String txt, MatchType matchType, Integer lengthFilterFloor, Integer lengthFilterCeiling) {
List<DfaResultNode> result = new ArrayList<>();

for (int i = 0; i < txt.length(); i++) {

List<String> wordsFromBeginIndex = getSensitiveWordsFromBeginIndex(txt, i);
List<DfaResultNode> wordsFromBeginIndex = getDfaResultNodesFromBeginIndex(txt, i);

wordsFromBeginIndex.removeIf(word -> {
wordsFromBeginIndex.removeIf(node -> {
String word = node.getText();
boolean floorMatch = lengthFilterFloor == null || word.length() >= lengthFilterFloor;
boolean ceilingMatch = lengthFilterCeiling == null || word.length() < lengthFilterCeiling;
return !floorMatch || !ceilingMatch;
});


if (wordsFromBeginIndex.size() > 0) {
wordsFromBeginIndex.sort(Comparator.comparingInt(String::length));
wordsFromBeginIndex.sort(Comparator.comparingInt(node -> node.getText().length()));

switch (matchType) {
case MIN:
result.add(wordsFromBeginIndex.get(0));
// skip some length
i = i + wordsFromBeginIndex.get(0).length() - 1;
i = i + wordsFromBeginIndex.get(0).getText().length() - 1;
break;
case MAX:
result.add(wordsFromBeginIndex.get(wordsFromBeginIndex.size() - 1));
// skip some length
i = i + wordsFromBeginIndex.get(wordsFromBeginIndex.size() - 1).length() - 1;
i = i + wordsFromBeginIndex.get(wordsFromBeginIndex.size() - 1).getText().length() - 1;
break;
case ALL:
result.addAll(wordsFromBeginIndex);
Expand Down Expand Up @@ -339,7 +368,7 @@ public String replaceSensitiveWord(String txt, String replaceStr) {
/**
* 计算txt中,以beginIndex为起点,的所有敏感词
*/
private List<String> getSensitiveWordsFromBeginIndex(String txt, int beginIndex) {
private List<DfaResultNode> getDfaResultNodesFromBeginIndex(String input, int beginIndex) {

List<Integer> matchedLengthList = new ArrayList<>(1);

Expand All @@ -348,8 +377,8 @@ private List<String> getSensitiveWordsFromBeginIndex(String txt, int beginIndex)
Character checkingChar;
DFANode currentNode = sensitiveWordMapRoot;

for (int i = beginIndex; i < txt.length(); i++) {
checkingChar = txt.charAt(i);
for (int i = beginIndex; i < input.length(); i++) {
checkingChar = input.charAt(i);
currentNode = currentNode.get(checkingChar);
if (currentNode != null) {
matchedLength++;
Expand All @@ -360,8 +389,11 @@ private List<String> getSensitiveWordsFromBeginIndex(String txt, int beginIndex)
break;
}
}
List<String> result = matchedLengthList.stream()
.map(length -> txt.substring(beginIndex, beginIndex + length))
List<DfaResultNode> result = matchedLengthList.stream()
.map(length -> {
String text = input.substring(beginIndex, beginIndex + length);
return new DfaResultNode(text, beginIndex);
})
.collect(Collectors.toList());

return result;
Expand Down
93 changes: 93 additions & 0 deletions src/test/java/com/centaurstech/algorithm/DFAFilterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.junit.Test;

import com.centaurstech.algorithm.DFAFilter.MatchType;
import com.centaurstech.algorithm.DFAFilter.DfaResultNode;

import static org.junit.Assert.assertEquals;

Expand All @@ -15,6 +16,98 @@ public class DFAFilterTest {

/**
 * Exercises {@code getDfaResult} for every {@code MatchType}, with and without the
 * length filter, and checks both the matched text and its start index.
 *
 * <p>Expected values follow the filter rules: the floor is inclusive, the ceiling is
 * exclusive, and MIN/MAX continue scanning after the end of the match they picked.
 */
@Test
public void testDFAFilter() {
    Set<String> sensitiveWordSet = new HashSet<>();
    sensitiveWordSet.add("foo");
    sensitiveWordSet.add("工程师foo");
    sensitiveWordSet.add("bar");

    // Build the filter from the sensitive-word set.
    DFAFilter dfaFilter = DFAFilter.fromWordSet(sensitiveWordSet);

    // In txt, "工程师foo" (length 6) starts at index 2, "foo" (length 3) at index 5
    // and "bar" (length 3) at index 10.
    String txt = "测试工程师foo来到bar";
    String notMatchedTxt = "测试工程师boo来到far";

    List<DfaResultNode> assertList;

    // ------ MAX ------
    // without length filter
    assertList = Arrays.asList(
            new DfaResultNode("工程师foo", 2),
            new DfaResultNode("bar", 10)
    );
    assertEquals(true, dfaFilter.contains(txt, MatchType.MAX, null, null));
    assertEquals(false, dfaFilter.contains(notMatchedTxt, MatchType.MAX, null, null));
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MAX, null, null));
    // with length filter
    // floor 4: "bar" (length 3) is filtered out
    assertList = Arrays.asList(
            new DfaResultNode("工程师foo", 2)
    );
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MAX, 4, null));
    // ceiling 4: "工程师foo" is filtered out, so "foo" is found on its own at index 5
    assertList = Arrays.asList(
            new DfaResultNode("foo", 5),
            new DfaResultNode("bar", 10)
    );
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MAX, null, 4));
    // floor 4, ceiling 6 (exclusive): length 6 and length 3 are both outside [4, 6)
    assertList = Arrays.asList();
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MAX, 4, 6));

    // ------ MIN ------
    // without length filter: the only word starting at index 2 is "工程师foo"; scanning
    // resumes after it, so the embedded "foo" at index 5 is not reported
    assertList = Arrays.asList(
            new DfaResultNode("工程师foo", 2),
            new DfaResultNode("bar", 10)
    );
    assertEquals(true, dfaFilter.contains(txt, MatchType.MIN, null, null));
    assertEquals(false, dfaFilter.contains(notMatchedTxt, MatchType.MIN, null, null));
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MIN, null, null));
    // with length filter
    assertList = Arrays.asList(
            new DfaResultNode("工程师foo", 2)
    );
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MIN, 4, null));
    assertList = Arrays.asList(
            new DfaResultNode("foo", 5),
            new DfaResultNode("bar", 10)
    );
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MIN, null, 4));
    assertList = Arrays.asList();
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.MIN, 4, 6));

    // ------ ALL ------
    // without length filter
    assertList = Arrays.asList(
            new DfaResultNode("工程师foo", 2),
            new DfaResultNode("foo", 5),
            new DfaResultNode("bar", 10)
    );
    assertEquals(true, dfaFilter.contains(txt, MatchType.ALL, null, null));
    assertEquals(false, dfaFilter.contains(notMatchedTxt, MatchType.ALL, null, null));
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.ALL, null, null));
    // with length filter
    assertList = Arrays.asList(
            new DfaResultNode("工程师foo", 2)
    );
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.ALL, 4, null));
    assertList = Arrays.asList(
            new DfaResultNode("foo", 5),
            new DfaResultNode("bar", 10)
    );
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.ALL, null, 4));
    // floor 4, ceiling 6 (exclusive): nothing has a length in [4, 6)
    assertList = Arrays.asList();
    assertListContentToStringEquals(assertList, dfaFilter.getDfaResult(txt, MatchType.ALL, 4, 6));
}

/**
 * Asserts that both lists have the same content in the same order, failing the test on
 * a mismatch.
 *
 * @param list1 the expected elements
 * @param list2 the actual elements
 * @return always {@code true}; a mismatch fails via {@code assertEquals} instead of
 *     returning {@code false}, so callers that ignore the result are still checked
 */
private static <T> boolean assertListContentToStringEquals(List<T> list1, List<T> list2) {
    assertEquals(renderForComparison(list1), renderForComparison(list2));
    return true;
}

/**
 * Renders a list as text for comparison. {@code DfaResultNode} elements are rendered
 * from their text and start index so the comparison does not rely on the node's own
 * {@code toString()}; every other element is rendered with {@code String.valueOf}.
 */
private static String renderForComparison(List<?> list) {
    StringBuilder rendered = new StringBuilder("[");
    for (Object item : list) {
        if (rendered.length() > 1) {
            rendered.append(", ");
        }
        if (item instanceof DfaResultNode) {
            DfaResultNode node = (DfaResultNode) item;
            rendered.append(node.getText()).append('@').append(node.getStartIndex());
        } else {
            rendered.append(String.valueOf(item));
        }
    }
    return rendered.append(']').toString();
}

@SuppressWarnings("deprecation")
@Test
public void testDFAFilterLegacy() {
Set<String> sensitiveWordSet = new HashSet<>();
sensitiveWordSet.add("white");
sensitiveWordSet.add("album");
Expand Down

0 comments on commit a1198c9

Please sign in to comment.