Skip to content

Commit

Permalink
演示自定义停用词Filter过滤器
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Aug 4, 2018
1 parent d6f3a69 commit 48809ba
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 5 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,5 @@ pyhanlp/static/hanlp.properties
tmp
*.swp
*.swo
/pyhanlp/static/*.java
/pyhanlp/static/*.class
39 changes: 34 additions & 5 deletions tests/demos/demo_stopword.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,49 @@
# # -*- coding:utf-8 -*-
# Author:wancong
# Date: 2018-04-30

# 在import pyhanlp之前编译自己的Java class,并放入pyhanlp/static中
import os

from pyhanlp.static import STATIC_ROOT, HANLP_JAR_PATH

import subprocess

# Java source of a custom stop-word Filter. shouldInclude() keeps every term
# whose nature starts with 'm' (numerals) and otherwise keeps a term only if
# its word is not contained in CoreStopWordDictionary.
java_code = """
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.dictionary.stopword.Filter;
import com.hankcs.hanlp.seg.common.Term;
public class MyFilter implements Filter
{
public boolean shouldInclude(Term term)
{
if (term.nature.startsWith('m')) return true; // 数词保留
return !CoreStopWordDictionary.contains(term.word); // 停用词过滤
}
}
"""

# Write the source next to pyhanlp's static resources.
# NOTE(review): the file is written with the platform default encoding and
# javac decodes it with its own default; the non-ASCII text only appears in
# Java comments, but confirm the two defaults agree on non-UTF-8 systems.
java_code_path = os.path.join(STATIC_ROOT, 'MyFilter.java')
with open(java_code_path, 'w') as out:
    out.write(java_code)

# Compile MyFilter.java against the HanLP jar and emit MyFilter.class into
# STATIC_ROOT (presumably on the classpath of the JVM that pyhanlp starts --
# TODO confirm), where it is later loaded by name via JClass('MyFilter').
#
# The arguments are passed as a list rather than a formatted shell string so
# that paths containing spaces or shell metacharacters reach javac intact.
# check_call raises subprocess.CalledProcessError when javac exits non-zero
# (and OSError when javac is not installed), instead of silently continuing
# and failing later with a confusing class-loading error.
subprocess.check_call(
    ['javac', '-cp', HANLP_JAR_PATH, java_code_path, '-d', STATIC_ROOT]
)
# 编译结束才可以启动hanlp
from pyhanlp import *


def demo_stopword():
""" 演示文本分类最基本的调用方式
中文情感挖掘语料-ChnSentiCorp 谭松波
TO-DO: 还可以自定义过滤逻辑
"""
>>> demo_stopword()
[小区/n, 反对/v, 喂养/v, 流浪猫/nz, 赞成/v, 喂养/v, 小宝贝/nz]
[小区/n, 居民/n, 反对/v, 喂养/v, 流浪猫/nz, 居民/n, 赞成/v, 喂养/v, 小宝贝/nz]
[小区/n, 居民/n, 有/vyou, 的/ude1, 反对/v, 喂养/v, 流浪猫/nz, ,/w, 而/cc, 有的/rz, 居民/n, 却/d, 赞成/v, 喂养/v, 这些/rz, 小宝贝/nz]
[小区/n, 居民/n, 反对/v, 喂养/v, 流浪猫/nz, 居民/n, 赞成/v, 喂养/v, 小宝贝/nz]
[数字/n, 123/m, 保留/v]
"""
CoreStopWordDictionary = JClass("com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary")
Filter = JClass("com.hankcs.hanlp.dictionary.stopword.Filter")
Term = JClass("com.hankcs.hanlp.seg.common.Term")
BasicTokenizer = JClass("com.hankcs.hanlp.tokenizer.BasicTokenizer")
NotionalTokenizer =JClass("com.hankcs.hanlp.tokenizer.NotionalTokenizer")
NotionalTokenizer = JClass("com.hankcs.hanlp.tokenizer.NotionalTokenizer")

text = "小区居民有的反对喂养流浪猫,而有的居民却赞成喂养这些小宝贝"
# 可以动态修改停用词词典
Expand All @@ -35,7 +58,13 @@ def demo_stopword():
CoreStopWordDictionary.apply(term_list)
print(term_list)

# 还可以自定义过滤逻辑
MyFilter = JClass('MyFilter')
CoreStopWordDictionary.FILTER = MyFilter()
print(NotionalTokenizer.segment("数字123的保留")) # “的”位于stopwords.txt所以被过滤,数字得到保留


if __name__ == "__main__":
    # Script entry point: run every doctest found in this module and echo
    # each example as it is checked.
    from doctest import testmod

    testmod(verbose=True)

0 comments on commit 48809ba

Please sign in to comment.