From efc784312c5fc6f7b605e2a8166679442d7ae45a Mon Sep 17 00:00:00 2001
From: Sun Junyi
Date: Mon, 1 Jul 2013 10:53:39 +0800
Subject: [PATCH] add ChineseAnalyzer for whoosh search engine

---
 jieba/analyse/__init__.py |  1 +
 jieba/analyse/analyzer.py | 33 ++++++++++++++++++++++
 test/test_whoosh.py       | 59 +++++++++++++++++++++++++++++++++++++++
 test/test_whoosh_flie.py  | 38 +++++++++++++++++++++++++
 4 files changed, 131 insertions(+)
 create mode 100644 jieba/analyse/analyzer.py
 create mode 100644 test/test_whoosh.py
 create mode 100644 test/test_whoosh_flie.py

diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py
index 667251c..c002e33 100644
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -1,5 +1,6 @@
 import jieba
 import os
+from analyzer import ChineseAnalyzer
 _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
 f_name = os.path.join(_curpath,"idf.txt")
diff --git a/jieba/analyse/analyzer.py b/jieba/analyse/analyzer.py
new file mode 100644
index 0000000..970504f
--- /dev/null
+++ b/jieba/analyse/analyzer.py
@@ -0,0 +1,33 @@
+#encoding=utf-8
+from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter
+from whoosh.analysis import Tokenizer,Token
+
+import jieba
+import re
+
+STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
+                        'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
+                        'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
+                        'to', 'us', 'we', 'when', 'will', 'with', 'yet',
+                        'you', 'your',u'的',u'了',u'和'))
+
+accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")
+
+class ChineseTokenizer(Tokenizer):
+    def __call__(self,text,**kargs):
+        words = jieba.tokenize(text,mode="search")
+        token = Token()
+        for (w,start_pos,stop_pos) in words:
+            if not accepted_chars.match(w):
+                if len(w)>1:
+                    pass
+                else:
+                    continue
+            token.text = w
+            token.pos = start_pos
+            token.startchar = start_pos
+            token.endchar = stop_pos
+            yield token
+
+def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1):
+    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)
\ No newline at end of file
diff --git a/test/test_whoosh.py b/test/test_whoosh.py
new file mode 100644
index 0000000..4069160
--- /dev/null
+++ b/test/test_whoosh.py
@@ -0,0 +1,59 @@
+# -*- coding: UTF-8 -*-
+import sys
+sys.path.append("../")
+from whoosh.index import create_in
+from whoosh.fields import *
+from whoosh.qparser import QueryParser
+
+from jieba.analyse import ChineseAnalyzer
+
+analyzer = ChineseAnalyzer()
+
+schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
+ix = create_in("tmp", schema)
+writer = ix.writer()
+
+writer.add_document(
+    title=u"document1",
+    path=u"/a",
+    content=u"This is the first document we’ve added!"
+)
+
+writer.add_document(
+    title=u"document2",
+    path=u"/b",
+    content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
+)
+
+writer.add_document(
+    title=u"document3",
+    path=u"/c",
+    content=u"买水果然后来世博园。"
+)
+
+writer.add_document(
+    title=u"document4",
+    path=u"/c",
+    content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
+)
+
+writer.add_document(
+    title=u"document4",
+    path=u"/c",
+    content=u"咱俩交换一下吧。"
+)
+
+writer.commit()
+searcher = ix.searcher()
+parser = QueryParser("content", schema=ix.schema)
+
+for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交换"):
+    print "result of ",keyword
+    q = parser.parse(keyword)
+    results = searcher.search(q)
+    for hit in results:
+        print hit.highlights("content")
+    print "="*10
+
+for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft"):
+    print t.text
diff --git a/test/test_whoosh_flie.py b/test/test_whoosh_flie.py
new file mode 100644
index 0000000..243e41b
--- /dev/null
+++ b/test/test_whoosh_flie.py
@@ -0,0 +1,38 @@
+# -*- coding: UTF-8 -*-
+import sys
+sys.path.append("../")
+from whoosh.index import create_in
+from whoosh.fields import *
+from whoosh.qparser import QueryParser
+
+from jieba.analyse import ChineseAnalyzer
+
+analyzer = ChineseAnalyzer()
+
+schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
+ix = create_in("tmp", schema)
+writer = ix.writer()
+
+file_name = sys.argv[1]
+
+with open(file_name,"rb") as inf:
+    i=0
+    for line in inf:
+        i+=1
+        writer.add_document(
+            title=u"line"+str(i),
+            path=u"/a",
+            content=line.decode('gbk','ignore')
+        )
+writer.commit()
+
+searcher = ix.searcher()
+parser = QueryParser("content", schema=ix.schema)
+
+for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"):
+    print "result of ",keyword
+    q = parser.parse(keyword)
+    results = searcher.search(q)
+    for hit in results:
+        print hit.highlights("content")
+    print "="*10
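
Usage note (not part of the patch above): ChineseTokenizer wraps jieba.tokenize(text, mode="search"), which yields (word, start, end) tuples, and copies those offsets onto each whoosh Token as startchar/endchar so that hit.highlights("content") can locate matches. Below is a minimal sketch of driving the analyzer directly, assuming the patch is applied and jieba/whoosh are importable; the sample sentence is illustrative only.

# -*- coding: UTF-8 -*-
# Sketch: run ChineseAnalyzer standalone and print each token's text
# together with the character offsets taken from jieba.tokenize.
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()
for t in analyzer(u"我爱北京天安门"):
    print t.text, t.startchar, t.endchar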