From efc784312c5fc6f7b605e2a8166679442d7ae45a Mon Sep 17 00:00:00 2001
From: Sun Junyi
Date: Mon, 1 Jul 2013 10:53:39 +0800
Subject: [PATCH] add ChineseAnalyzer for whoosh search engine

---
 jieba/analyse/__init__.py |  1 +
 jieba/analyse/analyzer.py | 33 ++++++++++++++++++++++
 test/test_whoosh.py       | 59 +++++++++++++++++++++++++++++++++++++++
 test/test_whoosh_flie.py  | 38 +++++++++++++++++++++++++
 4 files changed, 131 insertions(+)
 create mode 100644 jieba/analyse/analyzer.py
 create mode 100644 test/test_whoosh.py
 create mode 100644 test/test_whoosh_flie.py

diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py
index 667251c..c002e33 100644
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -1,5 +1,6 @@
 import jieba
 import os
+from analyzer import ChineseAnalyzer
 _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
 f_name = os.path.join(_curpath,"idf.txt")
diff --git a/jieba/analyse/analyzer.py b/jieba/analyse/analyzer.py
new file mode 100644
index 0000000..970504f
--- /dev/null
+++ b/jieba/analyse/analyzer.py
@@ -0,0 +1,33 @@
+#encoding=utf-8
+from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter
+from whoosh.analysis import Tokenizer,Token
+
+import jieba
+import re
+
+STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
+                        'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
+                        'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
+                        'to', 'us', 'we', 'when', 'will', 'with', 'yet',
+                        'you', 'your',u'的',u'了',u'和'))
+
+accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")
+
+class ChineseTokenizer(Tokenizer):
+    def __call__(self,text,**kargs):
+        words = jieba.tokenize(text,mode="search")
+        token = Token()
+        for (w,start_pos,stop_pos) in words:
+            if not accepted_chars.match(w):
+                if len(w)>1:
+                    pass
+                else:
+                    continue
+            token.text = w
+            token.pos = start_pos
+            token.startchar = start_pos
+            token.endchar = stop_pos
+            yield token
+
+def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1):
+    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)
\ No newline at end of file
diff --git a/test/test_whoosh.py b/test/test_whoosh.py
new file mode 100644
index 0000000..4069160
--- /dev/null
+++ b/test/test_whoosh.py
@@ -0,0 +1,59 @@
+# -*- coding: UTF-8 -*-
+import sys
+sys.path.append("../")
+from whoosh.index import create_in
+from whoosh.fields import *
+from whoosh.qparser import QueryParser
+
+from jieba.analyse import ChineseAnalyzer
+
+analyzer = ChineseAnalyzer()
+
+schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
+ix = create_in("tmp", schema)
+writer = ix.writer()
+
+writer.add_document(
+    title=u"document1",
+    path=u"/a",
+    content=u"This is the first document we’ve added!"
+)
+
+writer.add_document(
+    title=u"document2",
+    path=u"/b",
+    content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
+)
+
+writer.add_document(
+    title=u"document3",
+    path=u"/c",
+    content=u"买水果然后来世博园。"
+)
+
+writer.add_document(
+    title=u"document4",
+    path=u"/c",
+    content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
+)
+
+writer.add_document(
+    title=u"document4",
+    path=u"/c",
+    content=u"咱俩交换一下吧。"
+)
+
+writer.commit()
+searcher = ix.searcher()
+parser = QueryParser("content", schema=ix.schema)
+
+for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交换"):
+    print "result of ",keyword
+    q = parser.parse(keyword)
+    results = searcher.search(q)
+    for hit in results:
+        print hit.highlights("content")
+    print "="*10
+
+for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft"):
+    print t.text
diff --git a/test/test_whoosh_flie.py b/test/test_whoosh_flie.py
new file mode 100644
index 0000000..243e41b
--- /dev/null
+++ b/test/test_whoosh_flie.py
@@ -0,0 +1,38 @@
+# -*- coding: UTF-8 -*-
+import sys
+sys.path.append("../")
+from whoosh.index import create_in
+from whoosh.fields import *
+from whoosh.qparser import QueryParser
+
+from jieba.analyse import ChineseAnalyzer
+
+analyzer = ChineseAnalyzer()
+
+schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
+ix = create_in("tmp", schema)
+writer = ix.writer()
+
+file_name = sys.argv[1]
+
+with open(file_name,"rb") as inf:
+    i=0
+    for line in inf:
+        i+=1
+        writer.add_document(
+            title=u"line"+str(i),
+            path=u"/a",
+            content=line.decode('gbk','ignore')
+        )
+writer.commit()
+
+searcher = ix.searcher()
+parser = QueryParser("content", schema=ix.schema)
+
+for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"):
+    print "result of ",keyword
+    q = parser.parse(keyword)
+    results = searcher.search(q)
+    for hit in results:
+        print hit.highlights("content")
+    print "="*10
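
Usage note (not part of the patch above): ChineseTokenizer wraps jieba.tokenize(text, mode="search"), which yields (word, start, end) tuples, and copies those offsets onto each whoosh Token as startchar/endchar so that hit.highlights("content") can locate matches. Below is a minimal sketch of driving the analyzer directly, assuming the patch is applied and jieba/whoosh are importable; the sample sentence is illustrative only.

# -*- coding: UTF-8 -*-
# Sketch: run ChineseAnalyzer standalone and print each token's text
# together with the character offsets taken from jieba.tokenize.
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()
for t in analyzer(u"我爱北京天安门"):
    print t.text, t.startchar, t.endchar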