You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
jieba/test/test_whoosh.py

64 lines
1.6 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: UTF-8 -*-
import sys,os
sys.path.append("../")
from whoosh.index import create_in,open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer
analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("tmp"):
os.mkdir("tmp")
ix = create_in("tmp", schema) # for create new index
#ix = open_dir("tmp") # for read only
writer = ix.writer()
writer.add_document(
title=u"document1",
path=u"/a",
content=u"This is the first document weve added!"
)
writer.add_document(
title=u"document2",
path=u"/b",
content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
)
writer.add_document(
title=u"document3",
path=u"/c",
content=u"买水果然后来世博园。"
)
writer.add_document(
title=u"document4",
path=u"/c",
content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
)
writer.add_document(
title=u"document4",
path=u"/c",
content=u"咱俩交换一下吧。"
)
writer.commit()
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果世博园",u"",u"first",u"中文",u"交换机",u"交换"):
print "result of ",keyword
q = parser.parse(keyword)
results = searcher.search(q)
for hit in results:
print hit.highlights("content")
print "="*10
for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
print t.text