add 'search mode' for jieba.tokenize

pull/71/head
Sun Junyi 12 years ago
parent 237dc6625e
commit f08690a2df

@@ -335,11 +335,29 @@ def get_abs_path_dict():
abs_path = os.path.join(_curpath,DICTIONARY)
return abs_path
def tokenize(unicode_sentence, mode="default"):
    """Tokenize a sentence, yielding (word, start, end) triples.

    Args:
        unicode_sentence: the text to tokenize; must be a ``unicode``
            object (this is Python 2 code), otherwise an Exception is
            raised.
        mode: ``"default"`` yields one triple per word produced by
            ``cut()``. ``"search"`` additionally yields every 2-gram and
            3-gram inside each word that is itself a known dictionary
            entry (``in FREQ``), before the full word — useful for
            building search-engine indexes.

    Yields:
        ``(token, start_offset, end_offset)`` tuples; offsets are
        character positions into ``unicode_sentence``.

    Raises:
        Exception: if the input is not unicode. (Kept as the generic
        ``Exception`` type so existing callers' handlers still match.)
    """
    if not isinstance(unicode_sentence, unicode):
        # Fixed grammar of the original message ("should unicode").
        raise Exception("jieba: the input parameter should be unicode.")
    start = 0
    if mode == 'default':
        for w in cut(unicode_sentence):
            width = len(w)
            yield (w, start, start + width)
            start += width
    else:
        for w in cut(unicode_sentence):
            width = len(w)
            # Reuse `width` instead of recomputing len(w) three times.
            if width > 2:
                # Emit every in-dictionary 2-gram inside the word.
                for i in xrange(width - 1):
                    gram2 = w[i:i + 2]
                    if gram2 in FREQ:
                        yield (gram2, start + i, start + i + 2)
            if width > 3:
                # Emit every in-dictionary 3-gram inside the word.
                for i in xrange(width - 2):
                    gram3 = w[i:i + 3]
                    if gram3 in FREQ:
                        yield (gram3, start + i, start + i + 3)
            # The whole word is always yielded last, after its sub-grams.
            yield (w, start, start + width)
            start += width

@@ -3,15 +3,19 @@ import sys
sys.path.append("../")
import jieba
g_mode="default"
def cuttest(test_sent):
global g_mode
test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent)
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
if __name__ == "__main__":
for m in ("default","search"):
g_mode = m
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。")
cuttest("我不喜欢日本和服。")
cuttest("雷猴回归人间。")

Loading…
Cancel
Save