pull/814/head
fxsjy
parent d703bce302
commit 2eb11c8028

@ -205,14 +205,15 @@ class Tokenizer(object):
eng_scan = 0
yield eng_buf
if len(L) == 1 and k > old_j:
if re_eng.match(sentence[k]):
word = sentence[k:L[0] + 1]
if re_eng.match(word):
if eng_scan == 0:
eng_scan = 1
eng_buf = sentence[k]
eng_buf = word
else:
eng_buf += sentence[k]
eng_buf += word
if eng_scan == 0:
yield sentence[k:L[0] + 1]
yield word
old_j = L[0]
else:
for j in L:

@ -98,3 +98,4 @@ if __name__ == "__main__":
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
jieba.add_word('超敏C反应蛋白')
cuttest('超敏C反应蛋白是什么, java好学吗?,小潘老板都学Python')
cuttest('steel健身爆发力运动兴奋补充剂')

Loading…
Cancel
Save