From 2eb11c802804a533c65252ac9d4b0394613fcfd6 Mon Sep 17 00:00:00 2001
From: fxsjy
Date: Mon, 13 Jan 2020 20:53:43 +0800
Subject: [PATCH] fix issue #810

---
 jieba/__init__.py   | 9 +++++----
 test/test_cutall.py | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/jieba/__init__.py b/jieba/__init__.py
index 04f53b1..e66aa49 100755
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -205,14 +205,15 @@ class Tokenizer(object):
                 eng_scan = 0
                 yield eng_buf
             if len(L) == 1 and k > old_j:
-                if re_eng.match(sentence[k]):
+                word = sentence[k:L[0] + 1]
+                if re_eng.match(word):
                     if eng_scan == 0:
                         eng_scan = 1
-                        eng_buf = sentence[k]
+                        eng_buf = word
                     else:
-                        eng_buf += sentence[k]
+                        eng_buf += word
                 if eng_scan == 0:
-                    yield sentence[k:L[0] + 1]
+                    yield word
                 old_j = L[0]
             else:
                 for j in L:
diff --git a/test/test_cutall.py b/test/test_cutall.py
index bb9acf6..28499d8 100644
--- a/test/test_cutall.py
+++ b/test/test_cutall.py
@@ -98,3 +98,4 @@ if __name__ == "__main__":
     cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
     jieba.add_word('超敏C反应蛋白')
     cuttest('超敏C反应蛋白是什么, java好学吗?,小潘老板都学Python')
+    cuttest('steel健身爆发力运动兴奋补充剂')