diff --git a/jieba/__init__.py b/jieba/__init__.py
index 04f53b1..e66aa49 100755
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -205,14 +205,15 @@ class Tokenizer(object):
                 eng_scan = 0
                 yield eng_buf
             if len(L) == 1 and k > old_j:
-                if re_eng.match(sentence[k]):
+                word = sentence[k:L[0] + 1]
+                if re_eng.match(word):
                     if eng_scan == 0:
                         eng_scan = 1
-                        eng_buf = sentence[k]
+                        eng_buf = word
                     else:
-                        eng_buf += sentence[k]
+                        eng_buf += word
                 if eng_scan == 0:
-                    yield sentence[k:L[0] + 1]
+                    yield word
                 old_j = L[0]
             else:
                 for j in L:
diff --git a/test/test_cutall.py b/test/test_cutall.py
index bb9acf6..28499d8 100644
--- a/test/test_cutall.py
+++ b/test/test_cutall.py
@@ -98,3 +98,4 @@ if __name__ == "__main__":
     cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
     jieba.add_word('超敏C反应蛋白')
     cuttest('超敏C反应蛋白是什么, java好学吗?,小潘老板都学Python')
+    cuttest('steel健身爆发力运动兴奋补充剂')