From 97c32464e122055b10d511bfd1eab0b38b08622a Mon Sep 17 00:00:00 2001 From: fxsjy Date: Fri, 3 Jan 2020 14:10:48 +0800 Subject: [PATCH] fix issue #798 --- jieba/__init__.py | 26 +++++++++++++++++--------- test/test_cutall.py | 2 ++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/jieba/__init__.py b/jieba/__init__.py index d4d87fd..9e4d64b 100755 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -46,8 +46,6 @@ re_eng = re.compile('[a-zA-Z0-9]', re.U) re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U) re_skip_default = re.compile("(\r\n|\s)", re.U) -re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U) -re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U) def setLogLevel(log_level): global logger @@ -200,15 +198,29 @@ class Tokenizer(object): def __cut_all(self, sentence): dag = self.get_DAG(sentence) old_j = -1 + eng_scan = 0 + eng_buf = u'' for k, L in iteritems(dag): + if eng_scan==1 and not re_eng.match(sentence[k]): + eng_scan = 0 + yield eng_buf if len(L) == 1 and k > old_j: - yield sentence[k:L[0] + 1] + if re_eng.match(sentence[k]): + if eng_scan == 0: + eng_scan = 1 + eng_buf = sentence[k] + else: + eng_buf += sentence[k] + if eng_scan == 0: + yield sentence[k:L[0] + 1] old_j = L[0] else: for j in L: if j > k: yield sentence[k:j + 1] old_j = j + if eng_scan==1: + yield eng_buf def __cut_DAG_NO_HMM(self, sentence): DAG = self.get_DAG(sentence) @@ -299,12 +311,8 @@ class Tokenizer(object): continue yield sent return - if cut_all: - re_han = re_han_cut_all - re_skip = re_skip_cut_all - else: - re_han = re_han_default - re_skip = re_skip_default + re_han = re_han_default + re_skip = re_skip_default if cut_all: cut_block = self.__cut_all elif HMM: diff --git a/test/test_cutall.py b/test/test_cutall.py index d42da32..bb9acf6 100644 --- a/test/test_cutall.py +++ b/test/test_cutall.py @@ -96,3 +96,5 @@ if __name__ == "__main__": cuttest('AT&T是一件不错的公司,给你发offer了吗?') cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') + jieba.add_word('超敏C反应蛋白') + cuttest('超敏C反应蛋白是什么, java好学吗?,小潘老板都学Python')