pull/583/merge
fxsjy 5 years ago
parent 0489a6979e
commit 97c32464e1

@ -46,8 +46,6 @@ re_eng = re.compile('[a-zA-Z0-9]', re.U)
# Regexes used to split input text into blocks before segmentation.
#
# The regex-level escapes (\. \- \s) are written with doubled backslashes so
# that the string literal contains no invalid escape sequences (which warn on
# Python 3.6+ and are a SyntaxWarning on 3.12+).  The \uXXXX, \r and \n
# escapes are still resolved by the string literal itself, so the pattern
# text handed to re.compile() is exactly what it was before.

# Runs of CJK ideographs (U+4E00..U+9FD5), ASCII letters/digits and + # & . _ % -
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\\._%\\-]+)", re.U)
# A CRLF pair or any single whitespace character.
re_skip_default = re.compile("(\r\n|\\s)", re.U)
# Runs of CJK ideographs only.
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
# Any single character that is not an ASCII letter/digit, '+', '#' or newline.
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
def setLogLevel(log_level):
global logger
@ -200,15 +198,29 @@ class Tokenizer(object):
def __cut_all(self, sentence):
    """Yield every candidate word of *sentence* (the ``cut_all`` block cutter).

    ``self.get_DAG(sentence)`` is iterated as ``(k, L)`` pairs, where ``k`` is
    a start index into *sentence* and ``L`` is a list of inclusive end
    indices; each candidate word is ``sentence[k:j + 1]`` for ``j`` in ``L``.

    Consecutive characters matching ``re_eng`` that have only a single
    candidate are not emitted one by one: they are accumulated into a buffer
    and yielded as one token when the run ends (or at the end of the input).

    NOTE(review): this assumes the pairs come back in ascending ``k`` order;
    confirm against ``get_DAG`` / ``iteritems``.
    """
    dag = self.get_DAG(sentence)
    old_j = -1        # end index of the most recently emitted candidate
    in_eng = False    # True while an ASCII letter/digit run is being buffered
    eng_buf = u''
    for k, L in iteritems(dag):
        is_eng = re_eng.match(sentence[k])
        # The buffered run ends at the first character re_eng rejects.
        if in_eng and not is_eng:
            in_eng = False
            yield eng_buf
        if len(L) == 1 and k > old_j:
            if is_eng:
                if in_eng:
                    eng_buf += sentence[k]
                else:
                    # Start a new run; eng_buf still holds the previous one.
                    in_eng = True
                    eng_buf = sentence[k]
            # Emit the single candidate only when it is not being buffered;
            # an unconditional yield here would duplicate every token.
            if not in_eng:
                yield sentence[k:L[0] + 1]
            old_j = L[0]
        else:
            for j in L:
                # j == k is the bare single character; skip it in this branch.
                if j > k:
                    yield sentence[k:j + 1]
                    old_j = j
    # Flush a run that extends to the end of the sentence.
    if in_eng:
        yield eng_buf
def __cut_DAG_NO_HMM(self, sentence):
DAG = self.get_DAG(sentence)
@ -299,12 +311,8 @@ class Tokenizer(object):
continue
yield sent
return
if cut_all:
re_han = re_han_cut_all
re_skip = re_skip_cut_all
else:
re_han = re_han_default
re_skip = re_skip_default
re_han = re_han_default
re_skip = re_skip_default
if cut_all:
cut_block = self.__cut_all
elif HMM:

@ -96,3 +96,5 @@ if __name__ == "__main__":
cuttest('AT&T是一件不错的公司给你发offer了吗')
cuttest('C++和c#是什么关系11+122=133是吗PI=3.14159')
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
jieba.add_word('超敏C反应蛋白')
cuttest('超敏C反应蛋白是什么, java好学吗?,小潘老板都学Python')

Loading…
Cancel
Save