|
|
@ -46,8 +46,6 @@ re_eng = re.compile('[a-zA-Z0-9]', re.U)
|
|
|
|
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
|
|
|
|
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
|
|
|
|
|
|
|
|
|
|
|
|
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
|
|
|
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
|
|
|
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
|
|
|
|
|
|
|
|
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def setLogLevel(log_level):
|
|
|
|
def setLogLevel(log_level):
|
|
|
|
global logger
|
|
|
|
global logger
|
|
|
@ -200,8 +198,20 @@ class Tokenizer(object):
|
|
|
|
def __cut_all(self, sentence):
|
|
|
|
def __cut_all(self, sentence):
|
|
|
|
dag = self.get_DAG(sentence)
|
|
|
|
dag = self.get_DAG(sentence)
|
|
|
|
old_j = -1
|
|
|
|
old_j = -1
|
|
|
|
|
|
|
|
eng_scan = 0
|
|
|
|
|
|
|
|
eng_buf = u''
|
|
|
|
for k, L in iteritems(dag):
|
|
|
|
for k, L in iteritems(dag):
|
|
|
|
|
|
|
|
if eng_scan==1 and not re_eng.match(sentence[k]):
|
|
|
|
|
|
|
|
eng_scan = 0
|
|
|
|
|
|
|
|
yield eng_buf
|
|
|
|
if len(L) == 1 and k > old_j:
|
|
|
|
if len(L) == 1 and k > old_j:
|
|
|
|
|
|
|
|
if re_eng.match(sentence[k]):
|
|
|
|
|
|
|
|
if eng_scan == 0:
|
|
|
|
|
|
|
|
eng_scan = 1
|
|
|
|
|
|
|
|
eng_buf = sentence[k]
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
eng_buf += sentence[k]
|
|
|
|
|
|
|
|
if eng_scan == 0:
|
|
|
|
yield sentence[k:L[0] + 1]
|
|
|
|
yield sentence[k:L[0] + 1]
|
|
|
|
old_j = L[0]
|
|
|
|
old_j = L[0]
|
|
|
|
else:
|
|
|
|
else:
|
|
|
@ -209,6 +219,8 @@ class Tokenizer(object):
|
|
|
|
if j > k:
|
|
|
|
if j > k:
|
|
|
|
yield sentence[k:j + 1]
|
|
|
|
yield sentence[k:j + 1]
|
|
|
|
old_j = j
|
|
|
|
old_j = j
|
|
|
|
|
|
|
|
if eng_scan==1:
|
|
|
|
|
|
|
|
yield eng_buf
|
|
|
|
|
|
|
|
|
|
|
|
def __cut_DAG_NO_HMM(self, sentence):
|
|
|
|
def __cut_DAG_NO_HMM(self, sentence):
|
|
|
|
DAG = self.get_DAG(sentence)
|
|
|
|
DAG = self.get_DAG(sentence)
|
|
|
@ -299,10 +311,6 @@ class Tokenizer(object):
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
yield sent
|
|
|
|
yield sent
|
|
|
|
return
|
|
|
|
return
|
|
|
|
if cut_all:
|
|
|
|
|
|
|
|
re_han = re_han_cut_all
|
|
|
|
|
|
|
|
re_skip = re_skip_cut_all
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
re_han = re_han_default
|
|
|
|
re_han = re_han_default
|
|
|
|
re_skip = re_skip_default
|
|
|
|
re_skip = re_skip_default
|
|
|
|
if cut_all:
|
|
|
|
if cut_all:
|
|
|
|