@@ -40,7 +40,7 @@ re_eng = re.compile('[a-zA-Z0-9]', re.U)
 
 # \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
 # \r\n|\s : whitespace characters. Will not be handled.
-re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U)
+re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
 re_skip_default = re.compile("(\r\n|\s)", re.U)
 re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
 re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
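
To make the effect of the added '%' concrete, here is a minimal sketch (not part of the patch): it runs the pattern string from before and after this hunk over an invented sentence containing a percentage. Only the two pattern strings are taken from the diff; the sample text and variable names are illustrative.

import re

# Pattern before and after this change (copied verbatim from the hunk above).
re_han_old = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U)
re_han_new = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)

text = "涨幅为3.5%左右"              # invented sample sentence
print(re_han_old.findall(text))     # ['涨幅为3.5', '左右'] -- '%' breaks the run
print(re_han_new.findall(text))     # ['涨幅为3.5%左右']    -- '%' stays inside the run

So with the old pattern a token like "3.5%" is cut at the percent sign and the '%' falls through to the skip handling; with the new pattern it is kept inside one re_han_default block.
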
@@ -409,6 +409,8 @@ class Tokenizer(object):
             wfrag = word[:ch + 1]
             if wfrag not in self.FREQ:
                 self.FREQ[wfrag] = 0
+        if freq == 0:
+            finalseg.add_force_split(word)
 
     def del_word(self, word):
         """
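
For the second hunk, a hedged usage sketch follows. It assumes the module-level jieba.add_word and jieba.lcut wrappers around the default Tokenizer; the sample word and sentence are invented, and the comments only restate what the new freq == 0 branch does.

import jieba

# Registering a word with zero frequency now takes the new branch above:
# besides storing FREQ['台中'] = 0, add_word() calls finalseg.add_force_split(),
# so the HMM stage is told not to re-join the word into a single token.
jieba.add_word('台中', freq=0)

# With HMM enabled, unrecognized runs still go through finalseg, which now
# knows '台中' is force-split.
print(jieba.lcut('我们在台中转机', HMM=True))

Previously a zero-frequency entry only lowered the word's weight in the dictionary path, while the HMM-based new-word discovery could still emit it as one token; the added call closes that gap.
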