|
|
|
@ -3,6 +3,7 @@ import os
|
|
|
|
|
import viterbi
|
|
|
|
|
import jieba
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
default_encoding = sys.getfilesystemencoding()
|
|
|
|
|
|
|
|
|
|
def load_model(f_name):
|
|
|
|
@ -60,10 +61,31 @@ def __cut(sentence):
|
|
|
|
|
if next<len(sentence):
|
|
|
|
|
yield pair(sentence[next:], pos_list[next][1] )
|
|
|
|
|
|
|
|
|
|
def __cut_detail(sentence):
|
|
|
|
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\r\n]")
|
|
|
|
|
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
|
|
|
|
blocks = re_han.split(sentence)
|
|
|
|
|
for blk in blocks:
|
|
|
|
|
if re_han.match(blk):
|
|
|
|
|
for word in __cut(blk):
|
|
|
|
|
yield word
|
|
|
|
|
else:
|
|
|
|
|
tmp = re_skip.split(blk)
|
|
|
|
|
for x in tmp:
|
|
|
|
|
if x!="":
|
|
|
|
|
if re_num.match(x):
|
|
|
|
|
yield pair(x,'m')
|
|
|
|
|
elif re_eng.match(x):
|
|
|
|
|
yield pair(x,'eng')
|
|
|
|
|
else:
|
|
|
|
|
yield pair(x,'x')
|
|
|
|
|
|
|
|
|
|
def __cut_DAG(sentence):
|
|
|
|
|
DAG = jieba.get_DAG(sentence)
|
|
|
|
|
route ={}
|
|
|
|
|
|
|
|
|
|
jieba.calc(sentence,DAG,0,route=route)
|
|
|
|
|
|
|
|
|
|
x = 0
|
|
|
|
|
buf =u''
|
|
|
|
|
N = len(sentence)
|
|
|
|
@ -78,7 +100,7 @@ def __cut_DAG(sentence):
|
|
|
|
|
yield pair(buf,word_tag_tab.get(buf,'x'))
|
|
|
|
|
buf=u''
|
|
|
|
|
else:
|
|
|
|
|
regognized = __cut(buf)
|
|
|
|
|
regognized = __cut_detail(buf)
|
|
|
|
|
for t in regognized:
|
|
|
|
|
yield t
|
|
|
|
|
buf=u''
|
|
|
|
@ -89,7 +111,7 @@ def __cut_DAG(sentence):
|
|
|
|
|
if len(buf)==1:
|
|
|
|
|
yield pair(buf,word_tag_tab.get(buf,'x'))
|
|
|
|
|
else:
|
|
|
|
|
regognized = __cut(buf)
|
|
|
|
|
regognized = __cut_detail(buf)
|
|
|
|
|
for t in regognized:
|
|
|
|
|
yield t
|
|
|
|
|
|
|
|
|
@ -100,10 +122,9 @@ def cut(sentence):
|
|
|
|
|
sentence = sentence.decode('utf-8')
|
|
|
|
|
except:
|
|
|
|
|
sentence = sentence.decode('gbk','ignore')
|
|
|
|
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
|
|
|
|
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
|
|
|
|
|
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
|
|
|
|
blocks = re_han.split(sentence)
|
|
|
|
|
|
|
|
|
|
for blk in blocks:
|
|
|
|
|
if re_han.match(blk):
|
|
|
|
|
for word in __cut_DAG(blk):
|
|
|
|
|