Support mixed English-Chinese words, such as B超

pull/21/head
Sun Junyi 12 years ago
parent e0bd9a6a50
commit 379cd4933a

@ -9,6 +9,7 @@ import marshal
FREQ = {}
total =0.0
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
def gen_trie(f_name):
lfreq = {}
@ -119,7 +120,7 @@ def __cut_DAG(sentence):
yield buf
buf=u''
else:
regognized = finalseg.__cut(buf)
regognized = finalseg.cut(buf)
for t in regognized:
yield t
buf=u''
@ -130,7 +131,7 @@ def __cut_DAG(sentence):
if len(buf)==1:
yield buf
else:
regognized = finalseg.__cut(buf)
regognized = finalseg.cut(buf)
for t in regognized:
yield t
@ -141,7 +142,7 @@ def cut(sentence,cut_all=False):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
blocks = re_han.split(sentence)
cut_block = __cut_DAG
if cut_all:

@ -283173,7 +283173,7 @@
自学辅导 3 n
自守 3 v
自定 3 d
自定义 3 l
自定义 13 l
自定义词 3 n
自审 6 v
自宫 3 n
@ -367419,4 +367419,6 @@
龟龙片甲 3 nz
龟龙麟凤 3 ns
龠 5 g
龢 732 zg
龢 732 zg
B超 3 n
T恤 4 n

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save