support cx_Freeze and py2exe; keep whitespace

pull/66/merge
fxsjy 12 years ago
parent 7343679ba8
commit c015f4e297

@ -221,15 +221,11 @@ def cut(sentence,cut_all=False):
else:
tmp = re_skip.split(blk)
for x in tmp:
if re_skip.match(x):
if x.strip(' ')!='':
yield x
if not cut_all:
for xx in x:
yield xx
else:
if not cut_all:
for xx in x:
yield xx
else:
yield x
yield x
def cut_for_search(sentence):
words = cut(sentence)

@ -1,20 +1,12 @@
import re
import os
from math import log
import prob_start
import prob_trans
import prob_emit
MIN_FLOAT=-3.14e100
def load_model(f_name):
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
prob_p_path = os.path.join(_curpath,f_name)
tab = eval(open(prob_p_path,"rb").read())
return tab
prob_start = load_model("prob_start.py")
prob_trans = load_model("prob_trans.py")
prob_emit = load_model("prob_emit.py")
PrevStatus = {
'B':('E','S'),
'M':('M','B'),
@ -44,7 +36,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
def __cut(sentence):
prob, pos_list = viterbi(sentence,('B','M','E','S'), prob_start, prob_trans, prob_emit)
prob, pos_list = viterbi(sentence,('B','M','E','S'), prob_start.P, prob_trans.P, prob_emit.P)
begin, next = 0,0
#print pos_list, sentence
for i,char in enumerate(sentence):

@ -1,4 +1,4 @@
{'B': {u'\u4e00': -3.6544978750449433,
P={'B': {u'\u4e00': -3.6544978750449433,
u'\u4e01': -8.125041941842026,
u'\u4e03': -7.817392401429855,
u'\u4e07': -6.3096425804013165,

@ -1,4 +1,4 @@
{'B': -0.26268660809250016,
P={'B': -0.26268660809250016,
'E': -3.14e+100,
'M': -3.14e+100,
'S': -1.4652633398537678}

@ -1,4 +1,4 @@
{'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
P={'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}

@ -3,6 +3,10 @@ import os
import viterbi
import jieba
import sys
import prob_start
import prob_trans
import prob_emit
import char_state_tab
default_encoding = sys.getfilesystemencoding()
@ -20,11 +24,6 @@ def load_model(f_name):
result[word.decode('utf-8')]=tag
return result
prob_start = load_model("prob_start.py")
prob_trans = load_model("prob_trans.py")
prob_emit = load_model("prob_emit.py")
char_state_tab = load_model("char_state_tab.py")
word_tag_tab = load_model("../dict.txt")
if jieba.user_word_tag_tab:
@ -48,7 +47,7 @@ class pair(object):
return self.__unicode__().encode(arg)
def __cut(sentence):
prob, pos_list = viterbi.viterbi(sentence,char_state_tab, prob_start, prob_trans, prob_emit)
prob, pos_list = viterbi.viterbi(sentence,char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P)
begin, next = 0,0
for i,char in enumerate(sentence):
@ -142,17 +141,13 @@ def __cut_internal(sentence):
else:
tmp = re_skip.split(blk)
for x in tmp:
if re_skip.match(x):
if x.strip(' ')!='':
yield pair(x,'')
else:
for xx in x:
if re_num.match(xx):
yield pair(xx,'m')
elif re_eng.match(x):
yield pair(xx,'eng')
else:
yield pair(xx,'x')
for xx in x:
if re_num.match(xx):
yield pair(xx,'m')
elif re_eng.match(x):
yield pair(xx,'eng')
else:
yield pair(xx,'x')
def __lcut_internal(sentence):
return list(__cut_internal(sentence))

@ -1,4 +1,4 @@
{u'\u4e00': (('B', 'm'),
P={u'\u4e00': (('B', 'm'),
('S', 'm'),
('B', 'd'),
('B', 'a'),

@ -1,4 +1,4 @@
{('B', 'a'): {u'\u4e00': -3.618715666782108,
P={('B', 'a'): {u'\u4e00': -3.618715666782108,
u'\u4e07': -10.500566885381515,
u'\u4e0a': -8.541143017159477,
u'\u4e0b': -8.445222895280738,

@ -1,4 +1,4 @@
{('B', 'a'): -4.762305214596967,
P={('B', 'a'): -4.762305214596967,
('B', 'ad'): -6.680066036784177,
('B', 'ag'): -3.14e+100,
('B', 'an'): -8.697083223018778,

@ -1,4 +1,4 @@
{('B', 'a'): {('E', 'a'): -0.0050648453069648755,
P={('B', 'a'): {('E', 'a'): -0.0050648453069648755,
('M', 'a'): -5.287963037107507},
('B', 'ad'): {('E', 'ad'): -0.0007479013978476627,
('M', 'ad'): -7.198613337130562},

Loading…
Cancel
Save