Use addition of log-probabilities instead of multiplication of raw probabilities, to avoid the bad case in issue 19

pull/21/head v0.24
Sun Junyi 12 years ago
parent 06ebc6f71c
commit fd20cbbd4b

@ -6,10 +6,11 @@ import finalseg
import time
import tempfile
import marshal
from math import log
FREQ = {}
total =0.0
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
def gen_trie(f_name):
lfreq = {}
@ -46,7 +47,7 @@ if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(
if load_from_cache_fail:
trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
FREQ = dict([(k,float(v)/total) for k,v in FREQ.iteritems()]) #normalize
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
min_freq = min(FREQ.itervalues())
print >> sys.stderr, "dumping model to file cache"
marshal.dump((trie,FREQ,total,min_freq),open(cache_file,'wb'))
@ -72,7 +73,7 @@ def calc(sentence,DAG,idx,route):
N = len(sentence)
route[N] = (1.0,'')
for idx in xrange(N-1,-1,-1):
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
route[idx] = max(candidates)
def get_DAG(sentence):
@ -142,7 +143,9 @@ def cut(sentence,cut_all=False):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
blocks = re_han.split(sentence)
cut_block = __cut_DAG
if cut_all:
@ -182,7 +185,7 @@ def load_userdict(f):
if line.rstrip()=='': continue
word,freq = line.split(" ")
freq = float(freq)
FREQ[word] = freq / total
FREQ[word] = log(freq / total)
p = trie
for c in word:
if not c in p:

@ -1,10 +1,14 @@
import re
import os
from math import log
MIN_FLOAT=-3.14e100
def load_model(f_name):
    """Load a model table stored in a file as a Python expression.

    The file is looked up in the directory obtained by joining
    ``os.getcwd()`` with ``os.path.dirname(__file__)``, so ``f_name`` is
    normally a bare file name such as ``"prob_start.py"``.

    Args:
        f_name: Name of the model file, relative to this module's directory.

    Returns:
        The object produced by evaluating the file's contents.
    """
    _curpath = os.path.normpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__))
    )
    prob_p_path = os.path.join(_curpath, f_name)
    # Read through a context manager so the file handle is closed right away
    # instead of being left for the garbage collector.
    with open(prob_p_path, "rb") as model_file:
        source = model_file.read()
    # NOTE(review): eval() executes whatever the file contains. This is only
    # acceptable for model files shipped with the package; never point it at
    # untrusted input. Consider ast.literal_eval if every model file is a
    # plain literal -- TODO confirm against the full set of model files.
    tab = eval(source)
    return tab
prob_start = load_model("prob_start.py")
prob_trans = load_model("prob_trans.py")
@ -16,13 +20,13 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
path = {}
for y in states: #init
V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
path[y] = [y]
for t in range(1,len(obs)):
V.append({})
newpath = {}
for y in states:
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in states ])
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in states ])
V[t][y] =prob
newpath[y] = path[state] + [y]
path = newpath

File diff suppressed because it is too large Load Diff

@ -1 +1,4 @@
{'B': 0.7689828525554734, 'E': 0.0, 'M': 0.0, 'S': 0.23101714744452656}
{'B': -0.26268660809250016,
'E': -3.14e+100,
'M': -3.14e+100,
'S': -1.4652633398537678}

@ -1,4 +1,4 @@
{'B': {'E': 0.8518218565181658, 'M': 0.14817814348183422},
'E': {'B': 0.5544853051164425, 'S': 0.44551469488355755},
'M': {'E': 0.7164487459986911, 'M': 0.2835512540013088},
'S': {'B': 0.48617017333894563, 'S': 0.5138298266610544}}
{'B': {'E': -0.16037786260859094, 'M': -1.9093400568760384},
'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}

File diff suppressed because it is too large Load Diff

@ -1,256 +1,256 @@
{('B', 'a'): 0.008545886571090637,
('B', 'ad'): 0.0012556950477614949,
('B', 'ag'): 0.0,
('B', 'an'): 0.0001670724139577068,
('B', 'b'): 0.006615272009801582,
('B', 'bg'): 0.0,
('B', 'c'): 0.03258575057944956,
('B', 'd'): 0.018778408940230508,
('B', 'df'): 0.00013790104009207547,
('B', 'dg'): 0.0,
('B', 'e'): 0.00019093990166595064,
('B', 'en'): 0.0,
('B', 'f'): 0.004121119544290101,
('B', 'g'): 0.0,
('B', 'h'): 1.3259715393468796e-06,
('B', 'i'): 0.0022077426130125543,
('B', 'in'): 0.0,
('B', 'j'): 0.006360685474246981,
('B', 'jn'): 0.0,
('B', 'k'): 0.0,
('B', 'l'): 0.007402899104173628,
('B', 'ln'): 0.0,
('B', 'm'): 0.02592804748038888,
('B', 'mg'): 0.0,
('B', 'mq'): 0.0011284017799841944,
('B', 'n'): 0.18330097962777328,
('B', 'ng'): 0.0,
('B', 'nr'): 0.10741562843095136,
('B', 'nrfg'): 0.0028123856349547313,
('B', 'nrt'): 0.006835383285333164,
('B', 'ns'): 0.05943667425122387,
('B', 'nt'): 0.007859033313708954,
('B', 'nz'): 0.0193127754705873,
('B', 'o'): 0.00021745933245288822,
('B', 'p'): 0.014980826451541043,
('B', 'q'): 0.00091359439061,
('B', 'qe'): 0.0,
('B', 'qg'): 0.0,
('B', 'r'): 0.033047188675142274,
('B', 'rg'): 0.0,
('B', 'rr'): 3.977914618040638e-06,
('B', 'rz'): 0.0003540344010056168,
('B', 's'): 0.0039951522480521475,
('B', 't'): 0.03457072997385184,
('B', 'tg'): 0.0,
('B', 'u'): 0.00010475175160840347,
('B', 'ud'): 0.0,
('B', 'ug'): 0.0,
('B', 'uj'): 0.0,
('B', 'ul'): 0.0,
('B', 'uv'): 0.0,
('B', 'uz'): 0.0,
('B', 'v'): 0.06897173559066729,
('B', 'vd'): 0.00011801146700187228,
('B', 'vg'): 0.0,
('B', 'vi'): 3.977914618040638e-06,
('B', 'vn'): 0.01314700781262431,
('B', 'vq'): 5.303886157387518e-06,
('B', 'w'): 0.0,
('B', 'x'): 0.0,
('B', 'y'): 5.303886157387518e-05,
('B', 'yg'): 0.0,
('B', 'z'): 0.0008711633013508998,
('B', 'zg'): 0.0,
('E', 'a'): 0.0,
('E', 'ad'): 0.0,
('E', 'ag'): 0.0,
('E', 'an'): 0.0,
('E', 'b'): 0.0,
('E', 'bg'): 0.0,
('E', 'c'): 0.0,
('E', 'd'): 0.0,
('E', 'df'): 0.0,
('E', 'dg'): 0.0,
('E', 'e'): 0.0,
('E', 'en'): 0.0,
('E', 'f'): 0.0,
('E', 'g'): 0.0,
('E', 'h'): 0.0,
('E', 'i'): 0.0,
('E', 'in'): 0.0,
('E', 'j'): 0.0,
('E', 'jn'): 0.0,
('E', 'k'): 0.0,
('E', 'l'): 0.0,
('E', 'ln'): 0.0,
('E', 'm'): 0.0,
('E', 'mg'): 0.0,
('E', 'mq'): 0.0,
('E', 'n'): 0.0,
('E', 'ng'): 0.0,
('E', 'nr'): 0.0,
('E', 'nrfg'): 0.0,
('E', 'nrt'): 0.0,
('E', 'ns'): 0.0,
('E', 'nt'): 0.0,
('E', 'nz'): 0.0,
('E', 'o'): 0.0,
('E', 'p'): 0.0,
('E', 'q'): 0.0,
('E', 'qe'): 0.0,
('E', 'qg'): 0.0,
('E', 'r'): 0.0,
('E', 'rg'): 0.0,
('E', 'rr'): 0.0,
('E', 'rz'): 0.0,
('E', 's'): 0.0,
('E', 't'): 0.0,
('E', 'tg'): 0.0,
('E', 'u'): 0.0,
('E', 'ud'): 0.0,
('E', 'ug'): 0.0,
('E', 'uj'): 0.0,
('E', 'ul'): 0.0,
('E', 'uv'): 0.0,
('E', 'uz'): 0.0,
('E', 'v'): 0.0,
('E', 'vd'): 0.0,
('E', 'vg'): 0.0,
('E', 'vi'): 0.0,
('E', 'vn'): 0.0,
('E', 'vq'): 0.0,
('E', 'w'): 0.0,
('E', 'x'): 0.0,
('E', 'y'): 0.0,
('E', 'yg'): 0.0,
('E', 'z'): 0.0,
('E', 'zg'): 0.0,
('M', 'a'): 0.0,
('M', 'ad'): 0.0,
('M', 'ag'): 0.0,
('M', 'an'): 0.0,
('M', 'b'): 0.0,
('M', 'bg'): 0.0,
('M', 'c'): 0.0,
('M', 'd'): 0.0,
('M', 'df'): 0.0,
('M', 'dg'): 0.0,
('M', 'e'): 0.0,
('M', 'en'): 0.0,
('M', 'f'): 0.0,
('M', 'g'): 0.0,
('M', 'h'): 0.0,
('M', 'i'): 0.0,
('M', 'in'): 0.0,
('M', 'j'): 0.0,
('M', 'jn'): 0.0,
('M', 'k'): 0.0,
('M', 'l'): 0.0,
('M', 'ln'): 0.0,
('M', 'm'): 0.0,
('M', 'mg'): 0.0,
('M', 'mq'): 0.0,
('M', 'n'): 0.0,
('M', 'ng'): 0.0,
('M', 'nr'): 0.0,
('M', 'nrfg'): 0.0,
('M', 'nrt'): 0.0,
('M', 'ns'): 0.0,
('M', 'nt'): 0.0,
('M', 'nz'): 0.0,
('M', 'o'): 0.0,
('M', 'p'): 0.0,
('M', 'q'): 0.0,
('M', 'qe'): 0.0,
('M', 'qg'): 0.0,
('M', 'r'): 0.0,
('M', 'rg'): 0.0,
('M', 'rr'): 0.0,
('M', 'rz'): 0.0,
('M', 's'): 0.0,
('M', 't'): 0.0,
('M', 'tg'): 0.0,
('M', 'u'): 0.0,
('M', 'ud'): 0.0,
('M', 'ug'): 0.0,
('M', 'uj'): 0.0,
('M', 'ul'): 0.0,
('M', 'uv'): 0.0,
('M', 'uz'): 0.0,
('M', 'v'): 0.0,
('M', 'vd'): 0.0,
('M', 'vg'): 0.0,
('M', 'vi'): 0.0,
('M', 'vn'): 0.0,
('M', 'vq'): 0.0,
('M', 'w'): 0.0,
('M', 'x'): 0.0,
('M', 'y'): 0.0,
('M', 'yg'): 0.0,
('M', 'z'): 0.0,
('M', 'zg'): 0.0,
('S', 'a'): 0.020190568629634933,
('S', 'ad'): 1.5911658472162552e-05,
('S', 'ag'): 0.0009546995083297532,
('S', 'an'): 2.651943078693759e-06,
('S', 'b'): 0.0015447568433391145,
('S', 'bg'): 0.0,
('S', 'c'): 0.008337709039413178,
('S', 'd'): 0.020162723227308648,
('S', 'df'): 0.0,
('S', 'dg'): 0.0001299452108559942,
('S', 'e'): 0.0026254236479068215,
('S', 'en'): 0.0,
('S', 'f'): 0.0055452129775486496,
('S', 'g'): 0.0014917179817652395,
('S', 'h'): 0.00017502824319378808,
('S', 'i'): 0.0,
('S', 'in'): 0.0,
('S', 'j'): 0.007357816071835834,
('S', 'jn'): 0.0,
('S', 'k'): 0.000967959223723222,
('S', 'l'): 0.0,
('S', 'ln'): 0.0,
('S', 'm'): 0.038036819577704585,
('S', 'mg'): 1.988957309020319e-05,
('S', 'mq'): 0.0,
('S', 'n'): 0.021170461597212278,
('S', 'ng'): 0.007347208299521059,
('S', 'nr'): 0.011291973629078026,
('S', 'nrfg'): 0.0,
('S', 'nrt'): 0.0,
('S', 'ns'): 0.0,
('S', 'nt'): 5.303886157387518e-06,
('S', 'nz'): 0.0,
('S', 'o'): 0.00021082947475615385,
('S', 'p'): 0.05044658721445203,
('S', 'q'): 0.007531518343490275,
('S', 'qe'): 0.0,
('S', 'qg'): 0.0,
('S', 'r'): 0.06306851029749498,
('S', 'rg'): 3.447526002301887e-05,
('S', 'rr'): 0.0,
('S', 'rz'): 0.0,
('S', 's'): 0.0,
('S', 't'): 0.0,
('S', 'tg'): 0.0018868575004906095,
('S', 'u'): 0.000967959223723222,
('S', 'ud'): 0.000440222551063164,
('S', 'ug'): 0.0005317145872780986,
('S', 'uj'): 0.001056799316859463,
('S', 'ul'): 0.00022143724707092888,
('S', 'uv'): 0.00028640985249892595,
('S', 'uz'): 9.149203621493468e-05,
('S', 'v'): 0.04720326082920956,
('S', 'vd'): 0.0,
('S', 'vg'): 0.0026240976763674743,
('S', 'vi'): 0.0,
('S', 'vn'): 1.0607772314775036e-05,
('S', 'vq'): 0.0,
('S', 'w'): 0.0,
('S', 'x'): 0.0002187853039922351,
('S', 'y'): 0.00203536631289746,
('S', 'yg'): 1.3259715393468796e-06,
('S', 'z'): 0.0,
('S', 'zg'): 0.0}
{('B', 'a'): -4.762305214596967,
('B', 'ad'): -6.680066036784177,
('B', 'ag'): -3.14e+100,
('B', 'an'): -8.697083223018778,
('B', 'b'): -5.018374362109218,
('B', 'bg'): -3.14e+100,
('B', 'c'): -3.423880184954888,
('B', 'd'): -3.9750475297585357,
('B', 'df'): -8.888974230828882,
('B', 'dg'): -3.14e+100,
('B', 'e'): -8.563551830394255,
('B', 'en'): -3.14e+100,
('B', 'f'): -5.491630418482717,
('B', 'g'): -3.14e+100,
('B', 'h'): -13.533365129970255,
('B', 'i'): -6.1157847275557105,
('B', 'in'): -3.14e+100,
('B', 'j'): -5.0576191284681915,
('B', 'jn'): -3.14e+100,
('B', 'k'): -3.14e+100,
('B', 'l'): -4.905883584659895,
('B', 'ln'): -3.14e+100,
('B', 'm'): -3.6524299819046386,
('B', 'mg'): -3.14e+100,
('B', 'mq'): -6.78695300139688,
('B', 'n'): -1.6966257797548328,
('B', 'ng'): -3.14e+100,
('B', 'nr'): -2.2310495913769506,
('B', 'nrfg'): -5.873722175405573,
('B', 'nrt'): -4.985642733519195,
('B', 'ns'): -2.8228438314969213,
('B', 'nt'): -4.846091668182416,
('B', 'nz'): -3.94698846057672,
('B', 'o'): -8.433498702146057,
('B', 'p'): -4.200984132085048,
('B', 'q'): -6.998123858956596,
('B', 'qe'): -3.14e+100,
('B', 'qg'): -3.14e+100,
('B', 'r'): -3.4098187790818413,
('B', 'rg'): -3.14e+100,
('B', 'rr'): -12.434752841302146,
('B', 'rz'): -7.946116471570005,
('B', 's'): -5.522673590839954,
('B', 't'): -3.3647479094528574,
('B', 'tg'): -3.14e+100,
('B', 'u'): -9.163917277503234,
('B', 'ud'): -3.14e+100,
('B', 'ug'): -3.14e+100,
('B', 'uj'): -3.14e+100,
('B', 'ul'): -3.14e+100,
('B', 'uv'): -3.14e+100,
('B', 'uz'): -3.14e+100,
('B', 'v'): -2.6740584874265685,
('B', 'vd'): -9.044728760238115,
('B', 'vg'): -3.14e+100,
('B', 'vi'): -12.434752841302146,
('B', 'vn'): -4.3315610890163585,
('B', 'vq'): -12.147070768850364,
('B', 'w'): -3.14e+100,
('B', 'x'): -3.14e+100,
('B', 'y'): -9.844485675856319,
('B', 'yg'): -3.14e+100,
('B', 'z'): -7.045681111485645,
('B', 'zg'): -3.14e+100,
('E', 'a'): -3.14e+100,
('E', 'ad'): -3.14e+100,
('E', 'ag'): -3.14e+100,
('E', 'an'): -3.14e+100,
('E', 'b'): -3.14e+100,
('E', 'bg'): -3.14e+100,
('E', 'c'): -3.14e+100,
('E', 'd'): -3.14e+100,
('E', 'df'): -3.14e+100,
('E', 'dg'): -3.14e+100,
('E', 'e'): -3.14e+100,
('E', 'en'): -3.14e+100,
('E', 'f'): -3.14e+100,
('E', 'g'): -3.14e+100,
('E', 'h'): -3.14e+100,
('E', 'i'): -3.14e+100,
('E', 'in'): -3.14e+100,
('E', 'j'): -3.14e+100,
('E', 'jn'): -3.14e+100,
('E', 'k'): -3.14e+100,
('E', 'l'): -3.14e+100,
('E', 'ln'): -3.14e+100,
('E', 'm'): -3.14e+100,
('E', 'mg'): -3.14e+100,
('E', 'mq'): -3.14e+100,
('E', 'n'): -3.14e+100,
('E', 'ng'): -3.14e+100,
('E', 'nr'): -3.14e+100,
('E', 'nrfg'): -3.14e+100,
('E', 'nrt'): -3.14e+100,
('E', 'ns'): -3.14e+100,
('E', 'nt'): -3.14e+100,
('E', 'nz'): -3.14e+100,
('E', 'o'): -3.14e+100,
('E', 'p'): -3.14e+100,
('E', 'q'): -3.14e+100,
('E', 'qe'): -3.14e+100,
('E', 'qg'): -3.14e+100,
('E', 'r'): -3.14e+100,
('E', 'rg'): -3.14e+100,
('E', 'rr'): -3.14e+100,
('E', 'rz'): -3.14e+100,
('E', 's'): -3.14e+100,
('E', 't'): -3.14e+100,
('E', 'tg'): -3.14e+100,
('E', 'u'): -3.14e+100,
('E', 'ud'): -3.14e+100,
('E', 'ug'): -3.14e+100,
('E', 'uj'): -3.14e+100,
('E', 'ul'): -3.14e+100,
('E', 'uv'): -3.14e+100,
('E', 'uz'): -3.14e+100,
('E', 'v'): -3.14e+100,
('E', 'vd'): -3.14e+100,
('E', 'vg'): -3.14e+100,
('E', 'vi'): -3.14e+100,
('E', 'vn'): -3.14e+100,
('E', 'vq'): -3.14e+100,
('E', 'w'): -3.14e+100,
('E', 'x'): -3.14e+100,
('E', 'y'): -3.14e+100,
('E', 'yg'): -3.14e+100,
('E', 'z'): -3.14e+100,
('E', 'zg'): -3.14e+100,
('M', 'a'): -3.14e+100,
('M', 'ad'): -3.14e+100,
('M', 'ag'): -3.14e+100,
('M', 'an'): -3.14e+100,
('M', 'b'): -3.14e+100,
('M', 'bg'): -3.14e+100,
('M', 'c'): -3.14e+100,
('M', 'd'): -3.14e+100,
('M', 'df'): -3.14e+100,
('M', 'dg'): -3.14e+100,
('M', 'e'): -3.14e+100,
('M', 'en'): -3.14e+100,
('M', 'f'): -3.14e+100,
('M', 'g'): -3.14e+100,
('M', 'h'): -3.14e+100,
('M', 'i'): -3.14e+100,
('M', 'in'): -3.14e+100,
('M', 'j'): -3.14e+100,
('M', 'jn'): -3.14e+100,
('M', 'k'): -3.14e+100,
('M', 'l'): -3.14e+100,
('M', 'ln'): -3.14e+100,
('M', 'm'): -3.14e+100,
('M', 'mg'): -3.14e+100,
('M', 'mq'): -3.14e+100,
('M', 'n'): -3.14e+100,
('M', 'ng'): -3.14e+100,
('M', 'nr'): -3.14e+100,
('M', 'nrfg'): -3.14e+100,
('M', 'nrt'): -3.14e+100,
('M', 'ns'): -3.14e+100,
('M', 'nt'): -3.14e+100,
('M', 'nz'): -3.14e+100,
('M', 'o'): -3.14e+100,
('M', 'p'): -3.14e+100,
('M', 'q'): -3.14e+100,
('M', 'qe'): -3.14e+100,
('M', 'qg'): -3.14e+100,
('M', 'r'): -3.14e+100,
('M', 'rg'): -3.14e+100,
('M', 'rr'): -3.14e+100,
('M', 'rz'): -3.14e+100,
('M', 's'): -3.14e+100,
('M', 't'): -3.14e+100,
('M', 'tg'): -3.14e+100,
('M', 'u'): -3.14e+100,
('M', 'ud'): -3.14e+100,
('M', 'ug'): -3.14e+100,
('M', 'uj'): -3.14e+100,
('M', 'ul'): -3.14e+100,
('M', 'uv'): -3.14e+100,
('M', 'uz'): -3.14e+100,
('M', 'v'): -3.14e+100,
('M', 'vd'): -3.14e+100,
('M', 'vg'): -3.14e+100,
('M', 'vi'): -3.14e+100,
('M', 'vn'): -3.14e+100,
('M', 'vq'): -3.14e+100,
('M', 'w'): -3.14e+100,
('M', 'x'): -3.14e+100,
('M', 'y'): -3.14e+100,
('M', 'yg'): -3.14e+100,
('M', 'z'): -3.14e+100,
('M', 'zg'): -3.14e+100,
('S', 'a'): -3.9025396831295227,
('S', 'ad'): -11.048458480182255,
('S', 'ag'): -6.954113917960154,
('S', 'an'): -12.84021794941031,
('S', 'b'): -6.472888763970454,
('S', 'bg'): -3.14e+100,
('S', 'c'): -4.786966795861212,
('S', 'd'): -3.903919764181873,
('S', 'df'): -3.14e+100,
('S', 'dg'): -8.948397651299683,
('S', 'e'): -5.942513006281674,
('S', 'en'): -3.14e+100,
('S', 'f'): -5.194820249981676,
('S', 'g'): -6.507826815331734,
('S', 'h'): -8.650563207383884,
('S', 'i'): -3.14e+100,
('S', 'in'): -3.14e+100,
('S', 'j'): -4.911992119644354,
('S', 'jn'): -3.14e+100,
('S', 'k'): -6.940320595827818,
('S', 'l'): -3.14e+100,
('S', 'ln'): -3.14e+100,
('S', 'm'): -3.269200652116097,
('S', 'mg'): -10.825314928868044,
('S', 'mq'): -3.14e+100,
('S', 'n'): -3.8551483897645107,
('S', 'ng'): -4.913434861102905,
('S', 'nr'): -4.483663103956885,
('S', 'nrfg'): -3.14e+100,
('S', 'nrt'): -3.14e+100,
('S', 'ns'): -3.14e+100,
('S', 'nt'): -12.147070768850364,
('S', 'nz'): -3.14e+100,
('S', 'o'): -8.464460927750023,
('S', 'p'): -2.9868401813596317,
('S', 'q'): -4.888658618255058,
('S', 'qe'): -3.14e+100,
('S', 'qg'): -3.14e+100,
('S', 'r'): -2.7635336784127853,
('S', 'rg'): -10.275268591948773,
('S', 'rr'): -3.14e+100,
('S', 'rz'): -3.14e+100,
('S', 's'): -3.14e+100,
('S', 't'): -3.14e+100,
('S', 'tg'): -6.272842531880403,
('S', 'u'): -6.940320595827818,
('S', 'ud'): -7.728230161053767,
('S', 'ug'): -7.5394037026636855,
('S', 'uj'): -6.85251045118004,
('S', 'ul'): -8.4153713175535,
('S', 'uv'): -8.15808672228609,
('S', 'uz'): -9.299258625372996,
('S', 'v'): -3.053292303412302,
('S', 'vd'): -3.14e+100,
('S', 'vg'): -5.9430181843676895,
('S', 'vi'): -3.14e+100,
('S', 'vn'): -11.453923588290419,
('S', 'vq'): -3.14e+100,
('S', 'w'): -3.14e+100,
('S', 'x'): -8.427419656069674,
('S', 'y'): -6.1970794699489575,
('S', 'yg'): -13.533365129970255,
('S', 'z'): -3.14e+100,
('S', 'zg'): -3.14e+100}

File diff suppressed because it is too large Load Diff

@ -1,4 +1,5 @@
import operator
MIN_FLOAT=-3.14e100
def get_top_states(t_state_v,K=4):
items = t_state_v.items()
@ -10,7 +11,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
mem_path = [{}]
all_states = trans_p.keys()
for y in states.get(obs[0],all_states): #init
V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
mem_path[0][y] = ''
for t in range(1,len(obs)):
V.append({})
@ -24,7 +25,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
if len(obs_states)==0: obs_states = all_states
for y in obs_states:
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states])
V[t][y] =prob
mem_path[t][y] = state

@ -1,6 +1,6 @@
from distutils.core import setup
setup(name='jieba',
version='0.23',
version='0.24',
description='Chinese Words Segementation Utilities',
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',

Loading…
Cancel
Save