fix bugs in jieba for py3k

pull/50/head
Sun Junyi
commit 0f4f9067c3

@ -10,11 +10,16 @@ jieba
Feature
========
* 支持两种分词模式:
* 支持三种分词模式:
* 1) 精确模式,试图将句子最精确地切开,适合文本分析;
* 2) 全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义;
* 3) 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
Python Version
==============
* 目前master分支是只支持Python2.x 的
* Python3.x 版本的分支也已经基本可用: https://github.com/fxsjy/jieba/tree/jieba3k
Usage
========
* 全自动安装:`easy_install jieba` 或者 `pip install jieba`
@ -60,8 +65,7 @@ Output:
【新词识别】:他, 来到, 了, 网易, 杭研, 大厦 (此处“杭研”并没有在词典中但是也被Viterbi算法识别出来了)
【搜索引擎模式】: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在
, 日本, 京都, 大学, 日本京都大学, 深造
【搜索引擎模式】: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造
功能 2) :添加自定义词典
================
@ -105,7 +109,18 @@ Output:
爱 v
北京 ns
天安门 ns
其他词典
========
1. 占用内存较小的词典文件
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
2. 支持繁体分词更好的词典文件
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
下载你所需要的词典,然后覆盖jieba/dict.txt 即可。
分词速度
@ -116,14 +131,21 @@ Output:
在线演示
=========
http://209.222.69.242:9000/
http://jiebademo.ap01.aws.af.cm/
(Powered by Appfog)
常见问题
=========
1) 模型的数据是如何生成的?https://github.com/fxsjy/jieba/issues/7
2) 这个库的授权是? https://github.com/fxsjy/jieba/issues/2
更多问题请点击:https://github.com/fxsjy/jieba/issues?sort=updated&state=closed
Change Log
==========
http://www.oschina.net/p/jieba/news#list
jieba
========
@ -220,4 +242,6 @@ Segmentation speed
Online demo
=========
http://209.222.69.242:9000/
http://jiebademo.ap01.aws.af.cm/
(Powered by Appfog)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -5,9 +5,15 @@ import pprint
from . import finalseg
import time
import tempfile
import marshal
from math import log
import random
FREQ = {}
total =0.0
def gen_trie(f_name):
lfreq = {}
trie = {}
@ -33,13 +39,32 @@ print("Building Trie...",file=sys.stderr)
t1 = time.time()
trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
FREQ = dict([(k,float(v)/total) for k,v in FREQ.items()]) #normalize
min_freq = min(FREQ.values())
print("dumping model to file cache",file=sys.stderr)
print("loading model cost ", time.time() - t1, "seconds." ,file=sys.stderr)
print("Trie has been built succesfully.",file=sys.stderr)
cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
load_from_cache_fail = True
if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(os.path.join(_curpath,"dict.txt")):
print("loading model from cache", file=sys.stderr)
try:
trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
load_from_cache_fail = False
except:
load_from_cache_fail = True
if load_from_cache_fail:
trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
min_freq = min(FREQ.values())
print("dumping model to file cache", file=sys.stderr)
tmp_suffix = "."+str(random.random())
marshal.dump((trie,FREQ,total,min_freq),open(cache_file+tmp_suffix,'wb'))
if os.name=='nt':
import shutil
replace_file = shutil.move
else:
replace_file = os.rename
replace_file(cache_file+tmp_suffix,cache_file)
print("loading model cost ", time.time() - t1, "seconds.", file= sys.stderr)
print("Trie has been built succesfully.", file= sys.stderr)
def __cut_all(sentence):
@ -59,7 +84,7 @@ def calc(sentence,DAG,idx,route):
N = len(sentence)
route[N] = (1.0,'')
for idx in range(N-1,-1,-1):
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
route[idx] = max(candidates)
def get_DAG(sentence):
@ -94,7 +119,7 @@ def __cut_DAG(sentence):
route ={}
calc(sentence,DAG,0,route=route)
x = 0
buf =u''
buf =''
N = len(sentence)
while x<N:
y = route[x][1]+1
@ -105,12 +130,12 @@ def __cut_DAG(sentence):
if len(buf)>0:
if len(buf)==1:
yield buf
buf=u''
buf=''
else:
regognized = finalseg.__cut(buf)
regognized = finalseg.cut(buf)
for t in regognized:
yield t
buf=u''
buf=''
yield l_word
x =y
@ -118,7 +143,7 @@ def __cut_DAG(sentence):
if len(buf)==1:
yield buf
else:
regognized = finalseg.__cut(buf)
regognized = finalseg.cut(buf)
for t in regognized:
yield t
@ -129,7 +154,11 @@ def cut(sentence,cut_all=False):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n]")
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile("[^\r\n]")
if cut_all:
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
blocks = re_han.split(sentence)
cut_block = __cut_DAG
if cut_all:
@ -169,7 +198,7 @@ def load_userdict(f):
if line.rstrip()=='': continue
word,freq = line.split(" ")
freq = float(freq)
FREQ[word] = freq / total
FREQ[word] = log(freq / total)
p = trie
for c in word:
if not c in p:

@ -283173,7 +283173,7 @@
自学辅导 3 n
自守 3 v
自定 3 d
自定义 3 l
自定义 13 l
自定义词 3 n
自审 6 v
自宫 3 n
@ -367419,4 +367419,6 @@
龟龙片甲 3 nz
龟龙麟凤 3 ns
龠 5 g
龢 732 zg
龢 732 zg
B超 3 n
T恤 4 n

@ -1,10 +1,14 @@
import re
import os
from math import log
MIN_FLOAT=-3.14e100
def load_model(f_name):
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
prob_p_path = os.path.join(_curpath,f_name)
return eval(open(prob_p_path,"rb").read())
tab = eval(open(prob_p_path,"rb").read())
return tab
prob_start = load_model("prob_start.py")
prob_trans = load_model("prob_trans.py")
@ -16,13 +20,13 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
path = {}
for y in states: #init
V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
path[y] = [y]
for t in range(1,len(obs)):
V.append({})
newpath = {}
for y in states:
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in states ])
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in states ])
V[t][y] =prob
newpath[y] = path[state] + [y]
path = newpath
@ -50,12 +54,12 @@ def __cut(sentence):
yield sentence[next:]
def cut(sentence):
if not ( type(sentence) is unicode):
if not ( type(sentence) is str):
try:
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n]")
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):

File diff suppressed because it is too large Load Diff

@ -1 +1,4 @@
{'B': 0.7689828525554734, 'E': 0.0, 'M': 0.0, 'S': 0.23101714744452656}
{'B': -0.26268660809250016,
'E': -3.14e+100,
'M': -3.14e+100,
'S': -1.4652633398537678}

@ -1,4 +1,4 @@
{'B': {'E': 0.8518218565181658, 'M': 0.14817814348183422},
'E': {'B': 0.5544853051164425, 'S': 0.44551469488355755},
'M': {'E': 0.7164487459986911, 'M': 0.2835512540013088},
'S': {'B': 0.48617017333894563, 'S': 0.5138298266610544}}
{'B': {'E': -0.16037786260859094, 'M': -1.9093400568760384},
'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}

@ -3,6 +3,7 @@ import os
from . import viterbi
import jieba
import sys
default_encoding = sys.getfilesystemencoding()
def load_model(f_name):
@ -33,7 +34,7 @@ class pair(object):
self.flag = flag
def __unicode__(self):
return self.word+u"/"+self.flag
return self.word+"/"+self.flag
def __repr__(self):
return self.__str__()
@ -61,12 +62,33 @@ def __cut(sentence):
if next<len(sentence):
yield pair(sentence[next:], pos_list[next][1] )
def __cut_detail(sentence):
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\r\n]")
re_eng,re_num = re.compile("[a-zA-Z+#]+"), re.compile("[0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
for word in __cut(blk):
yield word
else:
tmp = re_skip.split(blk)
for x in tmp:
if x!="":
if re_num.match(x):
yield pair(x,'m')
elif re_eng.match(x):
yield pair(x,'eng')
else:
yield pair(x,'x')
def __cut_DAG(sentence):
DAG = jieba.get_DAG(sentence)
route ={}
jieba.calc(sentence,DAG,0,route=route)
x = 0
buf =u''
buf =''
N = len(sentence)
while x<N:
y = route[x][1]+1
@ -77,12 +99,12 @@ def __cut_DAG(sentence):
if len(buf)>0:
if len(buf)==1:
yield pair(buf,word_tag_tab.get(buf,'x'))
buf=u''
buf=''
else:
regognized = __cut(buf)
regognized = __cut_detail(buf)
for t in regognized:
yield t
buf=u''
buf=''
yield pair(l_word,word_tag_tab.get(l_word,'x'))
x =y
@ -90,7 +112,7 @@ def __cut_DAG(sentence):
if len(buf)==1:
yield pair(buf,word_tag_tab.get(buf,'x'))
else:
regognized = __cut(buf)
regognized = __cut_detail(buf)
for t in regognized:
yield t
@ -101,10 +123,11 @@ def cut(sentence):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n%]")
re_eng,re_num = re.compile(r"[a-zA-Z+#]+"), re.compile(r"[0-9]+")
blocks = re_han.split(sentence)
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile("[^\r\n]")
re_eng,re_num = re.compile("[a-zA-Z+#]+"), re.compile("[0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
for word in __cut_DAG(blk):

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,256 +1,256 @@
{('B', 'a'): 0.008545886571090637,
('B', 'ad'): 0.0012556950477614949,
('B', 'ag'): 0.0,
('B', 'an'): 0.0001670724139577068,
('B', 'b'): 0.006615272009801582,
('B', 'bg'): 0.0,
('B', 'c'): 0.03258575057944956,
('B', 'd'): 0.018778408940230508,
('B', 'df'): 0.00013790104009207547,
('B', 'dg'): 0.0,
('B', 'e'): 0.00019093990166595064,
('B', 'en'): 0.0,
('B', 'f'): 0.004121119544290101,
('B', 'g'): 0.0,
('B', 'h'): 1.3259715393468796e-06,
('B', 'i'): 0.0022077426130125543,
('B', 'in'): 0.0,
('B', 'j'): 0.006360685474246981,
('B', 'jn'): 0.0,
('B', 'k'): 0.0,
('B', 'l'): 0.007402899104173628,
('B', 'ln'): 0.0,
('B', 'm'): 0.02592804748038888,
('B', 'mg'): 0.0,
('B', 'mq'): 0.0011284017799841944,
('B', 'n'): 0.18330097962777328,
('B', 'ng'): 0.0,
('B', 'nr'): 0.10741562843095136,
('B', 'nrfg'): 0.0028123856349547313,
('B', 'nrt'): 0.006835383285333164,
('B', 'ns'): 0.05943667425122387,
('B', 'nt'): 0.007859033313708954,
('B', 'nz'): 0.0193127754705873,
('B', 'o'): 0.00021745933245288822,
('B', 'p'): 0.014980826451541043,
('B', 'q'): 0.00091359439061,
('B', 'qe'): 0.0,
('B', 'qg'): 0.0,
('B', 'r'): 0.033047188675142274,
('B', 'rg'): 0.0,
('B', 'rr'): 3.977914618040638e-06,
('B', 'rz'): 0.0003540344010056168,
('B', 's'): 0.0039951522480521475,
('B', 't'): 0.03457072997385184,
('B', 'tg'): 0.0,
('B', 'u'): 0.00010475175160840347,
('B', 'ud'): 0.0,
('B', 'ug'): 0.0,
('B', 'uj'): 0.0,
('B', 'ul'): 0.0,
('B', 'uv'): 0.0,
('B', 'uz'): 0.0,
('B', 'v'): 0.06897173559066729,
('B', 'vd'): 0.00011801146700187228,
('B', 'vg'): 0.0,
('B', 'vi'): 3.977914618040638e-06,
('B', 'vn'): 0.01314700781262431,
('B', 'vq'): 5.303886157387518e-06,
('B', 'w'): 0.0,
('B', 'x'): 0.0,
('B', 'y'): 5.303886157387518e-05,
('B', 'yg'): 0.0,
('B', 'z'): 0.0008711633013508998,
('B', 'zg'): 0.0,
('E', 'a'): 0.0,
('E', 'ad'): 0.0,
('E', 'ag'): 0.0,
('E', 'an'): 0.0,
('E', 'b'): 0.0,
('E', 'bg'): 0.0,
('E', 'c'): 0.0,
('E', 'd'): 0.0,
('E', 'df'): 0.0,
('E', 'dg'): 0.0,
('E', 'e'): 0.0,
('E', 'en'): 0.0,
('E', 'f'): 0.0,
('E', 'g'): 0.0,
('E', 'h'): 0.0,
('E', 'i'): 0.0,
('E', 'in'): 0.0,
('E', 'j'): 0.0,
('E', 'jn'): 0.0,
('E', 'k'): 0.0,
('E', 'l'): 0.0,
('E', 'ln'): 0.0,
('E', 'm'): 0.0,
('E', 'mg'): 0.0,
('E', 'mq'): 0.0,
('E', 'n'): 0.0,
('E', 'ng'): 0.0,
('E', 'nr'): 0.0,
('E', 'nrfg'): 0.0,
('E', 'nrt'): 0.0,
('E', 'ns'): 0.0,
('E', 'nt'): 0.0,
('E', 'nz'): 0.0,
('E', 'o'): 0.0,
('E', 'p'): 0.0,
('E', 'q'): 0.0,
('E', 'qe'): 0.0,
('E', 'qg'): 0.0,
('E', 'r'): 0.0,
('E', 'rg'): 0.0,
('E', 'rr'): 0.0,
('E', 'rz'): 0.0,
('E', 's'): 0.0,
('E', 't'): 0.0,
('E', 'tg'): 0.0,
('E', 'u'): 0.0,
('E', 'ud'): 0.0,
('E', 'ug'): 0.0,
('E', 'uj'): 0.0,
('E', 'ul'): 0.0,
('E', 'uv'): 0.0,
('E', 'uz'): 0.0,
('E', 'v'): 0.0,
('E', 'vd'): 0.0,
('E', 'vg'): 0.0,
('E', 'vi'): 0.0,
('E', 'vn'): 0.0,
('E', 'vq'): 0.0,
('E', 'w'): 0.0,
('E', 'x'): 0.0,
('E', 'y'): 0.0,
('E', 'yg'): 0.0,
('E', 'z'): 0.0,
('E', 'zg'): 0.0,
('M', 'a'): 0.0,
('M', 'ad'): 0.0,
('M', 'ag'): 0.0,
('M', 'an'): 0.0,
('M', 'b'): 0.0,
('M', 'bg'): 0.0,
('M', 'c'): 0.0,
('M', 'd'): 0.0,
('M', 'df'): 0.0,
('M', 'dg'): 0.0,
('M', 'e'): 0.0,
('M', 'en'): 0.0,
('M', 'f'): 0.0,
('M', 'g'): 0.0,
('M', 'h'): 0.0,
('M', 'i'): 0.0,
('M', 'in'): 0.0,
('M', 'j'): 0.0,
('M', 'jn'): 0.0,
('M', 'k'): 0.0,
('M', 'l'): 0.0,
('M', 'ln'): 0.0,
('M', 'm'): 0.0,
('M', 'mg'): 0.0,
('M', 'mq'): 0.0,
('M', 'n'): 0.0,
('M', 'ng'): 0.0,
('M', 'nr'): 0.0,
('M', 'nrfg'): 0.0,
('M', 'nrt'): 0.0,
('M', 'ns'): 0.0,
('M', 'nt'): 0.0,
('M', 'nz'): 0.0,
('M', 'o'): 0.0,
('M', 'p'): 0.0,
('M', 'q'): 0.0,
('M', 'qe'): 0.0,
('M', 'qg'): 0.0,
('M', 'r'): 0.0,
('M', 'rg'): 0.0,
('M', 'rr'): 0.0,
('M', 'rz'): 0.0,
('M', 's'): 0.0,
('M', 't'): 0.0,
('M', 'tg'): 0.0,
('M', 'u'): 0.0,
('M', 'ud'): 0.0,
('M', 'ug'): 0.0,
('M', 'uj'): 0.0,
('M', 'ul'): 0.0,
('M', 'uv'): 0.0,
('M', 'uz'): 0.0,
('M', 'v'): 0.0,
('M', 'vd'): 0.0,
('M', 'vg'): 0.0,
('M', 'vi'): 0.0,
('M', 'vn'): 0.0,
('M', 'vq'): 0.0,
('M', 'w'): 0.0,
('M', 'x'): 0.0,
('M', 'y'): 0.0,
('M', 'yg'): 0.0,
('M', 'z'): 0.0,
('M', 'zg'): 0.0,
('S', 'a'): 0.020190568629634933,
('S', 'ad'): 1.5911658472162552e-05,
('S', 'ag'): 0.0009546995083297532,
('S', 'an'): 2.651943078693759e-06,
('S', 'b'): 0.0015447568433391145,
('S', 'bg'): 0.0,
('S', 'c'): 0.008337709039413178,
('S', 'd'): 0.020162723227308648,
('S', 'df'): 0.0,
('S', 'dg'): 0.0001299452108559942,
('S', 'e'): 0.0026254236479068215,
('S', 'en'): 0.0,
('S', 'f'): 0.0055452129775486496,
('S', 'g'): 0.0014917179817652395,
('S', 'h'): 0.00017502824319378808,
('S', 'i'): 0.0,
('S', 'in'): 0.0,
('S', 'j'): 0.007357816071835834,
('S', 'jn'): 0.0,
('S', 'k'): 0.000967959223723222,
('S', 'l'): 0.0,
('S', 'ln'): 0.0,
('S', 'm'): 0.038036819577704585,
('S', 'mg'): 1.988957309020319e-05,
('S', 'mq'): 0.0,
('S', 'n'): 0.021170461597212278,
('S', 'ng'): 0.007347208299521059,
('S', 'nr'): 0.011291973629078026,
('S', 'nrfg'): 0.0,
('S', 'nrt'): 0.0,
('S', 'ns'): 0.0,
('S', 'nt'): 5.303886157387518e-06,
('S', 'nz'): 0.0,
('S', 'o'): 0.00021082947475615385,
('S', 'p'): 0.05044658721445203,
('S', 'q'): 0.007531518343490275,
('S', 'qe'): 0.0,
('S', 'qg'): 0.0,
('S', 'r'): 0.06306851029749498,
('S', 'rg'): 3.447526002301887e-05,
('S', 'rr'): 0.0,
('S', 'rz'): 0.0,
('S', 's'): 0.0,
('S', 't'): 0.0,
('S', 'tg'): 0.0018868575004906095,
('S', 'u'): 0.000967959223723222,
('S', 'ud'): 0.000440222551063164,
('S', 'ug'): 0.0005317145872780986,
('S', 'uj'): 0.001056799316859463,
('S', 'ul'): 0.00022143724707092888,
('S', 'uv'): 0.00028640985249892595,
('S', 'uz'): 9.149203621493468e-05,
('S', 'v'): 0.04720326082920956,
('S', 'vd'): 0.0,
('S', 'vg'): 0.0026240976763674743,
('S', 'vi'): 0.0,
('S', 'vn'): 1.0607772314775036e-05,
('S', 'vq'): 0.0,
('S', 'w'): 0.0,
('S', 'x'): 0.0002187853039922351,
('S', 'y'): 0.00203536631289746,
('S', 'yg'): 1.3259715393468796e-06,
('S', 'z'): 0.0,
('S', 'zg'): 0.0}
{('B', 'a'): -4.762305214596967,
('B', 'ad'): -6.680066036784177,
('B', 'ag'): -3.14e+100,
('B', 'an'): -8.697083223018778,
('B', 'b'): -5.018374362109218,
('B', 'bg'): -3.14e+100,
('B', 'c'): -3.423880184954888,
('B', 'd'): -3.9750475297585357,
('B', 'df'): -8.888974230828882,
('B', 'dg'): -3.14e+100,
('B', 'e'): -8.563551830394255,
('B', 'en'): -3.14e+100,
('B', 'f'): -5.491630418482717,
('B', 'g'): -3.14e+100,
('B', 'h'): -13.533365129970255,
('B', 'i'): -6.1157847275557105,
('B', 'in'): -3.14e+100,
('B', 'j'): -5.0576191284681915,
('B', 'jn'): -3.14e+100,
('B', 'k'): -3.14e+100,
('B', 'l'): -4.905883584659895,
('B', 'ln'): -3.14e+100,
('B', 'm'): -3.6524299819046386,
('B', 'mg'): -3.14e+100,
('B', 'mq'): -6.78695300139688,
('B', 'n'): -1.6966257797548328,
('B', 'ng'): -3.14e+100,
('B', 'nr'): -2.2310495913769506,
('B', 'nrfg'): -5.873722175405573,
('B', 'nrt'): -4.985642733519195,
('B', 'ns'): -2.8228438314969213,
('B', 'nt'): -4.846091668182416,
('B', 'nz'): -3.94698846057672,
('B', 'o'): -8.433498702146057,
('B', 'p'): -4.200984132085048,
('B', 'q'): -6.998123858956596,
('B', 'qe'): -3.14e+100,
('B', 'qg'): -3.14e+100,
('B', 'r'): -3.4098187790818413,
('B', 'rg'): -3.14e+100,
('B', 'rr'): -12.434752841302146,
('B', 'rz'): -7.946116471570005,
('B', 's'): -5.522673590839954,
('B', 't'): -3.3647479094528574,
('B', 'tg'): -3.14e+100,
('B', 'u'): -9.163917277503234,
('B', 'ud'): -3.14e+100,
('B', 'ug'): -3.14e+100,
('B', 'uj'): -3.14e+100,
('B', 'ul'): -3.14e+100,
('B', 'uv'): -3.14e+100,
('B', 'uz'): -3.14e+100,
('B', 'v'): -2.6740584874265685,
('B', 'vd'): -9.044728760238115,
('B', 'vg'): -3.14e+100,
('B', 'vi'): -12.434752841302146,
('B', 'vn'): -4.3315610890163585,
('B', 'vq'): -12.147070768850364,
('B', 'w'): -3.14e+100,
('B', 'x'): -3.14e+100,
('B', 'y'): -9.844485675856319,
('B', 'yg'): -3.14e+100,
('B', 'z'): -7.045681111485645,
('B', 'zg'): -3.14e+100,
('E', 'a'): -3.14e+100,
('E', 'ad'): -3.14e+100,
('E', 'ag'): -3.14e+100,
('E', 'an'): -3.14e+100,
('E', 'b'): -3.14e+100,
('E', 'bg'): -3.14e+100,
('E', 'c'): -3.14e+100,
('E', 'd'): -3.14e+100,
('E', 'df'): -3.14e+100,
('E', 'dg'): -3.14e+100,
('E', 'e'): -3.14e+100,
('E', 'en'): -3.14e+100,
('E', 'f'): -3.14e+100,
('E', 'g'): -3.14e+100,
('E', 'h'): -3.14e+100,
('E', 'i'): -3.14e+100,
('E', 'in'): -3.14e+100,
('E', 'j'): -3.14e+100,
('E', 'jn'): -3.14e+100,
('E', 'k'): -3.14e+100,
('E', 'l'): -3.14e+100,
('E', 'ln'): -3.14e+100,
('E', 'm'): -3.14e+100,
('E', 'mg'): -3.14e+100,
('E', 'mq'): -3.14e+100,
('E', 'n'): -3.14e+100,
('E', 'ng'): -3.14e+100,
('E', 'nr'): -3.14e+100,
('E', 'nrfg'): -3.14e+100,
('E', 'nrt'): -3.14e+100,
('E', 'ns'): -3.14e+100,
('E', 'nt'): -3.14e+100,
('E', 'nz'): -3.14e+100,
('E', 'o'): -3.14e+100,
('E', 'p'): -3.14e+100,
('E', 'q'): -3.14e+100,
('E', 'qe'): -3.14e+100,
('E', 'qg'): -3.14e+100,
('E', 'r'): -3.14e+100,
('E', 'rg'): -3.14e+100,
('E', 'rr'): -3.14e+100,
('E', 'rz'): -3.14e+100,
('E', 's'): -3.14e+100,
('E', 't'): -3.14e+100,
('E', 'tg'): -3.14e+100,
('E', 'u'): -3.14e+100,
('E', 'ud'): -3.14e+100,
('E', 'ug'): -3.14e+100,
('E', 'uj'): -3.14e+100,
('E', 'ul'): -3.14e+100,
('E', 'uv'): -3.14e+100,
('E', 'uz'): -3.14e+100,
('E', 'v'): -3.14e+100,
('E', 'vd'): -3.14e+100,
('E', 'vg'): -3.14e+100,
('E', 'vi'): -3.14e+100,
('E', 'vn'): -3.14e+100,
('E', 'vq'): -3.14e+100,
('E', 'w'): -3.14e+100,
('E', 'x'): -3.14e+100,
('E', 'y'): -3.14e+100,
('E', 'yg'): -3.14e+100,
('E', 'z'): -3.14e+100,
('E', 'zg'): -3.14e+100,
('M', 'a'): -3.14e+100,
('M', 'ad'): -3.14e+100,
('M', 'ag'): -3.14e+100,
('M', 'an'): -3.14e+100,
('M', 'b'): -3.14e+100,
('M', 'bg'): -3.14e+100,
('M', 'c'): -3.14e+100,
('M', 'd'): -3.14e+100,
('M', 'df'): -3.14e+100,
('M', 'dg'): -3.14e+100,
('M', 'e'): -3.14e+100,
('M', 'en'): -3.14e+100,
('M', 'f'): -3.14e+100,
('M', 'g'): -3.14e+100,
('M', 'h'): -3.14e+100,
('M', 'i'): -3.14e+100,
('M', 'in'): -3.14e+100,
('M', 'j'): -3.14e+100,
('M', 'jn'): -3.14e+100,
('M', 'k'): -3.14e+100,
('M', 'l'): -3.14e+100,
('M', 'ln'): -3.14e+100,
('M', 'm'): -3.14e+100,
('M', 'mg'): -3.14e+100,
('M', 'mq'): -3.14e+100,
('M', 'n'): -3.14e+100,
('M', 'ng'): -3.14e+100,
('M', 'nr'): -3.14e+100,
('M', 'nrfg'): -3.14e+100,
('M', 'nrt'): -3.14e+100,
('M', 'ns'): -3.14e+100,
('M', 'nt'): -3.14e+100,
('M', 'nz'): -3.14e+100,
('M', 'o'): -3.14e+100,
('M', 'p'): -3.14e+100,
('M', 'q'): -3.14e+100,
('M', 'qe'): -3.14e+100,
('M', 'qg'): -3.14e+100,
('M', 'r'): -3.14e+100,
('M', 'rg'): -3.14e+100,
('M', 'rr'): -3.14e+100,
('M', 'rz'): -3.14e+100,
('M', 's'): -3.14e+100,
('M', 't'): -3.14e+100,
('M', 'tg'): -3.14e+100,
('M', 'u'): -3.14e+100,
('M', 'ud'): -3.14e+100,
('M', 'ug'): -3.14e+100,
('M', 'uj'): -3.14e+100,
('M', 'ul'): -3.14e+100,
('M', 'uv'): -3.14e+100,
('M', 'uz'): -3.14e+100,
('M', 'v'): -3.14e+100,
('M', 'vd'): -3.14e+100,
('M', 'vg'): -3.14e+100,
('M', 'vi'): -3.14e+100,
('M', 'vn'): -3.14e+100,
('M', 'vq'): -3.14e+100,
('M', 'w'): -3.14e+100,
('M', 'x'): -3.14e+100,
('M', 'y'): -3.14e+100,
('M', 'yg'): -3.14e+100,
('M', 'z'): -3.14e+100,
('M', 'zg'): -3.14e+100,
('S', 'a'): -3.9025396831295227,
('S', 'ad'): -11.048458480182255,
('S', 'ag'): -6.954113917960154,
('S', 'an'): -12.84021794941031,
('S', 'b'): -6.472888763970454,
('S', 'bg'): -3.14e+100,
('S', 'c'): -4.786966795861212,
('S', 'd'): -3.903919764181873,
('S', 'df'): -3.14e+100,
('S', 'dg'): -8.948397651299683,
('S', 'e'): -5.942513006281674,
('S', 'en'): -3.14e+100,
('S', 'f'): -5.194820249981676,
('S', 'g'): -6.507826815331734,
('S', 'h'): -8.650563207383884,
('S', 'i'): -3.14e+100,
('S', 'in'): -3.14e+100,
('S', 'j'): -4.911992119644354,
('S', 'jn'): -3.14e+100,
('S', 'k'): -6.940320595827818,
('S', 'l'): -3.14e+100,
('S', 'ln'): -3.14e+100,
('S', 'm'): -3.269200652116097,
('S', 'mg'): -10.825314928868044,
('S', 'mq'): -3.14e+100,
('S', 'n'): -3.8551483897645107,
('S', 'ng'): -4.913434861102905,
('S', 'nr'): -4.483663103956885,
('S', 'nrfg'): -3.14e+100,
('S', 'nrt'): -3.14e+100,
('S', 'ns'): -3.14e+100,
('S', 'nt'): -12.147070768850364,
('S', 'nz'): -3.14e+100,
('S', 'o'): -8.464460927750023,
('S', 'p'): -2.9868401813596317,
('S', 'q'): -4.888658618255058,
('S', 'qe'): -3.14e+100,
('S', 'qg'): -3.14e+100,
('S', 'r'): -2.7635336784127853,
('S', 'rg'): -10.275268591948773,
('S', 'rr'): -3.14e+100,
('S', 'rz'): -3.14e+100,
('S', 's'): -3.14e+100,
('S', 't'): -3.14e+100,
('S', 'tg'): -6.272842531880403,
('S', 'u'): -6.940320595827818,
('S', 'ud'): -7.728230161053767,
('S', 'ug'): -7.5394037026636855,
('S', 'uj'): -6.85251045118004,
('S', 'ul'): -8.4153713175535,
('S', 'uv'): -8.15808672228609,
('S', 'uz'): -9.299258625372996,
('S', 'v'): -3.053292303412302,
('S', 'vd'): -3.14e+100,
('S', 'vg'): -5.9430181843676895,
('S', 'vi'): -3.14e+100,
('S', 'vn'): -11.453923588290419,
('S', 'vq'): -3.14e+100,
('S', 'w'): -3.14e+100,
('S', 'x'): -8.427419656069674,
('S', 'y'): -6.1970794699489575,
('S', 'yg'): -13.533365129970255,
('S', 'z'): -3.14e+100,
('S', 'zg'): -3.14e+100}

File diff suppressed because it is too large Load Diff

@ -1,4 +1,5 @@
import operator
MIN_FLOAT=-3.14e100
def get_top_states(t_state_v,K=4):
items = t_state_v.items()
@ -10,7 +11,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
mem_path = [{}]
all_states = trans_p.keys()
for y in states.get(obs[0],all_states): #init
V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
mem_path[0][y] = ''
for t in range(1,len(obs)):
V.append({})
@ -24,7 +25,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
if len(obs_states)==0: obs_states = all_states
for y in obs_states:
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states])
V[t][y] =prob
mem_path[t][y] = state

@ -1,6 +1,6 @@
from distutils.core import setup
setup(name='jieba',
version='0.22',
version='0.25',
description='Chinese Words Segementation Utilities',
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',

@ -88,4 +88,5 @@ if __name__ == "__main__":
cuttest('一次性交多少钱')
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
cuttest('小和尚留了一个像大和尚一样的和尚头')
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')

@ -90,4 +90,5 @@ if __name__ == "__main__":
cuttest('一次性交多少钱')
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
cuttest('小和尚留了一个像大和尚一样的和尚头')
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')

Loading…
Cancel
Save