fix bugs in jieba for py3k

pull/50/head
Sun Junyi
commit 0f4f9067c3

@ -10,11 +10,16 @@ jieba
Feature
========
* 支持两种分词模式:
* 支持三种分词模式:
* 1) 精确模式,试图将句子最精确地切开,适合文本分析;
* 2) 全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义;
* 3) 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
Python Version
==============
* 目前master分支是只支持Python2.x 的
* Python3.x 版本的分支也已经基本可用: https://github.com/fxsjy/jieba/tree/jieba3k
Usage
========
* 全自动安装:`easy_install jieba` 或者 `pip install jieba`
@ -60,8 +65,7 @@ Output:
【新词识别】:他, 来到, 了, 网易, 杭研, 大厦 (此处“杭研”并没有在词典中但是也被Viterbi算法识别出来了)
【搜索引擎模式】: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在
, 日本, 京都, 大学, 日本京都大学, 深造
【搜索引擎模式】: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造
功能 2) :添加自定义词典
================
@ -105,7 +109,18 @@ Output:
爱 v
北京 ns
天安门 ns
其他词典
========
1. 占用内存较小的词典文件
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
2. 支持繁体分词更好的词典文件
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
下载你所需要的词典,然后覆盖jieba/dict.txt 即可。
分词速度
@ -116,14 +131,21 @@ Output:
在线演示
=========
http://209.222.69.242:9000/
http://jiebademo.ap01.aws.af.cm/
(Powered by Appfog)
常见问题
=========
1) 模型的数据是如何生成的?https://github.com/fxsjy/jieba/issues/7
2) 这个库的授权是? https://github.com/fxsjy/jieba/issues/2
更多问题请点击:https://github.com/fxsjy/jieba/issues?sort=updated&state=closed
Change Log
==========
http://www.oschina.net/p/jieba/news#list
jieba
========
@ -220,4 +242,6 @@ Segmentation speed
Online demo
=========
http://209.222.69.242:9000/
http://jiebademo.ap01.aws.af.cm/
(Powered by Appfog)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -5,9 +5,15 @@ import pprint
from . import finalseg
import time
import tempfile
import marshal
from math import log
import random
FREQ = {}
total =0.0
def gen_trie(f_name):
lfreq = {}
trie = {}
@ -33,13 +39,32 @@ print("Building Trie...",file=sys.stderr)
t1 = time.time()
trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
FREQ = dict([(k,float(v)/total) for k,v in FREQ.items()]) #normalize
min_freq = min(FREQ.values())
print("dumping model to file cache",file=sys.stderr)
print("loading model cost ", time.time() - t1, "seconds." ,file=sys.stderr)
print("Trie has been built succesfully.",file=sys.stderr)
cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
load_from_cache_fail = True
if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(os.path.join(_curpath,"dict.txt")):
print("loading model from cache", file=sys.stderr)
try:
trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
load_from_cache_fail = False
except:
load_from_cache_fail = True
if load_from_cache_fail:
trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
min_freq = min(FREQ.values())
print("dumping model to file cache", file=sys.stderr)
tmp_suffix = "."+str(random.random())
marshal.dump((trie,FREQ,total,min_freq),open(cache_file+tmp_suffix,'wb'))
if os.name=='nt':
import shutil
replace_file = shutil.move
else:
replace_file = os.rename
replace_file(cache_file+tmp_suffix,cache_file)
print("loading model cost ", time.time() - t1, "seconds.", file= sys.stderr)
print("Trie has been built succesfully.", file= sys.stderr)
def __cut_all(sentence):
@ -59,7 +84,7 @@ def calc(sentence,DAG,idx,route):
N = len(sentence)
route[N] = (1.0,'')
for idx in range(N-1,-1,-1):
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
route[idx] = max(candidates)
def get_DAG(sentence):
@ -94,7 +119,7 @@ def __cut_DAG(sentence):
route ={}
calc(sentence,DAG,0,route=route)
x = 0
buf =u''
buf =''
N = len(sentence)
while x<N:
y = route[x][1]+1
@ -105,12 +130,12 @@ def __cut_DAG(sentence):
if len(buf)>0:
if len(buf)==1:
yield buf
buf=u''
buf=''
else:
regognized = finalseg.__cut(buf)
regognized = finalseg.cut(buf)
for t in regognized:
yield t
buf=u''
buf=''
yield l_word
x =y
@ -118,7 +143,7 @@ def __cut_DAG(sentence):
if len(buf)==1:
yield buf
else:
regognized = finalseg.__cut(buf)
regognized = finalseg.cut(buf)
for t in regognized:
yield t
@ -129,7 +154,11 @@ def cut(sentence,cut_all=False):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n]")
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile("[^\r\n]")
if cut_all:
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
blocks = re_han.split(sentence)
cut_block = __cut_DAG
if cut_all:
@ -169,7 +198,7 @@ def load_userdict(f):
if line.rstrip()=='': continue
word,freq = line.split(" ")
freq = float(freq)
FREQ[word] = freq / total
FREQ[word] = log(freq / total)
p = trie
for c in word:
if not c in p:

@ -283173,7 +283173,7 @@
自学辅导 3 n
自守 3 v
自定 3 d
自定义 3 l
自定义 13 l
自定义词 3 n
自审 6 v
自宫 3 n
@ -367419,4 +367419,6 @@
龟龙片甲 3 nz
龟龙麟凤 3 ns
龠 5 g
龢 732 zg
龢 732 zg
B超 3 n
T恤 4 n

@ -1,10 +1,14 @@
import re
import os
from math import log
MIN_FLOAT=-3.14e100
def load_model(f_name):
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
prob_p_path = os.path.join(_curpath,f_name)
return eval(open(prob_p_path,"rb").read())
tab = eval(open(prob_p_path,"rb").read())
return tab
prob_start = load_model("prob_start.py")
prob_trans = load_model("prob_trans.py")
@ -16,13 +20,13 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
path = {}
for y in states: #init
V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
path[y] = [y]
for t in range(1,len(obs)):
V.append({})
newpath = {}
for y in states:
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in states ])
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in states ])
V[t][y] =prob
newpath[y] = path[state] + [y]
path = newpath
@ -50,12 +54,12 @@ def __cut(sentence):
yield sentence[next:]
def cut(sentence):
if not ( type(sentence) is unicode):
if not ( type(sentence) is str):
try:
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n]")
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):

File diff suppressed because it is too large Load Diff

@ -1 +1,4 @@
{'B': 0.7689828525554734, 'E': 0.0, 'M': 0.0, 'S': 0.23101714744452656}
{'B': -0.26268660809250016,
'E': -3.14e+100,
'M': -3.14e+100,
'S': -1.4652633398537678}

@ -1,4 +1,4 @@
{'B': {'E': 0.8518218565181658, 'M': 0.14817814348183422},
'E': {'B': 0.5544853051164425, 'S': 0.44551469488355755},
'M': {'E': 0.7164487459986911, 'M': 0.2835512540013088},
'S': {'B': 0.48617017333894563, 'S': 0.5138298266610544}}
{'B': {'E': -0.16037786260859094, 'M': -1.9093400568760384},
'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}

@ -3,6 +3,7 @@ import os
from . import viterbi
import jieba
import sys
default_encoding = sys.getfilesystemencoding()
def load_model(f_name):
@ -33,7 +34,7 @@ class pair(object):
self.flag = flag
def __unicode__(self):
return self.word+u"/"+self.flag
return self.word+"/"+self.flag
def __repr__(self):
return self.__str__()
@ -61,12 +62,33 @@ def __cut(sentence):
if next<len(sentence):
yield pair(sentence[next:], pos_list[next][1] )
def __cut_detail(sentence):
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\r\n]")
re_eng,re_num = re.compile("[a-zA-Z+#]+"), re.compile("[0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
for word in __cut(blk):
yield word
else:
tmp = re_skip.split(blk)
for x in tmp:
if x!="":
if re_num.match(x):
yield pair(x,'m')
elif re_eng.match(x):
yield pair(x,'eng')
else:
yield pair(x,'x')
def __cut_DAG(sentence):
DAG = jieba.get_DAG(sentence)
route ={}
jieba.calc(sentence,DAG,0,route=route)
x = 0
buf =u''
buf =''
N = len(sentence)
while x<N:
y = route[x][1]+1
@ -77,12 +99,12 @@ def __cut_DAG(sentence):
if len(buf)>0:
if len(buf)==1:
yield pair(buf,word_tag_tab.get(buf,'x'))
buf=u''
buf=''
else:
regognized = __cut(buf)
regognized = __cut_detail(buf)
for t in regognized:
yield t
buf=u''
buf=''
yield pair(l_word,word_tag_tab.get(l_word,'x'))
x =y
@ -90,7 +112,7 @@ def __cut_DAG(sentence):
if len(buf)==1:
yield pair(buf,word_tag_tab.get(buf,'x'))
else:
regognized = __cut(buf)
regognized = __cut_detail(buf)
for t in regognized:
yield t
@ -101,10 +123,11 @@ def cut(sentence):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n%]")
re_eng,re_num = re.compile(r"[a-zA-Z+#]+"), re.compile(r"[0-9]+")
blocks = re_han.split(sentence)
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile("[^\r\n]")
re_eng,re_num = re.compile("[a-zA-Z+#]+"), re.compile("[0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
for word in __cut_DAG(blk):

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,256 +1,256 @@
{('B', 'a'): 0.008545886571090637,
('B', 'ad'): 0.0012556950477614949,
('B', 'ag'): 0.0,
('B', 'an'): 0.0001670724139577068,
('B', 'b'): 0.006615272009801582,
('B', 'bg'): 0.0,
('B', 'c'): 0.03258575057944956,
('B', 'd'): 0.018778408940230508,
('B', 'df'): 0.00013790104009207547,
('B', 'dg'): 0.0,
('B', 'e'): 0.00019093990166595064,
('B', 'en'): 0.0,
('B', 'f'): 0.004121119544290101,
('B', 'g'): 0.0,
('B', 'h'): 1.3259715393468796e-06,
('B', 'i'): 0.0022077426130125543,
('B', 'in'): 0.0,
('B', 'j'): 0.006360685474246981,
('B', 'jn'): 0.0,
('B', 'k'): 0.0,
('B', 'l'): 0.007402899104173628,
('B', 'ln'): 0.0,
('B', 'm'): 0.02592804748038888,
('B', 'mg'): 0.0,
('B', 'mq'): 0.0011284017799841944,
('B', 'n'): 0.18330097962777328,
('B', 'ng'): 0.0,
('B', 'nr'): 0.10741562843095136,
('B', 'nrfg'): 0.0028123856349547313,
('B', 'nrt'): 0.006835383285333164,
('B', 'ns'): 0.05943667425122387,
('B', 'nt'): 0.007859033313708954,
('B', 'nz'): 0.0193127754705873,
('B', 'o'): 0.00021745933245288822,
('B', 'p'): 0.014980826451541043,
('B', 'q'): 0.00091359439061,
('B', 'qe'): 0.0,
('B', 'qg'): 0.0,
('B', 'r'): 0.033047188675142274,
('B', 'rg'): 0.0,
('B', 'rr'): 3.977914618040638e-06,
('B', 'rz'): 0.0003540344010056168,
('B', 's'): 0.0039951522480521475,
('B', 't'): 0.03457072997385184,
('B', 'tg'): 0.0,
('B', 'u'): 0.00010475175160840347,
('B', 'ud'): 0.0,
('B', 'ug'): 0.0,
('B', 'uj'): 0.0,
('B', 'ul'): 0.0,
('B', 'uv'): 0.0,
('B', 'uz'): 0.0,
('B', 'v'): 0.06897173559066729,
('B', 'vd'): 0.00011801146700187228,
('B', 'vg'): 0.0,
('B', 'vi'): 3.977914618040638e-06,
('B', 'vn'): 0.01314700781262431,
('B', 'vq'): 5.303886157387518e-06,
('B', 'w'): 0.0,
('B', 'x'): 0.0,
('B', 'y'): 5.303886157387518e-05,
('B', 'yg'): 0.0,
('B', 'z'): 0.0008711633013508998,
('B', 'zg'): 0.0,
('E', 'a'): 0.0,
('E', 'ad'): 0.0,
('E', 'ag'): 0.0,
('E', 'an'): 0.0,
('E', 'b'): 0.0,
('E', 'bg'): 0.0,
('E', 'c'): 0.0,
('E', 'd'): 0.0,
('E', 'df'): 0.0,
('E', 'dg'): 0.0,
('E', 'e'): 0.0,
('E', 'en'): 0.0,
('E', 'f'): 0.0,
('E', 'g'): 0.0,
('E', 'h'): 0.0,
('E', 'i'): 0.0,
('E', 'in'): 0.0,
('E', 'j'): 0.0,
('E', 'jn'): 0.0,
('E', 'k'): 0.0,
('E', 'l'): 0.0,
('E', 'ln'): 0.0,
('E', 'm'): 0.0,
('E', 'mg'): 0.0,
('E', 'mq'): 0.0,
('E', 'n'): 0.0,
('E', 'ng'): 0.0,
('E', 'nr'): 0.0,
('E', 'nrfg'): 0.0,
('E', 'nrt'): 0.0,
('E', 'ns'): 0.0,
('E', 'nt'): 0.0,
('E', 'nz'): 0.0,
('E', 'o'): 0.0,
('E', 'p'): 0.0,
('E', 'q'): 0.0,
('E', 'qe'): 0.0,
('E', 'qg'): 0.0,
('E', 'r'): 0.0,
('E', 'rg'): 0.0,
('E', 'rr'): 0.0,
('E', 'rz'): 0.0,
('E', 's'): 0.0,
('E', 't'): 0.0,
('E', 'tg'): 0.0,
('E', 'u'): 0.0,
('E', 'ud'): 0.0,
('E', 'ug'): 0.0,
('E', 'uj'): 0.0,
('E', 'ul'): 0.0,
('E', 'uv'): 0.0,
('E', 'uz'): 0.0,
('E', 'v'): 0.0,
('E', 'vd'): 0.0,
('E', 'vg'): 0.0,
('E', 'vi'): 0.0,
('E', 'vn'): 0.0,
('E', 'vq'): 0.0,
('E', 'w'): 0.0,
('E', 'x'): 0.0,
('E', 'y'): 0.0,
('E', 'yg'): 0.0,
('E', 'z'): 0.0,
('E', 'zg'): 0.0,
('M', 'a'): 0.0,
('M', 'ad'): 0.0,
('M', 'ag'): 0.0,
('M', 'an'): 0.0,
('M', 'b'): 0.0,
('M', 'bg'): 0.0,
('M', 'c'): 0.0,
('M', 'd'): 0.0,
('M', 'df'): 0.0,
('M', 'dg'): 0.0,
('M', 'e'): 0.0,
('M', 'en'): 0.0,
('M', 'f'): 0.0,
('M', 'g'): 0.0,
('M', 'h'): 0.0,
('M', 'i'): 0.0,
('M', 'in'): 0.0,
('M', 'j'): 0.0,
('M', 'jn'): 0.0,
('M', 'k'): 0.0,
('M', 'l'): 0.0,
('M', 'ln'): 0.0,
('M', 'm'): 0.0,
('M', 'mg'): 0.0,
('M', 'mq'): 0.0,
('M', 'n'): 0.0,
('M', 'ng'): 0.0,
('M', 'nr'): 0.0,
('M', 'nrfg'): 0.0,
('M', 'nrt'): 0.0,
('M', 'ns'): 0.0,
('M', 'nt'): 0.0,
('M', 'nz'): 0.0,
('M', 'o'): 0.0,
('M', 'p'): 0.0,
('M', 'q'): 0.0,
('M', 'qe'): 0.0,
('M', 'qg'): 0.0,
('M', 'r'): 0.0,
('M', 'rg'): 0.0,
('M', 'rr'): 0.0,
('M', 'rz'): 0.0,
('M', 's'): 0.0,
('M', 't'): 0.0,
('M', 'tg'): 0.0,
('M', 'u'): 0.0,
('M', 'ud'): 0.0,
('M', 'ug'): 0.0,
('M', 'uj'): 0.0,
('M', 'ul'): 0.0,
('M', 'uv'): 0.0,
('M', 'uz'): 0.0,
('M', 'v'): 0.0,
('M', 'vd'): 0.0,
('M', 'vg'): 0.0,
('M', 'vi'): 0.0,
('M', 'vn'): 0.0,
('M', 'vq'): 0.0,
('M', 'w'): 0.0,
('M', 'x'): 0.0,
('M', 'y'): 0.0,
('M', 'yg'): 0.0,
('M', 'z'): 0.0,
('M', 'zg'): 0.0,
('S', 'a'): 0.020190568629634933,
('S', 'ad'): 1.5911658472162552e-05,
('S', 'ag'): 0.0009546995083297532,
('S', 'an'): 2.651943078693759e-06,
('S', 'b'): 0.0015447568433391145,
('S', 'bg'): 0.0,
('S', 'c'): 0.008337709039413178,
('S', 'd'): 0.020162723227308648,
('S', 'df'): 0.0,
('S', 'dg'): 0.0001299452108559942,
('S', 'e'): 0.0026254236479068215,
('S', 'en'): 0.0,
('S', 'f'): 0.0055452129775486496,
('S', 'g'): 0.0014917179817652395,
('S', 'h'): 0.00017502824319378808,
('S', 'i'): 0.0,
('S', 'in'): 0.0,
('S', 'j'): 0.007357816071835834,
('S', 'jn'): 0.0,
('S', 'k'): 0.000967959223723222,
('S', 'l'): 0.0,
('S', 'ln'): 0.0,
('S', 'm'): 0.038036819577704585,
('S', 'mg'): 1.988957309020319e-05,
('S', 'mq'): 0.0,
('S', 'n'): 0.021170461597212278,
('S', 'ng'): 0.007347208299521059,
('S', 'nr'): 0.011291973629078026,
('S', 'nrfg'): 0.0,
('S', 'nrt'): 0.0,
('S', 'ns'): 0.0,
('S', 'nt'): 5.303886157387518e-06,
('S', 'nz'): 0.0,
('S', 'o'): 0.00021082947475615385,
('S', 'p'): 0.05044658721445203,
('S', 'q'): 0.007531518343490275,
('S', 'qe'): 0.0,
('S', 'qg'): 0.0,
('S', 'r'): 0.06306851029749498,
('S', 'rg'): 3.447526002301887e-05,
('S', 'rr'): 0.0,
('S', 'rz'): 0.0,
('S', 's'): 0.0,
('S', 't'): 0.0,
('S', 'tg'): 0.0018868575004906095,
('S', 'u'): 0.000967959223723222,
('S', 'ud'): 0.000440222551063164,
('S', 'ug'): 0.0005317145872780986,
('S', 'uj'): 0.001056799316859463,
('S', 'ul'): 0.00022143724707092888,
('S', 'uv'): 0.00028640985249892595,
('S', 'uz'): 9.149203621493468e-05,
('S', 'v'): 0.04720326082920956,
('S', 'vd'): 0.0,
('S', 'vg'): 0.0026240976763674743,
('S', 'vi'): 0.0,
('S', 'vn'): 1.0607772314775036e-05,
('S', 'vq'): 0.0,
('S', 'w'): 0.0,
('S', 'x'): 0.0002187853039922351,
('S', 'y'): 0.00203536631289746,
('S', 'yg'): 1.3259715393468796e-06,
('S', 'z'): 0.0,
('S', 'zg'): 0.0}
{('B', 'a'): -4.762305214596967,
('B', 'ad'): -6.680066036784177,
('B', 'ag'): -3.14e+100,
('B', 'an'): -8.697083223018778,
('B', 'b'): -5.018374362109218,
('B', 'bg'): -3.14e+100,
('B', 'c'): -3.423880184954888,
('B', 'd'): -3.9750475297585357,
('B', 'df'): -8.888974230828882,
('B', 'dg'): -3.14e+100,
('B', 'e'): -8.563551830394255,
('B', 'en'): -3.14e+100,
('B', 'f'): -5.491630418482717,
('B', 'g'): -3.14e+100,
('B', 'h'): -13.533365129970255,
('B', 'i'): -6.1157847275557105,
('B', 'in'): -3.14e+100,
('B', 'j'): -5.0576191284681915,
('B', 'jn'): -3.14e+100,
('B', 'k'): -3.14e+100,
('B', 'l'): -4.905883584659895,
('B', 'ln'): -3.14e+100,
('B', 'm'): -3.6524299819046386,
('B', 'mg'): -3.14e+100,
('B', 'mq'): -6.78695300139688,
('B', 'n'): -1.6966257797548328,
('B', 'ng'): -3.14e+100,
('B', 'nr'): -2.2310495913769506,
('B', 'nrfg'): -5.873722175405573,
('B', 'nrt'): -4.985642733519195,
('B', 'ns'): -2.8228438314969213,
('B', 'nt'): -4.846091668182416,
('B', 'nz'): -3.94698846057672,
('B', 'o'): -8.433498702146057,
('B', 'p'): -4.200984132085048,
('B', 'q'): -6.998123858956596,
('B', 'qe'): -3.14e+100,
('B', 'qg'): -3.14e+100,
('B', 'r'): -3.4098187790818413,
('B', 'rg'): -3.14e+100,
('B', 'rr'): -12.434752841302146,
('B', 'rz'): -7.946116471570005,
('B', 's'): -5.522673590839954,
('B', 't'): -3.3647479094528574,
('B', 'tg'): -3.14e+100,
('B', 'u'): -9.163917277503234,
('B', 'ud'): -3.14e+100,
('B', 'ug'): -3.14e+100,
('B', 'uj'): -3.14e+100,
('B', 'ul'): -3.14e+100,
('B', 'uv'): -3.14e+100,
('B', 'uz'): -3.14e+100,
('B', 'v'): -2.6740584874265685,
('B', 'vd'): -9.044728760238115,
('B', 'vg'): -3.14e+100,
('B', 'vi'): -12.434752841302146,
('B', 'vn'): -4.3315610890163585,
('B', 'vq'): -12.147070768850364,
('B', 'w'): -3.14e+100,
('B', 'x'): -3.14e+100,
('B', 'y'): -9.844485675856319,
('B', 'yg'): -3.14e+100,
('B', 'z'): -7.045681111485645,
('B', 'zg'): -3.14e+100,
('E', 'a'): -3.14e+100,
('E', 'ad'): -3.14e+100,
('E', 'ag'): -3.14e+100,
('E', 'an'): -3.14e+100,
('E', 'b'): -3.14e+100,
('E', 'bg'): -3.14e+100,
('E', 'c'): -3.14e+100,
('E', 'd'): -3.14e+100,
('E', 'df'): -3.14e+100,
('E', 'dg'): -3.14e+100,
('E', 'e'): -3.14e+100,
('E', 'en'): -3.14e+100,
('E', 'f'): -3.14e+100,
('E', 'g'): -3.14e+100,
('E', 'h'): -3.14e+100,
('E', 'i'): -3.14e+100,
('E', 'in'): -3.14e+100,
('E', 'j'): -3.14e+100,
('E', 'jn'): -3.14e+100,
('E', 'k'): -3.14e+100,
('E', 'l'): -3.14e+100,
('E', 'ln'): -3.14e+100,
('E', 'm'): -3.14e+100,
('E', 'mg'): -3.14e+100,
('E', 'mq'): -3.14e+100,
('E', 'n'): -3.14e+100,
('E', 'ng'): -3.14e+100,
('E', 'nr'): -3.14e+100,
('E', 'nrfg'): -3.14e+100,
('E', 'nrt'): -3.14e+100,
('E', 'ns'): -3.14e+100,
('E', 'nt'): -3.14e+100,
('E', 'nz'): -3.14e+100,
('E', 'o'): -3.14e+100,
('E', 'p'): -3.14e+100,
('E', 'q'): -3.14e+100,
('E', 'qe'): -3.14e+100,
('E', 'qg'): -3.14e+100,
('E', 'r'): -3.14e+100,
('E', 'rg'): -3.14e+100,
('E', 'rr'): -3.14e+100,
('E', 'rz'): -3.14e+100,
('E', 's'): -3.14e+100,
('E', 't'): -3.14e+100,
('E', 'tg'): -3.14e+100,
('E', 'u'): -3.14e+100,
('E', 'ud'): -3.14e+100,
('E', 'ug'): -3.14e+100,
('E', 'uj'): -3.14e+100,
('E', 'ul'): -3.14e+100,
('E', 'uv'): -3.14e+100,
('E', 'uz'): -3.14e+100,
('E', 'v'): -3.14e+100,
('E', 'vd'): -3.14e+100,
('E', 'vg'): -3.14e+100,
('E', 'vi'): -3.14e+100,
('E', 'vn'): -3.14e+100,
('E', 'vq'): -3.14e+100,
('E', 'w'): -3.14e+100,
('E', 'x'): -3.14e+100,
('E', 'y'): -3.14e+100,
('E', 'yg'): -3.14e+100,
('E', 'z'): -3.14e+100,
('E', 'zg'): -3.14e+100,
('M', 'a'): -3.14e+100,
('M', 'ad'): -3.14e+100,
('M', 'ag'): -3.14e+100,
('M', 'an'): -3.14e+100,
('M', 'b'): -3.14e+100,
('M', 'bg'): -3.14e+100,
('M', 'c'): -3.14e+100,
('M', 'd'): -3.14e+100,
('M', 'df'): -3.14e+100,
('M', 'dg'): -3.14e+100,
('M', 'e'): -3.14e+100,
('M', 'en'): -3.14e+100,
('M', 'f'): -3.14e+100,
('M', 'g'): -3.14e+100,
('M', 'h'): -3.14e+100,
('M', 'i'): -3.14e+100,
('M', 'in'): -3.14e+100,
('M', 'j'): -3.14e+100,
('M', 'jn'): -3.14e+100,
('M', 'k'): -3.14e+100,
('M', 'l'): -3.14e+100,
('M', 'ln'): -3.14e+100,
('M', 'm'): -3.14e+100,
('M', 'mg'): -3.14e+100,
('M', 'mq'): -3.14e+100,
('M', 'n'): -3.14e+100,
('M', 'ng'): -3.14e+100,
('M', 'nr'): -3.14e+100,
('M', 'nrfg'): -3.14e+100,
('M', 'nrt'): -3.14e+100,
('M', 'ns'): -3.14e+100,
('M', 'nt'): -3.14e+100,
('M', 'nz'): -3.14e+100,
('M', 'o'): -3.14e+100,
('M', 'p'): -3.14e+100,
('M', 'q'): -3.14e+100,
('M', 'qe'): -3.14e+100,
('M', 'qg'): -3.14e+100,
('M', 'r'): -3.14e+100,
('M', 'rg'): -3.14e+100,
('M', 'rr'): -3.14e+100,
('M', 'rz'): -3.14e+100,
('M', 's'): -3.14e+100,
('M', 't'): -3.14e+100,
('M', 'tg'): -3.14e+100,
('M', 'u'): -3.14e+100,
('M', 'ud'): -3.14e+100,
('M', 'ug'): -3.14e+100,
('M', 'uj'): -3.14e+100,
('M', 'ul'): -3.14e+100,
('M', 'uv'): -3.14e+100,
('M', 'uz'): -3.14e+100,
('M', 'v'): -3.14e+100,
('M', 'vd'): -3.14e+100,
('M', 'vg'): -3.14e+100,
('M', 'vi'): -3.14e+100,
('M', 'vn'): -3.14e+100,
('M', 'vq'): -3.14e+100,
('M', 'w'): -3.14e+100,
('M', 'x'): -3.14e+100,
('M', 'y'): -3.14e+100,
('M', 'yg'): -3.14e+100,
('M', 'z'): -3.14e+100,
('M', 'zg'): -3.14e+100,
('S', 'a'): -3.9025396831295227,
('S', 'ad'): -11.048458480182255,
('S', 'ag'): -6.954113917960154,
('S', 'an'): -12.84021794941031,
('S', 'b'): -6.472888763970454,
('S', 'bg'): -3.14e+100,
('S', 'c'): -4.786966795861212,
('S', 'd'): -3.903919764181873,
('S', 'df'): -3.14e+100,
('S', 'dg'): -8.948397651299683,
('S', 'e'): -5.942513006281674,
('S', 'en'): -3.14e+100,
('S', 'f'): -5.194820249981676,
('S', 'g'): -6.507826815331734,
('S', 'h'): -8.650563207383884,
('S', 'i'): -3.14e+100,
('S', 'in'): -3.14e+100,
('S', 'j'): -4.911992119644354,
('S', 'jn'): -3.14e+100,
('S', 'k'): -6.940320595827818,
('S', 'l'): -3.14e+100,
('S', 'ln'): -3.14e+100,
('S', 'm'): -3.269200652116097,
('S', 'mg'): -10.825314928868044,
('S', 'mq'): -3.14e+100,
('S', 'n'): -3.8551483897645107,
('S', 'ng'): -4.913434861102905,
('S', 'nr'): -4.483663103956885,
('S', 'nrfg'): -3.14e+100,
('S', 'nrt'): -3.14e+100,
('S', 'ns'): -3.14e+100,
('S', 'nt'): -12.147070768850364,
('S', 'nz'): -3.14e+100,
('S', 'o'): -8.464460927750023,
('S', 'p'): -2.9868401813596317,
('S', 'q'): -4.888658618255058,
('S', 'qe'): -3.14e+100,
('S', 'qg'): -3.14e+100,
('S', 'r'): -2.7635336784127853,
('S', 'rg'): -10.275268591948773,
('S', 'rr'): -3.14e+100,
('S', 'rz'): -3.14e+100,
('S', 's'): -3.14e+100,
('S', 't'): -3.14e+100,
('S', 'tg'): -6.272842531880403,
('S', 'u'): -6.940320595827818,
('S', 'ud'): -7.728230161053767,
('S', 'ug'): -7.5394037026636855,
('S', 'uj'): -6.85251045118004,
('S', 'ul'): -8.4153713175535,
('S', 'uv'): -8.15808672228609,
('S', 'uz'): -9.299258625372996,
('S', 'v'): -3.053292303412302,
('S', 'vd'): -3.14e+100,
('S', 'vg'): -5.9430181843676895,
('S', 'vi'): -3.14e+100,
('S', 'vn'): -11.453923588290419,
('S', 'vq'): -3.14e+100,
('S', 'w'): -3.14e+100,
('S', 'x'): -8.427419656069674,
('S', 'y'): -6.1970794699489575,
('S', 'yg'): -13.533365129970255,
('S', 'z'): -3.14e+100,
('S', 'zg'): -3.14e+100}

File diff suppressed because it is too large Load Diff

@ -1,4 +1,5 @@
import operator
MIN_FLOAT=-3.14e100
def get_top_states(t_state_v,K=4):
items = t_state_v.items()
@ -10,7 +11,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
mem_path = [{}]
all_states = trans_p.keys()
for y in states.get(obs[0],all_states): #init
V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
mem_path[0][y] = ''
for t in range(1,len(obs)):
V.append({})
@ -24,7 +25,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
if len(obs_states)==0: obs_states = all_states
for y in obs_states:
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states])
V[t][y] =prob
mem_path[t][y] = state

@ -1,6 +1,6 @@
from distutils.core import setup
setup(name='jieba',
version='0.22',
version='0.25',
description='Chinese Words Segementation Utilities',
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',

@ -88,4 +88,5 @@ if __name__ == "__main__":
cuttest('一次性交多少钱')
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
cuttest('小和尚留了一个像大和尚一样的和尚头')
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')

@ -90,4 +90,5 @@ if __name__ == "__main__":
cuttest('一次性交多少钱')
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
cuttest('小和尚留了一个像大和尚一样的和尚头')
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')

Loading…
Cancel
Save