first commit

13 years ago · 51765aa6dd
parent a36b98b1fb
commit 51765aa6dd
7 changed files with 28537 additions and 7 deletions
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -2,12 +2,20 @@ import re
 import math
 import os,sys
 import pprint
+import finalseg
+
+FREQ = {}
+total =0

 def gen_trie(f_name):
+	global total
 	trie = {}
 	for line in open(f_name):
 		word,freq = line.strip().split(" ")
 		word = word.decode('utf-8')
+		freq = int(freq)
+		FREQ[word] = freq
+		total+=freq
 		p = trie
 		for c in word:
 			if not c in p:
@@ -17,11 +25,16 @@ def gen_trie(f_name):
 	return trie

 _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
-print >> sys.stderr, "loading dictionary..."
+
+print >> sys.stderr, "Building Trie..."
 trie = gen_trie(os.path.join(_curpath,"dict.txt"))
-print >> sys.stderr,"done."
+FREQ = dict([(k,float(v)/total) for k,v in FREQ.iteritems()]) #normalize
+min_freq = min(FREQ.itervalues())
+#print min_freq
+print >> sys.stderr,"Trie has been built succesfully."

-def __cut(sentence):
+
+def __cut_all(sentence):
 	N = len(sentence)
 	i,j=0,0
 	p = trie
@@ -35,12 +48,81 @@ def __cut(sentence):
 			if j>=N:
 				i+=1
 				j=i
+				p=trie
+		else:
+			p = trie
+			i+=1
+			j=i
+
+def calc(sentence,DAG,idx,route):
+	if idx in route:
+		return route[idx]
+	if idx>=len(sentence):
+		return (1.0,'')
+	next = DAG[idx]
+	best = max([ ( FREQ.get(sentence[idx:x+1],min_freq) * calc(sentence,DAG,x+1,route=route)[0],x )for x in next ])
+	route[idx]=best
+	return best
+
+
+def __cut_DAG(sentence):
+	N = len(sentence)
+	i,j=0,0
+	p = trie
+	DAG = {}
+	while i<N:
+		c = sentence[j]
+		if c in p:
+			p = p[c]
+			if '' in p:
+				if not i in DAG:
+					DAG[i]=[]
+				DAG[i].append(j)
+			j+=1
+			if j>=N:
+				i+=1
+				j=i
+				p=trie
 		else:
 			p = trie
 			i+=1
 			j=i
+	for i in xrange(len(sentence)):
+		if not i in DAG:
+			DAG[i] =[i]
+	#pprint.pprint(DAG)
+	route ={}
+	calc(sentence,DAG,0,route=route)
+	x = 0
+	buf =u''
+	while x<N:
+		y = route[x][1]+1
+		l_word = sentence[x:y]
+		if y-x==1:
+			buf+= l_word
+		else:
+			if len(buf)>0:
+				if len(buf)==1:
+					yield buf
+					buf=u''
+				else:
+					regognized = finalseg.cut(buf)
+					for t in regognized:
+						yield t
+					buf=u''
+			yield l_word		
+		x =y
+
+	if len(buf)>0:
+		if len(buf)==1:
+			yield buf
+		else:
+			regognized = finalseg.cut(buf)
+			for t in regognized:
+				yield t
+

-def cut(sentence):
+def cut(sentence,cut_all=False):
 	if not ( type(sentence) is unicode):
 		try:
 			sentence = sentence.decode('utf-8')
@@ -48,10 +130,13 @@ def cut(sentence):
 			sentence = sentence.decode('gbk','ignore')
 	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
 	blocks = re_han.split(sentence)
-	
+	cut_block = __cut_DAG
+	if cut_all:
+		cut_block = __cut_all
 	for blk in blocks:
 		if re_han.match(blk):
-				for word in __cut(blk):
+				#pprint.pprint(__cut_DAG(blk))
+				for word in cut_block(blk):
 					yield word
 		else:
 			tmp = re_skip.split(blk)
--- a/jieba/finalseg/__init__.py
+++ b/jieba/finalseg/__init__.py
@@ -0,0 +1,70 @@
+import re
+import os
+
+def load_model(f_name):
+	_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
+	prob_p_path = os.path.join(_curpath,f_name)
+	return eval(open(prob_p_path,"rb").read())
+
+prob_start = load_model("prob_start.py")
+prob_trans = load_model("prob_trans.py")
+prob_emit = load_model("prob_emit.py")
+
+
+
+def viterbi(obs, states, start_p, trans_p, emit_p):
+	V = [{}] #tabular
+	path = {}
+	for y in states: #init
+		V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
+		path[y] = [y]
+	for t in range(1,len(obs)):
+		V.append({})
+		newpath = {}
+		for y in states:
+			(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in states ])
+			V[t][y] =prob
+			newpath[y] = path[state] + [y]
+		path = newpath
+	
+	if emit_p['M'].get(obs[-1],0)> emit_p['S'].get(obs[-1],0):
+		(prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E',)])
+	else:
+		(prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')])
+	
+	return (prob, path[state])
+
+
+def __cut(sentence):
+	prob, pos_list =  viterbi(sentence,('B','M','E','S'), prob_start, prob_trans, prob_emit)
+	begin, next = 0,0
+	for i,char in enumerate(sentence):
+		pos = pos_list[i]
+		if pos=='B':
+			begin = i
+		elif pos=='E':
+			yield sentence[begin:i+1]
+			next = i+1
+		elif pos=='S':
+			yield char
+			next = i+1
+	if next<len(sentence):
+		yield sentence[next:]
+
+def cut(sentence):
+	if not ( type(sentence) is unicode):
+		try:
+			sentence = sentence.decode('utf-8')
+		except:
+			sentence = sentence.decode('gbk','ignore')
+	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
+	blocks = re_han.split(sentence)
+	for blk in blocks:
+		if re_han.match(blk):
+			for word in __cut(blk):
+				yield word
+		else:
+			tmp = re_skip.split(blk)
+			for x in tmp:
+				if x!="":
+					yield x
--- a/jieba/finalseg/prob_emit.py
+++ b/jieba/finalseg/prob_emit.py
--- a/jieba/finalseg/prob_start.py
+++ b/jieba/finalseg/prob_start.py
@@ -0,0 +1 @@
+{'B': 0.6887918653263693, 'E': 0.0, 'M': 0.0, 'S': 0.31120813467363073}
--- a/jieba/finalseg/prob_trans.py
+++ b/jieba/finalseg/prob_trans.py
@@ -0,0 +1,4 @@
+{'B': {'E': 0.8623367940544834, 'M': 0.13766320594551662},
+ 'E': {'B': 0.5544856664818801, 'S': 0.4455143335181199},
+ 'M': {'E': 0.7024280846522946, 'M': 0.2975719153477054},
+ 'S': {'B': 0.48617131037009215, 'S': 0.5138286896299078}}
--- a/test.py
+++ b/test.py
@@ -1,5 +1,6 @@
 #encoding=utf-8
 import sys
+sys.path.append("../")
 import jieba

 def cuttest(test_sent):
@@ -10,6 +11,83 @@ def cuttest(test_sent):


 if __name__ == "__main__":
+	cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空，我爱北京，我爱Python和C++。")
+	cuttest("我不喜欢日本和服。")
+	cuttest("雷猴回归人间。")
 	cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
+	cuttest("我需要廉租房")
 	cuttest("永和服装饰品有限公司")
-	cuttest("我爱北京天安门")
+	cuttest("我爱北京天安门")
+	cuttest("abc")
+	cuttest("隐马尔可夫")
+	cuttest("雷猴是个好网站")
+	cuttest("“Microsoft”一词由“MICROcomputer（微型计算机）”和“SOFTware（软件）”两部分组成")
+	cuttest("草泥马和欺实马是今年的流行词汇")
+	cuttest("伊藤洋华堂总府店")
+	cuttest("中国科学院计算技术研究所")
+	cuttest("罗密欧与朱丽叶")
+	cuttest("我购买了道具和服装")
+	cuttest("PS: 我觉得开源有一个好处，就是能够敦促自己不断改进，避免敞帚自珍")
+	cuttest("湖北省石首市")
+	cuttest("湖北省十堰市")
+	cuttest("总经理完成了这件事情")
+	cuttest("电脑修好了")
+	cuttest("做好了这件事情就一了百了了")
+	cuttest("人们审美的观点是不同的")
+	cuttest("我们买了一个美的空调")
+	cuttest("线程初始化时我们要注意")
+	cuttest("一个分子是由好多原子组织成的")
+	cuttest("祝你马到功成")
+	cuttest("他掉进了无底洞里")
+	cuttest("中国的首都是北京")
+	cuttest("孙君意")
+	cuttest("外交部发言人马朝旭")
+	cuttest("领导人会议和第四届东亚峰会")
+	cuttest("在过去的这五年")
+	cuttest("还需要很长的路要走")
+	cuttest("60周年首都阅兵")
+	cuttest("你好人们审美的观点是不同的")
+	cuttest("买水果然后来世博园")
+	cuttest("买水果然后去世博园")
+	cuttest("但是后来我才知道你是对的")
+	cuttest("存在即合理")
+	cuttest("的的的的的在的的的的就以和和和")
+	cuttest("I love你，不以为耻，反以为rong")
+	cuttest("因")
+	cuttest("")
+	cuttest("hello你好人们审美的观点是不同的")
+	cuttest("很好但主要是基于网页形式")
+	cuttest("hello你好人们审美的观点是不同的")
+	cuttest("为什么我不能拥有想要的生活")
+	cuttest("后来我才")
+	cuttest("此次来中国是为了")
+	cuttest("使用了它就可以解决一些问题")
+	cuttest(",使用了它就可以解决一些问题")
+	cuttest("其实使用了它就可以解决一些问题")
+	cuttest("好人使用了它就可以解决一些问题")
+	cuttest("是因为和国家")
+	cuttest("老年搜索还支持")
+	cuttest("干脆就把那部蒙人的闲法给废了拉倒！RT @laoshipukong : 27日，全国人大常委会第三次审议侵权责任法草案，删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
+	cuttest("大")
+	cuttest("")
+	cuttest("他说的确实在理")
+	cuttest("长春市长春节讲话")
+	cuttest("结婚的和尚未结婚的")
+	cuttest("结合成分子时")
+	cuttest("旅游和服务是最好的")
+	cuttest("这件事情的确是我的错")
+	cuttest("供大家参考指正")
+	cuttest("哈尔滨政府公布塌桥原因")
+	cuttest("我在机场入口处")
+	cuttest("邢永臣摄影报道")
+	cuttest("BP神经网络如何训练才能在分类时增加区分度？")
+	cuttest("南京市长江大桥")
+	cuttest("应一些使用者的建议，也为了便于利用NiuTrans用于SMT研究")
+	cuttest('长春市长春药店')
+	cuttest('邓颖超生前最喜欢的衣服')
+	cuttest('胡锦涛是热爱世界和平的政治局常委')
+	cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
+	cuttest('一次性交多少钱')
+	cuttest('两块五一套，三块八一斤，四块七一本，五块六一条')
+	cuttest('小和尚留了一个像大和尚一样的和尚头')
+	cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
--- a/test2.py
+++ b/test2.py
@@ -0,0 +1,93 @@
+#encoding=utf-8
+import sys
+sys.path.append("../")
+import jieba
+
+def cuttest(test_sent):
+	result = jieba.cut(test_sent,cut_all=True)
+	for word in result:
+		print word, "/", 
+	print ""
+
+
+if __name__ == "__main__":
+	cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空，我爱北京，我爱Python和C++。")
+	cuttest("我不喜欢日本和服。")
+	cuttest("雷猴回归人间。")
+	cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
+	cuttest("我需要廉租房")
+	cuttest("永和服装饰品有限公司")
+	cuttest("我爱北京天安门")
+	cuttest("abc")
+	cuttest("隐马尔可夫")
+	cuttest("雷猴是个好网站")
+	cuttest("“Microsoft”一词由“MICROcomputer（微型计算机）”和“SOFTware（软件）”两部分组成")
+	cuttest("草泥马和欺实马是今年的流行词汇")
+	cuttest("伊藤洋华堂总府店")
+	cuttest("中国科学院计算技术研究所")
+	cuttest("罗密欧与朱丽叶")
+	cuttest("我购买了道具和服装")
+	cuttest("PS: 我觉得开源有一个好处，就是能够敦促自己不断改进，避免敞帚自珍")
+	cuttest("湖北省石首市")
+	cuttest("湖北省十堰市")
+	cuttest("总经理完成了这件事情")
+	cuttest("电脑修好了")
+	cuttest("做好了这件事情就一了百了了")
+	cuttest("人们审美的观点是不同的")
+	cuttest("我们买了一个美的空调")
+	cuttest("线程初始化时我们要注意")
+	cuttest("一个分子是由好多原子组织成的")
+	cuttest("祝你马到功成")
+	cuttest("他掉进了无底洞里")
+	cuttest("中国的首都是北京")
+	cuttest("孙君意")
+	cuttest("外交部发言人马朝旭")
+	cuttest("领导人会议和第四届东亚峰会")
+	cuttest("在过去的这五年")
+	cuttest("还需要很长的路要走")
+	cuttest("60周年首都阅兵")
+	cuttest("你好人们审美的观点是不同的")
+	cuttest("买水果然后来世博园")
+	cuttest("买水果然后去世博园")
+	cuttest("但是后来我才知道你是对的")
+	cuttest("存在即合理")
+	cuttest("的的的的的在的的的的就以和和和")
+	cuttest("I love你，不以为耻，反以为rong")
+	cuttest("因")
+	cuttest("")
+	cuttest("hello你好人们审美的观点是不同的")
+	cuttest("很好但主要是基于网页形式")
+	cuttest("hello你好人们审美的观点是不同的")
+	cuttest("为什么我不能拥有想要的生活")
+	cuttest("后来我才")
+	cuttest("此次来中国是为了")
+	cuttest("使用了它就可以解决一些问题")
+	cuttest(",使用了它就可以解决一些问题")
+	cuttest("其实使用了它就可以解决一些问题")
+	cuttest("好人使用了它就可以解决一些问题")
+	cuttest("是因为和国家")
+	cuttest("老年搜索还支持")
+	cuttest("干脆就把那部蒙人的闲法给废了拉倒！RT @laoshipukong : 27日，全国人大常委会第三次审议侵权责任法草案，删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
+	cuttest("大")
+	cuttest("")
+	cuttest("他说的确实在理")
+	cuttest("长春市长春节讲话")
+	cuttest("结婚的和尚未结婚的")
+	cuttest("结合成分子时")
+	cuttest("旅游和服务是最好的")
+	cuttest("这件事情的确是我的错")
+	cuttest("供大家参考指正")
+	cuttest("哈尔滨政府公布塌桥原因")
+	cuttest("我在机场入口处")
+	cuttest("邢永臣摄影报道")
+	cuttest("BP神经网络如何训练才能在分类时增加区分度？")
+	cuttest("南京市长江大桥")
+	cuttest("应一些使用者的建议，也为了便于利用NiuTrans用于SMT研究")
+	cuttest('长春市长春药店')
+	cuttest('邓颖超生前最喜欢的衣服')
+	cuttest('胡锦涛是热爱世界和平的政治局常委')
+	cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
+	cuttest('一次性交多少钱')
+	cuttest('两块五一套，三块八一斤，四块七一本，五块六一条')
+	cuttest('小和尚留了一个像大和尚一样的和尚头')
+	cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
				`@ -0,0 +1 @@`
				`{'B': 0.6887918653263693, 'E': 0.0, 'M': 0.0, 'S': 0.31120813467363073}`