merge change from master

12 years ago · ca97b19951
parent 08bfabb9d7 38b6bcd54e
commit ca97b19951
18 changed files with 19254 additions and 61618 deletions
--- a/5
+++ b/5
@@ -1,3 +1,8 @@
+2013-06-17: version 0.29.1
+==========================
+1) 优化了viterbi算法的代码，分词速度提升15%
+2) 去除了词典中的一些低质词
+
 2013-06-07: version 0.29
 ==========================
 1) 提升了finalseg子模块命名体识别的准确度
--- a/README.md
+++ b/README.md
@@ -4,9 +4,12 @@ jieba
 "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
 - _Scroll down for English documentation._

+
 注意！
 ========
 这个branch `jieba3k`是专门用于Python3.x的版本
+=======
+

 Feature
 ========
@@ -19,6 +22,7 @@ Feature
 * 支持自定义词典


+
 在线演示
 =========
 http://jiebademo.ap01.aws.af.cm/
@@ -30,6 +34,7 @@ Python Version
 * 目前master分支是只支持Python2.x 的
 * Python3.x 版本的分支也已经基本可用： https://github.com/fxsjy/jieba/tree/jieba3k

+
 Usage
 ========
 * 全自动安装：`easy_install jieba` 或者 `pip install jieba`
--- a/extra_dict/dict.txt.big
+++ b/extra_dict/dict.txt.big
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -226,14 +226,12 @@ def cut(sentence,cut_all=False):
 			tmp = re_skip.split(blk)
 			for x in tmp:
 				if re_skip.match(x):
-					if x.strip(' ')!='':
-						yield x
+					yield x
+				elif not cut_all:
+					for xx in x:
+						yield xx
 				else:
-					if not cut_all:
-						for xx in x:
-							yield xx
-					else:
-						yield x
+					yield x

 def cut_for_search(sentence):
 	words = cut(sentence)
@@ -316,7 +314,7 @@ def enable_parallel(processnum):

 def disable_parallel():
 	global pool,cut,cut_for_search
-	if pool != None:
+	if 'pool' in globals():
 		pool.close()
 		pool = None
 	cut = __ref_cut
@@ -330,3 +328,8 @@ def set_dictionary(dictionary_path):
 			raise Exception("path does not exists:" + abs_path)
 		DICTIONARY = abs_path
 		initialized = False
+
+def get_abs_path_dict():
+	_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
+	abs_path = os.path.join(_curpath,DICTIONARY)
+	return abs_path
--- a/jieba/dict.txt
+++ b/jieba/dict.txt
--- a/jieba/finalseg/__init__.py
+++ b/jieba/finalseg/__init__.py
@@ -1,20 +1,18 @@
 import re
 import os
 from math import log
+from . import prob_start
+from . import prob_trans
+from . import prob_emit

 MIN_FLOAT=-3.14e100

-def load_model(f_name):
-	_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
-	prob_p_path = os.path.join(_curpath,f_name)
-	tab = eval(open(prob_p_path,"rb").read())
-	return tab
-
-prob_start = load_model("prob_start.py")
-prob_trans = load_model("prob_trans.py")
-prob_emit = load_model("prob_emit.py")
-
-
+PrevStatus = {
+	'B':('E','S'),
+	'M':('M','B'),
+	'S':('S','E'),
+	'E':('B','M')
+}

 def viterbi(obs, states, start_p, trans_p, emit_p):
 	V = [{}] #tabular
@@ -26,7 +24,8 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
 		V.append({})
 		newpath = {}
 		for y in states:
-			(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in states ])
+			em_p = emit_p[y].get(obs[t],MIN_FLOAT)
+			(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + em_p ,y0) for y0 in PrevStatus[y] ])
 			V[t][y] =prob
 			newpath[y] = path[state] + [y]
 		path = newpath
@@ -37,7 +36,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):


 def __cut(sentence):
-	prob, pos_list =  viterbi(sentence,('B','M','E','S'), prob_start, prob_trans, prob_emit)
+	prob, pos_list =  viterbi(sentence,('B','M','E','S'), prob_start.P, prob_trans.P, prob_emit.P)
 	begin, next = 0,0
 	#print pos_list, sentence
 	for i,char in enumerate(sentence):
--- a/jieba/finalseg/prob_emit.py
+++ b/jieba/finalseg/prob_emit.py
@@ -1,4 +1,4 @@
-{'B': {'\u4e00': -3.6544978750449433,
+P={'B': {'\u4e00': -3.6544978750449433,
       '\u4e01': -8.125041941842026,
       '\u4e03': -7.817392401429855,
       '\u4e07': -6.3096425804013165,
--- a/jieba/finalseg/prob_start.py
+++ b/jieba/finalseg/prob_start.py
@@ -1,4 +1,4 @@
-{'B': -0.26268660809250016,
+P={'B': -0.26268660809250016,
 'E': -3.14e+100,
 'M': -3.14e+100,
 'S': -1.4652633398537678}
--- a/jieba/finalseg/prob_trans.py
+++ b/jieba/finalseg/prob_trans.py
@@ -1,4 +1,4 @@
-{'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
+P={'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
 'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
 'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
 'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -3,6 +3,10 @@ import os
 from . import viterbi
 import jieba
 import sys
+from . import prob_start
+from . import prob_trans
+from . import prob_emit
+from . import char_state_tab

 default_encoding = sys.getfilesystemencoding()

@@ -13,7 +17,7 @@ def load_model(f_name):
 		return eval(open(prob_p_path,"rb").read())
 	else:
 		result = {}
-		for line in open(prob_p_path,"rb"):
+		for line in open(f_name,"rb"):
 			line = line.strip()
 			if line=="":continue
 			line = line.decode("utf-8")
@@ -21,12 +25,7 @@ def load_model(f_name):
 			result[word]=tag
 		return result

-
-prob_start = load_model("prob_start.py")
-prob_trans = load_model("prob_trans.py")
-prob_emit = load_model("prob_emit.py")
-char_state_tab = load_model("char_state_tab.py")
-word_tag_tab = load_model("../dict.txt")
+word_tag_tab = load_model(jieba.get_abs_path_dict())

 if jieba.user_word_tag_tab:
 	word_tag_tab.update(jieba.user_word_tag_tab)
@@ -49,7 +48,7 @@ class pair(object):
 		return self.__unicode__().encode(arg)

 def __cut(sentence):
-	prob, pos_list =  viterbi.viterbi(sentence,char_state_tab, prob_start, prob_trans, prob_emit)
+	prob, pos_list =  viterbi.viterbi(sentence,char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P)
 	begin, next = 0,0

 	for i,char in enumerate(sentence):
@@ -148,8 +147,7 @@ def __cut_internal(sentence):
 			tmp = re_skip.split(blk)
 			for x in tmp:
 				if re_skip.match(x):
-					if x.strip(' ')!='':
-						yield pair(x,'')
+					yield pair(x,'x')
 				else:
 					for xx in x:
 						if re_num.match(xx):
--- a/jieba/posseg/char_state_tab.py
+++ b/jieba/posseg/char_state_tab.py
@@ -1,4 +1,4 @@
-{'\u4e00': (('B', 'm'),
+P={'\u4e00': (('B', 'm'),
             ('S', 'm'),
             ('B', 'd'),
             ('B', 'a'),
--- a/jieba/posseg/prob_emit.py
+++ b/jieba/posseg/prob_emit.py
@@ -1,4 +1,4 @@
-{('B', 'a'): {'\u4e00': -3.618715666782108,
+P={('B', 'a'): {'\u4e00': -3.618715666782108,
              '\u4e07': -10.500566885381515,
              '\u4e0a': -8.541143017159477,
              '\u4e0b': -8.445222895280738,
--- a/jieba/posseg/prob_start.py
+++ b/jieba/posseg/prob_start.py
@@ -1,4 +1,4 @@
-{('B', 'a'): -4.762305214596967,
+P={('B', 'a'): -4.762305214596967,
 ('B', 'ad'): -6.680066036784177,
 ('B', 'ag'): -3.14e+100,
 ('B', 'an'): -8.697083223018778,
--- a/jieba/posseg/prob_trans.py
+++ b/jieba/posseg/prob_trans.py
@@ -1,4 +1,4 @@
-{('B', 'a'): {('E', 'a'): -0.0050648453069648755,
+P={('B', 'a'): {('E', 'a'): -0.0050648453069648755,
              ('M', 'a'): -5.287963037107507},
 ('B', 'ad'): {('E', 'ad'): -0.0007479013978476627,
               ('M', 'ad'): -7.198613337130562},
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 from distutils.core import setup  
 setup(name='jieba',  
-      version='0.29',  
+      version='0.29.1',  
      description='Chinese Words Segementation Utilities',  
      author='Sun, Junyi',  
      author_email='ccnusjy@gmail.com',  
--- a/test/jieba_test.py
+++ b/test/jieba_test.py
@@ -1,6 +1,7 @@
 #-*-coding: utf-8 -*-
 import sys
 sys.path.append("../")
+from imp import reload
 import unittest
 import types
 import jieba
@@ -108,8 +109,8 @@ class JiebaTestCase(unittest.TestCase):
            assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
            result = list(result)
            assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
-            print >> sys.stderr, " , ".join(result)
-        print  >> sys.stderr, "testDefaultCut"
+            print(" , ".join(result),file=sys.stderr)
+        print("testDefaultCut",file=sys.stderr)

    def testCutAll(self):
        for content in test_contents:
@@ -117,8 +118,8 @@ class JiebaTestCase(unittest.TestCase):
            assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
            result = list(result)
            assert isinstance(result, list), "Test CutAll error on content: %s" % content
-            print >> sys.stderr, " , ".join(result)
-        print  >> sys.stderr, "testCutAll"
+            print(" , ".join(result), file=sys.stderr)
+        print("testCutAll",file=sys.stderr)

    def testSetDictionary(self):
        jieba.set_dictionary("foobar.txt")
@@ -127,8 +128,8 @@ class JiebaTestCase(unittest.TestCase):
            assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
            result = list(result)
            assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
-            print >> sys.stderr, " , ".join(result)
-        print  >> sys.stderr, "testSetDictionary"
+            print(" , ".join(result), file=sys.stderr)
+        print("testSetDictionary",file=sys.stderr)

    def testCutForSearch(self):
        for content in test_contents:
@@ -136,8 +137,8 @@ class JiebaTestCase(unittest.TestCase):
            assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
            result = list(result)
            assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
-            print >> sys.stderr, " , ".join(result)
-        print  >> sys.stderr, "testCutForSearch"
+            print(" , ".join(result), file=sys.stderr)
+        print("testCutForSearch",file=sys.stderr)

    def testPosseg(self):
        import jieba.posseg as pseg
@@ -146,8 +147,8 @@ class JiebaTestCase(unittest.TestCase):
            assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Posseg error on content: %s" % content
-            print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result])
-        print  >> sys.stderr, "testPosseg"
+            print(" , ".join([w.word + " / " + w.flag for w in result]),file=sys.stderr)
+        print("testPosseg",file=sys.stderr)

 if __name__ == "__main__":
    unittest.main()
--- a/test/test.py
+++ b/test/test.py
@@ -9,87 +9,91 @@ def cuttest(test_sent):


 if __name__ == "__main__":
-	cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空，我爱北京，我爱Python和C++。")
-	cuttest("我不喜欢日本和服。")
-	cuttest("雷猴回归人间。")
-	cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
-	cuttest("我需要廉租房")
-	cuttest("永和服装饰品有限公司")
-	cuttest("我爱北京天安门")
-	cuttest("abc")
-	cuttest("隐马尔可夫")
-	cuttest("雷猴是个好网站")
-	cuttest("“Microsoft”一词由“MICROcomputer（微型计算机）”和“SOFTware（软件）”两部分组成")
-	cuttest("草泥马和欺实马是今年的流行词汇")
-	cuttest("伊藤洋华堂总府店")
-	cuttest("中国科学院计算技术研究所")
-	cuttest("罗密欧与朱丽叶")
-	cuttest("我购买了道具和服装")
-	cuttest("PS: 我觉得开源有一个好处，就是能够敦促自己不断改进，避免敞帚自珍")
-	cuttest("湖北省石首市")
-	cuttest("湖北省十堰市")
-	cuttest("总经理完成了这件事情")
-	cuttest("电脑修好了")
-	cuttest("做好了这件事情就一了百了了")
-	cuttest("人们审美的观点是不同的")
-	cuttest("我们买了一个美的空调")
-	cuttest("线程初始化时我们要注意")
-	cuttest("一个分子是由好多原子组织成的")
-	cuttest("祝你马到功成")
-	cuttest("他掉进了无底洞里")
-	cuttest("中国的首都是北京")
-	cuttest("孙君意")
-	cuttest("外交部发言人马朝旭")
-	cuttest("领导人会议和第四届东亚峰会")
-	cuttest("在过去的这五年")
-	cuttest("还需要很长的路要走")
-	cuttest("60周年首都阅兵")
-	cuttest("你好人们审美的观点是不同的")
-	cuttest("买水果然后来世博园")
-	cuttest("买水果然后去世博园")
-	cuttest("但是后来我才知道你是对的")
-	cuttest("存在即合理")
-	cuttest("的的的的的在的的的的就以和和和")
-	cuttest("I love你，不以为耻，反以为rong")
-	cuttest("因")
-	cuttest("")
-	cuttest("hello你好人们审美的观点是不同的")
-	cuttest("很好但主要是基于网页形式")
-	cuttest("hello你好人们审美的观点是不同的")
-	cuttest("为什么我不能拥有想要的生活")
-	cuttest("后来我才")
-	cuttest("此次来中国是为了")
-	cuttest("使用了它就可以解决一些问题")
-	cuttest(",使用了它就可以解决一些问题")
-	cuttest("其实使用了它就可以解决一些问题")
-	cuttest("好人使用了它就可以解决一些问题")
-	cuttest("是因为和国家")
-	cuttest("老年搜索还支持")
-	cuttest("干脆就把那部蒙人的闲法给废了拉倒！RT @laoshipukong : 27日，全国人大常委会第三次审议侵权责任法草案，删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
-	cuttest("大")
-	cuttest("")
-	cuttest("他说的确实在理")
-	cuttest("长春市长春节讲话")
-	cuttest("结婚的和尚未结婚的")
-	cuttest("结合成分子时")
-	cuttest("旅游和服务是最好的")
-	cuttest("这件事情的确是我的错")
-	cuttest("供大家参考指正")
-	cuttest("哈尔滨政府公布塌桥原因")
-	cuttest("我在机场入口处")
-	cuttest("邢永臣摄影报道")
-	cuttest("BP神经网络如何训练才能在分类时增加区分度？")
-	cuttest("南京市长江大桥")
-	cuttest("应一些使用者的建议，也为了便于利用NiuTrans用于SMT研究")
-	cuttest('长春市长春药店')
-	cuttest('邓颖超生前最喜欢的衣服')
-	cuttest('胡锦涛是热爱世界和平的政治局常委')
-	cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
-	cuttest('一次性交多少钱')
-	cuttest('两块五一套，三块八一斤，四块七一本，五块六一条')
-	cuttest('小和尚留了一个像大和尚一样的和尚头')
-	cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
-	cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
-	cuttest('AT&T是一件不错的公司，给你发offer了吗？')
-	cuttest('C++和c#是什么关系？11+122=133，是吗？PI=3.14159')
-	cuttest('你认识那个和主席握手的的哥吗？他开一辆黑色的士。')
+    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空，我爱北京，我爱Python和C++。")
+    cuttest("我不喜欢日本和服。")
+    cuttest("雷猴回归人间。")
+    cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
+    cuttest("我需要廉租房")
+    cuttest("永和服装饰品有限公司")
+    cuttest("我爱北京天安门")
+    cuttest("abc")
+    cuttest("隐马尔可夫")
+    cuttest("雷猴是个好网站")
+    cuttest("“Microsoft”一词由“MICROcomputer（微型计算机）”和“SOFTware（软件）”两部分组成")
+    cuttest("草泥马和欺实马是今年的流行词汇")
+    cuttest("伊藤洋华堂总府店")
+    cuttest("中国科学院计算技术研究所")
+    cuttest("罗密欧与朱丽叶")
+    cuttest("我购买了道具和服装")
+    cuttest("PS: 我觉得开源有一个好处，就是能够敦促自己不断改进，避免敞帚自珍")
+    cuttest("湖北省石首市")
+    cuttest("湖北省十堰市")
+    cuttest("总经理完成了这件事情")
+    cuttest("电脑修好了")
+    cuttest("做好了这件事情就一了百了了")
+    cuttest("人们审美的观点是不同的")
+    cuttest("我们买了一个美的空调")
+    cuttest("线程初始化时我们要注意")
+    cuttest("一个分子是由好多原子组织成的")
+    cuttest("祝你马到功成")
+    cuttest("他掉进了无底洞里")
+    cuttest("中国的首都是北京")
+    cuttest("孙君意")
+    cuttest("外交部发言人马朝旭")
+    cuttest("领导人会议和第四届东亚峰会")
+    cuttest("在过去的这五年")
+    cuttest("还需要很长的路要走")
+    cuttest("60周年首都阅兵")
+    cuttest("你好人们审美的观点是不同的")
+    cuttest("买水果然后来世博园")
+    cuttest("买水果然后去世博园")
+    cuttest("但是后来我才知道你是对的")
+    cuttest("存在即合理")
+    cuttest("的的的的的在的的的的就以和和和")
+    cuttest("I love你，不以为耻，反以为rong")
+    cuttest("因")
+    cuttest("")
+    cuttest("hello你好人们审美的观点是不同的")
+    cuttest("很好但主要是基于网页形式")
+    cuttest("hello你好人们审美的观点是不同的")
+    cuttest("为什么我不能拥有想要的生活")
+    cuttest("后来我才")
+    cuttest("此次来中国是为了")
+    cuttest("使用了它就可以解决一些问题")
+    cuttest(",使用了它就可以解决一些问题")
+    cuttest("其实使用了它就可以解决一些问题")
+    cuttest("好人使用了它就可以解决一些问题")
+    cuttest("是因为和国家")
+    cuttest("老年搜索还支持")
+    cuttest("干脆就把那部蒙人的闲法给废了拉倒！RT @laoshipukong : 27日，全国人大常委会第三次审议侵权责任法草案，删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
+    cuttest("大")
+    cuttest("")
+    cuttest("他说的确实在理")
+    cuttest("长春市长春节讲话")
+    cuttest("结婚的和尚未结婚的")
+    cuttest("结合成分子时")
+    cuttest("旅游和服务是最好的")
+    cuttest("这件事情的确是我的错")
+    cuttest("供大家参考指正")
+    cuttest("哈尔滨政府公布塌桥原因")
+    cuttest("我在机场入口处")
+    cuttest("邢永臣摄影报道")
+    cuttest("BP神经网络如何训练才能在分类时增加区分度？")
+    cuttest("南京市长江大桥")
+    cuttest("应一些使用者的建议，也为了便于利用NiuTrans用于SMT研究")
+    cuttest('长春市长春药店')
+    cuttest('邓颖超生前最喜欢的衣服')
+    cuttest('胡锦涛是热爱世界和平的政治局常委')
+    cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
+    cuttest('一次性交多少钱')
+    cuttest('两块五一套，三块八一斤，四块七一本，五块六一条')
+    cuttest('小和尚留了一个像大和尚一样的和尚头')
+    cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
+    cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
+    cuttest('AT&T是一件不错的公司，给你发offer了吗？')
+    cuttest('C++和c#是什么关系？11+122=133，是吗？PI=3.14159')
+    cuttest('你认识那个和主席握手的的哥吗？他开一辆黑色的士。')
+    cuttest('枪杆子中出政权')
+    cuttest('张三风同学走上了不归路')
+    cuttest('阿Q腰间挂着BB机手里拿着大哥大，说：我一般吃饭不AA制的。')
+    cuttest('在1号店能买到小S和大S八卦的书。')
--- a/test/test_pos_file.py
+++ b/test/test_pos_file.py
@@ -1,4 +1,3 @@
-import urllib2
 import sys,time
 import sys
 sys.path.append("../")
@@ -16,7 +15,7 @@ tm_cost = t2-t1

 log_f = open("1.log","wb")
 for w in words:
-	print(w.encode("utf-8"), "/" ,file=log_f)
+	log_f.write(bytes(w.word+"/"+w.flag+" ",'utf-8'))

-print 'speed' , len(content)/tm_cost, " bytes/second"
+print('speed' , len(content)/tm_cost, " bytes/second")