From 9c07d80edb8ea09dea4a1790a68211c3d381e5ef Mon Sep 17 00:00:00 2001 From: Sun Junyi Date: Wed, 28 Nov 2012 10:50:40 +0800 Subject: [PATCH] first py3k version of jieba --- jieba/__init__.py | 48 ++++++++++++++----------------------- jieba/analyse/__init__.py | 2 +- jieba/finalseg/__init__.py | 2 +- jieba/posseg/__init__.py | 13 +++++----- test/demo.py | 8 +++---- test/extract_tags.py | 4 ++-- test/jiebacmd.py | 2 +- test/test.py | 4 +--- test/test2.py | 4 +--- test/test_cut_for_search.py | 5 ++-- test/test_file.py | 7 +++--- test/test_pos.py | 4 ++-- test/test_pos_file.py | 20 ---------------- 13 files changed, 43 insertions(+), 80 deletions(-) delete mode 100644 test/test_pos_file.py diff --git a/jieba/__init__.py b/jieba/__init__.py index 945affa..38b2555 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -2,10 +2,8 @@ import re import math import os,sys import pprint -import finalseg +from . import finalseg import time -import tempfile -import marshal FREQ = {} total =0.0 @@ -31,33 +29,23 @@ def gen_trie(f_name): _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) -print >> sys.stderr, "Building Trie..." +print("Building Trie...",file=sys.stderr) + t1 = time.time() -cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache") -load_from_cache_fail = True -if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(os.path.join(_curpath,"dict.txt")): - print >> sys.stderr, "loading model from cache" - try: - trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb')) - load_from_cache_fail = False - except: - load_from_cache_fail = True - -if load_from_cache_fail: - trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt")) - FREQ = dict([(k,float(v)/total) for k,v in FREQ.iteritems()]) #normalize - min_freq = min(FREQ.itervalues()) - print >> sys.stderr, "dumping model to file cache" - marshal.dump((trie,FREQ,total,min_freq),open(cache_file,'wb')) - -print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds." -print >> sys.stderr, "Trie has been built succesfully." + +trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt")) +FREQ = dict([(k,float(v)/total) for k,v in FREQ.items()]) #normalize +min_freq = min(FREQ.values()) +print("dumping model to file cache",file=sys.stderr) + +print("loading model cost ", time.time() - t1, "seconds." ,file=sys.stderr) +print("Trie has been built succesfully.",file=sys.stderr) def __cut_all(sentence): dag = get_DAG(sentence) old_j = -1 - for k,L in dag.iteritems(): + for k,L in dag.items(): if len(L)==1 and k>old_j: yield sentence[k:L[0]+1] old_j = L[0] @@ -70,7 +58,7 @@ def __cut_all(sentence): def calc(sentence,DAG,idx,route): N = len(sentence) route[N] = (1.0,'') - for idx in xrange(N-1,-1,-1): + for idx in range(N-1,-1,-1): candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ] route[idx] = max(candidates) @@ -96,7 +84,7 @@ def get_DAG(sentence): p = trie i+=1 j=i - for i in xrange(len(sentence)): + for i in range(len(sentence)): if not i in DAG: DAG[i] =[i] return DAG @@ -136,12 +124,12 @@ def __cut_DAG(sentence): def cut(sentence,cut_all=False): - if not ( type(sentence) is unicode): + if( type(sentence) is bytes): try: sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]") + re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n]") blocks = re_han.split(sentence) cut_block = __cut_DAG if cut_all: @@ -161,12 +149,12 @@ def cut_for_search(sentence): words = cut(sentence) for w in words: if len(w)>2: - for i in xrange(len(w)-1): + for i in range(len(w)-1): gram2 = w[i:i+2] if gram2 in FREQ: yield gram2 if len(w)>3: - for i in xrange(len(w)-2): + for i in range(len(w)-2): gram3 = w[i:i+3] if gram3 in FREQ: yield gram3 diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index 6182c0a..30652f6 100644 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -19,7 +19,7 @@ def extract_tags(sentence,topK=20): if len(w.strip())<2: continue freq[w]=freq.get(w,0.0)+1.0 total = sum(freq.values()) - freq = [(k,v/total) for k,v in freq.iteritems()] + freq = [(k,v/total) for k,v in freq.items()] tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq] st_list = sorted(tf_idf_list,reverse=True) diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py index f4ce1c4..36c91a3 100644 --- a/jieba/finalseg/__init__.py +++ b/jieba/finalseg/__init__.py @@ -55,7 +55,7 @@ def cut(sentence): sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]") + re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n]") blocks = re_han.split(sentence) for blk in blocks: if re_han.match(blk): diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 0104bec..b2a43d0 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -1,6 +1,6 @@ import re import os -import viterbi +from . import viterbi import jieba import sys default_encoding = sys.getfilesystemencoding() @@ -15,8 +15,9 @@ def load_model(f_name): for line in open(prob_p_path,"rb"): line = line.strip() if line=="":continue - word, _, tag = line.split(' ') - result[word.decode('utf-8')]=tag + line = line.decode("utf-8") + word, _, tag = line.split(" ") + result[word]=tag return result @@ -95,13 +96,13 @@ def __cut_DAG(sentence): def cut(sentence): - if not ( type(sentence) is unicode): + if ( type(sentence) is bytes): try: sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]") - re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") + re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n%]") + re_eng,re_num = re.compile(r"[a-zA-Z+#]+"), re.compile(r"[0-9]+") blocks = re_han.split(sentence) for blk in blocks: diff --git a/test/demo.py b/test/demo.py index 19d314a..52b6d55 100644 --- a/test/demo.py +++ b/test/demo.py @@ -5,13 +5,13 @@ sys.path.append("../") import jieba seg_list = jieba.cut("我来到北京清华大学",cut_all=True) -print "Full Mode:", "/ ".join(seg_list) #全模式 +print("Full Mode:", "/ ".join(seg_list)) #全模式 seg_list = jieba.cut("我来到北京清华大学",cut_all=False) -print "Default Mode:", "/ ".join(seg_list) #默认模式 +print("Default Mode:", "/ ".join(seg_list)) #默认模式 seg_list = jieba.cut("他来到了网易杭研大厦") -print ", ".join(seg_list) +print(", ".join(seg_list)) seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式 -print ", ".join(seg_list) +print(", ".join(seg_list)) diff --git a/test/extract_tags.py b/test/extract_tags.py index 58e7de3..120199c 100644 --- a/test/extract_tags.py +++ b/test/extract_tags.py @@ -13,7 +13,7 @@ opt, args = parser.parse_args() if len(args) <1: - print USAGE + print(USAGE) sys.exit(1) file_name = args[0] @@ -28,6 +28,6 @@ content = open(file_name,'rb').read() tags = jieba.analyse.extract_tags(content,topK=topK) -print ",".join(tags) +print(",".join(tags)) diff --git a/test/jiebacmd.py b/test/jiebacmd.py index f80f1e9..dc93edc 100644 --- a/test/jiebacmd.py +++ b/test/jiebacmd.py @@ -23,6 +23,6 @@ while True: break line = line.strip() for word in jieba.cut(line): - print word.encode(default_encoding) + print(word.encode(default_encoding)) diff --git a/test/test.py b/test/test.py index 7c0755f..f1ee936 100644 --- a/test/test.py +++ b/test/test.py @@ -5,9 +5,7 @@ import jieba def cuttest(test_sent): result = jieba.cut(test_sent) - for word in result: - print word, "/", - print "" + print("/ ".join(result)) if __name__ == "__main__": diff --git a/test/test2.py b/test/test2.py index b267bc5..562ce20 100644 --- a/test/test2.py +++ b/test/test2.py @@ -5,9 +5,7 @@ import jieba def cuttest(test_sent): result = jieba.cut(test_sent,cut_all=True) - for word in result: - print word, "/", - print "" + print("/ ".join(result)) if __name__ == "__main__": diff --git a/test/test_cut_for_search.py b/test/test_cut_for_search.py index 51f82a4..af35e41 100644 --- a/test/test_cut_for_search.py +++ b/test/test_cut_for_search.py @@ -5,9 +5,8 @@ import jieba def cuttest(test_sent): result = jieba.cut_for_search(test_sent) - for word in result: - print word, "/", - print "" + print("/ ".join(result)) + if __name__ == "__main__": diff --git a/test/test_file.py b/test/test_file.py index 2107c36..4051541 100644 --- a/test/test_file.py +++ b/test/test_file.py @@ -1,4 +1,3 @@ -import urllib2 import sys,time import sys sys.path.append("../") @@ -13,8 +12,8 @@ t2 = time.time() tm_cost = t2-t1 log_f = open("1.log","wb") -for w in words: - print >> log_f, w.encode("gbk"), "/" , -print 'speed' , len(content)/tm_cost, " bytes/second" +log_f.write(bytes("/ ".join(words),'utf-8')) + +print('speed' , len(content)/tm_cost, " bytes/second") diff --git a/test/test_pos.py b/test/test_pos.py index df9e0d7..e182be2 100644 --- a/test/test_pos.py +++ b/test/test_pos.py @@ -6,8 +6,8 @@ import jieba.posseg as pseg def cuttest(test_sent): result = pseg.cut(test_sent) for w in result: - print w.word, "/", w.flag, ", ", - print "" + sys.stdout.write(w.word+ "/"+ w.flag + ", ") + print("") if __name__ == "__main__": diff --git a/test/test_pos_file.py b/test/test_pos_file.py deleted file mode 100644 index fd14a2d..0000000 --- a/test/test_pos_file.py +++ /dev/null @@ -1,20 +0,0 @@ -import urllib2 -import sys,time -import sys -sys.path.append("../") -import jieba.posseg as pseg - -url = sys.argv[1] -content = open(url,"rb").read() -t1 = time.time() -words = list(pseg.cut(content)) - -t2 = time.time() -tm_cost = t2-t1 - -log_f = open("1.log","wb") -for w in words: - print >> log_f, w.encode("gbk"), "/" , - -print 'speed' , len(content)/tm_cost, " bytes/second" -