From 51765aa6ddb9cd9bf108647f662d0a340a878623 Mon Sep 17 00:00:00 2001 From: fxsjy Date: Mon, 1 Oct 2012 15:25:06 +0800 Subject: [PATCH] first commit --- jieba/__init__.py | 97 +- jieba/finalseg/__init__.py | 70 + jieba/finalseg/prob_emit.py | 28199 +++++++++++++++++++++++++++++++++ jieba/finalseg/prob_start.py | 1 + jieba/finalseg/prob_trans.py | 4 + test.py | 80 +- test2.py | 93 + 7 files changed, 28537 insertions(+), 7 deletions(-) create mode 100644 jieba/finalseg/__init__.py create mode 100644 jieba/finalseg/prob_emit.py create mode 100644 jieba/finalseg/prob_start.py create mode 100644 jieba/finalseg/prob_trans.py create mode 100644 test2.py diff --git a/jieba/__init__.py b/jieba/__init__.py index e5b5b5f..fdf7d51 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -2,12 +2,20 @@ import re import math import os,sys import pprint +import finalseg + +FREQ = {} +total =0 def gen_trie(f_name): + global total trie = {} for line in open(f_name): word,freq = line.strip().split(" ") word = word.decode('utf-8') + freq = int(freq) + FREQ[word] = freq + total+=freq p = trie for c in word: if not c in p: @@ -17,11 +25,16 @@ def gen_trie(f_name): return trie _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) -print >> sys.stderr, "loading dictionary..." + +print >> sys.stderr, "Building Trie..." trie = gen_trie(os.path.join(_curpath,"dict.txt")) -print >> sys.stderr,"done." +FREQ = dict([(k,float(v)/total) for k,v in FREQ.iteritems()]) #normalize +min_freq = min(FREQ.itervalues()) +#print min_freq +print >> sys.stderr,"Trie has been built succesfully." -def __cut(sentence): + +def __cut_all(sentence): N = len(sentence) i,j=0,0 p = trie @@ -35,12 +48,81 @@ def __cut(sentence): if j>=N: i+=1 j=i + p=trie + else: + p = trie + i+=1 + j=i + +def calc(sentence,DAG,idx,route): + if idx in route: + return route[idx] + if idx>=len(sentence): + return (1.0,'') + next = DAG[idx] + best = max([ ( FREQ.get(sentence[idx:x+1],min_freq) * calc(sentence,DAG,x+1,route=route)[0],x )for x in next ]) + route[idx]=best + return best + + +def __cut_DAG(sentence): + N = len(sentence) + i,j=0,0 + p = trie + DAG = {} + while i=N: + i+=1 + j=i + p=trie else: p = trie i+=1 j=i + for i in xrange(len(sentence)): + if not i in DAG: + DAG[i] =[i] + #pprint.pprint(DAG) + route ={} + calc(sentence,DAG,0,route=route) + x = 0 + buf =u'' + while x0: + if len(buf)==1: + yield buf + buf=u'' + else: + regognized = finalseg.cut(buf) + for t in regognized: + yield t + buf=u'' + yield l_word + x =y + + if len(buf)>0: + if len(buf)==1: + yield buf + else: + regognized = finalseg.cut(buf) + for t in regognized: + yield t + -def cut(sentence): +def cut(sentence,cut_all=False): if not ( type(sentence) is unicode): try: sentence = sentence.decode('utf-8') @@ -48,10 +130,13 @@ def cut(sentence): sentence = sentence.decode('gbk','ignore') re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]") blocks = re_han.split(sentence) - + cut_block = __cut_DAG + if cut_all: + cut_block = __cut_all for blk in blocks: if re_han.match(blk): - for word in __cut(blk): + #pprint.pprint(__cut_DAG(blk)) + for word in cut_block(blk): yield word else: tmp = re_skip.split(blk) diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py new file mode 100644 index 0000000..dd124fe --- /dev/null +++ b/jieba/finalseg/__init__.py @@ -0,0 +1,70 @@ +import re +import os + +def load_model(f_name): + _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) + prob_p_path = os.path.join(_curpath,f_name) + return eval(open(prob_p_path,"rb").read()) + +prob_start = load_model("prob_start.py") +prob_trans = load_model("prob_trans.py") +prob_emit = load_model("prob_emit.py") + + + +def viterbi(obs, states, start_p, trans_p, emit_p): + V = [{}] #tabular + path = {} + for y in states: #init + V[0][y] = start_p[y] * emit_p[y].get(obs[0],0) + path[y] = [y] + for t in range(1,len(obs)): + V.append({}) + newpath = {} + for y in states: + (prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in states ]) + V[t][y] =prob + newpath[y] = path[state] + [y] + path = newpath + + if emit_p['M'].get(obs[-1],0)> emit_p['S'].get(obs[-1],0): + (prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E',)]) + else: + (prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')]) + + return (prob, path[state]) + + +def __cut(sentence): + prob, pos_list = viterbi(sentence,('B','M','E','S'), prob_start, prob_trans, prob_emit) + begin, next = 0,0 + for i,char in enumerate(sentence): + pos = pos_list[i] + if pos=='B': + begin = i + elif pos=='E': + yield sentence[begin:i+1] + next = i+1 + elif pos=='S': + yield char + next = i+1 + if next