improve POS tagging

pull/13/head
fxsjy 12 years ago
parent 7612a62115
commit 90cd4b3014

@ -82,7 +82,7 @@ def calc(sentence,DAG,idx,route):
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
route[idx] = max(candidates)
def __cut_DAG(sentence):
def get_DAG(sentence):
N = len(sentence)
i,j=0,0
p = trie
@ -107,11 +107,15 @@ def __cut_DAG(sentence):
for i in xrange(len(sentence)):
if not i in DAG:
DAG[i] =[i]
#pprint.pprint(DAG)
return DAG
def __cut_DAG(sentence):
DAG = get_DAG(sentence)
route ={}
calc(sentence,DAG,0,route=route)
x = 0
buf =u''
N = len(sentence)
while x<N:
y = route[x][1]+1
l_word = sentence[x:y]

@ -1,6 +1,8 @@
import re
import os
import viterbi
import jieba
def load_model(f_name):
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
@ -19,9 +21,32 @@ prob_trans = load_model("prob_trans.py")
prob_emit = load_model("prob_emit.py")
char_state_tab = load_model("char_state_tab.py")
class pair(object):
def __init__(self,word,flag):
self.word = word
self.flag = flag
def __unicode__(self):
return self.word+u"/"+self.flag
def __repr__(self):
return self.__unicode__()
def __cut(sentence):
prob, pos_list = viterbi.viterbi(sentence,char_state_tab, prob_start, prob_trans, prob_emit)
def encode(self,arg):
return self.__unicode__().encode(arg)
def __cut(sentence,tags_limited=False):
limit_tags = None
if tags_limited:
limit_tags = []
if len(sentence)==1:
limit_tags = ['S']
else:
limit_tags.append('B')
for i in xrange(len(sentence)-2):
limit_tags.append('M')
limit_tags.append('E')
prob, pos_list = viterbi.viterbi(sentence,char_state_tab, prob_start, prob_trans, prob_emit,limit_tags)
begin, next = 0,0
for i,char in enumerate(sentence):
@ -29,13 +54,48 @@ def __cut(sentence):
if pos=='B':
begin = i
elif pos=='E':
yield sentence[begin:i+1]+"/"+pos_list[i][1]
yield pair(sentence[begin:i+1], pos_list[i][1])
next = i+1
elif pos=='S':
yield char+"/"+pos_list[i][1]
yield pair(char,pos_list[i][1])
next = i+1
if next<len(sentence):
yield sentence[next:]+"/"+pos_list[next][1]
yield pair(sentence[next:], pos_list[next][1] )
def __cut_DAG(sentence):
    """Yield tagged ``pair`` objects for ``sentence`` using the dictionary DAG.

    The segmentation path is computed with ``jieba.get_DAG`` and
    ``jieba.calc``.  Multi-character words on that path are tagged by
    ``__cut`` with ``tags_limited=True``.  Consecutive single-character
    pieces are accumulated in a buffer and handed to ``__cut`` as one run
    as soon as the next multi-character word, or the end of the input, is
    reached.
    """
    def tag_pending(chars):
        # Tag a buffered run of single characters.  For a one-character
        # run only the first item produced by __cut is emitted.
        if len(chars) == 1:
            yield list(__cut(chars))[0]
        else:
            for item in __cut(chars):
                yield item

    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)

    pending = u''
    x = 0
    N = len(sentence)
    while x < N:
        # route[x][1] is the inclusive end index of the word chosen at x.
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            # Single character: postpone tagging until the run is complete.
            pending += l_word
        else:
            if pending:
                for item in tag_pending(pending):
                    yield item
                pending = u''
            for w in __cut(l_word, tags_limited=True):
                yield w
        x = y
    # Flush whatever single characters remain at the end of the sentence.
    if pending:
        for item in tag_pending(pending):
            yield item
def cut(sentence):
if not ( type(sentence) is unicode):
@ -48,10 +108,15 @@ def cut(sentence):
for blk in blocks:
if re_han.match(blk):
for word in __cut(blk):
for word in __cut_DAG(blk):
yield word
else:
tmp = re_skip.split(blk)
for x in tmp:
if x!="":
yield x
if re.match(ur"[0-9]+",x):
yield pair(x,'m')
elif re.match(ur"[a-zA-Z+#]+",x):
yield pair(x,'eng')
else:
yield pair(x,'x')

@ -5,7 +5,7 @@ def get_top_states(t_state_v,K=4):
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
return [x[0] for x in topK]
def viterbi(obs, states, start_p, trans_p, emit_p):
def viterbi(obs, states, start_p, trans_p, emit_p,limit_tags):
V = [{}] #tabular
mem_path = [{}]
all_states = trans_p.keys()
@ -15,19 +15,24 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
for t in range(1,len(obs)):
V.append({})
mem_path.append({})
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
#print get_top_states(V[t-1])
prev_states = get_top_states(V[t-1])
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
tmp = prev_states
if limit_tags:
prev_states = [x for x in prev_states if x[0]==limit_tags[t-1]]
if len(prev_states)==0:
prev_states = tmp
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
obs_states = states.get(obs[t],all_states)
obs_states = set(obs_states) & set(prev_states_expect_next)
if limit_tags:
obs_states = [x for x in obs_states if x[0]==limit_tags[t]]
if len(obs_states)==0: obs_states = all_states
for y in obs_states:
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
V[t][y] =prob
mem_path[t][y] = state
last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
#if len(last)==0:
#print obs

@ -7,5 +7,5 @@ setup(name='jieba',
url='http://github.com/fxsjy',
packages=['jieba'],
package_dir={'jieba':'jieba'},
package_data={'jieba':['*.*','finalseg/*']}
package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
)

@ -0,0 +1,2 @@
西三旗硅谷先锋小区半地下室出租,便宜可合租硅谷
工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作

@ -0,0 +1,20 @@
import urllib2
import sys,time
import sys
sys.path.append("../")
import jieba
url = sys.argv[1]
content = open(url,"rb").read()
t1 = time.time()
words = list(jieba.cut(content))
t2 = time.time()
tm_cost = t2-t1
log_f = open("1.log","wb")
for w in words:
print >> log_f, w.encode("gbk"), "/" ,
print 'speed' , len(content)/tm_cost, " bytes/second"

@ -0,0 +1,20 @@
import urllib2
import sys,time
import sys
sys.path.append("../")
import jieba.posseg as pseg
url = sys.argv[1]
content = open(url,"rb").read()
t1 = time.time()
words = list(pseg.cut(content))
t2 = time.time()
tm_cost = t2-t1
log_f = open("1.log","wb")
for w in words:
print >> log_f, w.encode("gbk"), "/" ,
print 'speed' , len(content)/tm_cost, " bytes/second"
Loading…
Cancel
Save