From 18678d50c6b245bdde170da769e5b9318cba70e5 Mon Sep 17 00:00:00 2001 From: fxsjy Date: Tue, 28 Jan 2014 13:48:03 +0800 Subject: [PATCH] fix bug issue #132 --- jieba/posseg/char_state_tab.py | 2 +- jieba/posseg/viterbi.py | 2 ++ test/test_bug.py | 9 +++++++++ test/test_userdict.py | 2 +- test/userdict.txt | 3 ++- 5 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 test/test_bug.py diff --git a/jieba/posseg/char_state_tab.py b/jieba/posseg/char_state_tab.py index 50cbac3..991dfe5 100644 --- a/jieba/posseg/char_state_tab.py +++ b/jieba/posseg/char_state_tab.py @@ -52545,7 +52545,7 @@ P={u'\u4e00': (('B', 'm'), u'\u8dd6': (('S', 'g'), ('M', 'n'), ('B', 'n'), ('E', 'nr')), u'\u8dd7': (('B', 'n'),), u'\u8dda': (('E', 'v'), ('S', 'x'), ('E', 'l'), ('E', 'nr'), ('E', 'vn')), - u'\u8ddb': (('B', 'n'), ('B', 'v')), + u'\u8ddb': (('B', 'n'), ('B', 'v'), ('S','a') ), u'\u8ddd': (('B', 'n'), ('S', 'p'), ('E', 'n'), diff --git a/jieba/posseg/viterbi.py b/jieba/posseg/viterbi.py index 97b4d6e..4451dcb 100644 --- a/jieba/posseg/viterbi.py +++ b/jieba/posseg/viterbi.py @@ -23,7 +23,9 @@ def viterbi(obs, states, start_p, trans_p, emit_p): obs_states = states.get(obs[t],all_states) obs_states = set(obs_states) & set(prev_states_expect_next) + if len(obs_states)==0: obs_states = prev_states_expect_next if len(obs_states)==0: obs_states = all_states + for y in obs_states: (prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states]) V[t][y] =prob diff --git a/test/test_bug.py b/test/test_bug.py new file mode 100644 index 0000000..e6837e3 --- /dev/null +++ b/test/test_bug.py @@ -0,0 +1,9 @@ +#encoding=utf-8 +import sys +sys.path.append("../") +import jieba +import jieba.posseg as pseg +words=pseg.cut("又跛又啞") +for w in words: + print w.word,w.flag + diff --git a/test/test_userdict.py b/test/test_userdict.py index d349923..e5a4727 100644 --- a/test/test_userdict.py +++ b/test/test_userdict.py @@ -5,7 +5,7 @@ import jieba jieba.load_userdict("userdict.txt") import jieba.posseg as pseg -test_sent = "李小福是创新办主任也是云计算方面的专家;" +test_sent = "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿" test_sent += "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" words = jieba.cut(test_sent) for w in words: diff --git a/test/userdict.txt b/test/userdict.txt index 73d2d58..fc73d5c 100644 --- a/test/userdict.txt +++ b/test/userdict.txt @@ -3,4 +3,5 @@ 创新办 3 i easy_install 3 eng 好用 300 -韩玉赏鉴 3 nz \ No newline at end of file +韩玉赏鉴 3 nz +八一双鹿 3 nz \ No newline at end of file