diff --git a/README.md b/README.md
index d1813ef..9ae4c53 100644
--- a/README.md
+++ b/README.md
@@ -144,7 +144,7 @@ Output:
 
 * Experimental result: on a 4-core 3.4GHz Linux machine, accurate-mode segmentation of the complete works of Jin Yong runs at 1MB/s, 3.3 times as fast as the single-process version.
 
-Feature 5): Tokenize: return words' start positions in the original text
+Feature 6): Tokenize: return words' start positions in the original text
 ============================================
 * Note: the input parameter accepts unicode only
 * Default mode
@@ -181,7 +181,7 @@ word 有限公司 start: 6 end:10
 ```
 
-Feature 6): ChineseAnalyzer for the Whoosh search engine
+Feature 7): ChineseAnalyzer for the Whoosh search engine
 ============================================
 * Import: `from jieba.analyse import ChineseAnalyzer`
 * Usage example: https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py
diff --git a/jieba/__init__.py b/jieba/__init__.py
index 15da8c2..5731a0f 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -264,15 +264,22 @@ def load_userdict(f):
         if line_no==1:
             word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
         if len(tup)==3:
-            user_word_tag_tab[word]=tup[2].strip()
-        freq = float(freq)
-        FREQ[word] = log(freq / total)
-        p = trie
-        for c in word:
-            if not c in p:
-                p[c] ={}
-            p = p[c]
-        p['']='' #ending flag
+            add_word(word, freq, tup[2])
+        else:
+            add_word(word, freq)
+
+def add_word(word, freq, tag=None):
+    global FREQ, trie, total, user_word_tag_tab
+    freq = float(freq)
+    FREQ[word] = log(freq / total)
+    if tag is not None:
+        user_word_tag_tab[word] = tag.strip()
+    p = trie
+    for c in word:
+        if not c in p:
+            p[c] = {}
+        p = p[c]
+    p[''] = '' # ending flag
 
 __ref_cut = cut
 __ref_cut_for_search = cut_for_search
diff --git a/setup.py b/setup.py
index 259c37a..9e11bb8 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 from distutils.core import setup
 setup(name='jieba',
-      version='0.30',
+      version='0.31.alpha',
       description='Chinese Words Segementation Utilities',
       author='Sun, Junyi',
       author_email='ccnusjy@gmail.com',
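
For context on the jieba/__init__.py hunk: the patch extracts the dictionary-insertion logic of `load_userdict` into a reusable `add_word` helper that updates the frequency table, the character trie, and (optionally) the user POS-tag table in one place. Below is a minimal self-contained sketch of that technique; the module-level variables and the `total` value are illustrative stand-ins for jieba's real state, not part of the patch.

```python
from math import log

# Stand-ins for jieba's module-level state (illustrative values only).
FREQ = {}                  # word -> log probability
trie = {}                  # nested dicts; the '' key marks a complete word
total = 60101967.0         # assumed sum of all dictionary frequencies
user_word_tag_tab = {}     # word -> POS tag supplied by the user dictionary

def add_word(word, freq, tag=None):
    """Register a word: record its log probability, store an optional
    POS tag, and insert its characters into the trie with an ending flag."""
    FREQ[word] = log(float(freq) / total)
    if tag is not None:
        user_word_tag_tab[word] = tag.strip()
    p = trie
    for c in word:
        p = p.setdefault(c, {})  # descend one level, creating nodes as needed
    p[''] = ''  # ending flag: a complete word terminates at this node

# A three-field user-dictionary line like "云计算 5 n" becomes:
add_word(u'云计算', 5, u'n')
# ...and a two-field line like "创新办 3" becomes:
add_word(u'创新办', 3)
```

The sketch uses `dict.setdefault` where the patch spells out the membership test; the resulting trie is identical either way.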