merge change from chao78787

pull/72/merge
Sun Junyi 12 years ago
commit 9ea14a8a54

@@ -144,7 +144,7 @@ Output:
* Experimental result: on a 4-core 3.4GHz Linux machine, precise-mode segmentation of the complete works of Jin Yong reached 1 MB/s, 3.3 times the speed of the single-process version.
-Feature 5): Tokenize: return the start/end positions of words in the original text
+Feature 6): Tokenize: return the start/end positions of words in the original text
============================================
* Note: the input accepts unicode only
* Default mode
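For context, the feature above can be exercised as follows; this is a minimal sketch assuming jieba's `jieba.tokenize()` generator, which yields `(word, start, end)` tuples for a unicode sentence (the sample sentence matches the output shown in the next hunk):

```python
# -*- coding: utf-8 -*-
# Minimal sketch of Tokenize in default mode; the input must be unicode.
import jieba

for tk in jieba.tokenize(u'永和服装饰品有限公司'):
    # tk is a (word, start, end) tuple
    print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
```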
@@ -181,7 +181,7 @@ word 有限公司 start: 6 end:10
```
-Feature 6): ChineseAnalyzer for the Whoosh search engine
+Feature 7): ChineseAnalyzer for the Whoosh search engine
============================================
* Import: `from jieba.analyse import ChineseAnalyzer`
* Usage example: https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py
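A minimal sketch of wiring the analyzer into a Whoosh schema (the field names are illustrative; the full example is at the URL above):

```python
# -*- coding: utf-8 -*-
# Minimal sketch: use jieba's ChineseAnalyzer to tokenize indexed Chinese text.
from whoosh.fields import Schema, TEXT, ID
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()
schema = Schema(path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
```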

@@ -264,15 +264,22 @@ def load_userdict(f):
         if line_no==1:
             word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
         if len(tup)==3:
-            user_word_tag_tab[word]=tup[2].strip()
+            add_word(word, freq, tup[2])
+        else:
+            add_word(word, freq)
+
+def add_word(word, freq, tag=None):
+    global FREQ, trie, total, user_word_tag_tab
+    freq = float(freq)
+    FREQ[word] = log(freq / total)
+    if tag is not None:
+        user_word_tag_tab[word] = tag.strip()
     p = trie
     for c in word:
         if not c in p:
-            p[c] ={}
+            p[c] = {}
         p = p[c]
-    p['']='' #ending flag
+    p[''] = '' # ending flag
 __ref_cut = cut
 __ref_cut_for_search = cut_for_search
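The new `add_word()` consolidates what `load_userdict()` previously did inline: it stores the log-normalized frequency in `FREQ`, records the optional POS tag, and inserts the word into the trie character by character with `''` as the ending flag. Below is a minimal sketch of driving it through `load_userdict()`; the file name and dictionary entries are illustrative:

```python
# -*- coding: utf-8 -*-
# Minimal usage sketch for the refactored user-dictionary loading.
# Each line of the dictionary is "word freq [tag]", e.g.:
#   云计算 5
#   创新办 3 i
import jieba

jieba.load_userdict('userdict.txt')  # illustrative file name
# A 3-field line calls add_word(word, freq, tag); a 2-field line calls
# add_word(word, freq). Both update FREQ with log(freq / total) and add
# the word to the trie so the segmenter can recognize it.
```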

@@ -1,6 +1,6 @@
 from distutils.core import setup
 setup(name='jieba',
-      version='0.30',
+      version='0.31.alpha',
       description='Chinese Words Segementation Utilities',
       author='Sun, Junyi',
       author_email='ccnusjy@gmail.com',
