From e1c1d463248fdd2254c8971730aedbd5f500858f Mon Sep 17 00:00:00 2001
From: Sun Junyi
Date: Mon, 1 Jul 2013 12:43:33 +0800
Subject: [PATCH 1/2] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d1813ef..9ae4c53 100644
--- a/README.md
+++ b/README.md
@@ -144,7 +144,7 @@ Output:
 
 * Benchmark: on a 4-core 3.4GHz Linux machine, exact-mode segmentation of the complete works of Jin Yong reached 1 MB/s, 3.3x the single-process version.
 
-Feature 5): Tokenize: return the start/end positions of words in the original text
+Feature 6): Tokenize: return the start/end positions of words in the original text
 ============================================
 * Note: the input only accepts unicode
 * Default mode
@@ -181,7 +181,7 @@
 word 有限公司 start: 6 end:10
 ```
 
-Feature 6): ChineseAnalyzer for the Whoosh search engine
+Feature 7): ChineseAnalyzer for the Whoosh search engine
 ============================================
 * Import: `from jieba.analyse import ChineseAnalyzer`
 * Usage example: https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py

From 3246236133d0354a60a6147df819ba043349a0ba Mon Sep 17 00:00:00 2001
From: Richard Wong
Date: Wed, 3 Jul 2013 15:03:45 +0800
Subject: [PATCH 2/2] Separate cal and IO process.

---
 jieba/__init__.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/jieba/__init__.py b/jieba/__init__.py
index aacfba2..7eb8efe 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -264,15 +264,21 @@ def load_userdict(f):
         if line_no==1:
             word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
         if len(tup)==3:
-            user_word_tag_tab[word]=tup[2].strip()
-        freq = float(freq)
-        FREQ[word] = log(freq / total)
-        p = trie
-        for c in word:
-            if not c in p:
-                p[c] ={}
-            p = p[c]
-        p['']='' #ending flag
+            add_word(word, freq, tup[2])
+        add_word(word, freq)
+
+def add_word(word, freq, tag=None):
+    global FREQ, trie, total, user_word_tag_tab
+    freq = float(freq)
+    FREQ[word] = log(freq / total)
+    if tag is not None:
+        user_word_tag_tab[word] = tag.strip()
+    p = trie
+    for c in word:
+        if not c in p:
+            p[c] = {}
+        p = p[c]
+    p[''] = '' # ending flag
 
 __ref_cut = cut
 __ref_cut_for_search = cut_for_search
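
The point of the second patch is that `add_word` now encapsulates the whole "register one word" calculation (log-scaled frequency, optional POS tag, trie insertion) that was previously inlined in `load_userdict`'s I/O loop. Below is a minimal, self-contained sketch of that helper for readers who want to see the data structure in isolation; `FREQ`, `trie`, `total`, and `user_word_tag_tab` are simplified stand-ins for jieba's real module-level globals, and the `total` value here is an arbitrary placeholder (jieba derives it from the summed frequencies of its main dictionary):

```python
from math import log

# Simplified stand-ins for jieba's module-level state (assumptions,
# not jieba's actual initialization):
FREQ = {}                # word -> log(freq / total)
trie = {}                # nested dicts; the '' key marks end-of-word
user_word_tag_tab = {}   # word -> part-of-speech tag
total = 60000000.0       # placeholder frequency mass

def add_word(word, freq, tag=None):
    """Register one user-dictionary entry, mirroring the patched helper."""
    FREQ[word] = log(float(freq) / total)   # store the log-probability
    if tag is not None:
        user_word_tag_tab[word] = tag.strip()
    p = trie
    for c in word:                # descend character by character,
        if c not in p:            # creating trie nodes as needed
            p[c] = {}
        p = p[c]
    p[''] = ''                    # ending flag: a word terminates here

add_word(u'云计算', 5, 'n')
assert u'云计算' in FREQ
assert '' in trie[u'云'][u'计'][u'算']   # the full word is now a trie path
```

Note that, as the hunk reads, a three-field dictionary line triggers both `add_word(word, freq, tup[2])` and the unconditional `add_word(word, freq)`; the second call repeats the same idempotent writes (and leaves the already-stored tag alone), so it is redundant but harmless. The payoff of the extraction is that words can now be registered programmatically at runtime rather than only through the user-dictionary file.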