From 4197dfb8fa62574a86ec09fa5ddc33fd117d6464 Mon Sep 17 00:00:00 2001
From: Dingyuan Wang
Date: Mon, 9 Feb 2015 16:26:00 +0800
Subject: [PATCH] store int directly in FREQ; small improvements

---
 README.md                  |  9 ++++--
 jieba/__init__.py          | 62 +++++++++++++++++---------------------
 jieba/finalseg/__init__.py | 17 ++++++-----
 jieba/posseg/__init__.py   | 25 +++++++--------
 4 files changed, 56 insertions(+), 57 deletions(-)

diff --git a/README.md b/README.md
index 486080b..c9be9bd 100644
--- a/README.md
+++ b/README.md
@@ -330,12 +330,12 @@ https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
 
 结巴分词 C++ 版本
 ----------------
-作者:Aszxqw
+作者:yanyiwu
 地址:https://github.com/aszxqw/cppjieba
 
 结巴分词 Node.js 版本
 ----------------
-作者:Aszxqw
+作者:yanyiwu
 地址:https://github.com/aszxqw/nodejieba
 
 结巴分词 Erlang 版本
@@ -348,6 +348,11 @@
 作者:qinwf
 地址:https://github.com/qinwf/jiebaR
 
+结巴分词 iOS 版本
+----------------
+作者:yanyiwu
+地址:https://github.com/aszxqw/iosjieba
+
 系统集成
 ========
 1. Solr: https://github.com/sing1ee/jieba-solr
diff --git a/jieba/__init__.py b/jieba/__init__.py
index 59df14b..006bc8e 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -19,10 +19,12 @@ DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
 pfdict = None # to be initialized
 FREQ = {}
-min_freq = 0.0
-total = 0.0
+total = 0
 user_word_tag_tab = {}
 initialized = False
+pool = None
+
+_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 
 log_console = logging.StreamHandler(sys.stderr)
 logger = logging.getLogger(__name__)
@@ -36,14 +38,14 @@ def setLogLevel(log_level):
 def gen_pfdict(f_name):
     lfreq = {}
     pfdict = set()
-    ltotal = 0.0
+    ltotal = 0
     with open(f_name, 'rb') as f:
         lineno = 0
         for line in f.read().rstrip().decode('utf-8').split('\n'):
             lineno += 1
             try:
-                word,freq = line.split(' ')[:2]
-                freq = float(freq)
+                word, freq = line.split(' ')[:2]
+                freq = int(freq)
                 lfreq[word] = freq
                 ltotal += freq
                 for ch in range(len(word)):
@@ -60,10 +62,6 @@ def initialize(dictionary=None):
     with DICT_LOCK:
         if initialized:
             return
-        if pfdict:
-            del pfdict
-            pfdict = None
-        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
         abs_path = os.path.join(_curpath, dictionary)
         logger.debug("Building prefix dict from %s ..." % abs_path)
         t1 = time.time()
@@ -74,31 +72,29 @@ def initialize(dictionary=None):
             cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
         load_from_cache_fail = True
-        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
+        if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
             logger.debug("Loading model from cache %s" % cache_file)
             try:
                 with open(cache_file, 'rb') as cf:
-                    pfdict,FREQ,total,min_freq = marshal.load(cf)
+                    pfdict, FREQ, total = marshal.load(cf)
                 # prevent conflict with old version
                 load_from_cache_fail = not isinstance(pfdict, set)
-            except:
+            except Exception:
                 load_from_cache_fail = True
         if load_from_cache_fail:
-            pfdict,FREQ,total = gen_pfdict(abs_path)
-            FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.items()) #normalize
-            min_freq = min(FREQ.values())
+            pfdict, FREQ, total = gen_pfdict(abs_path)
             logger.debug("Dumping model to file cache %s" % cache_file)
             try:
                 fd, fpath = tempfile.mkstemp()
                 with os.fdopen(fd, 'wb') as temp_cache_file:
-                    marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
+                    marshal.dump((pfdict, FREQ, total), temp_cache_file)
                 if os.name == 'nt':
                     from shutil import move as replace_file
                 else:
                     replace_file = os.rename
                 replace_file(fpath, cache_file)
-            except:
+            except Exception:
                 logger.exception("Dump cache file failed.")
 
         initialized = True
@@ -139,7 +135,7 @@ def calc(sentence, DAG, route):
     N = len(sentence)
     route[N] = (0.0, '')
     for idx in range(N-1, -1, -1):
-        route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])
+        route[idx] = max((log(FREQ.get(sentence[idx:x+1], 1)) - log(total) + route[x+1][0], x) for x in DAG[idx])
 
 @require_initialized
 def get_DAG(sentence):
@@ -202,7 +198,7 @@ def __cut_DAG(sentence):
                     yield buf
                     buf = ''
                 else:
-                    if (buf not in FREQ):
+                    if buf not in FREQ:
                         recognized = finalseg.cut(buf)
                         for t in recognized:
                             yield t
@@ -216,7 +212,7 @@ def __cut_DAG(sentence):
     if buf:
         if len(buf) == 1:
             yield buf
-        elif (buf not in FREQ):
+        elif buf not in FREQ:
             recognized = finalseg.cut(buf)
             for t in recognized:
                 yield t
@@ -297,26 +293,24 @@ def load_userdict(f):
     '''
     if isinstance(f, str):
         f = open(f, 'rb')
-    content = f.read().decode('utf-8')
+    content = f.read().decode('utf-8').lstrip('\ufeff')
     line_no = 0
     for line in content.split("\n"):
         line_no += 1
         if not line.rstrip():
             continue
-        tup = line.split(" ")
-        word, freq = tup[0], tup[1]
-        if freq.isdigit() is False:
-            continue
-        if line_no == 1:
-            word = word.replace('\ufeff',"") #remove bom flag if it exists
-        add_word(*tup)
+        tup = line.strip().split(" ")
+        if tup[1].isdigit():
+            add_word(*tup)
 
 @require_initialized
 def add_word(word, freq, tag=None):
     global FREQ, pfdict, total, user_word_tag_tab
-    FREQ[word] = log(float(freq) / total)
+    freq = int(freq)
+    FREQ[word] = freq
+    total += freq
     if tag is not None:
-        user_word_tag_tab[word] = tag.strip()
+        user_word_tag_tab[word] = tag
     for ch in range(len(word)):
         pfdict.add(word[:ch+1])
 
@@ -366,8 +360,8 @@ def enable_parallel(processnum=None):
     cut_for_search = pcut_for_search
 
 def disable_parallel():
-    global pool,cut,cut_for_search
-    if 'pool' in globals():
+    global pool, cut, cut_for_search
+    if pool:
         pool.close()
         pool = None
     cut = __ref_cut
@@ -383,9 +377,7 @@ def set_dictionary(dictionary_path):
     initialized = False
 
 def get_abs_path_dict():
-    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-    abs_path = os.path.join(_curpath,DICTIONARY)
-    return abs_path
+    return os.path.join(_curpath, DICTIONARY)
 
 def tokenize(unicode_sentence, mode="default", HMM=True):
     """Tokenize a sentence and yields tuples of (word, start, end)
diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py
index 5e676ad..8ca8496 100644
--- a/jieba/finalseg/__init__.py
+++ b/jieba/finalseg/__init__.py
@@ -40,8 +40,9 @@ def load_model():
 if sys.platform.startswith("java"):
     start_P, trans_P, emit_P = load_model()
 else:
-    from . import prob_start,prob_trans,prob_emit
-    start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
+    from .prob_start import P as start_P
+    from .prob_trans import P as trans_P
+    from .prob_emit import P as emit_P
 
 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
@@ -49,7 +50,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
     for y in states: #init
         V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
         path[y] = [y]
-    for t in range(1,len(obs)):
+    for t in range(1, len(obs)):
         V.append({})
         newpath = {}
         for y in states:
@@ -67,7 +68,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
 def __cut(sentence):
     global emit_P
     prob, pos_list = viterbi(sentence, ('B','M','E','S'), start_P, trans_P, emit_P)
-    begin, next = 0,0
+    begin, nexti = 0, 0
     #print pos_list, sentence
     for i,char in enumerate(sentence):
         pos = pos_list[i]
@@ -75,12 +76,12 @@ def __cut(sentence):
             begin = i
         elif pos == 'E':
             yield sentence[begin:i+1]
-            next = i+1
+            nexti = i+1
         elif pos == 'S':
             yield char
-            next = i+1
-    if next < len(sentence):
-        yield sentence[next:]
+            nexti = i+1
+    if nexti < len(sentence):
+        yield sentence[nexti:]
 
 def cut(sentence):
     if not isinstance(sentence, str):
diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py
index 3277474..eb327b3 100644
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -54,8 +54,11 @@ def load_model(f_name, isJython=True):
 if sys.platform.startswith("java"):
     char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
 else:
-    from . import char_state_tab, prob_start, prob_trans, prob_emit
-    char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
+    from .char_state_tab import P as char_state_tab_P
+    from .prob_start import P as start_P
+    from .prob_trans import P as trans_P
+    from .prob_emit import P as emit_P
+
     word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
 
 def makesure_userdict_loaded(fn):
@@ -164,16 +167,14 @@ def __cut_DAG(sentence):
             if buf:
                 if len(buf) == 1:
                     yield pair(buf, word_tag_tab.get(buf, 'x'))
-                    buf = ''
+                elif buf not in jieba.FREQ:
+                    recognized = __cut_detail(buf)
+                    for t in recognized:
+                        yield t
                 else:
-                    if (buf not in jieba.FREQ):
-                        recognized = __cut_detail(buf)
-                        for t in recognized:
-                            yield t
-                    else:
-                        for elem in buf:
-                            yield pair(elem, word_tag_tab.get(elem, 'x'))
-                    buf = ''
+                    for elem in buf:
+                        yield pair(elem, word_tag_tab.get(elem, 'x'))
+                buf = ''
             yield pair(l_word, word_tag_tab.get(l_word, 'x'))
             x = y
 
@@ -228,7 +229,7 @@ def __lcut_internal_no_hmm(sentence):
 
 @makesure_userdict_loaded
 def cut(sentence, HMM=True):
-    if (not hasattr(jieba, 'pool')) or (jieba.pool is None):
+    if jieba.pool is None:
         for w in __cut_internal(sentence, HMM=HMM):
             yield w
     else:
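
A note on the main change (illustration, not part of the patch): FREQ
previously held precomputed log(freq/total) values, normalized once at
load time with min_freq as the floor for unseen words, so add_word
divided by a total it never kept in sync. With raw integer counts,
add_word can simply set FREQ[word] = freq and do total += freq, and
calc() derives the log-probability on demand. A minimal sketch of the
new scoring, using a made-up toy dictionary rather than jieba's
dict.txt:

    # Sketch of the scoring calc() now performs: FREQ maps words to raw
    # integer counts; log-probabilities are computed on demand.
    from math import log

    FREQ = {'今天': 18, '天气': 20, '不错': 9}  # hypothetical counts
    total = sum(FREQ.values())  # jieba maintains this incrementally

    def word_score(word):
        # Unseen words fall back to a count of 1, so they score
        # log(1) - log(total) == -log(total), which replaces the old
        # precomputed min_freq floor.
        return log(FREQ.get(word, 1)) - log(total)

    print(word_score('今天'))  # log(18/47)
    print(word_score('很好'))  # -log(47), the out-of-vocabulary floor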