From 4197dfb8fa62574a86ec09fa5ddc33fd117d6464 Mon Sep 17 00:00:00 2001
From: Dingyuan Wang
Date: Mon, 9 Feb 2015 16:26:00 +0800
Subject: [PATCH] store int directly in FREQ; small improvements

---
 README.md                  |  9 ++++--
 jieba/__init__.py          | 62 +++++++++++++++++---------------------
 jieba/finalseg/__init__.py | 17 ++++++-----
 jieba/posseg/__init__.py   | 25 +++++++--------
 4 files changed, 56 insertions(+), 57 deletions(-)

diff --git a/README.md b/README.md
index 486080b..c9be9bd 100644
--- a/README.md
+++ b/README.md
@@ -330,12 +330,12 @@ https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
 
 结巴分词 C++ 版本
 ----------------
-作者:Aszxqw
+作者:yanyiwu
 地址:https://github.com/aszxqw/cppjieba
 
 结巴分词 Node.js 版本
 ----------------
-作者:Aszxqw
+作者:yanyiwu
 地址:https://github.com/aszxqw/nodejieba
 
 结巴分词 Erlang 版本
@@ -348,6 +348,11 @@
 作者:qinwf
 地址:https://github.com/qinwf/jiebaR
 
+结巴分词 iOS 版本
+----------------
+作者:yanyiwu
+地址:https://github.com/aszxqw/iosjieba
+
 系统集成
 ========
 1. Solr: https://github.com/sing1ee/jieba-solr
diff --git a/jieba/__init__.py b/jieba/__init__.py
index 59df14b..006bc8e 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -19,10 +19,12 @@ DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
 pfdict = None # to be initialized
 FREQ = {}
-min_freq = 0.0
-total = 0.0
+total = 0
 user_word_tag_tab = {}
 initialized = False
+pool = None
+
+_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 
 log_console = logging.StreamHandler(sys.stderr)
 logger = logging.getLogger(__name__)
@@ -36,14 +38,14 @@ def setLogLevel(log_level):
 def gen_pfdict(f_name):
     lfreq = {}
     pfdict = set()
-    ltotal = 0.0
+    ltotal = 0
     with open(f_name, 'rb') as f:
         lineno = 0
         for line in f.read().rstrip().decode('utf-8').split('\n'):
             lineno += 1
             try:
-                word,freq = line.split(' ')[:2]
-                freq = float(freq)
+                word, freq = line.split(' ')[:2]
+                freq = int(freq)
                 lfreq[word] = freq
                 ltotal += freq
                 for ch in range(len(word)):
@@ -60,10 +62,6 @@ def initialize(dictionary=None):
     with DICT_LOCK:
         if initialized:
             return
-        if pfdict:
-            del pfdict
-            pfdict = None
-        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
         abs_path = os.path.join(_curpath, dictionary)
         logger.debug("Building prefix dict from %s ..." % abs_path)
         t1 = time.time()
@@ -74,31 +72,29 @@ def initialize(dictionary=None):
             cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
         load_from_cache_fail = True
-        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
+        if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
             logger.debug("Loading model from cache %s" % cache_file)
             try:
                 with open(cache_file, 'rb') as cf:
-                    pfdict,FREQ,total,min_freq = marshal.load(cf)
+                    pfdict, FREQ, total = marshal.load(cf)
                 # prevent conflict with old version
                 load_from_cache_fail = not isinstance(pfdict, set)
-            except:
+            except Exception:
                 load_from_cache_fail = True
         if load_from_cache_fail:
-            pfdict,FREQ,total = gen_pfdict(abs_path)
-            FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.items()) #normalize
-            min_freq = min(FREQ.values())
+            pfdict, FREQ, total = gen_pfdict(abs_path)
             logger.debug("Dumping model to file cache %s" % cache_file)
             try:
                 fd, fpath = tempfile.mkstemp()
                 with os.fdopen(fd, 'wb') as temp_cache_file:
-                    marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
+                    marshal.dump((pfdict, FREQ, total), temp_cache_file)
                 if os.name == 'nt':
                     from shutil import move as replace_file
                 else:
                     replace_file = os.rename
                 replace_file(fpath, cache_file)
-            except:
+            except Exception:
                 logger.exception("Dump cache file failed.")
 
         initialized = True
@@ -139,7 +135,7 @@ def calc(sentence, DAG, route):
     N = len(sentence)
     route[N] = (0.0, '')
     for idx in range(N-1, -1, -1):
-        route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])
+        route[idx] = max((log(FREQ.get(sentence[idx:x+1], 1)) - log(total) + route[x+1][0], x) for x in DAG[idx])
 
 @require_initialized
 def get_DAG(sentence):
@@ -202,7 +198,7 @@ def __cut_DAG(sentence):
                     yield buf
                     buf = ''
                 else:
-                    if (buf not in FREQ):
+                    if buf not in FREQ:
                         recognized = finalseg.cut(buf)
                         for t in recognized:
                             yield t
@@ -216,7 +212,7 @@ def __cut_DAG(sentence):
     if buf:
         if len(buf) == 1:
             yield buf
-        elif (buf not in FREQ):
+        elif buf not in FREQ:
             recognized = finalseg.cut(buf)
             for t in recognized:
                 yield t
@@ -297,26 +293,24 @@ def load_userdict(f):
     '''
     if isinstance(f, str):
         f = open(f, 'rb')
-    content = f.read().decode('utf-8')
+    content = f.read().decode('utf-8').lstrip('\ufeff')
     line_no = 0
     for line in content.split("\n"):
         line_no += 1
         if not line.rstrip():
             continue
-        tup = line.split(" ")
-        word, freq = tup[0], tup[1]
-        if freq.isdigit() is False:
-            continue
-        if line_no == 1:
-            word = word.replace('\ufeff',"") #remove bom flag if it exists
-        add_word(*tup)
+        tup = line.strip().split(" ")
+        if tup[1].isdigit():
+            add_word(*tup)
 
 @require_initialized
 def add_word(word, freq, tag=None):
     global FREQ, pfdict, total, user_word_tag_tab
-    FREQ[word] = log(float(freq) / total)
+    freq = int(freq)
+    FREQ[word] = freq
+    total += freq
     if tag is not None:
-        user_word_tag_tab[word] = tag.strip()
+        user_word_tag_tab[word] = tag
     for ch in range(len(word)):
         pfdict.add(word[:ch+1])
 
@@ -366,8 +360,8 @@ def enable_parallel(processnum=None):
     cut_for_search = pcut_for_search
 
 def disable_parallel():
-    global pool,cut,cut_for_search
-    if 'pool' in globals():
+    global pool, cut, cut_for_search
+    if pool:
         pool.close()
         pool = None
     cut = __ref_cut
@@ -383,9 +377,7 @@ def set_dictionary(dictionary_path):
     initialized = False
 
 def get_abs_path_dict():
-    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-    abs_path = os.path.join(_curpath,DICTIONARY)
-    return abs_path
+    return os.path.join(_curpath, DICTIONARY)
 
 def tokenize(unicode_sentence, mode="default", HMM=True):
     """Tokenize a sentence and yields tuples of (word, start, end)
diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py
index 5e676ad..8ca8496 100644
--- a/jieba/finalseg/__init__.py
+++ b/jieba/finalseg/__init__.py
@@ -40,8 +40,9 @@ def load_model():
 if sys.platform.startswith("java"):
     start_P, trans_P, emit_P = load_model()
 else:
-    from . import prob_start,prob_trans,prob_emit
-    start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
+    from .prob_start import P as start_P
+    from .prob_trans import P as trans_P
+    from .prob_emit import P as emit_P
 
 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
@@ -49,7 +50,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
     for y in states: #init
         V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
         path[y] = [y]
-    for t in range(1,len(obs)):
+    for t in range(1, len(obs)):
         V.append({})
         newpath = {}
         for y in states:
@@ -67,7 +68,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
 def __cut(sentence):
     global emit_P
     prob, pos_list = viterbi(sentence, ('B','M','E','S'), start_P, trans_P, emit_P)
-    begin, next = 0,0
+    begin, nexti = 0, 0
     #print pos_list, sentence
     for i,char in enumerate(sentence):
         pos = pos_list[i]
@@ -75,12 +76,12 @@ def __cut(sentence):
             begin = i
         elif pos == 'E':
             yield sentence[begin:i+1]
-            next = i+1
+            nexti = i+1
         elif pos == 'S':
             yield char
-            next = i+1
-    if next < len(sentence):
-        yield sentence[next:]
+            nexti = i+1
+    if nexti < len(sentence):
+        yield sentence[nexti:]
 
 def cut(sentence):
     if not isinstance(sentence, str):
diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py
index 3277474..eb327b3 100644
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -54,8 +54,11 @@ def load_model(f_name, isJython=True):
 if sys.platform.startswith("java"):
     char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
 else:
-    from . import char_state_tab, prob_start, prob_trans, prob_emit
-    char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
+    from .char_state_tab import P as char_state_tab_P
+    from .prob_start import P as start_P
+    from .prob_trans import P as trans_P
+    from .prob_emit import P as emit_P
+
     word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
 
 def makesure_userdict_loaded(fn):
@@ -164,16 +167,14 @@ def __cut_DAG(sentence):
             if buf:
                 if len(buf) == 1:
                     yield pair(buf, word_tag_tab.get(buf, 'x'))
-                    buf = ''
+                elif buf not in jieba.FREQ:
+                    recognized = __cut_detail(buf)
+                    for t in recognized:
+                        yield t
                 else:
-                    if (buf not in jieba.FREQ):
-                        recognized = __cut_detail(buf)
-                        for t in recognized:
-                            yield t
-                    else:
-                        for elem in buf:
-                            yield pair(elem, word_tag_tab.get(elem, 'x'))
-                    buf = ''
+                    for elem in buf:
+                        yield pair(elem, word_tag_tab.get(elem, 'x'))
+                buf = ''
             yield pair(l_word, word_tag_tab.get(l_word, 'x'))
             x = y
 
@@ -228,7 +229,7 @@ def __lcut_internal_no_hmm(sentence):
 
 @makesure_userdict_loaded
 def cut(sentence, HMM=True):
-    if (not hasattr(jieba, 'pool')) or (jieba.pool is None):
+    if jieba.pool is None:
         for w in __cut_internal(sentence, HMM=HMM):
             yield w
     else:
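
A note on the main change (illustration, not part of the patch): FREQ
previously held precomputed log(freq/total) values, normalized once at
load time with min_freq as the floor for unseen words, so add_word
divided by a total it never kept in sync. With raw integer counts,
add_word can simply set FREQ[word] = freq and do total += freq, and
calc() derives the log-probability on demand. A minimal sketch of the
new scoring, using a made-up toy dictionary rather than jieba's
dict.txt:

    # Sketch of the scoring calc() now performs: FREQ maps words to raw
    # integer counts; log-probabilities are computed on demand.
    from math import log

    FREQ = {'今天': 18, '天气': 20, '不错': 9}  # hypothetical counts
    total = sum(FREQ.values())  # jieba maintains this incrementally

    def word_score(word):
        # Unseen words fall back to a count of 1, so they score
        # log(1) - log(total) == -log(total), which replaces the old
        # precomputed min_freq floor.
        return log(FREQ.get(word, 1)) - log(total)

    print(word_score('今天'))  # log(18/47)
    print(word_score('很好'))  # -log(47), the out-of-vocabulary floor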