From c6b386f65b6295d4fbe691f7eb78ec4982009ef9 Mon Sep 17 00:00:00 2001 From: Dingyuan Wang Date: Sat, 29 Nov 2014 16:06:20 +0800 Subject: [PATCH] update jieba3k --- .gitignore | 6 +- Changelog | 10 +- jieba/__init__.py | 8 +- jieba/analyse/__init__.py | 18 +++- jieba/analyse/textrank.py | 6 +- jieba/posseg/__init__.py | 4 +- jieba/posseg/viterbi.py | 5 +- setup.py | 71 ++++++++++++- test/2to3.diff | 216 +++++++++++++++++++++++++------------- 9 files changed, 250 insertions(+), 94 deletions(-) diff --git a/.gitignore b/.gitignore index 8c2c5f4..e36fabc 100644 --- a/.gitignore +++ b/.gitignore @@ -113,8 +113,10 @@ Generated_Code #added for RIA/Silverlight projects _UpgradeReport_Files/ Backup*/ UpgradeLog*.XML - - +############ +## pycharm +############ +.idea ############ ## Windows diff --git a/Changelog b/Changelog index d1628f1..be1aaa3 100644 --- a/Changelog +++ b/Changelog @@ -1,11 +1,11 @@ 2014-11-15: version 0.35.1 -1) fix Python 3.2的兼容性问题 +1. 修复 Python 3.2 的兼容性问题 2014-11-13: version 0.35 -1) 改进词典cache的dump和加载机制;by @gumblex -2)提升关键词提取的性能; by @gumblex -3)关键词提取新增基于textrank算法的子模块; by @singlee -4)修复自定义stopwords功能的bug; by @walkskyer +1. 改进词典cache的dump和加载机制;by @gumblex +2. 提升关键词提取的性能; by @gumblex +3. 关键词提取新增基于textrank算法的子模块; by @singlee +4. 修复自定义stopwords功能的bug; by @walkskyer 2014-10-20: version 0.34 diff --git a/jieba/__init__.py b/jieba/__init__.py index 37e2e62..59df14b 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.34' +__version__ = '0.35' __license__ = 'MIT' import re @@ -135,7 +135,7 @@ def __cut_all(sentence): old_j = j -def calc(sentence, DAG, idx, route): +def calc(sentence, DAG, route): N = len(sentence) route[N] = (0.0, '') for idx in range(N-1, -1, -1): @@ -164,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence): re_eng = re.compile(r'[a-zA-Z0-9]',re.U) DAG = get_DAG(sentence) route = {} - calc(sentence, DAG, 0, route) + calc(sentence, DAG, route) x = 0 N = len(sentence) buf = '' @@ -187,7 +187,7 @@ def __cut_DAG_NO_HMM(sentence): def __cut_DAG(sentence): DAG = get_DAG(sentence) route = {} - calc(sentence, DAG, 0, route=route) + calc(sentence, DAG, route=route) x = 0 buf = '' N = len(sentence) diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index 94d0f49..1b04c32 100644 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -1,5 +1,6 @@ #encoding=utf-8 import jieba +import jieba.posseg import os from operator import itemgetter try: @@ -54,25 +55,36 @@ def set_stop_words(stop_words_path): if not os.path.exists(abs_path): raise Exception("jieba: path does not exist: " + abs_path) content = open(abs_path,'rb').read().decode('utf-8') - lines = content.replace("\r","").split('\n') + lines = content.replace("\r", "").split('\n') for line in lines: STOP_WORDS.add(line) -def extract_tags(sentence, topK=20, withWeight=False): +def extract_tags(sentence, topK=20, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']): """ Extract keywords from sentence using TF-IDF algorithm. Parameter: - topK: return how many top keywords. `None` for all possible words. - withWeight: if True, return a list of (word, weight); if False, return a list of words. + - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v']. + if the POS of w is not in this list,it will be filtered. 
""" global STOP_WORDS, idf_loader idf_freq, median_idf = idf_loader.get_idf() - words = jieba.cut(sentence) + if allowPOS: + allowPOS = frozenset(allowPOS) + words = jieba.posseg.cut(sentence) + else: + words = jieba.cut(sentence) freq = {} for w in words: + if allowPOS: + if w.flag not in allowPOS: + continue + else: + w = w.word if len(w.strip()) < 2 or w.lower() in STOP_WORDS: continue freq[w] = freq.get(w, 0.0) + 1.0 diff --git a/jieba/analyse/textrank.py b/jieba/analyse/textrank.py index 9bd5e2f..12dce89 100644 --- a/jieba/analyse/textrank.py +++ b/jieba/analyse/textrank.py @@ -48,15 +48,17 @@ class UndirectWeightedGraph: return ws -def textrank(sentence, topK=10, withWeight=False): +def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']): """ Extract keywords from sentence using TextRank algorithm. Parameter: - topK: return how many top keywords. `None` for all possible words. - withWeight: if True, return a list of (word, weight); if False, return a list of words. + - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v']. + if the POS of w is not in this list,it will be filtered. """ - pos_filt = frozenset(('ns', 'n', 'vn', 'v')) + pos_filt = frozenset(allowPOS) g = UndirectWeightedGraph() cm = collections.defaultdict(int) span = 5 diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 865a07d..7d2d096 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -125,7 +125,7 @@ def __cut_detail(sentence): def __cut_DAG_NO_HMM(sentence): DAG = jieba.get_DAG(sentence) route = {} - jieba.calc(sentence, DAG, 0, route=route) + jieba.calc(sentence, DAG, route) x = 0 N = len(sentence) buf = '' @@ -150,7 +150,7 @@ def __cut_DAG(sentence): DAG = jieba.get_DAG(sentence) route = {} - jieba.calc(sentence,DAG,0,route=route) + jieba.calc(sentence, DAG, route) x = 0 buf = '' diff --git a/jieba/posseg/viterbi.py b/jieba/posseg/viterbi.py index 96c1108..4081fe6 100644 --- a/jieba/posseg/viterbi.py +++ b/jieba/posseg/viterbi.py @@ -3,8 +3,7 @@ MIN_FLOAT = -3.14e100 MIN_INF = float("-inf") def get_top_states(t_state_v, K=4): - topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K] - return [x[0] for x in topK] + return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K] def viterbi(obs, states, start_p, trans_p, emit_p): V = [{}] #tabular @@ -26,7 +25,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p): obs_states = prev_states_expect_next if prev_states_expect_next else all_states for y in obs_states: - prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states]) + prob, state = max((V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states) V[t][y] = prob mem_path[t][y] = state diff --git a/setup.py b/setup.py index 57a8421..3fcf220 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,79 @@ +# -*- coding: utf-8 -*- from distutils.core import setup +LONGDOC = """ +jieba +===== + +“结巴”中文分词:做最好的 Python 中文分词组件 + +"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to +be the best Python Chinese word segmentation module. 
+ +完整文档见 ``README.md`` + +GitHub: https://github.com/fxsjy/jieba/tree/jieba3k + +特点 +==== + +- 支持三种分词模式: + + - 精确模式,试图将句子最精确地切开,适合文本分析; + - 全模式,把句子中所有的可以成词的词语都扫描出来, + 速度非常快,但是不能解决歧义; + - 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。 + +- 支持繁体分词 +- 支持自定义词典 + +在线演示: http://jiebademo.ap01.aws.af.cm/ + +安装说明 +======== + +Python 2.x +---------- + +见 https://pypi.python.org/pypi/jieba/ + +Python 3.x +---------- + +- 目前 master 分支是只支持 Python 2.x 的 +- Python 3.x 版本的分支也已经基本可用: + https://github.com/fxsjy/jieba/tree/jieba3k + +.. code:: bash + + git clone https://github.com/fxsjy/jieba.git + git checkout jieba3k + python setup.py install + +- 或使用pip3安装: pip3 install jieba3k + +""" + setup(name='jieba3k', version='0.35.1', description='Chinese Words Segementation Utilities', + long_description=LONGDOC, author='Sun, Junyi', author_email='ccnusjy@gmail.com', - url='http://github.com/fxsjy', + url='https://github.com/fxsjy/jieba/tree/jieba3k', + license="MIT", + classifiers=[ + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + 'Natural Language :: Chinese (Simplified)', + 'Natural Language :: Chinese (Traditional)', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3', + 'Topic :: Text Processing', + 'Topic :: Text Processing :: Indexing', + 'Topic :: Text Processing :: Linguistic', + ], + keywords='NLP,tokenizing,Chinese word segementation', packages=['jieba'], package_dir={'jieba':'jieba'}, package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']} diff --git a/test/2to3.diff b/test/2to3.diff index d811204..2c4396f 100644 --- a/test/2to3.diff +++ b/test/2to3.diff @@ -1,6 +1,6 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py ---- ./jieba/analyse/analyzer.py 2014-11-07 23:07:02.779210408 +0800 -+++ ../jieba/jieba/analyse/analyzer.py 2014-11-07 23:07:02.079210422 +0800 +--- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800 ++++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800 @@ -1,4 +1,4 @@ -##encoding=utf-8 +#encoding=utf-8 @@ -8,9 +8,9 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p from whoosh.analysis import Tokenizer,Token from whoosh.lang.porter import stem diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py ---- ./jieba/analyse/__init__.py 2014-11-07 23:07:02.879210406 +0800 -+++ ../jieba/jieba/analyse/__init__.py 2014-11-07 23:16:27.171198767 +0800 -@@ -25,7 +25,7 @@ +--- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800 ++++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800 +@@ -26,7 +26,7 @@ def set_new_path(self, new_idf_path): if self.path != new_idf_path: @@ -19,7 +19,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p idf_freq = {} lines = content.rstrip('\n').split('\n') for line in lines: -@@ -81,7 +81,7 @@ +@@ -93,7 +93,7 @@ freq[k] *= idf_freq.get(k, median_idf) / total if withWeight: @@ -29,8 +29,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p tags = sorted(freq, key=freq.__getitem__, reverse=True) if topK: diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py ---- ./jieba/analyse/textrank.py 2014-11-07 23:07:02.827210407 +0800 -+++ 
../jieba/jieba/analyse/textrank.py 2014-11-07 23:18:22.059196398 +0800 +--- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800 ++++ ../jieba/jieba/analyse/textrank.py 2014-11-29 15:36:39.291931354 +0800 @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 @@ -61,7 +61,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p # to unify the weights, don't *100. ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) -@@ -70,12 +70,12 @@ +@@ -72,12 +72,12 @@ continue cm[(words[i].word, words[j].word)] += 1 @@ -77,19 +77,28 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True) if topK: diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py ---- ./jieba/finalseg/__init__.py 2014-11-07 23:07:03.147210400 +0800 -+++ ../jieba/jieba/finalseg/__init__.py 2014-11-07 23:18:43.495195956 +0800 +--- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800 ++++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800 @@ -1,4 +1,3 @@ - import re import os import marshal +@@ -89,7 +88,7 @@ + sentence = sentence.decode('utf-8') + except UnicodeDecodeError: + sentence = sentence.decode('gbk', 'ignore') +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)") ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)") + blocks = re_han.split(sentence) + for blk in blocks: + if re_han.match(blk): diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py ---- ./jieba/__init__.py 2014-11-07 23:07:02.751210408 +0800 -+++ ../jieba/jieba/__init__.py 2014-11-07 23:22:34.963191182 +0800 +--- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800 ++++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800 @@ -1,4 +1,3 @@ - - __version__ = '0.34' + __version__ = '0.35' __license__ = 'MIT' @@ -51,7 +50,7 @@ @@ -101,17 +110,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p return pfdict, lfreq, ltotal def initialize(dictionary=None): -@@ -78,7 +77,8 @@ - if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path): - logger.debug("Loading model from cache %s" % cache_file) - try: -- pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb')) -+ with open(cache_file, 'rb') as cf: -+ pfdict,FREQ,total,min_freq = marshal.load(cf) - # prevent conflict with old version - load_from_cache_fail = not isinstance(pfdict, set) - except: -@@ -228,11 +228,11 @@ +@@ -229,11 +228,11 @@ '''The main function that segments an entire sentence that contains Chinese characters into seperated words. Parameter: @@ -125,7 +124,19 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p try: sentence = sentence.decode('utf-8') except UnicodeDecodeError: -@@ -338,8 +338,6 @@ +@@ -243,9 +242,9 @@ + # \r\n|\s : whitespace characters. Will not be handled. 
+ + if cut_all: +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U) ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U) + else: +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U) ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U) + blocks = re_han.split(sentence) + if cut_all: + cut_block = __cut_all +@@ -339,8 +338,6 @@ global pool, cut, cut_for_search if os.name == 'nt': raise Exception("jieba: parallel mode only supports posix system") @@ -134,7 +145,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p from multiprocessing import Pool, cpu_count if processnum is None: processnum = cpu_count() -@@ -392,12 +390,12 @@ +@@ -393,12 +390,12 @@ def tokenize(unicode_sentence, mode="default", HMM=True): """Tokenize a sentence and yields tuples of (word, start, end) Parameter: @@ -150,8 +161,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p if mode == 'default': for w in cut(unicode_sentence, HMM=HMM): diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py ---- ./jieba/__main__.py 2014-11-07 23:07:02.563210412 +0800 -+++ ../jieba/jieba/__main__.py 2014-11-07 23:07:02.079210422 +0800 +--- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800 ++++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800 @@ -40,7 +40,7 @@ ln = fp.readline() while ln: @@ -162,8 +173,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p fp.close() diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py ---- ./jieba/posseg/__init__.py 2014-11-07 23:07:03.047210402 +0800 -+++ ../jieba/jieba/posseg/__init__.py 2014-11-07 23:19:40.883194772 +0800 +--- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800 ++++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800 @@ -1,4 +1,3 @@ - import re @@ -188,27 +199,41 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p if not isJython: return result -@@ -46,7 +45,7 @@ - - state = {} - abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P) -- with open(abs_path, 'r') as f: -+ with open(abs_path, 'rb') as f: - state = marshal.load(f) - f.closed - +@@ -105,8 +104,8 @@ + yield pair(sentence[next:], pos_list[next][1]) + + def __cut_detail(sentence): +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)") +- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+") ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)") ++ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") + blocks = re_han.split(sentence) + for blk in blocks: + if re_han.match(blk): +@@ -130,7 +129,7 @@ + x = 0 + N = len(sentence) + buf = '' +- re_eng = re.compile(r'[a-zA-Z0-9]',re.U) ++ re_eng = re.compile('[a-zA-Z0-9]',re.U) + while x < N: + y = route[x][1]+1 + l_word = sentence[x:y] +@@ -195,8 +194,8 @@ + sentence = sentence.decode('utf-8') + except UnicodeDecodeError: + sentence = sentence.decode('gbk', 'ignore') +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)") +- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+") ++ re_han, 
re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)") ++ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") + blocks = re_han.split(sentence) + if HMM: + __cut_blk = __cut_DAG diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py ---- ./jieba/posseg/viterbi.py 2014-11-07 23:07:03.079210402 +0800 -+++ ../jieba/jieba/posseg/viterbi.py 2014-11-07 23:07:02.095210422 +0800 -@@ -3,14 +3,13 @@ - MIN_INF = float("-inf") - - def get_top_states(t_state_v, K=4): -- items = list(t_state_v.items()) -- topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K] -+ topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K] - return [x[0] for x in topK] - +--- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800 ++++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800 +@@ -8,7 +8,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p): V = [{}] #tabular mem_path = [{}] @@ -217,7 +242,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p for y in states.get(obs[0], all_states): #init V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT) mem_path[0][y] = '' -@@ -18,9 +17,9 @@ +@@ -16,9 +16,9 @@ V.append({}) mem_path.append({}) #prev_states = get_top_states(V[t-1]) @@ -229,7 +254,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next if not obs_states: -@@ -31,7 +30,7 @@ +@@ -29,7 +29,7 @@ V[t][y] = prob mem_path[t][y] = state @@ -239,8 +264,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p #print obs prob, state = max(last) diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md ---- ./README.md 2014-11-07 23:07:02.067210423 +0800 -+++ ../jieba/README.md 2014-11-07 23:24:49.263188412 +0800 +--- ./README.md 2014-11-29 15:46:08.487925926 +0800 ++++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800 @@ -4,6 +4,9 @@ "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module. 
- _Scroll down for English documentation._ @@ -348,18 +373,65 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p ``` diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py ---- ./setup.py 2014-11-07 23:07:02.067210423 +0800 -+++ ../jieba/setup.py 2014-11-07 23:07:02.095210422 +0800 -@@ -1,5 +1,5 @@ - from distutils.core import setup +--- ./setup.py 2014-11-29 15:46:46.379925565 +0800 ++++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800 +@@ -11,7 +11,7 @@ + + 完整文档见 ``README.md`` + +-GitHub: https://github.com/fxsjy/jieba ++GitHub: https://github.com/fxsjy/jieba/tree/jieba3k + + 特点 + ==== +@@ -34,17 +34,11 @@ + Python 2.x + ---------- + +-- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba`` +-- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行 +- python setup.py install +-- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录 +-- 通过 ``import jieba`` 来引用 ++见 https://pypi.python.org/pypi/jieba/ + + Python 3.x + ---------- + +-见 https://pypi.python.org/pypi/jieba3k/ +- + - 目前 master 分支是只支持 Python 2.x 的 + - Python 3.x 版本的分支也已经基本可用: + https://github.com/fxsjy/jieba/tree/jieba3k +@@ -59,13 +53,13 @@ + + """ + -setup(name='jieba', +setup(name='jieba3k', - version='0.35', + version='0.35.1', description='Chinese Words Segementation Utilities', + long_description=LONGDOC, author='Sun, Junyi', + author_email='ccnusjy@gmail.com', +- url='https://github.com/fxsjy/jieba', ++ url='https://github.com/fxsjy/jieba/tree/jieba3k', + license="MIT", + classifiers=[ + 'Intended Audience :: Developers', +@@ -73,9 +67,8 @@ + 'Operating System :: OS Independent', + 'Natural Language :: Chinese (Simplified)', + 'Natural Language :: Chinese (Traditional)', + 'Programming Language :: Python', +- 'Programming Language :: Python :: 2', ++ 'Programming Language :: Python :: 3', + 'Topic :: Text Processing', + 'Topic :: Text Processing :: Indexing', + 'Topic :: Text Processing :: Linguistic', diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py ---- ./test/extract_topic.py 2014-11-07 23:07:03.707210389 +0800 -+++ ../jieba/test/extract_topic.py 2014-11-07 23:07:02.095210422 +0800 +--- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800 ++++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800 @@ -51,13 +51,13 @@ print("training...") @@ -379,8 +451,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p + for i in topic.argsort()[:-n_top_words - 1:-1]])) print("") diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py ---- ./test/jiebacmd.py 2014-11-07 23:07:03.211210399 +0800 -+++ ../jieba/test/jiebacmd.py 2014-11-07 23:07:02.099210422 +0800 +--- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800 ++++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800 @@ -23,6 +23,6 @@ break line = line.strip() @@ -390,9 +462,9 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py ---- ./test/jieba_test.py 2014-11-07 23:07:03.947210384 +0800 -+++ ../jieba/test/jieba_test.py 2014-11-07 23:07:02.099210422 +0800 -@@ -1,5 +1,6 @@ +--- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800 ++++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800 +@@ -152,7 
+152,7 @@ #-*-coding: utf-8 -*- import sys +import imp @@ -417,7 +489,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error" result = list(result) assert isinstance(result, list), "Test Tokenize error on content: %s" % content -@@ -180,7 +181,7 @@ +@@ -181,7 +181,7 @@ def testTokenize_NOHMM(self): for content in test_contents: @@ -427,8 +499,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p result = list(result) assert isinstance(result, list), "Test Tokenize error on content: %s" % content diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py ---- ./test/test_tokenize_no_hmm.py 2014-11-07 23:07:04.031210382 +0800 -+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-07 23:07:02.099210422 +0800 +--- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800 ++++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800 @@ -7,7 +7,6 @@ def cuttest(test_sent): @@ -438,8 +510,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p for tk in result: print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py ---- ./test/test_tokenize.py 2014-11-07 23:07:04.071210381 +0800 -+++ ../jieba/test/test_tokenize.py 2014-11-07 23:07:02.099210422 +0800 +--- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800 ++++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800 @@ -7,7 +7,6 @@ def cuttest(test_sent):
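
Usage sketch for the allowPOS filtering this patch adds to extract_tags() and textrank() in jieba.analyse. This is a minimal illustration only, assuming the patched jieba3k package is importable; the sample sentence, topK values, and print loops are illustrative and not part of the change:

    # -*- coding: utf-8 -*-
    # Illustrative only: exercises the new allowPOS parameter introduced by this patch.
    import jieba.analyse

    sentence = "我爱北京天安门,天安门上太阳升。"  # sample text, not taken from the patch

    # TF-IDF keywords restricted to the given POS tags;
    # with withWeight=True the result is a list of (word, weight) pairs.
    for word, weight in jieba.analyse.extract_tags(
            sentence, topK=5, withWeight=True, allowPOS=['ns', 'n', 'vn', 'v']):
        print(word, weight)

    # TextRank keywords with the same POS filter; withWeight defaults to False,
    # so only the words are returned.
    for word in jieba.analyse.textrank(sentence, topK=5, allowPOS=['ns', 'n', 'vn', 'v']):
        print(word)

When allowPOS is non-empty, both functions segment with jieba.posseg.cut and drop any word whose POS flag is not in the list, as introduced in the jieba/analyse hunks above.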