jieba/test/2to3.diff

diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
--- ./jieba/analyse/analyzer.py	2014-11-07 23:07:02.779210408 +0800
+++ ../jieba/jieba/analyse/analyzer.py	2014-11-07 23:07:02.079210422 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
 from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
 from whoosh.analysis import Tokenizer,Token
 from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
--- ./jieba/analyse/__init__.py	2014-11-07 23:07:02.879210406 +0800
+++ ../jieba/jieba/analyse/__init__.py	2014-11-07 23:16:27.171198767 +0800
@@ -25,7 +25,7 @@
 
     def set_new_path(self, new_idf_path):
         if self.path != new_idf_path:
-            content = open(new_idf_path, 'rb').read().decode('utf-8')
+            content = open(new_idf_path, 'r', encoding='utf-8').read()
             idf_freq = {}
             lines = content.rstrip('\n').split('\n')
             for line in lines:
@@ -81,7 +81,7 @@
         freq[k] *= idf_freq.get(k, median_idf) / total
 
     if withWeight:
-        tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)
+        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
     else:
         tags = sorted(freq, key=freq.__getitem__, reverse=True)
     if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
--- ./jieba/analyse/textrank.py	2014-11-07 23:07:02.827210407 +0800
+++ ../jieba/jieba/analyse/textrank.py	2014-11-07 23:18:22.059196398 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
 import sys
@@ -22,12 +22,12 @@
         outSum = collections.defaultdict(float)
 
         wsdef = 1.0 / len(self.graph)
-        for n, out in list(self.graph.items()):
+        for n, out in self.graph.items():
             ws[n] = wsdef
             outSum[n] = sum((e[2] for e in out), 0.0)
 
         for x in range(10):  # 10 iters
-            for n, inedges in list(self.graph.items()):
+            for n, inedges in self.graph.items():
                 s = 0
                 for e in inedges:
                     s += e[2] / outSum[e[1]] * ws[e[1]]
@@ -41,7 +41,7 @@
             elif w > max_rank:
                 max_rank = w
 
-        for n, w in list(ws.items()):
+        for n, w in ws.items():
             # to unify the weights, don't *100.
             ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
 
@@ -70,12 +70,12 @@
                     continue
                 cm[(words[i].word, words[j].word)] += 1
 
-    for terms, w in list(cm.items()):
+    for terms, w in cm.items():
         g.addEdge(terms[0], terms[1], w)
 
     nodes_rank = g.rank()
     if withWeight:
-        tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)
+        tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
     else:
         tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
     if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
--- ./jieba/finalseg/__init__.py	2014-11-07 23:07:03.147210400 +0800
+++ ../jieba/jieba/finalseg/__init__.py	2014-11-07 23:18:43.495195956 +0800
@@ -1,4 +1,3 @@
-
 import re
 import os
 import marshal
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
--- ./jieba/__init__.py	2014-11-07 23:07:02.751210408 +0800
+++ ../jieba/jieba/__init__.py	2014-11-07 23:22:34.963191182 +0800
@@ -1,4 +1,3 @@
-
 __version__ = '0.34'
 __license__ = 'MIT'
 
@@ -51,7 +50,7 @@
                     pfdict.add(word[:ch+1])
             except ValueError as e:
                 logger.debug('%s at line %s %s' % (f_name, lineno, line))
-                raise ValueError(e)
+                raise e
     return pfdict, lfreq, ltotal
 
 def initialize(dictionary=None):
@@ -78,7 +77,8 @@
         if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
             logger.debug("Loading model from cache %s" % cache_file)
             try:
-                pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
+                with open(cache_file, 'rb') as cf:
+                    pfdict,FREQ,total,min_freq = marshal.load(cf)
                 # prevent conflict with old version
                 load_from_cache_fail = not isinstance(pfdict, set)
             except:
@@ -228,11 +228,11 @@
     '''The main function that segments an entire sentence that contains
     Chinese characters into seperated words.
     Parameter:
-        - sentence: The str/unicode to be segmented.
+        - sentence: The str to be segmented.
         - cut_all: Model type. True for full pattern, False for accurate pattern.
         - HMM: Whether to use the Hidden Markov Model.
     '''
-    if not isinstance(sentence, str):
+    if isinstance(sentence, bytes):
         try:
             sentence = sentence.decode('utf-8')
         except UnicodeDecodeError:
@@ -338,8 +338,6 @@
     global pool, cut, cut_for_search
     if os.name == 'nt':
         raise Exception("jieba: parallel mode only supports posix system")
-    if sys.version_info[0]==2 and sys.version_info[1]<6:
-        raise Exception("jieba: the parallel feature needs Python version>2.5")
     from multiprocessing import Pool, cpu_count
     if processnum is None:
         processnum = cpu_count()
@@ -392,12 +390,12 @@
 def tokenize(unicode_sentence, mode="default", HMM=True):
     """Tokenize a sentence and yields tuples of (word, start, end)
     Parameter:
-        - sentence: the unicode to be segmented.
+        - sentence: the str to be segmented.
         - mode: "default" or "search", "search" is for finer segmentation.
         - HMM: whether to use the Hidden Markov Model.
     """
     if not isinstance(unicode_sentence, str):
-        raise Exception("jieba: the input parameter should be unicode.")
+        raise Exception("jieba: the input parameter should be str.")
     start = 0
     if mode == 'default':
         for w in cut(unicode_sentence, HMM=HMM):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
--- ./jieba/__main__.py	2014-11-07 23:07:02.563210412 +0800
+++ ../jieba/jieba/__main__.py	2014-11-07 23:07:02.079210422 +0800
@@ -40,7 +40,7 @@
 ln = fp.readline()
 while ln:
     l = ln.rstrip('\r\n')
-    print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))
+    print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
     ln = fp.readline()
 
 fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
--- ./jieba/posseg/__init__.py	2014-11-07 23:07:03.047210402 +0800
+++ ../jieba/jieba/posseg/__init__.py	2014-11-07 23:19:40.883194772 +0800
@@ -1,4 +1,3 @@
-
 import re
 import os
 from . import viterbi
@@ -18,14 +17,14 @@
     _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 
     result = {}
-    with open(f_name, "r") as f:
+    with open(f_name, "rb") as f:
         for line in f:
             line = line.strip()
             if not line:
                 continue
-            word, _, tag = line.split(' ')
-            result[word.decode('utf-8')] = tag
-
+            line = line.decode("utf-8")
+            word, _, tag = line.split(" ")
+            result[word] = tag
     if not isJython:
         return result
 
@@ -46,7 +45,7 @@
 
     state = {}
     abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
         state = marshal.load(f)
     f.closed
 
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
--- ./jieba/posseg/viterbi.py	2014-11-07 23:07:03.079210402 +0800
+++ ../jieba/jieba/posseg/viterbi.py	2014-11-07 23:07:02.095210422 +0800
@@ -3,14 +3,13 @@
 MIN_INF = float("-inf")
 
 def get_top_states(t_state_v, K=4):
-    items = list(t_state_v.items())
-    topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
+    topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K]
     return [x[0] for x in topK]
 
 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
     mem_path = [{}]
-    all_states = list(trans_p.keys())
+    all_states = trans_p.keys()
     for y in states.get(obs[0], all_states): #init
         V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
         mem_path[0][y] = ''
@@ -18,9 +17,9 @@
         V.append({})
         mem_path.append({})
         #prev_states = get_top_states(V[t-1])
-        prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]
+        prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
 
-        prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))
+        prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
         obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
 
         if not obs_states:
@@ -31,7 +30,7 @@
             V[t][y] = prob
             mem_path[t][y] = state
 
-    last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]
+    last = [(V[-1][y], y) for y in mem_path[-1].keys()]
     #if len(last)==0:
         #print obs
     prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
--- ./README.md	2014-11-07 23:07:02.067210423 +0800
+++ ../jieba/README.md	2014-11-07 23:24:49.263188412 +0800
@@ -4,6 +4,9 @@
 "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
 - _Scroll down for English documentation._
 
+注意！
+========
+这个branch `jieba3k` 是专门用于Python3.x的版本
 
 特点
 ========
@@ -68,16 +71,16 @@
 import jieba
 
 seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list)  # 全模式
+print("Full Mode:", "/ ".join(seg_list))  # 全模式
 
 seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list)  # 精确模式
+print("Default Mode:", "/ ".join(seg_list))  # 精确模式
 
 seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
 
 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造")  # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
 ```
 
 输出:
@@ -174,7 +177,7 @@
 >>> import jieba.posseg as pseg
 >>> words = pseg.cut("我爱北京天安门")
 >>> for w in words:
-...    print w.word, w.flag
+...    print(w.word, w.flag)
 ...
 我 r
 爱 v
@@ -203,7 +206,7 @@
 ```python
 result = jieba.tokenize(u'永和服装饰品有限公司')
 for tk in result:
-    print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+    print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 ```
 
 ```
@@ -219,7 +222,7 @@
 ```python
 result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
 for tk in result:
-    print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+    print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 ```
 
 ```
@@ -408,16 +411,16 @@
 import jieba
 
 seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list)  # 全模式
+print("Full Mode:", "/ ".join(seg_list))  # 全模式
 
 seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list)  # 默认模式
+print("Default Mode:", "/ ".join(seg_list))  # 默认模式
 
 seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))
 
 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造")  # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
 ```
 
 Output:
@@ -483,7 +486,7 @@
 >>> import jieba.posseg as pseg
 >>> words = pseg.cut("我爱北京天安门")
 >>> for w in words:
-...    print w.word, w.flag
+...    print(w.word, w.flag)
 ...
 我 r
 爱 v
@@ -512,7 +515,7 @@
 ```python
 result = jieba.tokenize(u'永和服装饰品有限公司')
 for tk in result:
-    print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+    print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 ```
 
 ```
@@ -528,7 +531,7 @@
 ```python
 result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
 for tk in result:
-    print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+    print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 ```
 
 ```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
--- ./setup.py	2014-11-07 23:07:02.067210423 +0800
+++ ../jieba/setup.py	2014-11-07 23:07:02.095210422 +0800
@@ -1,5 +1,5 @@
 from distutils.core import setup
-setup(name='jieba',
+setup(name='jieba3k',
       version='0.34',
       description='Chinese Words Segementation Utilities',
       author='Sun, Junyi',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
--- ./test/extract_topic.py	2014-11-07 23:07:03.707210389 +0800
+++ ../jieba/test/extract_topic.py	2014-11-07 23:07:02.095210422 +0800
@@ -51,13 +51,13 @@
 print("training...")
 
 nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
-print(("done in %0.3fs." % (time.time() - t0)))
+print("done in %0.3fs." % (time.time() - t0))
 
 # Inverse the vectorizer vocabulary to be able
 feature_names = count_vect.get_feature_names()
 
 for topic_idx, topic in enumerate(nmf.components_):
-    print(("Topic #%d:" % topic_idx))
-    print((" ".join([feature_names[i]
-                    for i in topic.argsort()[:-n_top_words - 1:-1]])))
+    print("Topic #%d:" % topic_idx)
+    print(" ".join([feature_names[i]
+                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
     print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
--- ./test/jiebacmd.py	2014-11-07 23:07:03.211210399 +0800
+++ ../jieba/test/jiebacmd.py	2014-11-07 23:07:02.099210422 +0800
@@ -23,6 +23,6 @@
         break
     line = line.strip()
     for word in jieba.cut(line):
-        print(word.encode(default_encoding))
+        print(word)
 
 
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
--- ./test/jieba_test.py	2014-11-07 23:07:03.947210384 +0800
+++ ../jieba/test/jieba_test.py	2014-11-07 23:07:02.099210422 +0800
@@ -1,5 +1,6 @@
 #-*-coding: utf-8 -*-
 import sys
+import imp
 sys.path.append("../")
 import unittest
 import types
@@ -97,7 +98,7 @@
 
 class JiebaTestCase(unittest.TestCase):
     def setUp(self):
-        reload(jieba)
+        imp.reload(jieba)
 
     def tearDown(self):
         pass
@@ -151,7 +152,7 @@
 
     def testTokenize(self):
         for content in test_contents:
-            result = jieba.tokenize(content.decode('utf-8'))
+            result = jieba.tokenize(content)
             assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
             result = list(result)
             assert isinstance(result, list), "Test Tokenize error on content: %s" % content
@@ -180,7 +181,7 @@
 
     def testTokenize_NOHMM(self):
         for content in test_contents:
-            result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+            result = jieba.tokenize(content,HMM=False)
             assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
             result = list(result)
             assert isinstance(result, list), "Test Tokenize error on content: %s" % content
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
--- ./test/test_tokenize_no_hmm.py	2014-11-07 23:07:04.031210382 +0800
+++ ../jieba/test/test_tokenize_no_hmm.py	2014-11-07 23:07:02.099210422 +0800
@@ -7,7 +7,6 @@
 
 def cuttest(test_sent):
     global g_mode
-    test_sent = test_sent.decode('utf-8')
     result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
     for tk in result:
         print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
--- ./test/test_tokenize.py	2014-11-07 23:07:04.071210381 +0800
+++ ../jieba/test/test_tokenize.py	2014-11-07 23:07:02.099210422 +0800
@@ -7,7 +7,6 @@
 
 def cuttest(test_sent):
     global g_mode
-    test_sent = test_sent.decode('utf-8')
     result = jieba.tokenize(test_sent,mode=g_mode)
     for tk in result:
         print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
port extract_tags, etc to jieba3k; add auto2to3 script 11 years ago			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py`
			`--- ./jieba/analyse/analyzer.py 2014-11-07 23:07:02.779210408 +0800`
			`+++ ../jieba/jieba/analyse/analyzer.py 2014-11-07 23:07:02.079210422 +0800`
			`@@ -1,4 +1,4 @@`
			`-##encoding=utf-8`
			`+#encoding=utf-8`
			`from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter`
			`from whoosh.analysis import Tokenizer,Token`
			`from whoosh.lang.porter import stem`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py`
			`--- ./jieba/analyse/__init__.py 2014-11-07 23:07:02.879210406 +0800`
			`+++ ../jieba/jieba/analyse/__init__.py 2014-11-07 23:16:27.171198767 +0800`
			`@@ -25,7 +25,7 @@`

			`def set_new_path(self, new_idf_path):`
			`if self.path != new_idf_path:`
			`- content = open(new_idf_path, 'rb').read().decode('utf-8')`
			`+ content = open(new_idf_path, 'r', encoding='utf-8').read()`
			`idf_freq = {}`
			`lines = content.rstrip('\n').split('\n')`
			`for line in lines:`
			`@@ -81,7 +81,7 @@`
			`freq[k] *= idf_freq.get(k, median_idf) / total`

			`if withWeight:`
			`- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)`
			`+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)`
			`else:`
			`tags = sorted(freq, key=freq.__getitem__, reverse=True)`
			`if topK:`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py`
			`--- ./jieba/analyse/textrank.py 2014-11-07 23:07:02.827210407 +0800`
			`+++ ../jieba/jieba/analyse/textrank.py 2014-11-07 23:18:22.059196398 +0800`
			`@@ -1,4 +1,4 @@`
			`-#!/usr/bin/env python`
			`+#!/usr/bin/env python3`
			`# -- coding: utf-8 --`

			`import sys`
			`@@ -22,12 +22,12 @@`
			`outSum = collections.defaultdict(float)`

			`wsdef = 1.0 / len(self.graph)`
			`- for n, out in list(self.graph.items()):`
			`+ for n, out in self.graph.items():`
			`ws[n] = wsdef`
			`outSum[n] = sum((e[2] for e in out), 0.0)`

			`for x in range(10): # 10 iters`
			`- for n, inedges in list(self.graph.items()):`
			`+ for n, inedges in self.graph.items():`
			`s = 0`
			`for e in inedges:`
			`s += e[2] / outSum[e[1]] * ws[e[1]]`
			`@@ -41,7 +41,7 @@`
			`elif w > max_rank:`
			`max_rank = w`

			`- for n, w in list(ws.items()):`
			`+ for n, w in ws.items():`
			`# to unify the weights, don't *100.`
			`ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)`

			`@@ -70,12 +70,12 @@`
			`continue`
			`cm[(words[i].word, words[j].word)] += 1`

			`- for terms, w in list(cm.items()):`
			`+ for terms, w in cm.items():`
			`g.addEdge(terms[0], terms[1], w)`

			`nodes_rank = g.rank()`
			`if withWeight:`
			`- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)`
			`+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)`
			`else:`
			`tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)`
			`if topK:`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py`
			`--- ./jieba/finalseg/__init__.py 2014-11-07 23:07:03.147210400 +0800`
			`+++ ../jieba/jieba/finalseg/__init__.py 2014-11-07 23:18:43.495195956 +0800`
			`@@ -1,4 +1,3 @@`
			`-`
			`import re`
			`import os`
			`import marshal`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py`
			`--- ./jieba/__init__.py 2014-11-07 23:07:02.751210408 +0800`
			`+++ ../jieba/jieba/__init__.py 2014-11-07 23:22:34.963191182 +0800`
			`@@ -1,4 +1,3 @@`
			`-`
			`__version__ = '0.34'`
			`__license__ = 'MIT'`

			`@@ -51,7 +50,7 @@`
			`pfdict.add(word[:ch+1])`
			`except ValueError as e:`
			`logger.debug('%s at line %s %s' % (f_name, lineno, line))`
			`- raise ValueError(e)`
			`+ raise e`
			`return pfdict, lfreq, ltotal`

			`def initialize(dictionary=None):`
			`@@ -78,7 +77,8 @@`
			`if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):`
			`logger.debug("Loading model from cache %s" % cache_file)`
			`try:`
			`- pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))`
			`+ with open(cache_file, 'rb') as cf:`
			`+ pfdict,FREQ,total,min_freq = marshal.load(cf)`
			`# prevent conflict with old version`
			`load_from_cache_fail = not isinstance(pfdict, set)`
			`except:`
			`@@ -228,11 +228,11 @@`
			`'''The main function that segments an entire sentence that contains`
			`Chinese characters into seperated words.`
			`Parameter:`
			`- - sentence: The str/unicode to be segmented.`
			`+ - sentence: The str to be segmented.`
			`- cut_all: Model type. True for full pattern, False for accurate pattern.`
			`- HMM: Whether to use the Hidden Markov Model.`
			`'''`
			`- if not isinstance(sentence, str):`
			`+ if isinstance(sentence, bytes):`
			`try:`
			`sentence = sentence.decode('utf-8')`
			`except UnicodeDecodeError:`
			`@@ -338,8 +338,6 @@`
			`global pool, cut, cut_for_search`
			`if os.name == 'nt':`
			`raise Exception("jieba: parallel mode only supports posix system")`
			`- if sys.version_info[0]==2 and sys.version_info[1]<6:`
			`- raise Exception("jieba: the parallel feature needs Python version>2.5")`
			`from multiprocessing import Pool, cpu_count`
			`if processnum is None:`
			`processnum = cpu_count()`
			`@@ -392,12 +390,12 @@`
			`def tokenize(unicode_sentence, mode="default", HMM=True):`
			`"""Tokenize a sentence and yields tuples of (word, start, end)`
			`Parameter:`
			`- - sentence: the unicode to be segmented.`
			`+ - sentence: the str to be segmented.`
			`- mode: "default" or "search", "search" is for finer segmentation.`
			`- HMM: whether to use the Hidden Markov Model.`
			`"""`
			`if not isinstance(unicode_sentence, str):`
			`- raise Exception("jieba: the input parameter should be unicode.")`
			`+ raise Exception("jieba: the input parameter should be str.")`
			`start = 0`
			`if mode == 'default':`
			`for w in cut(unicode_sentence, HMM=HMM):`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py`
			`--- ./jieba/__main__.py 2014-11-07 23:07:02.563210412 +0800`
			`+++ ../jieba/jieba/__main__.py 2014-11-07 23:07:02.079210422 +0800`
			`@@ -40,7 +40,7 @@`
			`ln = fp.readline()`
			`while ln:`
			`l = ln.rstrip('\r\n')`
			`- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))`
			`+ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))`
			`ln = fp.readline()`

			`fp.close()`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py`
			`--- ./jieba/posseg/__init__.py 2014-11-07 23:07:03.047210402 +0800`
			`+++ ../jieba/jieba/posseg/__init__.py 2014-11-07 23:19:40.883194772 +0800`
			`@@ -1,4 +1,3 @@`
			`-`
			`import re`
			`import os`
			`from . import viterbi`
			`@@ -18,14 +17,14 @@`
			`_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))`

			`result = {}`
			`- with open(f_name, "r") as f:`
			`+ with open(f_name, "rb") as f:`
			`for line in f:`
			`line = line.strip()`
			`if not line:`
			`continue`
			`- word, _, tag = line.split(' ')`
			`- result[word.decode('utf-8')] = tag`
			`-`
			`+ line = line.decode("utf-8")`
			`+ word, _, tag = line.split(" ")`
			`+ result[word] = tag`
			`if not isJython:`
			`return result`

			`@@ -46,7 +45,7 @@`

			`state = {}`
			`abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)`
			`- with open(abs_path, 'r') as f:`
			`+ with open(abs_path, 'rb') as f:`
			`state = marshal.load(f)`
			`f.closed`

			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py`
			`--- ./jieba/posseg/viterbi.py 2014-11-07 23:07:03.079210402 +0800`
			`+++ ../jieba/jieba/posseg/viterbi.py 2014-11-07 23:07:02.095210422 +0800`
			`@@ -3,14 +3,13 @@`
			`MIN_INF = float("-inf")`

			`def get_top_states(t_state_v, K=4):`
			`- items = list(t_state_v.items())`
			`- topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]`
			`+ topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K]`
			`return [x[0] for x in topK]`

			`def viterbi(obs, states, start_p, trans_p, emit_p):`
			`V = [{}] #tabular`
			`mem_path = [{}]`
			`- all_states = list(trans_p.keys())`
			`+ all_states = trans_p.keys()`
			`for y in states.get(obs[0], all_states): #init`
			`V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)`
			`mem_path[0][y] = ''`
			`@@ -18,9 +17,9 @@`
			`V.append({})`
			`mem_path.append({})`
			`#prev_states = get_top_states(V[t-1])`
			`- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]`
			`+ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]`

			`- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))`
			`+ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))`
			`obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next`

			`if not obs_states:`
			`@@ -31,7 +30,7 @@`
			`V[t][y] = prob`
			`mem_path[t][y] = state`

			`- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]`
			`+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]`
			`#if len(last)==0:`
			`#print obs`
			`prob, state = max(last)`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md`
			`--- ./README.md 2014-11-07 23:07:02.067210423 +0800`
			`+++ ../jieba/README.md 2014-11-07 23:24:49.263188412 +0800`
			`@@ -4,6 +4,9 @@`
			`"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.`
			`- _Scroll down for English documentation._`

			`+注意！`
			`+========`
			+这个branch `jieba3k` 是专门用于Python3.x的版本

			`特点`
			`========`
			`@@ -68,16 +71,16 @@`
			`import jieba`

			`seg_list = jieba.cut("我来到北京清华大学", cut_all=True)`
			`-print "Full Mode:", "/ ".join(seg_list) # 全模式`
			`+print("Full Mode:", "/ ".join(seg_list)) # 全模式`

			`seg_list = jieba.cut("我来到北京清华大学", cut_all=False)`
			`-print "Default Mode:", "/ ".join(seg_list) # 精确模式`
			`+print("Default Mode:", "/ ".join(seg_list)) # 精确模式`

			`seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式`
			`-print ", ".join(seg_list)`
			`+print(", ".join(seg_list))`

			`seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造") # 搜索引擎模式`
			`-print ", ".join(seg_list)`
			`+print(", ".join(seg_list))`
			```

			`输出:`
			`@@ -174,7 +177,7 @@`
			`>>> import jieba.posseg as pseg`
			`>>> words = pseg.cut("我爱北京天安门")`
			`>>> for w in words:`
			`-... print w.word, w.flag`
			`+... print(w.word, w.flag)`
			`...`
			`我 r`
			`爱 v`
			`@@ -203,7 +206,7 @@`
			```python
			`result = jieba.tokenize(u'永和服装饰品有限公司')`
			`for tk in result:`
			`- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])`
			`+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))`
			```

			```
			`@@ -219,7 +222,7 @@`
			```python
			`result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')`
			`for tk in result:`
			`- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])`
			`+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))`
			```

			```
			`@@ -408,16 +411,16 @@`
			`import jieba`

			`seg_list = jieba.cut("我来到北京清华大学", cut_all=True)`
			`-print "Full Mode:", "/ ".join(seg_list) # 全模式`
			`+print("Full Mode:", "/ ".join(seg_list)) # 全模式`

			`seg_list = jieba.cut("我来到北京清华大学", cut_all=False)`
			`-print "Default Mode:", "/ ".join(seg_list) # 默认模式`
			`+print("Default Mode:", "/ ".join(seg_list)) # 默认模式`

			`seg_list = jieba.cut("他来到了网易杭研大厦")`
			`-print ", ".join(seg_list)`
			`+print(", ".join(seg_list))`

			`seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造") # 搜索引擎模式`
			`-print ", ".join(seg_list)`
			`+print(", ".join(seg_list))`
			```

			`Output:`
			`@@ -483,7 +486,7 @@`
			`>>> import jieba.posseg as pseg`
			`>>> words = pseg.cut("我爱北京天安门")`
			`>>> for w in words:`
			`-... print w.word, w.flag`
			`+... print(w.word, w.flag)`
			`...`
			`我 r`
			`爱 v`
			`@@ -512,7 +515,7 @@`
			```python
			`result = jieba.tokenize(u'永和服装饰品有限公司')`
			`for tk in result:`
			`- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])`
			`+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))`
			```

			```
			`@@ -528,7 +531,7 @@`
			```python
			`result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')`
			`for tk in result:`
			`- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])`
			`+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))`
			```

			```
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py`
			`--- ./setup.py 2014-11-07 23:07:02.067210423 +0800`
			`+++ ../jieba/setup.py 2014-11-07 23:07:02.095210422 +0800`
			`@@ -1,5 +1,5 @@`
			`from distutils.core import setup`
			`-setup(name='jieba',`
			`+setup(name='jieba3k',`
			`version='0.34',`
			`description='Chinese Words Segementation Utilities',`
			`author='Sun, Junyi',`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py`
			`--- ./test/extract_topic.py 2014-11-07 23:07:03.707210389 +0800`
			`+++ ../jieba/test/extract_topic.py 2014-11-07 23:07:02.095210422 +0800`
			`@@ -51,13 +51,13 @@`
			`print("training...")`

			`nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)`
			`-print(("done in %0.3fs." % (time.time() - t0)))`
			`+print("done in %0.3fs." % (time.time() - t0))`

			`# Inverse the vectorizer vocabulary to be able`
			`feature_names = count_vect.get_feature_names()`

			`for topic_idx, topic in enumerate(nmf.components_):`
			`- print(("Topic #%d:" % topic_idx))`
			`- print((" ".join([feature_names[i]`
			`- for i in topic.argsort()[:-n_top_words - 1:-1]])))`
			`+ print("Topic #%d:" % topic_idx)`
			`+ print(" ".join([feature_names[i]`
			`+ for i in topic.argsort()[:-n_top_words - 1:-1]]))`
			`print("")`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py`
			`--- ./test/jiebacmd.py 2014-11-07 23:07:03.211210399 +0800`
			`+++ ../jieba/test/jiebacmd.py 2014-11-07 23:07:02.099210422 +0800`
			`@@ -23,6 +23,6 @@`
			`break`
			`line = line.strip()`
			`for word in jieba.cut(line):`
			`- print(word.encode(default_encoding))`
			`+ print(word)`


			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py`
			`--- ./test/jieba_test.py 2014-11-07 23:07:03.947210384 +0800`
			`+++ ../jieba/test/jieba_test.py 2014-11-07 23:07:02.099210422 +0800`
			`@@ -1,5 +1,6 @@`
			`#--coding: utf-8 --`
			`import sys`
			`+import imp`
			`sys.path.append("../")`
			`import unittest`
			`import types`
			`@@ -97,7 +98,7 @@`

			`class JiebaTestCase(unittest.TestCase):`
			`def setUp(self):`
			`- reload(jieba)`
			`+ imp.reload(jieba)`

			`def tearDown(self):`
			`pass`
			`@@ -151,7 +152,7 @@`

			`def testTokenize(self):`
			`for content in test_contents:`
			`- result = jieba.tokenize(content.decode('utf-8'))`
			`+ result = jieba.tokenize(content)`
			`assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"`
			`result = list(result)`
			`assert isinstance(result, list), "Test Tokenize error on content: %s" % content`
			`@@ -180,7 +181,7 @@`

			`def testTokenize_NOHMM(self):`
			`for content in test_contents:`
			`- result = jieba.tokenize(content.decode('utf-8'),HMM=False)`
			`+ result = jieba.tokenize(content,HMM=False)`
			`assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"`
			`result = list(result)`
			`assert isinstance(result, list), "Test Tokenize error on content: %s" % content`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py`
			`--- ./test/test_tokenize_no_hmm.py 2014-11-07 23:07:04.031210382 +0800`
			`+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-07 23:07:02.099210422 +0800`
			`@@ -7,7 +7,6 @@`

			`def cuttest(test_sent):`
			`global g_mode`
			`- test_sent = test_sent.decode('utf-8')`
			`result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)`
			`for tk in result:`
			`print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))`
			`diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py`
			`--- ./test/test_tokenize.py 2014-11-07 23:07:04.071210381 +0800`
			`+++ ../jieba/test/test_tokenize.py 2014-11-07 23:07:02.099210422 +0800`
			`@@ -7,7 +7,6 @@`

			`def cuttest(test_sent):`
			`global g_mode`
			`- test_sent = test_sent.decode('utf-8')`
			`result = jieba.tokenize(test_sent,mode=g_mode)`
			`for tk in result:`
			`print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))`