Merge master and jieba3k, make the code Python 2/3 compatible

pull/233/head
Dingyuan Wang 10 years ago
commit 22bcf8be7a
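
Most of the hunks below repeat a small set of mechanical conversions: print statements become print() calls, `except X, e` becomes `except X as e`, intra-package imports become explicit relative imports, and byte/unicode handling is funneled through the new `jieba/_compat.py` helpers. A condensed, runnable sketch of the target forms (illustrative only, not part of the diff):

```python
# Python 2/3-compatible forms this commit converges on (illustrative sketch).
from __future__ import absolute_import, unicode_literals  # plain literals are unicode on Py2 too

print("Full Mode: " + "/ ".join(["x", "y"]))  # print() call instead of the Py2 print statement

try:
    int("not a number")
except ValueError as e:  # "except ValueError, e:" is Python 2-only syntax
    pass

freq = {"a": 1, "b": 2}
for k, v in freq.items():  # the library itself routes this through iteritems() in _compat.py
    print("%s %s" % (k, v))
```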

@ -68,16 +68,16 @@ python setup.py install
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list) # 全模式
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list) # 精确模式
print("Default Mode: " + "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
print ", ".join(seg_list)
print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print ", ".join(seg_list)
print(", ".join(seg_list))
```
输出:
@ -174,7 +174,7 @@ jieba.analyse.textrank(raw_text)
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
... print w.word, w.flag
... print('%s %s' % (w.word, w.flag))
...
我 r
爱 v
@ -203,7 +203,7 @@ jieba.analyse.textrank(raw_text)
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@ -219,7 +219,7 @@ word 有限公司 start: 6 end:10
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@ -413,16 +413,16 @@ Main Functions
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list) # 全模式
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list) # 默认模式
print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list)
print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print ", ".join(seg_list)
print(", ".join(seg_list))
```
Output:
@ -488,7 +488,7 @@ Use: `jieba.analyse.textrank(raw_text)`.
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
... print w.word, w.flag
... print('%s %s' % (w.word, w.flag))
...
我 r
爱 v
@ -517,7 +517,7 @@ Use: `jieba.analyse.textrank(raw_text)`.
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@ -533,7 +533,7 @@ word 有限公司 start: 6 end:10
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```

@ -1,20 +1,20 @@
from __future__ import with_statement
from __future__ import absolute_import, unicode_literals
__version__ = '0.35'
__license__ = 'MIT'
import re
import os
import sys
import finalseg
import time
import tempfile
import marshal
from math import log
import random
import threading
from functools import wraps
import logging
from hashlib import md5
from ._compat import *
from . import finalseg
DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
@ -51,13 +51,13 @@ def gen_pfdict(f_name):
ltotal += freq
for ch in xrange(len(word)):
pfdict.add(word[:ch+1])
except ValueError, e:
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
raise ValueError, e
raise e
return pfdict, lfreq, ltotal
def initialize(dictionary=None):
global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
global pfdict, FREQ, total, initialized, DICTIONARY, DICT_LOCK
if not dictionary:
dictionary = DICTIONARY
with DICT_LOCK:
@ -121,7 +121,7 @@ def require_initialized(fn):
def __cut_all(sentence):
dag = get_DAG(sentence)
old_j = -1
for k,L in dag.iteritems():
for k,L in iteritems(dag):
if len(L) == 1 and k > old_j:
yield sentence[k:L[0]+1]
old_j = L[0]
@ -158,13 +158,13 @@ def get_DAG(sentence):
return DAG
def __cut_DAG_NO_HMM(sentence):
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
DAG = get_DAG(sentence)
route = {}
calc(sentence, DAG, route)
x = 0
N = len(sentence)
buf = u''
buf = ''
while x < N:
y = route[x][1] + 1
l_word = sentence[x:y]
@ -174,19 +174,19 @@ def __cut_DAG_NO_HMM(sentence):
else:
if buf:
yield buf
buf = u''
buf = ''
yield l_word
x = y
if buf:
yield buf
buf = u''
buf = ''
def __cut_DAG(sentence):
DAG = get_DAG(sentence)
route = {}
calc(sentence, DAG, route=route)
x = 0
buf = u''
buf = ''
N = len(sentence)
while x < N:
y = route[x][1]+1
@ -197,7 +197,7 @@ def __cut_DAG(sentence):
if buf:
if len(buf) == 1:
yield buf
buf = u''
buf = ''
else:
if buf not in FREQ:
recognized = finalseg.cut(buf)
@ -206,7 +206,7 @@ def __cut_DAG(sentence):
else:
for elem in buf:
yield elem
buf = u''
buf = ''
yield l_word
x = y
@ -225,23 +225,19 @@ def cut(sentence, cut_all=False, HMM=True):
'''The main function that segments an entire sentence that contains
Chinese characters into separated words.
Parameter:
- sentence: The str/unicode to be segmented.
- sentence: The str(unicode) to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
if not isinstance(sentence, unicode):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
sentence = strdecode(sentence)
# \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence)
if cut_all:
cut_block = __cut_all
@ -292,9 +288,9 @@ def load_userdict(f):
...
Word type may be ignored
'''
if isinstance(f, (str, unicode)):
if isinstance(f, string_types):
f = open(f, 'rb')
content = f.read().decode('utf-8').lstrip(u'\ufeff')
content = f.read().decode('utf-8').lstrip('\ufeff')
line_no = 0
for line in content.split("\n"):
line_no += 1
@ -333,15 +329,13 @@ def enable_parallel(processnum=None):
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
if sys.version_info[0]==2 and sys.version_info[1]<6:
raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
pool = Pool(processnum)
def pcut(sentence,cut_all=False,HMM=True):
parts = re.compile('([\r\n]+)').split(sentence)
parts = strdecode(sentence).split('\n')
if cut_all:
result = pool.map(__lcut_all, parts)
elif HMM:
@ -353,7 +347,7 @@ def enable_parallel(processnum=None):
yield w
def pcut_for_search(sentence):
parts = re.compile('([\r\n]+)').split(sentence)
parts = strdecode(sentence).split('\n')
result = pool.map(__lcut_for_search, parts)
for r in result:
for w in r:
@ -385,11 +379,11 @@ def get_abs_path_dict():
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- sentence: the unicode to be segmented.
- sentence: the str(unicode) to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, unicode):
if not isinstance(unicode_sentence, text_type):
raise Exception("jieba: the input parameter should be unicode.")
start = 0
if mode == 'default':
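
One detail in the jieba/__init__.py hunks above is easy to miss: the `ur"..."` regex literals are gone. The `ur` prefix is a SyntaxError on Python 3, so the patterns become plain literals and `from __future__ import unicode_literals` keeps them unicode on Python 2. A minimal standalone sketch of the idea (not part of the commit):

```python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals  # plain "..." literals are unicode on Python 2 as well
import re

# Same Han-character pattern the diff uses; \uXXXX escapes work inside a
# unicode literal on both interpreters, so no ur"" prefix is needed.
re_han = re.compile("([\u4E00-\u9FA5]+)", re.U)

print("/".join(blk for blk in re_han.split("我爱Python和C++") if blk))
```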

@ -25,7 +25,7 @@ args = parser.parse_args()
if args.quiet:
jieba.setLogLevel(60)
delim = unicode(args.delimiter)
delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin
@ -40,7 +40,10 @@ if args.user_dict:
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))
result = delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm))
if PY2:
result = result.encode(default_encoding)
print(result)
ln = fp.readline()
fp.close()

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
import sys
PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()
if PY2:
text_type = unicode
string_types = (str, unicode)
iterkeys = lambda d: d.iterkeys()
itervalues = lambda d: d.itervalues()
iteritems = lambda d: d.iteritems()
else:
text_type = str
string_types = (str,)
xrange = range
iterkeys = lambda d: iter(d.keys())
itervalues = lambda d: iter(d.values())
iteritems = lambda d: iter(d.items())
def strdecode(sentence):
if not isinstance(sentence, text_type):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
return sentence
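
A short usage sketch (hypothetical snippet, not part of the diff) showing how the helpers above let calling code stay identical on Python 2 and 3:

```python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from jieba._compat import strdecode, iteritems, text_type

raw = "我来到北京清华大学".encode("utf-8")  # a byte string on either interpreter
sentence = strdecode(raw)                    # decoded to text, with a GBK fallback
assert isinstance(sentence, text_type)

freq = {"北京": 3, "清华": 2}
for word, count in iteritems(freq):  # dict.iteritems() on Py2, iter(dict.items()) on Py3
    print("%s %d" % (word, count))
```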

@ -1,13 +1,14 @@
#encoding=utf-8
from __future__ import absolute_import
import jieba
import jieba.posseg
import os
from operator import itemgetter
try:
from analyzer import ChineseAnalyzer
from .analyzer import ChineseAnalyzer
except ImportError:
pass
from textrank import textrank
from .textrank import textrank
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath, "idf.txt")

@ -1,4 +1,5 @@
##encoding=utf-8
#encoding=utf-8
from __future__ import unicode_literals
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
@ -10,9 +11,9 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
'to', 'us', 'we', 'when', 'will', 'with', 'yet',
'you', 'your', u'', u'', u''))
'you', 'your', '', '', ''))
accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")
accepted_chars = re.compile(r"[\u4E00-\u9FA5]+")
class ChineseTokenizer(Tokenizer):
def __call__(self, text, **kargs):

@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
import collections
from operator import itemgetter
@ -35,7 +36,7 @@ class UndirectWeightedGraph:
(min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
for w in ws.itervalues():
for w in itervalues(ws):
if w < min_rank:
min_rank = w
elif w > max_rank:
@ -88,4 +89,4 @@ def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v'
if __name__ == '__main__':
s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
for x, w in textrank(s, withWeight=True):
print x, w
print('%s %s' % (x, w))

@ -1,8 +1,9 @@
from __future__ import with_statement
from __future__ import absolute_import, unicode_literals
import re
import os
import marshal
import sys
from .._compat import *
MIN_FLOAT = -3.14e100
@ -41,9 +42,9 @@ def load_model():
if sys.platform.startswith("java"):
start_P, trans_P, emit_P = load_model()
else:
from prob_start import P as start_P
from prob_trans import P as trans_P
from prob_emit import P as emit_P
from .prob_start import P as start_P
from .prob_trans import P as trans_P
from .prob_emit import P as emit_P
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
@ -85,12 +86,8 @@ def __cut(sentence):
yield sentence[nexti:]
def cut(sentence):
if not isinstance(sentence, unicode):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
sentence = strdecode(sentence)
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):

File diff suppressed because it is too large

@ -1,13 +1,12 @@
from __future__ import with_statement
from __future__ import absolute_import, unicode_literals
import re
import os
import viterbi
import jieba
import sys
import marshal
from functools import wraps
default_encoding = sys.getfilesystemencoding()
from .._compat import *
from .viterbi import viterbi
PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
@ -18,13 +17,14 @@ def load_model(f_name, isJython=True):
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {}
with open(f_name, "r") as f:
with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
word, _, tag = line.split(' ')
result[word.decode('utf-8')] = tag
line = line.decode("utf-8")
word, _, tag = line.split(" ")
result[word] = tag
if not isJython:
return result
@ -55,10 +55,10 @@ def load_model(f_name, isJython=True):
if sys.platform.startswith("java"):
char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
else:
from char_state_tab import P as char_state_tab_P
from prob_start import P as start_P
from prob_trans import P as trans_P
from prob_emit import P as emit_P
from .char_state_tab import P as char_state_tab_P
from .prob_start import P as start_P
from .prob_trans import P as trans_P
from .prob_emit import P as emit_P
word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
@ -79,20 +79,23 @@ class pair(object):
self.flag = flag
def __unicode__(self):
return u'%s/%s' % (self.word, self.flag)
return '%s/%s' % (self.word, self.flag)
def __repr__(self):
return self.__str__()
def __str__(self):
return self.__unicode__().encode(default_encoding)
if PY2:
return self.__unicode__().encode(default_encoding)
else:
return self.__unicode__()
def encode(self,arg):
return self.__unicode__().encode(arg)
def __cut(sentence):
prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
begin, next = 0, 0
prob, pos_list = viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
begin, nexti = 0, 0
for i,char in enumerate(sentence):
pos = pos_list[i][0]
@ -100,16 +103,16 @@ def __cut(sentence):
begin = i
elif pos == 'E':
yield pair(sentence[begin:i+1], pos_list[i][1])
next = i+1
nexti = i+1
elif pos == 'S':
yield pair(char, pos_list[i][1])
next = i+1
if next < len(sentence):
yield pair(sentence[next:], pos_list[next][1])
nexti = i+1
if nexti < len(sentence):
yield pair(sentence[nexti:], pos_list[nexti][1])
def __cut_detail(sentence):
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
@ -132,8 +135,8 @@ def __cut_DAG_NO_HMM(sentence):
jieba.calc(sentence, DAG, route)
x = 0
N = len(sentence)
buf = u''
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
buf = ''
re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N:
y = route[x][1]+1
l_word = sentence[x:y]
@ -143,12 +146,12 @@ def __cut_DAG_NO_HMM(sentence):
else:
if buf:
yield pair(buf,'eng')
buf = u''
buf = ''
yield pair(l_word, word_tag_tab.get(l_word, 'x'))
x = y
if buf:
yield pair(buf,'eng')
buf = u''
buf = ''
def __cut_DAG(sentence):
DAG = jieba.get_DAG(sentence)
@ -157,7 +160,7 @@ def __cut_DAG(sentence):
jieba.calc(sentence, DAG, route)
x = 0
buf = u''
buf = ''
N = len(sentence)
while x < N:
y = route[x][1]+1
@ -175,7 +178,7 @@ def __cut_DAG(sentence):
else:
for elem in buf:
yield pair(elem, word_tag_tab.get(elem, 'x'))
buf = u''
buf = ''
yield pair(l_word, word_tag_tab.get(l_word, 'x'))
x = y
@ -191,13 +194,9 @@ def __cut_DAG(sentence):
yield pair(elem, word_tag_tab.get(elem, 'x'))
def __cut_internal(sentence, HMM=True):
if not isinstance(sentence, unicode):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
sentence = strdecode(sentence)
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
if HMM:
__cut_blk = __cut_DAG
@ -234,7 +233,7 @@ def cut(sentence, HMM=True):
for w in __cut_internal(sentence, HMM=HMM):
yield w
else:
parts = re.compile('([\r\n]+)').split(sentence)
parts = strdecode(sentence).split('\n')
if HMM:
result = jieba.pool.map(__lcut_internal, parts)
else:
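
The `pair.__str__` change above is the usual recipe for a text-producing class that has to behave on both interpreters: keep the text in `__unicode__` and return encoded bytes from `__str__` only under Python 2. A standalone sketch of the same pattern (hypothetical class, not jieba code):

```python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys

PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()

class WordFlag(object):
    """Minimal stand-in for jieba.posseg.pair: a word plus its POS flag."""
    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    def __unicode__(self):
        return '%s/%s' % (self.word, self.flag)

    def __str__(self):
        # str() must return bytes on Python 2 but text on Python 3.
        if PY2:
            return self.__unicode__().encode(default_encoding)
        return self.__unicode__()

print(WordFlag('天安门', 'ns'))
```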

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -1,7 +1,11 @@
import sys
import operator
MIN_FLOAT = -3.14e100
MIN_INF = float("-inf")
if sys.version_info[0] > 2:
xrange = range
def get_top_states(t_state_v, K=4):
return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
from distutils.core import setup
LONGDOC = u"""
LONGDOC = """
jieba
=====
@ -75,6 +75,12 @@ setup(name='jieba',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.2',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic',

@ -1,522 +0,0 @@
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
--- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800
+++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
--- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800
+++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800
@@ -26,7 +26,7 @@
def set_new_path(self, new_idf_path):
if self.path != new_idf_path:
- content = open(new_idf_path, 'rb').read().decode('utf-8')
+ content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.rstrip('\n').split('\n')
for line in lines:
@@ -93,7 +93,7 @@
freq[k] *= idf_freq.get(k, median_idf) / total
if withWeight:
- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
--- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800
+++ ../jieba/jieba/analyse/textrank.py 2014-11-29 15:36:39.291931354 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
@@ -22,12 +22,12 @@
outSum = collections.defaultdict(float)
wsdef = 1.0 / len(self.graph)
- for n, out in list(self.graph.items()):
+ for n, out in self.graph.items():
ws[n] = wsdef
outSum[n] = sum((e[2] for e in out), 0.0)
for x in range(10): # 10 iters
- for n, inedges in list(self.graph.items()):
+ for n, inedges in self.graph.items():
s = 0
for e in inedges:
s += e[2] / outSum[e[1]] * ws[e[1]]
@@ -41,7 +41,7 @@
elif w > max_rank:
max_rank = w
- for n, w in list(ws.items()):
+ for n, w in ws.items():
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
@@ -72,12 +72,12 @@
continue
cm[(words[i].word, words[j].word)] += 1
- for terms, w in list(cm.items()):
+ for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
if withWeight:
- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
--- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800
+++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,3 @@
-
import re
import os
import marshal
@@ -89,7 +88,7 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
--- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800
+++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800
@@ -1,4 +1,3 @@
-
__version__ = '0.35'
__license__ = 'MIT'
@@ -51,7 +50,7 @@
pfdict.add(word[:ch+1])
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
- raise ValueError(e)
+ raise e
return pfdict, lfreq, ltotal
def initialize(dictionary=None):
@@ -229,11 +228,11 @@
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
- - sentence: The str/unicode to be segmented.
+ - sentence: The str to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
- if not isinstance(sentence, str):
+ if isinstance(sentence, bytes):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
@@ -243,9 +242,9 @@
# \r\n|\s : whitespace characters. Will not be handled.
if cut_all:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence)
if cut_all:
cut_block = __cut_all
@@ -339,8 +338,6 @@
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
- if sys.version_info[0]==2 and sys.version_info[1]<6:
- raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
@@ -393,12 +390,12 @@
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- - sentence: the unicode to be segmented.
+ - sentence: the str to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, str):
- raise Exception("jieba: the input parameter should be unicode.")
+ raise Exception("jieba: the input parameter should be str.")
start = 0
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
--- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800
+++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800
@@ -40,7 +40,7 @@
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))
+ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
ln = fp.readline()
fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
--- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800
+++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800
@@ -1,4 +1,3 @@
-
import re
import os
from . import viterbi
@@ -18,14 +17,14 @@
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {}
- with open(f_name, "r") as f:
+ with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
- word, _, tag = line.split(' ')
- result[word.decode('utf-8')] = tag
-
+ line = line.decode("utf-8")
+ word, _, tag = line.split(" ")
+ result[word] = tag
if not isJython:
return result
@@ -105,8 +104,8 @@
yield pair(sentence[next:], pos_list[next][1])
def __cut_detail(sentence):
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
@@ -130,7 +129,7 @@
x = 0
N = len(sentence)
buf = ''
- re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
+ re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N:
y = route[x][1]+1
l_word = sentence[x:y]
@@ -195,8 +194,8 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
if HMM:
__cut_blk = __cut_DAG
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
--- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800
+++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800
@@ -8,7 +8,7 @@
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
mem_path = [{}]
- all_states = list(trans_p.keys())
+ all_states = trans_p.keys()
for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = ''
@@ -16,9 +16,9 @@
V.append({})
mem_path.append({})
#prev_states = get_top_states(V[t-1])
- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]
+ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))
+ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
if not obs_states:
@@ -29,7 +29,7 @@
V[t][y] = prob
mem_path[t][y] = state
- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]
+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]
#if len(last)==0:
#print obs
prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
--- ./README.md 2014-11-29 15:46:08.487925926 +0800
+++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800
@@ -4,6 +4,9 @@
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
- _Scroll down for English documentation._
+注意!
+========
+这个branch `jieba3k` 是专门用于Python3.x的版本
特点
========
@@ -68,16 +71,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 精确模式
+print("Default Mode:", "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
输出:
@@ -174,7 +177,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -203,7 +206,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -219,7 +222,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -408,16 +411,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+print("Default Mode:", "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
Output:
@@ -483,7 +486,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -512,7 +515,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -528,7 +531,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
--- ./setup.py 2014-11-29 15:46:46.379925565 +0800
+++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800
@@ -11,7 +11,7 @@
完整文档见 ``README.md``
-GitHub: https://github.com/fxsjy/jieba
+GitHub: https://github.com/fxsjy/jieba/tree/jieba3k
特点
====
@@ -34,17 +34,11 @@
Python 2.x
----------
-- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba``
-- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行
- python setup.py install
-- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录
-- 通过 ``import jieba`` 来引用
+见 https://pypi.python.org/pypi/jieba/
Python 3.x
----------
-见 https://pypi.python.org/pypi/jieba3k/
-
- 目前 master 分支是只支持 Python 2.x 的
- Python 3.x 版本的分支也已经基本可用:
https://github.com/fxsjy/jieba/tree/jieba3k
@@ -59,13 +53,13 @@
"""
-setup(name='jieba',
+setup(name='jieba3k',
version='0.35.1',
description='Chinese Words Segementation Utilities',
long_description=LONGDOC,
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',
- url='https://github.com/fxsjy/jieba',
+ url='https://github.com/fxsjy/jieba/tree/jieba3k',
license="MIT",
classifiers=[
'Intended Audience :: Developers',
@@ -73,9 +67,8 @@
'Operating System :: OS Independent',
'Natural Language :: Chinese (Simplified)',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python',
- 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 3',
'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
--- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800
+++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800
@@ -51,13 +51,13 @@
print("training...")
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
-print(("done in %0.3fs." % (time.time() - t0)))
+print("done in %0.3fs." % (time.time() - t0))
# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
- print(("Topic #%d:" % topic_idx))
- print((" ".join([feature_names[i]
- for i in topic.argsort()[:-n_top_words - 1:-1]])))
+ print("Topic #%d:" % topic_idx)
+ print(" ".join([feature_names[i]
+ for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
--- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800
+++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800
@@ -23,6 +23,6 @@
break
line = line.strip()
for word in jieba.cut(line):
- print(word.encode(default_encoding))
+ print(word)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
--- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800
+++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800
@@ -152,7 +152,7 @@
#-*-coding: utf-8 -*-
import sys
+import imp
sys.path.append("../")
import unittest
import types
@@ -97,7 +98,7 @@
class JiebaTestCase(unittest.TestCase):
def setUp(self):
- reload(jieba)
+ imp.reload(jieba)
def tearDown(self):
pass
@@ -151,7 +152,7 @@
def testTokenize(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'))
+ result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
@@ -181,7 +181,7 @@
def testTokenize_NOHMM(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+ result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
--- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800
+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
--- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800
+++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))

@ -1,34 +0,0 @@
#!/bin/bash
# Set 2to3 path.
PYTHON2TO3=2to3
# Copy the python2 version.
echo Jieba 2to3 manual conversion tool
echo
if ! git rev-parse; then
exit 1
fi
echo Copying working directory to ../jieba2
if [ -d ../jieba2 ]; then
echo Found existing ../jieba2
read -p "Replace it with new one? (y/n) " -r
if ! [[ $REPLY =~ ^[Yy]$ ]]; then
echo Cancelled.
exit
else
rm -rf ../jieba2
fi
fi
if ! git checkout jieba3k; then
exit 1
fi
cp -r . ../jieba2
cd ../jieba2
if ! git checkout master; then
exit 1
fi
# Here starts auto conversion.
echo Converting jieba2 to Python3 ...
find . -type f -name '*.py' \! -path '*/build/*' \! -name 'prob_*.py' \! -name 'char_state_tab.py' -exec $PYTHON2TO3 -w -n {} +
find . -type f \! -path '*/build/*' -a \( -name 'prob_*.py' -o -name 'char_state_tab.py' \) -exec sed -i "s/u'\\\u/'\\\u/g" {} \;
patch -p0 -s <../jieba/test/2to3.diff
echo Done. Compare jieba and jieba2 to manually port.

@ -1,17 +1,18 @@
#encoding=utf-8
from __future__ import unicode_literals
import sys
sys.path.append("../")
import jieba
seg_list = jieba.cut(u"我来到北京清华大学", cut_all=True)
print u"Full Mode:", u"/ ".join(seg_list) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut(u"我来到北京清华大学", cut_all=False)
print u"Default Mode:", u"/ ".join(seg_list) # 默认模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut(u"他来到了网易杭研大厦")
print u", ".join(seg_list)
seg_list = jieba.cut("他来到了网易杭研大厦")
print(", ".join(seg_list))
seg_list = jieba.cut_for_search(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print u", ".join(seg_list)
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print(", ".join(seg_list))

@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1:
print USAGE
print(USAGE)
sys.exit(1)
file_name = args[0]
@ -27,4 +27,4 @@ content = open(file_name, 'rb').read()
tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags)
print(",".join(tags))

@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1:
print USAGE
print(USAGE)
sys.exit(1)
file_name = args[0]
@ -29,4 +29,4 @@ jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags)
print(",".join(tags))

@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1:
print USAGE
print(USAGE)
sys.exit(1)
file_name = args[0]
@ -30,4 +30,4 @@ jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags)
print(",".join(tags))

@ -14,7 +14,7 @@ opt, args = parser.parse_args()
if len(args) < 1:
print USAGE
print(USAGE)
sys.exit(1)
file_name = args[0]
@ -38,6 +38,6 @@ tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)
if withWeight is True:
for tag in tags:
print "tag: %s\t\t weight: %f" % (tag[0],tag[1])
print("tag: %s\t\t weight: %f" % (tag[0],tag[1]))
else:
print ",".join(tags)
print(",".join(tags))

@ -12,7 +12,7 @@ import os
import random
if len(sys.argv)<2:
print "usage: extract_topic.py directory [n_topic] [n_top_words]"
print("usage: extract_topic.py directory [n_topic] [n_top_words]")
sys.exit(0)
n_topic = 10
@ -28,27 +28,27 @@ count_vect = CountVectorizer()
docs = []
pattern = os.path.join(sys.argv[1],"*.txt")
print "read "+pattern
print("read "+pattern)
for f_name in glob.glob(pattern):
with open(f_name) as f:
print "read file:", f_name
print("read file:", f_name)
for line in f: #one line as a document
words = " ".join(jieba.cut(line))
docs.append(words)
random.shuffle(docs)
print "read done."
print("read done.")
print "transform"
print("transform")
counts = count_vect.fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts)
print tfidf.shape
print(tfidf.shape)
t0 = time.time()
print "training..."
print("training...")
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

@ -1,9 +1,13 @@
#-*-coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import sys
sys.path.append("../")
import unittest
import types
import jieba
if sys.version_info[0] > 2:
from imp import reload
jieba.initialize()
@ -108,8 +112,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testDefaultCut"
print(" , ".join(result), file=sys.stderr)
print("testDefaultCut", file=sys.stderr)
def testCutAll(self):
for content in test_contents:
@ -117,8 +121,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
result = list(result)
assert isinstance(result, list), "Test CutAll error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testCutAll"
print(" , ".join(result), file=sys.stderr)
print("testCutAll", file=sys.stderr)
def testSetDictionary(self):
jieba.set_dictionary("foobar.txt")
@ -127,8 +131,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
result = list(result)
assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testSetDictionary"
print(" , ".join(result), file=sys.stderr)
print("testSetDictionary", file=sys.stderr)
def testCutForSearch(self):
for content in test_contents:
@ -136,8 +140,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
result = list(result)
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testCutForSearch"
print(" , ".join(result), file=sys.stderr)
print("testCutForSearch", file=sys.stderr)
def testPosseg(self):
import jieba.posseg as pseg
@ -146,18 +150,18 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
result = list(result)
assert isinstance(result, list), "Test Posseg error on content: %s" % content
print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result])
print >> sys.stderr, "testPosseg"
print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
print("testPosseg", file=sys.stderr)
def testTokenize(self):
for content in test_contents:
result = jieba.tokenize(content.decode('utf-8'))
result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
for tk in result:
print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print >> sys.stderr, "testTokenize"
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
print("testTokenize", file=sys.stderr)
def testDefaultCut_NOHMM(self):
for content in test_contents:
@ -165,8 +169,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testDefaultCut_NOHMM"
print(" , ".join(result), file=sys.stderr)
print("testDefaultCut_NOHMM", file=sys.stderr)
def testPosseg_NOHMM(self):
import jieba.posseg as pseg
@ -175,18 +179,18 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
result = list(result)
assert isinstance(result, list), "Test Posseg error on content: %s" % content
print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result])
print >> sys.stderr, "testPosseg_NOHMM"
print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
print("testPosseg_NOHMM", file=sys.stderr)
def testTokenize_NOHMM(self):
for content in test_contents:
result = jieba.tokenize(content.decode('utf-8'),HMM=False)
result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
for tk in result:
print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print >> sys.stderr, "testTokenize_NOHMM"
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
print("testTokenize_NOHMM", file=sys.stderr)
def testCutForSearch_NOHMM(self):
for content in test_contents:
@ -194,8 +198,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
result = list(result)
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testCutForSearch_NOHMM"
print(" , ".join(result), file=sys.stderr)
print("testCutForSearch_NOHMM", file=sys.stderr)
if __name__ == "__main__":
unittest.main()

@ -6,7 +6,7 @@ cat abc.txt | python jiebacmd.py | sort | uniq -c | sort -nr -k1 | head -100
'''
from __future__ import unicode_literals
import sys
sys.path.append("../")
@ -23,6 +23,6 @@ while True:
break
line = line.strip()
for word in jieba.cut(line):
print word.encode(default_encoding)
print(word)

@ -14,7 +14,7 @@ opt, args = parser.parse_args()
if len(args) <1:
print USAGE
print(USAGE)
sys.exit(1)
file_name = args[0]
@ -29,6 +29,6 @@ content = open(file_name,'rb').read()
tags = jieba.analyse.extract_tags(content,topK=topK)
print ",".join(tags)
print(",".join(tags))

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../../")
import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent):
result = jieba.cut(test_sent)
for word in result:
print word, "/",
print ""
print(word, "/", end=' ')
print("")
if __name__ == "__main__":

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../../")
import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent):
result = jieba.cut(test_sent,cut_all=True)
for word in result:
print word, "/",
print ""
print(word, "/", end=' ')
print("")
if __name__ == "__main__":

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../../")
import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent):
result = jieba.cut_for_search(test_sent)
for word in result:
print word, "/",
print ""
print(word, "/", end=' ')
print("")
if __name__ == "__main__":

@ -1,6 +1,5 @@
import urllib2
import sys,time
import sys
import time
sys.path.append("../../")
import jieba
@ -17,5 +16,5 @@ tm_cost = t2-t1
log_f = open("1.log","wb")
log_f.write(words.encode('utf-8'))
print 'speed' , len(content)/tm_cost, " bytes/second"
print('speed %s bytes/second' % (len(content)/tm_cost))

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../../")
import jieba
@ -8,8 +9,8 @@ import jieba.posseg as pseg
def cuttest(test_sent):
result = pseg.cut(test_sent)
for w in result:
print w.word, "/", w.flag, ", ",
print ""
print(w.word, "/", w.flag, ", ", end=' ')
print("")
if __name__ == "__main__":

@ -1,9 +1,10 @@
import urllib2
from __future__ import print_function
import sys,time
import sys
sys.path.append("../../")
import jieba
import jieba.posseg as pseg
jieba.enable_parallel(4)
url = sys.argv[1]
@ -14,9 +15,8 @@ words = list(pseg.cut(content))
t2 = time.time()
tm_cost = t2-t1
log_f = open("1.log","wb")
for w in words:
print >> log_f, w.encode("utf-8"), "/" ,
log_f = open("1.log","w")
log_f.write(' / '.join(map(str, words)))
print 'speed' , len(content)/tm_cost, " bytes/second"
print('speed' , len(content)/tm_cost, " bytes/second")

@ -6,7 +6,7 @@ import jieba
def cuttest(test_sent):
result = jieba.cut(test_sent)
print " / ".join(result)
print(" / ".join(result))
if __name__ == "__main__":

@ -5,5 +5,5 @@ import jieba
import jieba.posseg as pseg
words=pseg.cut("又跛又啞")
for w in words:
print w.word,w.flag
print(w.word,w.flag)

@ -5,7 +5,7 @@ import jieba
def cuttest(test_sent):
result = jieba.cut(test_sent)
print " ".join(result)
print(" ".join(result))
def testcase():
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。")
@ -22,6 +22,6 @@ def testcase():
if __name__ == "__main__":
testcase()
jieba.set_dictionary("foobar.txt")
print "================================"
print("================================")
testcase()

@ -6,8 +6,8 @@ import jieba
def cuttest(test_sent):
result = jieba.cut_for_search(test_sent)
for word in result:
print word, "/",
print ""
print(word, "/", end=' ')
print("")
if __name__ == "__main__":

@ -6,8 +6,8 @@ import jieba
def cuttest(test_sent):
result = jieba.cut(test_sent,cut_all=True)
for word in result:
print word, "/",
print ""
print(word, "/", end=' ')
print("")
if __name__ == "__main__":

@ -1,4 +1,3 @@
import urllib2
import sys,time
import sys
sys.path.append("../")
@ -17,6 +16,6 @@ log_f = open("1.log","wb")
log_f.write(words.encode('utf-8'))
log_f.close()
print 'cost',tm_cost
print 'speed' , len(content)/tm_cost, " bytes/second"
print('cost ' + str(tm_cost))
print('speed %s bytes/second' % (len(content)/tm_cost))

@ -8,18 +8,18 @@ import jieba
class Worker(threading.Thread):
def run(self):
seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
print "Full Mode:" + "/ ".join(seg_list) #全模式
print("Full Mode:" + "/ ".join(seg_list)) #全模式
seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
print "Default Mode:" + "/ ".join(seg_list) #默认模式
print("Default Mode:" + "/ ".join(seg_list)) #默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list)
print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
print ", ".join(seg_list)
print(", ".join(seg_list))
workers = []
for i in xrange(10):
for i in range(10):
worker = Worker()
workers.append(worker)
worker.start()

@ -6,7 +6,7 @@ import jieba
def cuttest(test_sent):
result = jieba.cut(test_sent,HMM=False)
print " / ".join(result)
print(" / ".join(result))
if __name__ == "__main__":

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../")
import jieba.posseg as pseg
@ -6,8 +7,8 @@ import jieba.posseg as pseg
def cuttest(test_sent):
result = pseg.cut(test_sent)
for w in result:
print w.word, "/", w.flag, ", ",
print ""
print(w.word, "/", w.flag, ", ", end=' ')
print("")
if __name__ == "__main__":

@ -1,6 +1,6 @@
import urllib2
import sys,time
from __future__ import print_function
import sys
import time
sys.path.append("../")
import jieba
jieba.initialize()
@ -14,9 +14,8 @@ words = list(pseg.cut(content))
t2 = time.time()
tm_cost = t2-t1
log_f = open("1.log","wb")
for w in words:
print >> log_f, w.encode("utf-8"), "/" ,
log_f = open("1.log","w")
log_f.write(' / '.join(map(str, words)))
print 'speed' , len(content)/tm_cost, " bytes/second"
print('speed' , len(content)/tm_cost, " bytes/second")

@ -6,8 +6,8 @@ import jieba.posseg as pseg
def cuttest(test_sent):
result = pseg.cut(test_sent,HMM=False)
for w in result:
print w.word, "/", w.flag, ", ",
print ""
print(w.word, "/", w.flag, ", ", end=' ')
print("")
if __name__ == "__main__":

@ -7,10 +7,9 @@ g_mode="default"
def cuttest(test_sent):
global g_mode
test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
if __name__ == "__main__":

@ -7,10 +7,9 @@ g_mode="default"
def cuttest(test_sent):
global g_mode
test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
if __name__ == "__main__":

@ -9,19 +9,19 @@ test_sent = "李小福是创新办主任也是云计算方面的专家; 什么
test_sent += "例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
words = jieba.cut(test_sent)
for w in words:
print w
print(w)
result = pseg.cut(test_sent)
for w in result:
print w.word, "/", w.flag, ", ",
print(w.word, "/", w.flag, ", ", end=' ')
print "\n========"
print("\n========")
terms = jieba.cut('easy_install is great')
for t in terms:
print t
print '-------------------------'
print(t)
print('-------------------------')
terms = jieba.cut('python 的正则表达式是好用的')
for t in terms:
print t
print(t)

@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys,os
sys.path.append("../")
from whoosh.index import create_in,open_dir
@ -18,46 +19,46 @@ ix = create_in("tmp", schema) # for create new index
writer = ix.writer()
writer.add_document(
title=u"document1",
path=u"/a",
content=u"This is the first document weve added!"
title="document1",
path="/a",
content="This is the first document weve added!"
)
writer.add_document(
title=u"document2",
path=u"/b",
content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
title="document2",
path="/b",
content="The second one 你 中文测试中文 is even more interesting! 吃水果"
)
writer.add_document(
title=u"document3",
path=u"/c",
content=u"买水果然后来世博园。"
title="document3",
path="/c",
content="买水果然后来世博园。"
)
writer.add_document(
title=u"document4",
path=u"/c",
content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
title="document4",
path="/c",
content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
)
writer.add_document(
title=u"document4",
path=u"/c",
content=u"咱俩交换一下吧。"
title="document4",
path="/c",
content="咱俩交换一下吧。"
)
writer.commit()
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果世博园",u"",u"first",u"中文",u"交换机",u"交换"):
print "result of ",keyword
for keyword in ("水果世博园","","first","中文","交换机","交换"):
print("result of ",keyword)
q = parser.parse(keyword)
results = searcher.search(q)
for hit in results:
print hit.highlights("content")
print "="*10
print(hit.highlights("content"))
print("="*10)
for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
print t.text
for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
print(t.text)

@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys
import os
sys.path.append("../")
@ -23,8 +24,8 @@ with open(file_name,"rb") as inf:
for line in inf:
i+=1
writer.add_document(
title=u"line"+str(i),
path=u"/a",
title="line"+str(i),
path="/a",
content=line.decode('gbk','ignore')
)
writer.commit()
@ -32,10 +33,10 @@ writer.commit()
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果小姐",u"",u"first",u"中文",u"交换机",u"交换"):
print "result of ",keyword
for keyword in ("水果小姐","","first","中文","交换机","交换"):
print("result of " + keyword)
q = parser.parse(keyword)
results = searcher.search(q)
for hit in results:
print hit.highlights("content")
print "="*10
print(hit.highlights("content"))
print("="*10)

@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys
import os
sys.path.append("../")
@ -18,10 +19,10 @@ ix = open_dir("tmp")
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果小姐",u"",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"):
print "result of ",keyword
for keyword in ("水果小姐","","first","中文","交换机","交换","少林","乔峰"):
print("result of ",keyword)
q = parser.parse(keyword)
results = searcher.search(q)
for hit in results:
print hit.highlights("content")
print "="*10
print(hit.highlights("content"))
print("="*10)