From c6b386f65b6295d4fbe691f7eb78ec4982009ef9 Mon Sep 17 00:00:00 2001 From: Dingyuan Wang Date: Sat, 29 Nov 2014 16:06:20 +0800 Subject: [PATCH] update jieba3k --- .gitignore | 6 +- Changelog | 10 +- jieba/__init__.py | 8 +- jieba/analyse/__init__.py | 18 +++- jieba/analyse/textrank.py | 6 +- jieba/posseg/__init__.py | 4 +- jieba/posseg/viterbi.py | 5 +- setup.py | 71 ++++++++++++- test/2to3.diff | 216 +++++++++++++++++++++++++------------- 9 files changed, 250 insertions(+), 94 deletions(-) diff --git a/.gitignore b/.gitignore index 8c2c5f4..e36fabc 100644 --- a/.gitignore +++ b/.gitignore @@ -113,8 +113,10 @@ Generated_Code #added for RIA/Silverlight projects _UpgradeReport_Files/ Backup*/ UpgradeLog*.XML - - +############ +## pycharm +############ +.idea ############ ## Windows diff --git a/Changelog b/Changelog index d1628f1..be1aaa3 100644 --- a/Changelog +++ b/Changelog @@ -1,11 +1,11 @@ 2014-11-15: version 0.35.1 -1) fix Python 3.2的兼容性问题 +1. 修复 Python 3.2 的兼容性问题 2014-11-13: version 0.35 -1) 改进词典cache的dump和加载机制;by @gumblex -2)提升关键词提取的性能; by @gumblex -3)关键词提取新增基于textrank算法的子模块; by @singlee -4)修复自定义stopwords功能的bug; by @walkskyer +1. 改进词典cache的dump和加载机制;by @gumblex +2. 提升关键词提取的性能; by @gumblex +3. 关键词提取新增基于textrank算法的子模块; by @singlee +4. 修复自定义stopwords功能的bug; by @walkskyer 2014-10-20: version 0.34 diff --git a/jieba/__init__.py b/jieba/__init__.py index 37e2e62..59df14b 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.34' +__version__ = '0.35' __license__ = 'MIT' import re @@ -135,7 +135,7 @@ def __cut_all(sentence): old_j = j -def calc(sentence, DAG, idx, route): +def calc(sentence, DAG, route): N = len(sentence) route[N] = (0.0, '') for idx in range(N-1, -1, -1): @@ -164,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence): re_eng = re.compile(r'[a-zA-Z0-9]',re.U) DAG = get_DAG(sentence) route = {} - calc(sentence, DAG, 0, route) + calc(sentence, DAG, route) x = 0 N = len(sentence) buf = '' @@ -187,7 +187,7 @@ def __cut_DAG_NO_HMM(sentence): def __cut_DAG(sentence): DAG = get_DAG(sentence) route = {} - calc(sentence, DAG, 0, route=route) + calc(sentence, DAG, route=route) x = 0 buf = '' N = len(sentence) diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index 94d0f49..1b04c32 100644 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -1,5 +1,6 @@ #encoding=utf-8 import jieba +import jieba.posseg import os from operator import itemgetter try: @@ -54,25 +55,36 @@ def set_stop_words(stop_words_path): if not os.path.exists(abs_path): raise Exception("jieba: path does not exist: " + abs_path) content = open(abs_path,'rb').read().decode('utf-8') - lines = content.replace("\r","").split('\n') + lines = content.replace("\r", "").split('\n') for line in lines: STOP_WORDS.add(line) -def extract_tags(sentence, topK=20, withWeight=False): +def extract_tags(sentence, topK=20, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']): """ Extract keywords from sentence using TF-IDF algorithm. Parameter: - topK: return how many top keywords. `None` for all possible words. - withWeight: if True, return a list of (word, weight); if False, return a list of words. + - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v']. + if the POS of w is not in this list,it will be filtered. 
""" global STOP_WORDS, idf_loader idf_freq, median_idf = idf_loader.get_idf() - words = jieba.cut(sentence) + if allowPOS: + allowPOS = frozenset(allowPOS) + words = jieba.posseg.cut(sentence) + else: + words = jieba.cut(sentence) freq = {} for w in words: + if allowPOS: + if w.flag not in allowPOS: + continue + else: + w = w.word if len(w.strip()) < 2 or w.lower() in STOP_WORDS: continue freq[w] = freq.get(w, 0.0) + 1.0 diff --git a/jieba/analyse/textrank.py b/jieba/analyse/textrank.py index 9bd5e2f..12dce89 100644 --- a/jieba/analyse/textrank.py +++ b/jieba/analyse/textrank.py @@ -48,15 +48,17 @@ class UndirectWeightedGraph: return ws -def textrank(sentence, topK=10, withWeight=False): +def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']): """ Extract keywords from sentence using TextRank algorithm. Parameter: - topK: return how many top keywords. `None` for all possible words. - withWeight: if True, return a list of (word, weight); if False, return a list of words. + - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v']. + if the POS of w is not in this list,it will be filtered. """ - pos_filt = frozenset(('ns', 'n', 'vn', 'v')) + pos_filt = frozenset(allowPOS) g = UndirectWeightedGraph() cm = collections.defaultdict(int) span = 5 diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 865a07d..7d2d096 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -125,7 +125,7 @@ def __cut_detail(sentence): def __cut_DAG_NO_HMM(sentence): DAG = jieba.get_DAG(sentence) route = {} - jieba.calc(sentence, DAG, 0, route=route) + jieba.calc(sentence, DAG, route) x = 0 N = len(sentence) buf = '' @@ -150,7 +150,7 @@ def __cut_DAG(sentence): DAG = jieba.get_DAG(sentence) route = {} - jieba.calc(sentence,DAG,0,route=route) + jieba.calc(sentence, DAG, route) x = 0 buf = '' diff --git a/jieba/posseg/viterbi.py b/jieba/posseg/viterbi.py index 96c1108..4081fe6 100644 --- a/jieba/posseg/viterbi.py +++ b/jieba/posseg/viterbi.py @@ -3,8 +3,7 @@ MIN_FLOAT = -3.14e100 MIN_INF = float("-inf") def get_top_states(t_state_v, K=4): - topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K] - return [x[0] for x in topK] + return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K] def viterbi(obs, states, start_p, trans_p, emit_p): V = [{}] #tabular @@ -26,7 +25,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p): obs_states = prev_states_expect_next if prev_states_expect_next else all_states for y in obs_states: - prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states]) + prob, state = max((V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states) V[t][y] = prob mem_path[t][y] = state diff --git a/setup.py b/setup.py index 57a8421..3fcf220 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,79 @@ +# -*- coding: utf-8 -*- from distutils.core import setup +LONGDOC = """ +jieba +===== + +“结巴”中文分词:做最好的 Python 中文分词组件 + +"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to +be the best Python Chinese word segmentation module. 
+ +完整文档见 ``README.md`` + +GitHub: https://github.com/fxsjy/jieba/tree/jieba3k + +特点 +==== + +- 支持三种分词模式: + + - 精确模式,试图将句子最精确地切开,适合文本分析; + - 全模式,把句子中所有的可以成词的词语都扫描出来, + 速度非常快,但是不能解决歧义; + - 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。 + +- 支持繁体分词 +- 支持自定义词典 + +在线演示: http://jiebademo.ap01.aws.af.cm/ + +安装说明 +======== + +Python 2.x +---------- + +见 https://pypi.python.org/pypi/jieba/ + +Python 3.x +---------- + +- 目前 master 分支是只支持 Python 2.x 的 +- Python 3.x 版本的分支也已经基本可用: + https://github.com/fxsjy/jieba/tree/jieba3k + +.. code:: bash + + git clone https://github.com/fxsjy/jieba.git + git checkout jieba3k + python setup.py install + +- 或使用pip3安装: pip3 install jieba3k + +""" + setup(name='jieba3k', version='0.35.1', description='Chinese Words Segementation Utilities', + long_description=LONGDOC, author='Sun, Junyi', author_email='ccnusjy@gmail.com', - url='http://github.com/fxsjy', + url='https://github.com/fxsjy/jieba/tree/jieba3k', + license="MIT", + classifiers=[ + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + 'Natural Language :: Chinese (Simplified)', + 'Natural Language :: Chinese (Traditional)', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3', + 'Topic :: Text Processing', + 'Topic :: Text Processing :: Indexing', + 'Topic :: Text Processing :: Linguistic', + ], + keywords='NLP,tokenizing,Chinese word segementation', packages=['jieba'], package_dir={'jieba':'jieba'}, package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']} diff --git a/test/2to3.diff b/test/2to3.diff index d811204..2c4396f 100644 --- a/test/2to3.diff +++ b/test/2to3.diff @@ -1,6 +1,6 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py ---- ./jieba/analyse/analyzer.py 2014-11-07 23:07:02.779210408 +0800 -+++ ../jieba/jieba/analyse/analyzer.py 2014-11-07 23:07:02.079210422 +0800 +--- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800 ++++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800 @@ -1,4 +1,4 @@ -##encoding=utf-8 +#encoding=utf-8 @@ -8,9 +8,9 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p from whoosh.analysis import Tokenizer,Token from whoosh.lang.porter import stem diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py ---- ./jieba/analyse/__init__.py 2014-11-07 23:07:02.879210406 +0800 -+++ ../jieba/jieba/analyse/__init__.py 2014-11-07 23:16:27.171198767 +0800 -@@ -25,7 +25,7 @@ +--- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800 ++++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800 +@@ -26,7 +26,7 @@ def set_new_path(self, new_idf_path): if self.path != new_idf_path: @@ -19,7 +19,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p idf_freq = {} lines = content.rstrip('\n').split('\n') for line in lines: -@@ -81,7 +81,7 @@ +@@ -93,7 +93,7 @@ freq[k] *= idf_freq.get(k, median_idf) / total if withWeight: @@ -29,8 +29,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p tags = sorted(freq, key=freq.__getitem__, reverse=True) if topK: diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py ---- ./jieba/analyse/textrank.py 2014-11-07 23:07:02.827210407 +0800 -+++ 
../jieba/jieba/analyse/textrank.py 2014-11-07 23:18:22.059196398 +0800 +--- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800 ++++ ../jieba/jieba/analyse/textrank.py 2014-11-29 15:36:39.291931354 +0800 @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 @@ -61,7 +61,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p # to unify the weights, don't *100. ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) -@@ -70,12 +70,12 @@ +@@ -72,12 +72,12 @@ continue cm[(words[i].word, words[j].word)] += 1 @@ -77,19 +77,28 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True) if topK: diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py ---- ./jieba/finalseg/__init__.py 2014-11-07 23:07:03.147210400 +0800 -+++ ../jieba/jieba/finalseg/__init__.py 2014-11-07 23:18:43.495195956 +0800 +--- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800 ++++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800 @@ -1,4 +1,3 @@ - import re import os import marshal +@@ -89,7 +88,7 @@ + sentence = sentence.decode('utf-8') + except UnicodeDecodeError: + sentence = sentence.decode('gbk', 'ignore') +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)") ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)") + blocks = re_han.split(sentence) + for blk in blocks: + if re_han.match(blk): diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py ---- ./jieba/__init__.py 2014-11-07 23:07:02.751210408 +0800 -+++ ../jieba/jieba/__init__.py 2014-11-07 23:22:34.963191182 +0800 +--- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800 ++++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800 @@ -1,4 +1,3 @@ - - __version__ = '0.34' + __version__ = '0.35' __license__ = 'MIT' @@ -51,7 +50,7 @@ @@ -101,17 +110,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p return pfdict, lfreq, ltotal def initialize(dictionary=None): -@@ -78,7 +77,8 @@ - if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path): - logger.debug("Loading model from cache %s" % cache_file) - try: -- pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb')) -+ with open(cache_file, 'rb') as cf: -+ pfdict,FREQ,total,min_freq = marshal.load(cf) - # prevent conflict with old version - load_from_cache_fail = not isinstance(pfdict, set) - except: -@@ -228,11 +228,11 @@ +@@ -229,11 +228,11 @@ '''The main function that segments an entire sentence that contains Chinese characters into seperated words. Parameter: @@ -125,7 +124,19 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p try: sentence = sentence.decode('utf-8') except UnicodeDecodeError: -@@ -338,8 +338,6 @@ +@@ -243,9 +242,9 @@ + # \r\n|\s : whitespace characters. Will not be handled. 
+ + if cut_all: +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U) ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U) + else: +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U) ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U) + blocks = re_han.split(sentence) + if cut_all: + cut_block = __cut_all +@@ -339,8 +338,6 @@ global pool, cut, cut_for_search if os.name == 'nt': raise Exception("jieba: parallel mode only supports posix system") @@ -134,7 +145,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p from multiprocessing import Pool, cpu_count if processnum is None: processnum = cpu_count() -@@ -392,12 +390,12 @@ +@@ -393,12 +390,12 @@ def tokenize(unicode_sentence, mode="default", HMM=True): """Tokenize a sentence and yields tuples of (word, start, end) Parameter: @@ -150,8 +161,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p if mode == 'default': for w in cut(unicode_sentence, HMM=HMM): diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py ---- ./jieba/__main__.py 2014-11-07 23:07:02.563210412 +0800 -+++ ../jieba/jieba/__main__.py 2014-11-07 23:07:02.079210422 +0800 +--- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800 ++++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800 @@ -40,7 +40,7 @@ ln = fp.readline() while ln: @@ -162,8 +173,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p fp.close() diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py ---- ./jieba/posseg/__init__.py 2014-11-07 23:07:03.047210402 +0800 -+++ ../jieba/jieba/posseg/__init__.py 2014-11-07 23:19:40.883194772 +0800 +--- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800 ++++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800 @@ -1,4 +1,3 @@ - import re @@ -188,27 +199,41 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p if not isJython: return result -@@ -46,7 +45,7 @@ - - state = {} - abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P) -- with open(abs_path, 'r') as f: -+ with open(abs_path, 'rb') as f: - state = marshal.load(f) - f.closed - +@@ -105,8 +104,8 @@ + yield pair(sentence[next:], pos_list[next][1]) + + def __cut_detail(sentence): +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)") +- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+") ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)") ++ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") + blocks = re_han.split(sentence) + for blk in blocks: + if re_han.match(blk): +@@ -130,7 +129,7 @@ + x = 0 + N = len(sentence) + buf = '' +- re_eng = re.compile(r'[a-zA-Z0-9]',re.U) ++ re_eng = re.compile('[a-zA-Z0-9]',re.U) + while x < N: + y = route[x][1]+1 + l_word = sentence[x:y] +@@ -195,8 +194,8 @@ + sentence = sentence.decode('utf-8') + except UnicodeDecodeError: + sentence = sentence.decode('gbk', 'ignore') +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)") +- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+") ++ re_han, 
re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)") ++ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") + blocks = re_han.split(sentence) + if HMM: + __cut_blk = __cut_DAG diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py ---- ./jieba/posseg/viterbi.py 2014-11-07 23:07:03.079210402 +0800 -+++ ../jieba/jieba/posseg/viterbi.py 2014-11-07 23:07:02.095210422 +0800 -@@ -3,14 +3,13 @@ - MIN_INF = float("-inf") - - def get_top_states(t_state_v, K=4): -- items = list(t_state_v.items()) -- topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K] -+ topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K] - return [x[0] for x in topK] - +--- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800 ++++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800 +@@ -8,7 +8,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p): V = [{}] #tabular mem_path = [{}] @@ -217,7 +242,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p for y in states.get(obs[0], all_states): #init V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT) mem_path[0][y] = '' -@@ -18,9 +17,9 @@ +@@ -16,9 +16,9 @@ V.append({}) mem_path.append({}) #prev_states = get_top_states(V[t-1]) @@ -229,7 +254,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next if not obs_states: -@@ -31,7 +30,7 @@ +@@ -29,7 +29,7 @@ V[t][y] = prob mem_path[t][y] = state @@ -239,8 +264,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p #print obs prob, state = max(last) diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md ---- ./README.md 2014-11-07 23:07:02.067210423 +0800 -+++ ../jieba/README.md 2014-11-07 23:24:49.263188412 +0800 +--- ./README.md 2014-11-29 15:46:08.487925926 +0800 ++++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800 @@ -4,6 +4,9 @@ "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module. 
- _Scroll down for English documentation._ @@ -348,18 +373,65 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p ``` diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py ---- ./setup.py 2014-11-07 23:07:02.067210423 +0800 -+++ ../jieba/setup.py 2014-11-07 23:07:02.095210422 +0800 -@@ -1,5 +1,5 @@ - from distutils.core import setup +--- ./setup.py 2014-11-29 15:46:46.379925565 +0800 ++++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800 +@@ -11,7 +11,7 @@ + + 完整文档见 ``README.md`` + +-GitHub: https://github.com/fxsjy/jieba ++GitHub: https://github.com/fxsjy/jieba/tree/jieba3k + + 特点 + ==== +@@ -34,17 +34,11 @@ + Python 2.x + ---------- + +-- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba`` +-- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行 +- python setup.py install +-- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录 +-- 通过 ``import jieba`` 来引用 ++见 https://pypi.python.org/pypi/jieba/ + + Python 3.x + ---------- + +-见 https://pypi.python.org/pypi/jieba3k/ +- + - 目前 master 分支是只支持 Python 2.x 的 + - Python 3.x 版本的分支也已经基本可用: + https://github.com/fxsjy/jieba/tree/jieba3k +@@ -59,13 +53,13 @@ + + """ + -setup(name='jieba', +setup(name='jieba3k', - version='0.35', + version='0.35.1', description='Chinese Words Segementation Utilities', + long_description=LONGDOC, author='Sun, Junyi', + author_email='ccnusjy@gmail.com', +- url='https://github.com/fxsjy/jieba', ++ url='https://github.com/fxsjy/jieba/tree/jieba3k', + license="MIT", + classifiers=[ + 'Intended Audience :: Developers', +@@ -73,9 +67,8 @@ + 'Operating System :: OS Independent', + 'Natural Language :: Chinese (Simplified)', + 'Natural Language :: Chinese (Traditional)', + 'Programming Language :: Python', +- 'Programming Language :: Python :: 2', ++ 'Programming Language :: Python :: 3', + 'Topic :: Text Processing', + 'Topic :: Text Processing :: Indexing', + 'Topic :: Text Processing :: Linguistic', diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py ---- ./test/extract_topic.py 2014-11-07 23:07:03.707210389 +0800 -+++ ../jieba/test/extract_topic.py 2014-11-07 23:07:02.095210422 +0800 +--- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800 ++++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800 @@ -51,13 +51,13 @@ print("training...") @@ -379,8 +451,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p + for i in topic.argsort()[:-n_top_words - 1:-1]])) print("") diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py ---- ./test/jiebacmd.py 2014-11-07 23:07:03.211210399 +0800 -+++ ../jieba/test/jiebacmd.py 2014-11-07 23:07:02.099210422 +0800 +--- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800 ++++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800 @@ -23,6 +23,6 @@ break line = line.strip() @@ -390,9 +462,9 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py ---- ./test/jieba_test.py 2014-11-07 23:07:03.947210384 +0800 -+++ ../jieba/test/jieba_test.py 2014-11-07 23:07:02.099210422 +0800 -@@ -1,5 +1,6 @@ +--- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800 ++++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800 +@@ -152,7 
+152,7 @@ #-*-coding: utf-8 -*- import sys +import imp @@ -417,7 +489,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error" result = list(result) assert isinstance(result, list), "Test Tokenize error on content: %s" % content -@@ -180,7 +181,7 @@ +@@ -181,7 +181,7 @@ def testTokenize_NOHMM(self): for content in test_contents: @@ -427,8 +499,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p result = list(result) assert isinstance(result, list), "Test Tokenize error on content: %s" % content diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py ---- ./test/test_tokenize_no_hmm.py 2014-11-07 23:07:04.031210382 +0800 -+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-07 23:07:02.099210422 +0800 +--- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800 ++++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800 @@ -7,7 +7,6 @@ def cuttest(test_sent): @@ -438,8 +510,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p for tk in result: print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py ---- ./test/test_tokenize.py 2014-11-07 23:07:04.071210381 +0800 -+++ ../jieba/test/test_tokenize.py 2014-11-07 23:07:02.099210422 +0800 +--- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800 ++++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800 @@ -7,7 +7,6 @@ def cuttest(test_sent):
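
Usage sketch for the allowPOS filtering this patch adds to extract_tags() and textrank() in jieba.analyse. This is a minimal illustration only, assuming the patched jieba3k package is importable; the sample sentence, topK values, and print loops are illustrative and not part of the change:

    # -*- coding: utf-8 -*-
    # Illustrative only: exercises the new allowPOS parameter introduced by this patch.
    import jieba.analyse

    sentence = "我爱北京天安门,天安门上太阳升。"  # sample text, not taken from the patch

    # TF-IDF keywords restricted to the given POS tags;
    # with withWeight=True the result is a list of (word, weight) pairs.
    for word, weight in jieba.analyse.extract_tags(
            sentence, topK=5, withWeight=True, allowPOS=['ns', 'n', 'vn', 'v']):
        print(word, weight)

    # TextRank keywords with the same POS filter; withWeight defaults to False,
    # so only the words are returned.
    for word in jieba.analyse.textrank(sentence, topK=5, allowPOS=['ns', 'n', 'vn', 'v']):
        print(word)

When allowPOS is non-empty, both functions segment with jieba.posseg.cut and drop any word whose POS flag is not in the list, as introduced in the jieba/analyse hunks above.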