From 7b7c6955a9119734ac70f902129b7fbfd1f0b660 Mon Sep 17 00:00:00 2001
From: Dingyuan Wang
Date: Sat, 29 Nov 2014 15:33:42 +0800
Subject: [PATCH] complete the setup.py, fix #202 problem in posseg

---
 jieba/__init__.py        |  5 +--
 jieba/posseg/__init__.py |  6 ++--
 jieba/posseg/viterbi.py  |  6 ++--
 setup.py                 | 78 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/jieba/__init__.py b/jieba/__init__.py
index 4e46fa3..27c9dc2 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -1,5 +1,5 @@
 from __future__ import with_statement
-__version__ = '0.34'
+__version__ = '0.35'
 __license__ = 'MIT'
 
 import re
@@ -78,7 +78,8 @@ def initialize(dictionary=None):
         if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
             logger.debug("Loading model from cache %s" % cache_file)
             try:
-                pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
+                with open(cache_file, 'rb') as cf:
+                    pfdict,FREQ,total,min_freq = marshal.load(cf)
                 # prevent conflict with old version
                 load_from_cache_fail = not isinstance(pfdict, set)
             except:
diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py
index 30160d4..484874d 100644
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -46,7 +46,7 @@ def load_model(f_name, isJython=True):
     state = {}
     abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
         state = marshal.load(f)
     f.closed
@@ -126,7 +126,7 @@ def __cut_detail(sentence):
 def __cut_DAG_NO_HMM(sentence):
     DAG = jieba.get_DAG(sentence)
     route = {}
-    jieba.calc(sentence, DAG, 0, route=route)
+    jieba.calc(sentence, DAG, route)
     x = 0
     N = len(sentence)
     buf = u''
@@ -151,7 +151,7 @@ def __cut_DAG(sentence):
 
     DAG = jieba.get_DAG(sentence)
     route = {}
-    jieba.calc(sentence,DAG,0,route=route)
+    jieba.calc(sentence, DAG, route)
     x = 0
     buf = u''
diff --git a/jieba/posseg/viterbi.py b/jieba/posseg/viterbi.py
index 0130f5b..5a643fb 100644
--- a/jieba/posseg/viterbi.py
+++ b/jieba/posseg/viterbi.py
@@ -3,9 +3,7 @@ MIN_FLOAT = -3.14e100
 MIN_INF = float("-inf")
 
 def get_top_states(t_state_v, K=4):
-    items = t_state_v.items()
-    topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
-    return [x[0] for x in topK]
+    return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]
 
 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
@@ -27,7 +25,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
         obs_states = prev_states_expect_next if prev_states_expect_next else all_states
 
         for y in obs_states:
-            prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states])
+            prob, state = max((V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states)
 
             V[t][y] = prob
             mem_path[t][y] = state
diff --git a/setup.py b/setup.py
index 3e25168..19200f2 100644
--- a/setup.py
+++ b/setup.py
@@ -1,10 +1,86 @@
+# -*- coding: utf-8 -*-
 from distutils.core import setup
+LONGDOC = u"""
+jieba
+=====
+
+"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to
+be the best Python Chinese word segmentation module.
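+
+A minimal usage sketch of the three segmentation modes described under
+Features below (the sample sentence is illustrative; ``cut`` and
+``cut_for_search`` return generators of unicode words):
+
+.. code:: python
+
+    # -*- coding: utf-8 -*-
+    import jieba
+
+    # Accurate mode (the default): the most precise segmentation.
+    print u"/".join(jieba.cut(u"我来到北京清华大学"))
+
+    # Full mode: scan out every fragment that can form a word.
+    print u"/".join(jieba.cut(u"我来到北京清华大学", cut_all=True))
+
+    # Search-engine mode: re-cut long words to improve recall.
+    print u"/".join(jieba.cut_for_search(u"我来到北京清华大学"))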
+
+Full documentation: see ``README.md``
+
+GitHub: https://github.com/fxsjy/jieba
+
+Features
+========
+
+- Three segmentation modes are supported:
+
+  - accurate mode, which tries to cut the sentence into the most
+    precise segmentation; suitable for text analysis;
+  - full mode, which scans out every fragment of the sentence that
+    could form a word; very fast, but unable to resolve ambiguity;
+  - search-engine mode, which re-cuts the long words produced by
+    accurate mode to improve recall; suitable for search-engine
+    tokenization.
+
+- Traditional Chinese text can be segmented
+- Custom dictionaries are supported
+
+Online demo: http://jiebademo.ap01.aws.af.cm/
+
+Installation
+============
+
+Python 2.x
+----------
+
+- Fully automatic: ``easy_install jieba`` or ``pip install jieba``
+- Semi-automatic: download https://pypi.python.org/pypi/jieba/ ,
+  extract it, then run ``python setup.py install``
+- Manual: place the ``jieba`` directory in the current directory or
+  in ``site-packages``
+- Then use it via ``import jieba``
+
+Python 3.x
+----------
+
+See https://pypi.python.org/pypi/jieba3k/
+
+- The master branch currently supports Python 2.x only
+- The Python 3.x branch is also essentially usable:
+  https://github.com/fxsjy/jieba/tree/jieba3k
+
+.. code:: bash
+
+    git clone https://github.com/fxsjy/jieba.git
+    cd jieba
+    git checkout jieba3k
+    python setup.py install
+
+- Or install with pip3: ``pip3 install jieba3k``
+
+"""
+
 setup(name='jieba',
       version='0.35',
       description='Chinese Words Segementation Utilities',
+      long_description=LONGDOC,
       author='Sun, Junyi',
       author_email='ccnusjy@gmail.com',
-      url='http://github.com/fxsjy',
+      url='https://github.com/fxsjy/jieba',
+      license="MIT",
+      classifiers=[
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+        'Natural Language :: Chinese (Simplified)',
+        'Natural Language :: Chinese (Traditional)',
+        'Programming Language :: Cython',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Topic :: Text Processing',
+        'Topic :: Text Processing :: Indexing',
+        'Topic :: Text Processing :: Linguistic',
+      ],
+      keywords='NLP,tokenizing,Chinese word segmentation',
       packages=['jieba'],
       package_dir={'jieba':'jieba'},
       package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
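
A quick check of the posseg path fixed above (a sketch, not part of the
patch; the sample sentence is illustrative):

.. code:: python

    # -*- coding: utf-8 -*-
    import jieba.posseg as pseg

    # pseg.cut yields pair objects with .word (the token) and .flag
    # (its part-of-speech tag).
    for w in pseg.cut(u"我爱北京天安门"):
        print u"%s/%s" % (w.word, w.flag)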