update jieba3k

pull/204/head
Dingyuan Wang
parent 6b0da06481
commit c6b386f65b

.gitignore (vendored): 6 lines changed

@@ -113,8 +113,10 @@ Generated_Code #added for RIA/Silverlight projects
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
+############
+## pycharm
+############
+.idea
############
## Windows

@@ -1,11 +1,11 @@
2014-11-15: version 0.35.1
-1) fix Python 3.2的兼容性问题
+1. 修复 Python 3.2 的兼容性问题

2014-11-13: version 0.35
-1) 改进词典cache的dump和加载机制by @gumblex
-2) 提升关键词提取的性能; by @gumblex
-3) 关键词提取新增基于textrank算法的子模块; by @singlee
-4) 修复自定义stopwords功能的bug; by @walkskyer
+1. 改进词典cache的dump和加载机制by @gumblex
+2. 提升关键词提取的性能; by @gumblex
+3. 关键词提取新增基于textrank算法的子模块; by @singlee
+4. 修复自定义stopwords功能的bug; by @walkskyer

2014-10-20: version 0.34

@@ -1,4 +1,4 @@
-__version__ = '0.34'
+__version__ = '0.35'
__license__ = 'MIT'
import re
@@ -135,7 +135,7 @@ def __cut_all(sentence):
        old_j = j

-def calc(sentence, DAG, idx, route):
+def calc(sentence, DAG, route):
    N = len(sentence)
    route[N] = (0.0, '')
    for idx in range(N-1, -1, -1):
@@ -164,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence):
    re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
    DAG = get_DAG(sentence)
    route = {}
-    calc(sentence, DAG, 0, route)
+    calc(sentence, DAG, route)
    x = 0
    N = len(sentence)
    buf = ''
@@ -187,7 +187,7 @@ def __cut_DAG_NO_HMM(sentence):
def __cut_DAG(sentence):
    DAG = get_DAG(sentence)
    route = {}
-    calc(sentence, DAG, 0, route=route)
+    calc(sentence, DAG, route=route)
    x = 0
    buf = ''
    N = len(sentence)
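Note on the change above: `calc` no longer takes the unused `idx` parameter, so callers now pass only the sentence, its DAG and the route dict. A minimal sketch of the updated call chain, using the module-level helpers `get_DAG` and `calc` as they exist in this branch; the sample sentence is illustrative and not part of the commit:

```python
# Rough sketch against jieba3k 0.35-style internals (not part of the commit).
import jieba

jieba.initialize()                 # make sure the dictionary/trie is loaded
sentence = "我来到北京清华大学"
DAG = jieba.get_DAG(sentence)      # {start index: [candidate word end indices]}
route = {}
jieba.calc(sentence, DAG, route)   # new signature: the old idx argument is gone
# route[i] holds (log probability of the best path from i, end index of the chosen word)
print(route[0])
```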

@@ -1,5 +1,6 @@
#encoding=utf-8
import jieba
+import jieba.posseg
import os
from operator import itemgetter
try:
@@ -54,25 +55,36 @@ def set_stop_words(stop_words_path):
    if not os.path.exists(abs_path):
        raise Exception("jieba: path does not exist: " + abs_path)
    content = open(abs_path,'rb').read().decode('utf-8')
-    lines = content.replace("\r","").split('\n')
+    lines = content.replace("\r", "").split('\n')
    for line in lines:
        STOP_WORDS.add(line)

-def extract_tags(sentence, topK=20, withWeight=False):
+def extract_tags(sentence, topK=20, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
    """
    Extract keywords from sentence using TF-IDF algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
+        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
+                    if the POS of w is not in this list,it will be filtered.
    """
    global STOP_WORDS, idf_loader
    idf_freq, median_idf = idf_loader.get_idf()
-    words = jieba.cut(sentence)
+    if allowPOS:
+        allowPOS = frozenset(allowPOS)
+        words = jieba.posseg.cut(sentence)
+    else:
+        words = jieba.cut(sentence)
    freq = {}
    for w in words:
+        if allowPOS:
+            if w.flag not in allowPOS:
+                continue
+            else:
+                w = w.word
        if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
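The `extract_tags` hunk above adds POS filtering to TF-IDF keyword extraction: with `allowPOS` set, tokens come from `jieba.posseg.cut` and only the listed tags are counted. A hedged usage sketch (the sentence is illustrative; defaults follow the signature shown above):

```python
# Sketch of the new allowPOS parameter for TF-IDF extraction.
import jieba.analyse

text = "吉林欧亚置业有限公司增资4.3亿元,主要是扩大一个城市商业综合体项目"
print(jieba.analyse.extract_tags(text, topK=5))                    # POS-filtered by the default ['ns', 'n', 'vn', 'v']
print(jieba.analyse.extract_tags(text, topK=5, withWeight=True))   # list of (word, tf-idf weight) pairs
print(jieba.analyse.extract_tags(text, topK=5, allowPOS=[]))       # empty list: plain jieba.cut, no POS filter
```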

@@ -48,15 +48,17 @@ class UndirectWeightedGraph:
        return ws

-def textrank(sentence, topK=10, withWeight=False):
+def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
    """
    Extract keywords from sentence using TextRank algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
+        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
+                    if the POS of w is not in this list,it will be filtered.
    """
-    pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
+    pos_filt = frozenset(allowPOS)
    g = UndirectWeightedGraph()
    cm = collections.defaultdict(int)
    span = 5
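Same idea for TextRank: `allowPOS` replaces the hard-coded `pos_filt` set, so the POS whitelist feeding the co-occurrence graph (window `span = 5`) is now a parameter. A sketch, importing the function by its module path as present in this branch; the text is illustrative:

```python
# Sketch of textrank() with the new allowPOS parameter.
from jieba.analyse.textrank import textrank

text = "线程是程序执行时的最小单位,它是进程的一个执行流"
print(textrank(text, topK=5))                          # default whitelist ['ns', 'n', 'vn', 'v']
print(textrank(text, topK=5, allowPOS=['n', 'vn']))    # only nouns and verbal nouns enter the graph
print(textrank(text, topK=5, withWeight=True))         # (word, normalized rank) pairs
```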

@@ -125,7 +125,7 @@ def __cut_detail(sentence):
def __cut_DAG_NO_HMM(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
-    jieba.calc(sentence, DAG, 0, route=route)
+    jieba.calc(sentence, DAG, route)
    x = 0
    N = len(sentence)
    buf = ''
@@ -150,7 +150,7 @@ def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
-    jieba.calc(sentence,DAG,0,route=route)
+    jieba.calc(sentence, DAG, route)
    x = 0
    buf = ''

@@ -3,8 +3,7 @@ MIN_FLOAT = -3.14e100
MIN_INF = float("-inf")

def get_top_states(t_state_v, K=4):
-    topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K]
-    return [x[0] for x in topK]
+    return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]

def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}] #tabular
@@ -26,7 +25,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
        obs_states = prev_states_expect_next if prev_states_expect_next else all_states
        for y in obs_states:
-            prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states])
+            prob, state = max((V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states)
            V[t][y] = prob
            mem_path[t][y] = state
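The `get_top_states` rewrite sorts the state keys directly by their scores instead of sorting `(key, value)` pairs and stripping the values, and the `max(...)` call drops its intermediate list. A small self-contained check of both equivalences (sample scores are made up):

```python
# Illustrative only; t_state_v stands in for the per-step Viterbi score dict.
import operator

t_state_v = {'B': -3.14, 'E': -0.26, 'M': -1.84, 'S': -0.72}

# old form: sort items by value, then keep just the keys
top_old = [k for k, v in sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:2]]
# new form: sort the keys, using the dict's own lookup as the sort key
top_new = sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:2]
assert top_old == top_new == ['E', 'S']

# max() accepts a generator expression directly, so the wrapping list is unnecessary
assert max([v for v in t_state_v.values()]) == max(v for v in t_state_v.values())
```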

@@ -1,10 +1,79 @@
+# -*- coding: utf-8 -*-
from distutils.core import setup
+LONGDOC = """
+jieba
+=====
+
+"结巴"中文分词:做最好的 Python 中文分词组件
+
+"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to
+be the best Python Chinese word segmentation module.
+
+完整文档见 ``README.md``
+
+GitHub: https://github.com/fxsjy/jieba/tree/jieba3k
+
+特点
+====
+
+-  支持三种分词模式:
+
+   -  精确模式,试图将句子最精确地切开,适合文本分析;
+   -  全模式,把句子中所有的可以成词的词语都扫描出来,
+      速度非常快,但是不能解决歧义;
+   -  搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
+
+-  支持繁体分词
+-  支持自定义词典
+
+在线演示: http://jiebademo.ap01.aws.af.cm/
+
+安装说明
+========
+
+Python 2.x
+----------
+
+见 https://pypi.python.org/pypi/jieba/
+
+Python 3.x
+----------
+
+-  目前 master 分支是只支持 Python 2.x 的
+-  Python 3.x 版本的分支也已经基本可用:
+   https://github.com/fxsjy/jieba/tree/jieba3k
+
+   .. code:: bash
+
+       git clone https://github.com/fxsjy/jieba.git
+       git checkout jieba3k
+       python setup.py install
+
+-  或使用pip3安装: pip3 install jieba3k
+"""
+
setup(name='jieba3k',
      version='0.35.1',
      description='Chinese Words Segementation Utilities',
+      long_description=LONGDOC,
      author='Sun, Junyi',
      author_email='ccnusjy@gmail.com',
-      url='http://github.com/fxsjy',
+      url='https://github.com/fxsjy/jieba/tree/jieba3k',
+      license="MIT",
+      classifiers=[
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+        'Natural Language :: Chinese (Simplified)',
+        'Natural Language :: Chinese (Traditional)',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3',
+        'Topic :: Text Processing',
+        'Topic :: Text Processing :: Indexing',
+        'Topic :: Text Processing :: Linguistic',
+      ],
+      keywords='NLP,tokenizing,Chinese word segementation',
      packages=['jieba'],
      package_dir={'jieba':'jieba'},
      package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}

@@ -1,6 +1,6 @@
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
---- ./jieba/analyse/analyzer.py 2014-11-07 23:07:02.779210408 +0800
+--- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800
-+++ ../jieba/jieba/analyse/analyzer.py 2014-11-07 23:07:02.079210422 +0800
++++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
@@ -8,9 +8,9 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
---- ./jieba/analyse/__init__.py 2014-11-07 23:07:02.879210406 +0800
+--- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800
-+++ ../jieba/jieba/analyse/__init__.py 2014-11-07 23:16:27.171198767 +0800
++++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800
-@@ -25,7 +25,7 @@
+@@ -26,7 +26,7 @@
def set_new_path(self, new_idf_path):
if self.path != new_idf_path:
@@ -19,7 +19,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
idf_freq = {}
lines = content.rstrip('\n').split('\n')
for line in lines:
-@@ -81,7 +81,7 @@
+@@ -93,7 +93,7 @@
freq[k] *= idf_freq.get(k, median_idf) / total
if withWeight:
@@ -29,8 +29,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
---- ./jieba/analyse/textrank.py 2014-11-07 23:07:02.827210407 +0800
+--- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800
-+++ ../jieba/jieba/analyse/textrank.py 2014-11-07 23:18:22.059196398 +0800
++++ ../jieba/jieba/analyse/textrank.py 2014-11-29 15:36:39.291931354 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
@@ -61,7 +61,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
-@@ -70,12 +70,12 @@
+@@ -72,12 +72,12 @@
continue
cm[(words[i].word, words[j].word)] += 1
@@ -77,19 +77,28 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
---- ./jieba/finalseg/__init__.py 2014-11-07 23:07:03.147210400 +0800
+--- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800
-+++ ../jieba/jieba/finalseg/__init__.py 2014-11-07 23:18:43.495195956 +0800
++++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,3 @@
-
import re
import os
import marshal
@@ -89,7 +88,7 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
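The hunk added above (and several like it below) drops the `r` prefix from patterns containing `\u4E00-\u9FA5`: in a plain Python 3 string the `\uXXXX` escape becomes the character itself, while escapes such as `\d` and `\.` pass through to the regex engine unchanged, so the compiled patterns behave the same (newer Pythons do warn about `\d` in non-raw literals). A quick check, reusing the patterns from the hunk:

```python
# Sketch: raw vs non-raw versions of the patterns compile to equivalent regexes.
import re

re_han_raw = re.compile(r"([\u4E00-\u9FA5]+)")
re_han_new = re.compile("([\u4E00-\u9FA5]+)")   # \u4E00 is resolved by the str literal instead
assert re_han_raw.findall("结巴分词 jieba 0.35") == re_han_new.findall("结巴分词 jieba 0.35")

re_skip_raw = re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
re_skip_new = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
assert re_skip_raw.findall("jieba 0.35") == re_skip_new.findall("jieba 0.35")
```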
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
---- ./jieba/__init__.py 2014-11-07 23:07:02.751210408 +0800
+--- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800
-+++ ../jieba/jieba/__init__.py 2014-11-07 23:22:34.963191182 +0800
++++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800
@@ -1,4 +1,3 @@
-
-__version__ = '0.34'
+__version__ = '0.35'
__license__ = 'MIT'
@@ -51,7 +50,7 @@
@@ -101,17 +110,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
return pfdict, lfreq, ltotal
def initialize(dictionary=None):
-@@ -78,7 +77,8 @@
+@@ -229,11 +228,11 @@
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
logger.debug("Loading model from cache %s" % cache_file)
try:
- pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
+ with open(cache_file, 'rb') as cf:
+ pfdict,FREQ,total,min_freq = marshal.load(cf)
# prevent conflict with old version
load_from_cache_fail = not isinstance(pfdict, set)
except:
@@ -228,11 +228,11 @@
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
@@ -125,7 +124,19 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
@@ -338,8 +338,6 @@ @@ -243,9 +242,9 @@
# \r\n|\s : whitespace characters. Will not be handled.
if cut_all:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence)
if cut_all:
cut_block = __cut_all
@@ -339,8 +338,6 @@
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
@@ -134,7 +145,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
-@@ -392,12 +390,12 @@
+@@ -393,12 +390,12 @@
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
@@ -150,8 +161,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
---- ./jieba/__main__.py 2014-11-07 23:07:02.563210412 +0800
+--- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800
-+++ ../jieba/jieba/__main__.py 2014-11-07 23:07:02.079210422 +0800
++++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800
@@ -40,7 +40,7 @@
ln = fp.readline()
while ln:
@@ -162,8 +173,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
---- ./jieba/posseg/__init__.py 2014-11-07 23:07:03.047210402 +0800
+--- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800
-+++ ../jieba/jieba/posseg/__init__.py 2014-11-07 23:19:40.883194772 +0800
++++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800
@@ -1,4 +1,3 @@
-
import re
@@ -188,27 +199,41 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
if not isJython:
return result
-@@ -46,7 +45,7 @@
+@@ -105,8 +104,8 @@
yield pair(sentence[next:], pos_list[next][1])
state = {}
-abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
+def __cut_detail(sentence):
-- with open(abs_path, 'r') as f:
+- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
-+ with open(abs_path, 'rb') as f:
+- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
-state = marshal.load(f)
++ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
-f.closed
++ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
@@ -130,7 +129,7 @@
x = 0
N = len(sentence)
buf = ''
- re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
+ re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N:
y = route[x][1]+1
l_word = sentence[x:y]
@@ -195,8 +194,8 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
if HMM:
__cut_blk = __cut_DAG
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
---- ./jieba/posseg/viterbi.py 2014-11-07 23:07:03.079210402 +0800
+--- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800
-+++ ../jieba/jieba/posseg/viterbi.py 2014-11-07 23:07:02.095210422 +0800
++++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800
-@@ -3,14 +3,13 @@
+@@ -8,7 +8,7 @@
MIN_INF = float("-inf")
def get_top_states(t_state_v, K=4):
- items = list(t_state_v.items())
- topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
+ topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K]
return [x[0] for x in topK]
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
mem_path = [{}]
@@ -217,7 +242,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = ''
-@@ -18,9 +17,9 @@
+@@ -16,9 +16,9 @@
V.append({})
mem_path.append({})
#prev_states = get_top_states(V[t-1])
@@ -229,7 +254,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
if not obs_states:
-@@ -31,7 +30,7 @@
+@@ -29,7 +29,7 @@
V[t][y] = prob
mem_path[t][y] = state
@@ -239,8 +264,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
#print obs
prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
---- ./README.md 2014-11-07 23:07:02.067210423 +0800
+--- ./README.md 2014-11-29 15:46:08.487925926 +0800
-+++ ../jieba/README.md 2014-11-07 23:24:49.263188412 +0800
++++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800
@@ -4,6 +4,9 @@
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
- _Scroll down for English documentation._
@@ -348,18 +373,65 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
---- ./setup.py 2014-11-07 23:07:02.067210423 +0800
+--- ./setup.py 2014-11-29 15:46:46.379925565 +0800
-+++ ../jieba/setup.py 2014-11-07 23:07:02.095210422 +0800
++++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800
-@@ -1,5 +1,5 @@
+@@ -11,7 +11,7 @@
from distutils.core import setup
完整文档见 ``README.md``
-GitHub: https://github.com/fxsjy/jieba
+GitHub: https://github.com/fxsjy/jieba/tree/jieba3k
特点
====
@@ -34,17 +34,11 @@
Python 2.x
----------
-- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba``
-- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行
- python setup.py install
-- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录
-- 通过 ``import jieba`` 来引用
+见 https://pypi.python.org/pypi/jieba/
Python 3.x
----------
-见 https://pypi.python.org/pypi/jieba3k/
-
- 目前 master 分支是只支持 Python 2.x 的
- Python 3.x 版本的分支也已经基本可用:
https://github.com/fxsjy/jieba/tree/jieba3k
@@ -59,13 +53,13 @@
"""
-setup(name='jieba',
+setup(name='jieba3k',
-      version='0.35',
+      version='0.35.1',
description='Chinese Words Segementation Utilities',
long_description=LONGDOC,
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',
- url='https://github.com/fxsjy/jieba',
+ url='https://github.com/fxsjy/jieba/tree/jieba3k',
license="MIT",
classifiers=[
'Intended Audience :: Developers',
@@ -73,9 +67,8 @@
'Operating System :: OS Independent',
'Natural Language :: Chinese (Simplified)',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python',
- 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 3',
'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
---- ./test/extract_topic.py 2014-11-07 23:07:03.707210389 +0800
+--- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800
-+++ ../jieba/test/extract_topic.py 2014-11-07 23:07:02.095210422 +0800
++++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800
@@ -51,13 +51,13 @@
print("training...")
@@ -379,8 +451,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
+ for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
---- ./test/jiebacmd.py 2014-11-07 23:07:03.211210399 +0800
+--- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800
-+++ ../jieba/test/jiebacmd.py 2014-11-07 23:07:02.099210422 +0800
++++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800
@@ -23,6 +23,6 @@
break
line = line.strip()
@@ -390,9 +462,9 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
---- ./test/jieba_test.py 2014-11-07 23:07:03.947210384 +0800
+--- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800
-+++ ../jieba/test/jieba_test.py 2014-11-07 23:07:02.099210422 +0800
++++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800
@@ -1,5 +1,6 @@ @@ -152,7 +152,7 @@
#-*-coding: utf-8 -*-
import sys
+import imp
@@ -417,7 +489,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
-@@ -180,7 +181,7 @@
+@@ -181,7 +181,7 @@
def testTokenize_NOHMM(self):
for content in test_contents:
@@ -427,8 +499,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
---- ./test/test_tokenize_no_hmm.py 2014-11-07 23:07:04.031210382 +0800
+--- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800
-+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-07 23:07:02.099210422 +0800
++++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
@ -438,8 +510,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
---- ./test/test_tokenize.py 2014-11-07 23:07:04.071210381 +0800
+--- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800
-+++ ../jieba/test/test_tokenize.py 2014-11-07 23:07:02.099210422 +0800
++++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
