diff --git a/Changelog b/Changelog index 671add0..d1628f1 100644 --- a/Changelog +++ b/Changelog @@ -1,8 +1,11 @@ +2014-11-15: version 0.35.1 +1) fix Python 3.2的兼容性问题 + 2014-11-13: version 0.35 -1. 改进词典cache的dump和加载机制;by @gumblex -2. 提升关键词提取的性能; by @gumblex -3. 关键词提取新增基于textrank算法的子模块; by @singlee -4. 修复自定义stopwords功能的bug; by @walkskyer +1) 改进词典cache的dump和加载机制;by @gumblex +2)提升关键词提取的性能; by @gumblex +3)关键词提取新增基于textrank算法的子模块; by @singlee +4)修复自定义stopwords功能的bug; by @walkskyer 2014-10-20: version 0.34 diff --git a/jieba/__init__.py b/jieba/__init__.py index e507111..37e2e62 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -242,9 +242,9 @@ def cut(sentence, cut_all=False, HMM=True): # \r\n|\s : whitespace characters. Will not be handled. if cut_all: - re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U) + re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U) else: - re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U) + re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U) blocks = re_han.split(sentence) if cut_all: cut_block = __cut_all diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index 752204e..94d0f49 100644 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -54,7 +54,7 @@ def set_stop_words(stop_words_path): if not os.path.exists(abs_path): raise Exception("jieba: path does not exist: " + abs_path) content = open(abs_path,'rb').read().decode('utf-8') - lines = content.replace("\r", "").split('\n') + lines = content.replace("\r","").split('\n') for line in lines: STOP_WORDS.add(line) diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py index a7694fc..5e676ad 100644 --- a/jieba/finalseg/__init__.py +++ b/jieba/finalseg/__init__.py @@ -88,7 +88,7 @@ def cut(sentence): sentence = sentence.decode('utf-8') except UnicodeDecodeError: sentence = sentence.decode('gbk', 'ignore') - re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)") + re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)") blocks = re_han.split(sentence) for blk in blocks: if re_han.match(blk): diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 52e3382..865a07d 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -104,8 +104,8 @@ def __cut(sentence): yield pair(sentence[next:], pos_list[next][1]) def __cut_detail(sentence): - re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)") - re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+") + re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)") + re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") blocks = re_han.split(sentence) for blk in blocks: if re_han.match(blk): @@ -129,7 +129,7 @@ def __cut_DAG_NO_HMM(sentence): x = 0 N = len(sentence) buf = '' - re_eng = re.compile(r'[a-zA-Z0-9]',re.U) + re_eng = re.compile('[a-zA-Z0-9]',re.U) while x < N: y = route[x][1]+1 l_word = sentence[x:y] @@ -194,8 +194,8 @@ def __cut_internal(sentence, HMM=True): sentence = sentence.decode('utf-8') except UnicodeDecodeError: sentence = sentence.decode('gbk', 'ignore') - re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)") - re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+") + re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)") + re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") blocks = re_han.split(sentence) if HMM: __cut_blk = __cut_DAG diff --git a/setup.py b/setup.py index 3e6d860..57a8421 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from distutils.core import setup setup(name='jieba3k', - version='0.35', + version='0.35.1', description='Chinese Words Segementation Utilities', author='Sun, Junyi', author_email='ccnusjy@gmail.com',