jieba/test/2to3.diff


diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
--- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800
+++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
--- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800
+++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800
@@ -26,7 +26,7 @@
def set_new_path(self, new_idf_path):
if self.path != new_idf_path:
- content = open(new_idf_path, 'rb').read().decode('utf-8')
+ content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.rstrip('\n').split('\n')
for line in lines:
@@ -93,7 +93,7 @@
freq[k] *= idf_freq.get(k, median_idf) / total
if withWeight:
- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
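
The two hunks above follow a common 2-to-3 pattern: Python 3's open() decodes for you when given an encoding argument, and dict.items() returns a view that sorted() accepts directly. A minimal sketch, assuming an idf file with one "word value" pair per line (load_idf and the 'idf.txt' path are hypothetical, not jieba's API):

    from operator import itemgetter

    def load_idf(path):
        # encoding='utf-8' replaces the old read-bytes-then-decode step
        idf_freq = {}
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                word, value = line.split(' ')
                idf_freq[word] = float(value)
        return idf_freq

    idf_freq = load_idf('idf.txt')   # hypothetical path
    # items() is a view; no list() wrapper is needed before sorting
    tags = sorted(idf_freq.items(), key=itemgetter(1), reverse=True)
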
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
--- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800
+++ ../jieba/jieba/analyse/textrank.py 2014-11-29 15:36:39.291931354 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
@@ -22,12 +22,12 @@
outSum = collections.defaultdict(float)
wsdef = 1.0 / len(self.graph)
- for n, out in list(self.graph.items()):
+ for n, out in self.graph.items():
ws[n] = wsdef
outSum[n] = sum((e[2] for e in out), 0.0)
for x in range(10): # 10 iters
- for n, inedges in list(self.graph.items()):
+ for n, inedges in self.graph.items():
s = 0
for e in inedges:
s += e[2] / outSum[e[1]] * ws[e[1]]
@@ -41,7 +41,7 @@
elif w > max_rank:
max_rank = w
- for n, w in list(ws.items()):
+ for n, w in ws.items():
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
@@ -72,12 +72,12 @@
continue
cm[(words[i].word, words[j].word)] += 1
- for terms, w in list(cm.items()):
+ for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
if withWeight:
- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
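
The list() wrappers dropped above were only ever defensive: in Python 3 a dict view can be iterated directly as long as no keys are added or removed during the loop, and assigning to existing keys is safe. A tiny sketch with a made-up graph, illustrative only:

    graph = {'a': [('a', 'b', 1.0)], 'b': [('b', 'a', 2.0)]}   # hypothetical toy graph
    ws = {n: 1.0 / len(graph) for n in graph}

    for n, out in graph.items():        # view is fine: keys are never added or removed
        print(n, sum(e[2] for e in out))

    for n, w in ws.items():             # overwriting existing keys is also allowed
        ws[n] = w * 2
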
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
--- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800
+++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,3 @@
-
import re
import os
import marshal
@@ -89,7 +88,7 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
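
Dropping the r prefix on the Han-range patterns means the \u4E00 and \u9FA5 escapes are expanded by the string literal itself, so re receives the actual characters; Python 3.3+ re also accepts \uXXXX inside raw patterns, but expanding at the literal level works on any Python 3. A quick check:

    import re

    re_han = re.compile("([\u4E00-\u9FA5]+)")
    print(re_han.split("abc我来到北京123"))   # ['abc', '我来到北京', '123']
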
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
--- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800
+++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800
@@ -1,4 +1,3 @@
-
__version__ = '0.35'
__license__ = 'MIT'
@@ -51,7 +50,7 @@
pfdict.add(word[:ch+1])
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
- raise ValueError(e)
+ raise e
return pfdict, lfreq, ltotal
def initialize(dictionary=None):
@@ -229,11 +228,11 @@
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
- - sentence: The str/unicode to be segmented.
+ - sentence: The str to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
- if not isinstance(sentence, str):
+ if isinstance(sentence, bytes):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
@@ -243,9 +242,9 @@
# \r\n|\s : whitespace characters. Will not be handled.
if cut_all:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence)
if cut_all:
cut_block = __cut_all
@@ -339,8 +338,6 @@
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
- if sys.version_info[0]==2 and sys.version_info[1]<6:
- raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
@@ -393,12 +390,12 @@
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- - sentence: the unicode to be segmented.
+ - sentence: the str to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, str):
- raise Exception("jieba: the input parameter should be unicode.")
+ raise Exception("jieba: the input parameter should be str.")
start = 0
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
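
In Python 3 all text is str and only raw input can be bytes, so the entry-point check flips from "not str" to "is bytes" and decodes only in that case. A minimal sketch mirroring the hunk above (ensure_text is a hypothetical helper, not part of jieba):

    def ensure_text(sentence):
        if isinstance(sentence, bytes):
            try:
                sentence = sentence.decode('utf-8')
            except UnicodeDecodeError:
                sentence = sentence.decode('gbk', 'ignore')   # drop undecodable bytes
        return sentence

    print(ensure_text("中文".encode('utf-8')))   # bytes in, str out
    print(ensure_text("中文"))                   # str passes through unchanged
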
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
--- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800
+++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800
@@ -40,7 +40,7 @@
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))
+ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
ln = fp.readline()
fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
--- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800
+++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800
@@ -1,4 +1,3 @@
-
import re
import os
from . import viterbi
@@ -18,14 +17,14 @@
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {}
- with open(f_name, "r") as f:
+ with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
- word, _, tag = line.split(' ')
- result[word.decode('utf-8')] = tag
-
+ line = line.decode("utf-8")
+ word, _, tag = line.split(" ")
+ result[word] = tag
if not isJython:
return result
@@ -105,8 +104,8 @@
yield pair(sentence[next:], pos_list[next][1])
def __cut_detail(sentence):
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
@@ -130,7 +129,7 @@
x = 0
N = len(sentence)
buf = ''
- re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
+ re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N:
y = route[x][1]+1
l_word = sentence[x:y]
@@ -195,8 +194,8 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
if HMM:
__cut_blk = __cut_DAG
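
Reading the tag table in binary and decoding each line explicitly is the other half of the bytes/str split: the file stays "rb", but the stored keys are str. A sketch assuming the "word freq tag" line layout implied by the split(' ') above (the file name is made up):

    result = {}
    with open('word_tag_tab.txt', 'rb') as f:   # hypothetical file name
        for raw in f:
            raw = raw.strip()
            if not raw:
                continue
            word, _, tag = raw.decode('utf-8').split(' ')
            result[word] = tag
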
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
--- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800
+++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800
@@ -8,7 +8,7 @@
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
mem_path = [{}]
- all_states = list(trans_p.keys())
+ all_states = trans_p.keys()
for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = ''
@@ -16,9 +16,9 @@
V.append({})
mem_path.append({})
#prev_states = get_top_states(V[t-1])
- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]
+ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))
+ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
if not obs_states:
@@ -29,7 +29,7 @@
V[t][y] = prob
mem_path[t][y] = state
- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]
+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]
#if len(last)==0:
#print obs
prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
--- ./README.md 2014-11-29 15:46:08.487925926 +0800
+++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800
@@ -4,6 +4,9 @@
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
- _Scroll down for English documentation._
+注意!
+========
+这个branch `jieba3k` 是专门用于Python3.x的版本
特点
========
@@ -68,16 +71,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 精确模式
+print("Default Mode:", "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
输出:
@@ -174,7 +177,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -203,7 +206,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -219,7 +222,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -408,16 +411,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+print("Default Mode:", "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
Output:
@@ -483,7 +486,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -512,7 +515,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -528,7 +531,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
--- ./setup.py 2014-11-29 15:46:46.379925565 +0800
+++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800
@@ -11,7 +11,7 @@
完整文档见 ``README.md``
-GitHub: https://github.com/fxsjy/jieba
+GitHub: https://github.com/fxsjy/jieba/tree/jieba3k
特点
====
@@ -34,17 +34,11 @@
Python 2.x
----------
-- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba``
-- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行
- python setup.py install
-- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录
-- 通过 ``import jieba`` 来引用
+见 https://pypi.python.org/pypi/jieba/
Python 3.x
----------
-见 https://pypi.python.org/pypi/jieba3k/
-
- 目前 master 分支是只支持 Python 2.x 的
- Python 3.x 版本的分支也已经基本可用:
https://github.com/fxsjy/jieba/tree/jieba3k
@@ -59,13 +53,13 @@
"""
-setup(name='jieba',
+setup(name='jieba3k',
version='0.35.1',
description='Chinese Words Segementation Utilities',
long_description=LONGDOC,
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',
- url='https://github.com/fxsjy/jieba',
+ url='https://github.com/fxsjy/jieba/tree/jieba3k',
license="MIT",
classifiers=[
'Intended Audience :: Developers',
@@ -73,9 +67,8 @@
'Operating System :: OS Independent',
'Natural Language :: Chinese (Simplified)',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python',
- 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 3',
'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
--- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800
+++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800
@@ -51,13 +51,13 @@
print("training...")
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
-print(("done in %0.3fs." % (time.time() - t0)))
+print("done in %0.3fs." % (time.time() - t0))
# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
- print(("Topic #%d:" % topic_idx))
- print((" ".join([feature_names[i]
- for i in topic.argsort()[:-n_top_words - 1:-1]])))
+ print("Topic #%d:" % topic_idx)
+ print(" ".join([feature_names[i]
+ for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
--- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800
+++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800
@@ -23,6 +23,6 @@
break
line = line.strip()
for word in jieba.cut(line):
- print(word.encode(default_encoding))
+ print(word)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
--- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800
+++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800
@@ -152,7 +152,7 @@
#-*-coding: utf-8 -*-
import sys
+import imp
sys.path.append("../")
import unittest
import types
@@ -97,7 +98,7 @@
class JiebaTestCase(unittest.TestCase):
def setUp(self):
- reload(jieba)
+ imp.reload(jieba)
def tearDown(self):
pass
@@ -151,7 +152,7 @@
def testTokenize(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'))
+ result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
@@ -181,7 +181,7 @@
def testTokenize_NOHMM(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+ result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
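
The builtin reload() is gone in Python 3; at the time of this diff the replacement was imp.reload(), and importlib.reload() is the current equivalent. Illustrative sketch:

    import importlib
    import jieba

    importlib.reload(jieba)   # re-executes the module in place, as setUp() does with imp.reload
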
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
--- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800
+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
--- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800
+++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))