Merge master and jieba3k, make the code Python 2/3 compatible

pull/233/head
Dingyuan Wang 10 years ago
commit 22bcf8be7a
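
Most of the hunks below repeat a small set of mechanical conversions: print statements become print() calls, `except X, e` becomes `except X as e`, intra-package imports become explicit relative imports, and byte/unicode handling is funneled through the new `jieba/_compat.py` helpers. A condensed, runnable sketch of the target forms (illustrative only, not part of the diff):

```python
# Python 2/3-compatible forms this commit converges on (illustrative sketch).
from __future__ import absolute_import, unicode_literals  # plain literals are unicode on Py2 too

print("Full Mode: " + "/ ".join(["x", "y"]))  # print() call instead of the Py2 print statement

try:
    int("not a number")
except ValueError as e:  # "except ValueError, e:" is Python 2-only syntax
    pass

freq = {"a": 1, "b": 2}
for k, v in freq.items():  # the library itself routes this through iteritems() in _compat.py
    print("%s %s" % (k, v))
```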

@ -68,16 +68,16 @@ python setup.py install
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list) # 全模式
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list) # 精确模式
print("Default Mode: " + "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
print ", ".join(seg_list)
print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print ", ".join(seg_list)
print(", ".join(seg_list))
```
输出:
@ -174,7 +174,7 @@ jieba.analyse.textrank(raw_text)
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
... print w.word, w.flag
... print('%s %s' % (w.word, w.flag))
...
我 r
爱 v
@ -203,7 +203,7 @@ jieba.analyse.textrank(raw_text)
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@ -219,7 +219,7 @@ word 有限公司 start: 6 end:10
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@ -413,16 +413,16 @@ Main Functions
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list) # 全模式
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list) # 默认模式
print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list)
print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print ", ".join(seg_list)
print(", ".join(seg_list))
```
Output:
@ -488,7 +488,7 @@ Use: `jieba.analyse.textrank(raw_text)`.
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
... print w.word, w.flag
... print('%s %s' % (w.word, w.flag))
...
我 r
爱 v
@ -517,7 +517,7 @@ Use: `jieba.analyse.textrank(raw_text)`.
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@ -533,7 +533,7 @@ word 有限公司 start: 6 end:10
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```

@ -1,20 +1,20 @@
from __future__ import with_statement
from __future__ import absolute_import, unicode_literals
__version__ = '0.35'
__license__ = 'MIT'
import re
import os
import sys
import finalseg
import time
import tempfile
import marshal
from math import log
import random
import threading
from functools import wraps
import logging
from hashlib import md5
from ._compat import *
from . import finalseg
DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
@ -51,13 +51,13 @@ def gen_pfdict(f_name):
ltotal += freq
for ch in xrange(len(word)):
pfdict.add(word[:ch+1])
except ValueError, e:
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
raise ValueError, e
raise e
return pfdict, lfreq, ltotal
def initialize(dictionary=None):
global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
global pfdict, FREQ, total, initialized, DICTIONARY, DICT_LOCK
if not dictionary:
dictionary = DICTIONARY
with DICT_LOCK:
@ -121,7 +121,7 @@ def require_initialized(fn):
def __cut_all(sentence):
dag = get_DAG(sentence)
old_j = -1
for k,L in dag.iteritems():
for k,L in iteritems(dag):
if len(L) == 1 and k > old_j:
yield sentence[k:L[0]+1]
old_j = L[0]
@ -158,13 +158,13 @@ def get_DAG(sentence):
return DAG
def __cut_DAG_NO_HMM(sentence):
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
DAG = get_DAG(sentence)
route = {}
calc(sentence, DAG, route)
x = 0
N = len(sentence)
buf = u''
buf = ''
while x < N:
y = route[x][1] + 1
l_word = sentence[x:y]
@ -174,19 +174,19 @@ def __cut_DAG_NO_HMM(sentence):
else:
if buf:
yield buf
buf = u''
buf = ''
yield l_word
x = y
if buf:
yield buf
buf = u''
buf = ''
def __cut_DAG(sentence):
DAG = get_DAG(sentence)
route = {}
calc(sentence, DAG, route=route)
x = 0
buf = u''
buf = ''
N = len(sentence)
while x < N:
y = route[x][1]+1
@ -197,7 +197,7 @@ def __cut_DAG(sentence):
if buf:
if len(buf) == 1:
yield buf
buf = u''
buf = ''
else:
if buf not in FREQ:
recognized = finalseg.cut(buf)
@ -206,7 +206,7 @@ def __cut_DAG(sentence):
else:
for elem in buf:
yield elem
buf = u''
buf = ''
yield l_word
x = y
@ -225,23 +225,19 @@ def cut(sentence, cut_all=False, HMM=True):
'''The main function that segments an entire sentence that contains
Chinese characters into separated words.
Parameter:
- sentence: The str/unicode to be segmented.
- sentence: The str(unicode) to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
if not isinstance(sentence, unicode):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
sentence = strdecode(sentence)
# \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence)
if cut_all:
cut_block = __cut_all
@ -292,9 +288,9 @@ def load_userdict(f):
...
Word type may be ignored
'''
if isinstance(f, (str, unicode)):
if isinstance(f, string_types):
f = open(f, 'rb')
content = f.read().decode('utf-8').lstrip(u'\ufeff')
content = f.read().decode('utf-8').lstrip('\ufeff')
line_no = 0
for line in content.split("\n"):
line_no += 1
@ -333,15 +329,13 @@ def enable_parallel(processnum=None):
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
if sys.version_info[0]==2 and sys.version_info[1]<6:
raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
pool = Pool(processnum)
def pcut(sentence,cut_all=False,HMM=True):
parts = re.compile('([\r\n]+)').split(sentence)
parts = strdecode(sentence).split('\n')
if cut_all:
result = pool.map(__lcut_all, parts)
elif HMM:
@ -353,7 +347,7 @@ def enable_parallel(processnum=None):
yield w
def pcut_for_search(sentence):
parts = re.compile('([\r\n]+)').split(sentence)
parts = strdecode(sentence).split('\n')
result = pool.map(__lcut_for_search, parts)
for r in result:
for w in r:
@ -385,11 +379,11 @@ def get_abs_path_dict():
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- sentence: the unicode to be segmented.
- sentence: the str(unicode) to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, unicode):
if not isinstance(unicode_sentence, text_type):
raise Exception("jieba: the input parameter should be unicode.")
start = 0
if mode == 'default':
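
One detail in the jieba/__init__.py hunks above is easy to miss: the `ur"..."` regex literals are gone. The `ur` prefix is a SyntaxError on Python 3, so the patterns become plain literals and `from __future__ import unicode_literals` keeps them unicode on Python 2. A minimal standalone sketch of the idea (not part of the commit):

```python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals  # plain "..." literals are unicode on Python 2 as well
import re

# Same Han-character pattern the diff uses; \uXXXX escapes work inside a
# unicode literal on both interpreters, so no ur"" prefix is needed.
re_han = re.compile("([\u4E00-\u9FA5]+)", re.U)

print("/".join(blk for blk in re_han.split("我爱Python和C++") if blk))
```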

@ -25,7 +25,7 @@ args = parser.parse_args()
if args.quiet:
jieba.setLogLevel(60)
delim = unicode(args.delimiter)
delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin
@ -40,7 +40,10 @@ if args.user_dict:
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))
result = delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm))
if PY2:
result = result.encode(default_encoding)
print(result)
ln = fp.readline()
fp.close()

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
import sys
PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()
if PY2:
text_type = unicode
string_types = (str, unicode)
iterkeys = lambda d: d.iterkeys()
itervalues = lambda d: d.itervalues()
iteritems = lambda d: d.iteritems()
else:
text_type = str
string_types = (str,)
xrange = range
iterkeys = lambda d: iter(d.keys())
itervalues = lambda d: iter(d.values())
iteritems = lambda d: iter(d.items())
def strdecode(sentence):
if not isinstance(sentence, text_type):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
return sentence
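
A short usage sketch (hypothetical snippet, not part of the diff) showing how the helpers above let calling code stay identical on Python 2 and 3:

```python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from jieba._compat import strdecode, iteritems, text_type

raw = "我来到北京清华大学".encode("utf-8")  # a byte string on either interpreter
sentence = strdecode(raw)                    # decoded to text, with a GBK fallback
assert isinstance(sentence, text_type)

freq = {"北京": 3, "清华": 2}
for word, count in iteritems(freq):  # dict.iteritems() on Py2, iter(dict.items()) on Py3
    print("%s %d" % (word, count))
```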

@ -1,13 +1,14 @@
#encoding=utf-8
from __future__ import absolute_import
import jieba
import jieba.posseg
import os
from operator import itemgetter
try:
from analyzer import ChineseAnalyzer
from .analyzer import ChineseAnalyzer
except ImportError:
pass
from textrank import textrank
from .textrank import textrank
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath, "idf.txt")

@ -1,4 +1,5 @@
##encoding=utf-8
#encoding=utf-8
from __future__ import unicode_literals
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
@ -10,9 +11,9 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
'to', 'us', 'we', 'when', 'will', 'with', 'yet',
'you', 'your', u'', u'', u''))
'you', 'your', '', '', ''))
accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")
accepted_chars = re.compile(r"[\u4E00-\u9FA5]+")
class ChineseTokenizer(Tokenizer):
def __call__(self, text, **kargs):

@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
import collections
from operator import itemgetter
@ -35,7 +36,7 @@ class UndirectWeightedGraph:
(min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
for w in ws.itervalues():
for w in itervalues(ws):
if w < min_rank:
min_rank = w
elif w > max_rank:
@ -88,4 +89,4 @@ def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v'
if __name__ == '__main__':
s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
for x, w in textrank(s, withWeight=True):
print x, w
print('%s %s' % (x, w))

@ -1,8 +1,9 @@
from __future__ import with_statement
from __future__ import absolute_import, unicode_literals
import re
import os
import marshal
import sys
from .._compat import *
MIN_FLOAT = -3.14e100
@ -41,9 +42,9 @@ def load_model():
if sys.platform.startswith("java"):
start_P, trans_P, emit_P = load_model()
else:
from prob_start import P as start_P
from prob_trans import P as trans_P
from prob_emit import P as emit_P
from .prob_start import P as start_P
from .prob_trans import P as trans_P
from .prob_emit import P as emit_P
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
@ -85,12 +86,8 @@ def __cut(sentence):
yield sentence[nexti:]
def cut(sentence):
if not isinstance(sentence, unicode):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
sentence = strdecode(sentence)
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):

File diff suppressed because it is too large

@ -1,13 +1,12 @@
from __future__ import with_statement
from __future__ import absolute_import, unicode_literals
import re
import os
import viterbi
import jieba
import sys
import marshal
from functools import wraps
default_encoding = sys.getfilesystemencoding()
from .._compat import *
from .viterbi import viterbi
PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
@ -18,13 +17,14 @@ def load_model(f_name, isJython=True):
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {}
with open(f_name, "r") as f:
with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
word, _, tag = line.split(' ')
result[word.decode('utf-8')] = tag
line = line.decode("utf-8")
word, _, tag = line.split(" ")
result[word] = tag
if not isJython:
return result
@ -55,10 +55,10 @@ def load_model(f_name, isJython=True):
if sys.platform.startswith("java"):
char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
else:
from char_state_tab import P as char_state_tab_P
from prob_start import P as start_P
from prob_trans import P as trans_P
from prob_emit import P as emit_P
from .char_state_tab import P as char_state_tab_P
from .prob_start import P as start_P
from .prob_trans import P as trans_P
from .prob_emit import P as emit_P
word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
@ -79,20 +79,23 @@ class pair(object):
self.flag = flag
def __unicode__(self):
return u'%s/%s' % (self.word, self.flag)
return '%s/%s' % (self.word, self.flag)
def __repr__(self):
return self.__str__()
def __str__(self):
return self.__unicode__().encode(default_encoding)
if PY2:
return self.__unicode__().encode(default_encoding)
else:
return self.__unicode__()
def encode(self,arg):
return self.__unicode__().encode(arg)
def __cut(sentence):
prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
begin, next = 0, 0
prob, pos_list = viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
begin, nexti = 0, 0
for i,char in enumerate(sentence):
pos = pos_list[i][0]
@ -100,16 +103,16 @@ def __cut(sentence):
begin = i
elif pos == 'E':
yield pair(sentence[begin:i+1], pos_list[i][1])
next = i+1
nexti = i+1
elif pos == 'S':
yield pair(char, pos_list[i][1])
next = i+1
if next < len(sentence):
yield pair(sentence[next:], pos_list[next][1])
nexti = i+1
if nexti < len(sentence):
yield pair(sentence[nexti:], pos_list[nexti][1])
def __cut_detail(sentence):
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
@ -132,8 +135,8 @@ def __cut_DAG_NO_HMM(sentence):
jieba.calc(sentence, DAG, route)
x = 0
N = len(sentence)
buf = u''
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
buf = ''
re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N:
y = route[x][1]+1
l_word = sentence[x:y]
@ -143,12 +146,12 @@ def __cut_DAG_NO_HMM(sentence):
else:
if buf:
yield pair(buf,'eng')
buf = u''
buf = ''
yield pair(l_word, word_tag_tab.get(l_word, 'x'))
x = y
if buf:
yield pair(buf,'eng')
buf = u''
buf = ''
def __cut_DAG(sentence):
DAG = jieba.get_DAG(sentence)
@ -157,7 +160,7 @@ def __cut_DAG(sentence):
jieba.calc(sentence, DAG, route)
x = 0
buf = u''
buf = ''
N = len(sentence)
while x < N:
y = route[x][1]+1
@ -175,7 +178,7 @@ def __cut_DAG(sentence):
else:
for elem in buf:
yield pair(elem, word_tag_tab.get(elem, 'x'))
buf = u''
buf = ''
yield pair(l_word, word_tag_tab.get(l_word, 'x'))
x = y
@ -191,13 +194,9 @@ def __cut_DAG(sentence):
yield pair(elem, word_tag_tab.get(elem, 'x'))
def __cut_internal(sentence, HMM=True):
if not isinstance(sentence, unicode):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
sentence = strdecode(sentence)
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
if HMM:
__cut_blk = __cut_DAG
@ -234,7 +233,7 @@ def cut(sentence, HMM=True):
for w in __cut_internal(sentence, HMM=HMM):
yield w
else:
parts = re.compile('([\r\n]+)').split(sentence)
parts = strdecode(sentence).split('\n')
if HMM:
result = jieba.pool.map(__lcut_internal, parts)
else:
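
The `pair.__str__` change above is the usual recipe for a text-producing class that has to behave on both interpreters: keep the text in `__unicode__` and return encoded bytes from `__str__` only under Python 2. A standalone sketch of the same pattern (hypothetical class, not jieba code):

```python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys

PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()

class WordFlag(object):
    """Minimal stand-in for jieba.posseg.pair: a word plus its POS flag."""
    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    def __unicode__(self):
        return '%s/%s' % (self.word, self.flag)

    def __str__(self):
        # str() must return bytes on Python 2 but text on Python 3.
        if PY2:
            return self.__unicode__().encode(default_encoding)
        return self.__unicode__()

print(WordFlag('天安门', 'ns'))
```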

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -1,7 +1,11 @@
import sys
import operator
MIN_FLOAT = -3.14e100
MIN_INF = float("-inf")
if sys.version_info[0] > 2:
xrange = range
def get_top_states(t_state_v, K=4):
return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
from distutils.core import setup
LONGDOC = u"""
LONGDOC = """
jieba
=====
@ -75,6 +75,12 @@ setup(name='jieba',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.2',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic',

@ -1,522 +0,0 @@
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
--- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800
+++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
--- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800
+++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800
@@ -26,7 +26,7 @@
def set_new_path(self, new_idf_path):
if self.path != new_idf_path:
- content = open(new_idf_path, 'rb').read().decode('utf-8')
+ content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.rstrip('\n').split('\n')
for line in lines:
@@ -93,7 +93,7 @@
freq[k] *= idf_freq.get(k, median_idf) / total
if withWeight:
- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
--- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800
+++ ../jieba/jieba/analyse/textrank.py 2014-11-29 15:36:39.291931354 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
@@ -22,12 +22,12 @@
outSum = collections.defaultdict(float)
wsdef = 1.0 / len(self.graph)
- for n, out in list(self.graph.items()):
+ for n, out in self.graph.items():
ws[n] = wsdef
outSum[n] = sum((e[2] for e in out), 0.0)
for x in range(10): # 10 iters
- for n, inedges in list(self.graph.items()):
+ for n, inedges in self.graph.items():
s = 0
for e in inedges:
s += e[2] / outSum[e[1]] * ws[e[1]]
@@ -41,7 +41,7 @@
elif w > max_rank:
max_rank = w
- for n, w in list(ws.items()):
+ for n, w in ws.items():
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
@@ -72,12 +72,12 @@
continue
cm[(words[i].word, words[j].word)] += 1
- for terms, w in list(cm.items()):
+ for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
if withWeight:
- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
--- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800
+++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,3 @@
-
import re
import os
import marshal
@@ -89,7 +88,7 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
--- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800
+++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800
@@ -1,4 +1,3 @@
-
__version__ = '0.35'
__license__ = 'MIT'
@@ -51,7 +50,7 @@
pfdict.add(word[:ch+1])
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
- raise ValueError(e)
+ raise e
return pfdict, lfreq, ltotal
def initialize(dictionary=None):
@@ -229,11 +228,11 @@
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
- - sentence: The str/unicode to be segmented.
+ - sentence: The str to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
- if not isinstance(sentence, str):
+ if isinstance(sentence, bytes):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
@@ -243,9 +242,9 @@
# \r\n|\s : whitespace characters. Will not be handled.
if cut_all:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence)
if cut_all:
cut_block = __cut_all
@@ -339,8 +338,6 @@
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
- if sys.version_info[0]==2 and sys.version_info[1]<6:
- raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
@@ -393,12 +390,12 @@
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- - sentence: the unicode to be segmented.
+ - sentence: the str to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, str):
- raise Exception("jieba: the input parameter should be unicode.")
+ raise Exception("jieba: the input parameter should be str.")
start = 0
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
--- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800
+++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800
@@ -40,7 +40,7 @@
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))
+ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
ln = fp.readline()
fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
--- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800
+++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800
@@ -1,4 +1,3 @@
-
import re
import os
from . import viterbi
@@ -18,14 +17,14 @@
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {}
- with open(f_name, "r") as f:
+ with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
- word, _, tag = line.split(' ')
- result[word.decode('utf-8')] = tag
-
+ line = line.decode("utf-8")
+ word, _, tag = line.split(" ")
+ result[word] = tag
if not isJython:
return result
@@ -105,8 +104,8 @@
yield pair(sentence[next:], pos_list[next][1])
def __cut_detail(sentence):
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
@@ -130,7 +129,7 @@
x = 0
N = len(sentence)
buf = ''
- re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
+ re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N:
y = route[x][1]+1
l_word = sentence[x:y]
@@ -195,8 +194,8 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
if HMM:
__cut_blk = __cut_DAG
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
--- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800
+++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800
@@ -8,7 +8,7 @@
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
mem_path = [{}]
- all_states = list(trans_p.keys())
+ all_states = trans_p.keys()
for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = ''
@@ -16,9 +16,9 @@
V.append({})
mem_path.append({})
#prev_states = get_top_states(V[t-1])
- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]
+ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))
+ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
if not obs_states:
@@ -29,7 +29,7 @@
V[t][y] = prob
mem_path[t][y] = state
- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]
+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]
#if len(last)==0:
#print obs
prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
--- ./README.md 2014-11-29 15:46:08.487925926 +0800
+++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800
@@ -4,6 +4,9 @@
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
- _Scroll down for English documentation._
+注意!
+========
+这个branch `jieba3k` 是专门用于Python3.x的版本
特点
========
@@ -68,16 +71,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 精确模式
+print("Default Mode:", "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
输出:
@@ -174,7 +177,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -203,7 +206,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -219,7 +222,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -408,16 +411,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+print("Default Mode:", "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
Output:
@@ -483,7 +486,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -512,7 +515,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -528,7 +531,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
--- ./setup.py 2014-11-29 15:46:46.379925565 +0800
+++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800
@@ -11,7 +11,7 @@
完整文档见 ``README.md``
-GitHub: https://github.com/fxsjy/jieba
+GitHub: https://github.com/fxsjy/jieba/tree/jieba3k
特点
====
@@ -34,17 +34,11 @@
Python 2.x
----------
-- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba``
-- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行
- python setup.py install
-- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录
-- 通过 ``import jieba`` 来引用
+见 https://pypi.python.org/pypi/jieba/
Python 3.x
----------
-见 https://pypi.python.org/pypi/jieba3k/
-
- 目前 master 分支是只支持 Python 2.x 的
- Python 3.x 版本的分支也已经基本可用:
https://github.com/fxsjy/jieba/tree/jieba3k
@@ -59,13 +53,13 @@
"""
-setup(name='jieba',
+setup(name='jieba3k',
version='0.35.1',
description='Chinese Words Segementation Utilities',
long_description=LONGDOC,
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',
- url='https://github.com/fxsjy/jieba',
+ url='https://github.com/fxsjy/jieba/tree/jieba3k',
license="MIT",
classifiers=[
'Intended Audience :: Developers',
@@ -73,9 +67,8 @@
'Operating System :: OS Independent',
'Natural Language :: Chinese (Simplified)',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python',
- 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 3',
'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
--- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800
+++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800
@@ -51,13 +51,13 @@
print("training...")
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
-print(("done in %0.3fs." % (time.time() - t0)))
+print("done in %0.3fs." % (time.time() - t0))
# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
- print(("Topic #%d:" % topic_idx))
- print((" ".join([feature_names[i]
- for i in topic.argsort()[:-n_top_words - 1:-1]])))
+ print("Topic #%d:" % topic_idx)
+ print(" ".join([feature_names[i]
+ for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
--- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800
+++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800
@@ -23,6 +23,6 @@
break
line = line.strip()
for word in jieba.cut(line):
- print(word.encode(default_encoding))
+ print(word)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
--- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800
+++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800
@@ -152,7 +152,7 @@
#-*-coding: utf-8 -*-
import sys
+import imp
sys.path.append("../")
import unittest
import types
@@ -97,7 +98,7 @@
class JiebaTestCase(unittest.TestCase):
def setUp(self):
- reload(jieba)
+ imp.reload(jieba)
def tearDown(self):
pass
@@ -151,7 +152,7 @@
def testTokenize(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'))
+ result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
@@ -181,7 +181,7 @@
def testTokenize_NOHMM(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+ result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
--- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800
+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
--- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800
+++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))

@ -1,34 +0,0 @@
#!/bin/bash
# Set 2to3 path.
PYTHON2TO3=2to3
# Copy the python2 version.
echo Jieba 2to3 manual conversion tool
echo
if ! git rev-parse; then
exit 1
fi
echo Copying working directory to ../jieba2
if [ -d ../jieba2 ]; then
echo Found existing ../jieba2
read -p "Replace it with new one? (y/n) " -r
if ! [[ $REPLY =~ ^[Yy]$ ]]; then
echo Cancelled.
exit
else
rm -rf ../jieba2
fi
fi
if ! git checkout jieba3k; then
exit 1
fi
cp -r . ../jieba2
cd ../jieba2
if ! git checkout master; then
exit 1
fi
# Here starts auto conversion.
echo Converting jieba2 to Python3 ...
find . -type f -name '*.py' \! -path '*/build/*' \! -name 'prob_*.py' \! -name 'char_state_tab.py' -exec $PYTHON2TO3 -w -n {} +
find . -type f \! -path '*/build/*' -a \( -name 'prob_*.py' -o -name 'char_state_tab.py' \) -exec sed -i "s/u'\\\u/'\\\u/g" {} \;
patch -p0 -s <../jieba/test/2to3.diff
echo Done. Compare jieba and jieba2 to manually port.

@ -1,17 +1,18 @@
#encoding=utf-8
from __future__ import unicode_literals
import sys
sys.path.append("../")
import jieba
seg_list = jieba.cut(u"我来到北京清华大学", cut_all=True)
print u"Full Mode:", u"/ ".join(seg_list) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut(u"我来到北京清华大学", cut_all=False)
print u"Default Mode:", u"/ ".join(seg_list) # 默认模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut(u"他来到了网易杭研大厦")
print u", ".join(seg_list)
seg_list = jieba.cut("他来到了网易杭研大厦")
print(", ".join(seg_list))
seg_list = jieba.cut_for_search(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print u", ".join(seg_list)
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print(", ".join(seg_list))

@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1:
print USAGE
print(USAGE)
sys.exit(1)
file_name = args[0]
@ -27,4 +27,4 @@ content = open(file_name, 'rb').read()
tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags)
print(",".join(tags))

@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1:
print USAGE
print(USAGE)
sys.exit(1)
file_name = args[0]
@ -29,4 +29,4 @@ jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags)
print(",".join(tags))

@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1:
print USAGE
print(USAGE)
sys.exit(1)
file_name = args[0]
@ -30,4 +30,4 @@ jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags)
print(",".join(tags))

@ -14,7 +14,7 @@ opt, args = parser.parse_args()
if len(args) < 1:
print USAGE
print(USAGE)
sys.exit(1)
file_name = args[0]
@ -38,6 +38,6 @@ tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)
if withWeight is True:
for tag in tags:
print "tag: %s\t\t weight: %f" % (tag[0],tag[1])
print("tag: %s\t\t weight: %f" % (tag[0],tag[1]))
else:
print ",".join(tags)
print(",".join(tags))

@ -12,7 +12,7 @@ import os
import random
if len(sys.argv)<2:
print "usage: extract_topic.py directory [n_topic] [n_top_words]"
print("usage: extract_topic.py directory [n_topic] [n_top_words]")
sys.exit(0)
n_topic = 10
@ -28,27 +28,27 @@ count_vect = CountVectorizer()
docs = []
pattern = os.path.join(sys.argv[1],"*.txt")
print "read "+pattern
print("read "+pattern)
for f_name in glob.glob(pattern):
with open(f_name) as f:
print "read file:", f_name
print("read file:", f_name)
for line in f: #one line as a document
words = " ".join(jieba.cut(line))
docs.append(words)
random.shuffle(docs)
print "read done."
print("read done.")
print "transform"
print("transform")
counts = count_vect.fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts)
print tfidf.shape
print(tfidf.shape)
t0 = time.time()
print "training..."
print("training...")
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

@ -1,9 +1,13 @@
#-*-coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import sys
sys.path.append("../")
import unittest
import types
import jieba
if sys.version_info[0] > 2:
from imp import reload
jieba.initialize()
@ -108,8 +112,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testDefaultCut"
print(" , ".join(result), file=sys.stderr)
print("testDefaultCut", file=sys.stderr)
def testCutAll(self):
for content in test_contents:
@ -117,8 +121,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
result = list(result)
assert isinstance(result, list), "Test CutAll error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testCutAll"
print(" , ".join(result), file=sys.stderr)
print("testCutAll", file=sys.stderr)
def testSetDictionary(self):
jieba.set_dictionary("foobar.txt")
@ -127,8 +131,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
result = list(result)
assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testSetDictionary"
print(" , ".join(result), file=sys.stderr)
print("testSetDictionary", file=sys.stderr)
def testCutForSearch(self):
for content in test_contents:
@ -136,8 +140,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
result = list(result)
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testCutForSearch"
print(" , ".join(result), file=sys.stderr)
print("testCutForSearch", file=sys.stderr)
def testPosseg(self):
import jieba.posseg as pseg
@ -146,18 +150,18 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
result = list(result)
assert isinstance(result, list), "Test Posseg error on content: %s" % content
print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result])
print >> sys.stderr, "testPosseg"
print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
print("testPosseg", file=sys.stderr)
def testTokenize(self):
for content in test_contents:
result = jieba.tokenize(content.decode('utf-8'))
result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
for tk in result:
print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print >> sys.stderr, "testTokenize"
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
print("testTokenize", file=sys.stderr)
def testDefaultCut_NOHMM(self):
for content in test_contents:
@ -165,8 +169,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testDefaultCut_NOHMM"
print(" , ".join(result), file=sys.stderr)
print("testDefaultCut_NOHMM", file=sys.stderr)
def testPosseg_NOHMM(self):
import jieba.posseg as pseg
@ -175,18 +179,18 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
result = list(result)
assert isinstance(result, list), "Test Posseg error on content: %s" % content
print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result])
print >> sys.stderr, "testPosseg_NOHMM"
print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
print("testPosseg_NOHMM", file=sys.stderr)
def testTokenize_NOHMM(self):
for content in test_contents:
result = jieba.tokenize(content.decode('utf-8'),HMM=False)
result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
for tk in result:
print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print >> sys.stderr, "testTokenize_NOHMM"
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
print("testTokenize_NOHMM", file=sys.stderr)
def testCutForSearch_NOHMM(self):
for content in test_contents:
@ -194,8 +198,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
result = list(result)
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
print >> sys.stderr, " , ".join(result)
print >> sys.stderr, "testCutForSearch_NOHMM"
print(" , ".join(result), file=sys.stderr)
print("testCutForSearch_NOHMM", file=sys.stderr)
if __name__ == "__main__":
unittest.main()

@ -6,7 +6,7 @@ cat abc.txt | python jiebacmd.py | sort | uniq -c | sort -nr -k1 | head -100
'''
from __future__ import unicode_literals
import sys
sys.path.append("../")
@ -23,6 +23,6 @@ while True:
break
line = line.strip()
for word in jieba.cut(line):
print word.encode(default_encoding)
print(word)

@ -14,7 +14,7 @@ opt, args = parser.parse_args()
if len(args) <1:
print USAGE
print(USAGE)
sys.exit(1)
file_name = args[0]
@ -29,6 +29,6 @@ content = open(file_name,'rb').read()
tags = jieba.analyse.extract_tags(content,topK=topK)
print ",".join(tags)
print(",".join(tags))

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../../")
import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent):
result = jieba.cut(test_sent)
for word in result:
print word, "/",
print ""
print(word, "/", end=' ')
print("")
if __name__ == "__main__":

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../../")
import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent):
result = jieba.cut(test_sent,cut_all=True)
for word in result:
print word, "/",
print ""
print(word, "/", end=' ')
print("")
if __name__ == "__main__":

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../../")
import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent):
result = jieba.cut_for_search(test_sent)
for word in result:
print word, "/",
print ""
print(word, "/", end=' ')
print("")
if __name__ == "__main__":

@ -1,6 +1,5 @@
import urllib2
import sys,time
import sys
import time
sys.path.append("../../")
import jieba
@ -17,5 +16,5 @@ tm_cost = t2-t1
log_f = open("1.log","wb")
log_f.write(words.encode('utf-8'))
print 'speed' , len(content)/tm_cost, " bytes/second"
print('speed %s bytes/second' % (len(content)/tm_cost))

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../../")
import jieba
@ -8,8 +9,8 @@ import jieba.posseg as pseg
def cuttest(test_sent):
result = pseg.cut(test_sent)
for w in result:
print w.word, "/", w.flag, ", ",
print ""
print(w.word, "/", w.flag, ", ", end=' ')
print("")
if __name__ == "__main__":

@ -1,9 +1,10 @@
import urllib2
from __future__ import print_function
import sys,time
import sys
sys.path.append("../../")
import jieba
import jieba.posseg as pseg
jieba.enable_parallel(4)
url = sys.argv[1]
@ -14,9 +15,8 @@ words = list(pseg.cut(content))
t2 = time.time()
tm_cost = t2-t1
log_f = open("1.log","wb")
for w in words:
print >> log_f, w.encode("utf-8"), "/" ,
log_f = open("1.log","w")
log_f.write(' / '.join(map(str, words)))
print 'speed' , len(content)/tm_cost, " bytes/second"
print('speed' , len(content)/tm_cost, " bytes/second")

@ -6,7 +6,7 @@ import jieba
def cuttest(test_sent):
result = jieba.cut(test_sent)
print " / ".join(result)
print(" / ".join(result))
if __name__ == "__main__":

@ -5,5 +5,5 @@ import jieba
import jieba.posseg as pseg
words=pseg.cut("又跛又啞")
for w in words:
print w.word,w.flag
print(w.word,w.flag)

@ -5,7 +5,7 @@ import jieba
def cuttest(test_sent):
result = jieba.cut(test_sent)
print " ".join(result)
print(" ".join(result))
def testcase():
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。")
@ -22,6 +22,6 @@ def testcase():
if __name__ == "__main__":
testcase()
jieba.set_dictionary("foobar.txt")
print "================================"
print("================================")
testcase()

@ -6,8 +6,8 @@ import jieba
def cuttest(test_sent):
result = jieba.cut_for_search(test_sent)
for word in result:
print word, "/",
print ""
print(word, "/", end=' ')
print("")
if __name__ == "__main__":

@ -6,8 +6,8 @@ import jieba
def cuttest(test_sent):
result = jieba.cut(test_sent,cut_all=True)
for word in result:
print word, "/",
print ""
print(word, "/", end=' ')
print("")
if __name__ == "__main__":

@ -1,4 +1,3 @@
import urllib2
import sys,time
import sys
sys.path.append("../")
@ -17,6 +16,6 @@ log_f = open("1.log","wb")
log_f.write(words.encode('utf-8'))
log_f.close()
print 'cost',tm_cost
print 'speed' , len(content)/tm_cost, " bytes/second"
print('cost ' + str(tm_cost))
print('speed %s bytes/second' % (len(content)/tm_cost))

@ -8,18 +8,18 @@ import jieba
class Worker(threading.Thread):
def run(self):
seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
print "Full Mode:" + "/ ".join(seg_list) #全模式
print("Full Mode:" + "/ ".join(seg_list)) #全模式
seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
print "Default Mode:" + "/ ".join(seg_list) #默认模式
print("Default Mode:" + "/ ".join(seg_list)) #默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list)
print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
print ", ".join(seg_list)
print(", ".join(seg_list))
workers = []
for i in xrange(10):
for i in range(10):
worker = Worker()
workers.append(worker)
worker.start()

@ -6,7 +6,7 @@ import jieba
def cuttest(test_sent):
result = jieba.cut(test_sent,HMM=False)
print " / ".join(result)
print(" / ".join(result))
if __name__ == "__main__":

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../")
import jieba.posseg as pseg
@ -6,8 +7,8 @@ import jieba.posseg as pseg
def cuttest(test_sent):
result = pseg.cut(test_sent)
for w in result:
print w.word, "/", w.flag, ", ",
print ""
print(w.word, "/", w.flag, ", ", end=' ')
print("")
if __name__ == "__main__":

@ -1,6 +1,6 @@
import urllib2
import sys,time
from __future__ import print_function
import sys
import time
sys.path.append("../")
import jieba
jieba.initialize()
@ -14,9 +14,8 @@ words = list(pseg.cut(content))
t2 = time.time()
tm_cost = t2-t1
log_f = open("1.log","wb")
for w in words:
print >> log_f, w.encode("utf-8"), "/" ,
log_f = open("1.log","w")
log_f.write(' / '.join(map(str, words)))
print 'speed' , len(content)/tm_cost, " bytes/second"
print('speed' , len(content)/tm_cost, " bytes/second")

@ -6,8 +6,8 @@ import jieba.posseg as pseg
def cuttest(test_sent):
result = pseg.cut(test_sent,HMM=False)
for w in result:
print w.word, "/", w.flag, ", ",
print ""
print(w.word, "/", w.flag, ", ", end=' ')
print("")
if __name__ == "__main__":

@ -7,10 +7,9 @@ g_mode="default"
def cuttest(test_sent):
global g_mode
test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
if __name__ == "__main__":

@ -7,10 +7,9 @@ g_mode="default"
def cuttest(test_sent):
global g_mode
test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
if __name__ == "__main__":

@ -9,19 +9,19 @@ test_sent = "李小福是创新办主任也是云计算方面的专家; 什么
test_sent += "例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
words = jieba.cut(test_sent)
for w in words:
print w
print(w)
result = pseg.cut(test_sent)
for w in result:
print w.word, "/", w.flag, ", ",
print(w.word, "/", w.flag, ", ", end=' ')
print "\n========"
print("\n========")
terms = jieba.cut('easy_install is great')
for t in terms:
print t
print '-------------------------'
print(t)
print('-------------------------')
terms = jieba.cut('python 的正则表达式是好用的')
for t in terms:
print t
print(t)

@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys,os
sys.path.append("../")
from whoosh.index import create_in,open_dir
@ -18,46 +19,46 @@ ix = create_in("tmp", schema) # for create new index
writer = ix.writer()
writer.add_document(
title=u"document1",
path=u"/a",
content=u"This is the first document weve added!"
title="document1",
path="/a",
content="This is the first document weve added!"
)
writer.add_document(
title=u"document2",
path=u"/b",
content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
title="document2",
path="/b",
content="The second one 你 中文测试中文 is even more interesting! 吃水果"
)
writer.add_document(
title=u"document3",
path=u"/c",
content=u"买水果然后来世博园。"
title="document3",
path="/c",
content="买水果然后来世博园。"
)
writer.add_document(
title=u"document4",
path=u"/c",
content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
title="document4",
path="/c",
content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
)
writer.add_document(
title=u"document4",
path=u"/c",
content=u"咱俩交换一下吧。"
title="document4",
path="/c",
content="咱俩交换一下吧。"
)
writer.commit()
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果世博园",u"",u"first",u"中文",u"交换机",u"交换"):
print "result of ",keyword
for keyword in ("水果世博园","","first","中文","交换机","交换"):
print("result of ",keyword)
q = parser.parse(keyword)
results = searcher.search(q)
for hit in results:
print hit.highlights("content")
print "="*10
print(hit.highlights("content"))
print("="*10)
for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
print t.text
for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
print(t.text)

@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys
import os
sys.path.append("../")
@ -23,8 +24,8 @@ with open(file_name,"rb") as inf:
for line in inf:
i+=1
writer.add_document(
title=u"line"+str(i),
path=u"/a",
title="line"+str(i),
path="/a",
content=line.decode('gbk','ignore')
)
writer.commit()
@ -32,10 +33,10 @@ writer.commit()
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果小姐",u"",u"first",u"中文",u"交换机",u"交换"):
print "result of ",keyword
for keyword in ("水果小姐","","first","中文","交换机","交换"):
print("result of " + keyword)
q = parser.parse(keyword)
results = searcher.search(q)
for hit in results:
print hit.highlights("content")
print "="*10
print(hit.highlights("content"))
print("="*10)

@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys
import os
sys.path.append("../")
@ -18,10 +19,10 @@ ix = open_dir("tmp")
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果小姐",u"",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"):
print "result of ",keyword
for keyword in ("水果小姐","","first","中文","交换机","交换","少林","乔峰"):
print("result of ",keyword)
q = parser.parse(keyword)
results = searcher.search(q)
for hit in results:
print hit.highlights("content")
print "="*10
print(hit.highlights("content"))
print("="*10)