use CRLF as separator to make chunks in parallel mode

pull/39/head
fxsjy 12 years ago
parent 6b83593b5a
commit b46166f768

@ -233,7 +233,7 @@ def enable_parallel(processnum):
pool = Pool(processnum)
def pcut(sentence,cut_all=False):
parts = re.compile('(\s+)').split(sentence)
parts = re.compile('([\r\n]+)').split(sentence)
if cut_all:
result = pool.map(__lcut_all,parts)
else:
@ -243,7 +243,7 @@ def enable_parallel(processnum):
yield w
def pcut_for_search(sentence):
parts = re.compile('(\s+)').split(sentence)
parts = re.compile('([\r\n]+)').split(sentence)
result = pool.map(__lcut_for_search,parts)
for r in result:
for w in r:

@ -155,7 +155,7 @@ def cut(sentence):
for w in __cut_internal(sentence):
yield w
else:
parts = re.compile('(\s+)').split(sentence)
parts = re.compile('([\r\n]+)').split(sentence)
result = jieba.pool.map(__lcut_internal,parts)
for r in result:
for w in r:

@ -0,0 +1,34 @@
# Command-line demo: print the top-K keywords jieba extracts from a file.
#
# Usage: python extract_tags.py [file name] -k [top k]
# The tags are written to stdout as a single comma-separated line.
import sys
# Extend the module search path so the `jieba` import below resolves
# (assumes the package lives two directories up -- TODO confirm layout).
sys.path.append('../../')
import jieba
# Switch jieba to parallel mode with 4 processes before any text is processed.
jieba.enable_parallel(4)
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()

# The input file is a required positional argument.
if len(args) < 1:
    print(USAGE)
    sys.exit(1)

file_name = args[0]

# -k is optional; fall back to 10 tags when it is not supplied.
if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

# Read the raw bytes and make sure the handle is closed even if read() fails.
with open(file_name, 'rb') as f:
    content = f.read()

tags = jieba.analyse.extract_tags(content, topK=topK)
print(",".join(tags))
Loading…
Cancel
Save