diff --git a/jieba/__init__.py b/jieba/__init__.py
index 2601740..62185aa 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -233,7 +233,7 @@ def enable_parallel(processnum):
     pool = Pool(processnum)
 
     def pcut(sentence,cut_all=False):
-        parts = re.compile('(\s+)').split(sentence)
+        parts = re.compile('([\r\n]+)').split(sentence)
         if cut_all:
             result = pool.map(__lcut_all,parts)
         else:
@@ -243,7 +243,7 @@ def enable_parallel(processnum):
                 yield w
 
     def pcut_for_search(sentence):
-        parts = re.compile('(\s+)').split(sentence)
+        parts = re.compile('([\r\n]+)').split(sentence)
         result = pool.map(__lcut_for_search,parts)
         for r in result:
             for w in r:
diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py
index 8e4ee95..8e265aa 100644
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -155,7 +155,7 @@ def cut(sentence):
         for w in __cut_internal(sentence):
             yield w
     else:
-        parts = re.compile('(\s+)').split(sentence)
+        parts = re.compile('([\r\n]+)').split(sentence)
         result = jieba.pool.map(__lcut_internal,parts)
         for r in result:
             for w in r:
diff --git a/test/parallel/extract_tags.py b/test/parallel/extract_tags.py
new file mode 100644
--- /dev/null
+++ b/test/parallel/extract_tags.py
@@ -0,0 +1,40 @@
+"""Print the top-K tags of a text file, with jieba running in parallel mode.
+
+Usage: python extract_tags.py [file name] -k [top k]
+"""
+import sys
+# Make the in-repo jieba package importable when run from test/parallel/.
+sys.path.append('../../')
+
+import jieba
+jieba.enable_parallel(4)
+import jieba.analyse
+from optparse import OptionParser
+
+USAGE ="usage: python extract_tags.py [file name] -k [top k]"
+
+parser = OptionParser(USAGE)
+parser.add_option("-k",dest="topK")
+opt, args = parser.parse_args()
+
+
+if len(args) <1:
+    print(USAGE)
+    sys.exit(1)
+
+file_name = args[0]
+
+# -k is optional; default to 10 tags when it is omitted.
+if opt.topK is None:
+    topK=10
+else:
+    topK = int(opt.topK)
+
+
+# Read the raw bytes and close the file handle deterministically.
+with open(file_name,'rb') as f:
+    content = f.read()
+
+tags = jieba.analyse.extract_tags(content,topK=topK)
+
+print(",".join(tags))