Merge pull request #315 from gumblex/master

命令行分词支持词性标注
pull/351/head
Sun Junyi 9 years ago
commit e5c9af78e2

@ -284,10 +284,13 @@ word 有限公司 start: 6 end:10
-d [DELIM], --delimiter [DELIM]
使用 DELIM 分隔词语,而不是用默认的' / '。
若不指定 DELIM,则使用一个空格分隔。
-p [DELIM], --pos [DELIM]
启用词性标注;如果指定 DELIM,词语和词性之间
用它分隔,否则用 _ 分隔
-D DICT, --dict DICT 使用 DICT 代替默认词典
-u USER_DICT, --user-dict USER_DICT
使用 USER_DICT 作为附加词典,与默认词典或自定义词典配合使用
-a, --cut-all 全模式分词
-a, --cut-all 全模式分词(不支持词性标注)
-n, --no-hmm 不使用隐含马尔可夫模型
-q, --quiet 不输出载入信息到 STDERR
-V, --version 显示版本信息并退出
@ -297,8 +300,6 @@ word 有限公司 start: 6 end:10
`--help` 选项输出:
$> python -m jieba --help
usage: python -m jieba [options] filename
Jieba command line interface.
positional arguments:
@ -309,11 +310,14 @@ word 有限公司 start: 6 end:10
-d [DELIM], --delimiter [DELIM]
use DELIM instead of ' / ' for word delimiter; or a
space if it is used without DELIM
-p [DELIM], --pos [DELIM]
enable POS tagging; if DELIM is specified, use DELIM
instead of '_' for POS delimiter
-D DICT, --dict DICT use DICT as dictionary
-u USER_DICT, --user-dict USER_DICT
use USER_DICT together with the default dictionary or
DICT (if specified)
-a, --cut-all full pattern cutting
-a, --cut-all full pattern cutting (ignored with POS tagging)
-n, --no-hmm don't use the Hidden Markov Model
-q, --quiet don't print loading messages to stderr
-V, --version show program's version number and exit
@ -686,8 +690,6 @@ word 有限公司 start: 6 end:10
--------------------------------
$> python -m jieba --help
usage: python -m jieba [options] filename
Jieba command line interface.
positional arguments:
@ -698,11 +700,14 @@ word 有限公司 start: 6 end:10
-d [DELIM], --delimiter [DELIM]
use DELIM instead of ' / ' for word delimiter; or a
space if it is used without DELIM
-p [DELIM], --pos [DELIM]
enable POS tagging; if DELIM is specified, use DELIM
instead of '_' for POS delimiter
-D DICT, --dict DICT use DICT as dictionary
-u USER_DICT, --user-dict USER_DICT
use USER_DICT together with the default dictionary or
DICT (if specified)
-a, --cut-all full pattern cutting
-a, --cut-all full pattern cutting (ignored with POS tagging)
-n, --no-hmm don't use the Hidden Markov Model
-q, --quiet don't print loading messages to stderr
-V, --version show program's version number and exit

@ -8,12 +8,14 @@ parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable,
parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
nargs='?', const=' ',
help="use DELIM instead of ' / ' for word delimiter; or a space if it is used without DELIM")
parser.add_argument("-p", "--pos", metavar="DELIM", nargs='?', const='_',
help="enable POS tagging; if DELIM is specified, use DELIM instead of '_' for POS delimiter")
parser.add_argument("-D", "--dict", help="use DICT as dictionary")
parser.add_argument("-u", "--user-dict",
help="use USER_DICT together with the default dictionary or DICT (if specified)")
parser.add_argument("-a", "--cut-all",
action="store_true", dest="cutall", default=False,
help="full pattern cutting")
help="full pattern cutting (ignored with POS tagging)")
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
@ -26,6 +28,15 @@ args = parser.parse_args()
if args.quiet:
jieba.setLogLevel(60)
# Pick the segmentation callable once, up front, so the per-line loop can
# invoke `cutfunc(text, cutall, hmm)` the same way in both modes.
# -p/--pos is declared with nargs='?' and no default, so it is None when the
# option is absent; test against None so that an explicitly empty delimiter
# (-p '') still enables POS tagging instead of being silently dropped.
if args.pos is not None:
    # Imported only when POS tagging is actually requested.
    import jieba.posseg
    posdelim = args.pos

    def cutfunc(sentence, _, HMM=True):
        """Yield each token of `sentence` as word + posdelim + POS flag.

        The second positional argument sits in the slot where the caller
        passes the cut-all flag; it is ignored here, matching the
        "ignored with POS tagging" note in the --cut-all help text.
        """
        for w, f in jieba.posseg.cut(sentence, HMM):
            yield w + posdelim + f
else:
    cutfunc = jieba.cut
delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
@ -41,7 +52,7 @@ if args.user_dict:
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
result = delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm))
result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm))
if PY2:
result = result.encode(default_encoding)
print(result)

Loading…
Cancel
Save