diff --git a/README.md b/README.md index a6ba211..8737716 100644 --- a/README.md +++ b/README.md @@ -284,10 +284,13 @@ word 有限公司 start: 6 end:10 -d [DELIM], --delimiter [DELIM] 使用 DELIM 分隔词语,而不是用默认的' / '。 若不指定 DELIM,则使用一个空格分隔。 + -p [DELIM], --pos [DELIM] + 启用词性标注;如果指定 DELIM,词语和词性之间 + 用它分隔,否则用 _ 分隔 -D DICT, --dict DICT 使用 DICT 代替默认词典 -u USER_DICT, --user-dict USER_DICT 使用 USER_DICT 作为附加词典,与默认词典或自定义词典配合使用 - -a, --cut-all 全模式分词 + -a, --cut-all 全模式分词(不支持词性标注) -n, --no-hmm 不使用隐含马尔可夫模型 -q, --quiet 不输出载入信息到 STDERR -V, --version 显示版本信息并退出 @@ -297,8 +300,6 @@ word 有限公司 start: 6 end:10 `--help` 选项输出: $> python -m jieba --help - usage: python -m jieba [options] filename - Jieba command line interface. positional arguments: @@ -309,11 +310,14 @@ word 有限公司 start: 6 end:10 -d [DELIM], --delimiter [DELIM] use DELIM instead of ' / ' for word delimiter; or a space if it is used without DELIM + -p [DELIM], --pos [DELIM] + enable POS tagging; if DELIM is specified, use DELIM + instead of '_' for POS delimiter -D DICT, --dict DICT use DICT as dictionary -u USER_DICT, --user-dict USER_DICT use USER_DICT together with the default dictionary or DICT (if specified) - -a, --cut-all full pattern cutting + -a, --cut-all full pattern cutting (ignored with POS tagging) -n, --no-hmm don't use the Hidden Markov Model -q, --quiet don't print loading messages to stderr -V, --version show program's version number and exit @@ -686,8 +690,6 @@ word 有限公司 start: 6 end:10 -------------------------------- $> python -m jieba --help - usage: python -m jieba [options] filename - Jieba command line interface. positional arguments: @@ -698,11 +700,14 @@ word 有限公司 start: 6 end:10 -d [DELIM], --delimiter [DELIM] use DELIM instead of ' / ' for word delimiter; or a space if it is used without DELIM + -p [DELIM], --pos [DELIM] + enable POS tagging; if DELIM is specified, use DELIM + instead of '_' for POS delimiter -D DICT, --dict DICT use DICT as dictionary -u USER_DICT, --user-dict USER_DICT use USER_DICT together with the default dictionary or DICT (if specified) - -a, --cut-all full pattern cutting + -a, --cut-all full pattern cutting (ignored with POS tagging) -n, --no-hmm don't use the Hidden Markov Model -q, --quiet don't print loading messages to stderr -V, --version show program's version number and exit diff --git a/jieba/__main__.py b/jieba/__main__.py index 1d24905..2467ef4 100644 --- a/jieba/__main__.py +++ b/jieba/__main__.py @@ -8,12 +8,14 @@ parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ', nargs='?', const=' ', help="use DELIM instead of ' / ' for word delimiter; or a space if it is used without DELIM") +parser.add_argument("-p", "--pos", metavar="DELIM", nargs='?', const='_', + help="enable POS tagging; if DELIM is specified, use DELIM instead of '_' for POS delimiter") parser.add_argument("-D", "--dict", help="use DICT as dictionary") parser.add_argument("-u", "--user-dict", help="use USER_DICT together with the default dictionary or DICT (if specified)") parser.add_argument("-a", "--cut-all", action="store_true", dest="cutall", default=False, - help="full pattern cutting") + help="full pattern cutting (ignored with POS tagging)") parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false", default=True, help="don't use the Hidden Markov Model") parser.add_argument("-q", "--quiet", action="store_true", default=False, @@ -26,6 +28,15 @@ args = parser.parse_args() if args.quiet: jieba.setLogLevel(60) +if args.pos: + import jieba.posseg + posdelim = args.pos + def cutfunc(sentence, _, HMM=True): + for w, f in jieba.posseg.cut(sentence, HMM): + yield w + posdelim + f +else: + cutfunc = jieba.cut + delim = text_type(args.delimiter) cutall = args.cutall hmm = args.hmm @@ -41,7 +52,7 @@ if args.user_dict: ln = fp.readline() while ln: l = ln.rstrip('\r\n') - result = delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)) + result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm)) if PY2: result = result.encode(default_encoding) print(result)