From 87734d37857dcb8fd25975ce739ecb8d69545edb Mon Sep 17 00:00:00 2001
From: Dingyuan Wang <abcdoyle888@gmail.com>
Date: Tue, 17 Nov 2015 19:06:44 +0800
Subject: [PATCH] support POS tagging in __main__

---
 README.md         | 19 ++++++++++++-------
 jieba/__main__.py | 15 +++++++++++++--
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index a6ba211..8737716 100644
--- a/README.md
+++ b/README.md
@@ -284,10 +284,13 @@ word 有限公司            start: 6                end:10
       -d [DELIM], --delimiter [DELIM]
                             使用 DELIM 分隔词语,而不是用默认的' / '。
                             若不指定 DELIM,则使用一个空格分隔。
+      -p [DELIM], --pos [DELIM]
+                            启用词性标注;如果指定 DELIM,词语和词性之间
+                            用它分隔,否则用 _ 分隔
       -D DICT, --dict DICT  使用 DICT 代替默认词典
       -u USER_DICT, --user-dict USER_DICT
                             使用 USER_DICT 作为附加词典,与默认词典或自定义词典配合使用
-      -a, --cut-all         全模式分词
+      -a, --cut-all         全模式分词(不支持词性标注)
       -n, --no-hmm          不使用隐含马尔可夫模型
       -q, --quiet           不输出载入信息到 STDERR
       -V, --version         显示版本信息并退出
@@ -297,8 +300,6 @@ word 有限公司            start: 6                end:10
 `--help` 选项输出:
 
     $> python -m jieba --help
-    usage: python -m jieba [options] filename
-
     Jieba command line interface.
 
     positional arguments:
@@ -309,11 +310,14 @@ word 有限公司            start: 6                end:10
       -d [DELIM], --delimiter [DELIM]
                             use DELIM instead of ' / ' for word delimiter; or a
                             space if it is used without DELIM
+      -p [DELIM], --pos [DELIM]
+                            enable POS tagging; if DELIM is specified, use DELIM
+                            instead of '_' for POS delimiter
       -D DICT, --dict DICT  use DICT as dictionary
       -u USER_DICT, --user-dict USER_DICT
                             use USER_DICT together with the default dictionary or
                             DICT (if specified)
-      -a, --cut-all         full pattern cutting
+      -a, --cut-all         full pattern cutting (ignored with POS tagging)
       -n, --no-hmm          don't use the Hidden Markov Model
       -q, --quiet           don't print loading messages to stderr
       -V, --version         show program's version number and exit
@@ -686,8 +690,6 @@ word 有限公司            start: 6                end:10
 --------------------------------
 
     $> python -m jieba --help
-    usage: python -m jieba [options] filename
-
     Jieba command line interface.
 
     positional arguments:
@@ -698,11 +700,14 @@ word 有限公司            start: 6                end:10
       -d [DELIM], --delimiter [DELIM]
                             use DELIM instead of ' / ' for word delimiter; or a
                             space if it is used without DELIM
+      -p [DELIM], --pos [DELIM]
+                            enable POS tagging; if DELIM is specified, use DELIM
+                            instead of '_' for POS delimiter
       -D DICT, --dict DICT  use DICT as dictionary
       -u USER_DICT, --user-dict USER_DICT
                             use USER_DICT together with the default dictionary or
                             DICT (if specified)
-      -a, --cut-all         full pattern cutting
+      -a, --cut-all         full pattern cutting (ignored with POS tagging)
       -n, --no-hmm          don't use the Hidden Markov Model
       -q, --quiet           don't print loading messages to stderr
       -V, --version         show program's version number and exit
diff --git a/jieba/__main__.py b/jieba/__main__.py
index 1d24905..2467ef4 100644
--- a/jieba/__main__.py
+++ b/jieba/__main__.py
@@ -8,12 +8,14 @@ parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable,
 parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
                     nargs='?', const=' ',
                     help="use DELIM instead of ' / ' for word delimiter; or a space if it is used without DELIM")
+parser.add_argument("-p", "--pos", metavar="DELIM", nargs='?', const='_',
+                    help="enable POS tagging; if DELIM is specified, use DELIM instead of '_' for POS delimiter")
 parser.add_argument("-D", "--dict", help="use DICT as dictionary")
 parser.add_argument("-u", "--user-dict",
                     help="use USER_DICT together with the default dictionary or DICT (if specified)")
 parser.add_argument("-a", "--cut-all",
                     action="store_true", dest="cutall", default=False,
-                    help="full pattern cutting")
+                    help="full pattern cutting (ignored with POS tagging)")
 parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
                     default=True, help="don't use the Hidden Markov Model")
 parser.add_argument("-q", "--quiet", action="store_true", default=False,
@@ -26,6 +28,15 @@ args = parser.parse_args()
 
 if args.quiet:
     jieba.setLogLevel(60)
+if args.pos is not None:  # -p '' is a valid request: tag with an empty delimiter
+    import jieba.posseg  # imported lazily; only needed when -p is given
+    posdelim = text_type(args.pos)  # normalize like the word delimiter below
+    def cutfunc(sentence, _, HMM=True):  # 2nd arg (cut_all) accepted but unused
+        for w, f in jieba.posseg.cut(sentence, HMM):
+            yield w + posdelim + f  # emit "word<posdelim>flag" tokens
+else:
+    cutfunc = jieba.cut  # plain segmentation; called the same way as cutfunc above
+
 delim = text_type(args.delimiter)
 cutall = args.cutall
 hmm = args.hmm
@@ -41,7 +52,7 @@ if args.user_dict:
 ln = fp.readline()
 while ln:
     l = ln.rstrip('\r\n')
-    result = delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm))
+    result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm))
     if PY2:
         result = result.encode(default_encoding)
     print(result)