From d82d2c18df20342c88291d2a94093bc51bc5eee0 Mon Sep 17 00:00:00 2001
From: walkskyer <walkskyer@qq.com>
Date: Thu, 13 Nov 2014 22:26:22 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=BA=E5=85=B3=E9=94=AE=E5=AD=97=E6=8F=90?=
 =?UTF-8?q?=E5=8F=96=E5=87=BD=E6=95=B0=E5=A2=9E=E5=8A=A0=E8=AF=8D=E6=80=A7?=
 =?UTF-8?q?=E8=BF=87=E6=BB=A4=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 jieba/analyse/__init__.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py
index af36149..c8a996f 100755
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -1,5 +1,6 @@
 #encoding=utf-8
 import jieba
+import jieba.posseg
 import os
 from operator import itemgetter
 try:
@@ -58,21 +59,31 @@ def set_stop_words(stop_words_path):
     for line in lines:
         STOP_WORDS.add(line)
 
-def extract_tags(sentence, topK=20, withWeight=False):
+def extract_tags(sentence, topK=20, withWeight=False, allowPOS=[]):
     """
     Extract keywords from sentence using TF-IDF algorithm.
     Parameter:
         - topK: return how many top keywords. `None` for all possible words.
         - withWeight: if True, return a list of (word, weight);
                       if False, return a list of words.
+        - allowPOS: the allowed POS list, e.g. ['n'].
+                    if the POS of a word is not in this list, it will be filtered out.
     """
     global STOP_WORDS, idf_loader
 
     idf_freq, median_idf = idf_loader.get_idf()
 
-    words = jieba.cut(sentence)
+    if allowPOS:
+        words = jieba.posseg.cut(sentence)
+    else:
+        words = jieba.cut(sentence)
     freq = {}
     for w in words:
+        if allowPOS:
+            if w.flag not in allowPOS:
+                continue
+            else:
+                w = w.word
         if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
             continue
         freq[w] = freq.get(w, 0.0) + 1.0