From d82d2c18df20342c88291d2a94093bc51bc5eee0 Mon Sep 17 00:00:00 2001 From: walkskyer Date: Thu, 13 Nov 2014 22:26:22 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=BA=E5=85=B3=E9=94=AE=E5=AD=97=E6=8F=90?= =?UTF-8?q?=E5=8F=96=E5=87=BD=E6=95=B0=E5=A2=9E=E5=8A=A0=E8=AF=8D=E6=80=A7?= =?UTF-8?q?=E8=BF=87=E6=BB=A4=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- jieba/analyse/__init__.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index af36149..c8a996f 100755 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -1,5 +1,6 @@ #encoding=utf-8 import jieba +import jieba.posseg import os from operator import itemgetter try: @@ -58,21 +59,31 @@ def set_stop_words(stop_words_path): for line in lines: STOP_WORDS.add(line) -def extract_tags(sentence, topK=20, withWeight=False): +def extract_tags(sentence, topK=20, withWeight=False, allowPOS=[]): """ Extract keywords from sentence using TF-IDF algorithm. Parameter: - topK: return how many top keywords. `None` for all possible words. - withWeight: if True, return a list of (word, weight); if False, return a list of words. + - allowPOS: the allowed POS list eg. ['n']. + if the POS of w is not in this list,it will be filtered. """ global STOP_WORDS, idf_loader idf_freq, median_idf = idf_loader.get_idf() - words = jieba.cut(sentence) + if allowPOS: + words = jieba.posseg.cut(sentence) + else: + words = jieba.cut(sentence) freq = {} for w in words: + if allowPOS: + if w.flag not in allowPOS: + continue + else: + w = w.word if len(w.strip()) < 2 or w.lower() in STOP_WORDS: continue freq[w] = freq.get(w, 0.0) + 1.0