Merge pull request #303 from jerryday/master

add a withFlag param to extract_tags
9 years ago · 093980647b
parent f73a2183a5 e5e41a4aad
commit 093980647b
3 changed files with 27 additions and 7 deletions
--- a/jieba/analyse/textrank.py
+++ b/jieba/analyse/textrank.py
@@ -44,7 +44,7 @@ class UndirectWeightedGraph:
        for w in itervalues(ws):
            if w < min_rank:
                min_rank = w
-            elif w > max_rank:
+            if w > max_rank:
                max_rank = w

        for n, w in ws.items():
@@ -66,7 +66,7 @@ class TextRank(KeywordExtractor):
        return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2
                and wp.word.lower() not in self.stop_words)

-    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')):
+    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
@@ -75,6 +75,8 @@ class TextRank(KeywordExtractor):
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
+            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
+                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
@@ -87,6 +89,9 @@ class TextRank(KeywordExtractor):
                        break
                    if not self.pairfilter(words[j]):
                        continue
+                    if allowPOS and withFlag:
+                        cm[(wp, words[j])] += 1
+                    else:
                        cm[(wp.word, words[j].word)] += 1

        for terms, w in cm.items():
@@ -96,6 +101,7 @@ class TextRank(KeywordExtractor):
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
+
        if topK:
            return tags[:topK]
        else:
--- a/jieba/analyse/tfidf.py
+++ b/jieba/analyse/tfidf.py
@@ -72,7 +72,7 @@ class TFIDF(KeywordExtractor):
        self.idf_loader.set_new_path(new_abs_path)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf()

-    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=()):
+    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
        """
        Extract keywords from sentence using TF-IDF algorithm.
        Parameter:
@@ -81,6 +81,9 @@ class TFIDF(KeywordExtractor):
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
                        if the POS of w is not in this list,it will be filtered.
+            - withFlag: only work with allowPOS is not empty.
+                        if True, return a list of pair(word, weight) like posseg.cut
+                        if False, return a list of words
        """
        if allowPOS:
            allowPOS = frozenset(allowPOS)
@@ -92,14 +95,16 @@ class TFIDF(KeywordExtractor):
            if allowPOS:
                if w.flag not in allowPOS:
                    continue
-                else:
+                elif not withFlag:
                    w = w.word
-            if len(w.strip()) < 2 or w.lower() in self.stop_words:
+            wc = w.word if allowPOS and withFlag else w
+            if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
                continue
            freq[w] = freq.get(w, 0.0) + 1.0
        total = sum(freq.values())
        for k in freq:
-            freq[k] *= self.idf_freq.get(k, self.median_idf) / total
+            kw = k.word if allowPOS and withFlag else k
+            freq[k] *= self.idf_freq.get(kw, self.median_idf) / total

        if withWeight:
            tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -62,6 +62,15 @@ class pair(object):
    def __iter__(self):
        return iter((self.word, self.flag))

+    def __lt__(self, other):
+        return self.word < other.word
+
+    def __eq__(self, other):
+        return isinstance(other, pair) and self.word == other.word and self.flag == other.flag
+
+    def __hash__(self):
+        return hash(self.word)
+
    def encode(self, arg):
        return self.__unicode__().encode(arg)