Merge pull request #303 from jerryday/master

add a withFlag param to extract_tags
pull/315/head
Sun Junyi 9 years ago
commit 093980647b

@ -44,7 +44,7 @@ class UndirectWeightedGraph:
for w in itervalues(ws): for w in itervalues(ws):
if w < min_rank: if w < min_rank:
min_rank = w min_rank = w
elif w > max_rank: if w > max_rank:
max_rank = w max_rank = w
for n, w in ws.items(): for n, w in ws.items():
@ -66,7 +66,7 @@ class TextRank(KeywordExtractor):
return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2 return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2
and wp.word.lower() not in self.stop_words) and wp.word.lower() not in self.stop_words)
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')): def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
""" """
Extract keywords from sentence using TextRank algorithm. Extract keywords from sentence using TextRank algorithm.
Parameter: Parameter:
@ -75,6 +75,8 @@ class TextRank(KeywordExtractor):
if False, return a list of words. if False, return a list of words.
- allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v']. - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
if the POS of w is not in this list, it will be filtered. if the POS of w is not in this list, it will be filtered.
- withFlag: if True, return a list of pair(word, flag) objects, like posseg.cut;
if False, return a list of words.
""" """
self.pos_filt = frozenset(allowPOS) self.pos_filt = frozenset(allowPOS)
g = UndirectWeightedGraph() g = UndirectWeightedGraph()
@ -87,6 +89,9 @@ class TextRank(KeywordExtractor):
break break
if not self.pairfilter(words[j]): if not self.pairfilter(words[j]):
continue continue
if allowPOS and withFlag:
cm[(wp, words[j])] += 1
else:
cm[(wp.word, words[j].word)] += 1 cm[(wp.word, words[j].word)] += 1
for terms, w in cm.items(): for terms, w in cm.items():
@ -96,6 +101,7 @@ class TextRank(KeywordExtractor):
tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True) tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else: else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True) tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK: if topK:
return tags[:topK] return tags[:topK]
else: else:

@ -72,7 +72,7 @@ class TFIDF(KeywordExtractor):
self.idf_loader.set_new_path(new_abs_path) self.idf_loader.set_new_path(new_abs_path)
self.idf_freq, self.median_idf = self.idf_loader.get_idf() self.idf_freq, self.median_idf = self.idf_loader.get_idf()
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=()): def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
""" """
Extract keywords from sentence using TF-IDF algorithm. Extract keywords from sentence using TF-IDF algorithm.
Parameter: Parameter:
@ -81,6 +81,9 @@ class TFIDF(KeywordExtractor):
if False, return a list of words. if False, return a list of words.
- allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr']. - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
if the POS of w is not in this list,it will be filtered. if the POS of w is not in this list,it will be filtered.
- withFlag: only takes effect when allowPOS is not empty.
if True, return a list of pair(word, flag) objects, like posseg.cut;
if False, return a list of words.
""" """
if allowPOS: if allowPOS:
allowPOS = frozenset(allowPOS) allowPOS = frozenset(allowPOS)
@ -92,14 +95,16 @@ class TFIDF(KeywordExtractor):
if allowPOS: if allowPOS:
if w.flag not in allowPOS: if w.flag not in allowPOS:
continue continue
else: elif not withFlag:
w = w.word w = w.word
if len(w.strip()) < 2 or w.lower() in self.stop_words: wc = w.word if allowPOS and withFlag else w
if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
continue continue
freq[w] = freq.get(w, 0.0) + 1.0 freq[w] = freq.get(w, 0.0) + 1.0
total = sum(freq.values()) total = sum(freq.values())
for k in freq: for k in freq:
freq[k] *= self.idf_freq.get(k, self.median_idf) / total kw = k.word if allowPOS and withFlag else k
freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
if withWeight: if withWeight:
tags = sorted(freq.items(), key=itemgetter(1), reverse=True) tags = sorted(freq.items(), key=itemgetter(1), reverse=True)

@ -62,6 +62,15 @@ class pair(object):
def __iter__(self): def __iter__(self):
return iter((self.word, self.flag)) return iter((self.word, self.flag))
def __lt__(self, other):
return self.word < other.word
def __eq__(self, other):
return isinstance(other, pair) and self.word == other.word and self.flag == other.flag
def __hash__(self):
return hash(self.word)
def encode(self, arg): def encode(self, arg):
return self.__unicode__().encode(arg) return self.__unicode__().encode(arg)

Loading…
Cancel
Save