improve extract_tags; unify extract_tags and textrank

pull/195/head
Dingyuan Wang 10 years ago
parent e3f3dcccba
commit 751ff35eb5

@ -153,17 +153,16 @@ jieba.analyse.textrank(raw_text)
来自`__main__`的示例结果:
```
吉林 100.0
欧亚 86.4592606421
置业 55.3262889963
实现 52.0353476663
收入 37.9475518129
增资 35.5042189944
子公司 34.9286032861
全资 30.8154823412
城市 30.6031961172
商业 30.4779050167
吉林 1.0
欧亚 0.864834432786
置业 0.553465925497
实现 0.520660869531
收入 0.379699688954
增资 0.355086023683
子公司 0.349758490263
全资 0.308537396283
城市 0.306103738053
商业 0.304837414946
```
4) : 词性标注

@ -1,6 +1,7 @@
#encoding=utf-8
import jieba
import os
from operator import itemgetter
try:
from analyzer import ChineseAnalyzer
except ImportError:
@ -26,13 +27,11 @@ class IDFLoader:
if self.path != new_idf_path:
content = open(new_idf_path, 'rb').read().decode('utf-8')
idf_freq = {}
lines = content.split('\n')
if lines and not lines[-1]:
lines.pop(-1)
lines = content.rstrip('\n').split('\n')
for line in lines:
word, freq = line.split(' ')
idf_freq[word] = float(freq)
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
median_idf = sorted(idf_freq.values())[len(idf_freq)//2]
self.idf_freq = idf_freq
self.median_idf = median_idf
self.path = new_idf_path
@ -60,27 +59,32 @@ def set_stop_words(stop_words_path):
STOP_WORDS.add(line)
def extract_tags(sentence, topK=20, withWeight=False):
    """
    Extract keywords from sentence using the TF-IDF algorithm.

    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
    """
    global STOP_WORDS, idf_loader
    idf_freq, median_idf = idf_loader.get_idf()

    words = jieba.cut(sentence)
    freq = {}
    for w in words:
        # Skip whitespace/single-character tokens and stop words.
        if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    # Turn raw counts into TF-IDF scores in place; a word missing from the
    # IDF table falls back to the median IDF value.
    for k in freq:
        freq[k] *= idf_freq.get(k, median_idf) / total

    if withWeight:
        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(freq, key=freq.__getitem__, reverse=True)
    # NOTE(review): `if topK` treats topK=0 the same as None (returns all
    # tags) -- preserved as-is since callers may rely on it.
    if topK:
        return tags[:topK]
    else:
        return tags

@ -1,9 +1,10 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import jieba.posseg as pseg
import collections
import sys
import collections
from operator import itemgetter
import jieba.posseg as pseg
class UndirectWeightedGraph:
d = 0.85
@ -41,17 +42,25 @@ class UndirectWeightedGraph:
max_rank = w
for n, w in ws.items():
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) * 100
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
return ws
def textrank(raw, topk=10):
def textrank(sentence, topK=10, withWeight=False):
"""
Extract keywords from sentence using TextRank algorithm.
Parameter:
- topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight);
if False, return a list of words.
"""
pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
g = UndirectWeightedGraph()
cm = collections.defaultdict(int)
span = 5
words = [x for x in pseg.cut(raw)]
words = list(pseg.cut(sentence))
for i in xrange(len(words)):
if words[i].flag in pos_filt:
for j in xrange(i + 1, i + span):
@ -65,10 +74,16 @@ def textrank(raw, topk=10):
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
nrs = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
return nrs[:topk]
if withWeight:
tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
return tags[:topK]
else:
return tags
if __name__ == '__main__':
s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
for x, w in textrank(s):
for x, w in textrank(s, withWeight=True):
print x, w

@ -19,25 +19,22 @@ PrevStatus = {
}
def load_model():
    """
    Load the pre-trained HMM parameter tables (marshal-serialized files
    located next to this module).

    Returns:
        (start_p, trans_p, emit_p): initial-state, transition and emission
        probability tables.
    """
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
    # Open in binary mode: marshal.load reads raw bytes, and text mode
    # would corrupt the stream on some platforms.
    with open(abs_path, 'rb') as f:
        start_p = marshal.load(f)

    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
    with open(abs_path, 'rb') as f:
        trans_p = marshal.load(f)

    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
    with open(abs_path, 'rb') as f:
        emit_p = marshal.load(f)

    return start_p, trans_p, emit_p

@ -25,27 +25,24 @@ def load_model(f_name, isJython=True):
continue
word, _, tag = line.split(' ')
result[word.decode('utf-8')] = tag
f.closed
if not isJython:
return result
start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
with open(abs_path, mode='r') as f:
with open(abs_path, 'rb') as f:
start_p = marshal.load(f)
f.closed
trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
with open(abs_path, 'r') as f:
with open(abs_path, 'rb') as f:
trans_p = marshal.load(f)
f.closed
emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
with open(abs_path, 'r') as f:
with open(abs_path, 'rb') as f:
emit_p = marshal.load(f)
f.closed
state = {}
abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)

Loading…
Cancel
Save