Merge pull request #190 from sing1ee/master

add a simple implementation of textrank
pull/192/head
Sun Junyi 10 years ago
commit 4030d8ed86

@ -0,0 +1,87 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import jieba.posseg as pseg
import collections
import sys
class Edge:
def __init__(self, start, end, weight):
self.start = start
self.end = end
self.weight = weight
class UndirectWeightedGraph:
d = 0.85
def __init__(self):
self.graph = collections.defaultdict(list)
def addEdge(self, start, end, weight):
e1 = Edge(start, end, weight)
e2 = Edge(end, start, weight)
self.graph[start].append(e1)
self.graph[end].append(e2)
def rank(self):
ws = collections.defaultdict(float)
outSum = collections.defaultdict(float)
for n, _ in self.graph.items():
ws[n] = 1.0 / len(self.graph)
for n, out in self.graph.items():
os = 0.0
for e in out:
os += e.weight
outSum[n] = os
for x in xrange(10): # 10 iters
for n, inedges in self.graph.items():
s = 0
for e in inedges:
s += e.weight / outSum[e.end] * ws[e.end]
ws[n] = (1 - self.d) + self.d * s
(min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
for _, w in ws.items():
if w < min_rank:
min_rank = w
if w > max_rank:
max_rank = w
for n, w in ws.items():
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) * 100
return ws
def textrank(raw=None, topk=10):
pos_filt = set(['ns', 'n', 'vn', 'v'])
g = UndirectWeightedGraph()
cm = collections.defaultdict(int)
span = 5
words = [x for x in pseg.cut(raw)]
for i in xrange(len(words)):
for j in xrange(i + 1, i + span):
if j >= len(words):
break
if words[i].flag not in pos_filt or words[j].flag not in pos_filt:
continue
cm['%s:%s' % (words[i].word, words[j].word)] += 1
for pair, w in cm.items():
terms = pair.split(':')
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
nrs = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
return nrs[:topk]
if __name__ == '__main__':
for x, w in textrank("此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"):
print x, w
Loading…
Cancel
Save