Fix cut_for_search to use self.FREQ instead of bare FREQ; make pair objects iterable

pull/271/head
Dingyuan Wang 10 years ago
parent 3b76328f2a
commit ceb5c26be4

@@ -200,8 +200,8 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
```pycon ```pycon
>>> import jieba.posseg as pseg >>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门") >>> words = pseg.cut("我爱北京天安门")
>>> for w in words: >>> for word, flag in words:
... print('%s %s' % (w.word, w.flag)) ... print('%s %s' % (word, flag))
... ...
我 r 我 r
爱 v 爱 v

@@ -310,12 +310,12 @@ class Tokenizer(object):
if len(w) > 2: if len(w) > 2:
for i in xrange(len(w) - 1): for i in xrange(len(w) - 1):
gram2 = w[i:i + 2] gram2 = w[i:i + 2]
if FREQ.get(gram2): if self.FREQ.get(gram2):
yield gram2 yield gram2
if len(w) > 3: if len(w) > 3:
for i in xrange(len(w) - 2): for i in xrange(len(w) - 2):
gram3 = w[i:i + 3] gram3 = w[i:i + 3]
if FREQ.get(gram3): if self.FREQ.get(gram3):
yield gram3 yield gram3
yield w yield w

@@ -70,7 +70,7 @@ class pair(object):
return '%s/%s' % (self.word, self.flag) return '%s/%s' % (self.word, self.flag)
def __repr__(self): def __repr__(self):
return self.__str__() return 'pair(%r, %r)' % (self.word, self.flag)
def __str__(self): def __str__(self):
if PY2: if PY2:
@@ -78,6 +78,9 @@ class pair(object):
else: else:
return self.__unicode__() return self.__unicode__()
def __iter__(self):
return iter((self.word, self.flag))
def encode(self, arg): def encode(self, arg):
return self.__unicode__().encode(arg) return self.__unicode__().encode(arg)

@@ -62,8 +62,8 @@ print('4. 词性标注')
print('-'*40) print('-'*40)
words = jieba.posseg.cut("我爱北京天安门") words = jieba.posseg.cut("我爱北京天安门")
for w in words: for word, flag in words:
print('%s %s' % (w.word, w.flag)) print('%s %s' % (word, flag))
print('='*40) print('='*40)
print('6. Tokenize: 返回词语在原文的起止位置') print('6. Tokenize: 返回词语在原文的起止位置')

@@ -6,8 +6,8 @@ import jieba.posseg as pseg
def cuttest(test_sent): def cuttest(test_sent):
result = pseg.cut(test_sent) result = pseg.cut(test_sent)
for w in result: for word, flag in result:
print(w.word, "/", w.flag, ", ", end=' ') print(word, "/", flag, ", ", end=' ')
print("") print("")

@@ -5,9 +5,9 @@ sys.path.append("../")
import jieba.posseg as pseg import jieba.posseg as pseg
def cuttest(test_sent): def cuttest(test_sent):
result = pseg.cut(test_sent,HMM=False) result = pseg.cut(test_sent, HMM=False)
for w in result: for word, flag in result:
print(w.word, "/", w.flag, ", ", end=' ') print(word, "/", flag, ", ", end=' ')
print("") print("")

Loading…
Cancel
Save