Fix cut_for_search; improve the pair object
@@ -200,8 +200,8 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
 ```pycon
 >>> import jieba.posseg as pseg
 >>> words = pseg.cut("我爱北京天安门")
->>> for w in words:
-...    print('%s %s' % (w.word, w.flag))
+>>> for word, flag in words:
+...    print('%s %s' % (word, flag))
 ...
 我 r
 爱 v
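The tuple unpacking in the updated README loop works because `pair` now implements `__iter__`; see the `posseg` hunk further down.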
@@ -310,12 +310,12 @@ class Tokenizer(object):
             if len(w) > 2:
                 for i in xrange(len(w) - 1):
                     gram2 = w[i:i + 2]
-                    if FREQ.get(gram2):
+                    if self.FREQ.get(gram2):
                         yield gram2
             if len(w) > 3:
                 for i in xrange(len(w) - 2):
                     gram3 = w[i:i + 3]
-                    if FREQ.get(gram3):
+                    if self.FREQ.get(gram3):
                         yield gram3
             yield w
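After the refactor into the `Tokenizer` class, the frequency dict became an instance attribute, so the bare `FREQ` in `cut_for_search` no longer resolves to it; qualifying it as `self.FREQ` is the fix. A quick smoke test for the repaired path (a minimal sketch, assuming jieba is installed; the exact segmentation depends on the shipped dictionary):

```python
import jieba

# cut_for_search yields each coarse-cut word plus any 2- and
# 3-character sub-words that appear in the frequency dict.
print('/'.join(jieba.cut_for_search("小明硕士毕业于中国科学院计算所")))
```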
@@ -70,7 +70,7 @@ class pair(object):
         return '%s/%s' % (self.word, self.flag)
 
     def __repr__(self):
-        return self.__str__()
+        return 'pair(%r, %r)' % (self.word, self.flag)
 
     def __str__(self):
         if PY2:
@@ -78,6 +78,9 @@ class pair(object):
         else:
             return self.__unicode__()
 
+    def __iter__(self):
+        return iter((self.word, self.flag))
+
     def encode(self, arg):
         return self.__unicode__().encode(arg)
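With `__iter__` in place, a `pair` unpacks like a `(word, flag)` tuple, which is what lets the README example and the call sites below drop the `w.word` / `w.flag` attribute access; the new `__repr__` also prints a form that shows both fields. A standalone sketch of the patched behaviour (a trimmed copy of the class for illustration; the real one lives in `jieba.posseg`):

```python
class pair(object):
    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    def __repr__(self):
        # New repr: shows both fields instead of delegating to __str__
        return 'pair(%r, %r)' % (self.word, self.flag)

    def __iter__(self):
        # New: lets callers write `for word, flag in pairs`
        return iter((self.word, self.flag))

p = pair('北京', 'ns')
word, flag = p        # tuple-style unpacking now works
print(word, flag)     # 北京 ns
print(repr(p))        # pair('北京', 'ns')
```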
@@ -62,8 +62,8 @@ print('4. 词性标注')
 print('-'*40)
 
 words = jieba.posseg.cut("我爱北京天安门")
-for w in words:
-    print('%s %s' % (w.word, w.flag))
+for word, flag in words:
+    print('%s %s' % (word, flag))
 
 print('='*40)
 print('6. Tokenize: 返回词语在原文的起止位置')
@@ -6,8 +6,8 @@ import jieba.posseg as pseg
 def cuttest(test_sent):
     result = pseg.cut(test_sent)
-    for w in result:
-        print(w.word, "/", w.flag, ", ", end=' ')
+    for word, flag in result:
+        print(word, "/", flag, ", ", end=' ')
     print("")
@@ -5,9 +5,9 @@ sys.path.append("../")
 import jieba.posseg as pseg
-result = pseg.cut(test_sent,HMM=False)
+result = pseg.cut(test_sent, HMM=False)