From ceb5c26be4403ca4e10b1d40f81f6cd7cbfe64d8 Mon Sep 17 00:00:00 2001 From: Dingyuan Wang Date: Mon, 1 Jun 2015 14:36:38 +0800 Subject: [PATCH] fix self.FREQ in cut_for_search; make pair object iterable --- README.md | 4 ++-- jieba/__init__.py | 4 ++-- jieba/posseg/__init__.py | 5 ++++- test/demo.py | 4 ++-- test/test_pos.py | 4 ++-- test/test_pos_no_hmm.py | 6 +++--- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 3e33739..ef4e705 100644 --- a/README.md +++ b/README.md @@ -200,8 +200,8 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py ```pycon >>> import jieba.posseg as pseg >>> words = pseg.cut("我爱北京天安门") ->>> for w in words: -... print('%s %s' % (w.word, w.flag)) +>>> for word, flag in words: +... print('%s %s' % (word, flag)) ... 我 r 爱 v diff --git a/jieba/__init__.py b/jieba/__init__.py index 351c8af..6dfc23a 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -310,12 +310,12 @@ class Tokenizer(object): if len(w) > 2: for i in xrange(len(w) - 1): gram2 = w[i:i + 2] - if FREQ.get(gram2): + if self.FREQ.get(gram2): yield gram2 if len(w) > 3: for i in xrange(len(w) - 2): gram3 = w[i:i + 3] - if FREQ.get(gram3): + if self.FREQ.get(gram3): yield gram3 yield w diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 3133233..749ef94 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -70,7 +70,7 @@ class pair(object): return '%s/%s' % (self.word, self.flag) def __repr__(self): - return self.__str__() + return 'pair(%r, %r)' % (self.word, self.flag) def __str__(self): if PY2: @@ -78,6 +78,9 @@ class pair(object): else: return self.__unicode__() + def __iter__(self): + return iter((self.word, self.flag)) + def encode(self, arg): return self.__unicode__().encode(arg) diff --git a/test/demo.py b/test/demo.py index 6ebb159..49d60ec 100644 --- a/test/demo.py +++ b/test/demo.py @@ -62,8 +62,8 @@ print('4. 词性标注') print('-'*40) words = jieba.posseg.cut("我爱北京天安门") -for w in words: - print('%s %s' % (w.word, w.flag)) +for word, flag in words: + print('%s %s' % (word, flag)) print('='*40) print('6. Tokenize: 返回词语在原文的起止位置') diff --git a/test/test_pos.py b/test/test_pos.py index ee2c18f..9f1682b 100644 --- a/test/test_pos.py +++ b/test/test_pos.py @@ -6,8 +6,8 @@ import jieba.posseg as pseg def cuttest(test_sent): result = pseg.cut(test_sent) - for w in result: - print(w.word, "/", w.flag, ", ", end=' ') + for word, flag in result: + print(word, "/", flag, ", ", end=' ') print("") diff --git a/test/test_pos_no_hmm.py b/test/test_pos_no_hmm.py index 7d9c1d5..2dd555a 100644 --- a/test/test_pos_no_hmm.py +++ b/test/test_pos_no_hmm.py @@ -5,9 +5,9 @@ sys.path.append("../") import jieba.posseg as pseg def cuttest(test_sent): - result = pseg.cut(test_sent,HMM=False) - for w in result: - print(w.word, "/", w.flag, ", ", end=' ') + result = pseg.cut(test_sent, HMM=False) + for word, flag in result: + print(word, "/", flag, ", ", end=' ') print("")