From 45591bb9ab484f8e8fb1e047f0b17f4d538ab1ee Mon Sep 17 00:00:00 2001 From: fxsjy Date: Fri, 12 Apr 2013 21:53:03 +0800 Subject: [PATCH 1/3] support flag '_'; ignore white space --- jieba/__init__.py | 5 +++-- jieba/posseg/__init__.py | 2 +- test/test_userdict.py | 10 ++++++++++ test/userdict.txt | 6 ++++-- 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/jieba/__init__.py b/jieba/__init__.py index 9aaccd7..faf52a4 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -153,7 +153,7 @@ def cut(sentence,cut_all=False): sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile(ur"(\s+)") + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)") if cut_all: re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]") blocks = re_han.split(sentence) @@ -169,7 +169,8 @@ def cut(sentence,cut_all=False): tmp = re_skip.split(blk) for x in tmp: if re_skip.match(x): - yield x + if x!=' ': + yield x else: for xx in x: yield xx diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 09df9e4..48b0488 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -125,7 +125,7 @@ def cut(sentence): sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile(ur"(\s+)") + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)") re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+") blocks = re_han.split(sentence) for blk in blocks: diff --git a/test/test_userdict.py b/test/test_userdict.py index aac4730..4b62d42 100644 --- a/test/test_userdict.py +++ b/test/test_userdict.py @@ -14,3 +14,13 @@ result = pseg.cut(test_sent) for w in result: print w.word, "/", w.flag, ", ", + +print "\n========" + +terms = jieba.cut('easy_install is great') +for t in terms: + print t +print '-------------------------' +terms = jieba.cut('python 的正则表达式是好用的') +for t in terms: + print t diff --git a/test/userdict.txt b/test/userdict.txt index 56e1f3d..4632566 100644 --- a/test/userdict.txt +++ b/test/userdict.txt @@ -1,3 +1,5 @@ -云计算 5 +云计算 5 李小福 2 nr -创新办 3 i \ No newline at end of file +创新办 3 i +easy_install 3 eng +好用 300 \ No newline at end of file From 012fddf13f69a1546747e5c5f345a7bd29150841 Mon Sep 17 00:00:00 2001 From: Sun Junyi Date: Fri, 12 Apr 2013 22:37:53 +0800 Subject: [PATCH 2/3] ignore white space --- jieba/__init__.py | 2 +- jieba/posseg/__init__.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/jieba/__init__.py b/jieba/__init__.py index faf52a4..5a17d5b 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -169,7 +169,7 @@ def cut(sentence,cut_all=False): tmp = re_skip.split(blk) for x in tmp: if re_skip.match(x): - if x!=' ': + if x.strip(' ')!='': yield x else: for xx in x: diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 48b0488..fbc791e 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -136,7 +136,8 @@ def cut(sentence): tmp = re_skip.split(blk) for x in tmp: if re_skip.match(x): - yield pair(x,'') + if x.strip(' ')!='': + yield pair(x,'') else: for xx in x: if re_num.match(xx): From 8d89e8afda4bd1d547f5bf6202c09fe6c500658b Mon Sep 17 00:00:00 2001 From: Sun Junyi Date: Fri, 19 Apr 2013 10:02:33 +0800 Subject: [PATCH 3/3] =?UTF-8?q?handle=20=E7=9A=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- jieba/dict.txt | 5 +++-- test/test.py | 3 ++- test/test_pos.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/jieba/dict.txt b/jieba/dict.txt index f4b81ef..3fbcb4a 100644 --- a/jieba/dict.txt +++ b/jieba/dict.txt @@ -245161,10 +245161,9 @@ 皂隶 96 n 皂靴 10 n 皂鞋 3 n -的 3188252 uj +的 318825 uj 的一确二 3 l 的士高 3 n -的歌者 3 n 的的确确 64 d 的确 2135 d 的确如此 31 l @@ -245174,6 +245173,8 @@ 的里雅斯特 23 ns 的里雅斯特市 3 ns 的黎波里 62 ns +的哥 63 n +的士 20 n 皆 7511 d 皆佳 3 nrt 皆准 3 i diff --git a/test/test.py b/test/test.py index ea8595f..86ade7d 100644 --- a/test/test.py +++ b/test/test.py @@ -93,4 +93,5 @@ if __name__ == "__main__": cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站') cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') cuttest('AT&T是一件不错的公司,给你发offer了吗?') - cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') \ No newline at end of file + cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') + cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') \ No newline at end of file diff --git a/test/test_pos.py b/test/test_pos.py index 5e2862f..7f6a7f3 100644 --- a/test/test_pos.py +++ b/test/test_pos.py @@ -93,4 +93,5 @@ if __name__ == "__main__": cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站') cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') cuttest('AT&T是一件不错的公司,给你发offer了吗?') - cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') \ No newline at end of file + cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') + cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') \ No newline at end of file