diff --git a/jieba/__init__.py b/jieba/__init__.py index 2bc5b41..0b1dcad 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -157,7 +157,7 @@ def cut(sentence,cut_all=False): except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile("(\s+)") + re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)") if cut_all: re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]") @@ -175,7 +175,8 @@ def cut(sentence,cut_all=False): tmp = re_skip.split(blk) for x in tmp: if re_skip.match(x): - yield x + if x.strip(' ')!='': + yield x else: for xx in x: yield xx diff --git a/jieba/dict.txt b/jieba/dict.txt index f4b81ef..3fbcb4a 100644 --- a/jieba/dict.txt +++ b/jieba/dict.txt @@ -245161,10 +245161,9 @@ 皂隶 96 n 皂靴 10 n 皂鞋 3 n -的 3188252 uj +的 318825 uj 的一确二 3 l 的士高 3 n -的歌者 3 n 的的确确 64 d 的确 2135 d 的确如此 31 l @@ -245174,6 +245173,8 @@ 的里雅斯特 23 ns 的里雅斯特市 3 ns 的黎波里 62 ns +的哥 63 n +的士 20 n 皆 7511 d 皆佳 3 nrt 皆准 3 i diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index d4e93cf..c1933f2 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -129,7 +129,7 @@ def cut(sentence): except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile("(\s+)") + re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)") re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") blocks = re_han.split(sentence) @@ -141,7 +141,8 @@ def cut(sentence): tmp = re_skip.split(blk) for x in tmp: if re_skip.match(x): - yield pair(x,'') + if x.strip(' ')!='': + yield pair(x,'') else: for xx in x: if re_num.match(xx): diff --git a/test/test.py b/test/test.py index 8fed235..e69a218 100644 --- a/test/test.py +++ b/test/test.py @@ -92,3 +92,4 @@ if __name__ == "__main__": cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') cuttest('AT&T是一件不错的公司,给你发offer了吗?') cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') + cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') diff --git a/test/test_pos.py b/test/test_pos.py index 01b77e0..1268321 100644 --- a/test/test_pos.py +++ b/test/test_pos.py @@ -94,4 +94,4 @@ if __name__ == "__main__": cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') cuttest('AT&T是一件不错的公司,给你发offer了吗?') cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') - + cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') diff --git a/test/test_userdict.py b/test/test_userdict.py index 4e7b49b..f94be41 100644 --- a/test/test_userdict.py +++ b/test/test_userdict.py @@ -14,3 +14,13 @@ result = pseg.cut(test_sent) for w in result: print(w.word, "/", w.flag, ", ") + +print("\n========") + +terms = jieba.cut('easy_install is great') +for t in terms: + print(t) +print('-------------------------') +terms = jieba.cut('python 的正则表达式是好用的') +for t in terms: + print(t) diff --git a/test/userdict.txt b/test/userdict.txt index 56e1f3d..4632566 100644 --- a/test/userdict.txt +++ b/test/userdict.txt @@ -1,3 +1,5 @@ -云计算 5 +云计算 5 李小福 2 nr -创新办 3 i \ No newline at end of file +创新办 3 i +easy_install 3 eng +好用 300 \ No newline at end of file