Handle mixed English-Chinese words in POS tagging

12 years ago · 06ebc6f71c
parent a8ae0398b4
commit 06ebc6f71c
2 changed files with 27 additions and 5 deletions
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -3,6 +3,7 @@ import os
 import viterbi
 import jieba
 import sys
+
 default_encoding = sys.getfilesystemencoding()

 def load_model(f_name):
@@ -60,10 +61,31 @@ def __cut(sentence):
 	if next<len(sentence):
 		yield pair(sentence[next:], pos_list[next][1] )

+def __cut_detail(sentence):  # generator: yields tagged pairs for a buffer that may mix Han runs with ASCII letters/digits
+	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\r\n]")  # re_han: captured run of U+4E00..U+9FA5; re_skip: any single char that is not an ASCII letter, digit, '+', '#', CR or LF
+	re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")  # re_eng: run of letters, '+' or '#'; re_num: run of digits
+	blocks = re_han.split(sentence)  # the capture group in re_han keeps the Han runs as items of the split result
+	for blk in blocks:
+		if re_han.match(blk):  # block starts with a Han character: hand it to __cut
+				for word in __cut(blk):
+					yield word
+		else:
+			tmp = re_skip.split(blk)  # re_skip has no capture group, so the separator characters themselves are discarded
+			for x in tmp:
+				if x!="":  # split() produces empty strings around adjacent/leading/trailing separators
+					if re_num.match(x):  # match() anchors only at the start: any token beginning with a digit is tagged 'm'
+						yield pair(x,'m')
+					elif re_eng.match(x):  # token beginning with a letter, '+' or '#' is tagged 'eng'
+						yield pair(x,'eng')
+					else:  # remaining case: token begins with CR or LF (the only other chars re_skip leaves in a token)
+						yield pair(x,'x')
+
 def __cut_DAG(sentence):
 	DAG = jieba.get_DAG(sentence)
 	route ={}
+	
 	jieba.calc(sentence,DAG,0,route=route)
+
 	x = 0
 	buf =u''
 	N = len(sentence)
@@ -78,7 +100,7 @@ def __cut_DAG(sentence):
 					yield pair(buf,word_tag_tab.get(buf,'x'))
 					buf=u''
 				else:
-					regognized = __cut(buf)
+					regognized = __cut_detail(buf)
 					for t in regognized:
 						yield t
 					buf=u''
@@ -89,7 +111,7 @@ def __cut_DAG(sentence):
 		if len(buf)==1:
 			yield pair(buf,word_tag_tab.get(buf,'x'))
 		else:
-			regognized = __cut(buf)
+			regognized = __cut_detail(buf)
 			for t in regognized:
 				yield t

@@ -100,10 +122,9 @@ def cut(sentence):
 			sentence = sentence.decode('utf-8')
 		except:
 			sentence = sentence.decode('gbk','ignore')
-	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
+	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
 	re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
 	blocks = re_han.split(sentence)
-
 	for blk in blocks:
 		if re_han.match(blk):
 				for word in __cut_DAG(blk):
--- a/test/test_pos.py
+++ b/test/test_pos.py
@@ -90,4 +90,5 @@ if __name__ == "__main__":
 	cuttest('一次性交多少钱')
 	cuttest('两块五一套，三块八一斤，四块七一本，五块六一条')
 	cuttest('小和尚留了一个像大和尚一样的和尚头')
-	cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
+	cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
+	cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')