diff --git a/jieba/__init__.py b/jieba/__init__.py
index 62183a9..45dc908 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -40,7 +40,9 @@ re_eng = re.compile('[a-zA-Z0-9]', re.U)
 
 # \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
 # \r\n|\s : whitespace characters. Will not be handled.
-re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
+# "-" is also accepted, so a run such as "a-b" is captured as one re_han_default match.
+# It sits last in the character class, where it is literal and needs no backslash.
+re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%-]+)", re.U)
 re_skip_default = re.compile("(\r\n|\s)", re.U)
 re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
 re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)