use regex and fix encoding related issues in load_userdict

pull/307/head
Dingyuan Wang 9 years ago
parent 1c33252fce
commit 99d0fb1a8a

@@ -89,8 +89,8 @@ print(", ".join(seg_list))
### Load dictionary
* Developers can specify their own custom dictionary to include words that are not in the jieba dictionary. Although jieba can recognize new words, adding them yourself ensures a higher accuracy.
* Usage: jieba.load_userdict(file_name) # file_name is the path of the custom dictionary
* The dictionary format is the same as that of `dict.txt`: one word per line; each line has three parts separated by spaces: word, word frequency (may be omitted), and POS tag (may be omitted), in that fixed order.
* Usage: jieba.load_userdict(file_name) # file_name is a file-like object or the path of the custom dictionary
* The dictionary format is the same as that of `dict.txt`: one word per line; each line has three parts separated by spaces: word, word frequency (may be omitted), and POS tag (may be omitted), in that fixed order. If `file_name` is a path or a file opened in binary mode, the file must be UTF-8 encoded.
* When the word frequency is omitted, an automatically calculated value that ensures the word can be segmented out is used.
**For example:**
@@ -521,8 +521,8 @@ Output:
### Load dictionary
* Developers can specify their own custom dictionary to be included in the jieba default dictionary. Jieba is able to identify new words, but adding your own new words can ensure a higher accuracy.
* Usage `jieba.load_userdict(file_name) # file_name is the path of the custom dictionary`
* The dictionary format is the same as that of `dict.txt`: one word per line; each line is divided into three parts separated by a space: word, word frequency, POS tag.
* Usage `jieba.load_userdict(file_name)` # file_name is a file-like object or the path of the custom dictionary
* The dictionary format is the same as that of `dict.txt`: one word per line; each line is divided into three parts separated by a space: word, word frequency, POS tag. If `file_name` is a path or a file opened in binary mode, the dictionary must be UTF-8 encoded.
* Both the word frequency and the POS tag may be omitted. The word frequency will be filled with a suitable value if omitted.
**For example:**
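The line format described above can be sketched as a small standalone parser (an illustration only, not jieba's implementation; it assumes the word itself contains no spaces, whereas jieba's regex-based loader is more permissive):

```python
def parse_userdict_line(line):
    """Split one dictionary line into (word, freq, tag).

    freq and tag are both optional, so a line may look like
    "word", "word freq", "word tag", or "word freq tag".
    """
    parts = line.strip().split(" ")
    word, freq, tag = parts[0], None, None
    if len(parts) == 2:
        if parts[1].isdigit():
            freq = int(parts[1])   # second field is a frequency
        else:
            tag = parts[1]         # second field is a POS tag
    elif len(parts) > 2:
        freq, tag = int(parts[1]), parts[2]
    return word, freq, tag
```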

@@ -35,6 +35,8 @@ DICT_WRITING = {}
pool = None
re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)
re_eng = re.compile('[a-zA-Z0-9]', re.U)
# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
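The new `re_userdict` pattern extracts the word, an optional frequency, and an optional tag in a single match. Note that the captured frequency and tag keep their leading space, which the loader strips afterwards:

```python
import re

# Same pattern as re_userdict above: a lazily-matched word, then an
# optional " <digits>" frequency, then an optional " <lowercase>" POS tag.
re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)

# Each line yields (word, freq-or-None, tag-or-None); freq/tag still
# carry the separating space at this point.
for line in ('创新办 3 i', '云计算 5', '凱特琳 nz'):
    word, freq, tag = re_userdict.match(line).groups()
    print(word, freq, tag)
```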
@@ -350,6 +352,8 @@ class Tokenizer(object):
        Parameter:
            - f : A plain text file containing words and their occurrences.
                  Can be a file-like object, or the path of the dictionary file,
                  whose encoding must be utf-8.

        Structure of dict file:
            word1 freq1 word_type1
@@ -361,24 +365,21 @@ class Tokenizer(object):
        if isinstance(f, string_types):
            f = open(f, 'rb')
        for lineno, ln in enumerate(f, 1):
            try:
                line = ln.strip().decode('utf-8').lstrip('\ufeff')
                if not line:
                    continue
                tup = line.split(" ")
                freq, tag = None, None
                if len(tup) == 2:
                    if tup[1].isdigit():
                        freq = tup[1]
                    else:
                        tag = tup[1]
                elif len(tup) > 2:
                    freq, tag = tup[1], tup[2]
                self.add_word(tup[0], freq, tag)
            except Exception:
                raise ValueError(
                    'invalid dictionary entry in %s at Line %s: %s' % (
                        f.name, lineno, line))
            line = ln.strip()
            if not isinstance(line, text_type):
                try:
                    line = line.decode('utf-8').lstrip('\ufeff')
                except UnicodeDecodeError:
                    raise ValueError('dictionary file %s must be utf-8' % f.name)
            if not line:
                continue
            # match won't be None because there's at least one character
            word, freq, tag = re_userdict.match(line).groups()
            if freq is not None:
                freq = freq.strip()
            if tag is not None:
                tag = tag.strip()
            self.add_word(word, freq, tag)

    def add_word(self, word, freq=None, tag=None):
        """

@ -7,3 +7,4 @@ easy_install 3 eng
八一双鹿 3 nz
台中
凱特琳 nz
Edu Trust认证 2000
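The new `Edu Trust认证 2000` test entry has a space inside the word itself, which is exactly the case the regex-based parser handles and a plain `split(" ")` does not (a small demonstration reusing the pattern from this commit):

```python
import re

# The pattern introduced by this commit.
re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)

line = 'Edu Trust认证 2000'

# Plain splitting breaks the word at the embedded space:
parts = line.split(' ')   # ['Edu', 'Trust认证', '2000']

# The lazy regex keeps the space inside the word and peels the
# trailing frequency off the end (with its leading space):
word, freq, tag = re_userdict.match(line).groups()
```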
