Merge pull request #275 from gumblex/master

防止跨文件系统创建缓存
pull/292/head
Sun Junyi 10 years ago
commit 8e99a13aa9

@ -102,7 +102,7 @@ print(", ".join(seg_list))
台中 台中
``` ```
* 更改分词器(默认为 `jieba.dt`)的 `tmp_dir` 和 `cache_file` 属性,可指定缓存文件位置,用于受限的文件系统。 * 更改分词器(默认为 `jieba.dt`)的 `tmp_dir` 和 `cache_file` 属性,可分别指定缓存文件所在的文件夹及其文件名,用于受限的文件系统。
* 范例: * 范例:

@ -110,11 +110,14 @@ class Tokenizer(object):
# default dictionary # default dictionary
elif abs_path == DEFAULT_DICT: elif abs_path == DEFAULT_DICT:
cache_file = "jieba.cache" cache_file = "jieba.cache"
else: # custom dictionary # custom dictionary
else:
cache_file = "jieba.u%s.cache" % md5( cache_file = "jieba.u%s.cache" % md5(
abs_path.encode('utf-8', 'replace')).hexdigest() abs_path.encode('utf-8', 'replace')).hexdigest()
cache_file = os.path.join( cache_file = os.path.join(
self.tmp_dir or tempfile.gettempdir(), cache_file) self.tmp_dir or tempfile.gettempdir(), cache_file)
# prevent absolute path in self.cache_file
tmpdir = os.path.dirname(cache_file)
load_from_cache_fail = True load_from_cache_fail = True
if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path): if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
@ -135,7 +138,8 @@ class Tokenizer(object):
default_logger.debug( default_logger.debug(
"Dumping model to file cache %s" % cache_file) "Dumping model to file cache %s" % cache_file)
try: try:
fd, fpath = tempfile.mkstemp() # prevent moving across different filesystems
fd, fpath = tempfile.mkstemp(dir=tmpdir)
with os.fdopen(fd, 'wb') as temp_cache_file: with os.fdopen(fd, 'wb') as temp_cache_file:
marshal.dump( marshal.dump(
(self.FREQ, self.total), temp_cache_file) (self.FREQ, self.total), temp_cache_file)

Loading…
Cancel
Save