@@ -23,313 +23,313 @@ user_word_tag_tab={}

initialized = False

def gen_trie(f_name):
    lfreq = {}
    trie = {}
    ltotal = 0.0
    with open(f_name, 'rb') as f:
        lineno = 0
        for line in f.read().rstrip().decode('utf-8').split('\n'):
            lineno += 1
            try:
                word, freq, _ = line.split(' ')
                freq = float(freq)
                lfreq[word] = freq
                ltotal += freq
                p = trie
                for c in word:
                    if c not in p:
                        p[c] = {}
                    p = p[c]
                p[''] = ''  # ending flag
            except ValueError as e:
                print(f_name, ' at line', lineno, line, file=sys.stderr)
                raise e
    return trie, lfreq, ltotal

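# Illustration: each dictionary line "word freq tag" adds one nested dict per
# character, with the empty-string key marking a word end. A single line
# "AB 3 x" yields:
#     trie   == {'A': {'B': {'': ''}}}
#     lfreq  == {'AB': 3.0}
#     ltotal == 3.0
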
def initialize(*args):
    global trie, FREQ, total, min_freq, initialized
    if len(args) == 0:
        dictionary = DICTIONARY
    else:
        dictionary = args[0]
    with DICT_LOCK:
        if initialized:
            return
        if trie:
            del trie
            trie = None
        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

        abs_path = os.path.join(_curpath, dictionary)
        print("Building Trie..., from " + abs_path, file=sys.stderr)
        t1 = time.time()
        if abs_path == os.path.join(_curpath, "dict.txt"):  # default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else:  # custom dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user." + str(hash(abs_path)) + ".cache")

        load_from_cache_fail = True
        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
            print("loading model from cache " + cache_file, file=sys.stderr)
            try:
                trie, FREQ, total, min_freq = marshal.load(open(cache_file, 'rb'))
                load_from_cache_fail = False
            except:
                load_from_cache_fail = True

        if load_from_cache_fail:
            trie, FREQ, total = gen_trie(abs_path)
            FREQ = dict([(k, log(float(v) / total)) for k, v in FREQ.items()])  # normalize to log probabilities
            min_freq = min(FREQ.values())
            print("dumping model to file cache " + cache_file, file=sys.stderr)
            tmp_suffix = "." + str(random.random())
            with open(cache_file + tmp_suffix, 'wb') as temp_cache_file:
                marshal.dump((trie, FREQ, total, min_freq), temp_cache_file)
            if os.name == 'nt':
                import shutil
                replace_file = shutil.move
            else:
                replace_file = os.rename
            replace_file(cache_file + tmp_suffix, cache_file)

        initialized = True

        print("loading model cost ", time.time() - t1, "seconds.", file=sys.stderr)
        print("Trie has been built successfully.", file=sys.stderr)

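# Note: the cache is written under a temporary name and then renamed into
# place (shutil.move on Windows, where os.rename cannot overwrite an existing
# file; os.rename elsewhere), so readers are unlikely to ever see a
# half-written cache file.
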
def require_initialized(fn):
    global initialized, DICTIONARY

    @wraps(fn)
    def wrapped(*args, **kwargs):
        if initialized:
            return fn(*args, **kwargs)
        else:
            initialize(DICTIONARY)
            return fn(*args, **kwargs)
    return wrapped

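# Decorator for lazy loading: the first call to any decorated function
# triggers initialize(DICTIONARY), so importing jieba stays cheap until
# segmentation is actually requested.
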
def __cut_all(sentence):
    dag = get_DAG(sentence)
    old_j = -1
    for k, L in dag.items():
        if len(L) == 1 and k > old_j:
            yield sentence[k:L[0] + 1]
            old_j = L[0]
        else:
            for j in L:
                if j > k:
                    yield sentence[k:j + 1]
                    old_j = j

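# Full mode: emits every dictionary word the DAG admits, overlapping words
# included; a single character is emitted on its own only when it is not
# already covered by a longer word just yielded.
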
def calc(sentence, DAG, idx, route):
    N = len(sentence)
    route[N] = (0.0, '')
    for idx in range(N - 1, -1, -1):
        candidates = [(FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x) for x in DAG[idx]]
        route[idx] = max(candidates)

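# calc() is a dynamic program over the DAG, filled right to left: route[idx]
# holds the best (log probability, word-end index) pair for sentence[idx:],
# i.e. the max over DAG edges of log P(word) + route[end + 1][0]; spans
# missing from FREQ fall back to min_freq.
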
@require_initialized
def get_DAG(sentence):
    N = len(sentence)
    i, j = 0, 0
    p = trie
    DAG = {}
    while i < N:
        c = sentence[j]
        if c in p:
            p = p[c]
            if '' in p:
                if i not in DAG:
                    DAG[i] = []
                DAG[i].append(j)
            j += 1
            if j >= N:
                i += 1
                j = i
                p = trie
        else:
            p = trie
            i += 1
            j = i
    for i in range(len(sentence)):
        if i not in DAG:
            DAG[i] = [i]
    return DAG

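# Illustration: DAG[i] lists every index j such that sentence[i:j+1] is in
# the trie. With a dictionary containing only "AB" and "ABC":
#     get_DAG('ABCD')  ->  {0: [1, 2], 1: [1], 2: [2], 3: [3]}
# (unmatched positions map to themselves so every character stays reachable).
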
def __cut_DAG(sentence):
    DAG = get_DAG(sentence)
    route = {}
    calc(sentence, DAG, 0, route=route)
    x = 0
    buf = ''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield buf
                    buf = ''
                else:
                    if buf not in FREQ:
                        recognized = finalseg.cut(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield elem
                    buf = ''
            yield l_word
        x = y

    if len(buf) > 0:
        if len(buf) == 1:
            yield buf
        else:
            if buf not in FREQ:
                recognized = finalseg.cut(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield elem

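# Runs of single characters (buf) are spans the DAG could not join into
# dictionary words; if the run is not itself in FREQ it is handed to
# finalseg.cut, the HMM-based recognizer for out-of-vocabulary words.
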
def cut(sentence, cut_all=False):
    if isinstance(sentence, bytes):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')

    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")

    if cut_all:
        re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")

    blocks = re_han.split(sentence)
    cut_block = __cut_DAG
    if cut_all:
        cut_block = __cut_all
    for blk in blocks:
        if re_han.match(blk):
            #pprint.pprint(__cut_DAG(blk))
            for word in cut_block(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if re_skip.match(x):
                    yield x
                elif not cut_all:
                    for xx in x:
                        yield xx
                else:
                    yield x

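# Usage sketch: cut() returns a generator, so wrap it for a concrete list:
#     words = list(cut('这是一个句子'))             # accurate mode
#     words_all = list(cut('这是一个句子', True))   # full mode
# Blocks matching re_han go through the DAG cutter; in accurate mode,
# whitespace is yielded as-is and any other character is yielded on its own.
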
def cut_for_search(sentence):
    words = cut(sentence)
    for w in words:
        if len(w) > 2:
            for i in range(len(w) - 1):
                gram2 = w[i:i + 2]
                if gram2 in FREQ:
                    yield gram2
        if len(w) > 3:
            for i in range(len(w) - 2):
                gram3 = w[i:i + 3]
                if gram3 in FREQ:
                    yield gram3
        yield w

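# Search-engine mode: before each word from cut(), this also yields its
# in-vocabulary 2-grams (for words longer than 2 characters) and 3-grams
# (longer than 3), giving an inverted index finer-grained entries to match.
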
@require_initialized
def load_userdict(f):
    global trie, total, FREQ
    if isinstance(f, str):
        f = open(f, 'rb')
    content = f.read().decode('utf-8')
    line_no = 0
    for line in content.split("\n"):
        line_no += 1
        if line.rstrip() == '':
            continue
        tup = line.split(" ")
        word, freq = tup[0], tup[1]
        if line_no == 1:
            word = word.replace('\ufeff', "")  # remove BOM if present
        if len(tup) == 3:
            user_word_tag_tab[word] = tup[2].strip()
        freq = float(freq)
        FREQ[word] = log(freq / total)
        p = trie
        for c in word:
            if c not in p:
                p[c] = {}
            p = p[c]
        p[''] = ''  # ending flag

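# Usage sketch: a user dictionary is plain text, one entry per line in the
# form "word freq [pos_tag]", e.g.
#     云计算 5 n
# load_userdict accepts either a file path or an open binary file object.
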
# keep references to the original implementations so parallel mode can be undone
__ref_cut = cut
__ref_cut_for_search = cut_for_search

def __lcut(sentence):
    return list(__ref_cut(sentence, False))

def __lcut_all(sentence):
    return list(__ref_cut(sentence, True))

def __lcut_for_search(sentence):
    return list(__ref_cut_for_search(sentence))

@require_initialized
def enable_parallel(processnum):
    global pool, cut, cut_for_search
    if os.name == 'nt':
        raise Exception("parallel mode only supports POSIX systems")

    from multiprocessing import Pool
    pool = Pool(processnum)

    def pcut(sentence, cut_all=False):
        parts = re.compile(b'([\r\n]+)').split(sentence)
        if cut_all:
            result = pool.map(__lcut_all, parts)
        else:
            result = pool.map(__lcut, parts)
        for r in result:
            for w in r:
                yield w

    def pcut_for_search(sentence):
        parts = re.compile(b'([\r\n]+)').split(sentence)
        result = pool.map(__lcut_for_search, parts)
        for r in result:
            for w in r:
                yield w

    cut = pcut
    cut_for_search = pcut_for_search

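# Usage sketch: enable_parallel(4) rebinds the module-level cut/cut_for_search
# to pool-backed variants that split the input on b'[\r\n]+' and fan the lines
# out to worker processes; note the bytes pattern means parallel mode expects
# bytes input. disable_parallel() restores the originals.
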
def disable_parallel():
    global pool, cut, cut_for_search
    if 'pool' in globals():
        pool.close()
        pool = None
    cut = __ref_cut
    cut_for_search = __ref_cut_for_search

def set_dictionary(dictionary_path):
    global initialized, DICTIONARY
    with DICT_LOCK:
        abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
        if not os.path.exists(abs_path):
            raise Exception("path does not exist: " + abs_path)
        DICTIONARY = abs_path
        initialized = False

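# Usage sketch: set_dictionary('path/to/dict.txt') only records the path and
# clears the initialized flag; the new dictionary is loaded lazily by the
# next segmentation call via require_initialized.
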
def get_abs_path_dict():
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    abs_path = os.path.join(_curpath, DICTIONARY)
    return abs_path