Merge pull request #237 from gumblex/master

直接将前缀储存在词频字典里
pull/238/head
Sun Junyi 10 years ago
commit b14eb329e3

@ -18,8 +18,7 @@ from . import finalseg
DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
pfdict = None # to be initialized
FREQ = {}
FREQ = {} # to be initialized
total = 0
user_word_tag_tab = {}
initialized = False
@ -41,7 +40,6 @@ def setLogLevel(log_level):
def gen_pfdict(f_name):
lfreq = {}
pfdict = set()
ltotal = 0
with open(f_name, 'rb') as f:
lineno = 0
@ -53,15 +51,17 @@ def gen_pfdict(f_name):
lfreq[word] = freq
ltotal += freq
for ch in xrange(len(word)):
pfdict.add(word[:ch + 1])
wfrag = word[:ch + 1]
if wfrag not in lfreq:
lfreq[wfrag] = 0
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
raise e
return pfdict, lfreq, ltotal
return lfreq, ltotal
def initialize(dictionary=None):
global pfdict, FREQ, total, initialized, DICTIONARY, DICT_LOCK
global FREQ, total, initialized, DICTIONARY, DICT_LOCK
if not dictionary:
dictionary = DICTIONARY
with DICT_LOCK:
@ -83,19 +83,18 @@ def initialize(dictionary=None):
logger.debug("Loading model from cache %s" % cache_file)
try:
with open(cache_file, 'rb') as cf:
pfdict, FREQ, total = marshal.load(cf)
# prevent conflict with old version
load_from_cache_fail = not isinstance(pfdict, set)
FREQ, total = marshal.load(cf)
load_from_cache_fail = False
except Exception:
load_from_cache_fail = True
if load_from_cache_fail:
pfdict, FREQ, total = gen_pfdict(abs_path)
FREQ, total = gen_pfdict(abs_path)
logger.debug("Dumping model to file cache %s" % cache_file)
try:
fd, fpath = tempfile.mkstemp()
with os.fdopen(fd, 'wb') as temp_cache_file:
marshal.dump((pfdict, FREQ, total), temp_cache_file)
marshal.dump((FREQ, total), temp_cache_file)
if os.name == 'nt':
from shutil import move as replace_file
else:
@ -140,23 +139,24 @@ def __cut_all(sentence):
def calc(sentence, DAG, route):
N = len(sentence)
route[N] = (0.0, '')
route[N] = (0, 0)
logtotal = log(total)
for idx in xrange(N - 1, -1, -1):
route[idx] = max((log(FREQ.get(sentence[idx:x + 1], 1)) -
log(total) + route[x + 1][0], x) for x in DAG[idx])
route[idx] = max((log(FREQ.get(sentence[idx:x + 1]) or 1) -
logtotal + route[x + 1][0], x) for x in DAG[idx])
@require_initialized
def get_DAG(sentence):
global pfdict, FREQ
global FREQ
DAG = {}
N = len(sentence)
for k in xrange(N):
tmplist = []
i = k
frag = sentence[k]
while i < N and frag in pfdict:
if frag in FREQ:
while i < N and frag in FREQ:
if FREQ[frag]:
tmplist.append(i)
i += 1
frag = sentence[k:i + 1]
@ -165,7 +165,7 @@ def get_DAG(sentence):
DAG[k] = tmplist
return DAG
re_eng = re.compile(r'[a-zA-Z0-9]', re.U)
re_eng = re.compile('[a-zA-Z0-9]', re.U)
def __cut_DAG_NO_HMM(sentence):
@ -210,7 +210,7 @@ def __cut_DAG(sentence):
yield buf
buf = ''
else:
if buf not in FREQ:
if not FREQ.get(buf):
recognized = finalseg.cut(buf)
for t in recognized:
yield t
@ -224,7 +224,7 @@ def __cut_DAG(sentence):
if buf:
if len(buf) == 1:
yield buf
elif buf not in FREQ:
elif not FREQ.get(buf):
recognized = finalseg.cut(buf)
for t in recognized:
yield t
@ -288,12 +288,12 @@ def cut_for_search(sentence, HMM=True):
if len(w) > 2:
for i in xrange(len(w) - 1):
gram2 = w[i:i + 2]
if gram2 in FREQ:
if FREQ.get(gram2):
yield gram2
if len(w) > 3:
for i in xrange(len(w) - 2):
gram3 = w[i:i + 3]
if gram3 in FREQ:
if FREQ.get(gram3):
yield gram3
yield w
@ -324,14 +324,16 @@ def load_userdict(f):
@require_initialized
def add_word(word, freq, tag=None):
global FREQ, pfdict, total, user_word_tag_tab
global FREQ, total, user_word_tag_tab
freq = int(freq)
FREQ[word] = freq
total += freq
if tag is not None:
user_word_tag_tab[word] = tag
for ch in xrange(len(word)):
pfdict.add(word[:ch + 1])
wfrag = word[:ch + 1]
if wfrag not in lfreq:
lfreq[wfrag] = 0
__ref_cut = cut
__ref_cut_for_search = cut_for_search
@ -430,12 +432,12 @@ def tokenize(unicode_sentence, mode="default", HMM=True):
if len(w) > 2:
for i in xrange(len(w) - 1):
gram2 = w[i:i + 2]
if gram2 in FREQ:
if FREQ.get(gram2):
yield (gram2, start + i, start + i + 2)
if len(w) > 3:
for i in xrange(len(w) - 2):
gram3 = w[i:i + 3]
if gram3 in FREQ:
if FREQ.get(gram3):
yield (gram3, start + i, start + i + 3)
yield (w, start, start + width)
start += width

@ -95,4 +95,4 @@ if __name__ == "__main__":
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
cuttest('AT&T是一件不错的公司给你发offer了吗')
cuttest('C++和c#是什么关系11+122=133是吗PI=3.14159')
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../")
import jieba
@ -94,4 +95,4 @@ if __name__ == "__main__":
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
cuttest('AT&T是一件不错的公司给你发offer了吗')
cuttest('C++和c#是什么关系11+122=133是吗PI=3.14159')
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')

@ -1,4 +1,4 @@
import sys,time
import time
import sys
sys.path.append("../")
import jieba

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../")
import jieba.posseg as pseg
@ -95,4 +96,4 @@ if __name__ == "__main__":
cuttest('AT&T是一件不错的公司给你发offer了吗')
cuttest('C++和c#是什么关系11+122=133是吗PI=3.14159')
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
cuttest('枪杆子中出政权')
cuttest('枪杆子中出政权')

@ -1,4 +1,5 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../")
import jieba

Loading…
Cancel
Save