Merge pull request #237 from gumblex/master

Store prefixes directly in the word frequency dictionary
Sun Junyi 10 years ago
commit b14eb329e3
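The core of the change: jieba previously kept a separate set, pfdict, holding every prefix of every dictionary word (used by the DAG scan), alongside the frequency dict FREQ. This commit folds the two together: prefixes are stored in FREQ itself with a frequency of 0, so a single dict answers both questions: membership means "viable prefix", a nonzero value means "actual word". A minimal sketch of the idea (build_freq and the toy entries are invented for illustration, not jieba's loader):

    def build_freq(words):
        freq = {}
        for word, count in words:
            freq[word] = count
            for i in range(1, len(word)):     # every proper prefix of the word
                freq.setdefault(word[:i], 0)  # 0 marks "prefix only, not a word"
        return freq

    FREQ = build_freq([(u'北京', 100), (u'北京大学', 50)])
    assert u'北' in FREQ and not FREQ[u'北']  # prefix only
    assert FREQ[u'北京'] == 100               # real word (and also a prefix)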

@@ -18,8 +18,7 @@ from . import finalseg

 DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
-pfdict = None # to be initialized
-FREQ = {}
+FREQ = {} # to be initialized
 total = 0
 user_word_tag_tab = {}
 initialized = False
@@ -41,7 +40,6 @@ def setLogLevel(log_level):

 def gen_pfdict(f_name):
     lfreq = {}
-    pfdict = set()
     ltotal = 0
     with open(f_name, 'rb') as f:
         lineno = 0
@@ -53,15 +51,17 @@ def gen_pfdict(f_name):
                 lfreq[word] = freq
                 ltotal += freq
                 for ch in xrange(len(word)):
-                    pfdict.add(word[:ch + 1])
+                    wfrag = word[:ch + 1]
+                    if wfrag not in lfreq:
+                        lfreq[wfrag] = 0
             except ValueError as e:
                 logger.debug('%s at line %s %s' % (f_name, lineno, line))
                 raise e
-    return pfdict, lfreq, ltotal
+    return lfreq, ltotal


 def initialize(dictionary=None):
-    global pfdict, FREQ, total, initialized, DICTIONARY, DICT_LOCK
+    global FREQ, total, initialized, DICTIONARY, DICT_LOCK
     if not dictionary:
         dictionary = DICTIONARY
     with DICT_LOCK:
@@ -83,19 +83,18 @@ def initialize(dictionary=None):
             logger.debug("Loading model from cache %s" % cache_file)
             try:
                 with open(cache_file, 'rb') as cf:
-                    pfdict, FREQ, total = marshal.load(cf)
-                # prevent conflict with old version
-                load_from_cache_fail = not isinstance(pfdict, set)
+                    FREQ, total = marshal.load(cf)
+                load_from_cache_fail = False
             except Exception:
                 load_from_cache_fail = True
         if load_from_cache_fail:
-            pfdict, FREQ, total = gen_pfdict(abs_path)
+            FREQ, total = gen_pfdict(abs_path)
             logger.debug("Dumping model to file cache %s" % cache_file)
             try:
                 fd, fpath = tempfile.mkstemp()
                 with os.fdopen(fd, 'wb') as temp_cache_file:
-                    marshal.dump((pfdict, FREQ, total), temp_cache_file)
+                    marshal.dump((FREQ, total), temp_cache_file)
                 if os.name == 'nt':
                     from shutil import move as replace_file
                 else:
                     replace_file = os.rename
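A note on the cache change (my reading of the hunk above, not stated in the diff): the cache payload shrinks from a 3-tuple to a 2-tuple, which is what makes the old isinstance(pfdict, set) version check unnecessary. A cache written by the old version now fails to unpack inside the try block and lands in the same except path, so the model is simply rebuilt. A hedged illustration with marshal:

    import marshal

    old_cache = marshal.dumps((set(), {}, 0))   # old layout: (pfdict, FREQ, total)
    try:
        FREQ, total = marshal.loads(old_cache)  # new code expects (FREQ, total)
    except ValueError:                          # too many values to unpack
        load_from_cache_fail = True             # initialize() rebuilds via gen_pfdict()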
@@ -140,23 +139,24 @@ def __cut_all(sentence):

 def calc(sentence, DAG, route):
     N = len(sentence)
-    route[N] = (0.0, '')
+    route[N] = (0, 0)
+    logtotal = log(total)
     for idx in xrange(N - 1, -1, -1):
-        route[idx] = max((log(FREQ.get(sentence[idx:x + 1], 1)) -
-                          log(total) + route[x + 1][0], x) for x in DAG[idx])
+        route[idx] = max((log(FREQ.get(sentence[idx:x + 1]) or 1) -
+                          logtotal + route[x + 1][0], x) for x in DAG[idx])


 @require_initialized
 def get_DAG(sentence):
-    global pfdict, FREQ
+    global FREQ
     DAG = {}
     N = len(sentence)
     for k in xrange(N):
         tmplist = []
         i = k
         frag = sentence[k]
-        while i < N and frag in pfdict:
-            if frag in FREQ:
+        while i < N and frag in FREQ:
+            if FREQ[frag]:
                 tmplist.append(i)
             i += 1
             frag = sentence[k:i + 1]
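Taken together, the calc and get_DAG changes above rely on the same convention. A toy walk-through (the FREQ entries below are invented; real ones come from dict.txt): membership in FREQ keeps the prefix scan alive, a nonzero value marks a word end, and FREQ.get(w) or 1 gives zero-frequency prefixes the same smoothing as unseen words, so log(0) cannot occur:

    from math import log

    FREQ = {u'北': 0, u'北京': 100, u'北京大': 0, u'北京大学': 50}
    total = 150
    sentence = u'北京大学'

    DAG = {}
    for k in range(len(sentence)):
        tmplist, i = [], k
        frag = sentence[k]
        while i < len(sentence) and frag in FREQ:  # prefix still viable
            if FREQ[frag]:                         # nonzero: a word ends at i
                tmplist.append(i)
            i += 1
            frag = sentence[k:i + 1]
        DAG[k] = tmplist or [k]

    assert DAG[0] == [1, 3]                      # u'北京' and u'北京大学' start at 0
    assert log(FREQ.get(u'北京大') or 1) == 0.0  # prefix scored like an unseen word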
@@ -165,7 +165,7 @@ def get_DAG(sentence):
         DAG[k] = tmplist
     return DAG

-re_eng = re.compile(r'[a-zA-Z0-9]', re.U)
+re_eng = re.compile('[a-zA-Z0-9]', re.U)


 def __cut_DAG_NO_HMM(sentence):
@@ -210,7 +210,7 @@ def __cut_DAG(sentence):
                     yield buf
                     buf = ''
                 else:
-                    if buf not in FREQ:
+                    if not FREQ.get(buf):
                         recognized = finalseg.cut(buf)
                         for t in recognized:
                             yield t
@@ -224,7 +224,7 @@ def __cut_DAG(sentence):
     if buf:
         if len(buf) == 1:
             yield buf
-        elif buf not in FREQ:
+        elif not FREQ.get(buf):
             recognized = finalseg.cut(buf)
             for t in recognized:
                 yield t
@@ -288,12 +288,12 @@ def cut_for_search(sentence, HMM=True):
         if len(w) > 2:
             for i in xrange(len(w) - 1):
                 gram2 = w[i:i + 2]
-                if gram2 in FREQ:
+                if FREQ.get(gram2):
                     yield gram2
         if len(w) > 3:
             for i in xrange(len(w) - 2):
                 gram3 = w[i:i + 3]
-                if gram3 in FREQ:
+                if FREQ.get(gram3):
                     yield gram3
         yield w
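The in FREQ → FREQ.get(...) rewrites in this hunk (and in __cut_DAG and tokenize) follow directly from the merged dict: membership is now also true for prefix-only entries, and those must not be emitted as words. A tiny check with invented entries:

    FREQ = {u'北京': 100, u'北京大': 0}  # invented: real word vs. prefix-only

    assert u'北京大' in FREQ         # stored, because it is a prefix...
    assert not FREQ.get(u'北京大')   # ...but freq 0 says it is not a word
    assert FREQ.get(u'北京')         # genuine words stay truthy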
@@ -324,14 +324,16 @@ def load_userdict(f):

 @require_initialized
 def add_word(word, freq, tag=None):
-    global FREQ, pfdict, total, user_word_tag_tab
+    global FREQ, total, user_word_tag_tab
     freq = int(freq)
     FREQ[word] = freq
     total += freq
     if tag is not None:
         user_word_tag_tab[word] = tag
     for ch in xrange(len(word)):
-        pfdict.add(word[:ch + 1])
+        wfrag = word[:ch + 1]
+        if wfrag not in FREQ:
+            FREQ[wfrag] = 0


 __ref_cut = cut
 __ref_cut_for_search = cut_for_search
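Because prefixes now live in FREQ, add_word has to seed the prefixes of a new word as well; otherwise the while ... frag in FREQ scan in get_DAG would stop before ever reaching it. A hedged usage sketch (the word is invented and assumed absent from dict.txt):

    import jieba

    jieba.add_word(u'云计算平台', 100)
    # FREQ now maps u'云计算平台' -> 100, and any previously missing
    # prefix of it (e.g. u'云计算平') -> 0, keeping the DAG scan alive.
    print('/'.join(jieba.cut(u'这是一个云计算平台')))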
@@ -430,12 +432,12 @@ def tokenize(unicode_sentence, mode="default", HMM=True):
         if len(w) > 2:
             for i in xrange(len(w) - 1):
                 gram2 = w[i:i + 2]
-                if gram2 in FREQ:
+                if FREQ.get(gram2):
                     yield (gram2, start + i, start + i + 2)
         if len(w) > 3:
             for i in xrange(len(w) - 2):
                 gram3 = w[i:i + 3]
-                if gram3 in FREQ:
+                if FREQ.get(gram3):
                     yield (gram3, start + i, start + i + 3)
         yield (w, start, start + width)
         start += width

@@ -95,4 +95,4 @@ if __name__ == "__main__":
     cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
     cuttest('AT&T是一件不错的公司给你发offer了吗')
     cuttest('C++和c#是什么关系11+122=133是吗PI=3.14159')
-    cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
\ No newline at end of file
+    cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')

@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../") sys.path.append("../")
import jieba import jieba
@@ -94,4 +95,4 @@ if __name__ == "__main__":
     cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
     cuttest('AT&T是一件不错的公司给你发offer了吗')
     cuttest('C++和c#是什么关系11+122=133是吗PI=3.14159')
-    cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
\ No newline at end of file
+    cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')

@@ -1,4 +1,4 @@
-import sys,time
+import time
 import sys
 sys.path.append("../")
 import jieba

@@ -1,4 +1,5 @@
 #encoding=utf-8
+from __future__ import print_function
 import sys
 sys.path.append("../")
 import jieba.posseg as pseg
@@ -95,4 +96,4 @@ if __name__ == "__main__":
     cuttest('AT&T是一件不错的公司给你发offer了吗')
     cuttest('C++和c#是什么关系11+122=133是吗PI=3.14159')
     cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
-    cuttest('枪杆子中出政权')
\ No newline at end of file
+    cuttest('枪杆子中出政权')

@@ -1,4 +1,5 @@
 #encoding=utf-8
+from __future__ import print_function
 import sys
 sys.path.append("../")
 import jieba
