Add initialize function and lazy initialization

12 years ago · c6098a8657
parent 87c2799692
commit c6098a8657
4 changed files with 718159 additions and 718118 deletions
--- a/README.md
+++ b/README.md
@ -229,6 +229,17 @@ Code sample (keyword extraction)

 	https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py

+Using Other Dictionaries
+========
+It is possible to supply Jieba with your own custom dictionary, and there are also two dictionaries readily available for download:
+
+1. You can employ a smaller dictionary to use less memory:
+https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
+
+2. There is also a bigger file that has better support for traditional characters (繁體):
+https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
+
+In either case, download the file you want first, and then call `jieba.load_userdict('dict.txt.small')` or just replace the existing `dict.txt`.

 Segmentation speed
 =========
--- a/jieba/init.py
+++ b/jieba/init.py
@ -9,9 +9,13 @@ import marshal
 from math import log
 import random

+DICTIONARY = "dict.txt"
+
+trie = None # to be initialized
 FREQ = {}
+min_freq = 0.0
 total =0.0
-
+initialized = False

 def gen_trie(f_name):
 	lfreq = {}
@ -31,7 +35,8 @@ def gen_trie(f_name):
 		p['']='' #ending flag
 	return trie, lfreq,ltotal

-
+def initialize(dictionary=DICTIONARY):
+	global trie, FREQ, total, min_freq, initialized
 	_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )

 	print >> sys.stderr, "Building Trie..."
@ -47,7 +52,7 @@ if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(
 			load_from_cache_fail = True

 	if load_from_cache_fail:
-	trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
+		trie,FREQ,total = gen_trie(os.path.join(_curpath, dictionary))
 		FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
 		min_freq = min(FREQ.itervalues())
 		print >> sys.stderr, "dumping model to file cache"
@ -60,10 +65,24 @@ if load_from_cache_fail:
 			replace_file = os.rename
 		replace_file(cache_file+tmp_suffix,cache_file)

+	initialized = True
+
 	print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
 	print >> sys.stderr, "Trie has been built succesfully."


+def require_initialized(fn):
+	"""Decorator: make sure the dictionary is loaded before `fn` runs.
+
+	On each call the wrapper checks the module-level `initialized` flag.
+	If it is still false, `initialize()` is invoked once with the current
+	value of the module-level `DICTIONARY`, and then `fn` is called with
+	the original arguments. The return value of `fn` is passed through.
+	"""
+	# Local import: the file's import block lies outside this hunk.
+	import functools
+
+	@functools.wraps(fn)  # keep fn's __name__ / __doc__ on the wrapper
+	def wrapped(*args, **kwargs):
+		if not initialized:
+			# Pass DICTIONARY explicitly. The default of
+			# initialize(dictionary=DICTIONARY) was bound when the function
+			# was defined, so a bare initialize() would ignore a path set
+			# later through set_dictionary().
+			initialize(DICTIONARY)
+		return fn(*args, **kwargs)
+	return wrapped
+
+
 def __cut_all(sentence):
 	dag = get_DAG(sentence)
 	old_j = -1
@ -77,6 +96,7 @@ def __cut_all(sentence):
 					yield sentence[k:j+1]
 					old_j = j

+
 def calc(sentence,DAG,idx,route):
 	N = len(sentence)
 	route[N] = (1.0,'')
@ -84,6 +104,8 @@ def calc(sentence,DAG,idx,route):
 		candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
 		route[idx] = max(candidates)

+
+@require_initialized
 def get_DAG(sentence):
 	N = len(sentence)
 	i,j=0,0
@ -111,6 +133,7 @@ def get_DAG(sentence):
 			DAG[i] =[i]
 	return DAG

+
 def __cut_DAG(sentence):
 	DAG = get_DAG(sentence)
 	route ={}
@ -144,7 +167,6 @@ def __cut_DAG(sentence):
 			for t in regognized:
 				yield t

-
 def cut(sentence,cut_all=False):
 	if not ( type(sentence) is unicode):
 		try:
@ -184,6 +206,7 @@ def cut_for_search(sentence):
 					yield gram3
 		yield w

+@require_initialized
 def load_userdict(f):
 	global trie,total,FREQ
 	if isinstance(f, (str, unicode)):
@ -200,3 +223,10 @@ def load_userdict(f):
 				p[c] ={}
 			p = p[c]
 		p['']='' #ending flag
+
+
+def set_dictionary(dictionary_path):
+	"""Select the main dictionary file used to build the trie.
+
+	Stores `dictionary_path` in the module-level `DICTIONARY`. If the module
+	has already been initialized, `initialize()` is re-run immediately with
+	the new path so the change takes effect.
+
+	NOTE(review): when it rebuilds from the dictionary, initialize() joins
+	the path onto the package directory with os.path.join, so a relative
+	path is resolved against the jieba module directory - confirm this is
+	the intended lookup location.
+	"""
+	global DICTIONARY
+	DICTIONARY = dictionary_path
+	if initialized:
+		# Pass the path explicitly: initialize()'s default argument was
+		# evaluated at definition time and still holds the old value, so a
+		# bare initialize() would reload the previous dictionary.
+		initialize(dictionary_path)