Let jieba switch between IDF corpora

1. Add a Traditional Chinese IDF corpus
2. To let jieba switch IDF corpora, add get_idf and set_idf_path methods and rewrite extract_tags
3. Add an extract_tags_idfpath test
pull/174/head
Fukuball Lin 11 years ago
parent 473ac1df75
commit 7198d562f1
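Before the diff, a minimal usage sketch (not part of the commit) for the API it introduces. It assumes the interpreter runs from the repo root so the corpus added under extra_dict/ resolves; the sample sentence is arbitrary.

#encoding=utf-8
# Minimal sketch of the corpus-switching API added in this commit (Python 2).
import jieba
import jieba.analyse

# Point keyword extraction at the new Traditional Chinese IDF corpus
# instead of the default idf.txt bundled with jieba.analyse.
jieba.analyse.set_idf_path("extra_dict/idf.txt.big")

# extract_tags re-reads idf_freq/median_idf from the configured path on
# each call, so the switch takes effect immediately.
tags = jieba.analyse.extract_tags(u"自然語言處理是人工智慧的一個重要領域", topK=5)
print ",".join(tags)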

extra_dict/idf.txt.big — file diff suppressed because it is too large (the new Traditional Chinese IDF corpus)

jieba/__init__.py
@@ -39,7 +39,7 @@ def gen_trie(f_name):
     trie = {}
     ltotal = 0.0
     with open(f_name, 'rb') as f:
         lineno = 0
         for line in f.read().rstrip().decode('utf-8').split('\n'):
             lineno += 1
             try:
@@ -134,7 +134,7 @@ def __cut_all(sentence):
     for k,L in dag.iteritems():
         if len(L)==1 and k>old_j:
             yield sentence[k:L[0]+1]
             old_j = L[0]
         else:
             for j in L:
                 if j>k:
@@ -195,7 +195,7 @@ def __cut_DAG_NO_HMM(sentence):
             if len(buf)>0:
                 yield buf
                 buf = u''
             yield l_word
             x =y
     if len(buf)>0:
         yield buf
@@ -227,7 +227,7 @@ def __cut_DAG(sentence):
                 for elem in buf:
                     yield elem
                 buf=u''
             yield l_word
             x =y
     if len(buf)>0:
@@ -243,8 +243,8 @@ def __cut_DAG(sentence):
             yield elem

 def cut(sentence,cut_all=False,HMM=True):
     '''The main function that segments an entire sentence that contains
     Chinese characters into seperated words.
     Parameter:
         - sentence: The String to be segmented
         - cut_all: Model. True means full pattern, false means accurate pattern.
@@ -257,8 +257,8 @@ def cut(sentence,cut_all=False,HMM=True):
         sentence = sentence.decode('gbk','ignore')
     '''
     \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
     \r\n|\s : whitespace characters. Will not be Handled.
     '''
     re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
     if cut_all:
         re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
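The docstring context above spells out cut()'s two flags; a quick illustration of the three resulting modes (sample sentence arbitrary, Python 2):

#encoding=utf-8
# The three segmentation modes described in the cut() docstring.
import jieba

sentence = u"我来到北京清华大学"
print "/".join(jieba.cut(sentence))                # accurate mode (default)
print "/".join(jieba.cut(sentence, cut_all=True))  # full mode
print "/".join(jieba.cut(sentence, HMM=False))     # accurate mode, HMM disabled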
@@ -306,7 +306,7 @@ def load_userdict(f):
     ''' Load personalized dict to improve detect rate.
     Parameter:
         - f : A plain text file contains words and their ocurrences.
     Structure of dict file:
     word1 freq1 word_type1
     word2 freq2 word_type2
     ...
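The load_userdict docstring above fixes the dict-file layout; a hedged sketch of using it (file name and entries invented for illustration):

#encoding=utf-8
# Feeding load_userdict() a file in the documented
# "word freq word_type" layout; name and entries are illustrative.
import jieba

# userdict.txt contains, one entry per line:
#   雲端運算 5 n
#   自訂詞 3 n
jieba.load_userdict(open("userdict.txt", 'rb'))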
@@ -372,7 +372,7 @@ def enable_parallel(processnum=None):
     def pcut(sentence,cut_all=False,HMM=True):
         parts = re.compile('([\r\n]+)').split(sentence)
         if cut_all:
             result = pool.map(__lcut_all,parts)
         else:
             if HMM:
                 result = pool.map(__lcut,parts)
@@ -418,7 +418,7 @@ def tokenize(unicode_sentence,mode="default",HMM=True):
     #mode ("default" or "search")
     if not isinstance(unicode_sentence, unicode):
         raise Exception("jieba: the input parameter should unicode.")
     start = 0
     if mode=='default':
         for w in cut(unicode_sentence,HMM=HMM):
             width = len(w)

jieba/analyse/__init__.py
@@ -1,3 +1,4 @@
+#encoding=utf-8
 import jieba
 import os
 try:
@@ -5,22 +6,34 @@ try:
 except ImportError:
     pass

-_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
-f_name = os.path.join(_curpath,"idf.txt")
-content = open(f_name,'rb').read().decode('utf-8')
+_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+abs_path = os.path.join(_curpath, "idf.txt")

-idf_freq = {}
-lines = content.split('\n')
-for line in lines:
-    word,freq = line.split(' ')
-    idf_freq[word] = float(freq)
+IDF_DICTIONARY = abs_path

-median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
-stop_words= set([
-    "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
-])
+def set_idf_path(idf_path):
+    global IDF_DICTIONARY
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    IDF_DICTIONARY = abs_path
+    return
+
+def get_idf(abs_path):
+    content = open(abs_path,'rb').read().decode('utf-8')
+    idf_freq = {}
+    lines = content.split('\n')
+    for line in lines:
+        word,freq = line.split(' ')
+        idf_freq[word] = float(freq)
+    median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
+    return idf_freq, median_idf

 def extract_tags(sentence,topK=20):
+    global IDF_DICTIONARY
+    idf_freq, median_idf = get_idf(IDF_DICTIONARY)
+    stop_words= set([
+        "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
+    ])
     words = jieba.cut(sentence)
     freq = {}
     for w in words:
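Two behaviors fall out of the diff above: set_idf_path only validates and records the path, and extract_tags re-parses the corpus through get_idf on every call. A sketch of both (paths illustrative):

#encoding=utf-8
# Behavior implied by the diff above; paths are illustrative.
import jieba.analyse

try:
    # Missing paths fail fast, before any extraction happens.
    jieba.analyse.set_idf_path("no/such/idf.txt")
except Exception as e:
    print e  # jieba: path does not exist:...

# A valid path loads nothing yet: extract_tags calls
# get_idf(IDF_DICTIONARY) itself, re-parsing the file each time.
jieba.analyse.set_idf_path("extra_dict/idf.txt.big")
tags = jieba.analyse.extract_tags(u"這是一段測試文字", topK=5)

Keeping the load inside extract_tags makes set_idf_path trivial and the switch immediate, at the price of re-reading the corpus on every extraction.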

test/extract_tags_idfpath.py
@@ -0,0 +1,32 @@
+import sys
+sys.path.append('../')
+
+import jieba
+import jieba.analyse
+from optparse import OptionParser
+
+USAGE = "usage: python extract_tags_idfpath.py [file name] -k [top k]"
+
+parser = OptionParser(USAGE)
+parser.add_option("-k", dest="topK")
+opt, args = parser.parse_args()
+
+
+if len(args) < 1:
+    print USAGE
+    sys.exit(1)
+
+file_name = args[0]
+
+if opt.topK is None:
+    topK = 10
+else:
+    topK = int(opt.topK)
+
+content = open(file_name, 'rb').read()
+
+jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
+tags = jieba.analyse.extract_tags(content, topK=topK)
+
+print ",".join(tags)