Merge pull request #174 from fukuball/master

Allow jieba to switch the idf corpus and the stop words corpus
Sun Junyi 11 years ago
commit 16d626d347

File diff suppressed because it is too large

extra_dict/stop_words.txt
@@ -0,0 +1,51 @@
+the
+of
+is
+and
+to
+in
+that
+we
+for
+an
+are
+by
+be
+as
+on
+with
+can
+if
+from
+which
+you
+it
+this
+then
+at
+have
+all
+not
+one
+has
+or
+that
+一個
+沒有
+我們
+你們
+妳們
+他們
+她們
+是否

jieba/__init__.py
@@ -39,7 +39,7 @@ def gen_trie(f_name):
     trie = {}
     ltotal = 0.0
     with open(f_name, 'rb') as f:
-        lineno = 0
+        lineno = 0
         for line in f.read().rstrip().decode('utf-8').split('\n'):
             lineno += 1
             try:
@@ -134,7 +134,7 @@ def __cut_all(sentence):
     for k,L in dag.iteritems():
         if len(L)==1 and k>old_j:
             yield sentence[k:L[0]+1]
-            old_j = L[0]
+            old_j = L[0]
         else:
             for j in L:
                 if j>k:
@@ -195,7 +195,7 @@ def __cut_DAG_NO_HMM(sentence):
             if len(buf)>0:
                 yield buf
                 buf = u''
-            yield l_word
+            yield l_word
             x =y
     if len(buf)>0:
         yield buf
@@ -227,7 +227,7 @@ def __cut_DAG(sentence):
                     for elem in buf:
                         yield elem
                 buf=u''
-            yield l_word
+            yield l_word
         x =y
     if len(buf)>0:
@@ -243,8 +243,8 @@ def __cut_DAG(sentence):
                 yield elem
 
 def cut(sentence,cut_all=False,HMM=True):
-    '''The main function that segments an entire sentence that contains
-    Chinese characters into seperated words.
+    '''The main function that segments an entire sentence that contains
+    Chinese characters into seperated words.
     Parameter:
         - sentence: The String to be segmented
         - cut_all: Model. True means full pattern, false means accurate pattern.
@@ -257,8 +257,8 @@ def cut(sentence,cut_all=False,HMM=True):
             sentence = sentence.decode('gbk','ignore')
     '''
     \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
-    \r\n|\s : whitespace characters. Will not be Handled.
-    '''
+    \r\n|\s : whitespace characters. Will not be Handled.
+    '''
     re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
     if cut_all:
         re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
@@ -306,7 +306,7 @@ def load_userdict(f):
     ''' Load personalized dict to improve detect rate.
     Parameter:
         - f : A plain text file contains words and their ocurrences.
-    Structure of dict file:
+    Structure of dict file:
     word1 freq1 word_type1
     word2 freq2 word_type2
     ...
@@ -372,7 +372,7 @@ def enable_parallel(processnum=None):
     def pcut(sentence,cut_all=False,HMM=True):
         parts = re.compile('([\r\n]+)').split(sentence)
         if cut_all:
-            result = pool.map(__lcut_all,parts)
+            result = pool.map(__lcut_all,parts)
         else:
             if HMM:
                 result = pool.map(__lcut,parts)
@@ -418,7 +418,7 @@ def tokenize(unicode_sentence,mode="default",HMM=True):
     #mode ("default" or "search")
     if not isinstance(unicode_sentence, unicode):
         raise Exception("jieba: the input parameter should unicode.")
-    start = 0
+    start = 0
     if mode=='default':
         for w in cut(unicode_sentence,HMM=HMM):
             width = len(w)

jieba/analyse/__init__.py
@@ -1,3 +1,4 @@
+#encoding=utf-8
 import jieba
 import os
 try:
@@ -5,27 +6,54 @@ try:
 except ImportError:
     pass
 
-_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
-f_name = os.path.join(_curpath,"idf.txt")
-content = open(f_name,'rb').read().decode('utf-8')
-
-idf_freq = {}
-lines = content.split('\n')
-for line in lines:
-    word,freq = line.split(' ')
-    idf_freq[word] = float(freq)
-
-median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
-stop_words= set([
-"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
+_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+abs_path = os.path.join(_curpath, "idf.txt")
+
+IDF_DICTIONARY = abs_path
+STOP_WORDS = set([
+"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
 ])
 
+def set_idf_path(idf_path):
+    global IDF_DICTIONARY
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    IDF_DICTIONARY = abs_path
+    return
+
+def get_idf(abs_path):
+    content = open(abs_path,'rb').read().decode('utf-8')
+    idf_freq = {}
+    lines = content.split('\n')
+    for line in lines:
+        word,freq = line.split(' ')
+        idf_freq[word] = float(freq)
+    median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
+    return idf_freq, median_idf
+
+def set_stop_words(stop_words_path):
+    global STOP_WORDS
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    content = open(abs_path,'rb').read().decode('utf-8')
+    lines = content.split('\n')
+    for line in lines:
+        STOP_WORDS.add(line)
+    return
+
 def extract_tags(sentence,topK=20):
+    global IDF_DICTIONARY
+    global STOP_WORDS
+
+    idf_freq, median_idf = get_idf(IDF_DICTIONARY)
+
     words = jieba.cut(sentence)
     freq = {}
     for w in words:
         if len(w.strip())<2: continue
-        if w.lower() in stop_words: continue
+        if w.lower() in STOP_WORDS: continue
         freq[w]=freq.get(w,0.0)+1.0
     total = sum(freq.values())
     freq = [(k,v/total) for k,v in freq.iteritems()]
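
This hunk is the substance of the PR: the idf corpus is no longer parsed once at import time into module globals; its path is held in IDF_DICTIONARY, get_idf() re-reads it on every extract_tags() call, and set_idf_path() / set_stop_words() let callers swap corpora at runtime. Past the visible context, extract_tags() weights each normalized term frequency by its idf, falling back to median_idf for words missing from the table, and returns the topK highest-scoring words. A minimal standalone sketch of that scoring step, in the same Python 2 style; the idf_freq/median_idf values below are illustrative stand-ins for what get_idf() loads:

#encoding=utf-8
# Toy stand-ins for the values get_idf() builds from the idf corpus.
idf_freq = {u"corpus": 8.5, u"switch": 6.2}
median_idf = 7.0

# Raw term counts, as extract_tags() accumulates them from jieba.cut(sentence).
freq = {u"corpus": 2.0, u"switch": 1.0, u"novelword": 1.0}
total = sum(freq.values())
freq = [(k, v/total) for k, v in freq.iteritems()]    # normalized tf

# tf * idf, with the median idf as the fallback for unseen words.
tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freq]
tags = [w for _, w in sorted(tf_idf_list, reverse=True)[:2]]    # topK = 2
print ",".join(tags)    # -> corpus,novelword

Because get_idf() runs inside extract_tags(), a corpus switch takes effect on the next call, at the cost of re-reading the idf file every time.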

test/extract_tags_idfpath.py
@@ -0,0 +1,32 @@
+import sys
+sys.path.append('../')
+
+import jieba
+import jieba.analyse
+
+from optparse import OptionParser
+
+USAGE = "usage: python extract_tags_idfpath.py [file name] -k [top k]"
+
+parser = OptionParser(USAGE)
+parser.add_option("-k", dest="topK")
+opt, args = parser.parse_args()
+
+if len(args) < 1:
+    print USAGE
+    sys.exit(1)
+
+file_name = args[0]
+
+if opt.topK is None:
+    topK = 10
+else:
+    topK = int(opt.topK)
+
+content = open(file_name, 'rb').read()
+
+jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
+
+tags = jieba.analyse.extract_tags(content, topK=topK)
+
+print ",".join(tags)

test/extract_tags_stop_words.py
@@ -0,0 +1,33 @@
+import sys
+sys.path.append('../')
+
+import jieba
+import jieba.analyse
+
+from optparse import OptionParser
+
+USAGE = "usage: python extract_tags_stop_words.py [file name] -k [top k]"
+
+parser = OptionParser(USAGE)
+parser.add_option("-k", dest="topK")
+opt, args = parser.parse_args()
+
+if len(args) < 1:
+    print USAGE
+    sys.exit(1)
+
+file_name = args[0]
+
+if opt.topK is None:
+    topK = 10
+else:
+    topK = int(opt.topK)
+
+content = open(file_name, 'rb').read()
+
+jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
+jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
+
+tags = jieba.analyse.extract_tags(content, topK=topK)
+
+print ",".join(tags)