Merge pull request #174 from fukuball/master

Allow jieba to switch the idf corpus and the stop words corpus
Sun Junyi 11 years ago
commit 16d626d347
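
For orientation, here is a minimal usage sketch of the corpus-switching API this pull request introduces. The entry points (jieba.analyse.set_idf_path, jieba.analyse.set_stop_words, jieba.analyse.extract_tags) and the extra_dict file names are taken from the diff below; the working directory and the sample sentence are assumptions.

# -*- coding: utf-8 -*-
# Minimal sketch (Python 2, matching the codebase) of the API added here.
# Assumes the script is run from the repository root so the bundled
# extra_dict files resolve; the sample sentence is illustrative only.
import jieba.analyse

# Switch keyword extraction to the larger bundled IDF corpus ...
jieba.analyse.set_idf_path("extra_dict/idf.txt.big")
# ... and extend the stop word set from a plain text file, one word per line.
jieba.analyse.set_stop_words("extra_dict/stop_words.txt")

tags = jieba.analyse.extract_tags(u"讓 jieba 可以切換 idf 語料庫及 stop words 語料庫", topK=5)
print ",".join(tags)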

File diff suppressed because it is too large

@@ -0,0 +1,51 @@
the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that
一個
沒有
我們
你們
妳們
他們
她們
是否

@@ -39,7 +39,7 @@ def gen_trie(f_name):
    trie = {}
    ltotal = 0.0
    with open(f_name, 'rb') as f:
        lineno = 0
        for line in f.read().rstrip().decode('utf-8').split('\n'):
            lineno += 1
            try:
@@ -134,7 +134,7 @@ def __cut_all(sentence):
    for k,L in dag.iteritems():
        if len(L)==1 and k>old_j:
            yield sentence[k:L[0]+1]
            old_j = L[0]
        else:
            for j in L:
                if j>k:
@@ -195,7 +195,7 @@ def __cut_DAG_NO_HMM(sentence):
            if len(buf)>0:
                yield buf
                buf = u''
            yield l_word
            x =y
    if len(buf)>0:
        yield buf
@@ -227,7 +227,7 @@ def __cut_DAG(sentence):
                    for elem in buf:
                        yield elem
                buf=u''
            yield l_word
            x =y
    if len(buf)>0:
@@ -243,8 +243,8 @@ def __cut_DAG(sentence):
            yield elem

def cut(sentence,cut_all=False,HMM=True):
    '''The main function that segments an entire sentence that contains
    Chinese characters into seperated words.
    Parameter:
        - sentence: The String to be segmented
        - cut_all: Model. True means full pattern, false means accurate pattern.
@@ -257,8 +257,8 @@ def cut(sentence,cut_all=False,HMM=True):
            sentence = sentence.decode('gbk','ignore')
    '''
    \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
    \r\n|\s : whitespace characters. Will not be Handled.
    '''
    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
    if cut_all:
        re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
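
As an aside, the cut() docstring above already describes both segmentation modes; a short sketch of calling them (the sample sentence is an assumption, not taken from this diff):

# -*- coding: utf-8 -*-
# Sketch of the two modes described in cut()'s docstring.
# The sample sentence is illustrative only.
import jieba

sentence = u"我来到北京清华大学"
print "/".join(jieba.cut(sentence, cut_all=False))   # accurate pattern (default)
print "/".join(jieba.cut(sentence, cut_all=True))    # full pattern
print "/".join(jieba.cut(sentence, HMM=False))       # accurate pattern, HMM disabled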
@@ -306,7 +306,7 @@ def load_userdict(f):
    ''' Load personalized dict to improve detect rate.
    Parameter:
        - f : A plain text file contains words and their ocurrences.
    Structure of dict file:
    word1 freq1 word_type1
    word2 freq2 word_type2
    ...
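
Similarly, the load_userdict() docstring above spells out the dictionary format; a small sketch of building and loading such a file (the file name and entries are assumptions):

# -*- coding: utf-8 -*-
# Sketch of the "word freq word_type" user dictionary format described
# in load_userdict()'s docstring. File name and entries are hypothetical.
import jieba

entries = [u"雲端運算 5 n", u"自訂詞 3 n"]
open("userdict.txt", "wb").write(u"\n".join(entries).encode("utf-8"))

jieba.load_userdict("userdict.txt")
print "/".join(jieba.cut(u"雲端運算是一個自訂詞測試"))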
@@ -372,7 +372,7 @@ def enable_parallel(processnum=None):
    def pcut(sentence,cut_all=False,HMM=True):
        parts = re.compile('([\r\n]+)').split(sentence)
        if cut_all:
            result = pool.map(__lcut_all,parts)
        else:
            if HMM:
                result = pool.map(__lcut,parts)
@@ -418,7 +418,7 @@ def tokenize(unicode_sentence,mode="default",HMM=True):
    #mode ("default" or "search")
    if not isinstance(unicode_sentence, unicode):
        raise Exception("jieba: the input parameter should unicode.")
    start = 0
    if mode=='default':
        for w in cut(unicode_sentence,HMM=HMM):
            width = len(w)

@@ -1,3 +1,4 @@
+#encoding=utf-8
import jieba
import os
try:
@@ -5,27 +6,54 @@ try:
except ImportError:
    pass
-_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
-f_name = os.path.join(_curpath,"idf.txt")
-content = open(f_name,'rb').read().decode('utf-8')
-idf_freq = {}
-lines = content.split('\n')
-for line in lines:
-    word,freq = line.split(' ')
-    idf_freq[word] = float(freq)
-median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
-stop_words= set([
-"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
+_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+abs_path = os.path.join(_curpath, "idf.txt")
+
+IDF_DICTIONARY = abs_path
+STOP_WORDS = set([
+"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
])
+
+def set_idf_path(idf_path):
+    global IDF_DICTIONARY
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    IDF_DICTIONARY = abs_path
+    return
+
+def get_idf(abs_path):
+    content = open(abs_path,'rb').read().decode('utf-8')
+    idf_freq = {}
+    lines = content.split('\n')
+    for line in lines:
+        word,freq = line.split(' ')
+        idf_freq[word] = float(freq)
+    median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
+    return idf_freq, median_idf
+
+def set_stop_words(stop_words_path):
+    global STOP_WORDS
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    content = open(abs_path,'rb').read().decode('utf-8')
+    lines = content.split('\n')
+    for line in lines:
+        STOP_WORDS.add(line)
+    return
+
def extract_tags(sentence,topK=20):
+    global IDF_DICTIONARY
+    global STOP_WORDS
+    idf_freq, median_idf = get_idf(IDF_DICTIONARY)
    words = jieba.cut(sentence)
    freq = {}
    for w in words:
        if len(w.strip())<2: continue
-        if w.lower() in stop_words: continue
+        if w.lower() in STOP_WORDS: continue
        freq[w]=freq.get(w,0.0)+1.0
    total = sum(freq.values())
    freq = [(k,v/total) for k,v in freq.iteritems()]
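
The on-disk formats the new loaders expect are deliberately simple: get_idf() parses one space-separated "word idf_value" pair per line, and set_stop_words() adds every line of the file to STOP_WORDS. A sketch of producing custom corpora in these formats (the file names, words, and idf values below are assumptions, shown only to illustrate the formats the parsing code assumes):

# -*- coding: utf-8 -*-
# Sketch of the corpus formats the loaders above assume.
# File names, words, and idf values are hypothetical.

# Custom IDF corpus: "word idf_value", one pair per line, space separated,
# matching get_idf()'s line.split(' ') / float(freq) parsing.
idf_lines = [u"語料庫 9.1", u"關鍵詞 7.4", u"斷詞 6.8"]
open("my_idf.txt", "wb").write(u"\n".join(idf_lines).encode("utf-8"))

# Custom stop word list: one word per line; set_stop_words() adds every
# line to the module-level STOP_WORDS set.
stop_lines = [u"我們", u"沒有", u"是否"]
open("my_stop_words.txt", "wb").write(u"\n".join(stop_lines).encode("utf-8"))

import jieba.analyse
jieba.analyse.set_idf_path("my_idf.txt")
jieba.analyse.set_stop_words("my_stop_words.txt")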

@@ -0,0 +1,32 @@
import sys
sys.path.append('../')
import jieba
import jieba.analyse
from optparse import OptionParser
USAGE = "usage: python extract_tags_idfpath.py [file name] -k [top k]"
parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()
if len(args) < 1:
    print USAGE
    sys.exit(1)
file_name = args[0]
if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)
content = open(file_name, 'rb').read()
jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags)

@@ -0,0 +1,33 @@
import sys
sys.path.append('../')
import jieba
import jieba.analyse
from optparse import OptionParser
USAGE = "usage: python extract_tags_stop_words.py [file name] -k [top k]"
parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()
if len(args) < 1:
    print USAGE
    sys.exit(1)
file_name = args[0]
if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)
content = open(file_name, 'rb').read()
jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags)