讓 jieba 可以切換 idf 語料庫

1. 新增繁體中文 idf 語料庫 2. 為了讓 jieba 可以切換 idf 語料庫，新增 get_idf, set_idf_path 方法，並改寫 extract_tags 3. test 增加 extract_tags_idfpath
11 years ago · 7198d562f1
parent 473ac1df75
commit 7198d562f1
4 changed files with 176307 additions and 23 deletions
--- a/extra_dict/idf.txt.big
+++ b/extra_dict/idf.txt.big
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -1,3 +1,4 @@
+#encoding=utf-8
 import jieba
 import os
 try:
@@ -6,21 +7,33 @@ except ImportError:
    pass

 _curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
-f_name = os.path.join(_curpath,"idf.txt")
-content = open(f_name,'rb').read().decode('utf-8')
+abs_path = os.path.join(_curpath, "idf.txt")
+IDF_DICTIONARY = abs_path

+def set_idf_path(idf_path):
+    global IDF_DICTIONARY
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path )  )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    IDF_DICTIONARY = abs_path
+    return
+
+def get_idf(abs_path):
+    content = open(abs_path,'rb').read().decode('utf-8')
    idf_freq = {}
    lines = content.split('\n')
    for line in lines:
        word,freq = line.split(' ')
        idf_freq[word] = float(freq)
-
    median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
+    return idf_freq, median_idf
+
+def extract_tags(sentence,topK=20):
+    global IDF_DICTIONARY
+    idf_freq, median_idf = get_idf(IDF_DICTIONARY)
    stop_words= set([
        "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
    ])
-
-def extract_tags(sentence,topK=20):
    words = jieba.cut(sentence)
    freq = {}
    for w in words:
--- a/test/extract_tags_idfpath.py
+++ b/test/extract_tags_idfpath.py
@@ -0,0 +1,32 @@
+import sys
+sys.path.append('../')
+
+import jieba
+import jieba.analyse
+from optparse import OptionParser
+
+USAGE = "usage:    python extract_tags_idfpath.py [file name] -k [top k]"
+
+parser = OptionParser(USAGE)
+parser.add_option("-k", dest="topK")
+opt, args = parser.parse_args()
+
+
+if len(args) < 1:
+    print USAGE
+    sys.exit(1)
+
+file_name = args[0]
+
+if opt.topK is None:
+    topK = 10
+else:
+    topK = int(opt.topK)
+
+content = open(file_name, 'rb').read()
+
+jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
+
+tags = jieba.analyse.extract_tags(content, topK=topK)
+
+print ",".join(tags)