add an example of using sklearn+jieba

pull/53/head
Sun Junyi 12 years ago
parent a8f902545c
commit 4300f79788

@@ -0,0 +1,63 @@
import sys
sys.path.append("../")  # make the locally checked-out jieba package importable

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition
import jieba
import time
import glob
import os
import random
if len(sys.argv) < 2:
    print("usage: extract_topic.py directory [n_topic] [n_top_words]")
    sys.exit(0)

n_topic = 10       # default number of topics
n_top_words = 25   # default number of words shown per topic
if len(sys.argv) > 2:
    n_topic = int(sys.argv[2])
if len(sys.argv) > 3:
    n_top_words = int(sys.argv[3])
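
# example invocation (the directory name is hypothetical; it should contain
# UTF-8 *.txt files, one document per line):
#   python extract_topic.py ./corpus 10 25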
count_vect = CountVectorizer()
docs = []

pattern = os.path.join(sys.argv[1], "*.txt")
print("read " + pattern)
for f_name in glob.glob(pattern):
    with open(f_name) as f:
        print("read file:", f_name)
        for line in f:  # each line is treated as one document
            words = " ".join(jieba.cut(line))  # segment, then join with spaces for sklearn
            docs.append(words)

random.shuffle(docs)
print("read done.")

print("transform")
counts = count_vect.fit_transform(docs)           # raw term counts
tfidf = TfidfTransformer().fit_transform(counts)  # reweight counts to tf-idf
print(tfidf.shape)
t0 = time.time()
print "training..."
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))
# Invert the vectorizer vocabulary so feature indices can be mapped back to words
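# note: scikit-learn >= 1.0 renames get_feature_names() to get_feature_names_out()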
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    # argsort is ascending, so take the last n_top_words indices in reverse order
    print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print("")