# Origin: test/extract_topic.py, added by commit
#   4300f7978803e2af27c577f08d461684f0ad058f
#   Author:  Sun Junyi
#   Date:    Fri, 17 May 2013 09:35:12 +0800
#   Subject: add a example of using sklearn+jieba
"""Extract topics from a directory of text files with jieba and scikit-learn.

Usage:
    extract_topic.py directory [n_topic] [n_top_words]

Every line of every ``*.txt`` file in ``directory`` is treated as one
document. Each document is segmented with ``jieba.cut``, the segmented
documents are turned into a TF-IDF matrix, and an NMF model with ``n_topic``
components is fitted to it. For every topic the ``n_top_words`` highest
weighted vocabulary terms are printed.

Defaults: ``n_topic`` = 10, ``n_top_words`` = 25.
"""
import sys

# Must run before ``import jieba`` so that a jieba package located one
# directory above the current working directory can be imported.
sys.path.append("../")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition

import jieba
import time
import glob
import os
import random

# NOTE: every print below is a single-argument, parenthesised call so the
# script emits the same text under both Python 2 and Python 3.

if len(sys.argv) < 2:
    print("usage: extract_topic.py directory [n_topic] [n_top_words]")
    sys.exit(0)

n_topic = 10
n_top_words = 25

# Optional positional overrides for the two defaults above.
if len(sys.argv) > 2:
    n_topic = int(sys.argv[2])

if len(sys.argv) > 3:
    n_top_words = int(sys.argv[3])

count_vect = CountVectorizer()
docs = []

pattern = os.path.join(sys.argv[1], "*.txt")
print("read " + pattern)

for f_name in glob.glob(pattern):
    with open(f_name) as f:
        print("read file: %s" % f_name)
        for line in f:  # one line as a document
            # CountVectorizer splits on whitespace/punctuation, so the
            # segmented words are re-joined with single spaces.
            words = " ".join(jieba.cut(line))
            docs.append(words)

# Randomise the document order so it no longer follows the file order.
random.shuffle(docs)

print("read done.")

print("transform")
counts = count_vect.fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts)
print(tfidf.shape)


t0 = time.time()
print("training...")

nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

# Map column indices of the term matrix back to vocabulary terms so the
# topics can be printed as words. ``get_feature_names`` was replaced by
# ``get_feature_names_out`` in newer scikit-learn releases; use whichever
# one this installation provides.
if hasattr(count_vect, "get_feature_names_out"):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    # argsort() is ascending; the reversed slice walks it backwards from the
    # end, yielding the indices of the n_top_words largest weights,
    # largest first.
    print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print("")