LDA
LDAをためしてみる
$ pip install gensim pyldavis nltk
gensimでトピックを生成し、LDAvisで可視化
データセットはnltk.reuters
Jupyter Notebookで実行
# Train an LDA topic model on the Reuters training documents and render the
# result with pyLDAvis. Written to be executed inside a Jupyter notebook.
import re

import pyLDAvis
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from nltk.corpus import reuters, stopwords

try:
    # Only present in gensim < 4.0; gensim 4.0 removed lemmatize() together
    # with its `pattern` dependency.
    from gensim.utils import lemmatize
except ImportError:
    lemmatize = None

try:
    # pyLDAvis 3.x renamed the gensim adapter module to `gensim_models`.
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    # Older pyLDAvis releases expose the same helper as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensim_vis

# gensim's lemmatize() yields byte tokens such as b"say/VB"; this pattern
# strips the trailing "/TAG" part. Compiled once instead of once per token.
POS_TAG_SUFFIX = re.compile(r'\/[A-Z]+$')

# Word matcher used only by the WordNet fallback tokenizer below.
WORD_PATTERN = re.compile(r"[a-z]+")


def _gensim_tokens(raw_text, stop_words):
    """Return the lemmas of *raw_text* produced by gensim's lemmatize().

    Each token is decoded from UTF-8 and its "/POS" suffix is removed.
    """
    return [
        POS_TAG_SUFFIX.sub("", token.decode("utf-8"))
        for token in lemmatize(raw_text, stopwords=stop_words)
    ]


def _wordnet_tokens(raw_text, stop_words, lemmatizer):
    """Return lemmas of *raw_text* using an NLTK WordNet lemmatizer.

    Fallback for environments where gensim's lemmatize() is unavailable.
    It is an approximation: words are lower-cased, lemmatized as nouns, and
    kept when 2-15 characters long and not in *stop_words*. Unlike gensim's
    lemmatize() it does no part-of-speech filtering.
    """
    tokens = []
    for word in WORD_PATTERN.findall(raw_text.lower()):
        lemma = lemmatizer.lemmatize(word)
        if 2 <= len(lemma) <= 15 and lemma not in stop_words:
            tokens.append(lemma)
    return tokens


documents = reuters.fileids()
# Reuters file ids are prefixed with their split; keep the training split.
train_docs = [doc for doc in documents if doc.startswith("train")]
stopwords_list = stopwords.words("english")

if lemmatize is not None:
    texts = [
        _gensim_tokens(reuters.raw(doc), stopwords_list) for doc in train_docs
    ]
else:
    # Requires the NLTK "wordnet" data package (nltk.download("wordnet")).
    from nltk.stem import WordNetLemmatizer

    wordnet_lemmatizer = WordNetLemmatizer()
    stopword_set = frozenset(stopwords_list)  # O(1) membership tests
    texts = [
        _wordnet_tokens(reuters.raw(doc), stopword_set, wordnet_lemmatizer)
        for doc in train_docs
    ]

# cf. https://radimrehurek.com/gensim/models/ldamodel.html
# Create a corpus from a list of texts
dic = Dictionary(texts)
corpus = [dic.doc2bow(text) for text in texts]

# Train the model on the corpus.
num_topics = 20
model = LdaModel(corpus=corpus, id2word=dic, num_topics=num_topics)

pyLDAvis.enable_notebook()
# sort_topics=False keeps pyLDAvis topic numbering aligned with the model's.
vis = gensim_vis.prepare(model, corpus, dic, sort_topics=False)
# `display` is supplied by the Jupyter/IPython session; it is not imported here.
display(vis)