LDA - kanekkie’s blog

LDAをためしてみる

$ pip install gensim pyldavis

gensimでトピックを生成し、LDAvisで可視化
データセットはnltk.reuters
Jupyter Notebookで実行

from nltk.corpus import reuters, stopwords
from gensim.utils import lemmatize
import re
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
import pyLDAvis
import pyLDAvis.gensim

# All file ids in the NLTK Reuters corpus; only those whose id starts with
# "train" are kept as the training split.
documents = reuters.fileids()
train_docs = [doc_id for doc_id in documents if doc_id.startswith("train")]

# English stop-word list handed to the lemmatizer.
stopwords_list = stopwords.words("english")

# Matches a trailing "/" followed by uppercase letters at the end of a lemma
# (presumably the part-of-speech tag appended by lemmatize -- confirm against
# the gensim documentation for the installed version).
_tag_suffix = re.compile(r'\/[A-Z]+$')


def _lemmas_without_tags(doc_id):
    """Return the lemmas of one Reuters document with the trailing tag removed.

    The raw document text is lemmatized with the stop-word list applied; each
    resulting item is decoded from UTF-8 bytes and stripped of its "/TAG"
    suffix.
    """
    raw_text = reuters.raw(doc_id)
    lemmas = lemmatize(raw_text, stopwords=stopwords_list)
    return [_tag_suffix.sub('', lemma.decode("utf-8")) for lemma in lemmas]


# One token list per training document.
texts = [_lemmas_without_tags(doc_id) for doc_id in train_docs]

## cf. https://radimrehurek.com/gensim/models/ldamodel.html
# Build the gensim Dictionary from the tokenized documents, then turn every
# document into its bag-of-words representation through that dictionary.
dic = Dictionary(texts)
corpus = list(map(dic.doc2bow, texts))

# Fit the LDA model on the bag-of-words corpus with a fixed topic count.
num_topics = 20
model = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dic,
)

# Visualize the trained topics with pyLDAvis; the post states this is run
# inside a Jupyter notebook.
pyLDAvis.enable_notebook()
# Build the visualization data from the model, the bag-of-words corpus and the
# dictionary. NOTE(review): sort_topics=False presumably keeps the topic
# numbering aligned with the model's own topic ids -- confirm in pyLDAvis docs.
vis = pyLDAvis.gensim.prepare(model, corpus, dic, sort_topics=False)
# `display` is not imported in this script; presumably the builtin provided by
# the IPython/Jupyter environment -- verify when running outside a notebook.
display(vis)