<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://lms.onnocenter.or.id/wiki/index.php?action=history&amp;feed=atom&amp;title=R%3A_tidytext%3A_topic-modelling</id>
	<title>R: tidytext: topic-modelling - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://lms.onnocenter.or.id/wiki/index.php?action=history&amp;feed=atom&amp;title=R%3A_tidytext%3A_topic-modelling"/>
	<link rel="alternate" type="text/html" href="https://lms.onnocenter.or.id/wiki/index.php?title=R:_tidytext:_topic-modelling&amp;action=history"/>
	<updated>2026-04-21T06:15:32Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.45.1</generator>
	<entry>
		<id>https://lms.onnocenter.or.id/wiki/index.php?title=R:_tidytext:_topic-modelling&amp;diff=57743&amp;oldid=prev</id>
		<title>Onnowpurbo: Created page with &quot; # Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/06-topic-models.Rmd     library(knitr)  opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)  options(...&quot;</title>
		<link rel="alternate" type="text/html" href="https://lms.onnocenter.or.id/wiki/index.php?title=R:_tidytext:_topic-modelling&amp;diff=57743&amp;oldid=prev"/>
		<updated>2019-12-04T11:14:25Z</updated>

		<summary type="html">&lt;p&gt;Created page with &amp;quot; # Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/06-topic-models.Rmd     library(knitr)  opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)  options(...&amp;quot;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt; # Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/06-topic-models.Rmd&lt;br /&gt;
  &lt;br /&gt;
 library(knitr)&lt;br /&gt;
 opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)&lt;br /&gt;
 options(width = 100, dplyr.width = 150)&lt;br /&gt;
 library(ggplot2)&lt;br /&gt;
 library(methods)&lt;br /&gt;
 library(scales)&lt;br /&gt;
 theme_set(theme_light())&lt;br /&gt;
&lt;br /&gt;
 ## Latent Dirichlet allocation&lt;br /&gt;
&lt;br /&gt;
 library(topicmodels)&lt;br /&gt;
 data(&amp;quot;AssociatedPress&amp;quot;)&lt;br /&gt;
 AssociatedPress&lt;br /&gt;
&lt;br /&gt;
 # This function returns an object containing the full details of the model fit,&lt;br /&gt;
 # such as how words are associated with topics and&lt;br /&gt;
 # how topics are associated with documents.&lt;br /&gt;
 ap_lda &amp;lt;- LDA(AssociatedPress, k = 2, control = list(seed = 1234))&lt;br /&gt;
 ap_lda&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
 ### Word-topic probabilities&lt;br /&gt;
&lt;br /&gt;
 library(tidytext)&lt;br /&gt;
 ap_topics &amp;lt;- tidy(ap_lda, matrix = &amp;quot;beta&amp;quot;)&lt;br /&gt;
 ap_topics&lt;br /&gt;
&lt;br /&gt;
 # plot&lt;br /&gt;
 library(ggplot2)&lt;br /&gt;
 library(dplyr)&lt;br /&gt;
 ap_top_terms &amp;lt;- ap_topics %&amp;gt;%&lt;br /&gt;
   group_by(topic) %&amp;gt;%&lt;br /&gt;
   top_n(10, beta) %&amp;gt;%&lt;br /&gt;
   ungroup() %&amp;gt;%&lt;br /&gt;
   arrange(topic, -beta)&lt;br /&gt;
 ap_top_terms %&amp;gt;%&lt;br /&gt;
   mutate(term = reorder_within(term, beta, topic)) %&amp;gt;%&lt;br /&gt;
   ggplot(aes(term, beta, fill = factor(topic))) +&lt;br /&gt;
   geom_col(show.legend = FALSE) +&lt;br /&gt;
   facet_wrap(~ topic, scales = &amp;quot;free&amp;quot;) +&lt;br /&gt;
   coord_flip() +&lt;br /&gt;
   scale_x_reordered()&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
 # consider the terms that had the *greatest difference* in $\beta$ between topic 1 and topic 2.&lt;br /&gt;
 # This can be estimated based on the log ratio of the two: $\log_2(\frac{\beta_2}{\beta_1})$&lt;br /&gt;
 # (a log ratio is useful because it makes the difference symmetrical:&lt;br /&gt;
 # $\beta_2$ being twice as large leads to a log ratio of 1,&lt;br /&gt;
 # while $\beta_1$ being twice as large results in -1).&lt;br /&gt;
 # To constrain it to a set of especially relevant words,&lt;br /&gt;
 # we can filter for relatively common words,&lt;br /&gt;
 # such as those that have a $\beta$ greater than 1/1000 in at least one topic.&lt;br /&gt;
 library(tidyr)&lt;br /&gt;
 beta_spread &amp;lt;- ap_topics %&amp;gt;%&lt;br /&gt;
   mutate(topic = paste0(&amp;quot;topic&amp;quot;, topic)) %&amp;gt;%&lt;br /&gt;
   spread(topic, beta) %&amp;gt;%&lt;br /&gt;
   filter(topic1 &amp;gt; .001 | topic2 &amp;gt; .001) %&amp;gt;%&lt;br /&gt;
   mutate(log_ratio = log2(topic2 / topic1))&lt;br /&gt;
 beta_spread&lt;br /&gt;
&lt;br /&gt;
 # plot&lt;br /&gt;
 beta_spread %&amp;gt;%&lt;br /&gt;
   group_by(direction = log_ratio &amp;gt; 0) %&amp;gt;%&lt;br /&gt;
   top_n(10, abs(log_ratio)) %&amp;gt;%&lt;br /&gt;
   ungroup() %&amp;gt;%&lt;br /&gt;
   mutate(term = reorder(term, log_ratio)) %&amp;gt;%&lt;br /&gt;
   ggplot(aes(term, log_ratio)) +&lt;br /&gt;
   geom_col() +&lt;br /&gt;
   labs(y = &amp;quot;Log2 ratio of beta in topic 2 / topic 1&amp;quot;) +&lt;br /&gt;
   coord_flip()&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
 ### Document-topic probabilities&lt;br /&gt;
&lt;br /&gt;
 ap_documents &amp;lt;- tidy(ap_lda, matrix = &amp;quot;gamma&amp;quot;)&lt;br /&gt;
 ap_documents&lt;br /&gt;
&lt;br /&gt;
 # check document 6&lt;br /&gt;
 tidy(AssociatedPress) %&amp;gt;%&lt;br /&gt;
   filter(document == 6) %&amp;gt;%&lt;br /&gt;
   arrange(desc(count))&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
 ## Example: the great library heist&lt;br /&gt;
&lt;br /&gt;
 # download dari gutenberg&lt;br /&gt;
 titles &amp;lt;- c(&amp;quot;Twenty Thousand Leagues under the Sea&amp;quot;, &amp;quot;The War of the Worlds&amp;quot;,&lt;br /&gt;
             &amp;quot;Pride and Prejudice&amp;quot;, &amp;quot;Great Expectations&amp;quot;)&lt;br /&gt;
 library(gutenbergr)&lt;br /&gt;
 books &amp;lt;- gutenberg_works(title %in% titles) %&amp;gt;%&lt;br /&gt;
   gutenberg_download(meta_fields = &amp;quot;title&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
 library(stringr)&lt;br /&gt;
 # divide into documents, each representing one chapter&lt;br /&gt;
 by_chapter &amp;lt;- books %&amp;gt;%&lt;br /&gt;
   group_by(title) %&amp;gt;%&lt;br /&gt;
   mutate(chapter = cumsum(str_detect(text, regex(&amp;quot;^chapter &amp;quot;, ignore_case = TRUE)))) %&amp;gt;%&lt;br /&gt;
   ungroup() %&amp;gt;%&lt;br /&gt;
   filter(chapter &amp;gt; 0) %&amp;gt;%&lt;br /&gt;
   unite(document, title, chapter)&lt;br /&gt;
 # split into words&lt;br /&gt;
 by_chapter_word &amp;lt;- by_chapter %&amp;gt;%&lt;br /&gt;
   unnest_tokens(word, text)&lt;br /&gt;
 # find document-word counts&lt;br /&gt;
 word_counts &amp;lt;- by_chapter_word %&amp;gt;%&lt;br /&gt;
   anti_join(stop_words) %&amp;gt;%&lt;br /&gt;
   count(document, word, sort = TRUE) %&amp;gt;%&lt;br /&gt;
   ungroup()&lt;br /&gt;
 word_counts&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
 ### LDA on chapters&lt;br /&gt;
&lt;br /&gt;
 chapters_dtm &amp;lt;- word_counts %&amp;gt;%&lt;br /&gt;
   cast_dtm(document, word, n)&lt;br /&gt;
 chapters_dtm&lt;br /&gt;
&lt;br /&gt;
 # use the `LDA()` function to create a four-topic model&lt;br /&gt;
 chapters_lda &amp;lt;- LDA(chapters_dtm, k = 4, control = list(seed = 1234))&lt;br /&gt;
 chapters_lda&lt;br /&gt;
&lt;br /&gt;
 # examine per-topic-per-word probabilities.&lt;br /&gt;
 chapter_topics &amp;lt;- tidy(chapters_lda, matrix = &amp;quot;beta&amp;quot;)&lt;br /&gt;
 chapter_topics&lt;br /&gt;
&lt;br /&gt;
 # use dplyr&amp;#039;s `top_n()` to find the top 5 terms within each topic.&lt;br /&gt;
 top_terms &amp;lt;- chapter_topics %&amp;gt;%&lt;br /&gt;
   group_by(topic) %&amp;gt;%&lt;br /&gt;
   top_n(5, beta) %&amp;gt;%&lt;br /&gt;
   ungroup() %&amp;gt;%&lt;br /&gt;
   arrange(topic, -beta)&lt;br /&gt;
 top_terms&lt;br /&gt;
&lt;br /&gt;
 # plot&lt;br /&gt;
 library(ggplot2)&lt;br /&gt;
 top_terms %&amp;gt;%&lt;br /&gt;
   mutate(term = reorder_within(term, beta, topic)) %&amp;gt;%&lt;br /&gt;
   ggplot(aes(term, beta, fill = factor(topic))) +&lt;br /&gt;
   geom_col(show.legend = FALSE) +&lt;br /&gt;
   facet_wrap(~ topic, scales = &amp;quot;free&amp;quot;) +&lt;br /&gt;
   coord_flip() +&lt;br /&gt;
   scale_x_reordered()&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
 ### Per-document classification&lt;br /&gt;
&lt;br /&gt;
 chapters_gamma &amp;lt;- tidy(chapters_lda, matrix = &amp;quot;gamma&amp;quot;)&lt;br /&gt;
 chapters_gamma&lt;br /&gt;
&lt;br /&gt;
 # re-separate the document name into title and chapter&lt;br /&gt;
 chapters_gamma &amp;lt;- chapters_gamma %&amp;gt;%&lt;br /&gt;
   separate(document, c(&amp;quot;title&amp;quot;, &amp;quot;chapter&amp;quot;), sep = &amp;quot;_&amp;quot;, convert = TRUE)&lt;br /&gt;
 chapters_gamma&lt;br /&gt;
&lt;br /&gt;
 # reorder titles in order of topic 1, topic 2, etc before plotting&lt;br /&gt;
 # ERROR disini&lt;br /&gt;
 chapters_gamma %&amp;gt;%&lt;br /&gt;
   mutate(title = reorder(title, gamma * topic)) %&amp;gt;%&lt;br /&gt;
   ggplot(aes(factor(topic), gamma)) +&lt;br /&gt;
   geom_boxplot() +&lt;br /&gt;
   facet_wrap(~ title)&lt;br /&gt;
&lt;br /&gt;
 # look like some chapters from Great Expectations (which should be topic 4)&lt;br /&gt;
 # were somewhat associated with other topics&lt;br /&gt;
 chapter_classifications &amp;lt;- chapters_gamma %&amp;gt;%&lt;br /&gt;
   group_by(title, chapter) %&amp;gt;%&lt;br /&gt;
   top_n(1, gamma) %&amp;gt;%&lt;br /&gt;
   ungroup()&lt;br /&gt;
 chapter_classifications&lt;br /&gt;
&lt;br /&gt;
 book_topics &amp;lt;- chapter_classifications %&amp;gt;%&lt;br /&gt;
   count(title, topic) %&amp;gt;%&lt;br /&gt;
   group_by(title) %&amp;gt;%&lt;br /&gt;
   top_n(1, n) %&amp;gt;%&lt;br /&gt;
   ungroup() %&amp;gt;%&lt;br /&gt;
   transmute(consensus = title, topic)&lt;br /&gt;
 chapter_classifications %&amp;gt;%&lt;br /&gt;
   inner_join(book_topics, by = &amp;quot;topic&amp;quot;) %&amp;gt;%&lt;br /&gt;
   filter(title != consensus)&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
 ### By word assignments: `augment`&lt;br /&gt;
&lt;br /&gt;
 assignments &amp;lt;- augment(chapters_lda, data = chapters_dtm)&lt;br /&gt;
 assignments&lt;br /&gt;
&lt;br /&gt;
 # returns a tidy data frame of book-term counts, but adds an extra column: `.topic`&lt;br /&gt;
 assignments &amp;lt;- assignments %&amp;gt;%&lt;br /&gt;
   separate(document, c(&amp;quot;title&amp;quot;, &amp;quot;chapter&amp;quot;), sep = &amp;quot;_&amp;quot;, convert = TRUE) %&amp;gt;%&lt;br /&gt;
   inner_join(book_topics, by = c(&amp;quot;.topic&amp;quot; = &amp;quot;topic&amp;quot;))&lt;br /&gt;
 assignments&lt;br /&gt;
&lt;br /&gt;
 # combination of the true book (`title`) and the book assigned to it (`consensus`)&lt;br /&gt;
 # is useful for further exploration.&lt;br /&gt;
 library(scales)&lt;br /&gt;
 assignments %&amp;gt;%&lt;br /&gt;
   count(title, consensus, wt = count) %&amp;gt;%&lt;br /&gt;
   group_by(title) %&amp;gt;%&lt;br /&gt;
   mutate(percent = n / sum(n)) %&amp;gt;%&lt;br /&gt;
   ggplot(aes(consensus, title, fill = percent)) +&lt;br /&gt;
   geom_tile() +&lt;br /&gt;
   scale_fill_gradient2(high = &amp;quot;red&amp;quot;, label = percent_format()) +&lt;br /&gt;
   theme_minimal() +&lt;br /&gt;
   theme(axis.text.x = element_text(angle = 90, hjust = 1),&lt;br /&gt;
         panel.grid = element_blank()) +&lt;br /&gt;
   labs(x = &amp;quot;Book words were assigned to&amp;quot;,&lt;br /&gt;
        y = &amp;quot;Book words came from&amp;quot;,&lt;br /&gt;
        fill = &amp;quot;% of assignments&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
 # What were the most commonly mistaken words?&lt;br /&gt;
 wrong_words &amp;lt;- assignments %&amp;gt;%&lt;br /&gt;
   filter(title != consensus)&lt;br /&gt;
 wrong_words&lt;br /&gt;
 wrong_words %&amp;gt;%&lt;br /&gt;
   count(title, consensus, term, wt = count) %&amp;gt;%&lt;br /&gt;
   ungroup() %&amp;gt;%&lt;br /&gt;
   arrange(desc(n))&lt;br /&gt;
&lt;br /&gt;
 #  confirm &amp;quot;flopson&amp;quot; appears only in *Great Expectations*,&lt;br /&gt;
 # even though it&amp;#039;s assigned to the &amp;quot;Pride and Prejudice&amp;quot; cluster.&lt;br /&gt;
 word_counts %&amp;gt;%&lt;br /&gt;
   filter(word == &amp;quot;flopson&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
 ## Alternative LDA implementations&lt;br /&gt;
&lt;br /&gt;
 library(dplyr)&lt;br /&gt;
 library(tidytext)&lt;br /&gt;
 library(stringr)&lt;br /&gt;
 library(ggplot2)&lt;br /&gt;
 theme_set(theme_light())&lt;br /&gt;
&lt;br /&gt;
 library(mallet)&lt;br /&gt;
 # create a vector with one string per chapter&lt;br /&gt;
 collapsed &amp;lt;- by_chapter_word %&amp;gt;%&lt;br /&gt;
   anti_join(stop_words, by = &amp;quot;word&amp;quot;) %&amp;gt;%&lt;br /&gt;
   mutate(word = str_replace(word, &amp;quot;&amp;#039;&amp;quot;, &amp;quot;&amp;quot;)) %&amp;gt;%&lt;br /&gt;
   group_by(document) %&amp;gt;%&lt;br /&gt;
   summarize(text = paste(word, collapse = &amp;quot; &amp;quot;))&lt;br /&gt;
 # create an empty file of &amp;quot;stopwords&amp;quot;&lt;br /&gt;
 file.create(empty_file &amp;lt;- tempfile())&lt;br /&gt;
 docs &amp;lt;- mallet.import(collapsed$document, collapsed$text, empty_file)&lt;br /&gt;
 mallet_model &amp;lt;- MalletLDA(num.topics = 4)&lt;br /&gt;
 mallet_model$loadDocuments(docs)&lt;br /&gt;
 mallet_model$train(100)&lt;br /&gt;
&lt;br /&gt;
 # use the `tidy()` and `augment()` functions described in the rest of the chapter&lt;br /&gt;
 # in an almost identical way. This includes extracting the probabilities of words&lt;br /&gt;
 # within each topic or topics within each document.&lt;br /&gt;
&lt;br /&gt;
 # word-topic pairs&lt;br /&gt;
 tidy(mallet_model)&lt;br /&gt;
 # document-topic pairs&lt;br /&gt;
 tidy(mallet_model, matrix = &amp;quot;gamma&amp;quot;)&lt;br /&gt;
 # column needs to be named &amp;quot;term&amp;quot; for &amp;quot;augment&amp;quot;&lt;br /&gt;
 term_counts &amp;lt;- rename(word_counts, term = word)&lt;br /&gt;
 augment(mallet_model, term_counts)&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
==Referensi==&lt;br /&gt;
&lt;br /&gt;
* https://github.com/dgrtwo/tidy-text-mining/blob/master/06-topic-models.Rmd&lt;br /&gt;
&lt;br /&gt;
==Pranala Menarik==&lt;br /&gt;
&lt;br /&gt;
* [[R]]&lt;/div&gt;</summary>
		<author><name>Onnowpurbo</name></author>
	</entry>
</feed>