1. Loading file and data
# Location of the input text file, relative to the working directory
filePath <- "../example.txt"
# Read the file into a character vector, one element per line
text <- readLines(con = filePath)
# Load the data as a tibble
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Number the lines from the data itself rather than hard-coding 1:15, so the
# tibble is built for an input file of any length (a fixed 1:15 fails as soon
# as `text` does not have exactly 15 elements).
text_df <- tibble(line = seq_along(text), text = text)
2. Tokenizing
library(tidytext)
# Tokenize the `text` column of text_df into a new `word` column
token_df <- unnest_tokens(text_df, word, text)

# Show the tokenized table
token_df
3. Removing generic stopwords
# Load the generic stop_words dataset
data(stop_words)

# Keep only the tokens whose `word` has no match in stop_words
clean_df <- anti_join(token_df, stop_words, by = "word")
4. Removing stopwords defined by me
# Read the custom stopword list; its `text` column holds the words to drop.
# stringsAsFactors = FALSE keeps that column as character on R < 4.0, where
# read.csv() would otherwise return a factor and joining it against the
# character `word` column can trigger a coercion warning.
mystop <- read.csv("../stop.csv", header = TRUE, stringsAsFactors = FALSE)

# Remove every token that also appears in the custom stopword list
cleaner_df <- clean_df %>%
  anti_join(mystop, by = c("word" = "text"))

# Show the remaining tokens
cleaner_df
5. Getting word count
# Tally the occurrences of each word, most frequent first; the counts are
# stored in column `n`
frequency <- count(cleaner_df, word, sort = TRUE)
6. Plotting
Version A
library(wordcloud)
# Static word cloud drawn from the frequency table: only words counted at
# least 4 times, capped at 100 words, using the 8-colour "Dark2" palette
wordcloud(
  words = frequency$word,
  freq = frequency$n,
  min.freq = 4,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

Version B
library(wordcloud2)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
# HTML-widget word cloud built from the same frequency table
wordcloud2(
  data = frequency,
  size = 1.6,
  color = "random-dark"
)
LS0tCnRpdGxlOiAiV29yZGNsb3VkIHdpdGggUiIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMgMS4gTG9hZGluZyBmaWxlIGFuZCBkYXRhCgpgYGB7cn0KIyBSZWFkIHRoZSB0ZXh0IGZpbGUgCmZpbGVQYXRoIDwtICIuLi9leGFtcGxlLnR4dCIKdGV4dCA8LSByZWFkTGluZXMoZmlsZVBhdGgpCgojIExvYWQgdGhlIGRhdGEgYXMgYSB0aWJibGUKbGlicmFyeShkcGx5cikKdGV4dF9kZiA8LSB0aWJibGUobGluZSA9IDE6MTUsIHRleHQgPSB0ZXh0KQpgYGAKIyMgMi4gVG9rZW5pemluZwoKYGBge3J9CmxpYnJhcnkodGlkeXRleHQpCgp0b2tlbl9kZiA8LSB0ZXh0X2RmICU+JQogICAgICAgICAgICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpCgp0b2tlbl9kZgpgYGAKIyMgMy4gUmVtb3ZpbmcgZ2VuZXJpYyBzdG9wd29yZHMKCmBgYHtyfQpkYXRhKHN0b3Bfd29yZHMpCgpjbGVhbl9kZiA8LSB0b2tlbl9kZiAlPiUKICBhbnRpX2pvaW4oc3RvcF93b3JkcywgYnkgPSBjKCJ3b3JkIiA9ICJ3b3JkIikpCgpgYGAKCiMjIDQuIFJlbW92aW5nIHN0b3B3b3JkcyBkZWZpbmVkIGJ5IG1lCgpgYGB7cn0KbXlzdG9wIDwtIHJlYWQuY3N2KCIuLi9zdG9wLmNzdiIsIGhlYWRlciA9IFRSVUUpCgpjbGVhbmVyX2RmIDwtIGNsZWFuX2RmICU+JQogIGFudGlfam9pbihteXN0b3AsIGJ5ID0gYygid29yZCIgPSAidGV4dCIpKQoKY2xlYW5lcl9kZgpgYGAKIyMgNS4gR2V0dGluZyB3b3JkIGNvdW50CmBgYHtyfQpmcmVxdWVuY3kgPC0gY2xlYW5lcl9kZiAlPiUKICAgICAgICAgICAgICBjb3VudCh3b3JkLCBzb3J0ID0gVFJVRSkgCmBgYAoKIyMgNi4gUGxvdHRpbmcKCiMjIyBWZXJzaW9uIEEKYGBge3J9CmxpYnJhcnkod29yZGNsb3VkKQoKZnJlcXVlbmN5ICU+JQogd2l0aCh3b3JkY2xvdWQod29yZCwgbiwgbWluLmZyZXEgPSA0LCBtYXgud29yZHMgPSAxMDAsIHJhbmRvbS5vcmRlcj1GQUxTRSwgcm90LnBlcj0wLjM1LCBjb2xvcnM9YnJld2VyLnBhbCg4LCAiRGFyazIiKSkpCmBgYAoKIyMjIFZlcnNpb24gQgpgYGB7cn0KbGlicmFyeSh3b3JkY2xvdWQyKQpgYGAKCmBgYHtyfQogIHdvcmRjbG91ZDIoZGF0YT1mcmVxdWVuY3ksIHNpemU9MS42LCBjb2xvcj0ncmFuZG9tLWRhcmsnKQpgYGAKCgoKCgoKCgo=