# Install and load the corpus of JSS paper metadata from the WU data repository.
install.packages("corpus.JSS.papers", repos = "http://datacube.wu.ac.at/", type = "source")
data("JSS_papers", package = "corpus.JSS.papers")
# Keep only papers whose abstracts are pure ASCII: Encoding() reports
# "unknown" for plain ASCII strings, so this drops non-ASCII abstracts.
# vapply (not sapply) guarantees a character vector even for edge cases.
JSS_papers <- JSS_papers[vapply(JSS_papers[, "description"], Encoding, character(1)) == "unknown", ]
dim(JSS_papers)
## [1] 754 15
# Inspect the full metadata record (Dublin Core style fields) of the first paper.
JSS_papers[1,]
## $title
## [1] "A Diagnostic to Assess the Fit of a Variogram Model to Spatial Data"
##
## $creator
## [1] "Barry, Ronald"
##
## $subject
## [1] "" "" ""
##
## $description
## [1] "The fit of a variogram model to spatially-distributed data is often difficult to assess. A graphical diagnostic written in S-plus is introduced that allows the user to determine both the general quality of the fit of a variogram model, and to find specific pairs of locations that do not have measurements that are consonant with the fitted variogram. It can help identify nonstationarity, outliers, and poor variogram fit in general. Simulated data sets and a set of soil nitrogen concentration data are examined using this graphical diagnostic."
##
## $publisher
## [1] "Foundation for Open Access Statistics"
##
## $contributor
## [1] ""
##
## $date
## [1] "1996-08-16"
##
## $type
## [1] "info:eu-repo/semantics/article"
## [2] "info:eu-repo/semantics/publishedVersion"
## [3] "Peer-reviewed Article"
##
## $format
## [1] "PB"
##
## $identifier
## [1] "https://www.jstatsoft.org/index.php/jss/article/view/v001i01"
## [2] "10.18637/jss.v001.i01"
##
## $source
## [1] "Journal of Statistical Software; Vol 1 (1997); 1 - 11"
## [2] "1548-7660"
##
## $language
## [1] "eng"
##
## $relation
## [1] "https://www.jstatsoft.org/index.php/jss/article/view/v001i01/barry.pdf"
## [2] "https://www.jstatsoft.org/index.php/jss/article/downloadSuppFile/v001i01/code.txt"
##
## $coverage
## character(0)
##
## $rights
## [1] "Copyright (c) 1996 Ronald Barry"
# Publication dates span from the journal's first paper to the data snapshot date.
range(JSS_papers[, "date"])
## [1] "1996-08-16" "2017-01-11"
# Install the text-mining, topic-modeling, and visualization packages used below.
install.packages(c("tm", "SnowballC", "topicmodels", "wordcloud", "RColorBrewer"))
# We use the tm package to transform the data to a Corpus and then to a
# DocumentTermMatrix. When transforming the data to a DocumentTermMatrix, we
# apply the following steps: terms are stemmed, and stop words, punctuation,
# numbers, and terms of length less than 3 are removed (via the control argument).
library(tm)
## Loading required package: NLP
library(SnowballC) # provides the Porter stemmer used by stemming = TRUE
# Build a corpus from the abstracts and convert it to a document-term matrix.
myCorpus <- Corpus(VectorSource(JSS_papers[, "description"]))
# NOTE: current versions of tm silently ignore the old `minWordLength` option;
# the supported control option for dropping terms shorter than 3 characters is
# `wordLengths = c(3, Inf)` (which is also tm's default, so the dimensions
# reported below are unchanged).
JSS_dtm <- DocumentTermMatrix(myCorpus,
  control = list(stemming = TRUE, stopwords = TRUE,
                 wordLengths = c(3, Inf),
                 removeNumbers = TRUE, removePunctuation = TRUE))
dim(JSS_dtm)
## [1] 754 4646
# Inspect the (stemmed) vocabulary of the document-term matrix.
words <- colnames(JSS_dtm)
head(words, 20)
## [1] "aalen" "abadi" "abbrevi" "aberr" "abil"
## [6] "abl" "absenc" "absent" "absolut" "absorb"
## [11] "abstract" "abund" "academ" "acceler" "accept"
## [16] "access" "accommod" "accompani" "accomplish" "accord"
# A single term's column: sparse, with nonzero counts in only a few abstracts.
JSS_dtm[, 1]
## <<DocumentTermMatrix (documents: 754, terms: 1)>>
## Non-/sparse entries: 3/751
## Sparsity : 100%
## Maximal term length: 5
## Weighting : term frequency (tf)
# Goal: remove terms with low frequencies as well as those occurring in many
# documents. We only include terms whose tf-idf values are at least 0.1, which
# is a bit more than the median and ensures that the very frequent terms are
# omitted.
library("slam")
summary(col_sums(JSS_dtm))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 1.0 2.0 12.8 7.0 1420.0
# tf-idf per term: mean document-normalized term frequency times the
# log inverse document frequency (computed on the sparse triplet slots).
rel_tf <- JSS_dtm$v / row_sums(JSS_dtm)[JSS_dtm$i]
mean_tf <- tapply(rel_tf, JSS_dtm$j, mean)
idf <- log2(nDocs(JSS_dtm) / col_sums(JSS_dtm > 0))
term_tfidf <- mean_tf * idf
summary(term_tfidf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0123 0.0803 0.1060 0.1410 0.1590 1.0400
# Keep only informative terms, then drop any documents left with no terms.
JSS_dtm <- JSS_dtm[, term_tfidf >= 0.1]
JSS_dtm <- JSS_dtm[row_sums(JSS_dtm) > 0, ]
summary(col_sums(JSS_dtm))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 2.00 4.15 4.00 176.00
dim(JSS_dtm)
## [1] 753 2551
# Re-inspect the vocabulary after tf-idf pruning.
words <- colnames(JSS_dtm)
head(words, 20)
## [1] "aalen" "abadi" "aberr" "absenc" "absorb"
## [6] "abund" "academ" "accept" "accompani" "accuraci"
## [11] "accustom" "acidif" "acrobat" "action" "actor"
## [16] "actuar" "actuari" "acycl" "ada" "adaboost"
# The output is an LDA object, from which we obtain the posterior topic
# distribution per document and the posterior term distribution per topic.
library("topicmodels")
# Fit a 30-topic LDA model by variational EM.
k <- 30
# VEM starts from a random initialization; fixing the seed makes the fit
# reproducible. (The transcript output below was produced without a fixed
# seed, so the exact topics obtained may differ.)
VEM <- LDA(JSS_dtm, k = k, method = "VEM", control = list(seed = 2010))
# a mixing weight vector (over 30 topics) for each document
dim(posterior(VEM)$topics)
## [1] 753 30
# a term frequency vector (over all words) for each of the 30 topic
dim(posterior(VEM)$terms)
## [1] 30 2551
# 5 most frequent terms for each topic
freq.Terms <- terms(VEM, 5)
dim(freq.Terms)
## [1] 5 30
# Display the 5 highest-probability (stemmed) terms for each of the 30 topics.
freq.Terms
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
## [1,] "imput" "surviv" "chart" "file" "treatment" "fmri"
## [2,] "latent" "gene" "decis" "attribut" "sequenc" "meta"
## [3,] "posit" "microarray" "tree" "shrinkag" "stage" "multilevel"
## [4,] "roc" "census" "polit" "record" "mine" "captur"
## [5,] "mida" "project" "bar" "toolbox" "trial" "jag"
## Topic 7 Topic 8 Topic 9 Topic 10 Topic 11 Topic 12 Topic 13
## [1,] "panel" "anim" "stat" "cluster" "quantil" "poisson" "kernel"
## [2,] "frame" "dose" "lisp" "equat" "circular" "zero" "ensembl"
## [3,] "tree" "phase" "java" "ordin" "exposur" "inflat" "aspect"
## [4,] "map" "trial" "web" "score" "forecast" "binomi" "ssa"
## [5,] "micromap" "poli" "side" "partit" "pomp" "regular" "boost"
## Topic 14 Topic 15 Topic 16 Topic 17 Topic 18 Topic 19
## [1,] "stationari" "network" "item" "arima" "ecolog" "pattern"
## [2,] "winbug" "graph" "irt" "fuzzi" "subset" "transit"
## [3,] "map" "ergm" "rasch" "sunflow" "speci" "econometr"
## [4,] "mathemat" "edg" "person" "season" "autoregress" "volum"
## [5,] "conting" "graphlet" "score" "color" "monitor" "fast"
## Topic 20 Topic 21 Topic 22 Topic 23 Topic 24 Topic 25
## [1,] "imag" "event" "index" "survey" "gui" "genet"
## [2,] "project" "rank" "diffus" "threshold" "inequ" "map"
## [3,] "ratio" "longitudin" "dirichlet" "wavelet" "constrain" "match"
## [4,] "gee" "surviv" "beta" "plan" "bay" "qtl"
## [5,] "odd" "hazard" "sampler" "vista" "jan" "analogu"
## Topic 26 Topic 27 Topic 28 Topic 29 Topic 30
## [1,] "copula" "censor" "genet" "formula" "matric"
## [2,] "network" "chang" "author" "tempor" "distanc"
## [3,] "socr" "contamin" "haplotyp" "accuraci" "mantel"
## [4,] "eda" "permut" "mokken" "cumul" "molecular"
## [5,] "clickstream" "impur" "tgui" "spatio" "posit"
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Posterior term distribution for each topic (rows = topics, cols = terms).
topic_terms <- posterior(VEM)$terms
# Wordcloud for all topics: aggregate each term's weight across all topics.
v <- sort(colSums(topic_terms), decreasing = TRUE)
myNames <- names(v)
d <- data.frame(word = myNames, freq = v)
wordcloud(words = d$word, freq = d$freq, max.words = 50, scale = c(4, 0.2),
          random.order = FALSE, rot.per = 0.25,
          colors = brewer.pal(8, "Dark2"))
# Wordcloud for individual topics: one panel per selected topic.
topic.id <- c(4, 6, 8, 12, 13, 27)
# Save the current graphics state so the 2x3 layout does not leak into
# later plots; par() returns the previous settings.
op <- par(mfrow = c(2, 3))
# seq_along avoids hard-coding the panel count to match length(topic.id).
for (i in seq_along(topic.id)) {
  v <- sort(topic_terms[topic.id[i], ], decreasing = TRUE)
  myNames <- names(v)
  d <- data.frame(word = myNames, freq = v)
  wordcloud(d$word, d$freq, max.words = 30, scale = c(4, 0.3),
            random.order = FALSE, rot.per = 0.25,
            colors = brewer.pal(8, "Dark2"))
}
par(op) # restore the previous graphics layout