# Install and load the corpus of JSS paper metadata from the WU data repository.
install.packages("corpus.JSS.papers", repos = "http://datacube.wu.ac.at/", type = "source")
data("JSS_papers", package = "corpus.JSS.papers")
# Keep only papers whose abstracts are pure ASCII: Encoding() reports
# "unknown" for plain ASCII strings, so this drops non-ASCII abstracts.
# vapply (not sapply) guarantees a character vector even for edge cases.
JSS_papers <- JSS_papers[vapply(JSS_papers[, "description"], Encoding, character(1)) == "unknown", ]
dim(JSS_papers)
## [1] 754 15
# Inspect the full metadata record (Dublin Core style fields) of the first paper.
JSS_papers[1,]
## $title
## [1] "A Diagnostic to Assess the Fit of a Variogram Model to Spatial Data"
##
## $creator
## [1] "Barry, Ronald"
##
## $subject
## [1] "" "" ""
##
## $description
## [1] "The fit of a variogram model to spatially-distributed data is often difficult to assess. A graphical diagnostic written in S-plus is introduced that allows the user to determine both the general quality of the fit of a variogram model, and to find specific pairs of locations that do not have measurements that are consonant with the fitted variogram. It can help identify nonstationarity, outliers, and poor variogram fit in general. Simulated data sets and a set of soil nitrogen concentration data are examined using this graphical diagnostic."
##
## $publisher
## [1] "Foundation for Open Access Statistics"
##
## $contributor
## [1] ""
##
## $date
## [1] "1996-08-16"
##
## $type
## [1] "info:eu-repo/semantics/article"
## [2] "info:eu-repo/semantics/publishedVersion"
## [3] "Peer-reviewed Article"
##
## $format
## [1] "PB"
##
## $identifier
## [1] "https://www.jstatsoft.org/index.php/jss/article/view/v001i01"
## [2] "10.18637/jss.v001.i01"
##
## $source
## [1] "Journal of Statistical Software; Vol 1 (1997); 1 - 11"
## [2] "1548-7660"
##
## $language
## [1] "eng"
##
## $relation
## [1] "https://www.jstatsoft.org/index.php/jss/article/view/v001i01/barry.pdf"
## [2] "https://www.jstatsoft.org/index.php/jss/article/downloadSuppFile/v001i01/code.txt"
##
## $coverage
## character(0)
##
## $rights
## [1] "Copyright (c) 1996 Ronald Barry"
# Publication dates span from the journal's first paper to the data snapshot date.
range(JSS_papers[, "date"])
## [1] "1996-08-16" "2017-01-11"
# Install the text-mining, topic-modeling, and visualization packages used below.
install.packages(c("tm", "SnowballC", "topicmodels", "wordcloud", "RColorBrewer"))
# We use the tm package to transform the data to a Corpus and then to a
# DocumentTermMatrix. When transforming the data to a DocumentTermMatrix, we
# apply the following steps: terms are stemmed, and stop words, punctuation,
# numbers, and terms of length less than 3 are removed (via the control argument).
library(tm)
## Loading required package: NLP
library(SnowballC) # provides the Porter stemmer used by stemming = TRUE
# Build a corpus from the abstracts and convert it to a document-term matrix.
myCorpus <- Corpus(VectorSource(JSS_papers[, "description"]))
# NOTE: current versions of tm silently ignore the old `minWordLength` option;
# the supported control option for dropping terms shorter than 3 characters is
# `wordLengths = c(3, Inf)` (which is also tm's default, so the dimensions
# reported below are unchanged).
JSS_dtm <- DocumentTermMatrix(myCorpus,
  control = list(stemming = TRUE, stopwords = TRUE,
                 wordLengths = c(3, Inf),
                 removeNumbers = TRUE, removePunctuation = TRUE))
dim(JSS_dtm)
## [1] 754 4646
# Inspect the (stemmed) vocabulary of the document-term matrix.
words <- colnames(JSS_dtm)
head(words, 20)
## [1] "aalen" "abadi" "abbrevi" "aberr" "abil"
## [6] "abl" "absenc" "absent" "absolut" "absorb"
## [11] "abstract" "abund" "academ" "acceler" "accept"
## [16] "access" "accommod" "accompani" "accomplish" "accord"
# A single term's column: sparse, with nonzero counts in only a few abstracts.
JSS_dtm[, 1]
## <<DocumentTermMatrix (documents: 754, terms: 1)>>
## Non-/sparse entries: 3/751
## Sparsity : 100%
## Maximal term length: 5
## Weighting : term frequency (tf)
# Goal: remove terms with low frequencies as well as those occurring in many
# documents. We only include terms whose tf-idf values are at least 0.1, which
# is a bit more than the median and ensures that the very frequent terms are
# omitted.
library("slam")
summary(col_sums(JSS_dtm))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 1.0 2.0 12.8 7.0 1420.0
# tf-idf per term: mean document-normalized term frequency times the
# log inverse document frequency (computed on the sparse triplet slots).
rel_tf <- JSS_dtm$v / row_sums(JSS_dtm)[JSS_dtm$i]
mean_tf <- tapply(rel_tf, JSS_dtm$j, mean)
idf <- log2(nDocs(JSS_dtm) / col_sums(JSS_dtm > 0))
term_tfidf <- mean_tf * idf
summary(term_tfidf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0123 0.0803 0.1060 0.1410 0.1590 1.0400
# Keep only informative terms, then drop any documents left with no terms.
JSS_dtm <- JSS_dtm[, term_tfidf >= 0.1]
JSS_dtm <- JSS_dtm[row_sums(JSS_dtm) > 0, ]
summary(col_sums(JSS_dtm))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 2.00 4.15 4.00 176.00
dim(JSS_dtm)
## [1] 753 2551
# Re-inspect the vocabulary after tf-idf pruning.
words <- colnames(JSS_dtm)
head(words, 20)
## [1] "aalen" "abadi" "aberr" "absenc" "absorb"
## [6] "abund" "academ" "accept" "accompani" "accuraci"
## [11] "accustom" "acidif" "acrobat" "action" "actor"
## [16] "actuar" "actuari" "acycl" "ada" "adaboost"
# The output is an LDA object, from which we obtain the posterior topic
# distribution per document and the posterior term distribution per topic.
library("topicmodels")
# Fit a 30-topic LDA model by variational EM.
k <- 30
# VEM starts from a random initialization; fixing the seed makes the fit
# reproducible. (The transcript output below was produced without a fixed
# seed, so the exact topics obtained may differ.)
VEM <- LDA(JSS_dtm, k = k, method = "VEM", control = list(seed = 2010))
# a mixing weight vector (over 30 topics) for each document
dim(posterior(VEM)$topics)
## [1] 753 30
# a term frequency vector (over all words) for each of the 30 topic
dim(posterior(VEM)$terms)
## [1] 30 2551
# 5 most frequent terms for each topic
freq.Terms <- terms(VEM, 5)
dim(freq.Terms)
## [1] 5 30
# Display the 5 highest-probability (stemmed) terms for each of the 30 topics.
freq.Terms
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
## [1,] "imput" "surviv" "chart" "file" "treatment" "fmri"
## [2,] "latent" "gene" "decis" "attribut" "sequenc" "meta"
## [3,] "posit" "microarray" "tree" "shrinkag" "stage" "multilevel"
## [4,] "roc" "census" "polit" "record" "mine" "captur"
## [5,] "mida" "project" "bar" "toolbox" "trial" "jag"
## Topic 7 Topic 8 Topic 9 Topic 10 Topic 11 Topic 12 Topic 13
## [1,] "panel" "anim" "stat" "cluster" "quantil" "poisson" "kernel"
## [2,] "frame" "dose" "lisp" "equat" "circular" "zero" "ensembl"
## [3,] "tree" "phase" "java" "ordin" "exposur" "inflat" "aspect"
## [4,] "map" "trial" "web" "score" "forecast" "binomi" "ssa"
## [5,] "micromap" "poli" "side" "partit" "pomp" "regular" "boost"
## Topic 14 Topic 15 Topic 16 Topic 17 Topic 18 Topic 19
## [1,] "stationari" "network" "item" "arima" "ecolog" "pattern"
## [2,] "winbug" "graph" "irt" "fuzzi" "subset" "transit"
## [3,] "map" "ergm" "rasch" "sunflow" "speci" "econometr"
## [4,] "mathemat" "edg" "person" "season" "autoregress" "volum"
## [5,] "conting" "graphlet" "score" "color" "monitor" "fast"
## Topic 20 Topic 21 Topic 22 Topic 23 Topic 24 Topic 25
## [1,] "imag" "event" "index" "survey" "gui" "genet"
## [2,] "project" "rank" "diffus" "threshold" "inequ" "map"
## [3,] "ratio" "longitudin" "dirichlet" "wavelet" "constrain" "match"
## [4,] "gee" "surviv" "beta" "plan" "bay" "qtl"
## [5,] "odd" "hazard" "sampler" "vista" "jan" "analogu"
## Topic 26 Topic 27 Topic 28 Topic 29 Topic 30
## [1,] "copula" "censor" "genet" "formula" "matric"
## [2,] "network" "chang" "author" "tempor" "distanc"
## [3,] "socr" "contamin" "haplotyp" "accuraci" "mantel"
## [4,] "eda" "permut" "mokken" "cumul" "molecular"
## [5,] "clickstream" "impur" "tgui" "spatio" "posit"
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Posterior term distribution for each topic (rows = topics, cols = terms).
topic_terms <- posterior(VEM)$terms
# Wordcloud for all topics: aggregate each term's weight across all topics.
v <- sort(colSums(topic_terms), decreasing = TRUE)
myNames <- names(v)
d <- data.frame(word = myNames, freq = v)
wordcloud(words = d$word, freq = d$freq, max.words = 50, scale = c(4, 0.2),
          random.order = FALSE, rot.per = 0.25,
          colors = brewer.pal(8, "Dark2"))
# Wordcloud for individual topics: one panel per selected topic.
topic.id <- c(4, 6, 8, 12, 13, 27)
# Save the current graphics state so the 2x3 layout does not leak into
# later plots; par() returns the previous settings.
op <- par(mfrow = c(2, 3))
# seq_along avoids hard-coding the panel count to match length(topic.id).
for (i in seq_along(topic.id)) {
  v <- sort(topic_terms[topic.id[i], ], decreasing = TRUE)
  myNames <- names(v)
  d <- data.frame(word = myNames, freq = v)
  wordcloud(d$word, d$freq, max.words = 30, scale = c(4, 0.3),
            random.order = FALSE, rot.per = 0.25,
            colors = brewer.pal(8, "Dark2"))
}
par(op) # restore the previous graphics layout