Stat542: Random Forest

Load packages.

library(randomForest);
library("ElemStatLearn")

Prepare the spam data: predict Y based on 57 features.

spam.name=read.table("spam_name.txt", as.is=TRUE);
for(i in 1:57) names(spam)[i]=as.character(spam.name[i,]);
names(spam)[58]="Y";
spam$Y=as.factor(spam$Y);

Split the data into training and test.

load("spam_ID.Rdata");
spam.training = spam[ train.id, ];
spam.test = spam[ test.id, ];
p = dim(spam)[2] - 1;

Fit a random forest

## the default value for mtry is sqrt(p) for classification
## mtry = sqrt(p) = sqrt(57) = 7.54

rfModel = randomForest(Y~., data = spam.training,
                       importance = T, mtry = 7, ntree=400 ); 
names(rfModel)

##  [1] "call"            "type"            "predicted"      
##  [4] "err.rate"        "confusion"       "votes"          
##  [7] "oob.times"       "classes"         "importance"     
## [10] "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"         
## [16] "y"               "test"            "inbag"          
## [19] "terms"

Test Error

yhat.test = predict(rfModel, spam.test)
table(spam.test$Y,  yhat.test);

##    yhat.test
##       0   1
##   0 231  11
##   1  23 135

Two Training Errors

We can evaluate the training error by obtaining the prediction on the training set (same as the regular training error, i.e., may underestimate the real classification error). But randomForest provides an estimate of the training error based on OOB samples, which is similar to CV errors, i.e., an unbiased estimate of the real classification error.

yhat.train = predict(rfModel, spam.training )
table(spam.training$Y,  yhat.train);

##    yhat.train
##       0   1
##   0 364   0
##   1   3 233

table(rfModel$predicted, spam.training$Y)

##    
##       0   1
##   0 351  22
##   1  13 214

rfModel$confusion  # calculated based oob

##     0   1 class.error
## 0 351  13     0.03571
## 1  22 214     0.09322

The plot function for randomForest

tmp=rfModel$err.rate
tmp[c(1:5,397:400),]

##           OOB       0       1
##  [1,] 0.13248 0.07914 0.21053
##  [2,] 0.13687 0.10185 0.19014
##  [3,] 0.14318 0.12121 0.17614
##  [4,] 0.15182 0.12458 0.19289
##  [5,] 0.13594 0.12654 0.15023
##  [6,] 0.06000 0.03846 0.09322
##  [7,] 0.06000 0.03846 0.09322
##  [8,] 0.05833 0.03571 0.09322
##  [9,] 0.05833 0.03571 0.09322

par(mfrow=c(1,2))
plot(rfModel)
plot(c(0, rfModel$ntree), range(tmp), type="n",
     xlab = "Number of trees", ylab="Error")
lines(tmp[,1], col="black")
lines(tmp[,2], col="red")
lines(tmp[,3], col="green")

Variable importance.

sortedImpt = sort(importance(rfModel, scale = F)[,3], decreasing = T );
sortedImpt_scaled = sort(importance(rfModel, scale = T)[,3], decreasing = T )
sortedImp_gini = sort(rfModel$importance[,4], decreasing=T);

barplot(sortedImpt_scaled,  horiz=TRUE, col="blue", space=.5, names.arg=substr(names(sortedImpt_scaled), 2, 5), cex.names=0.5)
title("importance measured by accuracy - scaled");

plot(sortedImp_gini, ylim=c(-1, max(sortedImp_gini)+1), type="h");
text(1:p, sortedImp_gini, substr(names(sortedImp_gini), 2, 5), col=3, cex=0.5);
title("importance measured by gini");
grid(20);