library(randomForest);
library("ElemStatLearn")
spam.name=read.table("spam_name.txt", as.is=TRUE);
for(i in 1:57) names(spam)[i]=as.character(spam.name[i,]);
names(spam)[58]="Y";
spam$Y=as.factor(spam$Y);
load("spam_ID.Rdata");
spam.training = spam[ train.id, ];
spam.test = spam[ test.id, ];
p = dim(spam)[2] - 1;
## the default value for mtry is sqrt(p) for classification
## mtry = sqrt(p) = sqrt(57) = 7.54
rfModel = randomForest(Y~., data = spam.training,
importance = T, mtry = 7, ntree=400 );
names(rfModel)
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
yhat.test = predict(rfModel, spam.test)
table(spam.test$Y, yhat.test);
## yhat.test
## 0 1
## 0 231 11
## 1 23 135
We can evaluate the training error by obtaining the prediction on the training set (same as the regular training error, i.e., may underestimate the real classification error). But randomForest provides an estimate of the training error based on OOB samples, which is similar to CV errors, i.e., an unbiased estimate of the real classification error.
yhat.train = predict(rfModel, spam.training )
table(spam.training$Y, yhat.train);
## yhat.train
## 0 1
## 0 364 0
## 1 3 233
table(rfModel$predicted, spam.training$Y)
##
## 0 1
## 0 351 22
## 1 13 214
rfModel$confusion # calculated based oob
## 0 1 class.error
## 0 351 13 0.03571
## 1 22 214 0.09322
tmp=rfModel$err.rate
tmp[c(1:5,397:400),]
## OOB 0 1
## [1,] 0.13248 0.07914 0.21053
## [2,] 0.13687 0.10185 0.19014
## [3,] 0.14318 0.12121 0.17614
## [4,] 0.15182 0.12458 0.19289
## [5,] 0.13594 0.12654 0.15023
## [6,] 0.06000 0.03846 0.09322
## [7,] 0.06000 0.03846 0.09322
## [8,] 0.05833 0.03571 0.09322
## [9,] 0.05833 0.03571 0.09322
par(mfrow=c(1,2))
plot(rfModel)
plot(c(0, rfModel$ntree), range(tmp), type="n",
xlab = "Number of trees", ylab="Error")
lines(tmp[,1], col="black")
lines(tmp[,2], col="red")
lines(tmp[,3], col="green")
sortedImpt = sort(importance(rfModel, scale = F)[,3], decreasing = T );
sortedImpt_scaled = sort(importance(rfModel, scale = T)[,3], decreasing = T )
sortedImp_gini = sort(rfModel$importance[,4], decreasing=T);
barplot(sortedImpt_scaled, horiz=TRUE, col="blue", space=.5, names.arg=substr(names(sortedImpt_scaled), 2, 5), cex.names=0.5)
title("importance measured by accuracy - scaled");
plot(sortedImp_gini, ylim=c(-1, max(sortedImp_gini)+1), type="h");
text(1:p, sortedImp_gini, substr(names(sortedImp_gini), 2, 5), col=3, cex=0.5);
title("importance measured by gini");
grid(20);