if (!require("pacman")) install.packages("pacman")
pacman::p_load(caret, e1071, kernlab, ggplot2, ISLR, Hmisc,
gridExtra, RANN, AppliedPredictiveModeling, rattle, magrittr,
rpart.plot, ElemStatLearn, party, randomForest, gbm,
MASS, klaR, dplyr, pgmm)
Trying to predict (classify) the species of plants using classification trees. Which features do I have?
library(ggplot2); data(iris)
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
Species is the outcome variable.
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
50 samples of each.
inTrain <- createDataPartition(y=iris$Species, p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
c(dim(training),'|',dim(testing))
## [1] "105" "5" "|" "45" "5"
qplot(Petal.Width, Sepal.Width, colour=Species, data=training)
There are three very distinct clusters.
We will use train with method='rpart', the caret method for classification with trees. There are other tree-building options in R outside the caret package, such as party and tree.
library(caret)
modFit <- train(Species ~ ., method='rpart', data=training)
print(modFit$finalModel)
## n= 105
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.6 35 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.6 70 35 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Length< 4.75 32 0 versicolor (0.00000000 1.00000000 0.00000000) *
## 7) Petal.Length>=4.75 38 3 virginica (0.00000000 0.07894737 0.92105263) *
You can read the splits to understand what the classification algorithm is doing.
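For comparison, here is a minimal sketch of growing a similar tree directly with rpart, outside caret (no resampling or tuning is done here, so the splits may differ slightly from the train fit above):
library(rpart)
# fit a classification tree on the same training data
rpartFit <- rpart(Species ~ ., data=training, method='class')
print(rpartFit)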
You can also plot the dendrogram of the classification tree:
plot(modFit$finalModel, uniform=TRUE, main='classification tree')
text(modFit$finalModel, use.n=TRUE, all=TRUE, cex=.8)
Or a prettier plot with rattle:
library(rattle)
fancyRpartPlot(modFit$finalModel)
You can predict new values with the usual predict. Note the argument name, though: predict.train expects newdata (all lowercase), so the misspelled newData below is silently ignored and what follows are actually the 105 fitted values for the training set:
predict(modFit, newData=testing)
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa setosa setosa setosa
## [19] setosa setosa setosa setosa setosa setosa
## [25] setosa setosa setosa setosa setosa setosa
## [31] setosa setosa setosa setosa setosa versicolor
## [37] versicolor virginica versicolor versicolor versicolor versicolor
## [43] versicolor versicolor versicolor versicolor versicolor versicolor
## [49] versicolor versicolor versicolor versicolor versicolor virginica
## [55] virginica versicolor versicolor versicolor versicolor versicolor
## [61] versicolor versicolor versicolor versicolor versicolor versicolor
## [67] versicolor versicolor versicolor versicolor virginica virginica
## [73] virginica virginica virginica virginica virginica virginica
## [79] virginica virginica virginica virginica virginica virginica
## [85] virginica virginica virginica virginica virginica virginica
## [91] virginica virginica virginica virginica virginica virginica
## [97] virginica virginica virginica virginica virginica virginica
## [103] virginica virginica virginica
## Levels: setosa versicolor virginica
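With the argument spelled correctly, the predictions are made for the 45 testing rows, and caret's confusionMatrix summarizes how well they match the true species. A quick sketch (output not shown):
predTest <- predict(modFit, newdata=testing)   # 45 predictions, one per testing row
confusionMatrix(predTest, testing$Species)     # accuracy plus per-class sensitivity/specificity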
library(caret)
library(AppliedPredictiveModeling)
data(segmentationOriginal)
Subsetting into training and testing:
library(dplyr)
set.seed(125)
training <- filter(segmentationOriginal, Case == 'Train')
testing <- filter(segmentationOriginal, Case == 'Test')
c(dim(training),'|',dim(testing))
## [1] "1009" "119" "|" "1010" "119"
Fitting a model
set.seed(125)
modFit <- train(Class ~ ., method='rpart', data=training)
print(modFit$finalModel)
## n= 1009
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1009 373 PS (0.63032706 0.36967294)
## 2) TotalIntenCh2< 45323.5 454 34 PS (0.92511013 0.07488987) *
## 3) TotalIntenCh2>=45323.5 555 216 WS (0.38918919 0.61081081)
## 6) FiberWidthCh1< 9.673245 154 47 PS (0.69480519 0.30519481) *
## 7) FiberWidthCh1>=9.673245 401 109 WS (0.27182045 0.72817955) *
Visualizing the tree
library(rattle)
fancyRpartPlot(modFit$finalModel)
Predicting for specific values:
You can determine visually, on the tree plotted above, which Class each case will fall into.
Reproducing those values with a computational, non-visual approach is harder, because R's handling of NA values in prediction algorithms is awkward:
library(dplyr)
emptyDf <- subset(testing, FALSE) # empty df with all columns
df <- data.frame(TotalIntench2=c(23000,50000,57000,NA),
FiberWidthCh1=c(10,10,8,8),
PerimStatusCh1=c(2,NA,NA,2),
VarIntenCh4=c(NA,100,100,100))
df2 <- rbind_list(emptyDf, df) # rbind_list is deprecated; bind_rows is the modern equivalent
predict(modFit, newdata=df2, na.action=na.omit)
## factor(0)
## Levels: PS WS
With na.omit, every row containing an NA is dropped, so all four cases are discarded and no predictions are returned. And the alternative:
predict(modFit, newdata=df2, na.action=na.pass)
## [1] WS WS PS PS
## Levels: PS WS
Produces predictions, but they are hard to trust: the data frame above misspells the column as TotalIntench2 (lowercase c), so the model's main split variable TotalIntenCh2 is NA in every row, and na.pass simply pushes those NAs through the tree. The two approaches give different results. A proper solution would involve imputing the missing values, a technique I have not fully mastered.
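One way to reproduce the visual reading computationally is to apply the printed split rules directly. A minimal sketch, hard-coding the split points from the model output above and using only the two variables the tree actually splits on:
cases <- data.frame(TotalIntenCh2=c(23000,50000,57000,NA),
                    FiberWidthCh1=c(10,10,8,8))
# follow the tree by hand: root split on TotalIntenCh2, then FiberWidthCh1
with(cases, ifelse(TotalIntenCh2 < 45323.5, 'PS',
            ifelse(FiberWidthCh1 < 9.673245, 'PS', 'WS')))
The fourth case comes out NA, because its TotalIntenCh2 is missing and the root split cannot be evaluated.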
library(pgmm)
data(olive)
olive = olive[,-1]
summary(olive)
## Area Palmitic Palmitoleic Stearic
## Min. :1.0 Min. : 610 Min. : 15.00 Min. :152.0
## 1st Qu.:3.0 1st Qu.:1095 1st Qu.: 87.75 1st Qu.:205.0
## Median :3.0 Median :1201 Median :110.00 Median :223.0
## Mean :4.6 Mean :1232 Mean :126.09 Mean :228.9
## 3rd Qu.:7.0 3rd Qu.:1360 3rd Qu.:169.25 3rd Qu.:249.0
## Max. :9.0 Max. :1753 Max. :280.00 Max. :375.0
## Oleic Linoleic Linolenic Arachidic
## Min. :6300 Min. : 448.0 Min. : 0.00 Min. : 0.0
## 1st Qu.:7000 1st Qu.: 770.8 1st Qu.:26.00 1st Qu.: 50.0
## Median :7302 Median :1030.0 Median :33.00 Median : 61.0
## Mean :7312 Mean : 980.5 Mean :31.89 Mean : 58.1
## 3rd Qu.:7680 3rd Qu.:1180.8 3rd Qu.:40.25 3rd Qu.: 70.0
## Max. :8410 Max. :1470.0 Max. :74.00 Max. :105.0
## Eicosenoic
## Min. : 1.00
## 1st Qu.: 2.00
## Median :17.00
## Mean :16.28
## 3rd Qu.:28.00
## Max. :58.00
Fitting…
set.seed(125)
modFit <- train(Area ~ ., method='rpart', data=olive)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
print(modFit$finalModel)
## n= 572
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 572 3171.32000 4.599650
## 2) Eicosenoic>=6.5 323 176.82970 2.783282 *
## 3) Eicosenoic< 6.5 249 546.51410 6.955823
## 6) Linoleic>=1053.5 98 21.88776 5.336735 *
## 7) Linoleic< 1053.5 151 100.99340 8.006623 *
Visualizing…
fancyRpartPlot(modFit$finalModel)
Predicting…
predict(modFit, newdata=as.data.frame(t(colMeans(olive))))
## 1
## 2.783282
This result is strange because Area is really a categorical label, but since it is stored as a numeric variable the tree performs regression and reports the average value of Area in the leaf that newdata falls into.
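A minimal sketch of what treating Area as a categorical outcome might look like (this refit is not shown above, so its splits and predictions will differ from the regression tree):
olive2 <- olive
olive2$Area <- as.factor(olive2$Area)   # make Area a factor so rpart classifies instead of regressing
modFit2 <- train(Area ~ ., method='rpart', data=olive2)
predict(modFit2, newdata=as.data.frame(t(colMeans(olive[,-1]))))  # now returns an Area class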
Bagging is short for "bootstrap aggregating". Example: the ozone data.
library(ElemStatLearn); data(ozone)
ozone <- ozone[order(ozone$ozone),]
head(ozone)
## ozone radiation temperature wind
## 17 1 8 59 9.7
## 19 4 25 61 9.7
## 14 6 78 57 18.4
## 45 7 48 80 14.3
## 106 7 49 69 10.3
## 7 8 19 61 20.1
We will try to predict temperature as a function of ozone.
How to calculate a bagged LOESS curve, a curve that captures the variability of past measurements:
ll <- matrix(NA, nrow=10, ncol=155)
for (i in 1:10) {
ss <- sample(1:dim(ozone)[1], replace=TRUE) # sample with replacement, the entire dataset
ozone0 <- ozone[ss,]
ozone0 <- ozone0[order(ozone0$ozone),] # ozone0 is the bootstrap of ozone
loess0 <- loess(temperature ~ ozone, data=ozone0, span=0.2) # smoothing curve
ll[i,] <- predict(loess0, newdata=data.frame(ozone=1:155)) # i-th row is the prediction
}
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
Some other bagging methods are available directly in the train function: bagEarth, treebag, bagFDA. Or you can use the bag function.
What does it look like?
plot(ozone$ozone, ozone$temperature, pch=19, cex=0.5)
for (i in 1:10) {
lines(1:155, ll[i,], col='grey', lwd=2)
}
lines(1:155, apply(ll, 2, mean), col='red', lwd=2)
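caret also ships ready-made bagging wrappers as train methods (the bagEarth, treebag and bagFDA methods mentioned above). A minimal sketch with bagged trees, assuming the ipred package is installed:
# bagged CART trees of temperature on ozone via caret's built-in 'treebag' method
bagFit <- train(temperature ~ ozone, data=ozone, method='treebag')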
You can do your own bagging in caret:
predictors <- data.frame(ozone=ozone$ozone)
temperature <- ozone$temperature
treebag <- bag(predictors, temperature, B=10,
bagControl=bagControl(fit=ctreeBag$fit,
predict=ctreeBag$pred,
aggregate=ctreeBag$aggregate))
## Warning: executing %dopar% sequentially: no parallel backend registered
How well does it capture the trend in the data points?
plot(ozone$ozone, temperature, col='lightgrey',pch=19)
points(ozone$ozone, predict(treebag$fits[[1]]$fit, predictors), pch=19, col='red')
points(ozone$ozone, predict(treebag, predictors), pch=19, col='blue') # custom bagging
ctreeBag$fit
## function (x, y, ...)
## {
## loadNamespace("party")
## data <- as.data.frame(x)
## data$y <- y
## party::ctree(y ~ ., data = data)
## }
## <environment: namespace:caret>
ctreeBag$pred
## function (object, x)
## {
## if (!is.data.frame(x))
## x <- as.data.frame(x)
## obsLevels <- levels(object@data@get("response")[, 1])
## if (!is.null(obsLevels)) {
## rawProbs <- party::treeresponse(object, x)
## probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels),
## byrow = TRUE)
## out <- data.frame(probMatrix)
## colnames(out) <- obsLevels
## rownames(out) <- NULL
## }
## else out <- unlist(party::treeresponse(object, x))
## out
## }
## <environment: namespace:caret>
ctreeBag$aggregate
## function (x, type = "class")
## {
## if (is.matrix(x[[1]]) | is.data.frame(x[[1]])) {
## pooled <- x[[1]] & NA
## classes <- colnames(pooled)
## for (i in 1:ncol(pooled)) {
## tmp <- lapply(x, function(y, col) y[, col], col = i)
## tmp <- do.call("rbind", tmp)
## pooled[, i] <- apply(tmp, 2, median)
## }
## if (type == "class") {
## out <- factor(classes[apply(pooled, 1, which.max)],
## levels = classes)
## }
## else out <- as.data.frame(pooled)
## }
## else {
## x <- matrix(unlist(x), ncol = length(x))
## out <- apply(x, 1, median)
## }
## out
## }
## <environment: namespace:caret>
Random forests average the predictions of many trees, each grown on a different bootstrap sample and therefore taking different paths through the data. Example: the iris data.
set.seed(123)
data(iris); library(ggplot2); library(caret)
inTrain <- createDataPartition(y=iris$Species, p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
Training using random forests, method='rf': the outcome is Species and the predictors are all the other features.
modFit <- train(Species ~ ., data=training, method='rf', prox=TRUE)
modFit
## Random Forest
##
## 105 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9421392 0.9119782
## 3 0.9440956 0.9149427
## 4 0.9430724 0.9134138
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
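You can also plot how accuracy varied across the candidate mtry values (a quick sketch, plot not shown):
plot(modFit)   # resampled accuracy versus mtry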
You can inspect any individual tree, e.g.:
getTree(modFit$finalModel, k=2) # second tree
## left daughter right daughter split var split point status prediction
## 1 2 3 3 2.45 1 0
## 2 0 0 0 0.00 -1 1
## 3 4 5 4 1.70 1 0
## 4 6 7 3 5.35 1 0
## 5 0 0 0 0.00 -1 3
## 6 0 0 0 0.00 -1 2
## 7 0 0 0 0.00 -1 3
You can also look at the “class centers” (cluster centers), obtained with classCenter:
irisP <- as.data.frame(classCenter(training[,c(3,4)], training$Species, modFit$finalModel$prox))
irisP$Species <- rownames(irisP)
qplot(Petal.Width, Petal.Length, col=Species, data=training) +
geom_point(aes(x=Petal.Width, y=Petal.Length, col=Species), size=5, shape=4, data=irisP) # plot X's
You can predict new values with predict:
pred <- predict(modFit, testing)
And check for correct predictions:
testing$predRight <- pred==testing$Species
table(pred, testing$Species)
##
## pred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 13 1
## virginica 0 2 14
We can see a few misclassified values; let’s check where they are:
qplot(Petal.Width, Petal.Length, colour=predRight, data=testing, main='data predictions')
You can see the misclassified values are right on the border of one or more clusters.
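As a quick numeric summary of the plot above, the proportion of correct predictions on the testing set:
mean(testing$predRight)   # 42 of 45 correct in the table above, about 0.93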
library(ElemStatLearn)
data(vowel.train)
data(vowel.test)
vowel.train$y <- as.factor(vowel.train$y)
vowel.test$y <- as.factor(vowel.test$y)
set.seed(33833)
modFit <- train(y ~ ., data=vowel.train, method='rf')
varImp(modFit, data=vowel.train)
## rf variable importance
##
## Overall
## x.1 100.0000
## x.2 98.5960
## x.5 44.3214
## x.6 28.3926
## x.8 14.9865
## x.9 6.7483
## x.3 4.8055
## x.4 4.5061
## x.7 0.5042
## x.10 0.0000
set.seed(33833)
modFit <- randomForest(y ~ ., data=vowel.train)
varImp(modFit)
## Overall
## x.1 89.12864
## x.2 91.24009
## x.3 33.08111
## x.4 34.24433
## x.5 50.25539
## x.6 43.33148
## x.7 31.88132
## x.8 42.92470
## x.9 33.37031
## x.10 29.59956
order(varImp(modFit), decreasing = TRUE)
## [1] 2 1 5 6 8 4 9 3 7 10
Using boosting in our Wage example
set.seed(123)
library(ggplot2); library(caret); library(ISLR); data(wage)
## Warning in data(wage): data set 'wage' not found
(The warning appears because the dataset is actually called Wage, with a capital W; ISLR lazy-loads its datasets, so the Wage object is available anyway.)
Wage <- subset(Wage, select=-c(logwage))
inTrain <- createDataPartition(y=Wage$wage, p=0.7, list=FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
Fitting the model with gbm, “boosting with trees”. The result has to be cached because this is really slow:
modFit <- train(wage ~ ., method='gbm', data=training, verbose=FALSE)
print(modFit)
## Stochastic Gradient Boosting
##
## 2102 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared
## 1 50 34.53147 0.3124394
## 1 100 33.94784 0.3200580
## 1 150 33.88889 0.3198883
## 2 50 33.97457 0.3201718
## 2 100 33.88419 0.3202670
## 2 150 33.95668 0.3177865
## 3 50 33.88911 0.3207441
## 3 100 33.99469 0.3163406
## 3 150 34.24555 0.3083372
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 100,
## interaction.depth = 2, shrinkage = 0.1 and n.minobsinnode = 10.
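Since the default bootstrap resampling (25 reps) is most of what makes this slow, a lighter resampling scheme can be swapped in. A rough sketch (the number of folds is arbitrary):
fitControl <- trainControl(method='cv', number=3)  # 3-fold cross-validation instead of 25 bootstraps
modFitCV <- train(wage ~ ., method='gbm', data=training,
                  trControl=fitControl, verbose=FALSE)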
How did it do? Let’s plot the predicted wage against the actual wage in the testing set. Ideally the points would all fall on a straight line.
qplot(predict(modFit, testing), wage, data=testing)
Not too bad: there is a lot of variability and there are outliers, but there is a clear correlation.
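A rough numeric counterpart to the plot is the RMSE of the predictions on the testing set (value not shown; it will vary with the data split):
sqrt(mean((predict(modFit, testing) - testing$wage)^2))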
set.seed(123)
data(iris); library(ggplot2); library(caret)
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
inTrain <- createDataPartition(y=iris$Species, p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
c(dim(training), dim(testing))
## [1] 105 5 45 5
Training with LDA classification, method='lda':
modlda <- train(Species ~ ., data=training, method='lda')
plda <- predict(modlda, testing)
Training with naive Bayes classification, method='nb':
modnb <- train(Species ~ ., data=training, method='nb')
pnb <- predict(modnb, testing)
How well do the two methods agree?
table(plda, pnb)
## pnb
## plda setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 13 1
## virginica 0 1 15
The two methods disagree on two instances. Where are they?
equalPredictions <- (plda == pnb)
qplot(Petal.Width, Sepal.Width, col=equalPredictions, data=testing)
These are hard cases that either lie in between clusters or are outliers, which is why the two methods disagree on them.
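To see which model is actually closer to the truth, compare each one against the true labels (a quick check, output not shown):
table(plda, testing$Species)
table(pnb, testing$Species)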