Introduction
Step 0: Load Packages & Specify Directories
Step 1: Load and Process Data
Step 2: Feature Selection
Step 3: Implement Algorithm
Step 4: Evaluation
This notebook will implement and examine different classification methods on the Breast Cancer Wisconsin (Diagnostic) Data Set to classify whether the breast cancer is malignant or benign based on the features computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.
# Packages required by this analysis
packages.used <- c("corrplot", "caret", "randomForest", "e1071", "dplyr", "gbm")
# Identify packages that still need to be installed.
# setdiff() alone suffices here -- the original intersect() step was redundant,
# since setdiff(A, B) already ignores elements of B not in A.
packages.needed <- setdiff(packages.used, rownames(installed.packages()))
# Install any missing packages (use https for the CRAN mirror)
if (length(packages.needed) > 0) {
  install.packages(packages.needed, dependencies = TRUE,
                   repos = "https://cran.us.r-project.org")
}
# Load libraries
library("corrplot")
library("caret")
library("randomForest")
library("e1071")
library("dplyr")
library("gbm")
# Set working directory to the doc folder
# NOTE(review): setwd() with a user-specific absolute path is fragile -- it
# assumes one particular local checkout. Consider running the notebook from
# the doc/ folder instead, or using a project-root helper. Left unchanged
# because the relative path below depends on it.
setwd("~/GitHub/Spring2018-Project5-grp_2/doc")
# Load data; stringsAsFactors = FALSE keeps the diagnosis column as character
df <- read.csv("../data/data.csv", header = TRUE, stringsAsFactors = FALSE)
# Print the first rows for a quick sanity check
head(df)
The dataset includes 569 observations of 33 variables described as below.
(1) ID number
(2) Diagnosis (M = malignant, B = benign)
(3)-(32) Ten real-valued features described as follows are computed for each cell nucleus:
a) radius (mean of distances from center to points on the perimeter)
b) texture (standard deviation of gray-scale values)
c) perimeter
d) area
e) smoothness (local variation in radius lengths)
f) compactness (perimeter^2 / area - 1.0)
g) concavity (severity of concave portions of the contour)
h) concave points (number of concave portions of the contour)
i) symmetry
j) fractal dimension (“coastline approximation” - 1)
The mean, standard error and “worst” or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features. For instance, field 3 is Mean Radius, field 13 is Radius SE, field 23 is Worst Radius.
(33) All entries are NA’s
# Structure of the dataset: 569 observations of 33 variables
# (id, diagnosis, 30 numeric features, and an all-NA column X)
str(df)
'data.frame': 569 obs. of 33 variables:
$ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
$ diagnosis : chr "M" "M" "M" "M" ...
$ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
$ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
$ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
$ area_mean : num 1001 1326 1203 386 1297 ...
$ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
$ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
$ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
$ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
$ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
$ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
$ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
$ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
$ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
$ area_se : num 153.4 74.1 94 27.2 94.4 ...
$ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
$ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
$ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
$ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
$ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
$ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
$ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
$ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
$ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
$ area_worst : num 2019 1956 1709 568 1575 ...
$ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
$ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
$ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
$ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
$ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
$ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
$ X : logi NA NA NA NA NA NA ...
# Five-number summary plus mean for every variable.
# Note from the output: column X is logical with 569 NA's -- it is dropped
# during cleaning below.
summary(df)
id diagnosis radius_mean texture_mean perimeter_mean
Min. : 8670 Length:569 Min. : 6.981 Min. : 9.71 Min. : 43.79
1st Qu.: 869218 Class :character 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
Median : 906024 Mode :character Median :13.370 Median :18.84 Median : 86.24
Mean : 30371831 Mean :14.127 Mean :19.29 Mean : 91.97
3rd Qu.: 8813129 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
Max. :911320502 Max. :28.110 Max. :39.28 Max. :188.50
area_mean smoothness_mean compactness_mean concavity_mean concave.points_mean
Min. : 143.5 Min. :0.05263 Min. :0.01938 Min. :0.00000 Min. :0.00000
1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956 1st Qu.:0.02031
Median : 551.1 Median :0.09587 Median :0.09263 Median :0.06154 Median :0.03350
Mean : 654.9 Mean :0.09636 Mean :0.10434 Mean :0.08880 Mean :0.04892
3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070 3rd Qu.:0.07400
Max. :2501.0 Max. :0.16340 Max. :0.34540 Max. :0.42680 Max. :0.20120
symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
Min. :0.1060 Min. :0.04996 Min. :0.1115 Min. :0.3602 Min. : 0.757
1st Qu.:0.1619 1st Qu.:0.05770 1st Qu.:0.2324 1st Qu.:0.8339 1st Qu.: 1.606
Median :0.1792 Median :0.06154 Median :0.3242 Median :1.1080 Median : 2.287
Mean :0.1812 Mean :0.06280 Mean :0.4052 Mean :1.2169 Mean : 2.866
3rd Qu.:0.1957 3rd Qu.:0.06612 3rd Qu.:0.4789 3rd Qu.:1.4740 3rd Qu.: 3.357
Max. :0.3040 Max. :0.09744 Max. :2.8730 Max. :4.8850 Max. :21.980
area_se smoothness_se compactness_se concavity_se concave.points_se
Min. : 6.802 Min. :0.001713 Min. :0.002252 Min. :0.00000 Min. :0.000000
1st Qu.: 17.850 1st Qu.:0.005169 1st Qu.:0.013080 1st Qu.:0.01509 1st Qu.:0.007638
Median : 24.530 Median :0.006380 Median :0.020450 Median :0.02589 Median :0.010930
Mean : 40.337 Mean :0.007041 Mean :0.025478 Mean :0.03189 Mean :0.011796
3rd Qu.: 45.190 3rd Qu.:0.008146 3rd Qu.:0.032450 3rd Qu.:0.04205 3rd Qu.:0.014710
Max. :542.200 Max. :0.031130 Max. :0.135400 Max. :0.39600 Max. :0.052790
symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
Min. :0.007882 Min. :0.0008948 Min. : 7.93 Min. :12.02 Min. : 50.41
1st Qu.:0.015160 1st Qu.:0.0022480 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11
Median :0.018730 Median :0.0031870 Median :14.97 Median :25.41 Median : 97.66
Mean :0.020542 Mean :0.0037949 Mean :16.27 Mean :25.68 Mean :107.26
3rd Qu.:0.023480 3rd Qu.:0.0045580 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40
Max. :0.078950 Max. :0.0298400 Max. :36.04 Max. :49.54 Max. :251.20
area_worst smoothness_worst compactness_worst concavity_worst concave.points_worst
Min. : 185.2 Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
1st Qu.: 515.3 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493
Median : 686.5 Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993
Mean : 880.6 Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461
3rd Qu.:1084.0 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140
Max. :4254.0 Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100
symmetry_worst fractal_dimension_worst X
Min. :0.1565 Min. :0.05504 Mode:logical
1st Qu.:0.2504 1st Qu.:0.07146 NA's:569
Median :0.2822 Median :0.08004
Mean :0.2901 Mean :0.08395
3rd Qu.:0.3179 3rd Qu.:0.09208
Max. :0.6638 Max. :0.20750
We will clean up the dataset and then split the dataset into 80% train set and 20% test set.
# Drop the id column (not a predictor) and column X (all entries are NA,
# as shown in the summary above). Selecting by name is more robust than the
# original hard-coded positional indices -c(1, 33).
df <- df[, !(names(df) %in% c("id", "X"))]
# Encode diagnosis as an integer: M = 1 (malignant), B = 0 (benign).
# Explicit levels guarantee the 0/1 coding instead of relying on
# alphabetical factor ordering.
df$diagnosis <- as.integer(factor(df$diagnosis, levels = c("B", "M"))) - 1
# Split entire data into 80% train set and 20% test set
# (seed fixed so the split is reproducible)
set.seed(123)
index <- sample(seq_len(nrow(df)), 0.8 * nrow(df))
df.train <- df[index, ]
df.test <- df[-index, ]
# Check proportion of diagnosis (Benign/Malignant) in train/test sets
#prop.table(table(df.train$diagnosis))
#prop.table(table(df.test$diagnosis))
The 30 features are grouped into 14 groups based on their correlation, and one feature (the italicized feature) from each group is selected based on its importance. Then, using recursive feature elimination with 5-fold cross-validation, we have found that a subset of 11 features (the bolded) gives the best accuracy.
a) texture_mean, texture_worst
b) area_se, radius_se, perimeter_se
c) area_mean, radius_mean, perimeter_mean, area_worst, radius_worst, perimeter_worst
d) concave.points_worst, concavity_mean, concave.points_mean
e) compactness_mean, compactness_worst, concavity_worst
f) compactness_se, fractal_dimension_se
g) concavity_se, concave.points_se
h) texture_se
i) smoothness_se
j) smoothness_mean, smoothness_worst
k) fractal_dimension_mean, fractal_dimension_worst
l) symmetry_se
m) symmetry_mean
n) symmetry_worst
#library(corrplot)
# Compute and plot the correlation matrix of the 30 features
# (column 1 is the diagnosis label, so it is excluded).
# Fixed: the column count now references df.train itself rather than df --
# the original ncol(df) only worked because both have the same width.
corr_mat <- cor(df.train[, 2:ncol(df.train)])
corrplot(corr_mat, method = "square", order = "hclust",
         # adjust the color, size and rotation degree of the text label
         tl.col = "black", tl.cex = 0.6, tl.srt = 45,
         # adjust the color, format, size of the correlation display
         addCoef.col = "black", addCoefasPercent = TRUE, number.cex = 0.45,
         # draw 14 rectangles around the hierarchically clustered groups
         addrect = 14)
# Estimate the importance of each feature with a random forest,
# evaluated under repeated 5-fold cross-validation (3 repeats).
control <- trainControl(method = "repeatedcv", number = 5, repeats = 3)
model <- train(factor(diagnosis) ~ ., data = df.train, method = "rf",
               preProcess = "scale", trControl = control)
importance <- varImp(model, scale = FALSE)
plot(importance)
# One representative feature per correlated group (14 groups total),
# chosen by random-forest importance within each group.
feature_selected <- c(
  "texture_worst", "area_se", "perimeter_worst",
  "concave.points_worst", "concavity_worst", "fractal_dimension_se",
  "concavity_se", "texture_se", "smoothness_se",
  "smoothness_worst", "fractal_dimension_worst", "symmetry_se",
  "symmetry_mean", "symmetry_worst"
)
# Keep only the label plus the selected features for the next stage
df.train2 <- df.train[, c("diagnosis", feature_selected)]
#library(caret)
#library(randomForest)
# Recursive feature elimination with random-forest ranking and 5-fold CV.
# Fixed: the argument is spelled `sizes` -- the original `size=` only worked
# through R's partial argument matching.
control <- rfeControl(functions = rfFuncs, method = "cv", number = 5)
results <- rfe(df.train2[, -1], factor(df.train2[, 1]),
               sizes = 1:14, rfeControl = control)
# Features retained by the best-performing subset
predictors(results)
[1] "perimeter_worst" "concave.points_worst" "area_se" "concavity_worst"
[5] "texture_worst" "smoothness_worst" "symmetry_worst" "concavity_se"
#plot(results, type=c("g", "o"))
# Final feature subset based on the RFE results (11 features).
feature_selected <- c(
  "perimeter_worst", "concave.points_worst", "area_se",
  "concavity_worst", "texture_worst", "smoothness_worst",
  "symmetry_worst", "concavity_se", "fractal_dimension_worst",
  "symmetry_mean", "fractal_dimension_se"
)
# Restrict both splits to the label plus the final features
df.train2 <- df.train2[, c("diagnosis", feature_selected)]
df.test2 <- df.test[, c("diagnosis", feature_selected)]
We have implemented six different classification methods on both the full set of features and the reduced set of features.
# Random forest: set run.rf to TRUE to retrain; FALSE loads cached results
run.rf <- FALSE
source("../lib/rf.R")
if (run.rf) {
  # Fit on the full and the reduced feature sets, then cache to disk
  output_rf <- RF(df.train, df.test)
  output2_rf <- RF(df.train2, df.test2)
  save(output_rf, file = "../output/output_rf.RData")
  save(output2_rf, file = "../output/output2_rf.RData")
} else {
  # Reuse previously saved results
  for (f in c("../output/output_rf.RData", "../output/output2_rf.RData")) {
    load(f)
  }
}
# Logistic regression: set run.logi to TRUE to retrain; FALSE loads cache
run.logi <- FALSE
source("../lib/logi.R")
if (run.logi) {
  # Fit on the full and the reduced feature sets, then cache to disk
  output_logi <- logi(df.train, df.test)
  output2_logi <- logi(df.train2, df.test2)
  save(output_logi, file = "../output/output_logi.RData")
  save(output2_logi, file = "../output/output2_logi.RData")
} else {
  # Reuse previously saved results
  for (f in c("../output/output_logi.RData", "../output/output2_logi.RData")) {
    load(f)
  }
}
# Gradient boosting (GBM): set run.gbm to TRUE to retrain; FALSE loads cache
run.gbm <- FALSE
source("../lib/gbmp.r")
if (run.gbm) {
  # Fit on the full and the reduced feature sets, then cache to disk
  output_gbm <- gbmp(df.train, df.test)
  output2_gbm <- gbmp(df.train2, df.test2)
  save(output_gbm, file = "../output/output_gbm.RData")
  save(output2_gbm, file = "../output/output2_gbm.RData")
} else {
  # Reuse previously saved results
  for (f in c("../output/output_gbm.RData", "../output/output2_gbm.RData")) {
    load(f)
  }
}
# XGBoost: set run.xg to TRUE to retrain; FALSE loads cached results
run.xg <- FALSE
source("../lib/xgboost.r")
if (run.xg) {
  # Fit on the full and the reduced feature sets, then cache to disk
  output_xg <- xgb(df.train, df.test)
  output2_xg <- xgb(df.train2, df.test2)
  save(output_xg, file = "../output/output_xg.RData")
  save(output2_xg, file = "../output/output2_xg.RData")
} else {
  # Reuse previously saved results
  for (f in c("../output/output_xg.RData", "../output/output2_xg.RData")) {
    load(f)
  }
}
# AdaBoost: set run.ada to TRUE to retrain; FALSE loads cached results
run.ada <- FALSE
source("../lib/adaboost.r")
if (run.ada) {
  # Fit on the full and the reduced feature sets, then cache to disk
  output_ada <- adaboost(df.train, df.test)
  output2_ada <- adaboost(df.train2, df.test2)
  save(output_ada, file = "../output/output_ada.RData")
  save(output2_ada, file = "../output/output2_ada.RData")
} else {
  # Reuse previously saved results
  for (f in c("../output/output_ada.RData", "../output/output2_ada.RData")) {
    load(f)
  }
}
# SVM: set run.svm to TRUE to retrain; FALSE loads cached results
run.svm <- FALSE
source("../lib/svm.R")
if (run.svm) {
  # Fit on the full and the reduced feature sets, then cache to disk
  output_svm <- SVM(df.train, df.test)
  output2_svm <- SVM(df.train2, df.test2)
  save(output_svm, file = "../output/output_svm.RData")
  save(output2_svm, file = "../output/output2_svm.RData")
} else {
  # Reuse previously saved results
  for (f in c("../output/output_svm.RData", "../output/output2_svm.RData")) {
    load(f)
  }
}
We have compared the prediction accuracy and running time among the six classification methods with the two different sets of features, and have found that the two best models are SVM with all features (100% prediction accuracy and 0.07s training time) and Logistic Regression with reduced features (98% prediction accuracy and 0.03s training time).
# compute confusion matrix
# Each model's test-set predictions are compared against the true labels.
# df.test and df.test2 share the same diagnosis column, so df.test$diagnosis
# serves as the reference for both feature sets.
# NOTE(review): diagnosis was recoded to integer 0/1 during cleaning, and
# newer caret versions require factor inputs to confusionMatrix() -- confirm
# that the $prediction fields and the reference coerce as intended.
cm_rf <- confusionMatrix(output_rf$prediction,df.test$diagnosis)
cm_rf2 <- confusionMatrix(output2_rf$prediction,df.test$diagnosis)
cm_logi <- confusionMatrix(output_logi$prediction,df.test$diagnosis)
cm_logi2 <- confusionMatrix(output2_logi$prediction,df.test$diagnosis)
cm_gbm <- confusionMatrix(output_gbm$prediction,df.test$diagnosis)
cm_gbm2 <- confusionMatrix(output2_gbm$prediction,df.test$diagnosis)
cm_xg <- confusionMatrix(output_xg$prediction,df.test$diagnosis)
cm_xg2 <- confusionMatrix(output2_xg$prediction,df.test$diagnosis)
cm_ada <- confusionMatrix(output_ada$prediction,df.test$diagnosis)
cm_ada2 <- confusionMatrix(output2_ada$prediction,df.test$diagnosis)
cm_svm <- confusionMatrix(output_svm$prediction,df.test$diagnosis)
cm_svm2 <- confusionMatrix(output2_svm$prediction,df.test$diagnosis)
# compare prediction accuracy
# Values are listed row-major: first the six all-feature accuracies,
# then the six reduced-feature accuracies (overall[1] is "Accuracy").
accuracy <- c(cm_rf$overall[1], cm_logi$overall[1], cm_gbm$overall[1],
              cm_xg$overall[1], cm_ada$overall[1], cm_svm$overall[1],
              cm_rf2$overall[1], cm_logi2$overall[1], cm_gbm2$overall[1],
              cm_xg2$overall[1], cm_ada2$overall[1], cm_svm2$overall[1])
# Arrange as a 2 x 6 table: rows = feature set, columns = method
accuracy_comparison <- matrix(
  accuracy, nrow = 2, byrow = TRUE,
  dimnames = list(
    c("all_features", "reduced_features"),
    c("RandomForest", "Logistic Regression", "GBM",
      "XGBoost", "AdaBoost", "SVM")
  )
)
round(accuracy_comparison, 4)
RandomForest Logistic Regression GBM XGBoost AdaBoost SVM
all_features 0.9737 0.9825 0.9825 0.9737 0.9825 1.0000
reduced_features 0.9561 0.9825 0.9561 0.9649 0.9737 0.9561
# compare time
# Training times in seconds, same row-major layout as the accuracy table.
time <- c(output_rf$time, output_logi$time, output_gbm$time,
          output_xg$time, output_ada$time, output_svm$time,
          output2_rf$time, output2_logi$time, output2_gbm$time,
          output2_xg$time, output2_ada$time, output2_svm$time)
# Arrange as a 2 x 6 table: rows = feature set, columns = method
time_comparison <- matrix(
  time, nrow = 2, byrow = TRUE,
  dimnames = list(
    c("all_features", "reduced_features"),
    c("RandomForest", "Logistic Regression", "GBM",
      "XGBoost", "AdaBoost", "SVM")
  )
)
round(time_comparison, 2)
RandomForest Logistic Regression GBM XGBoost AdaBoost SVM
all_features 0.71 0.08 3.62 11.72 3.36 0.07
reduced_features 0.38 0.03 2.21 6.17 1.46 0.03
# Plot the confusion matrices of the two best models side by side:
# logistic regression on the reduced features and SVM on all features,
# with the overall accuracy percentage in each title.
# NOTE(review): par() is modified without being restored afterwards --
# subsequent plots in this session will keep the 1x2 layout.
par(mfrow=c(1,2))
fourfoldplot(cm_logi2$table, conf.level = 0, margin = 1,
main = paste0("Logistic Regression (", round(cm_logi2$overall[1]*100), "%)"))
fourfoldplot(cm_svm$table, conf.level = 0, margin = 1,
main = paste0("SVM (", round(cm_svm$overall[1]*100), "%)"))