Previous: 2. Process data
Next: 4. Make predictions

3. Train models

The summary statistics generated in the previous step are now combined with outcomes of the playoffs series, and then used to train models. To facilitate use of multiple types of statistical models, I am using the caret package, which provides a unified interface for this purpose. And in order to use both CPU cores on my laptop, I am using package doMC.

suppressMessages({
  # caret uses package plyr, but whenever dplyr and plyr are both loaded,
  # dplyr should be loaded after plyr. Hence plyr is loaded explicitly.
  library(plyr)
  library(dplyr)
  library(caret)
  library(doMC)
})
registerDoMC(cores=2)

Define a function to detect the winner of a series. It takes a vector of winners of individual games and returns the overall winner.

series_winner <- function(x) {
  counts <- sort(table(x), decreasing=TRUE)
  if ((length(counts) == 1) || (counts[1] > counts[2]))
    return(names(counts)[1])
  stop("I'm sorry, but I couldn't figure out the winner: ",
    paste(x, collapse=", "))
}

Define a function to append game statistics to the outcome of a playoff series. Its behavior can be controlled with the argument which between these three options:

add_stats <- function(games, gamestats, which=c("both", "single", "overall")) {
  which <- match.arg(which)
  if (which == "overall") {
    away <- left_join(games, gamestats[["overall"]],
      by=c("season", awayteam="team"))
    home <- left_join(games, gamestats[["overall"]],
      by=c("season", hometeam="team"))
  } else {
    away <- left_join(games, gamestats[["away"]],
      by=c("season", awayteam="team"))
    home <- left_join(games, gamestats[["home"]],
      by=c("season", hometeam="team"))
  }

  if (which == "both") {
    away2 <- left_join(games, gamestats[["home"]],
      by=c("season", awayteam="team"))
    home2 <- left_join(games, gamestats[["away"]],
      by=c("season", hometeam="team"))
  }

  games$goals <- away$goals - home$goals
  games$shots <- away$shots - home$shots
  games$faceoffs <- away$faceoffs - home$faceoffs
  games$penalties <- away$penalties - home$penalties
  games$pp <- away$pp - home$pk
  games$pk <- away$pk - home$pp

  if (which == "both") {
    games$goals2 <- away2$goals - home2$goals
    games$shots2 <- away2$shots - home2$shots
    games$faceoffs2 <- away2$faceoffs - home2$faceoffs
    games$penalties2 <- away2$penalties - home2$penalties
    games$pp2 <- away2$pp - home2$pk
    games$pk2 <- away2$pk - home2$pp
  }

  games
}

Load data and append summary statistics.

load(file.path("source-data", "nhlscrapr-core.RData"))
rm(list=c("roster.master", "roster.unique"))

gamestats <- readRDS("processed.rds")

games <- tbl_df(games)
games <- games %>%
  filter(status != 0, session == "Playoffs", season != "20142015") %>%
  mutate(awayscore=as.integer(awayscore), homescore=as.integer(homescore))

playoffs <- games %>%
  mutate(winner=ifelse(awayscore > homescore, awayteam, hometeam)) %>%
  group_by(season, round=substring(gcode, 3, 3),
    series=substring(gcode, 4, 4)) %>%
  summarise(awayteam=first(awayteam), hometeam=first(hometeam),
    winner=series_winner(winner)) %>%
  ungroup() %>%
  select(season, awayteam, hometeam, winner) %>%
  mutate(winner=as.factor(ifelse(awayteam == winner, "away", "home")))

playoffs <- add_stats(playoffs, gamestats)

playoffs <- playoffs %>%
  select(-season, -awayteam, -hometeam)

The training data looks like this:

head(playoffs)
winner goals shots faceoffs penalties pp pk goals2 shots2 faceoffs2 penalties2 pp2 pk2
home -0.140 -0.080 0.035 0.098 -0.006 -0.045 -0.077 0.009 0.050 -0.011 0.002 -0.040
home -0.171 -0.092 -0.041 0.108 0.080 0.019 -0.029 -0.001 0.031 0.012 0.013 0.052
home -0.104 -0.075 -0.008 0.149 -0.017 0.005 0.046 0.017 -0.004 -0.020 -0.010 -0.018
home -0.035 -0.133 -0.038 0.089 0.005 -0.029 0.061 -0.053 0.007 -0.028 0.016 0.014
home -0.118 -0.055 -0.023 0.016 0.026 -0.004 -0.025 -0.031 0.029 -0.037 -0.042 -0.051
away -0.105 -0.030 0.009 -0.001 0.014 -0.124 0.100 0.005 0.054 -0.105 0.011 -0.083

Next, define parameters for the model training. I am preprocessing the data with centering and scaling, and then using a 10-fold cross-validation repeated 10 times for parameter tuning. For each parameter, five different values are evaluated via the cross-validation, and the combination with the best overall accuracy chosen. A final model is then fitted with all of the training data and the chosen parameter values.

In order to make the analysis reproducible, sets of random seeds are generated to be used at each point of training.

method <- "repeatedcv"
number <- 10
repeats <- 10
preProcess <- c("center", "scale")
tuneLength <- 5
metric <- "Accuracy"
maxParameters <- 5
seeds <- vector(mode="list", length=repeats*number+1)
for (i in seq_along(seeds))
  seeds[[i]] <- (1000*i+1):(1000*i+1+tuneLength^maxParameters)

fitControl <- trainControl(method=method, number=number, repeats=repeats,
  seeds=seeds)

Define a function that trains the models.

train_model <- function(method) {
  message("Training model: ", method, "...", appendLF=FALSE)
  set.seed(7474505)
  suppressMessages({ captured <- capture.output({
    fit <- train(winner ~ ., data=playoffs,
      method=method, trControl=fitControl, preProcess=preProcess,
      metric=metric, tuneLength=tuneLength)
  })})
  message()
  fit
}

Define which models we want to include, and then train them. I am including generalized linear model, linear discriminant analysis, neural network, random forests, and a support vector machine with a linear kernel. Each one of them undergoes the cross-validation for parameter tuning, and a final model is built with all of the training data.

Afterwards, save the resulting five final models.

methods <- c("glm", "lda", "nnet", "rf", "svmLinear")
models <- lapply(methods, train_model)
## Training model: glm...
## Training model: lda...
## Training model: nnet...
## Training model: rf...
## Training model: svmLinear...
names(models) <- methods

saveRDS(models, "models.rds")

Next: 4. Make predictions
Previous: 2. Process data