Previous: 1. Scrape raw data
Next: 3. Train models

2. Process data

In order to train models on the outcomes of past playoffs, I am calculating summary statistics on each teams performance in the regular season. This script reads the data files produced by nhlscrapr and generates these summaries for every team and for every season. It also generates separate statistics for away and home games, as well as an overall metric.

First, load the dplyr package for data manipulation:

suppressMessages({
  library(dplyr)
})

Next, define a function to calculate the summary statistics for each season. First for each game, calculate for both the away and home teams:

Then, calculate the average for the entire regular season. Separate averages are calculated for each team when they are playing away, when they are playing at home, and also the overall for both.

process_season <- function(theseason) {
  message("Processing season ", substring(theseason, 1, 4), "-",
    substring(theseason, 5, 8), "...", appendLF=FALSE)
  load(file.path("source-data", paste0("nhlscrapr-", theseason, ".RData")))
  grand.data <- tbl_df(grand.data)
  grand.data$season <- as.character(grand.data$season)

  gamestats <- grand.data %>%
    filter(ev.team %in% unique(grand.data$hometeam)) %>%
    filter(substring(gcode, 1, 1) == "2") %>%
    mutate(season==season) %>%
    group_by(season, gcode) %>%
    summarise(
      awayteam=first(awayteam),
      hometeam=first(hometeam),
      totalgoals=sum(etype == "GOAL"),
      awaygoals=sum(ev.team == awayteam & etype == "GOAL"),
      homegoals=sum(ev.team == hometeam & etype == "GOAL"),
      totalshots=sum(etype %in% c("SHOT", "GOAL")),
      awayshots=sum(ev.team == awayteam & etype %in% c("SHOT", "GOAL")),
      homeshots=sum(ev.team == hometeam & etype %in% c("SHOT", "GOAL")),
      totalfaceoffs=sum(etype == "FAC"),
      awayfaceoffs=sum(ev.team == awayteam & etype == "FAC"),
      homefaceoffs=sum(ev.team == hometeam & etype == "FAC"),
      totalpenalties=sum(etype == "PENL"),
      awaypenalties=sum(ev.team == awayteam & etype == "PENL"),
      homepenalties=sum(ev.team == hometeam & etype == "PENL"),
      awaypp=sum(ev.team == awayteam & etype == "GOAL" &
        away.skaters > home.skaters),
      homepp=sum(ev.team == hometeam & etype == "GOAL" &
        away.skaters < home.skaters),
      awaysh=sum(ev.team == awayteam & etype == "GOAL" &
        away.skaters < home.skaters),
      homesh=sum(ev.team == hometeam & etype == "GOAL" &
        away.skaters > home.skaters))

  awaygames <- gamestats %>%
    ungroup() %>%
    transmute(
      season=season,
      team=awayteam,
      goals=awaygoals / totalgoals,
      shots=awayshots / totalshots,
      faceoffs=awayfaceoffs / totalfaceoffs,
      penalties=ifelse(totalpenalties==0, NA, awaypenalties / totalpenalties),
      pp=ifelse(homepenalties==0, NA, awaypp / homepenalties),
      pk=ifelse(awaypenalties==0, NA, homepp / awaypenalties))

  homegames <- gamestats %>%
    ungroup() %>%
    transmute(
      season=season,
      team=hometeam,
      goals=homegoals / totalgoals,
      shots=homeshots / totalshots,
      faceoffs=homefaceoffs / totalfaceoffs,
      penalties=ifelse(totalpenalties==0, NA, homepenalties / totalpenalties),
      pp=ifelse(awaypenalties==0, NA, homepp / awaypenalties),
      pk=ifelse(homepenalties==0, NA, awaypp / homepenalties))

  awaystats <- awaygames %>%
    group_by(season, team) %>%
    summarise_each(funs(mean(., na.rm=TRUE)))

  homestats <- homegames %>%
    group_by(season, team) %>%
    summarise_each(funs(mean(., na.rm=TRUE)))

  overallstats <- bind_rows(awaystats, homestats) %>%
    group_by(season, team) %>%
    summarise_each(funs(mean(., na.rm=TRUE)))

  message()
  list(away=awaystats, home=homestats, overall=overallstats)
}

Finally, load all available seasons and run the function defined above on each one, combine results together, and save the processed data.

load(file.path("source-data", "nhlscrapr-core.RData"))
seasons <- unique(games$season)
rm(list=c("games", "roster.master", "roster.unique"))

seasonstats <- lapply(seasons, process_season)
## Processing season 2002-2003...
## Processing season 2003-2004...
## Processing season 2005-2006...
## Processing season 2006-2007...
## Processing season 2007-2008...
## Processing season 2008-2009...
## Processing season 2009-2010...
## Processing season 2010-2011...
## Processing season 2011-2012...
## Processing season 2012-2013...
## Processing season 2013-2014...
## Processing season 2014-2015...
gamestats <- list(
  away=bind_rows(lapply(seasonstats, "[[", "away")),
  home=bind_rows(lapply(seasonstats, "[[", "home")),
  overall=bind_rows(lapply(seasonstats, "[[", "overall")))
rm(list=c("seasons", "seasonstats"))
  
saveRDS(gamestats, "processed.rds")

This is what the generated summary statistics look like:

head(gamestats[["away"]])
season team goals shots faceoffs penalties pp pk
20022003 ANA 0.480 0.477 0.544 0.517 0.159 0.095
20022003 ATL 0.433 0.434 0.467 0.504 0.147 0.139
20022003 BOS 0.422 0.503 0.481 0.555 0.137 0.103
20022003 BUF 0.416 0.482 0.493 0.526 0.118 0.119
20022003 CAR 0.384 0.492 0.512 0.514 0.105 0.169
20022003 CBJ 0.376 0.432 0.472 0.511 0.110 0.104
head(gamestats[["home"]])
season team goals shots faceoffs penalties pp pk
20022003 ANA 0.595 0.497 0.564 0.464 0.097 0.086
20022003 ATL 0.447 0.471 0.456 0.471 0.102 0.113
20022003 BOS 0.546 0.562 0.517 0.494 0.138 0.121
20022003 BUF 0.492 0.524 0.502 0.473 0.117 0.077
20022003 CAR 0.444 0.533 0.534 0.447 0.087 0.136
20022003 CBJ 0.535 0.460 0.488 0.469 0.126 0.122
head(gamestats[["overall"]])
season team goals shots faceoffs penalties pp pk
20022003 ANA 0.538 0.487 0.554 0.490 0.128 0.090
20022003 ATL 0.440 0.452 0.461 0.487 0.124 0.126
20022003 BOS 0.484 0.533 0.499 0.525 0.137 0.112
20022003 BUF 0.454 0.503 0.497 0.499 0.117 0.098
20022003 CAR 0.414 0.513 0.523 0.481 0.096 0.153
20022003 CBJ 0.456 0.446 0.480 0.490 0.118 0.113

Next: 3. Train models
Previous: 1. Scrape raw data