Previous: 1. Scrape raw data
Next: 3. Train models

3. Process data

This script reads the data files produced by nhlscrapr and generates a summary of each team’s performance in the regular season each year.

First, load packages and the Canadian programming library:

suppressMessages({
  library(dplyr)
})
source("canadian.R")

Then define a function to calculate each team’s performance for a given season.

process_season <- function(theseason) {
  message("Processing season ", substring(theseason, 1, 4), "-",
    substring(theseason, 5, 8), "...", appendLF=FALSE)
  load(file.path("source-data", paste0("nhlscrapr-", theseason, ".RData")))
  grand.data <- tbl_df(grand.data)
  grand.data$season <- as.character(grand.data$season)

  gamestats <- grand.data %>%
    filter(ev.team %in% unique(grand.data$hometeam)) %>%
    filter(substring(gcode, 1, 1) == "2") %>%
    mutate(season==season) %>%
    group_by(season, gcode) %>%
    summarise(
      awayteam=first(awayteam),
      hometeam=first(hometeam),
      totalgoals=sum(etype == "GOAL"),
      awaygoals=sum(ev.team == awayteam & etype == "GOAL"),
      homegoals=sum(ev.team == hometeam & etype == "GOAL"),
      totalshots=sum(etype %in% c("SHOT", "GOAL")),
      awayshots=sum(ev.team == awayteam & etype %in% c("SHOT", "GOAL")),
      homeshots=sum(ev.team == hometeam & etype %in% c("SHOT", "GOAL")),
      totalfaceoffs=sum(etype == "FAC"),
      awayfaceoffs=sum(ev.team == awayteam & etype == "FAC"),
      homefaceoffs=sum(ev.team == hometeam & etype == "FAC"),
      totalpenalties=sum(etype == "PENL"),
      awaypenalties=sum(ev.team == awayteam & etype == "PENL"),
      homepenalties=sum(ev.team == hometeam & etype == "PENL"),
      awaypp=sum(ev.team == awayteam & etype == "GOAL" &
        away.skaters > home.skaters),
      homepp=sum(ev.team == hometeam & etype == "GOAL" &
        away.skaters < home.skaters),
      awaysh=sum(ev.team == awayteam & etype == "GOAL" &
        away.skaters < home.skaters),
      homesh=sum(ev.team == hometeam & etype == "GOAL" &
        away.skaters > home.skaters))

  awaygames <- gamestats %>%
    ungroup() %>%
    transmute(
      season=season,
      team=awayteam,
      goals=awaygoals / totalgoals,
      shots=awayshots / totalshots,
      faceoffs=awayfaceoffs / totalfaceoffs,
      penalties=ifelse(totalpenalties==0, NA, awaypenalties / totalpenalties),
      pp=ifelse(homepenalties==0, NA, awaypp / homepenalties),
      pk=ifelse(awaypenalties==0, NA, homepp / awaypenalties))

  homegames <- gamestats %>%
    ungroup() %>%
    transmute(
      season=season,
      team=hometeam,
      goals=homegoals / totalgoals,
      shots=homeshots / totalshots,
      faceoffs=homefaceoffs / totalfaceoffs,
      penalties=ifelse(totalpenalties==0, NA, homepenalties / totalpenalties),
      pp=ifelse(awaypenalties==0, NA, homepp / awaypenalties),
      pk=ifelse(homepenalties==0, NA, awaypp / homepenalties))

  awaystats <- awaygames %>%
    group_by(season, team) %>%
    summarise_each(funs(mean(., na.rm=TRUE)))

  homestats <- homegames %>%
    group_by(season, team) %>%
    summarise_each(funs(mean(., na.rm=TRUE)))

  overallstats <- bind_rows(awaystats, homestats) %>%
    group_by(season, team) %>%
    summarise_each(funs(mean(., na.rm=TRUE)))

  message()
  list(away=awaystats, home=homestats, overall=overallstats)
}

Load all available seasons and run the function on each one, combine, and save.

load(file.path("source-data", "nhlscrapr-core.RData"))
seasons <- unique(games$season)
rm(list=c("games", "roster.master", "roster.unique"))

# separate
seasonstats <- lapply(seasons, process_season)
## Processing season 2002-2003...
## Processing season 2003-2004...
## Processing season 2005-2006...
## Processing season 2006-2007...
## Processing season 2007-2008...
## Processing season 2008-2009...
## Processing season 2009-2010...
## Processing season 2010-2011...
## Processing season 2011-2012...
## Processing season 2012-2013...
## Processing season 2013-2014...
## Processing season 2014-2015...
gamestats <- list(
  away=bind_rows(lapply(seasonstats, "[[", "away")),
  home=bind_rows(lapply(seasonstats, "[[", "home")),
  overall=bind_rows(lapply(seasonstats, "[[", "overall")))
rm(list=c("seasons", "seasonstats"))
  
saveRDS(gamestats, "processed.rds")
head(gamestats[["away"]])
head(gamestats[["home"]])
head(gamestats[["overall"]])
season team goals shots faceoffs penalties pp pk
20022003 ANA 0.480 0.477 0.544 0.517 0.159 0.095
20022003 ATL 0.433 0.434 0.467 0.504 0.147 0.139
20022003 BOS 0.422 0.503 0.481 0.555 0.137 0.103
20022003 BUF 0.416 0.482 0.493 0.526 0.118 0.119
20022003 CAR 0.384 0.492 0.512 0.514 0.105 0.169
20022003 CBJ 0.376 0.432 0.472 0.511 0.110 0.104
season team goals shots faceoffs penalties pp pk
20022003 ANA 0.595 0.497 0.564 0.464 0.097 0.086
20022003 ATL 0.447 0.471 0.456 0.471 0.102 0.113
20022003 BOS 0.546 0.562 0.517 0.494 0.138 0.121
20022003 BUF 0.492 0.524 0.502 0.473 0.117 0.077
20022003 CAR 0.444 0.533 0.534 0.447 0.087 0.136
20022003 CBJ 0.535 0.460 0.488 0.469 0.126 0.122
season team goals shots faceoffs penalties pp pk
20022003 ANA 0.538 0.487 0.554 0.490 0.128 0.090
20022003 ATL 0.440 0.452 0.461 0.487 0.124 0.126
20022003 BOS 0.484 0.533 0.499 0.525 0.137 0.112
20022003 BUF 0.454 0.503 0.497 0.499 0.117 0.098
20022003 CAR 0.414 0.513 0.523 0.481 0.096 0.153
20022003 CBJ 0.456 0.446 0.480 0.490 0.118 0.113

Next: 3. Train models
Previous: 1. Scrape raw data