Since a number of observations had Pareto \(\hat{k} > 0.7\), indicating that the PSIS-LOO calculation of \(\hat{elpd}\) is unreliable for both models, we perform K-fold cross-validation with \(K=10\) following Vehtari, Gelman, and Gabry (2016).
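(As a quick diagnostic, the problematic observations can be counted from the PSIS-LOO objects of the previous section; the following sketch assumes that these objects are stored in loo_ab and loo_da, and that pareto_k_values() from the loo package is available.)
# Hypothetical check: count the observations with Pareto k-hat above 0.7 for
# each model (loo_ab and loo_da are assumed from the PSIS-LOO section):
sum(pareto_k_values(loo_ab) > 0.7)
sum(pareto_k_values(loo_da) > 0.7)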
# Load R packages
library(ggplot2)
library(scales)
library(hexbin)
library(tidyr)
library(dplyr)
library(MASS)
library(rstan)
library(loo)
library(matrixStats)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
set.seed(42)
iter <- 2000
chains <- 3
We use the same data as before, but we create 10 lists, such that each list has 9/10 of the observations as training data and 1/10 as held-out data. In order to avoid having very few or no observations from a given participant in the training set of some list, we group the data by participant before doing the split.
load("dataNIG-SRC.Rda")
# Save the original row order of the data frame
dexp$row <- as.numeric(as.character(row.names(dexp)))
if (!file.exists("ldata.Rda")) {
# The following code ensures that there will not be a fold without a
# participant (or where a participant is underrepresented). (There might be a
# simpler way.) Assuming K = 10, the procedure is the following: 1) We
# extract 1/10 of the data and save it in the list G with k = 1; 2) We
# extract 1/9 of the remaining data and save it with k = 2; 3) We extract
# 1/8 of the remaining data and save it with k = 3; 4) ...; 10) We extract
# all the remaining data and save it with k = 10
K <- 10
d <- dexp
G <- list()
for (i in 1:K) {
G[[i]] <- sample_frac(group_by(d, participant), (1/(K + 1 - i)))
G[[i]]$k <- i
d <- anti_join(d, G[[i]], by = c("participant", "item", "winner", "RT",
"condition", "row"))
}
# We create a dataframe again:
dK <- bind_rows(G)
# We restore the original order of the data frame
dK <- dK[order(dK$row), ]
ldata <- plyr::llply(1:K, function(i) {
list(N_obs = nrow(dK), winner = dK$winner, RT = dK$RT, N_choices = max(dK$winner),
subj = as.numeric(as.factor(dK$participant)), N_subj = length(unique(dK$participant)),
item = as.numeric(as.factor(dK$item)), N_item = length(unique(dK$item)),
holdout = ifelse(dK$k == i, 1, 0))
})
save(ldata, file = "ldata.Rda")
} else {
load("ldata.Rda")
}
# All the folds include all the participants:
for (k in 1:10) {
print(paste0("Fold = ", k, "; N_subj = ", ldata[[k]]$N_subj, "; N_heldout = ",
sum(ldata[[k]]$holdout)))
}
## [1] "Fold = 1; N_subj = 182; N_heldout = 364"
## [1] "Fold = 2; N_subj = 182; N_heldout = 364"
## [1] "Fold = 3; N_subj = 182; N_heldout = 364"
## [1] "Fold = 4; N_subj = 182; N_heldout = 364"
## [1] "Fold = 5; N_subj = 182; N_heldout = 364"
## [1] "Fold = 6; N_subj = 182; N_heldout = 363"
## [1] "Fold = 7; N_subj = 182; N_heldout = 364"
## [1] "Fold = 8; N_subj = 182; N_heldout = 355"
## [1] "Fold = 9; N_subj = 182; N_heldout = 364"
## [1] "Fold = 10; N_subj = 182; N_heldout = 322"
In order to apply K-fold cross-validation, we modify the Stan models so that the log-likelihood is added to the target density only for observations that are not held out. We use the generated quantities section to calculate the pointwise log-likelihood for every observation, but we will later use only the log-likelihood of the held-out data for the calculation of \(\hat{elpd}\). See the files activation-based_h_Kfold.stan and direct_access_h_Kfold.stan for the details.
For the activation-based model:
(...)
model {
(...)
for (n in 1:N_obs) {
if(holdout[n] == 0){
vector[N_choices] alpha;
real psi;
alpha = alpha_0 + u[, subj[n]] + w[, item[n]];
psi = exp(psi_0 + u_psi[subj[n]]);
target += race(winner[n], RT[n], alpha, b, sigma, psi);
}
}
}
generated quantities {
(...)
vector[N_obs] log_lik;
(...)
for (n in 1:N_obs) {
vector[N_choices] alpha;
real psi;
alpha = alpha_0 + u[, subj[n]] + w[, item[n]];
psi = exp(psi_0 + u_psi[subj[n]]);
log_lik[n] = race(winner[n], RT[n], alpha, b, sigma, psi);
}
}
For the direct access model:
(...)
model {
(...)
for (n in 1:N_obs) {
if(holdout[n] == 0){
real mu_da;
real mu_b;
vector[N_choices] beta;
real psi;
mu_da = mu_da_0 + u_RT[1,subj[n]] + w_RT[1,item[n]];
mu_b = mu_b_0 + u_RT[2,subj[n]] + w_RT[2,item[n]];
beta = beta_0 + u[,subj[n]] + w[,item[n]];
psi = exp(psi_0 + u_psi[subj[n]]);
target += da(winner[n], RT[n], beta, P_b, mu_da, mu_b, sigma, psi);
}
}
}
generated quantities {
(...)
vector[N_obs] log_lik;
(...)
for (n in 1:N_obs) {
real mu_da;
real mu_b;
vector[N_choices] beta;
real psi;
vector[2] gen;
mu_da = mu_da_0 + u_RT[1,subj[n]] + w_RT[1,item[n]];
mu_b = mu_b_0 + u_RT[2,subj[n]] + w_RT[2,item[n]];
beta = beta_0 + u[,subj[n]] + w[,item[n]];
psi = exp(psi_0 + u_psi[subj[n]]);
log_lik[n] = da(winner[n], RT[n], beta, P_b, mu_da, mu_b, sigma, psi);
}
}
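In both K-fold models, the holdout vector is part of the data passed to Stan; a minimal sketch of the corresponding declaration in the data block (see the .stan files for the exact code) would be:
data {
(...)
// 1 if the observation is held out in this fold, 0 otherwise:
int<lower = 0, upper = 1> holdout[N_obs];
(...)
}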
Then we use the following function to parallelize the 10 runs of both models:
# The following function can run all the chains of all the folds of the
# model in parallel:
stan_kfold <- function(file, list_of_datas, chains, cores, ...) {
library(pbmcapply)
K <- length(list_of_datas)
model <- stan_model(file = file)
# First parallelize all chains:
sflist <- pbmclapply(1:(K * chains), mc.cores = cores, function(i) {
# Fold number:
k <- ceiling(i/chains)
s <- sampling(model, data = list_of_datas[[k]], chains = 1, chain_id = i,
...)
return(s)
})
# Then merge the K * chains single-chain stanfits into K stanfits:
stanfit <- list()
for (k in 1:K) {
# Indices of the `chains` chains that belong to fold k:
inchains <- (chains * (k - 1) + 1):(chains * k)
# Merge the chains of each fold:
stanfit[[k]] <- sflist2stanfit(sflist[inchains])
}
return(stanfit)
}
We run the models and extract the log-likelihood evaluated at the posterior simulations of the parameter values:
# Wrapper function to extract the log_lik of the held-out data, given a list
# of stanfits and a list that indicates with 1 and 0 whether each
# observation was held out or not:
extract_log_lik_K <- function(list_of_stanfits, list_of_holdout, ...) {
K <- length(list_of_stanfits)
list_of_log_liks <- plyr::llply(1:K, function(k) {
extract_log_lik(list_of_stanfits[[k]], ...)
})
# `log_lik_heldout` will include the log-likelihood of all the held-out data
# of all the folds. We define `log_lik_heldout` as a (samples x N_obs)
# matrix (similar to each log_lik matrix)
log_lik_heldout <- list_of_log_liks[[1]] * NA
for (k in 1:K) {
log_lik <- list_of_log_liks[[k]]
samples <- dim(log_lik)[1]
N_obs <- dim(log_lik)[2]
# This is a matrix of the same size as log_lik_heldout, with 1 where the
# data were held out in fold k:
heldout <- matrix(rep(list_of_holdout[[k]], each = samples), nrow = samples)
# Sanity check that the previous log_lik is not being overwritten:
if (any(!is.na(log_lik_heldout[heldout == 1]))) {
warning("Heldout log_lik has been overwritten!!!!")
}
# We save the log_lik of fold k in the matrix:
log_lik_heldout[heldout == 1] <- log_lik[heldout == 1]
}
return(log_lik_heldout)
}
# We apply the function to both models:
if (!file.exists("log_lik_ab.Rda")) {
# We run all the chains of all the folds of the activation-based model in
# parallel: (We are using 30 cores of a server)
ab10Kfits <- stan_kfold("activation-based_h_Kfold.stan", list_of_datas = ldata,
chains = 3, cores = 30, seed = 42, iter = iter)
holdout <- lapply(ldata, "[[", "holdout")
# We extract all the held_out log_lik of all the folds
log_lik_ab <- extract_log_lik_K(ab10Kfits, holdout, "log_lik")
save(log_lik_ab, file = "log_lik_ab.Rda")
} else {
load("log_lik_ab.Rda")
}
if (!file.exists("log_lik_da.Rda")) {
# We run all the chains of all the folds of the direct access model in
# parallel:
da10Kfits <- stan_kfold("direct_access_h_Kfold.stan", list_of_datas = ldata,
chains = 3, cores = 30, seed = 42, iter = iter)
holdout <- lapply(ldata, "[[", "holdout")
# We extract all the held_out log_lik of all the folds
log_lik_da <- extract_log_lik_K(da10Kfits, holdout, "log_lik")
save(log_lik_da, file = "log_lik_da.Rda")
} else {
load("log_lik_da.Rda")
}
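As an additional sanity check (not part of the original code), we can verify that every observation received a held-out log-likelihood from exactly one fold, i.e., that neither matrix contains missing cells:
# Both (samples x N_obs) matrices should be completely filled after looping
# over the 10 folds:
stopifnot(!anyNA(log_lik_ab), !anyNA(log_lik_da))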
The following function is an adaptation of the loo function from the R package loo; it calculates the pointwise and total \(\hat{elpd}\) of K-fold cross-validation:
kfold <- function(log_lik_heldout) {
library(matrixStats)
logColMeansExp <- function(x) {
# should be more stable than log(colMeans(exp(x)))
S <- nrow(x)
colLogSumExps(x) - log(S)
}
# See equation (20) of Vehtari, Gelman, and Gabry (2016)
pointwise <- matrix(logColMeansExp(log_lik_heldout), ncol = 1)
colnames(pointwise) <- "elpd"
# See equation (21) of Vehtari, Gelman, and Gabry (2016)
elpd_kfold <- sum(pointwise)
se_elpd_kfold <- sqrt(ncol(log_lik_heldout) * var(pointwise))
out <- list(pointwise = pointwise, elpd_kfold = elpd_kfold, se_elpd_kfold = se_elpd_kfold)
structure(out, class = "loo")
}
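Written out, for each observation \(i\) the function computes the pointwise estimate
\[
\hat{elpd}_i = \log \left( \frac{1}{S} \sum_{s=1}^{S} p\left(y_i \mid \theta^{s}\right) \right),
\]
where the \(S\) draws \(\theta^{s}\) come from the posterior of the fold in which \(y_i\) was held out (computed via colLogSumExps for numerical stability); the total is \(\hat{elpd} = \sum_{i=1}^{N} \hat{elpd}_i\), with standard error \(\sqrt{N \, \mathrm{Var}_i(\hat{elpd}_i)}\).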
We can now repeat the same analysis using K-fold cross-validation instead of PSIS-LOO:
(kfold_ab <- kfold(log_lik_ab))
## Computed from log-likelihood matrix
##
## Estimate SE
## elpd_kfold -26582.0 94.8
(kfold_da <- kfold(log_lik_da))
## Computed from log-likelihood matrix
##
## Estimate SE
## elpd_kfold -26467.0 94.9
Comparing the models using K-fold cross-validation also reveals an estimated difference in \(\hat{elpd}\) in favor of the direct access model over the activation-based model:
(kfold_comparison <- compare(kfold_ab, kfold_da))
## elpd_diff se
## 115.0 23.5
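The difference and its standard error can also be reproduced by hand from the pointwise matrices returned by our kfold function; the following sketch (not part of the original analysis) mirrors what compare computes for two models:
# Pointwise elpd difference (direct access minus activation-based), its sum,
# and the normal-approximation standard error sqrt(N * var):
diff_elpd <- kfold_da$pointwise[, "elpd"] - kfold_ab$pointwise[, "elpd"]
c(elpd_diff = sum(diff_elpd), se = sqrt(length(diff_elpd) * var(diff_elpd)))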
Below we compare the models on their pointwise \(\hat{elpd}\) according to the K-fold cross-validation calculation. These graphs are very similar to the ones using \(\hat{elpd}\) calculated with PSIS-LOO. (The code producing the graphs is available in the Rmd file.)
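For illustration, a minimal version of such a pointwise comparison plot could be produced as follows (a sketch, not the exact code from the Rmd file; it reuses diff_elpd from above):
# Plot the pointwise difference in elpd per observation; points above zero
# favor the direct access model:
dplot <- data.frame(obs = seq_along(diff_elpd), diff_elpd = diff_elpd)
ggplot(dplot, aes(x = obs, y = diff_elpd)) + geom_point(alpha = 0.3) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(x = "Observation", y = "Difference in elpd (direct access - activation-based)")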
Vehtari, Aki, Andrew Gelman, and Jonah Gabry. 2016. “Practical Bayesian Model Evaluation Using Leave-One-Out Cross-Validation and WAIC.” Statistics and Computing.