Last updated: 2018-05-22
Code version: d1b408b
Perform 5-fold cross-validation. Do the following for each fold:
Fit spml to training samples, and use the gene weights learned from training samples to predict cell times in testing samples
Compute correlation between predicted cell times and the fucci-labeled cell times. The correlation quanties the extent to which the two data series are rotationally dependent. The larger the value the fewer rotations/transformations needed to rotate data series A to data series B. Also compute 95% confidence interval using bootstraping to quantify the level of statistical significance of correlations.
Note there are some issues in the computations of the permutation-based p-values. The p-values turned out to be the same across folds…
Importantly, the method estimates the average direction (or angle) given the covariates, which does not address the question that we are asking.
I stopped pursing this idea and instead focus on nonparametric regerssion methods.
The approach implemented here references the method described in Presnell, Brett, Scott P. Morrison, and Ramon C. Littell. “Projected Multivariate Linear Models for Directional Data.” Journal of the American Statistical Association 93, no. 443 (1998): 1068-077. doi:10.2307/2669850. R package Rfast
was used to perform the computations.
Get data
library(Biobase)
df <- readRDS(file="../data/eset-final.rds")
pdata <- pData(df)
fdata <- fData(df)
# select endogeneous genes
counts <- exprs(df)[grep("ENSG", rownames(df)), ]
log2cpm.all <- t(log2(1+(10^6)*(t(counts)/pdata$molecules)))
macosko <- readRDS("../data/cellcycle-genes-previous-studies/rds/macosko-2015.rds")
log2cpm.all <- log2cpm.all[,order(pdata$theta)]
pdata <- pdata[order(pdata$theta),]
log2cpm.quant <- readRDS("../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.rds")
# import previously identifid cell cycle genes
cyclegenes <- readRDS("../output/npreg-methods.Rmd/cyclegenes.rds")
cyclegenes.names <- colnames(cyclegenes)[2:6]
quant.sub <- log2cpm.quant[rownames(log2cpm.quant) %in% cyclegenes.names, ]
Analysis
source("../code/utility.R")
N <- dim(quant.sub)[2]
parts <- partitionSamples(c(1:N), runs=5, nsize.each = c(rep(177,4), 180))
res <- lapply(1:5, function(iter) {
ii_train <- parts$partitions[[iter]][[1]]
ii_test <- parts$partitions[[iter]][[2]]
theta_test <- pdata$theta[ii_test]
theta_train <- pdata$theta[ii_train]
cycle.spml.testmodel(Y_test = quant.sub[,ii_test],
Y_train = quant.sub[,ii_train],
theta_test = theta_test,
theta_train = theta_train)
})
# the permutation-based pvalue is questionable...
tab <- do.call(rbind, lapply(res, function(xx) {
data.frame(rho=xx$rho,
boot_95ci_low=xx$boot_95ci_low,
boot_95ci_high=xx$boot_95ci_high,
pval=xx$pval)
}) )
with(tab, rho > boot_95ci_low & rho < boot_95ci_high)
[1] TRUE TRUE TRUE TRUE TRUE
Get data
df <- readRDS(file="../data/eset-final.rds")
pdata <- pData(df)
fdata <- fData(df)
# select endogeneous genes
counts <- exprs(df)[grep("ENSG", rownames(df)), ]
log2cpm.all <- t(log2(1+(10^6)*(t(counts)/pdata$molecules)))
macosko <- readRDS("../data/cellcycle-genes-previous-studies/rds/macosko-2015.rds")
log2cpm.all <- log2cpm.all[,order(pdata$theta)]
pdata <- pdata[order(pdata$theta),]
log2cpm.quant <- readRDS("../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.rds")
out.stats.ordered.sig.101 <- readRDS("../output/npreg-trendfilter-quantile.Rmd/out.stats.ordered.sig.101.rds")
out.stats.ordered.sig.476 <- readRDS("../output/npreg-trendfilter-quantile.Rmd/out.stats.ordered.sig.476.rds")
quant.101 <- log2cpm.quant[rownames(log2cpm.quant) %in% rownames(out.stats.ordered.sig.101), ]
quant.476 <- log2cpm.quant[rownames(log2cpm.quant) %in% rownames(out.stats.ordered.sig.476), ]
Analysis code
source("../code/utility.R")
# 20 fold
nfold <- 5
N <- dim(quant.101)[2]
parts <- partitionSamples(c(1:N), runs=nfold, nsize.each=c(177,177,177,177,180))
df <- quant.101
res <- lapply(1:nfold, function(i) {
ii_train <- parts$partitions[[i]]$train
ii_test <- parts$partitions[[i]]$test
theta_test <- pdata$theta[ii_test]
theta_train <- pdata$theta[ii_train]
cycle.spml.testmodel(Y_test = df[,ii_test],
Y_train = df[,ii_train],
theta_test = theta_test,
theta_train = theta_train)
})
# the permutation-based pvalue is questionable...
tab <- do.call(rbind, lapply(res, function(xx) {
data.frame(rho=xx$rho,
boot_95ci_low=xx$boot_95ci_low,
boot_95ci_high=xx$boot_95ci_high,
pval=xx$pval)
}) )
with(tab, rho > boot_95ci_low & rho < boot_95ci_high)
[1] TRUE TRUE TRUE TRUE TRUE
Results too perfect: turns out more investigations are needed for the correlation between circular variables.
Check prediction for some random data
# select external validation samples
set.seed(99)
nvalid <- round(ncol(log2cpm.quant)*.15)
ii.valid <- sample(1:ncol(log2cpm.quant), nvalid, replace = F)
ii.nonvalid <- setdiff(1:ncol(log2cpm.quant), ii.valid)
log2cpm.quant.nonvalid <- log2cpm.quant[,ii.nonvalid]
log2cpm.quant.valid <- log2cpm.quant[,ii.valid]
theta.nonvalid <- pdata$theta[ii.nonvalid]
theta.valid <- pdata$theta[ii.valid]
sig.genes <- readRDS("../output/npreg-trendfilter-quantile.Rmd/out.stats.ordered.sig.rds")
expr.sig <- log2cpm.quant.nonvalid[rownames(log2cpm.quant.nonvalid) %in% rownames(sig.genes)[1:10], ]
# set training samples
source("../peco/R/primes.R")
source("../peco/R/partitionSamples.R")
parts <- partitionSamples(1:ncol(log2cpm.quant.nonvalid), runs=5,
nsize.each = rep(151,5))
part_indices <- parts$partitions
Y_train <- expr.sig[,part_indices[[1]]$train]
theta_train <- theta.nonvalid[part_indices[[1]]$train]
Y_test <- expr.sig[,part_indices[[1]]$test]
theta_test <- theta.nonvalid[part_indices[[1]]$test]
fit_train <- cycle.spml.trainmodel(Y_train, theta_train)
pred_cart <- cbind(1,t(Y_test))%*%fit_train$be
pred_polar <- atan( pred_cart[, 2] / pred_cart[, 1] ) + pi * I(pred_cart[, 1] < 0)
rho_test <- rFLIndTestRand(pred_polar, theta_test, 9999)
boot_ci <- rhoFLCIBoot(pred_polar, theta_test, 95, 9999)
sessionInfo()
R version 3.4.4 (2018-03-15)
Platform: x86_64-redhat-linux-gnu (64-bit)
Running under: Scientific Linux 7.4 (Nitrogen)
Matrix products: default
BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] assertthat_0.2.0 Rfast_1.9.0 RcppZiggurat_0.1.4
[4] Rcpp_0.12.16 Biobase_2.38.0 BiocGenerics_0.24.0
loaded via a namespace (and not attached):
[1] digest_0.6.15 rprojroot_1.3-2 backports_1.1.2 git2r_0.21.0
[5] magrittr_1.5 evaluate_0.10.1 stringi_1.1.7 rmarkdown_1.9
[9] tools_3.4.4 stringr_1.3.0 yaml_2.1.18 compiler_3.4.4
[13] htmltools_0.3.6 knitr_1.20
This R Markdown site was created with workflowr