Last updated: 2018-07-12
Code version: a6b5299
Split data to validation samples and training samples.
data/withheld.rds
and data/training.rds
.code/working/makedata_supervised.R
.In the training sample, repeat the following 5 times. Each time, take 4/5 of the samples, fit trendfilter and identified the top X cyclical genes, and the compute the prediction error margin as a function of the top X cyclical genes.
code/run_methods.R
: runs all methods, including supervised, unsupervised trendfilter, unsupervised bspline, unsupervised lowess, seurat) and compute prediction error margin.code/working/job_run_methods.train.R
calls code/run_methods.R
code/working/job_run_methods.train.sbatch
submits jobs one fold at a time.Based on the training sample results, select the top best X genes and fit to the validation sample. Compute and compare the methods’ performance on the validation sample.
data_training_cyclical_genes.fold.1 <- readRDS("../data/results/data_training_cyclical_genes.fold.1.rds")
data_training_cyclical_genes.fold.2 <- readRDS("../data/results/data_training_cyclical_genes.fold.2.rds")
data_training_cyclical_genes.fold.3 <- readRDS("../data/results/data_training_cyclical_genes.fold.3.rds")
data_training_cyclical_genes.fold.4 <- readRDS("../data/results/data_training_cyclical_genes.fold.4.rds")
data_training_cyclical_genes.fold.5 <- readRDS("../data/results/data_training_cyclical_genes.fold.5.rds")
data_cyclical_list <- list(data_training_cyclical_genes.fold.1,
data_training_cyclical_genes.fold.2,
data_training_cyclical_genes.fold.3,
data_training_cyclical_genes.fold.4,
data_training_cyclical_genes.fold.5)
library(gplots)
top100 <- lapply(1:5, function(i) {
rownames(data_cyclical_list[[i]])[order(data_cyclical_list[[i]]$pve,
decreasing = T)[1:100]]
})
venn( top100 )
top50 <- lapply(1:5, function(i) {
rownames(data_cyclical_list[[i]])[order(data_cyclical_list[[i]]$pve,
decreasing = T)[1:50]]
})
venn( top50)
top10 <- lapply(1:5, function(i) {
rownames(data_cyclical_list[[i]])[order(data_cyclical_list[[i]]$pve,
decreasing = T)[1:10]]
})
venn( top10)
top5 <- lapply(1:5, function(i) {
rownames(data_cyclical_list[[i]])[order(data_cyclical_list[[i]]$pve,
decreasing = T)[1:5]]
})
venn( top5)
top260 <- lapply(1:5, function(i) {
rownames(data_cyclical_list[[i]])[order(data_cyclical_list[[i]]$pve,
decreasing = T)[1:260]]
})
venn(top260)
library(gplots)
ngenes <- c(5, seq(10, nrow(data_cyclical_list[[1]]), 10))
genes_list <- lapply(1:length(ngenes), function(i) {
ngene <- ngenes[i]
tmp <- lapply(1:5, function(fold) {
rownames(data_cyclical_list[[fold]])[order(data_cyclical_list[[fold]]$pve,
decreasing = T)[1:ngene]]
})
parts <- venn(tmp, show.plot = F)
unlist(attr(parts, "intersections")$`A:B:C:D:E`)
})
names(genes_list) <- ngenes
saveRDS(genes_list,
file = "../data/results/results_topgenes.rds")
data_training_cyclical_genes.fold.1 <- readRDS("../data/results/data_training_cyclical_genes_permute.fold.1.rds")
data_training_cyclical_genes.fold.2 <- readRDS("../data/results/data_training_cyclical_genes_permute.fold.2.rds")
data_training_cyclical_genes.fold.3 <- readRDS("../data/results/data_training_cyclical_genes_permute.fold.3.rds")
data_training_cyclical_genes.fold.4 <- readRDS("../data/results/data_training_cyclical_genes_permute.fold.4.rds")
data_training_cyclical_genes.fold.5 <- readRDS("../data/results/data_training_cyclical_genes_permute.fold.5.rds")
data_cyclical_list <- list(data_training_cyclical_genes.fold.1,
data_training_cyclical_genes.fold.2,
data_training_cyclical_genes.fold.3,
data_training_cyclical_genes.fold.4,
data_training_cyclical_genes.fold.5)
library(gplots)
top100 <- lapply(1:5, function(i) {
rownames(data_cyclical_list[[i]])[order(data_cyclical_list[[i]]$pve,
decreasing = T)[1:100]]
})
venn( top100 )
top50 <- lapply(1:5, function(i) {
rownames(data_cyclical_list[[i]])[order(data_cyclical_list[[i]]$pve,
decreasing = T)[1:50]]
})
venn( top50)
top10 <- lapply(1:5, function(i) {
rownames(data_cyclical_list[[i]])[order(data_cyclical_list[[i]]$pve,
decreasing = T)[1:10]]
})
venn( top10)
top5 <- lapply(1:5, function(i) {
rownames(data_cyclical_list[[i]])[order(data_cyclical_list[[i]]$pve,
decreasing = T)[1:5]]
})
venn( top5)
library(gplots)
ngenes <- c(5, seq(10, nrow(data_cyclical_list[[1]]), 10))
genes_list <- lapply(1:length(ngenes), function(i) {
ngene <- ngenes[i]
tmp <- lapply(1:5, function(fold) {
rownames(data_cyclical_list[[fold]])[order(data_cyclical_list[[fold]]$pve,
decreasing = T)[1:ngene]]
})
parts <- venn(tmp, show.plot = F)
unlist(attr(parts, "intersections")$`A:B:C:D:E`)
})
names(genes_list) <- ngenes
#sapply(genes_list, function(x) length(x))
saveRDS(genes_list,
file = "../data/results/results_topgenes_permute.rds")
genes_list <- readRDS(file = "../data/results/results_topgenes.rds")
genes_list_permute <- readRDS(file = "../data/results/results_topgenes_permute.rds")
ngenes <- as.numeric(names(genes_list))
plot(x=ngenes[1:41],
y=sapply(genes_list, function(x) length(x))[1:41],
xlab="Top X cyclical genes in training set",
ylab="Number of shared cyclical genes",
main = "Shared cyclical genes across the five training sets",
xlim=c(0,410), pch=16, cex=.7, col="gray50")
abline(v=260, col="gray50")
ngenes <- as.numeric(names(genes_list))
plot(x=ngenes,
y=sapply(genes_list, function(x) length(x)),
xlab="Top X cyclical genes in training set",
ylab="Number of shared cyclical genes",
main = "Shared cyclical genes across the five training sets",
xlim=c(0,11100), pch=16, cex=.7, col="gray50")
points(x=ngenes,
y=sapply(genes_list_permute, function(x) length(x)),
pch=16, cex=.7, col="blue")
# ngenes <- as.numeric(names(genes_list))
# plot(x=ngenes,
# y=sapply(genes_list, function(x) length(x)),
# xlab="Top X cyclical genes in training set",
# ylab="Number of shared cyclical genes",
# main = "Shared cyclical genes in top 1000",
# xlim=c(0,1000), pch=16, cex=.7, col="gray50", ylim=c(1,250))
# points(x=ngenes,
# y=sapply(genes_list_permute, function(x) length(x)),
# pch=16, cex=.7, col="blue")
ngenes <- as.numeric(names(genes_list))
plot(x=ngenes,
y=sapply(genes_list, function(x) length(x)),
xlab="Top X cyclical genes in training set",
ylab="Number of shared cyclical genes",
main = "Shared genes in top 400 cyclical",
xlim=c(0,400), pch=16, cex=.7, col="gray50", ylim=c(1,110))
points(x=ngenes,
y=sapply(genes_list_permute, function(x) length(x)),
pch=16, cex=.7, col="blue")
abline(v=260, col="gray50")
fdr <- sapply(1:length(genes_list), function(i) {
a <- unlist(genes_list[i])
b <- unlist(genes_list_permute[i])
ll <- length(intersect(b,a))
ll/length(a)
})
names(fdr) <- names(genes_list)
saveRDS(fdr, file = "../output/method-train-classifiers-all.Rmd/fdr.rds")
fdr <- readRDS(file = "../output/method-train-classifiers-all.Rmd/fdr.rds")
plot(as.numeric(names(fdr)), fdr,
xlab="Top X cyclical genes",
ylab="Fraction of permuted in observed",
main="False discovery rate (fraction of permuted in observed",
pch=16, cex=.7)
plot(as.numeric(names(fdr))[1:1000], fdr[1:1000],
xlab="Top X cyclical genes",
ylab="Fraction of permuted in observed",
main="False discovery rate in top 1000",
pch=16, cex=.7,
xlim=c(1,1000), ylim=c(0,.05))
plot(as.numeric(names(fdr))[1:400], fdr[1:400],
xlab="Top X cyclical genes",
ylab="Fraction of permuted in observed",
main="False discovery rate in top 400",
pch=16, cex=.7,
xlim=c(1,400), ylim=c(0,.06))
abline(v=260, col="gray50")
code for summarize results
diff_time_wrapper <- function(results_list) {
methods_list <- sapply(names(results_list),
function(x) strsplit(x, split=".", fixed=TRUE)[[1]][2])
diff_time_list <- do.call(rbind, lapply(1:length(results_list), function(i) {
diff_time <- results_list[[i]]$diff_time
diff_mean <- mean(diff_time/2/pi)
# diff_se <- sd(diff_time/2/pi)/sqrt(ncol(results_list[[1]]$Y))
return(data.frame(diff_mean=diff_mean,
# diff_se=diff_se,
methods=methods_list[i]))
}) )
return(diff_time_list)
}
summarize results
# ngenes <- c(5, seq(10,400, by=10))
# train_top <- lapply(1:length(ngenes), function(i) {
# ngene <- ngenes[i]
# train_topX <- unlist(lapply(1:5, function(fold) {
# fl_name <- list.files("../data/results",
# pattern=paste0("results_train.fold.",fold,".top",ngene,".rds"),
# full.names = TRUE)
# return(fl_name)
# }) )
# return(train_topX)
# })
ngenes <- c(5, seq(10,400, by=10))
train_top <- do.call(rbind, lapply(1:length(ngenes), function(i) {
ngene <- ngenes[i]
train_topX <- do.call(rbind, lapply(1:5, function(fold) {
fl_name <- paste0("../data/results/results_train.fold.",fold,".top",ngene,".rds")
df <- readRDS(fl_name)
out <- diff_time_wrapper(df$fit.test)
out$fold <- fold
return(out)
}) )
train_topX$ngenes <- ngene
#return(train_topX)
agg_mn <- aggregate(diff_mean ~ methods,
data=train_topX, FUN=mean)
agg_sd <- aggregate(diff_mean ~ methods,
data=train_topX, FUN=sd)
obj <- data.frame(methods=agg_mn$methods,
diff_mean=agg_mn$diff_mean,
diff_se=agg_sd$diff_mean/sqrt(5))
obj$ngenes <- ngene
return(obj)
}) )
saveRDS(train_top, file = "../output/method-train-classifiers-all.Rmd/diff_time_all.rds")
ngenes <- c(5, seq(10,200, by=10))
train_top_permute <- do.call(rbind, lapply(1:length(ngenes), function(i) {
ngene <- ngenes[i]
train_topX <- do.call(rbind, lapply(1:5, function(fold) {
fl_name <- paste0("../data/results/results_train_permute.fold.",
fold,".top",ngene,".rds")
df <- readRDS(fl_name)
out <- diff_time_wrapper(df$fit.test)
out$fold <- fold
return(out)
}) )
train_topX$ngenes <- ngene
#return(train_topX)
agg_mn <- aggregate(diff_mean ~ methods,
data=train_topX, FUN=mean)
agg_sd <- aggregate(diff_mean ~ methods,
data=train_topX, FUN=sd)
obj <- data.frame(methods=agg_mn$methods,
diff_mean=agg_mn$diff_mean,
diff_se=agg_sd$diff_mean/sqrt(5))
obj$ngenes <- ngene
return(obj)
}) )
saveRDS(train_top_permute,
file = "../output/method-train-classifiers-all.Rmd/diff_time_all_permute.rds")
ngenes <- c(5, seq(10,200, by=10))
train_top_permute <- do.call(rbind, lapply(1:length(ngenes), function(i) {
ngene <- ngenes[i]
train_topX <- do.call(rbind, lapply(1:5, function(fold) {
fl_name <- paste0("../data/results/results_train_permute_onrandom.fold.",
fold,".top",ngene,".rds")
df <- readRDS(fl_name)
out <- diff_time_wrapper(df$fit.test)
out$fold <- fold
return(out)
}) )
train_topX$ngenes <- ngene
#return(train_topX)
agg_mn <- aggregate(diff_mean ~ methods,
data=train_topX, FUN=mean)
agg_sd <- aggregate(diff_mean ~ methods,
data=train_topX, FUN=sd)
obj <- data.frame(methods=agg_mn$methods,
diff_mean=agg_mn$diff_mean,
diff_se=agg_sd$diff_mean/sqrt(5))
obj$ngenes <- ngene
return(obj)
}) )
saveRDS(train_top_permute,
file = "../output/method-train-classifiers-all.Rmd/diff_time_rando_on_fucci.rds")
ngenes <- c(5, seq(10,200, by=10))
train_top_permute <- do.call(rbind, lapply(1:length(ngenes), function(i) {
ngene <- ngenes[i]
train_topX <- do.call(rbind, lapply(1:5, function(fold) {
fl_name <- paste0("../data/results/results_train_permute_oncyclical.fold.",
fold,".top",ngene,".rds")
df <- readRDS(fl_name)
out <- diff_time_wrapper(df$fit.test)
out$fold <- fold
return(out)
}) )
train_topX$ngenes <- ngene
#return(train_topX)
agg_mn <- aggregate(diff_mean ~ methods,
data=train_topX, FUN=mean)
agg_sd <- aggregate(diff_mean ~ methods,
data=train_topX, FUN=sd)
obj <- data.frame(methods=agg_mn$methods,
diff_mean=agg_mn$diff_mean,
diff_se=agg_sd$diff_mean/sqrt(5))
obj$ngenes <- ngene
return(obj)
}) )
saveRDS(train_top_permute,
file = "../output/method-train-classifiers-all.Rmd/diff_time_fucci_on_rando.rds")
train_top <- readRDS("../output/method-train-classifiers-all.Rmd/diff_time_all.rds")
library(ggplot2)
ggplot(train_top,
aes(x=ngenes, y=diff_mean, color=methods)) +
geom_errorbar(subset(train_top, methods == "supervised"),
mapping=aes(ymin=diff_mean-diff_se, ymax=diff_mean+diff_se)) +
geom_point() + geom_line(lty=3) +
ylab("Error margin (% arc length)") + xlab("Top X cyclical genes") +
ggtitle("Prediction error in the training set")
ggplot(subset(train_top, methods == "supervised"),
aes(x=ngenes, y=diff_mean)) +
geom_errorbar(subset(train_top, methods == "supervised"),
mapping=aes(ymin=diff_mean-diff_se, ymax=diff_mean+diff_se),
color="blue") +
geom_point() + geom_line(lty=3) +
ylab("Error margin (% arc length)") + xlab("Top X cyclical genes") +
ggtitle("Prediction error in the training set")
Check between 5 genes and 100 genes
#ngenes <- c(5, seq(10,400, by=10))
ngenes <- c(5, 10, 20, 100)
train_top <- lapply(1:length(ngenes), function(i) {
ngene <- ngenes[i]
train_topX <- do.call(rbind, lapply(1:5, function(fold) {
fl_name <- paste0("../data/results/results_train.fold.",fold,".top",ngene,".rds")
df <- readRDS(fl_name)
out <- diff_time_wrapper(df$fit.test)
out$fold <- fold
return(out)
}) )
train_topX$ngenes <- ngene
return(data.frame(train_topX))
})
tmp <- do.call(cbind, train_top)
tmp <- tmp[tmp$methods== "supervised",]
tmp <- tmp[,c(3,1,5, 9, 13)]
colnames(tmp) <- c("fold", paste0("top",c(5, 10, 20, 100)))
train_top_permute <- readRDS("../output/method-train-classifiers-all.Rmd/diff_time_all_permute.rds")
library(ggplot2)
ggplot(train_top_permute,
aes(x=ngenes, y=diff_mean, color=methods)) +
geom_errorbar(subset(train_top_permute, methods == "supervised"),
mapping=aes(ymin=diff_mean-diff_se, ymax=diff_mean+diff_se)) +
geom_point() + geom_line(lty=3) +
ylab("Error margin (% arc length)") + xlab("Top X cyclical genes") +
ggtitle("Prediction error in the training set")
ggplot(subset(train_top_permute, methods == "supervised"),
aes(x=ngenes, y=diff_mean)) +
geom_errorbar(subset(train_top_permute, methods == "supervised"),
mapping=aes(ymin=diff_mean-diff_se, ymax=diff_mean+diff_se),
color="blue") +
geom_point() + geom_line(lty=3) +
ylab("Error margin (% arc length)") + xlab("Top X cyclical genes") +
ggtitle("Prediction error in the training set")
train_rando_on_fucci <- readRDS("../output/method-train-classifiers-all.Rmd/diff_time_rando_on_fucci.rds")
library(ggplot2)
ggplot(subset(train_rando_on_fucci, methods == "supervised"),
aes(x=ngenes, y=diff_mean)) +
geom_errorbar(subset(train_rando_on_fucci, methods == "supervised"),
mapping=aes(ymin=diff_mean-diff_se, ymax=diff_mean+diff_se),
color="blue") +
geom_point() + geom_line(lty=3) +
ylab("Error margin (% arc length)") + xlab("Top X cyclical genes") +
ggtitle("Random cyclical genes predicting fucci times")
train_fucci_on_rando <- readRDS("../output/method-train-classifiers-all.Rmd/diff_time_fucci_on_rando.rds")
ggplot(subset(train_fucci_on_rando, methods == "supervised"),
aes(x=ngenes, y=diff_mean)) +
geom_errorbar(subset(train_fucci_on_rando, methods == "supervised"),
mapping=aes(ymin=diff_mean-diff_se, ymax=diff_mean+diff_se),
color="blue") +
geom_point() + geom_line(lty=3) +
ylab("Error margin (% arc length)") + xlab("Top X cyclical genes") +
ggtitle("fucci cyclical genes predicting random times")
sessionInfo()
R version 3.4.3 (2017-11-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Scientific Linux 7.4 (Nitrogen)
Matrix products: default
BLAS/LAPACK: /software/openblas-0.2.19-el7-x86_64/lib/libopenblas_haswellp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggplot2_2.2.1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.17 digest_0.6.15 rprojroot_1.3-2 plyr_1.8.4
[5] grid_3.4.3 gtable_0.2.0 backports_1.1.2 git2r_0.21.0
[9] magrittr_1.5 scales_0.5.0 evaluate_0.10.1 pillar_1.1.0
[13] rlang_0.1.6 stringi_1.1.6 lazyeval_0.2.1 rmarkdown_1.10
[17] labeling_0.3 tools_3.4.3 stringr_1.2.0 munsell_0.4.3
[21] yaml_2.1.16 compiler_3.4.3 colorspace_1.3-2 htmltools_0.3.6
[25] knitr_1.20 tibble_1.4.2
This R Markdown site was created with workflowr