Last updated: 2017-11-07
Code version: 2c05d59
Smemo et al 2014 provides a mouse heart RNA-seq data set. The data set contains 2 conditions, and each condition has only 2 samples. We’ll see if Gaussian derivatives can handle this difficult situation.
counts = read.table("../data/smemo.txt", header = T, row.name = 1)
counts = counts[, -5]
## Number of genes
nrow(counts)
[1] 23587
## Number of samples
ncol(counts)
[1] 4
## Sneak peek
head(counts, 10)
lv1 lv2 rv1 rv2
Itm2a 2236 2174 9484 10883
Sergef 97 90 341 408
Fam109a 383 314 1864 2384
Dhx9 2688 2631 18501 20879
Ssu72 762 674 2806 3435
Olfr1018 0 0 0 0
Fam71e2 0 0 0 0
Eif2b2 736 762 3081 3601
Mks1 77 82 398 685
Hebp2 203 205 732 921
In the first exploratory investigation, we only choose genes whose expression levels are not all zero for all 4 samples. This is to prevent the complications brought by “non-expressed” genes.
counts.nonzero = counts[rowSums(counts) >= 1, ]
## Equivalently
## counts.nonzero = counts[apply(counts, 1, max) >= 1, ]
design = model.matrix(~c(0, 0, 1, 1))
## Number of genes expressed
nrow(counts.nonzero)
[1] 18615
Then we feed the count matrix to the pipeline to get the summary statistics: \(\hat\beta\), \(\hat s\), \(z\).
counts_to_summary = function (counts, design) {
dgecounts = edgeR::calcNormFactors(edgeR::DGEList(counts = counts, group = design[, 2]))
v = limma::voom(dgecounts, design, plot = FALSE)
lim = limma::lmFit(v)
r.ebayes = limma::eBayes(lim)
p = r.ebayes$p.value[, 2]
t = r.ebayes$t[, 2]
z = sign(t) * qnorm(1 - p/2)
betahat = lim$coefficients[,2]
sebetahat = betahat / z
return (list(betahat = betahat, sebetahat = sebetahat, z = z))
}
Suppose \(z\) are correlated null, will they be well fitted by 10 Gaussian derivatives?
source("../code/gdash.R")
Warning: replacing previous import 'Matrix::crossprod' by 'gmp::crossprod'
when loading 'cvxr'
Warning: replacing previous import 'Matrix::tcrossprod' by
'gmp::tcrossprod' when loading 'cvxr'
source("../code/gdfit.R")
w.fit = gdfit(z, gd.ord = 10)
plot.gdfit(z, w.fit$w, w.fit$gd.ord, breaks = 100)
plot.gdfit(z, w.fit$w, w.fit$gd.ord, std.norm = FALSE, breaks = 100)
plot(ecdf(z))
plot.gdfit(z, w.fit$w, w.fit$gd.ord, breaks = "Sturges")
plot.gdfit(z, w.fit$w, w.fit$gd.ord, std.norm = FALSE, breaks = "Sturges")
## Remove all singletons
counts.nonsingleton = counts[rowSums(counts) > 1, ]
## Number of non-singleton genes
nrow(counts.nonsingleton)
[1] 18075
w.fit = gdfit(z, gd.ord = 10)
plot.gdfit(z, w.fit$w, w.fit$gd.ord, breaks = 100)
plot.gdfit(z, w.fit$w, w.fit$gd.ord, std.norm = FALSE, breaks = 100)
plot(ecdf(z))
## Remove all zeros
counts.pos = counts[apply(counts, 1, min) > 0, ]
## Number of positive genes
nrow(counts.pos)
[1] 15573
w.fit = gdfit(z, gd.ord = 10)
cat(rbind(paste(0 : w.fit$gd.ord, ":"), paste(w.fit$w, ";")))
0 : 1 ; 1 : 0.0728018367613569 ; 2 : 1.90276901242463 ; 3 : 0.487348637367052 ; 4 : 2.25733510399704 ; 5 : 0.946549908952083 ; 6 : 1.49164087236149 ; 7 : 0.902213125512891 ; 8 : 0.427623049883572 ; 9 : 0.342914618843359 ; 10 : 0.011101038914766 ;
plot.gdfit(z, w.fit$w, w.fit$gd.ord, breaks = 100)
plot.gdfit(z, w.fit$w, w.fit$gd.ord, std.norm = FALSE, breaks = 100)
plot(ecdf(z))
plot(betahat, sebetahat, cex = 0.7, pch = 19)
plot(betahat, z, cex = 0.7, pch = 19)
fit.gdash = gdash(betahat, sebetahat)
fit.gdash
$fitted_g
$pi
[1] 1.000000e+00 2.528802e-09 2.400118e-09 2.176700e-09 1.830511e-09
[6] 1.379624e-09 9.118214e-10 5.331526e-10 2.878238e-10 1.520977e-10
[11] 8.278542e-11 4.833395e-11 3.156007e-11 2.470504e-11 2.695689e-11
[16] 5.699028e-11 1.271587e-10 5.089537e-11 1.086289e-11 4.376792e-12
[21] 2.589195e-12 1.842740e-12 1.463606e-12
$mean
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
$sd
[1] 0.000000000 0.007302517 0.010327318 0.014605033 0.020654636
[6] 0.029210066 0.041309272 0.058420132 0.082618544 0.116840265
[11] 0.165237087 0.233680530 0.330474174 0.467361059 0.660948349
[16] 0.934722118 1.321896697 1.869444237 2.643793394 3.738888474
[21] 5.287586788 7.477776948 10.575173576
attr(,"row.names")
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
attr(,"class")
[1] "normalmix"
$w
[1] 1.000000e+00 -5.240201e-02 1.789038e+00 -3.560324e-01 1.788218e+00
[6] -5.932673e-01 7.470306e-01 -4.652369e-01 1.611175e-07 -1.348388e-01
[11] -1.229651e-07
$niter
[1] 3
$converged
[1] TRUE
fit.ash = ashr::ash(betahat, sebetahat)
lfsr.ash = ashr::get_lfsr(fit.ash)
sum(lfsr.ash <= 0.05)
[1] 3839
fit.gdash.ash = ashr::ash(betahat, sebetahat, fixg = TRUE, g = fit.gdash$fitted_g)
lfsr.gdash.ash = ashr::get_lfsr(fit.gdash.ash)
sum(lfsr.gdash.ash <= 0.05)
[1] 0
pval = (1 - pnorm(abs(z))) * 2
pval.BH = p.adjust(pval, method = "BH")
sum(pval.BH <= 0.05)
[1] 3087
sessionInfo()
R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] Rmosek_7.1.3 PolynomF_0.94 cvxr_0.0.0.9400 REBayes_0.85
[5] Matrix_1.2-11 SQUAREM_2017.10-1 EQL_1.0-0 ttutils_1.0-1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.13 knitr_1.17 magrittr_1.5
[4] edgeR_3.20.1 MASS_7.3-47 pscl_1.5.2
[7] doParallel_1.0.11 lattice_0.20-35 foreach_1.4.3
[10] ashr_2.1-27 stringr_1.2.0 tools_3.4.2
[13] parallel_3.4.2 grid_3.4.2 git2r_0.19.0
[16] iterators_1.0.8 htmltools_0.3.6 assertthat_0.2.0
[19] yaml_2.1.14 rprojroot_1.2 digest_0.6.12
[22] gmp_0.5-13.1 codetools_0.2-15 evaluate_0.10.1
[25] rmarkdown_1.6 limma_3.34.0 stringi_1.1.5
[28] compiler_3.4.2 backports_1.1.1 locfit_1.5-9.1
[31] truncnorm_1.0-7
This R Markdown site was created with workflowr