Gaussian derivatives applied to Smemo’s data

Last updated: 2017-11-07

Code version: 2c05d59

Introduction

Smemo et al 2014 provides a mouse heart RNA-seq data set. The data set contains 2 conditions, and each condition has only 2 samples. We’ll see if Gaussian derivatives can handle this difficult situation.

counts = read.table("../data/smemo.txt", header = T, row.name = 1)
counts = counts[, -5]

## Number of genes
nrow(counts)

[1] 23587

## Number of samples
ncol(counts)

[1] 4

## Sneak peek
head(counts, 10)

          lv1  lv2   rv1   rv2
Itm2a    2236 2174  9484 10883
Sergef     97   90   341   408
Fam109a   383  314  1864  2384
Dhx9     2688 2631 18501 20879
Ssu72     762  674  2806  3435
Olfr1018    0    0     0     0
Fam71e2     0    0     0     0
Eif2b2    736  762  3081  3601
Mks1       77   82   398   685
Hebp2     203  205   732   921

Preprocessing

In the first exploratory investigation, we only choose genes whose expression levels are not all zero for all 4 samples. This is to prevent the complications brought by “non-expressed” genes.

counts.nonzero = counts[rowSums(counts) >= 1, ]
## Equivalently
## counts.nonzero = counts[apply(counts, 1, max) >= 1, ]
design = model.matrix(~c(0, 0, 1, 1))
## Number of genes expressed
nrow(counts.nonzero)

[1] 18615

Then we feed the count matrix to the pipeline to get the summary statistics: \(\hat\beta\), \(\hat s\), \(z\).

counts_to_summary = function (counts, design) {
  dgecounts = edgeR::calcNormFactors(edgeR::DGEList(counts = counts, group = design[, 2]))
  v = limma::voom(dgecounts, design, plot = FALSE)
  lim = limma::lmFit(v)
  r.ebayes = limma::eBayes(lim)
  p = r.ebayes$p.value[, 2]
  t = r.ebayes$t[, 2]
  z = sign(t) * qnorm(1 - p/2)
  betahat = lim$coefficients[,2]
  sebetahat = betahat / z
  return (list(betahat = betahat, sebetahat = sebetahat, z = z))
}

Fitting \(z\) with Gaussian derivatives

Suppose \(z\) are correlated null, will they be well fitted by 10 Gaussian derivatives?

source("../code/gdash.R")

Warning: replacing previous import 'Matrix::crossprod' by 'gmp::crossprod'
when loading 'cvxr'

Warning: replacing previous import 'Matrix::tcrossprod' by
'gmp::tcrossprod' when loading 'cvxr'

source("../code/gdfit.R")
w.fit = gdfit(z, gd.ord = 10)
plot.gdfit(z, w.fit$w, w.fit$gd.ord, breaks = 100)

plot.gdfit(z, w.fit$w, w.fit$gd.ord, std.norm = FALSE, breaks = 100)

plot(ecdf(z))

plot.gdfit(z, w.fit$w, w.fit$gd.ord, breaks = "Sturges")

plot.gdfit(z, w.fit$w, w.fit$gd.ord, std.norm = FALSE, breaks = "Sturges")

Remove two peaks

## Remove all singletons
counts.nonsingleton = counts[rowSums(counts) > 1, ]
## Number of non-singleton genes
nrow(counts.nonsingleton)

[1] 18075

w.fit = gdfit(z, gd.ord = 10)
plot.gdfit(z, w.fit$w, w.fit$gd.ord, breaks = 100)

plot.gdfit(z, w.fit$w, w.fit$gd.ord, std.norm = FALSE, breaks = 100)

plot(ecdf(z))

Higher expression

## Remove all zeros
counts.pos = counts[apply(counts, 1, min) > 0, ]
## Number of positive genes
nrow(counts.pos)

[1] 15573

w.fit = gdfit(z, gd.ord = 10)
cat(rbind(paste(0 : w.fit$gd.ord, ":"), paste(w.fit$w, ";")))

0 : 1 ; 1 : 0.0728018367613569 ; 2 : 1.90276901242463 ; 3 : 0.487348637367052 ; 4 : 2.25733510399704 ; 5 : 0.946549908952083 ; 6 : 1.49164087236149 ; 7 : 0.902213125512891 ; 8 : 0.427623049883572 ; 9 : 0.342914618843359 ; 10 : 0.011101038914766 ;

plot.gdfit(z, w.fit$w, w.fit$gd.ord, breaks = 100)

plot.gdfit(z, w.fit$w, w.fit$gd.ord, std.norm = FALSE, breaks = 100)

plot(ecdf(z))

plot(betahat, sebetahat, cex = 0.7, pch = 19)

plot(betahat, z, cex = 0.7, pch = 19)

fit.gdash = gdash(betahat, sebetahat)
fit.gdash

$fitted_g
$pi
 [1] 1.000000e+00 2.528802e-09 2.400118e-09 2.176700e-09 1.830511e-09
 [6] 1.379624e-09 9.118214e-10 5.331526e-10 2.878238e-10 1.520977e-10
[11] 8.278542e-11 4.833395e-11 3.156007e-11 2.470504e-11 2.695689e-11
[16] 5.699028e-11 1.271587e-10 5.089537e-11 1.086289e-11 4.376792e-12
[21] 2.589195e-12 1.842740e-12 1.463606e-12

$mean
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

$sd
 [1]  0.000000000  0.007302517  0.010327318  0.014605033  0.020654636
 [6]  0.029210066  0.041309272  0.058420132  0.082618544  0.116840265
[11]  0.165237087  0.233680530  0.330474174  0.467361059  0.660948349
[16]  0.934722118  1.321896697  1.869444237  2.643793394  3.738888474
[21]  5.287586788  7.477776948 10.575173576

attr(,"row.names")
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
attr(,"class")
[1] "normalmix"

$w
 [1]  1.000000e+00 -5.240201e-02  1.789038e+00 -3.560324e-01  1.788218e+00
 [6] -5.932673e-01  7.470306e-01 -4.652369e-01  1.611175e-07 -1.348388e-01
[11] -1.229651e-07

$niter
[1] 3

$converged
[1] TRUE

fit.ash = ashr::ash(betahat, sebetahat)
lfsr.ash = ashr::get_lfsr(fit.ash)
sum(lfsr.ash <= 0.05)

[1] 3839

fit.gdash.ash = ashr::ash(betahat, sebetahat, fixg = TRUE, g = fit.gdash$fitted_g)
lfsr.gdash.ash = ashr::get_lfsr(fit.gdash.ash)
sum(lfsr.gdash.ash <= 0.05)

[1] 0

pval = (1 - pnorm(abs(z))) * 2
pval.BH = p.adjust(pval, method = "BH")
sum(pval.BH <= 0.05)

[1] 3087

Session information

sessionInfo()

R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] Rmosek_7.1.3      PolynomF_0.94     cvxr_0.0.0.9400   REBayes_0.85     
[5] Matrix_1.2-11     SQUAREM_2017.10-1 EQL_1.0-0         ttutils_1.0-1    

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.13      knitr_1.17        magrittr_1.5     
 [4] edgeR_3.20.1      MASS_7.3-47       pscl_1.5.2       
 [7] doParallel_1.0.11 lattice_0.20-35   foreach_1.4.3    
[10] ashr_2.1-27       stringr_1.2.0     tools_3.4.2      
[13] parallel_3.4.2    grid_3.4.2        git2r_0.19.0     
[16] iterators_1.0.8   htmltools_0.3.6   assertthat_0.2.0 
[19] yaml_2.1.14       rprojroot_1.2     digest_0.6.12    
[22] gmp_0.5-13.1      codetools_0.2-15  evaluate_0.10.1  
[25] rmarkdown_1.6     limma_3.34.0      stringi_1.1.5    
[28] compiler_3.4.2    backports_1.1.1   locfit_1.5-9.1   
[31] truncnorm_1.0-7

This R Markdown site was created with workflowr