Here, we becnhmark several factor analyses methods on single cell RNA sequencing (scRNA-Seq) data obtained from human human brain samples (Darmanis et. al., 2015). All of these methods are used to uncover variation associated with different cell types and compared against each other. This dataset is included in an R data package (“iasvaExamples”) containing data examples for IA-SVA (https://github.com/dleelab/iasvaExamples). To install the ‘iasvaExamples’ package, follow the instructions provided in the GitHub page.

Load packages

rm(list=ls())
library(iasva)
library(iasvaExamples)
library(sva)
library(RUVSeq)
library(Rtsne)
library(pheatmap)
library(corrplot)
library(DescTools) #pcc i.e., Pearson's contingency coefficient
library(RColorBrewer)
library(zinbwave)
library(nnet)
library(SummarizedExperiment)
color.vec <- brewer.pal(8, "Set1")
set.seed(3532379)
# To make the task more challenging we selected 
  #1000 genes to be used in downstream analyses. 
num.genes <- 1000

Load the brain single cell RNA-Seq data

data("Darmanis_Brain_scRNAseq_Read_Counts")
data("Darmanis_Brain_scRNAseq_Annotations")
ls()

## [1] "color.vec"                          
## [2] "Darmanis_Brain_scRNAseq_Annotations"
## [3] "Darmanis_Brain_scRNAseq_Read_Counts"
## [4] "num.genes"

counts.total <- Darmanis_Brain_scRNAseq_Read_Counts
anns.total <- Darmanis_Brain_scRNAseq_Annotations
dim(anns.total)

## [1] 461   6

dim(counts.total)

## [1] 23267   461

summary(anns.total)

##          run              tissue                cell_type  
##  SRR1974543:  1   cortex     :426   neurons          :129  
##  SRR1974544:  1   hippocampus: 35   fetal_quiescent  :109  
##  SRR1974545:  1                     astrocytes       : 62  
##  SRR1974546:  1                     hybrid           : 46  
##  SRR1974547:  1                     oligodendrocytes : 37  
##  SRR1974548:  1                     fetal_replicating: 25  
##  (Other)   :455                     (Other)          : 53  
##                  age           c1_chip_id   sample_name 
##  prenatal 16-18 W  :133   nochipID2 : 77   AB_S4  : 77  
##  postnatal 50 years: 77   nochipID8 : 55   AB_S11 : 62  
##  postnatal 37 years: 62   nochipID11: 45   AB_S8  : 57  
##  postnatal 54 years: 57   1772078217: 33   AB_S7  : 55  
##  postnatal 21 years: 55   1772078236: 33   FB_S2  : 45  
##  postnatal 63 years: 49   nochipID13: 33   AB_S5  : 44  
##  (Other)           : 28   (Other)   :185   (Other):121

table(anns.total$tissue, anns.total$cell_type)

##              
##               astrocytes endothelial fetal_quiescent fetal_replicating
##   cortex              62          17             109                25
##   hippocampus          0           3               0                 0
##              
##               hybrid microglia neurons oligodendrocytes OPC
##   cortex          44         8     128               31   2
##   hippocampus      2         7       1                6  16

ContCoef(table(anns.total$tissue, anns.total$cell_type))

## [1] 0.5794563

table(anns.total$age, anns.total$cell_type)

##                     
##                      astrocytes endothelial fetal_quiescent
##   postnatal 21 years          3           0               0
##   postnatal 22 years          0           1               1
##   postnatal 37 years          7           1               0
##   postnatal 47 years         11           2               0
##   postnatal 50 years         38           0               0
##   postnatal 54 years          1           0               0
##   postnatal 63 years          2          16               0
##   prenatal 16-18 W            0           0             108
##                     
##                      fetal_replicating hybrid microglia neurons
##   postnatal 21 years                 0      7         0      34
##   postnatal 22 years                 0      0         0       2
##   postnatal 37 years                 0      3         1      49
##   postnatal 47 years                 0      0         5       6
##   postnatal 50 years                 0     19         0      19
##   postnatal 54 years                 0     17         6       0
##   postnatal 63 years                 0      0         3      19
##   prenatal 16-18 W                  25      0         0       0
##                     
##                      oligodendrocytes OPC
##   postnatal 21 years               11   0
##   postnatal 22 years                0   0
##   postnatal 37 years                1   0
##   postnatal 47 years                0   0
##   postnatal 50 years                1   0
##   postnatal 54 years               15  18
##   postnatal 63 years                9   0
##   prenatal 16-18 W                  0   0

ContCoef(table(anns.total$age, anns.total$cell_type))

## [1] 0.8256324

table(anns.total$sample_name, anns.total$cell_type)

##         
##          astrocytes endothelial fetal_quiescent fetal_replicating hybrid
##   AB_S1          11           2               0                 0      0
##   AB_S11          7           1               0                 0      3
##   AB_S2           2           0               0                 0      0
##   AB_S3           0           1               1                 0      0
##   AB_S4          38           0               0                 0     19
##   AB_S5           0          16               0                 0      0
##   AB_S7           3           0               0                 0      7
##   AB_S8           1           0               0                 0     17
##   FB_S1           0           0               5                21      0
##   FB_S2           0           0              43                 2      0
##   FB_S3           0           0              33                 0      0
##   FB_S6           0           0              27                 2      0
##         
##          microglia neurons oligodendrocytes OPC
##   AB_S1          5       6                0   0
##   AB_S11         1      49                1   0
##   AB_S2          2       1                0   0
##   AB_S3          0       2                0   0
##   AB_S4          0      19                1   0
##   AB_S5          1      18                9   0
##   AB_S7          0      34               11   0
##   AB_S8          6       0               15  18
##   FB_S1          0       0                0   0
##   FB_S2          0       0                0   0
##   FB_S3          0       0                0   0
##   FB_S6          0       0                0   0

ContCoef(table(anns.total$sample_name, anns.total$cell_type))

## [1] 0.8602264

table(anns.total$c1_chip_id, anns.total$cell_type)

##             
##              astrocytes endothelial fetal_quiescent fetal_replicating
##   1772078217          1           0               0                 0
##   1772078218          0           0               0                 0
##   1772078236          2           0               0                 0
##   1772078237          5           1               0                 0
##   nochipID10          0           0               5                21
##   nochipID11          0           0              43                 2
##   nochipID12          1           2               0                 0
##   nochipID13          0           0              33                 0
##   nochipID14          2           0               0                 0
##   nochipID15          0           1               1                 0
##   nochipID2          38           0               0                 0
##   nochipID3           0          13               0                 0
##   nochipID4           0           0              27                 2
##   nochipID5           0           3               0                 0
##   nochipID8           3           0               0                 0
##   nochipID9          10           0               0                 0
##             
##              hybrid microglia neurons oligodendrocytes OPC
##   1772078217     15         0       0               15   2
##   1772078218      2         6       0                0  16
##   1772078236      3         1      26                1   0
##   1772078237      0         0      23                0   0
##   nochipID10      0         0       0                0   0
##   nochipID11      0         0       0                0   0
##   nochipID12      0         2       0                0   0
##   nochipID13      0         0       0                0   0
##   nochipID14      0         2       1                0   0
##   nochipID15      0         0       2                0   0
##   nochipID2      19         0      19                1   0
##   nochipID3       0         0      17                3   0
##   nochipID4       0         0       0                0   0
##   nochipID5       0         1       1                6   0
##   nochipID8       7         0      34               11   0
##   nochipID9       0         3       6                0   0

ContCoef(table(anns.total$c1_chip_id, anns.total$cell_type))

## [1] 0.8802176

The annotations describing samples and experimental settings are stored in “anns” and the read counts information is stored in “counts”.

For the downstream analyses, select neurons, oligodendrcytes and astrocytes from 2 cell types and 8 individuals.

# Select neurons and astrocytes for downstream analyses
selected.cells <- intersect(which(anns.total$tissue=="cortex"),
                            which(anns.total$cell_type=="neurons"|anns.total$cell_type=="astrocytes"))
counts <- counts.total[, selected.cells]
anns <- droplevels(anns.total[selected.cells,])
table(anns$tissue, anns$cell_type)

     astrocytes neurons

cortex 62 128

table(anns$tissue, anns$age)

     postnatal 21 years postnatal 22 years postnatal 37 years

cortex 37 2 56

     postnatal 47 years postnatal 50 years postnatal 54 years

cortex 17 57 1

     postnatal 63 years

cortex 20

table(anns$age, anns$sample_name)

                 AB_S1 AB_S11 AB_S2 AB_S3 AB_S4 AB_S5 AB_S7 AB_S8

postnatal 21 years 0 0 0 0 0 0 37 0 postnatal 22 years 0 0 0 2 0 0 0 0 postnatal 37 years 0 56 0 0 0 0 0 0 postnatal 47 years 17 0 0 0 0 0 0 0 postnatal 50 years 0 0 0 0 57 0 0 0 postnatal 54 years 0 0 0 0 0 0 0 1 postnatal 63 years 0 0 3 0 0 17 0 0

table(anns$age, anns$c1_chip_id)

                 1772078217 1772078236 1772078237 nochipID12

postnatal 21 years 0 0 0 0 postnatal 22 years 0 0 0 0 postnatal 37 years 0 28 28 0 postnatal 47 years 0 0 0 1 postnatal 50 years 0 0 0 0 postnatal 54 years 1 0 0 0 postnatal 63 years 0 0 0 0

                 nochipID14 nochipID15 nochipID2 nochipID3 nochipID8

postnatal 21 years 0 0 0 0 37 postnatal 22 years 0 2 0 0 0 postnatal 37 years 0 0 0 0 0 postnatal 47 years 0 0 0 0 0 postnatal 50 years 0 0 57 0 0 postnatal 54 years 0 0 0 0 0 postnatal 63 years 3 0 0 17 0

                 nochipID9

postnatal 21 years 0 postnatal 22 years 0 postnatal 37 years 0 postnatal 47 years 16 postnatal 50 years 0 postnatal 54 years 0 postnatal 63 years 0

table(anns$tissue, anns$c1_chip_id)

     1772078217 1772078236 1772078237 nochipID12 nochipID14 nochipID15

cortex 1 28 28 1 3 2

     nochipID2 nochipID3 nochipID8 nochipID9

cortex 57 17 37 16

table(anns$cell_type, anns$sample_name)

         AB_S1 AB_S11 AB_S2 AB_S3 AB_S4 AB_S5 AB_S7 AB_S8

astrocytes 11 7 2 0 38 0 3 1 neurons 6 49 1 2 19 17 34 0

table(anns$cell_type, anns$tissue)

         cortex

astrocytes 62 neurons 128

dim(counts)

[1] 23267 190

dim(anns)

[1] 190 6

dropout.rate <- sum(counts==0)/(dim(counts)[1]*dim(counts)[2])*100
dropout.rate

[1] 66.32653

## Filter out lowly-expressed genes 
filter = apply(counts, 1, function(x) length(x[x>5])>=3)
counts = as.matrix(counts[filter,])
dim(counts)

[1] 21806 190

dropout.rate <- sum(counts==0)/(dim(counts)[1]*dim(counts)[2])*100
dropout.rate

[1] 64.13206

# To make this task more challenging for benchamarking purposes
# we only used randomly selected 1000 genes in downstream analyses
rand.gene.list <- sample(1:nrow(counts), num.genes)
counts <- counts[rand.gene.list,]

Cell_Type <- anns$cell_type
Tissue <- anns$tissue
Cell_Type_Num <- as.integer(Cell_Type)
Samples <- as.integer(anns$sample_name)

Calculate geometric library size, i.e., library size of log-transfromed read counts

It is well known that the geometric library size (i.e., library size of log-transfromed read counts) or proportion of expressed genes in each cell explains very large portion of variability of scRNA-Seq data (Hicks et. al. 2015 BioRxiv, McDavid et. al. 2016 Nature Biotechnology). Frequently, the first principal component of log-transformed scRNA-Seq read counts is highly correlated with the geometric library size. (r > 0.9). Here, we calculate it and show the high correlation. Later this vector will be used as an known factor in IA-SVA algorithm.

Geo_Lib_Size <- colSums(log(counts+1))
barplot(Geo_Lib_Size, xlab="Cell")

lcounts <- log(counts + 1)
pca.res = svd(lcounts - rowMeans(lcounts))$v
cor(Geo_Lib_Size, pca.res[,1])

## [1] 0.9435376

cor(Geo_Lib_Size, Cell_Type_Num)

## [1] 0.1815641

run analyses

Library size, tissue, and sample ID are used as covariates in all factor analyses except for PCA. 3 SVs are captured with each method and the SV that has the maximum correlation with the cell type assignments is selected among these 3. This is especially important for a fair comparison of PCA.

mod1 = model.matrix(~Geo_Lib_Size+anns$sample_name)
mod0 = cbind(mod1[,1])
summ_exp <- SummarizedExperiment(assays = counts)
### IA-SVA 
start_time <- Sys.time()
iasva.res<- iasva(summ_exp, mod1[,-1],verbose=FALSE, permute=FALSE, num.sv=3)

## IA-SVA running...
## SV1 Detected!
## SV2 Detected!
## SV3 Detected!
## # of significant surrogate variables: 3

end_time <- Sys.time()
iasva.time <- start_time - end_time
iasva.sv <- iasva.res$sv
plot(iasva.sv[,1], iasva.sv[,2],col=Cell_Type, xlab="SV1", ylab="SV2")

pairs(iasva.sv, col=Cell_Type, main="IA-SVA")

max.cor.iasva <- which.is.max(c(abs(cor(iasva.sv[,1], Cell_Type_Num)),
                abs(cor(iasva.sv[,2], Cell_Type_Num)),
                abs(cor(iasva.sv[,3], Cell_Type_Num))))
max.cor.iasva

## [1] 1

## PCA
ldat0 = log(counts + 1)
start_time <- Sys.time()
pca.sv = svd(ldat0 - rowMeans(ldat0))$v
end_time <- Sys.time()
pca.time <- start_time - end_time
plot(pca.sv[,1], pca.sv[,2],col=Cell_Type, xlab="SV1", ylab="SV2")

pairs(pca.sv[,1:3], col=Cell_Type, main="PCA")

max.cor.pca <- which.is.max(c(abs(cor(pca.sv[,1], Cell_Type_Num)),
                abs(cor(pca.sv[,2], Cell_Type_Num)),
                abs(cor(pca.sv[,3], Cell_Type_Num))))
max.cor.pca

## [1] 2

## USVA
start_time <- Sys.time()
usva.sv = svaseq(counts,mod1,mod0,n.sv=3)$sv

## Number of significant surrogate variables is:  3 
## Iteration (out of 5 ):1  2  3  4  5

end_time <- Sys.time()
usva.time <- start_time - end_time
plot(usva.sv[,1], usva.sv[,2],col=Cell_Type, xlab="SV1", ylab="SV2")

pairs(usva.sv[,1:3], col=Cell_Type, main="USVA")

max.cor.usva <- which.is.max(c (abs(cor(usva.sv[,1], Cell_Type_Num)),
                abs(cor(usva.sv[,2], Cell_Type_Num)),
                abs(cor(usva.sv[,3], Cell_Type_Num))))
max.cor.usva

## [1] 2

### RUVres
design <- model.matrix(~Geo_Lib_Size+anns$sample_name)
start_time <- Sys.time()
y <- DGEList(counts=counts)
y <- calcNormFactors(y, method="TMM")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
set <- betweenLaneNormalization(counts, which="upper")
genes = rep(TRUE,dim(counts)[1])
ruvres.sv = RUVr(set,genes,k=3,res,isLog = T)$W
end_time <- Sys.time()
ruvres.time <- start_time - end_time
plot(ruvres.sv[,1], ruvres.sv[,2],col=Cell_Type, xlab="SV1", ylab="SV2")

pairs(ruvres.sv[,1:3], col=Cell_Type)

max.cor.ruvres <- which.is.max (c (abs(cor(ruvres.sv[,1], Cell_Type_Num)),
                abs(cor(ruvres.sv[,2], Cell_Type_Num)),
                abs(cor(ruvres.sv[,3], Cell_Type_Num))))
max.cor.ruvres

## [1] 1

### RUVemp
start_time <- Sys.time()
y <- DGEList(counts=counts)
y <- calcNormFactors(y, method="TMM")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)
emp.genes =rank(lrt$table$LR) > 400
ruvemp.sv <- RUVg(counts, emp.genes, k=3)$W
end_time <- Sys.time()
ruvemp.time <- start_time - end_time
plot(ruvemp.sv[,1], ruvemp.sv[,2],col=Cell_Type, xlab="SV1", ylab="SV2")

pairs(ruvemp.sv[,1:3], col=Cell_Type)

max.cor.ruvemp <- which.is.max(c (abs(cor(ruvemp.sv[,1], Cell_Type_Num)),
                abs(cor(ruvemp.sv[,2], Cell_Type_Num)),
                abs(cor(ruvemp.sv[,3], Cell_Type_Num))))
max.cor.ruvemp

## [1] 2

### ZINB WAVE
se.data <- SummarizedExperiment(assays=list(counts=counts))
se.data$LS <- Geo_Lib_Size
se.data$SN <- anns$sample_name
se.data$Tissue <- anns$tissue
# use Geometric library size + Tissue + sample is as covariates
# suggested epsilon parameter is used for the analyses
start_time <- Sys.time()
zinb_cov <- zinbFit(se.data, K=3, X="~LS+SN", epsilon=1000)

## Warning in simpute.als(x, J, thresh, lambda, maxit, trace.it, warm.start, :
## Convergence not achieved by 100 iterations

W <- getW(zinb_cov)
end_time <- Sys.time()
zinb.time <- start_time - end_time
colnames(W) <- paste0("W", 1:3)
pairs(W,col=Cell_Type)

max.cor.zinb <- which.is.max (c (abs(cor(W[,1], Cell_Type_Num)),
                abs(cor(W[,2], Cell_Type_Num)),
                abs(cor(W[,3], Cell_Type_Num))))

max.cor.zinb

## [1] 1

Compare results

Compare results from different methods to the true cell type assignments using correlation plots.

## Plot correlation between true cell_type and estimates (SV1)
celltype.est = cbind(Cell_Type_Num, iasva.sv[,max.cor.iasva],
                     pca.sv[,max.cor.pca],usva.sv[,max.cor.usva], 
                     ruvres.sv[,max.cor.ruvres], 
                     ruvemp.sv[,max.cor.ruvemp], W[,max.cor.zinb])

colnames(celltype.est) = c("Cell Type","IA-SVA","PCA","USVA",
                           "RUVres","RUVemp", "ZINB-WAVE")
corr = abs(cor(celltype.est))
print(corr)

##           Cell Type    IA-SVA       PCA      USVA    RUVres    RUVemp
## Cell Type 1.0000000 0.9364864 0.8412577 0.9217416 0.7647474 0.8358981
## IA-SVA    0.9364864 1.0000000 0.9241849 0.9441659 0.7767050 0.9186404
## PCA       0.8412577 0.9241849 1.0000000 0.8876566 0.5461878 0.9959706
## USVA      0.9217416 0.9441659 0.8876566 1.0000000 0.7562019 0.8756892
## RUVres    0.7647474 0.7767050 0.5461878 0.7562019 1.0000000 0.5373655
## RUVemp    0.8358981 0.9186404 0.9959706 0.8756892 0.5373655 1.0000000
## ZINB-WAVE 0.8256225 0.8473005 0.8046761 0.8616269 0.6648571 0.7892463
##           ZINB-WAVE
## Cell Type 0.8256225
## IA-SVA    0.8473005
## PCA       0.8046761
## USVA      0.8616269
## RUVres    0.6648571
## RUVemp    0.7892463
## ZINB-WAVE 1.0000000

cols = colorRampPalette(c(color.vec[2],"white",color.vec[1]))

corrplot.mixed(corr, lower="ellipse", upper="number", mar=c(0,0,1,0))

num.cells <- dim(anns)[1]
pdf(paste0("Brain_scRNASeq_neuron_astro_",num.cells,"_cells.pdf"), height=6, width=6)
corrplot.mixed(corr, lower="ellipse", upper="number", mar=c(0,0,1,0))
dev.off()

## quartz_off_screen 
##                 2

time.seconds <- c(iasva.time,pca.time,usva.time,ruvres.time,ruvemp.time,zinb.time)
names(time.seconds) <- c("IA-SVA","PCA","USVA","RUVres","RUVemp", "ZINB-WAVE")
time.seconds

## Time differences in secs
##      IA-SVA         PCA        USVA      RUVres      RUVemp   ZINB-WAVE 
##  -0.2223659  -0.1276200  -1.0486910 -62.4883530 -63.0217659 -76.6563649

barplot(t(as.matrix(abs(time.seconds))), las =2, ylab = "Run time (seconds)")

pdf(paste0("Brain_scRNASeq_neuron_astro_",num.cells,"_cells_run_time.pdf"), height=6, width=6)
barplot(t(as.matrix(abs(time.seconds))), las =2, ylab = "Run time (seconds)")
dev.off()

## quartz_off_screen 
##                 2

Load packages

Load the brain single cell RNA-Seq data

Calculate geometric library size, i.e., library size of log-transfromed read counts

run analyses

Compare results

Session Info