Last updated: 2018-02-06
Code version: fd3bf7b
library("cowplot")
library("dplyr")
library("edgeR")
library("ggplot2")
library("heatmap3")
library("reshape2")
library("Biobase")
source("../code/pca.R")
fname <- Sys.glob("../data/eset/*.rds")
eset <- Reduce(combine, Map(readRDS, fname))
## look at human genes
eset_hs <- eset[fData(eset)$source == "H. sapiens", ]
head(featureNames(eset_hs))
[1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457"
[5] "ENSG00000000460" "ENSG00000000938"
## remove genes of all 0s
eset_hs_clean <- eset_hs[rowSums(exprs(eset_hs)) != 0, ]
dim(eset_hs_clean)
Features Samples
19348 1536
## convert to log2 cpm
mol_hs_cpm <- cpm(exprs(eset_hs_clean), log = TRUE)
mol_hs_cpm_means <- rowMeans(mol_hs_cpm)
summary(mol_hs_cpm_means)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.413 2.482 3.180 3.858 4.761 12.999
## keep genes with reasonable expression levels
mol_hs_cpm <- mol_hs_cpm[mol_hs_cpm_means > median(mol_hs_cpm_means), ]
dim(mol_hs_cpm)
[1] 9674 1536
## pca of genes with reasonable expression levels
pca_hs <- run_pca(mol_hs_cpm)
## a function of pca vs technical factors
get_r2 <- function(x, y) {
stopifnot(length(x) == length(y))
model <- lm(y ~ x)
stats <- summary(model)
return(stats$adj.r.squared)
}
## selection of technical factor
covariates <- pData(eset) %>% dplyr::select(experiment, well, concentration, raw:unmapped,
starts_with("detect"), chip_id, molecules)
## look at the first 6 PCs
pcs <- pca_hs$PCs[, 1:6]
## generate the data
r2_before <- matrix(NA, nrow = ncol(covariates), ncol = ncol(pcs),
dimnames = list(colnames(covariates), colnames(pcs)))
for (cov in colnames(covariates)) {
for (pc in colnames(pcs)) {
r2_before[cov, pc] <- get_r2(covariates[, cov], pcs[, pc])
}
}
## plot
heatmap3(r2_before, cexRow=1, cexCol=1, margins=c(8,8),
ylab="technical factor", main = "Before filter")
heatmap3(r2_before, cexRow=1, cexCol=1, margins=c(8,8), scale = "none",
ylab="technical factor", main = "Before filter w/o scale")
Import data post sample and gene filtering
load(file="../output/gene-filtering.Rmd/eset-filtered.rdata")
Compute log2 CPM based on the library size before filtering.
log2cpm <- log2(cpm_filtered+1)
pca_log2cpm <- run_pca(log2cpm)
pdata <- pData(eset_filtered)
pdata$experiment <- as.factor(pdata$experiment)
plot_grid(
plot_pca(x=pca_log2cpm$PCs, explained=pca_log2cpm$explained,
metadata=pdata, color="chip_id"),
plot_pca(x=pca_log2cpm$PCs, explained=pca_log2cpm$explained,
metadata=pdata, color="experiment"),
ncol=2)
## selection of technical factor
covariates <- pData(eset_filtered) %>% dplyr::select(experiment, well, chip_id,
concentration, raw:unmapped,
starts_with("detect"), molecules)
## look at the first 6 PCs
pcs <- pca_log2cpm$PCs[, 1:6]
## generate the data
r2 <- matrix(NA, nrow = ncol(covariates), ncol = ncol(pcs),
dimnames = list(colnames(covariates), colnames(pcs)))
for (cov in colnames(covariates)) {
for (pc in colnames(pcs)) {
r2[cov, pc] <- get_r2(covariates[, cov], pcs[, pc])
}
}
## plot heatmap
heatmap3(r2, cexRow=1, cexCol=1, margins=c(8,8),
ylab="technical factor", main = "After filter")
heatmap3(r2, cexRow=1, cexCol=1, margins=c(8,8), scale = "none",
ylab="technical factor", main = "After filter w/o scale")
PC1 correlated with number of genes detected, which is described in Hicks et al 2017
Number of genes detected also highly correlated with sequencing metrics, especially total molecule number per sample.
cor_tech <- cor(as.matrix(covariates[,4:11]),use="pairwise.complete.obs")
heatmap(cor_tech, symm = TRUE)
Look at the top 10% expression genes to see if the correlation of PC1 and number of detected gene would go away. However, the PC1 is still not individual (chip_id).
## look at top 10% of genes
log2cpm_mean <- rowMeans(log2cpm)
summary(log2cpm_mean)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.3048 1.3316 2.9092 3.3464 4.9083 13.6379
log2cpm_top <- log2cpm[rank(log2cpm_mean) / length(log2cpm_mean) > 1 - 0.1, ]
dim(log2cpm_top)
[1] 1149 1025
pca_top <- run_pca(log2cpm_top)
## look at the first 6 PCs
pcs <- pca_top$PCs[, 1:6]
## generate the data
r2_top <- matrix(NA, nrow = ncol(covariates), ncol = ncol(pcs),
dimnames = list(colnames(covariates), colnames(pcs)))
for (cov in colnames(covariates)) {
for (pc in colnames(pcs)) {
r2_top[cov, pc] <- get_r2(covariates[, cov], pcs[, pc])
}
}
## plot heatmap
heatmap3(r2_top, cexRow=1, cexCol=1, margins=c(8,8),
ylab="technical factor", main = "Top 10 % gene")
heatmap3(r2_top, cexRow=1, cexCol=1, margins=c(8,8), scale = "none",
ylab="technical factor", main = "Top 10 % gene w/o scale")
This R Markdown site was created with workflowr