Last updated: 2018-12-12

workflowr checks: (Click a bullet for more information)
Expand here to see past versions:

Introduction


Steps

Same steps as in https://github.com/statOmics/zinbwaveZinger/blob/master/realdata/createdata/createDataObject.Rmd.

Reading in data.

library(Seurat)
library(dplyr)
library(Matrix)
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "/project/mstephens/data/external_public_supp/pbmc3k_filtered_gene_bc_matrices")

pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")

QC and selecting cells for further analysis

mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ]) / Matrix::colSums(pbmc@raw.data)
# AddMetaData adds columns to object@meta.data, and is a great place to stash QC stats
pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")

pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"), low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))

Normalizing the data

pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scale.factor = 1e4)

Detection of variable genes across the single cells

pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

length(x = pbmc@var.genes)

Scaling the data and removing unwanted sources of variation

pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))

Perform linear dimensional reduction

pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5)

Cluster the cells

pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, resolution = 0.6, print.output = 0, save.SNN = TRUE)

Run Non-linear dimensional reduction (tSNE)

pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)

Assigning cell type identity to clusters

Cluster ID Markers Cell Type
0 IL7R CD4 T cells
1 CD14, LYZ CD14+ Monocytes
2 MS4A1 B cells
3 CD8A CD8 T cells
4 FCGR3A, MS4A7 FCGR3A+ Monocytes
5 GNLY, NKG7 NK cells
6 FCER1A, CST3 Dendritic Cells
7 PPBP Megakaryocytes
current.cluster.ids <- c(0, 1, 2, 3, 4, 5, 6, 7)
new.cluster.ids <- c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)

Further subdivisions within cell types

# First lets stash our identities for later
pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")
# Note that if you set save.snn=T above, you don't need to recalculate the SNN, and can simply put: 
# pbmc <- FindClusters(pbmc,resolution = 0.8)
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, resolution = 0.8, print.output = FALSE)

Create SE object

library(SingleCellExperiment)
# use raw data as input for zinbwave but keep only non filtered cells
# and most variable genes indentified by seurat
keepcells = colnames(pbmc@data)
counts = pbmc@raw.data[, keepcells]
# zinbwave does not want dgTMatrix as input
counts = as.matrix(counts)
counts = counts[rowSums(counts) > 0, ]
#keepcells = as.integer(pbmc@ident) %in% 1:2
#counts = counts[, keepcells]
# coldata
clusters = as.integer(pbmc@ident)
#clusters = clusters[clusters %in% 1:2]
cData = data.frame(seurat = clusters)
rownames(cData) = colnames(counts)
# rowdata
rData = data.frame(seuratVarGenes = rownames(counts) %in% pbmc@var.genes)
rownames(rData) = rownames(counts)
# create sce object
core = SingleCellExperiment(assays = list(counts = counts),
                            colData = cData, rowData = rData)

unloadNamespace("Seurat")
saveRDS(core, file = '../dsc/data/pbmc.rds')

saveRDS(assay(pbmc), file = '../dsc/data/pbmc_counts.rds')

Load pre-computed object.

pbmc <- readRDS(file = '../dsc/data/pbmc.rds')

pbmc

Session information

sessionInfo()
R version 3.5.1 (2018-07-02)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Scientific Linux 7.4 (Nitrogen)

Matrix products: default
BLAS/LAPACK: /software/openblas-0.2.19-el7-x86_64/lib/libopenblas_haswellp-r0.2.19.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] workflowr_1.1.1   Rcpp_1.0.0        digest_0.6.18    
 [4] rprojroot_1.3-2   R.methodsS3_1.7.1 backports_1.1.2  
 [7] magrittr_1.5      git2r_0.23.0      evaluate_0.12    
[10] stringi_1.2.4     whisker_0.3-2     R.oo_1.22.0      
[13] R.utils_2.7.0     rmarkdown_1.10    tools_3.5.1      
[16] stringr_1.3.1     yaml_2.2.0        compiler_3.5.1   
[19] htmltools_0.3.6   knitr_1.20       

This reproducible R Markdown analysis was created with workflowr 1.1.1