Last updated: 2018-11-29
workflowr checks: (Click a bullet for more information) ✖ R Markdown file: uncommitted changes
The R Markdown file has unstaged changes. To know which version of the R Markdown file created these results, you’ll want to first commit it to the Git repo. If you’re still working on the analysis, you can ignore this warning. When you’re finished, you can run wflow_publish
to commit the R Markdown file and build the HTML.
✔ Environment: empty
Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.
✔ Seed:
set.seed(20181119)
The command set.seed(20181119)
was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.
✔ Session information: recorded
Great job! Recording the operating system, R version, and package versions is critical for reproducibility.
✔ Repository version: b5d096d
wflow_publish
or wflow_git_commit
). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
Ignored files:
Ignored: .Rproj.user/
Ignored: analysis/figure/
Untracked files:
Untracked: analysis/Quality_metrics.Rmd
Unstaged changes:
Modified: analysis/crop_workflow_Alan.Rmd
Modified: analysis/crop_workflow_Siwei.Rmd
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes.
# 01-Nov-2018
# obtain the gene expression profile of all gRNAs individually
library(edgeR)
library(Rfast)
library(data.table)
library(cellrangerRkit)
#################
# exp_matrix <- exp_matrix_backup
##############
gbm <- load_cellranger_matrix("../NSC_merged_05_07_08_new/") # load the GBM with NEW data
exp_matrix <- gbm[, colSums(exprs(gbm)) > 11000] # get cells above background value
use_genes <-get_nonzero_genes(exp_matrix)
exp_matrix <- exp_matrix[use_genes, ]
exp_matrix <- normalize_barcode_sums_to_median(exp_matrix)
# Select cells that have at least 1 reads in gRNA count
exp_matrix <- exp_matrix[, colMaxs(as.matrix(exprs(exp_matrix[
(length(exp_matrix@featureData@data$id)-75): # should be 75 or 76?
(length(exp_matrix@featureData@data$id)), ])), value = T) > 0] #more than 1 count
# make cell count for cells contain unique RNA
exp_matrix <- exp_matrix[, # Select cells that have one gRNA expression more than 3x of all others
(colMaxs(as.matrix(exprs(exp_matrix[
(length(exp_matrix@featureData@data$id)-75):
(length(exp_matrix@featureData@data$id)), ])), value = T)) >
(colSums(exprs(exp_matrix[(length(exp_matrix@featureData@data$id)-75):
(length(exp_matrix@featureData@data$id)), ]))*3/4)]
# exp_GeneBCMatrix <- exp_matrix # output from the last step
#############
# find the gRNA for each cell by taking the rowMax index of gRNA columns(29847:29922)
# will return a data.frame with entries followed by .xx since colnames should be unique
exp_matrix <- as.matrix(exprs(exp_matrix))
exp_matrix_backup <- exp_matrix # make a backup in case of need
#############
exp_matrix <- exp_matrix_backup
############
# total UMI count: 77,997,203
#scale exp_matrix to 1,000,000 counts -- NO, will significantly decrease the statistic power
# exp_matrix <- exp_matrix*(1000000/77997203)
### Make the gRNA list
# gRNA_list <- as.data.frame(rownames(exp_matrix
# [(nrow(exp_matrix)-76 +
# colMaxs(
# as.matrix(
# exp_matrix[
# ((nrow(exp_matrix)-75):(nrow(exp_matrix)))
# ,]
# ), value = F)
# ),
# ]))
gRNA_list <- as.data.frame(rownames(exp_matrix
[(nrow(exp_matrix)-76 +
colMaxs(
exp_matrix[
((nrow(exp_matrix)-75):(nrow(exp_matrix)))
,]
, value = F)
),
]))
colnames(gRNA_list) <- "gRNA"
gRNA_ASoC_list <- gRNA_list[!duplicated(gRNA_list$gRNA), ]
gRNA_ASoC_list <- gRNA_ASoC_list[order(gRNA_ASoC_list)]
# exp_matrix$cell_type <- as.factor(gRNA_dist$gRNA)
gRNA_ASoC_list <- gRNA_ASoC_list[-c(31,32,33,37,38,39,40,41,51,52,59,60,61,65,66,67)]
#############
# exp_matrix now: rownames=ENSG, colnames=barcode
# exp_matrix <- as.matrix(t(exp_matrix))
# not using
cell_type_index <- as.vector(gRNA_list$gRNA) # use this variable as index, length = 2522
############ Main Program ######
gRNA_ASoC_list_backup <- gRNA_ASoC_list # set a backup, will use the full list after debugging
# gRNA_ASoC_list <- gRNA_ASoC_list[1:2] # list for debugging, 2 elements only
# gRNA_ASoC_list <- gRNA_ASoC_list_backup
for(i in 1:length(gRNA_ASoC_list)) {
print(gRNA_ASoC_list[i])
### make correspondence of Gene_Symbol and Gene_id
# write_out_with_gene_name <- as.data.frame(get_CRISPRi_result(gRNA_ASoC_list[i],
# TPM_filter = F, TPM_threshold = 0.01))
write_out_with_gene_name <- as.data.frame(get_CRISPRi_result(gRNA_ASoC_list[i],
TPM_filter = F, TPM_threshold = 0.01)) # no filter set
write_out_with_gene_name$Geneid <- rownames(write_out_with_gene_name)
write_out_with_gene_name_output <- merge(write_out_with_gene_name, ENSG_coord_gene_gencodev28, by = "Geneid")
write_out_with_gene_name_output <- write_out_with_gene_name_output[order(write_out_with_gene_name_output$PValue), ]
write.table(write_out_with_gene_name_output, append = F,
row.names = F, col.names = T, sep = "\t", quote = F,
file = paste("output/", gRNA_ASoC_list[i], "_gRNA.txt", collapse = "", sep = ""))
}
############
############## custom functions##############
get_CRISPRi_result <- function(gRNA_name, TPM_filter = FALSE, TPM_threshold = 0.01) {
# TPM filter takes only genes with an estimated TPM above 1
# in more than 25% of the considered cells
# TPM_filter <- TRUE # temporary
# TPM_threshold <- 0.01 # temporaray
# if(TPM_filter) {
# exp_matrix <- exp_matrix[
# rowSums(exp_matrix > TPM_threshold)
# > trunc(ncol(exp_matrix/4)), ]
# }
# make two matrix using grepl, separate the target gRNA and control gRNA (EGFP/neg)
control_matrix <- exp_matrix[ , grepl("EGFP", cell_type_index) |
grepl("neg", cell_type_index)]
# gRNA_name <- "VPS45_2_gene" # temporary
gRNA_matrix <- exp_matrix[ , grepl(gRNA_name, cell_type_index)]
# prepare the input matrix for edgeR
## merge the two matrices as one, ensure the gRNA is the first
matrix_combined <- as.matrix(cbind(control_matrix, gRNA_matrix))
# matrix_combined <- matrix_combined[order(cell_type_index), ]
## Trim the tailing gRNA artificial genes, total 75
# matrix_combined_transposed <- t(matrix_combined[, -((ncol(matrix_combined)-75):
# ncol(matrix_combined))])
matrix_combined_transposed <- matrix_combined[1:(nrow(matrix_combined)-76), ]
### Assign rownames as ENSG gene identifiers
rownames(matrix_combined_transposed) <- gsub("\\..*", "", rownames(matrix_combined_transposed))
# rownames(matrix_combined_transposed) <- gsub("\\..*", "",
# colnames(matrix_combined[ , 1:(ncol(matrix_combined)-76)]))
# colnames(matrix_combined_transposed) <- rownames(matrix_combined)
# print(scale(colMeans(matrix_combined_transposed > 0)))
# Run edgeR use edgeRQLFDetRate, nrow(control_matrix) should be 139
group <- factor(c(rep("ctrl", len = ncol(control_matrix)),
rep("gRNA", len = ncol(gRNA_matrix))))
# make DGEList()
main_DGE <- DGEList(counts = matrix_combined_transposed, group = group, remove.zeros = T)
# Use edgeRQLFDetRate flow from now on
main_DGE <- calcNormFactors(main_DGE)
cdr <- scale(colMeans(matrix_combined_transposed > 0)) # DetRate is applied here
design <- model.matrix(~ cdr + group) # cdr (~ cdr + group)
main_DGE <- estimateDisp(main_DGE, design = design)
fit <- glmQLFit(main_DGE, design = design) # fit
qlf <- glmQLFTest(fit) # QLF vs LRT
# tt <- topTags(qlf, n = 100)
exp_table <- qlf$table
exp_table$FDR <- p.adjust(exp_table$PValue, "fdr")
# write.table(group, append = F,
# row.names = F, col.names = T, sep = "\t", quote = F,
# file = paste("output/", gRNA_ASoC_list[i], "_group.txt", collapse = ""))
return(exp_table)
}
sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-apple-darwin11.4.2 (64-bit)
Running under: OS X El Capitan 10.11.6
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] workflowr_1.1.1 Rcpp_0.12.17 digest_0.6.12
[4] rprojroot_1.2 R.methodsS3_1.7.1 backports_1.0.5
[7] git2r_0.18.0 magrittr_1.5 evaluate_0.10
[10] stringi_1.1.5 whisker_0.3-2 R.oo_1.22.0
[13] R.utils_2.7.0 rmarkdown_1.10 tools_3.3.2
[16] stringr_1.2.0 yaml_2.1.16 htmltools_0.3.6
[19] knitr_1.20
This reproducible R Markdown analysis was created with workflowr 1.1.1