Last updated: 2018-07-26
workflowr checks: (Click a bullet for more information) ✔ R Markdown file: up-to-date
Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.
✔ Environment: empty
Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.
✔ Seed:
set.seed(20180613)
The command set.seed(20180613)
was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.
✔ Session information: recorded
Great job! Recording the operating system, R version, and package versions is critical for reproducibility.
✔ Repository version: a327824
wflow_publish
or wflow_git_commit
). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
Ignored files:
Ignored: .DS_Store
Ignored: .Rhistory
Ignored: .Rproj.user/
Ignored: code_RCC/.DS_Store
Ignored: data/.DS_Store
Untracked files:
Untracked: analysis/compare_centipede_predictions_DiffACregions_MEF2D.Rmd
Untracked: docs/figure/compare_centipede_predictions.Rmd/
Untracked: docs/figure/compare_centipede_predictions_DiffACregions_MEF2D.Rmd/
Untracked: workflow_setup.R
Unstaged changes:
Modified: analysis/ATACseq_preprocessing_pipeline.Rmd
Modified: analysis/compare_centipede_predictions_CTCF.Rmd
Modified: analysis/compare_centipede_predictions_DiffACregions_CTCF.Rmd
Modified: analysis/compare_centipede_predictions_DiffACregions_HIF1A.Rmd
Modified: analysis/fit_centipede_ATACseq.Rmd
Modified: analysis/index.Rmd
Modified: analysis/license.Rmd
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes.
File | Version | Author | Date | Message |
---|---|---|---|---|
Rmd | a327824 | kevinlkx | 2018-07-26 | compare centipede predictions for MEF2D |
library(ggplot2)
library(grid)
library(gridExtra)
library(limma)
library(edgeR)
library(VennDiagram)
Warning: package 'VennDiagram' was built under R version 3.4.4
Loading required package: futile.logger
message <- futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")
## venn diagram
plot_venn_overlaps <- function(overlaps.m, title = "", col_fill = NULL, category.names = NULL){
grid.newpage()
overlaps_venn.l <- lapply(as.data.frame(overlaps.m), function(x) which(x == 1))
if(is.null(col_fill)){
col_fill <- 1:length(overlaps_venn.l)
}
if(is.null(category.names)){
category.names <- names(x)
}
venn.plot <- venn.diagram(
x = overlaps_venn.l,
category.names = category.names,
filename = NULL,
fill = col_fill,
alpha=rep(0.5, length(overlaps_venn.l)),
cex = 1.5,
cat.fontface=4,
main=title)
grid.draw(venn.plot)
}
tf_name <- "MEF2D"
pwm_name <- "MEF2D_MA0773.1_1e-4"
thresh_PostPr_bound <- 0.99
cat(pwm_name, "\n")
MEF2D_MA0773.1_1e-4
dir_predictions <- paste0("~/Dropbox/research/ATAC_DNase/ATAC-seq_Olivia_Gray/results/centipede_predictions/", pwm_name)
## condition: N
bam_namelist_N <- c("N1_nomito_rdup.bam", "N2_nomito_rdup.bam", "N3_nomito_rdup.bam")
site_predictions_N.l <- vector("list", 3)
names(site_predictions_N.l) <- bam_namelist_N
for(i in 1:length(bam_namelist_N)){
bam_basename <- tools::file_path_sans_ext(basename(bam_namelist_N[[i]]))
site_predictions_N.l[[i]] <- read.table(paste0(dir_predictions, "/", pwm_name, "_", bam_basename, "_predictions.txt"), header = T, stringsAsFactors = F)
}
CentPostPr_N.df <- data.frame(N1 = site_predictions_N.l[[1]]$CentPostPr,
N2 = site_predictions_N.l[[2]]$CentPostPr,
N3 = site_predictions_N.l[[3]]$CentPostPr)
CentLogRatios_N.df <- data.frame(N1 = site_predictions_N.l[[1]]$CentLogRatios,
N2 = site_predictions_N.l[[2]]$CentLogRatios,
N3 = site_predictions_N.l[[3]]$CentLogRatios)
## condition: H
bam_namelist_H <- c("H1_nomito_rdup.bam", "H2_nomito_rdup.bam", "H3_nomito_rdup.bam")
site_predictions_H.l <- vector("list", 3)
names(site_predictions_H.l) <- bam_namelist_H
for(i in 1:length(bam_namelist_H)){
bam_basename <- tools::file_path_sans_ext(basename(bam_namelist_H[[i]]))
site_predictions_H.l[[i]] <- read.table(paste0(dir_predictions, "/", pwm_name, "_", bam_basename, "_predictions.txt"), header = T, stringsAsFactors = F)
}
name_sites <- site_predictions_H.l[[1]]$name
CentPostPr_H.df <- data.frame(H1 = site_predictions_H.l[[1]]$CentPostPr,
H2 = site_predictions_H.l[[2]]$CentPostPr,
H3 = site_predictions_H.l[[3]]$CentPostPr)
CentLogRatios_H.df <- data.frame(H1 = site_predictions_H.l[[1]]$CentLogRatios,
H2 = site_predictions_H.l[[2]]$CentLogRatios,
H3 = site_predictions_H.l[[3]]$CentLogRatios)
CentPostPr.df <- cbind(CentPostPr_N.df, CentPostPr_H.df)
CentLogRatios.df <- cbind(CentLogRatios_N.df, CentLogRatios_H.df)
cat("Number of bound sites: \n")
Number of bound sites:
colSums(CentPostPr.df > thresh_PostPr_bound)
N1 N2 N3 H1 H2 H3
31853 20926 19070 16535 15801 20099
idx_bound <- which(rowSums(CentPostPr.df > thresh_PostPr_bound) >= 2)
cat(length(idx_bound), "(",round(length(idx_bound)/nrow(CentPostPr.df) *100, 2), "% ) sites are bound in at least two samples \n")
25468 ( 1.73 % ) sites are bound in at least two samples
bound_N <- rowSums(CentPostPr.df[,c("N1", "N2", "N3")] > thresh_PostPr_bound) >= 2
bound_H <- rowSums(CentPostPr.df[,c("H1", "H2", "H3")] > thresh_PostPr_bound) >= 2
bound_N_H <- data.frame(N = bound_N, H = bound_H)
plot_venn_overlaps(bound_N_H, title = paste("Number of", tf_name, "bound sites"),
category.names = c("Bound in N", "Bound in H"), col_fill = c("blue", "red"))
red dots are those sites bound in at least two samples
par(pty="s")
plot(rowMeans(CentPostPr_N.df), rowMeans(CentPostPr_H.df),
xlab = "N average P(Bound)", ylab = "H average P(Bound)", main = tf_name,
pch = ".", col = rgb(0,0,1,0.7))
points(rowMeans(CentPostPr_N.df)[idx_bound], rowMeans(CentPostPr_H.df)[idx_bound],
pch = ".", col = rgb(1,0,0,0.7))
abline(a=0,b=1)
mtext(text = "red dots are those sites bound in at least two samples", side = 3, line = 0, cex = 0.7)
par(mfrow = c(1,2))
par(pty="s")
plot(rowMeans(CentLogRatios_N.df), rowMeans(CentLogRatios_H.df),
xlab = "N average logRatios", ylab = "H average logRatios", main = tf_name,
pch = ".", col = rgb(0,0,1,0.7))
points(rowMeans(CentLogRatios_N.df)[idx_bound], rowMeans(CentLogRatios_H.df)[idx_bound],
pch = ".", col = rgb(1,0,0,0.7))
abline(a=0,b=1,col = "darkgray")
plot(x = (rowMeans(CentLogRatios_H.df) + rowMeans(CentLogRatios_N.df))/2,
y = rowMeans(CentLogRatios_H.df) - rowMeans(CentLogRatios_N.df),
xlab = "average logRatios", ylab = "Difference in logRatios (H - N)", main = tf_name,
pch = ".", col = rgb(0,0,1,0.7))
points(x = ((rowMeans(CentLogRatios_H.df) + rowMeans(CentLogRatios_N.df))/2)[idx_bound],
y = (rowMeans(CentLogRatios_H.df) - rowMeans(CentLogRatios_N.df))[idx_bound],
pch = ".", col = rgb(1,0,0,0.7))
abline(v=0, h=0, col = "darkgray")
pca_logRatios <- prcomp(t(CentLogRatios.df))
percentage <- round(pca_logRatios$sdev / sum(pca_logRatios$sdev) * 100, 2)
percentage <- paste0( colnames(pca_logRatios$x), " (", paste( as.character(percentage), "%)") )
pca_logRatios.df <- as.data.frame(pca_logRatios$x)
pca_logRatios.df$group <- rep(c("N","H"), each = 3)
p <- ggplot(pca_logRatios.df, aes(x=PC1,y=PC2,color=group,label=row.names(pca_logRatios.df)))
p <- p + geom_point() + geom_text(size = 3, show.legend = F, vjust = 2, nudge_y = 0.5) +
labs(title = tf_name, x = percentage[1], y = percentage[2])
p
pca_logRatios <- prcomp(t(CentLogRatios.df[idx_bound, ]))
percentage <- round(pca_logRatios$sdev / sum(pca_logRatios$sdev) * 100, 2)
percentage <- paste0( colnames(pca_logRatios$x), " (", paste( as.character(percentage), "%)") )
pca_logRatios.df <- as.data.frame(pca_logRatios$x)
pca_logRatios.df$group <- rep(c("N","H"), each = 3)
p <- ggplot(pca_logRatios.df, aes(x=PC1,y=PC2,color=group,label=row.names(pca_logRatios.df)))
p <- p + geom_point() + geom_text(size = 3, show.legend = F, vjust = 2, nudge_y = 0.5) +
labs(title = tf_name, x = percentage[1], y = percentage[2])
p
targets <- data.frame(bam = c(bam_namelist_N, bam_namelist_H),
label = colnames(CentLogRatios.df),
condition = rep(c("N", "H"), each = 3))
print(targets)
bam label condition
1 N1_nomito_rdup.bam N1 N
2 N2_nomito_rdup.bam N2 N
3 N3_nomito_rdup.bam N3 N
4 H1_nomito_rdup.bam H1 H
5 H2_nomito_rdup.bam H2 H
6 H3_nomito_rdup.bam H3 H
condition <- factor(targets$condition, levels = c("N", "H"))
design <- model.matrix(~0+condition)
colnames(design) <- levels(condition)
print(design)
N H
1 1 0
2 1 0
3 1 0
4 0 1
5 0 1
6 0 1
attr(,"assign")
[1] 1 1
attr(,"contrasts")
attr(,"contrasts")$condition
[1] "contr.treatment"
CentLogRatios_Bound.df <- CentLogRatios.df[idx_bound, ]
fit <- lmFit(CentLogRatios_Bound.df, design)
contrasts <- makeContrasts(H-N, levels=design)
fit2 <- contrasts.fit(fit, contrasts)
fit2 <- eBayes(fit2, trend=TRUE)
num_diffbind <- summary(decideTests(fit2))
percent_diffbind <- round(num_diffbind / sum(num_diffbind) * 100, 2)
cat(num_diffbind[1], "sites differentially open in normoxia (", percent_diffbind[1], "%) \n",
num_diffbind[3], "sites differentially open in hypoxia (", percent_diffbind[3], "%) \n",
num_diffbind[2], "sites not significantly different (", percent_diffbind[2], "%) \n")
0 sites differentially open in normoxia ( 0 %)
0 sites differentially open in hypoxia ( 0 %)
25468 sites not significantly different ( 100 %)
# volcanoplot(fit2, main="H vs. N", xlab = "Difference in logRatios (H - N)")
plot(x = fit2$coef, y = -log10(fit2$p.value),
xlab = "Difference in logRatios (H - N)", ylab = "-log10(P-value)", main= paste(tf_name, "H vs. N"),
pch = 16, cex = 0.35)
sessionInfo()
R version 3.4.3 (2017-11-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] VennDiagram_1.6.20 futile.logger_1.4.3 edgeR_3.20.9
[4] limma_3.34.9 gridExtra_2.3 ggplot2_2.2.1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.16 compiler_3.4.3 pillar_1.2.2
[4] formatR_1.5 git2r_0.21.0 plyr_1.8.4
[7] workflowr_1.1.1 R.methodsS3_1.7.1 R.utils_2.6.0
[10] futile.options_1.0.1 tools_3.4.3 digest_0.6.15
[13] evaluate_0.10.1 tibble_1.4.2 gtable_0.2.0
[16] lattice_0.20-35 rlang_0.2.0 yaml_2.1.18
[19] stringr_1.3.0 knitr_1.20 locfit_1.5-9.1
[22] rprojroot_1.3-2 rmarkdown_1.9 lambda.r_1.2.2
[25] magrittr_1.5 whisker_0.3-2 splines_3.4.3
[28] backports_1.1.2 scales_0.5.0 htmltools_0.3.6
[31] colorspace_1.3-2 labeling_0.3 stringi_1.1.7
[34] lazyeval_0.2.1 munsell_0.4.3 R.oo_1.22.0
This reproducible R Markdown analysis was created with workflowr 1.1.1