Transcription Factor Binding Site Analysis

Last updated: 2018-10-04

Code version: fafe120

Refined motif search

Now that we have predicted TSSs, we can refine our motif binding site search space and look at smaller regions of sequence just upstream of our predicted TSSs. We will use our most commonly used TSSs for this.

First we need to create promoter regions to analyze:

# Number of bases added upstream of each gene / UTR start when building
# promoter windows.
promoter_region <- 1000
# Gene IDs read from the first column of a header-less TSV file (readr names
# that column X1 when col_names is FALSE).
core_genes <- readr::read_tsv(
  "../data/gene_lists/core_pf3d7_genes.txt",
  col_names = FALSE
)$X1

# Windows of `promoter_region` bases directly upstream of every gene listed in
# `core_genes`. On the "+" strand the window ends at the gene's `start`; on any
# other strand it begins at the gene's `end`. The boundary base itself is part
# of the window.
gene_annotations <- tibble::as_tibble(
  rtracklayer::import.gff3("../data/annotations/genes_3D7_v24.gff")
)
core_annotations <- dplyr::filter(gene_annotations, ID %in% core_genes)

upstream_ORF <- core_annotations %>%
  dplyr::mutate(
    window_end = ifelse(strand == "+", start, end + promoter_region),
    window_start = ifelse(strand == "+", start - promoter_region, end)
  ) %>%
  # Swap the gene coordinates for the window coordinates.
  dplyr::select(-start, -end) %>%
  dplyr::rename(start = window_start, end = window_end) %>%
  GenomicRanges::GRanges()

# Strip transcript decorations from GFF `Parent` values: the first "rna_", the
# first "-1" and the first ".<digit>" found in each value are removed.
clean_parent_id <- function(parent) {
  ids <- unlist(parent)
  ids <- stringr::str_replace(ids, "rna_", "")
  ids <- stringr::str_replace(ids, "-1", "")
  stringr::str_replace(ids, "[.][0-9]", "")
}

# Build promoter windows from a UTR annotation file.
#
# utr_gff:         path to a GFF3 file whose features carry a `Parent`
#                  attribute and a `type` column.
# promoter_region: number of bases to extend upstream of each feature.
#
# Returns a GRanges holding one window per feature of type "5UTR". On the "+"
# strand the window runs from `start - promoter_region` to `start`; otherwise
# from `end` to `end + promoter_region`.
#
# All three strains go through this one pipeline so that the window
# coordinates are always explicitly renamed to `start`/`end` before the
# conversion to GRanges (the hb3 and it copies previously skipped that step).
build_promoters <- function(utr_gff, promoter_region) {
  rtracklayer::import.gff3(utr_gff) %>%
    tibble::as_tibble() %>%
    dplyr::mutate(Parent = clean_parent_id(Parent)) %>%
    dplyr::distinct() %>%
    dplyr::mutate(
      newend = ifelse(strand == "+", start, end + promoter_region),
      newstart = ifelse(strand == "+", start - promoter_region, end)
    ) %>%
    dplyr::select(-start, -end) %>%
    dplyr::rename(start = newstart, end = newend) %>%
    dplyr::filter(type == "5UTR") %>%
    GenomicRanges::GRanges()
}

promoters_3d7 <- build_promoters(
  "../output/final_utrs/longest_utrs_3d7_plasmodb_compatible.gff",
  promoter_region
)

promoters_hb3 <- build_promoters(
  "../output/final_utrs/longest_utrs_hb3_plasmodb_compatible.gff",
  promoter_region
)

promoters_it <- build_promoters(
  "../output/final_utrs/longest_utrs_it_plasmodb_compatible.gff",
  promoter_region
)

# Write every region set to its own GFF3 file (output path -> ranges object).
gff_targets <- list(
  "../output/tfbs_analysis/upstream_ORF.gff" = upstream_ORF,
  "../output/tfbs_analysis/promoters_3d7.gff" = promoters_3d7,
  "../output/tfbs_analysis/promoters_hb3.gff" = promoters_hb3,
  "../output/tfbs_analysis/promoters_it.gff" = promoters_it
)
for (gff_path in names(gff_targets)) {
  rtracklayer::export.gff3(object = gff_targets[[gff_path]], con = gff_path)
}

# Extract the sequence of every window from the genome package, label each
# record, and write one FASTA file per region set.
pf_genome <- BSgenome.Pfalciparum.PlasmoDB.v24

# Record labels for promoter windows: the `Parent` value with the first "rna_"
# and the first "-1" removed.
promoter_labels <- function(regions) {
  labels <- unlist(regions$Parent)
  labels <- stringr::str_replace(labels, "rna_", "")
  stringr::str_replace(labels, "-1", "")
}

# Windows upstream of the annotated genes are labelled by gene ID.
seqs_upstream_ORF <- BSgenome::getSeq(pf_genome, upstream_ORF)
names(seqs_upstream_ORF) <- unlist(upstream_ORF$ID)
Biostrings::writeXStringSet(
  seqs_upstream_ORF,
  "../output/tfbs_analysis/upstream_ORF.fasta"
)

seqs3d7 <- BSgenome::getSeq(pf_genome, promoters_3d7)
names(seqs3d7) <- promoter_labels(promoters_3d7)
Biostrings::writeXStringSet(
  seqs3d7,
  "../output/tfbs_analysis/promoters_3d7.fasta"
)

seqshb3 <- BSgenome::getSeq(pf_genome, promoters_hb3)
names(seqshb3) <- promoter_labels(promoters_hb3)
Biostrings::writeXStringSet(
  seqshb3,
  "../output/tfbs_analysis/promoters_hb3.fasta"
)

seqsit <- BSgenome::getSeq(pf_genome, promoters_it)
names(seqsit) <- promoter_labels(promoters_it)
Biostrings::writeXStringSet(
  seqsit,
  "../output/tfbs_analysis/promoters_it.fasta"
)

We should also create FASTA files whose sequence names are genomic coordinates, so that the motif hits can be loaded into a genome browser:

# Re-extract each strain's promoter windows from the genome FASTA with
# bedtools; -s requests strand-aware extraction.
genome=../data/genomes/pf3d7/PlasmoDB-24_Pfalciparum3D7_Genome.fasta
for strain in 3d7 hb3 it; do
  bedtools getfasta \
    -fi "${genome}" \
    -bed "../output/tfbs_analysis/promoters_${strain}.gff" \
    -fo "../output/tfbs_analysis/promoters_${strain}_genome_coords.fasta" \
    -s
done

Now we need to run fimo on these regions and search for our motifs:

# generate background files (fasta-get-markov with -m 3) for every sequence set
out=../output/tfbs_analysis
motifs=../data/motif_cores/ap2_pbm_cores.meme

for strain in 3d7 hb3 it; do
  fasta-get-markov -m 3 \
    "${out}/promoters_${strain}.fasta" \
    "${out}/promoters_${strain}.background"
done

fasta-get-markov -m 3 \
  "${out}/upstream_ORF.fasta" \
  "${out}/upstream_ORF.background"

# run fimo for promoters with gene names
for strain in 3d7 hb3 it; do
  fimo --bgfile "${out}/promoters_${strain}.background" \
    --no-qvalue \
    -oc "${out}/promoters_${strain}" \
    --thresh 1e-2 \
    "${motifs}" \
    "${out}/promoters_${strain}.fasta"
done

fimo --bgfile "${out}/upstream_ORF.background" \
  --no-qvalue \
  -oc "${out}/upstream_ORF" \
  --thresh 1e-2 \
  "${motifs}" \
  "${out}/upstream_ORF.fasta"

# and for promoters with genome coordinates
# (same per-strain background file as the gene-name run)
for strain in 3d7 hb3 it; do
  fimo --bgfile "${out}/promoters_${strain}.background" \
    --no-qvalue \
    --parse-genomic-coord \
    -oc "${out}/promoters_${strain}_genome_coords" \
    --thresh 1e-2 \
    "${motifs}" \
    "${out}/promoters_${strain}_genome_coords.fasta"
done

Now we can import that data and analyze it for positional information:

# Plot where motif hits start within the searched sequences.
#
# For every distinct motif in <fimo_dir>/fimo.txt, draws the density of the
# hit `start` column and saves it as <fimo_dir>/<motif>.svg and
# <fimo_dir>/<motif>.png.
#
# fimo_dir: directory holding a fimo.txt whose first column header is
#           "# motif_id".
#
# Returns `fimo_dir` invisibly; called for its side effect of writing plots.
# The hit table stays local to the function, so nothing needs to be removed
# from the workspace afterwards.
plot_motif_start_density <- function(fimo_dir) {
  hits <- readr::read_tsv(file.path(fimo_dir, "fimo.txt")) %>%
    dplyr::rename(motif_id = `# motif_id`)

  for (motif in unique(hits$motif_id)) {
    motif_hits <- dplyr::filter(hits, motif_id == motif)
    # Disabled check kept from the original analysis:
    # print(ks.test(motif_hits$start, runif(10000), alternative = "less"))
    g <- ggplot2::ggplot(motif_hits, ggplot2::aes(x = start)) +
      ggplot2::geom_line(stat = "density") +
      ggplot2::xlab("Start Site") +
      ggplot2::ylab("Density") +
      ggplot2::ggtitle(motif)
    ggplot2::ggsave(plot = g, filename = paste0(fimo_dir, "/", motif, ".svg"))
    ggplot2::ggsave(plot = g, filename = paste0(fimo_dir, "/", motif, ".png"))
  }

  invisible(fimo_dir)
}

# Same plots for the windows upstream of annotated genes and for the
# promoter windows of each strain.
for (region_set in c("upstream_ORF",
                     "promoters_3d7",
                     "promoters_hb3",
                     "promoters_it")) {
  plot_motif_start_density(file.path("../output/tfbs_analysis", region_set))
}

ApiAP2 Activity Predictions

Here we will redo the analysis performed in Campbell et al. using motif hits within newly defined promoter regions. To perform this analysis, two R scripts need to be sourced from within the analysis working directory.

First generate_inputs.R, then estimate_apiap2_activity.R.

Bidirectional promoters

It would be interesting to see whether there is an enrichment for a particular motif found within bidirectional promoters. First let’s extract bidirectional promoter sequences:

# For each strain, turn the space between qualifying divergent gene pairs into
# "BiPromoter" ranges, export them as GFF3 and write their sequences as FASTA.
for (strain in c("3d7", "hb3", "it")) {

  # Keep pairs with 0 < dist < 1000 and cor >= 0.5.
  divergent <- readr::read_tsv(
    paste0("../output/neighboring_genes/", strain, "_divergent.tsv")
  ) %>%
    dplyr::filter(dist < 1000 & dist > 0 & cor >= 0.5)

  transcripts <- tibble::as_tibble(
    rtracklayer::import.gff3(
      paste0("../output/neighboring_genes/full_transcripts_", strain, ".gff")
    )
  )

  # Zero-row template: fixes the column order and types of the result, and is
  # the result itself when no pair passes the filter.
  biprom_template <- tibble::tibble(seqnames = character(),
                                    start = integer(),
                                    end = integer(),
                                    strand = character(),
                                    source = character(),
                                    type = character(),
                                    ID = character())

  # One single-row tibble per pair. seq_len() yields an empty sequence for an
  # empty table, whereas 1:nrow() would iterate over c(1, 0).
  pair_regions <- lapply(seq_len(nrow(divergent)), function(i) {
    start_gene <- dplyr::filter(transcripts, ID == divergent$left_gene[i])
    end_gene <- dplyr::filter(transcripts, ID == divergent$right_gene[i])
    # The range runs from the left transcript's end coordinate to the right
    # transcript's start coordinate and is always placed on the "+" strand.
    tibble::tibble(seqnames = as.character(start_gene$seqnames),
                   start = start_gene$end,
                   end = end_gene$start,
                   strand = "+",
                   source = "PlasmoDB",
                   type = "BiPromoter",
                   ID = paste0(start_gene$ID, "-", end_gene$ID))
  })

  # Bind once instead of growing the table inside the loop.
  biprom <- dplyr::bind_rows(c(list(biprom_template), pair_regions))

  biprom <- GenomicRanges::GRanges(biprom)
  rtracklayer::export.gff3(
    object = biprom,
    con = paste0("../output/tfbs_analysis/bidirectional_", strain, ".gff")
  )
  seqs <- BSgenome::getSeq(BSgenome.Pfalciparum.PlasmoDB.v24, biprom)
  names(seqs) <- biprom$ID
  Biostrings::writeXStringSet(
    seqs,
    paste0("../output/tfbs_analysis/bidirectional_", strain, ".fasta")
  )
}

Now create background files and run fimo:

# Background model, then motif search, for each strain's bidirectional
# promoter sequences.
out=../output/tfbs_analysis

for strain in 3d7 hb3 it; do
  fasta-get-markov -m 3 \
    "${out}/bidirectional_${strain}.fasta" \
    "${out}/bidirectional_${strain}.background"
done

for strain in 3d7 hb3 it; do
  fimo --bgfile "${out}/bidirectional_${strain}.background" \
    --no-qvalue \
    -oc "${out}/bidirectional_${strain}" \
    --thresh 1e-2 \
    ../data/motif_cores/ap2_pbm_cores.meme \
    "${out}/bidirectional_${strain}.fasta"
done

And now we can read those files in and check the motif occurrences:

# FIMO hits within the 3D7 bidirectional promoters, and the promoter ranges
# they were searched in.
bimotifs_3d7 <- dplyr::rename(
  readr::read_tsv("../output/tfbs_analysis/bidirectional_3d7/fimo.txt"),
  motif_id = `# motif_id`
)
biprom_3d7 <- tibble::as_tibble(
  rtracklayer::import.gff("../output/tfbs_analysis/bidirectional_3d7.gff")
)

# Pair every hit with its promoter and express the hit start (start.x, from
# the FIMO table) as a fraction of the promoter width.
tmp <- bimotifs_3d7 %>%
  dplyr::inner_join(biprom_3d7, by = c("sequence_name" = "ID")) %>%
  dplyr::transmute(motif_id,
                   sequence_name,
                   start.x,
                   width,
                   norm_start = start.x / width)

# One density plot of the normalised start per motif, saved as SVG and PNG.
plot_dir <- "../output/tfbs_analysis/bidirectional_3d7/"
for (motif_name in unique(tmp$motif_id)) {
  motif_hits <- dplyr::filter(tmp, motif_id == motif_name)
  density_plot <- ggplot(motif_hits, aes(x = norm_start)) +
    geom_line(stat = "density")
  ggsave(plot = density_plot, filename = paste0(plot_dir, motif_name, ".svg"))
  ggsave(plot = density_plot, filename = paste0(plot_dir, motif_name, ".png"))
}

Dynamic motif usage

Additionally, we can look at TSSs that we can confidently say are shifting and analyze the motifs within these regions to find a nice example to display. We did this for KAHRP:

# Background model and motif search for the short, then the long, isoform
# sequence. Unlike the promoter-wide runs, these use --thresh 1e-3 and do not
# pass --no-qvalue.
out=../output/tfbs_analysis

for isoform in short long; do
  fasta-get-markov -m 3 \
    "${out}/kahrp_${isoform}.fasta" \
    "${out}/kahrp_${isoform}.background"

  fimo --bgfile "${out}/kahrp_${isoform}.background" \
    -oc "${out}/kahrp_${isoform}" \
    --thresh 1e-3 \
    ../data/motif_cores/ap2_pbm_cores.meme \
    "${out}/kahrp_${isoform}.fasta"
done

Now we can look at which motifs are unique to the short and long isoforms.

# FIMO hits for the short and the long isoform sequence.
kahrp_short <- dplyr::rename(
  readr::read_tsv("../output/tfbs_analysis/kahrp_short/fimo.txt"),
  motif_id = `# motif_id`
)

kahrp_long <- dplyr::rename(
  readr::read_tsv("../output/tfbs_analysis/kahrp_long/fimo.txt"),
  motif_id = `# motif_id`
)

# Keep the hits whose motif has no hit at all in the other isoform.
short_unique <- dplyr::filter(
  kahrp_short,
  is.na(match(motif_id, kahrp_long$motif_id))
)
long_unique <- dplyr::filter(
  kahrp_long,
  is.na(match(motif_id, kahrp_short$motif_id))
)

DT::datatable(short_unique, rownames = FALSE)

DT::datatable(long_unique, rownames = FALSE)

Session Information

# Print the R version, platform, locale and the attached/loaded package
# versions used for this run.
sessionInfo()

R version 3.5.0 (2018-04-23)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Gentoo/Linux

Matrix products: default
BLAS: /usr/local/lib64/R/lib/libRblas.so
LAPACK: /usr/local/lib64/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] gdtools_0.1.7                        
 [2] bindrcpp_0.2.2                       
 [3] BSgenome.Pfalciparum.PlasmoDB.v24_1.0
 [4] BSgenome_1.48.0                      
 [5] rtracklayer_1.40.6                   
 [6] Biostrings_2.48.0                    
 [7] XVector_0.20.0                       
 [8] GenomicRanges_1.32.6                 
 [9] GenomeInfoDb_1.16.0                  
[10] org.Pf.plasmo.db_3.6.0               
[11] AnnotationDbi_1.42.1                 
[12] IRanges_2.14.10                      
[13] S4Vectors_0.18.3                     
[14] Biobase_2.40.0                       
[15] BiocGenerics_0.26.0                  
[16] scales_1.0.0                         
[17] cowplot_0.9.3                        
[18] magrittr_1.5                         
[19] forcats_0.3.0                        
[20] stringr_1.3.1                        
[21] dplyr_0.7.6                          
[22] purrr_0.2.5                          
[23] readr_1.1.1                          
[24] tidyr_0.8.1                          
[25] tibble_1.4.2                         
[26] ggplot2_3.0.0                        
[27] tidyverse_1.2.1                      

loaded via a namespace (and not attached):
 [1] nlme_3.1-137                bitops_1.0-6               
 [3] matrixStats_0.54.0          lubridate_1.7.4            
 [5] bit64_0.9-7                 httr_1.3.1                 
 [7] rprojroot_1.3-2             tools_3.5.0                
 [9] backports_1.1.2             DT_0.4                     
[11] R6_2.2.2                    DBI_1.0.0                  
[13] lazyeval_0.2.1              colorspace_1.3-2           
[15] withr_2.1.2                 tidyselect_0.2.4           
[17] bit_1.1-14                  compiler_3.5.0             
[19] git2r_0.23.0                cli_1.0.0                  
[21] rvest_0.3.2                 xml2_1.2.0                 
[23] DelayedArray_0.6.5          labeling_0.3               
[25] digest_0.6.15               Rsamtools_1.32.3           
[27] svglite_1.2.1               rmarkdown_1.10             
[29] R.utils_2.6.0               pkgconfig_2.0.2            
[31] htmltools_0.3.6             htmlwidgets_1.2            
[33] rlang_0.2.2                 readxl_1.1.0               
[35] rstudioapi_0.7              RSQLite_2.1.1              
[37] shiny_1.1.0                 bindr_0.1.1                
[39] jsonlite_1.5                crosstalk_1.0.0            
[41] BiocParallel_1.14.2         R.oo_1.22.0                
[43] RCurl_1.95-4.11             GenomeInfoDbData_1.1.0     
[45] Matrix_1.2-14               Rcpp_0.12.18               
[47] munsell_0.5.0               R.methodsS3_1.7.1          
[49] stringi_1.2.4               yaml_2.2.0                 
[51] SummarizedExperiment_1.10.1 zlibbioc_1.26.0            
[53] plyr_1.8.4                  grid_3.5.0                 
[55] blob_1.1.1                  promises_1.0.1             
[57] crayon_1.3.4                lattice_0.20-35            
[59] haven_1.1.2                 hms_0.4.2                  
[61] knitr_1.20                  pillar_1.3.0               
[63] XML_3.98-1.16               glue_1.3.0                 
[65] evaluate_0.11               modelr_0.1.2               
[67] httpuv_1.4.5                cellranger_1.1.0           
[69] gtable_0.2.0                assertthat_0.2.0           
[71] mime_0.5                    xtable_1.8-3               
[73] broom_0.5.0                 later_0.7.5                
[75] GenomicAlignments_1.16.0    memoise_1.1.0              
[77] workflowr_1.1.1

This R Markdown site was created with workflowr