#source("https://bioconductor.org/biocLite.R")
#biocLite("Biostrings")
library(Biostrings)
#devtools::install_github("myersgroup/MotifFinder")
library(MotifFinder)
library(ggseqlogo)
library(SDAtools)
library(biomaRt)
library(stringr)
library(data.table)
library(ggplot2)
library(NMF) # devtools::install_github("renozao/NMF", "devel")
nmf.options(grid.patch=TRUE) # stop blank pages appearing

Load SDA & tss sequences

SDAresults <- load_results(results_folder = "../data/conradV3/conradV3_sda_1/", data_path = "../data/conradV3/")
rownames(SDAresults$loadings[[1]]) <- paste0("V",1:50)

library(testisAtlas)

tss <- download_tss()
#saveRDS(tss, "../data/motifs/promoter_sequences/transcription_start_sites.rds")

tss <- readRDS("../data/motifs/promoter_sequences/transcription_start_sites.rds")
str(tss)

# sm = soft masked (i.e. repeats in lowercase)
download.file("ftp://ftp.ensembl.org/pub/release-92/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz",
              "../data/motifs/promoter_sequences/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz")
system("gunzip ../data/motifs/promoter_sequences/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz")
stopifnot(tools::md5sum("../data/motifs/promoter_sequences/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa") == "02ce19f0bb7d12f71bac95e3457c5f33")
system("samtools faidx ../data/motifs/promoter_sequences/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa")

# Extract +- 1,000bp

tss <- tss[transcription_start_site-1000>0]

fwrite(tss[,.(chromosome_name, transcription_start_site-1000, transcription_start_site+1000, external_gene_name, 0, strand)], 
       "../data/motifs/promoter_sequences/all_genes_tss_1000.bed", col.names = FALSE, sep = "\t")

system("bedtools getfasta -s -fi ../data/motifs/promoter_sequences/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa -bed ../data/motifs/promoter_sequences/all_genes_tss_1000.bed -name > ../data/motifs/promoter_sequences/all_genes_tss_1000.fasta")

Calculate regprobs for all TSS Hocomoco

Download hocomoco

# Human
mkdir ../data/motifs/HocomocoV11/
curl http://hocomoco11.autosome.ru/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_pcm_HUMAN_mono.tar.gz > ../data/motifs/HocomocoV11/human.tar.gz
tar zxvf ../data/motifs/HocomocoV11/human.tar.gz -C ../data/motifs/HocomocoV11/
mv ../data/motifs/HocomocoV11/pcm ../data/motifs/HocomocoV11/human

# Mouse

curl http://hocomoco11.autosome.ru/final_bundle/hocomoco11/full/MOUSE/mono/HOCOMOCOv11_full_pcm_MOUSE_mono.tar.gz > ../data/motifs/HocomocoV11/mouse.tar.gz
tar zxvf ../data/motifs/HocomocoV11/mouse.tar.gz -C ../data/motifs/HocomocoV11/
mv ../data/motifs/HocomocoV11/pcm ../data/motifs/HocomocoV11/mouse

PCM to PWM for hocomoco

motif_files <- list.files("../data/motifs/HocomocoV11/", recursive = T)

hocomoco <- vector("list", length(motif_files))

for(motif in seq_along(motif_files)){
  pwm <- fread(paste0("../data/motifs/HocomocoV11/", motif_files[motif]))
  pwm <- t(as.matrix(pwm))
  rownames(pwm) <- c("A", "C", "G", "T")
  hocomoco[[motif]] <- pwm
  names(hocomoco)[motif] <- gsub("[a-z]+/|.pcm","",motif_files[motif])
}

str(motif_files)
str(hocomoco[735])
str(hocomoco[1302])

saveRDS(hocomoco,"../data/motifs/Hocomoco_11_raw_motifs.rds")

# Split full motif set into Core and Extra
extra_motifs <- grep("\\.[1-3]\\.|\\.D\\.",motif_files, value=T)
core_motifs <- motif_files[!motif_files %in% extra_motifs]

# remove path and file ending
extra_motifs <- gsub("[a-z]+/|.pcm","",extra_motifs)
core_motifs <- gsub("[a-z]+/|.pcm","",core_motifs)


saveRDS(hocomoco[extra_motifs],"../data/motifs/Hocomoco_11_raw_motifs_extra.rds")
saveRDS(hocomoco[core_motifs],"../data/motifs/Hocomoco_11_raw_motifs_core.rds")

Run MotifFinder

this step on the server

tss_seqs <- seqinr::read.fasta("../data/motifs/promoter_sequences/all_genes_tss_1000.fasta", forceDNAtolower = FALSE, as.string = TRUE)
tss_seqs_s <- toupper(as.character(tss_seqs))
names(tss_seqs_s) <- gsub("\\(-\\)|\\(\\+\\)","",names(tss_seqs))

tss_seqs_s <- substr(tss_seqs_s,500,1500)

tss_seqs_s <- tss_seqs_s[names(tss_seqs_s) %in% colnames(SDAresults$loadings[[1]])]

library(MotifFinder)

# Core motifs

hocomoco <- readRDS("../data/motifs/Hocomoco_11_raw_motifs_core.rds")

#find_motifs_parallel takes hocomoco argument silently

system.time(results_core <- mclapply(seq_along(hocomoco), find_motifs_parallel, mc.cores = 16, mc.silent=TRUE, mc.preschedule = FALSE))

#     user    system   elapsed
# 118,276.39  68304.17  11818.16
# Warning message:
# In mclapply(seq_along(hocomoco), find_motifs_parallel, mc.cores = 16,  :
#   9 function calls resulted in an error

saveRDS(results_core, "../data/motifs/MotifFinder_Hocomoco_V11.rds")


# Non Core motifs

hocomoco <- readRDS("../data/motifs/Hocomoco_11_raw_motifs_extra.rds")

#find_motifs_parallel takes hocomoco argument silently

system.time(results_extra <- mclapply(seq_along(hocomoco), find_motifs_parallel, mc.cores = 16, mc.silent=TRUE, mc.preschedule = FALSE))

# V1 for extra motifs
#     user    system   elapsed
# 855,947.01  27,336.79  62,126.83

# V2
#   user    system   elapsed
# 84,440.452 52,009.525  8,686.221


saveRDS(results_extra, "../data/motifs/MotifFinder_Hocomoco_V11_extra.rds")

Load results

Back on local computer

results_core <- readRDS("../data/motifs/MotifFinder_Hocomoco_V11.rds")
results_extra <- readRDS("../data/motifs/MotifFinder_Hocomoco_V11_extra.rds")

results_core <- results_core[-which(sapply(results_core, class)=="try-error")]
results_extra <- results_extra[-which(sapply(results_extra, class)=="try-error")]
stopifnot(which(sapply(results_core, class)=="try-error")==0)
stopifnot(which(sapply(results_extra, class)=="try-error")==0)

names(results_core) <- sapply(results_core, function(x) x$motifID)
names(results_extra) <- sapply(results_extra, function(x) x$motifID)

length(results_core)
length(results_extra)

stopifnot(length(unique(c(names(results_core),names(results_extra)))) == length(results_core) + length(results_extra))

# to extract just gene names
#unlist(strsplit(sapply(results_core, function(x) x$motifID),"_"))[seq(1,length(results_core)*2,2)]

Clean & create matrix of regprobs

hocomoco_alphas_core <- sapply(results_core, function(x) x$alpha)
hocomoco_alphas_extra <- sapply(results_extra, function(x) x$alpha)

regprobs_matrix_core <- sapply(results_core, function(x) x$dt$regprob)
rownames(regprobs_matrix_core) <- results_core[[1]]$dt$sequence

regprobs_matrix_extra <- sapply(results_extra, function(x) x$dt$regprob)
rownames(regprobs_matrix_extra) <- results_extra[[1]]$dt$sequence

str(regprobs_matrix_core)
str(regprobs_matrix_extra)

saveRDS(regprobs_matrix_core, "../data/motifs/HocomocoV11_regprobs_matrix.rds")
saveRDS(regprobs_matrix_extra, "../data/motifs/HocomocoV11_regprobs_matrix_extra.rds")
saveRDS(cbind(regprobs_matrix_core,regprobs_matrix_extra),
        "../data/motifs/HocomocoV11_regprobs_matrix_core_and_extra.rds")

Check convergance

hocomoco_alphas_core <- unlist(hocomoco_alphas_core)
hocomoco_alphas_extra <- unlist(hocomoco_alphas_extra)

str(hocomoco_alphas_core)
str(hocomoco_alphas_extra)

hocomoco_alphas_core_matrix <- sapply(results_core, function(x) as.numeric(x$alphas))
hocomoco_alphas_extra_matrix <- sapply(results_extra, function(x) as.numeric(x$alphas))

rownames(hocomoco_alphas_core_matrix) <- 1:20
rownames(hocomoco_alphas_extra_matrix) <- 1:20

hocomoco_alphas_dt_core <- melt(data.table(motif=seq_along(hocomoco_alphas_core), t(hocomoco_alphas_core_matrix), motif_name=colnames(hocomoco_alphas_core_matrix)),
                         id.vars=c("motif","motif_name"),
                         variable.name = "iteration",
                         value.name = "alpha")

hocomoco_alphas_dt_extra <- melt(data.table(motif=seq_along(hocomoco_alphas_extra), t(hocomoco_alphas_extra_matrix), motif_name=colnames(hocomoco_alphas_extra_matrix)),
                         id.vars=c("motif","motif_name"),
                         variable.name = "iteration",
                         value.name = "alpha")

ggplot(hocomoco_alphas_dt_core, aes(iteration, alpha, group=motif)) + geom_line(alpha=0.15) + ggtitle("Alpha convergence per iteration (Core motifs)")
ggplot(hocomoco_alphas_dt_core, aes(iteration, alpha, group=motif)) + geom_line(alpha=0.15) + scale_y_log10() + ggtitle("Alpha convergence per iteration (Core motifs) Log")

ggplot(hocomoco_alphas_dt_extra, aes(iteration, alpha, group=motif)) + geom_line(alpha=0.15) + ggtitle("Alpha convergence per iteration (Extra motifs)")
ggplot(hocomoco_alphas_dt_extra, aes(iteration, alpha, group=motif)) + geom_line(alpha=0.15) + scale_y_log10() + ggtitle("Alpha convergence per iteration (Extra motifs) Log")

check Time running

hocomoco_time_core <- sapply(results_core, function(x) x$time)
hocomoco_time_extra <- sapply(results_extra, function(x) x$time)

plot(hocomoco_alphas_core, hocomoco_time_core[3,], main="Alpha vs Run time for core motifs")
plot(log(hocomoco_alphas_core),log(hocomoco_time_core[3,]), main="Alpha vs Run time for core motifs (log)")

plot(hocomoco_alphas_extra, hocomoco_time_extra[3,], main="Alpha vs Run time for extra motifs")
plot(log(hocomoco_alphas_extra),log(hocomoco_time_extra[3,]), main="Alpha vs Run time for extra motifs (log)")


MyersGroup/testisAtlas documentation built on Nov. 29, 2020, 9:21 p.m.