#source("https://bioconductor.org/biocLite.R") #biocLite("Biostrings") library(Biostrings) #devtools::install_github("myersgroup/MotifFinder") library(MotifFinder) library(ggseqlogo) library(SDAtools) library(biomaRt) library(stringr) library(data.table) library(ggplot2) library(NMF) # devtools::install_github("renozao/NMF", "devel") nmf.options(grid.patch=TRUE) # stop blank pages appearing
SDAresults <- load_results(results_folder = "../data/conradV3/conradV3_sda_1/", data_path = "../data/conradV3/") rownames(SDAresults$loadings[[1]]) <- paste0("V",1:50) library(testisAtlas) tss <- download_tss() #saveRDS(tss, "../data/motifs/promoter_sequences/transcription_start_sites.rds") tss <- readRDS("../data/motifs/promoter_sequences/transcription_start_sites.rds") str(tss) # sm = soft masked (i.e. repeats in lowercase) download.file("ftp://ftp.ensembl.org/pub/release-92/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz", "../data/motifs/promoter_sequences/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz") system("gunzip ../data/motifs/promoter_sequences/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz") stopifnot(tools::md5sum("../data/motifs/promoter_sequences/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa") == "02ce19f0bb7d12f71bac95e3457c5f33") system("samtools faidx ../data/motifs/promoter_sequences/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa") # Extract +- 1,000bp tss <- tss[transcription_start_site-1000>0] fwrite(tss[,.(chromosome_name, transcription_start_site-1000, transcription_start_site+1000, external_gene_name, 0, strand)], "../data/motifs/promoter_sequences/all_genes_tss_1000.bed", col.names = FALSE, sep = "\t") system("bedtools getfasta -s -fi ../data/motifs/promoter_sequences/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa -bed ../data/motifs/promoter_sequences/all_genes_tss_1000.bed -name > ../data/motifs/promoter_sequences/all_genes_tss_1000.fasta")
# Human mkdir ../data/motifs/HocomocoV11/ curl http://hocomoco11.autosome.ru/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_pcm_HUMAN_mono.tar.gz > ../data/motifs/HocomocoV11/human.tar.gz tar zxvf ../data/motifs/HocomocoV11/human.tar.gz -C ../data/motifs/HocomocoV11/ mv ../data/motifs/HocomocoV11/pcm ../data/motifs/HocomocoV11/human # Mouse curl http://hocomoco11.autosome.ru/final_bundle/hocomoco11/full/MOUSE/mono/HOCOMOCOv11_full_pcm_MOUSE_mono.tar.gz > ../data/motifs/HocomocoV11/mouse.tar.gz tar zxvf ../data/motifs/HocomocoV11/mouse.tar.gz -C ../data/motifs/HocomocoV11/ mv ../data/motifs/HocomocoV11/pcm ../data/motifs/HocomocoV11/mouse
motif_files <- list.files("../data/motifs/HocomocoV11/", recursive = T) hocomoco <- vector("list", length(motif_files)) for(motif in seq_along(motif_files)){ pwm <- fread(paste0("../data/motifs/HocomocoV11/", motif_files[motif])) pwm <- t(as.matrix(pwm)) rownames(pwm) <- c("A", "C", "G", "T") hocomoco[[motif]] <- pwm names(hocomoco)[motif] <- gsub("[a-z]+/|.pcm","",motif_files[motif]) } str(motif_files) str(hocomoco[735]) str(hocomoco[1302]) saveRDS(hocomoco,"../data/motifs/Hocomoco_11_raw_motifs.rds") # Split full motif set into Core and Extra extra_motifs <- grep("\\.[1-3]\\.|\\.D\\.",motif_files, value=T) core_motifs <- motif_files[!motif_files %in% extra_motifs] # remove path and file ending extra_motifs <- gsub("[a-z]+/|.pcm","",extra_motifs) core_motifs <- gsub("[a-z]+/|.pcm","",core_motifs) saveRDS(hocomoco[extra_motifs],"../data/motifs/Hocomoco_11_raw_motifs_extra.rds") saveRDS(hocomoco[core_motifs],"../data/motifs/Hocomoco_11_raw_motifs_core.rds")
this step on the server
tss_seqs <- seqinr::read.fasta("../data/motifs/promoter_sequences/all_genes_tss_1000.fasta", forceDNAtolower = FALSE, as.string = TRUE) tss_seqs_s <- toupper(as.character(tss_seqs)) names(tss_seqs_s) <- gsub("\\(-\\)|\\(\\+\\)","",names(tss_seqs)) tss_seqs_s <- substr(tss_seqs_s,500,1500) tss_seqs_s <- tss_seqs_s[names(tss_seqs_s) %in% colnames(SDAresults$loadings[[1]])] library(MotifFinder) # Core motifs hocomoco <- readRDS("../data/motifs/Hocomoco_11_raw_motifs_core.rds") #find_motifs_parallel takes hocomoco argument silently system.time(results_core <- mclapply(seq_along(hocomoco), find_motifs_parallel, mc.cores = 16, mc.silent=TRUE, mc.preschedule = FALSE)) # user system elapsed # 118,276.39 68304.17 11818.16 # Warning message: # In mclapply(seq_along(hocomoco), find_motifs_parallel, mc.cores = 16, : # 9 function calls resulted in an error saveRDS(results_core, "../data/motifs/MotifFinder_Hocomoco_V11.rds") # Non Core motifs hocomoco <- readRDS("../data/motifs/Hocomoco_11_raw_motifs_extra.rds") #find_motifs_parallel takes hocomoco argument silently system.time(results_extra <- mclapply(seq_along(hocomoco), find_motifs_parallel, mc.cores = 16, mc.silent=TRUE, mc.preschedule = FALSE)) # V1 for extra motifs # user system elapsed # 855,947.01 27,336.79 62,126.83 # V2 # user system elapsed # 84,440.452 52,009.525 8,686.221 saveRDS(results_extra, "../data/motifs/MotifFinder_Hocomoco_V11_extra.rds")
Back on local computer
results_core <- readRDS("../data/motifs/MotifFinder_Hocomoco_V11.rds") results_extra <- readRDS("../data/motifs/MotifFinder_Hocomoco_V11_extra.rds") results_core <- results_core[-which(sapply(results_core, class)=="try-error")] results_extra <- results_extra[-which(sapply(results_extra, class)=="try-error")] stopifnot(which(sapply(results_core, class)=="try-error")==0) stopifnot(which(sapply(results_extra, class)=="try-error")==0) names(results_core) <- sapply(results_core, function(x) x$motifID) names(results_extra) <- sapply(results_extra, function(x) x$motifID) length(results_core) length(results_extra) stopifnot(length(unique(c(names(results_core),names(results_extra)))) == length(results_core) + length(results_extra)) # to extract just gene names #unlist(strsplit(sapply(results_core, function(x) x$motifID),"_"))[seq(1,length(results_core)*2,2)]
hocomoco_alphas_core <- sapply(results_core, function(x) x$alpha) hocomoco_alphas_extra <- sapply(results_extra, function(x) x$alpha) regprobs_matrix_core <- sapply(results_core, function(x) x$dt$regprob) rownames(regprobs_matrix_core) <- results_core[[1]]$dt$sequence regprobs_matrix_extra <- sapply(results_extra, function(x) x$dt$regprob) rownames(regprobs_matrix_extra) <- results_extra[[1]]$dt$sequence str(regprobs_matrix_core) str(regprobs_matrix_extra) saveRDS(regprobs_matrix_core, "../data/motifs/HocomocoV11_regprobs_matrix.rds") saveRDS(regprobs_matrix_extra, "../data/motifs/HocomocoV11_regprobs_matrix_extra.rds") saveRDS(cbind(regprobs_matrix_core,regprobs_matrix_extra), "../data/motifs/HocomocoV11_regprobs_matrix_core_and_extra.rds")
hocomoco_alphas_core <- unlist(hocomoco_alphas_core) hocomoco_alphas_extra <- unlist(hocomoco_alphas_extra) str(hocomoco_alphas_core) str(hocomoco_alphas_extra) hocomoco_alphas_core_matrix <- sapply(results_core, function(x) as.numeric(x$alphas)) hocomoco_alphas_extra_matrix <- sapply(results_extra, function(x) as.numeric(x$alphas)) rownames(hocomoco_alphas_core_matrix) <- 1:20 rownames(hocomoco_alphas_extra_matrix) <- 1:20 hocomoco_alphas_dt_core <- melt(data.table(motif=seq_along(hocomoco_alphas_core), t(hocomoco_alphas_core_matrix), motif_name=colnames(hocomoco_alphas_core_matrix)), id.vars=c("motif","motif_name"), variable.name = "iteration", value.name = "alpha") hocomoco_alphas_dt_extra <- melt(data.table(motif=seq_along(hocomoco_alphas_extra), t(hocomoco_alphas_extra_matrix), motif_name=colnames(hocomoco_alphas_extra_matrix)), id.vars=c("motif","motif_name"), variable.name = "iteration", value.name = "alpha") ggplot(hocomoco_alphas_dt_core, aes(iteration, alpha, group=motif)) + geom_line(alpha=0.15) + ggtitle("Alpha convergence per iteration (Core motifs)") ggplot(hocomoco_alphas_dt_core, aes(iteration, alpha, group=motif)) + geom_line(alpha=0.15) + scale_y_log10() + ggtitle("Alpha convergence per iteration (Core motifs) Log") ggplot(hocomoco_alphas_dt_extra, aes(iteration, alpha, group=motif)) + geom_line(alpha=0.15) + ggtitle("Alpha convergence per iteration (Extra motifs)") ggplot(hocomoco_alphas_dt_extra, aes(iteration, alpha, group=motif)) + geom_line(alpha=0.15) + scale_y_log10() + ggtitle("Alpha convergence per iteration (Extra motifs) Log")
hocomoco_time_core <- sapply(results_core, function(x) x$time) hocomoco_time_extra <- sapply(results_extra, function(x) x$time) plot(hocomoco_alphas_core, hocomoco_time_core[3,], main="Alpha vs Run time for core motifs") plot(log(hocomoco_alphas_core),log(hocomoco_time_core[3,]), main="Alpha vs Run time for core motifs (log)") plot(hocomoco_alphas_extra, hocomoco_time_extra[3,], main="Alpha vs Run time for extra motifs") plot(log(hocomoco_alphas_extra),log(hocomoco_time_extra[3,]), main="Alpha vs Run time for extra motifs (log)")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.