knitr::opts_chunk$set(echo = TRUE) library(tidyverse) library(Biostrings) library(cowplot)
#MT_FMR1 <- readDNAStringSet("mydata/FMR1dat/SD3deltaLRGenes.longest3UTR.fa") #MT_FMR1_ctrl <- readDNAStringSet("mydata/FMR1dat/UnchangeddeltaLRGenes.longest3UTR.fa") #FMR1_genes <- substr(names(MT_FMR1), 1, 18) #FMR1_genes_ctrl <- substr(names(MT_FMR1_ctrl), 1, 18) #FMR1_UTR3_tx <- gene2Tx(longest_mm, FMR1_genes, "UTR3") #1s #FMR1_UTR3_tx_ctrl <- gene2Tx(longest_mm, FMR1_genes_ctrl, "UTR3") #1s #write_Sequence(mm_f_gff, FMR1_UTR3_tx, "UTR3", "mydata/FMR1dat/longest_3UTR_FMR1", "fa") #write_Sequence(mm_f_gff, FMR1_UTR3_tx_ctrl, "UTR3", "mydata/FMR1dat/longest_3UTR_FMR1_ctrl", "fa") longest_3UTR_FMR1 <- readDNAStringSet("mydata/FMR1dat/longest_3UTR_FMR1.fa") #30s longest_3UTR_FMR1_ctrl <- readDNAStringSet("mydata/FMR1dat/longest_3UTR_FMR1_ctrl.fa") #30s
#This function simply returns the full counts of each kmer for each gene in a set #access to raw data at a gene level. kmer_by_gene <- function(DNAStringSet, k){ print("counting kmers...", quote = FALSE) kmer_counts <- Biostrings::oligonucleotideFrequency(DNAStringSet, width = k) %>% dplyr::as_tibble() %>% dplyr::mutate(gene = names(DNAStringSet)) %>% dplyr::select(gene, dplyr::everything()) print("counting complete.", quote= FALSE) return(kmer_counts) } #This function compares kmer content between two DNAStringSets and returns a table with statistics for frequency of each kmer #it doesnt take terribly long and checks that case sequences are excluded from control sequences. #we could probably add options for pvalue correction kmer_compare <- function(caseDNAStringSet, ctrlDNAStringSet, k){ if (any(names(caseDNAStringSet) %in% names(ctrlDNAStringSet))){ warning("some sequences in case set are also in the control set. This is not recommended.") } print("counting kmers...", quote = FALSE) case_kmer <- Biostrings::oligonucleotideFrequency(caseDNAStringSet, width = k) %>% colSums() %>% data.frame(kmer = names(.), case = .) %>% dplyr::as_tibble() ctrl_kmer <- Biostrings::oligonucleotideFrequency(ctrlDNAStringSet, width = k) %>% colSums() %>% data.frame(kmer = names(.), ctrl = .) %>% dplyr::as_tibble() print("counting complete.", quote= FALSE) #compare kmers between case and ctrl takes ~30s fisher <- function(a, b, c, d){ mat <- matrix(c(a, b, c, d), nr = 2) fisher.test(mat, alternative = "two.sided")$p.value } print("calculating kmer statistics...", print = FALSE) kmer_stats <- dplyr::left_join(ctrl_kmer, case_kmer) %>% na.omit() %>% dplyr::mutate(ctrl_freq = ctrl / sum(ctrl), case_freq = case / sum(case), log2FC = log2(case_freq/ctrl_freq), ctrl_tot = sum(ctrl)-ctrl, case_tot = sum(case)-case) %>% dplyr::rowwise() %>% dplyr:: mutate(pval = fisher(case, ctrl, case_tot, ctrl_tot), p_adj = p.adjust(pval, method = "BH", 4^k)) %>% dplyr::arrange(p_adj) print("calculations complete.", quote = FALSE) return(kmer_stats) } #kmer_by_gene(longest_3UTR_FMR1, 6) #FMR1_kmer <- kmer_compare(longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl, 6) #saveRDS(FMR1_kmer, "mydata/FMR1dat/kmer_FMR1_compare.txt") FMR1_kmer <- readRDS("mydata/FMR1dat/kmer_FMR1_compare.txt") FMR1_kmer %>% arrange(p_adj) ##for later pos_FMR1_kmer <- FMR1_kmer %>% filter(p_adj < 0.05, log2FC > 0) %>% pull(., kmer)
#this returns length of each sequence in a nice tibble get_length <- function(DNAStringSet){ dplyr::as_tibble(data.frame(gene = names(DNAStringSet), length = Biostrings::width(DNAStringSet))) } #This compares sequence length between case and control DNAStringSets #I am really uncertain of what the output should be... #The table right now includes means, wilcox.test results and cliffs delta effect size metrics... #we could add options as to what test to perform rather than just wilcox.test #feedback is welcome length_compare <- function(caseDNAStringSet, ctrlDNAStringSet){ if (any(names(caseDNAStringSet) %in% names(ctrlDNAStringSet))){ warning("some sequences in case set are also in the control set. This is not recommended.") } wilcox.p <- wilcox.test(Biostrings::width(caseDNAStringSet), Biostrings::width(ctrlDNAStringSet))$p.value mean_case <- mean(Biostrings::width(caseDNAStringSet)) mean_ctrl <- mean(Biostrings::width(ctrlDNAStringSet)) mean_FC <- mean_case/mean_ctrl CliffDelta <- effsize::cliff.delta(Biostrings::width(caseDNAStringSet), Biostrings::width(ctrlDNAStringSet))$estimate lowerCD <- effsize::cliff.delta(Biostrings::width(caseDNAStringSet), Biostrings::width(ctrlDNAStringSet))$conf.int[1] upperCD <- effsize::cliff.delta(Biostrings::width(caseDNAStringSet), Biostrings::width(ctrlDNAStringSet))$conf.int[2] data.frame(wilcox.p, mean_case, mean_ctrl, mean_FC, CliffDelta, lowerCD, upperCD) } get_length(longest_3UTR_FMR1) length_compare(longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl)
#this returns GC of each sequence in a nice tibble get_GC <- function(DNAStringSet){ dplyr::as_tibble(data.frame(gene = names(DNAStringSet), GC = Biostrings::letterFrequency(DNAStringSet, "GC")/ BioStrings::width(DNAStringSet))) } #This is similar to the last function but with GC content. I again am uncertain of the output. GC_compare <- function(caseDNAStringSet, ctrlDNAStringSet){ if (any(names(caseDNAStringSet) %in% names(ctrlDNAStringSet))){ warning("some sequences in case set are also in the control set. This is not recommended.") } GC_case <- Biostrings::letterFrequency(caseDNAStringSet, "GC") / Biostrings::width(caseDNAStringSet) GC_ctrl <- Biostrings::letterFrequency(ctrlDNAStringSet, "GC") / Biostrings::width(ctrlDNAStringSet) wilcox.p <- wilcox.test(GC_case, GC_ctrl)$p.value mean_case <- mean(GC_case) mean_ctrl <- mean(GC_ctrl) mean_FC <- mean_case/mean_ctrl CliffDelta <- effsize::cliff.delta(GC_case, GC_ctrl)$estimate lowerCD <- effsize::cliff.delta(GC_case, GC_ctrl)$conf.int[1] upperCD <- effsize::cliff.delta(GC_case, GC_ctrl)$conf.int[2] data.frame(wilcox.p, mean_case, mean_ctrl, mean_FC, CliffDelta, lowerCD, upperCD) } get_GC(longest_3UTR_FMR1) GC_compare(longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl)
#This function simply returns every motif counts for each gene #begin with folder containing PWM for motifs of intrest (downloaded from CisBPRNA called "pwms_all_motifs") #also need RBP info for the motifs (dowloaded from CisBPRNA called "RBP_Information_all_motifs") cisBPRNA_by_gene <- function(motif_path, RBPinfo, DNAStringset){ #get paths to each motif pwm print("getting Motif and RBP data...", quote = FALSE) motif_paths <- list.files(path = motif_path, full.names = TRUE) motif_info <- file.info(motif_paths) motif_info <- motif_info[motif_info$size != 0, ] motifs <- motif_info %>% dplyr::as_tibble(rownames = "PATH") %>% dplyr::mutate(motif = stringr::str_match(PATH, "motifs/(.*?).txt")[,2]) %>% dplyr::select(PATH, motif) #merge motif paths with RBP info RBP_info <- read.table(RBPinfo, header = TRUE, sep = "\t") RBP_info <- RBP_info %>% dplyr::as_tibble() %>% dplyr::select(Motif_ID, RBP_Name) %>% dplyr::filter(Motif_ID != ".") %>% dplyr::group_by(Motif_ID) %>% dplyr::summarise(RBP_name = dplyr::first(RBP_Name)) motifs <- dplyr::left_join(motifs, RBP_info, by = c("motif" = "Motif_ID")) print("Getting PWM data", quote = FALSE) motifs <- motifs %>% dplyr::mutate(PWM = lapply(PATH, function(x) t(read.table(x, header = TRUE, row.names = 1, col.names = c("pos", "A", "C", "G", "T"))))) print("Counting motif occurances...", quote = FALSE) counts_list <- lapply(motifs$PWM, function(x) lapply(DNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "gene")) names(counts_list) <- motifs$motif print("Counting motif occurances complete.", quote = FALSE) motif_by_gene <- dplyr::bind_rows(counts_list, .id = "motif") %>% tidyr::spread(gene, value = value) %>% dplyr::left_join(., RBP_info, by = c("motif" = "Motif_ID")) %>% dplyr::select(motif, RBP_name, dplyr::everything()) return(motif_by_gene) } #This function compare the freqency of motifs in a case and control DNAStringSet #begin with folder containing PWM for motifs of intrest (downloaded from CisBPRNA called "pwms_all_motifs") #also need RBP info for the motifs (dowloaded from CisBPRNA called "RBP_Information_all_motifs") cisBPRNA_compare <- function(motif_path, RBPinfo, caseDNAStringset, ctrlDNAStringSet){ #get paths to each motif pwm print("getting Motif and RBP data...", quote = FALSE) motif_paths <- list.files(path = motif_path, full.names = TRUE) motif_info <- file.info(motif_paths) motif_info <- motif_info[motif_info$size != 0, ] motifs <- motif_info %>% dplyr::as_tibble(rownames = "PATH") %>% dplyr::mutate(motif = stringr::str_match(PATH, "motifs/(.*?).txt")[,2]) %>% dplyr::select(PATH, motif) #merge motif paths with RBP info RBP_info <- read.table(RBPinfo, header = TRUE, sep = "\t") RBP_info <- RBP_info %>% dplyr::as_tibble() %>% dplyr::select(Motif_ID, RBP_Name) %>% dplyr::filter(Motif_ID != ".") %>% dplyr::group_by(Motif_ID) %>% dplyr::summarise(RBP_name = dplyr::first(RBP_Name)) motifs <- dplyr::left_join(motifs, RBP_info, by = c("motif" = "Motif_ID")) fisher <- function(a, b, c, d){ mat <- matrix(c(a, b, c, d), nr = 2) fisher.test(mat, alternative = "two.sided")$p.value } print("Counting motif occurances and calculating statistics...", quote = FALSE) motifs <- motifs %>% mutate(PWM = lapply(PATH, function(x) t(read.table(x, header = TRUE, row.names = 1, col.names = c("pos", "A", "C", "G", "T")))), case = suppressWarnings(unlist(lapply(PWM, function(x) lapply(caseDNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), ctrl = suppressWarnings(unlist(lapply(PWM, function(x) lapply(ctrlDNAStringSet, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), case_freq = case / sum(Biostrings::width(caseDNAStringset)), ctrl_freq = ctrl / sum(Biostrings::width(ctrlDNAStringSet)), log2FC = log2(case_freq/ctrl_freq), case_tot = sum(case)-case, ctrl_tot = sum(ctrl)-ctrl) %>% dplyr::rowwise() %>% dplyr::mutate(pval = fisher(case, ctrl, case_tot, ctrl_tot), p_adj = p.adjust(pval, method = "BH", nrow(motifs))) %>% dplyr::select(RBP_name, motif, case, ctrl, case_freq, ctrl_freq, log2FC, case_tot, ctrl_tot, pval, p_adj) return(motifs) } #cisBPRNA_by_gene("mydata/CisBPRNAdat/mm/pwms_all_motifs", "mydata/CisBPRNAdat/mm/RBP_Information_all_motifs.txt", longest_3UTR_FMR1) #FMR1_motif <- cisBPRNA_compare("mydata/CisBPRNAdat/mm/pwms_all_motifs", "mydata/CisBPRNAdat/mm/RBP_Information_all_motifs.txt", longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl) #saveRDS(FMR1_motif, "mydata/FMR1dat/cisbpMotif_FMR1_compare.txt") FMR1_motif <- readRDS("mydata/FMR1dat/cisbpMotif_FMR1_compare.txt") FMR1_motif %>% arrange(p_adj)
#This function simply returns every motif counts for each gene #begin with folder containing PWM for motifs of intrest (from D.Dominguez called "RBNS_PWMs") RBNS_by_gene <- function(motif_path, DNAStringset){ #get paths to each motif pwm print("getting Motif and RBP data...", quote = FALSE) motif_paths <- list.files(path = motif_path, full.names = TRUE) motif_info <- file.info(path = motif_paths) motifs <- motif_info %>% dplyr::as_tibble(rownames = "PATH") %>% dplyr::mutate(motif = stringr::str_match(PATH, "PWMs/(.*?).PWM")[,2]) %>% dplyr::select(PATH, motif) print("Getting PWM data", quote = FALSE) motifs <- motifs %>% dplyr::mutate(PWM = lapply(PATH, function(x) t(read.table(x, skip = 1, row.names = 1, header = TRUE, col.names = c("pos","A", "C", "G", "T"))))) print("Counting motif occurances...", quote = FALSE) counts_list <- lapply(motifs$PWM, function(x) lapply(DNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "gene")) names(counts_list) <- motifs$motif print("Counting motif occurance complete.", quote = FALSE) motif_by_gene <- dplyr::bind_rows(counts_list, .id = "motif") %>% tidyr::spread(gene, value = value) return(motif_by_gene) } #This function compare the freqency of motifs in a case and control DNAStringSet #begin with folder containing PWM for motifs of intrest (from D.Dominguez called "RBNS_PWMs") RBNS_compare <- function(motif_path, caseDNAStringset, ctrlDNAStringSet){ #get paths to each motif pwm print("getting Motif and RBP data...", quote = FALSE) motif_paths <- list.files(path = motif_path, full.names = TRUE) motif_info <- file.info(path = motif_paths) motifs <- motif_info %>% dplyr::as_tibble(rownames = "PATH") %>% dplyr::mutate(motif = stringr::str_match(PATH, "PWMs/(.*?).PWM")[,2]) %>% dplyr::select(PATH, motif) fisher <- function(a, b, c, d){ mat <- matrix(c(a, b, c, d), nr = 2) fisher.test(mat, alternative = "two.sided")$p.value } print("Counting motif occurances and calculating statistics...", quote = FALSE) motifs <- motifs %>% mutate(PWM = lapply(PATH, function(x) t(read.table(x, skip = 1, row.names = 1, header = TRUE, col.names = c("pos","A", "C", "G", "T")))), case = suppressWarnings(unlist(lapply(PWM, function(x) lapply(caseDNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), ctrl = suppressWarnings(unlist(lapply(PWM, function(x) lapply(ctrlDNAStringSet, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), case_freq = case / sum(Biostrings::width(caseDNAStringset)), ctrl_freq = ctrl / sum(Biostrings::width(ctrlDNAStringSet)), log2FC = log2(case_freq/ctrl_freq), case_tot = sum(case)-case, ctrl_tot = sum(ctrl)-ctrl) %>% dplyr::rowwise() %>% dplyr::mutate(pval = fisher(case, ctrl, case_tot, ctrl_tot), p_adj = p.adjust(pval, method = "BH", nrow(motifs))) %>% dplyr::select(motif, case, ctrl, case_freq, ctrl_freq, log2FC, case_tot, ctrl_tot, pval, p_adj) return(motifs) } #RBNS_by_gene("mydata/RBNSdat/RBNS_PWMs", longest_3UTR_FMR1) #FMR1_RBNS <- RBNS_compare("mydata/RBNSdat/RBNS_PWMs", longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl) #saveRDS(FMR1_RBNS, "mydata/FMR1dat/RBNSMotif_FMR1_compare.txt") FMR1_RBNS <- readRDS("mydata/FMR1dat/RBNSMotif_FMR1_compare.txt") FMR1_RBNS %>% arrange(p_adj)
#custom PWM lists should contain a list of named matrices where the name is unique, and the matrix columns represent positions, and the rows are the probability of each base "A", "C", "G", and "T". Motif_by_gene <- function(PWM_list, DNAStringset){ print("Checking PWM data", quote = FALSE) if (typeof(PWM_list) != "list"){ stop("PWM_list must be a list") } if (any(as.numeric(lapply(PWM_list, nrow)) != 4)){ stop("Error: Ensure all PWM matricies have exactly 4 rows (\"A\", \"C\", \"G\" and \"T\")") } motifs <- names(PWM_list) %>% dplyr::as_tibble() %>% dplyr::rename("motif" = value) %>% dplyr::mutate(PWM = unname(PWM_list)) print("Counting motif occurances...", quote = FALSE) counts_list <- lapply(motifs$PWM, function(x) lapply(DNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "gene")) names(counts_list) <- motifs$motif print("Counting motif occurance complete.", quote = FALSE) motif_by_gene <- dplyr::bind_rows(counts_list, .id = "motif") %>% tidyr::spread(gene, value = value) return(motif_by_gene) } Motif_compare <- function(PWM_list, caseDNAStringset, ctrlDNAStringSet){ print("Checking PWM data", quote = FALSE) if (typeof(PWM_list) != "list"){ stop("PWM_list must be a list") } if (any(as.numeric(lapply(PWM_list, nrow)) != 4)){ stop("Error: Ensure all PWM matricies have exactly 4 rows (\"A\", \"C\", \"G\" and \"T\")") } motifs <- names(PWM_list) %>% dplyr::as_tibble() %>% dplyr::rename("motif" = value) %>% dplyr::mutate(PWM = unname(PWM_list)) fisher <- function(a, b, c, d){ mat <- matrix(c(a, b, c, d), nr = 2) fisher.test(mat, alternative = "two.sided")$p.value } print("Counting motif occurances and calculating statistics...", quote = FALSE) motifs <- motifs %>% mutate(case = suppressWarnings(unlist(lapply(PWM, function(x) lapply(caseDNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), ctrl = suppressWarnings(unlist(lapply(PWM, function(x) lapply(ctrlDNAStringSet, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), case_freq = case / sum(Biostrings::width(caseDNAStringset)), ctrl_freq = ctrl / sum(Biostrings::width(ctrlDNAStringSet)), log2FC = log2(case_freq/ctrl_freq), case_tot = sum(case)-case, ctrl_tot = sum(ctrl)-ctrl) %>% dplyr::rowwise() %>% dplyr::mutate(pval = fisher(case, ctrl, case_tot, ctrl_tot), p_adj = p.adjust(pval, method = "BH", nrow(motifs))) %>% dplyr::select(motif, case, ctrl, case_freq, ctrl_freq, log2FC, case_tot, ctrl_tot, pval, p_adj) %>% dplyr::arrange(p_adj) return(motifs) print("All Finished.", quote = FALSE) } custom_PWM_list <- readRDS("mydata/example_custom_PWM_list") custom_PWM_by_gene(custom_PWM_list, longest_3UTR_FMR1) custom_PWM_compare(custom_PWM_list, longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl) RBNS_PWM <- FeatureReachR::RBNS_PWM CISBPRNA_mm_PWM <- FeatureReachR::CISBPRNA_PWM cisbp_new_res <- custom_PWM_compare(CISBPRNA_PWM, longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl)
##This function will create the logical matrix needed to estimate RBP motif occurance from kmers ##We will supply 4-7 mers related to both CisBPRNA and RBNS motifs ##These should be used unless a k outside that range is desired ##This will create logical matrices for those odd cases. relate_Kmer_RBNS <- function(k, motif_path){ ##make DNAStringSets for every kmer x <- c("T", "A", "C", "G") kmers <- do.call(expand.grid, rep(list(x), k)) %>% tidyr::unite(kmer, sep = "") %>% dplyr::pull(., kmer) %>% Biostrings::DNAStringSet() names(kmers) <- do.call(expand.grid, rep(list(x), k)) %>% tidyr::unite(kmer, sep = "") %>% dplyr::pull(., kmer) #get paths to each motif pwm print("getting Motif and RBP data...", quote = FALSE) motif_paths <- list.files(path = motif_path, full.names = TRUE) motif_info <- file.info(path = motif_paths) motifs <- motif_info %>% dplyr::as_tibble(rownames = "PATH") %>% dplyr::mutate(motif = stringr::str_match(PATH, "PWMs/(.*?).PWM")[,2]) %>% dplyr::select(PATH, motif) print("Getting PWM data", quote = FALSE) motifs <- motifs %>% mutate(PWM = lapply(PATH, function(x) t(read.table(x, skip = 1, row.names = 1, header = TRUE, col.names = c("pos","A", "C", "G", "T"))))) PWM_list <- motifs$PWM names(PWM_list) <- motifs$motif #this function makes every PWM the same length as k or shorter by tiling across longer PWM matrices mattiles <- function(Mat, k) { if(ncol(Mat[[1]]) > k){ x <- c(1:(ncol(Mat[[1]])-(k-1)))-1 list <- lapply(x, function(x) Mat[[1]][,(1+x):(k+x)]) names(list) <- lapply(x, function(x) paste(names(Mat), "_", x + 1, sep = "")) list } else Mat } x <- 1:length(PWM_list) tiled_motifs <- unlist(lapply(x, function(x) mattiles(PWM_list[x], k)), recursive = FALSE) print("Counting motif occurances...", quote = FALSE) counts_list <- lapply(tiled_motifs, function(x) lapply(kmers, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "kmer")) print("Counting motif occurances complete.", quote = FALSE) motif_by_kmer <- dplyr::bind_rows(counts_list, .id = "motif") %>% tidyr::spread(kmer, value = value) %>% dplyr::select(motif, dplyr::everything()) motif_by_kmer <- motif_by_kmer %>% dplyr::as_tibble((. > 0) + 0) %>% dplyr::mutate(motif = .$motif) return(motif_by_kmer) } relate_Kmer_CisBPRNA <- function(k, motif_path, RBPinfo){ ##make DNAStringSets for every kmer x <- c("T", "A", "C", "G") kmers <- do.call(expand.grid, rep(list(x), k)) %>% tidyr::unite(kmer, sep = "") %>% dplyr::pull(., kmer) %>% Biostrings::DNAStringSet() names(kmers) <- do.call(expand.grid, rep(list(x), k)) %>% tidyr::unite(kmer, sep = "") %>% dplyr::pull(., kmer) #get paths to each motif pwm print("getting Motif and RBP data...", quote = FALSE) motif_paths <- list.files(path = motif_path, full.names = TRUE) motif_info <- file.info(motif_paths) motif_info <- motif_info[motif_info$size != 0, ] motifs <- motif_info %>% dplyr::as_tibble(rownames = "PATH") %>% dplyr:: mutate(motif = stringr::str_match(PATH, "motifs/(.*?).txt")[,2]) %>% dplyr::select(PATH, motif) #merge motif paths with RBP info RBP_info <- read.table(RBPinfo, header = TRUE, sep = "\t") RBP_info <- RBP_info %>% dplyr::as_tibble() %>% dplyr::select(Motif_ID, RBP_Name) %>% dplyr::filter(Motif_ID != ".") %>% dplyr::group_by(Motif_ID) %>% dplyr::summarise(RBP_name = dplyr::first(RBP_Name)) motifs <- dplyr::left_join(motifs, RBP_info, by = c("motif" = "Motif_ID")) print("Getting PWM data", quote = FALSE) motifs <- motifs %>% dplyr::mutate(PWM = lapply(PATH, function(x) t(read.table(x, header = TRUE, row.names = 1, col.names = c("pos", "A", "C", "G", "T"))))) PWM_list <- motifs$PWM names(PWM_list) <- paste(motifs$RBP_name, "_", motifs$motif, sep = "") mattiles <- function(Mat, k) { if(ncol(Mat[[1]]) > k){ x <- c(1:(ncol(Mat[[1]])-(k-1)))-1 list <- lapply(x, function(x) Mat[[1]][,(1+x):(k+x)]) names(list) <- lapply(x, function(x) paste(names(Mat), "_", x + 1, sep = "")) list } else Mat } x <- 1:length(PWM_list) tiled_motifs <- unlist(lapply(x, function(x) mattiles(PWM_list[x], k)), recursive = FALSE) print("Counting motif occurances...", quote = FALSE) counts_list <- lapply(tiled_motifs, function(x) lapply(kmers, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "kmer")) print("Counting motif occurances complete.", quote = FALSE) motif_by_kmer <- dplyr::bind_rows(counts_list, .id = "motif") %>% tidyr::spread(kmer, value = value) %>% dplyr::select(motif, dplyr::everything()) motif_by_kmer <- motif_by_kmer %>% dplyr::as_tibble((. > 0) + 0) %>% dplyr::mutate(motif = .$motif) return(motif_by_kmer) } relate_kmer_PWM <- function(k, PWM_list){ ##make DNAStringSets for every kmer x <- c("T", "A", "C", "G") kmers <- do.call(expand.grid, rep(list(x), k)) %>% tidyr::unite(kmer, sep = "") %>% dplyr::pull(., kmer) %>% Biostrings::DNAStringSet() names(kmers) <- do.call(expand.grid, rep(list(x), k)) %>% tidyr::unite(kmer, sep = "") %>% dplyr::pull(., kmer) #Check PWM_list print("Checking PWM data", quote = FALSE) if (any(as.numeric(lapply(PWM_list, nrow)) != 4)){ stop("Error: Ensure all PWM matricies have exactly 4 rows (\"A\", \"C\", \"G\" and \"T\")") } #this function makes every PWM the same length as k or shorter by tiling across longer PWM matrices mattiles <- function(Mat, k) { if(ncol(Mat[[1]]) > k){ x <- c(1:(ncol(Mat[[1]])-(k-1)))-1 list <- lapply(x, function(x) Mat[[1]][,(1+x):(k+x)]) names(list) <- lapply(x, function(x) paste(names(Mat), "_", x + 1, sep = "")) list } else Mat } x <- 1:length(PWM_list) tiled_motifs <- unlist(lapply(x, function(x) mattiles(PWM_list[x], k)), recursive = FALSE) print("Counting motif occurances...", quote = FALSE) counts_list <- lapply(tiled_motifs, function(x) lapply(kmers, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "kmer")) print("Counting motif occurances complete.", quote = FALSE) motif_by_kmer <- dplyr::bind_rows(counts_list, .id = "motif") %>% tidyr::spread(kmer, value = value) %>% dplyr::select(motif, dplyr::everything()) motif_by_kmer <- motif_by_kmer %>% dplyr::mutate_if(is.numeric, ~1 * (. > 0)) return(motif_by_kmer) } #fourmer_mm_cisbpRNA <- relate_Kmer_PWM(4, CISBPRNA_mm_PWM) #3min #fivemer_mm_cisbpRNA <- relate_Kmer_PWM(5, CISBPRNA_mm_PWM) #8min #sixmer_mm_cisbpRNA <- relate_Kmer_PWM(6, CISBPRNA_mm_PWM) #24min #sevenmer_mm_cisbpRNA <- relate_Kmer_PWM(7, CISBPRNA_mm_PWM) #60min #fourmer_hs_cisbpRNA <- relate_Kmer_PWM(4, CISBPRNA_hs_PWM) #3min #fivemer_hs_cisbpRNA <- relate_Kmer_PWM(5, CISBPRNA_hs_PWM) #9min #sixmer_hs_cisbpRNA <- relate_Kmer_PWM(6, CISBPRNA_hs_PWM) #25min #sevenmer_hs_cisbpRNA <- relate_Kmer_PWM(7, CISBPRNA_hs_PWM) #60min #fourmer_RBNS <- relate_Kmer_PWM(4, RBNS_PWM) #1min #fivemer_RBNS <- relate_Kmer_PWM(5, RBNS_PWM) #3min #sixmer_RBNS <- relate_Kmer_PWM(6, RBNS_PWM) #8min #sevenmer_RBNS <- relate_Kmer_PWM(7, RBNS_PWM) #28min #write.table(fourmer_mm_cisbpRNA, file = "mydata/KmerMotifRelations/fourmer_mm_cisbpRNA.txt") #write.table(fivemer_mm_cisbpRNA, file = "mydata/KmerMotifRelations/fivemer_mm_cisbpRNA.txt") #write.table(sixmer_mm_cisbpRNA, file = "mydata/KmerMotifRelations/sixmer_mm_cisbpRNA.txt") #write.table(sevenmer_mm_cisbpRNA, file = "mydata/KmerMotifRelations/sevenmer_mm_cisbpRNA.txt") #write.table(fourmer_hs_cisbpRNA, file = "mydata/KmerMotifRelations/fourmer_hs_cisbpRNA.txt") #write.table(fivemer_hs_cisbpRNA, file = "mydata/KmerMotifRelations/fivemer_hs_cisbpRNA.txt") #write.table(sixmer_hs_cisbpRNA, file = "mydata/KmerMotifRelations/sixmer_hs_cisbpRNA.txt") #write.table(sevenmer_hs_cisbpRNA, file = "mydata/KmerMotifRelations/sevenmer_hs_cisbpRNA.txt") #write.table(fourmer_RBNS, file = "mydata/KmerMotifRelations/fourmer_RBNS.txt") #write.table(fivemer_RBNS, file = "mydata/KmerMotifRelations/fivemer_RBNS.txt") #write.table(sixmer_RBNS, file = "mydata/KmerMotifRelations/sixmer_RBNS.txt") #write.table(sevenmer_RBNS, file = "mydata/KmerMotifRelations/sevenmer_RBNS.txt") #fourmer_mm_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/fourmer_mm_cisbpRNA.txt") #fivemer_mm_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/fivemer_mm_cisbpRNA.txt") #sixmer_mm_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/sixmer_mm_cisbpRNA.txt") #sevenmer_mm_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/sevenmer_mm_cisbpRNA.txt") #fourmer_hs_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/fourmer_hs_cisbpRNA.txt") #fivemer_hs_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/fivemer_hs_cisbpRNA.txt") #sixmer_hs_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/sixmer_hs_cisbpRNA.txt") #sevenmer_hs_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/sevenmer_hs_cisbpRNA.txt") #fourmer_RBNS <- read.table(file = "mydata/KmerMotifRelations/fourmer_RBNS.txt") #fivemer_RBNS <- read.table(file = "mydata/KmerMotifRelations/fivemer_RBNS.txt") #sixmer_RBNS <- read.table(file = "mydata/KmerMotifRelations/sixmer_RBNS.txt") #sevenmer_RBNS <- read.table(file = "mydata/KmerMotifRelations/sevenmer_RBNS.txt") #usethis::use_data(fourmer_mm_cisbpRNA, fivemer_mm_cisbpRNA, sixmer_mm_cisbpRNA, sevenmer_mm_cisbpRNA, fourmer_hs_cisbpRNA, fivemer_hs_cisbpRNA, sixmer_hs_cisbpRNA, sevenmer_hs_cisbpRNA, fourmer_RBNS, fivemer_RBNS, sixmer_RBNS, sevenmer_RBNS, internal = TRUE, overwrite = TRUE)
##this should be faster than the full motif search by quite a lot. estimate_motif_from_kmer <- function(kmer_list, motif_set, custom_motif_by_kmer_matrix = NULL){ #infer k from kmer_list list. if (length(unique(nchar(as.character(kmer_list)))) != 1){ warning("kmers in kmer list are not the same length using the shortest kmer as k") k = min(unique(nchar(as.character(kmer_list)))) } else k = unique(nchar(as.character(kmer_list))) #check motif_set input if (motif_set != "CISBPRNA" & motif_set != "RBNS" & motif_set != "custom"){ stop(paste("motif_set must be either \"CISBPRNA\", \"RBNS\" or \"custom\"", quote = FALSE)) } #get appropriate motif_by_kmer if (k == 4 & motif_set == "CISBPRNA"){ motif_by_kmer <- read.table("mydata/KmerMotifRelations/fourmer_cisbpRNA.txt") } else if (k == 4 & motif_set == "RBNS"){ motif_by_kmer <- read.table("mydata/KmerMotifRelations/fourmer_RBNS.txt") #motif_by_kmer <- RNAreachr:::fourmer_RBNS } else if (k == 5 & motif_set == "CISBPRNA"){ motif_by_kmer <- read.table("mydata/KmerMotifRelations/fivemer_cisbpRNA.txt") } else if (k == 5 & motif_set == "RBNS"){ motif_by_kmer <- read.table("mydata/KmerMotifRelations/fivemer_RBNS.txt") } else if (k == 6 & motif_set == "CISBPRNA"){ motif_by_kmer <- read.table("mydata/KmerMotifRelations/sixmer_cisbpRNA.txt") } else if (k == 6 & motif_set == "RBNS"){ motif_by_kmer <- read.table("mydata/KmerMotifRelations/sixmer_RBNS.txt") #motif_by_kmer <- RNAreachr:::sixmer_RBNS } else if (k == 7 & motif_set == "CISBPRNA"){ motif_by_kmer <- read.table("mydata/KmerMotifRelations/sevenmer_cisbpRNA.txt") } else if (k == 7 & motif_set == "RBNS"){ motif_by_kmer <- read.table("mydata/KmerMotifRelations/sevenmer_RBNS.txt") } if (motif_set == "custom" & is.null(custom_motif_by_kmer_matrix) == TRUE){ stop("A custom motif_by_kmer_matrix is required for estimating custom motif occurance from kmers \n use ouput from relate_kmer_PWM()") } else if (motif_set == "custom" & is.null(custom_motif_by_kmer_matrix) == FALSE){ motif_by_kmer <- custom_motif_by_kmer_matrix } ##calculate probablility of matching... x <- c(1:nrow(motif_by_kmer)) motif_estimate <- motif_by_kmer %>% dplyr::as_tibble() %>% dplyr:: mutate(input_kmer = rowSums(dplyr::select(., kmer_list)), all_kmer = rowSums(dplyr::select(., -motif)), tile = as.character(lapply(x, function(x) strsplit(as.character(motif_by_kmer$motif[x]), "_(?=[^_]+$)", perl=TRUE)[[1]][2])), motif = as.character(lapply(x, function(x) strsplit(as.character(motif_by_kmer$motif[x]), "_(?=[^_]+$)", perl=TRUE)[[1]][1]))) %>% dplyr::group_by(motif) %>% dplyr::summarize(input_kmer = sum(all_kmer), all_kmer = sum(input_kmer)) %>% dplyr::mutate(p_val = phyper(input_kmer-1, length(kmer_list), (4^k)-length(kmer_list), all_kmer, lower.tail = FALSE), p_adj = p.adjust(p_val, method = "BH", n = nrow(.))) %>% dplyr::arrange(p_adj) return(motif_estimate) } FMR1_RBNS_estimate <- estimate_Motif_from_Kmer(pos_FMR1_kmer, "RBNS") #1sec FMR1_cisbpRNA_estimate <- estimate_Motif_from_Kmer(pos_FMR1_kmer, "CISBPRNA") #1sec
FMR1_RBNS_estimate <- FMR1_RBNS_estimate %>% arrange(p_adj) %>% mutate(est_rank = c(1:nrow(.))) FMR1_cisbpRNA_estimate <- FMR1_cisbpRNA_estimate %>% arrange(p_adj) %>% mutate(est_rank = c(1:nrow(.))) ##compare to direct motif scanning output: ##ranking by pvalue and log2FC (only for the motif scan) FMR1_RBNS_top <- FMR1_RBNS %>% ungroup() %>% filter(log2FC > 0) %>% arrange(desc(log2FC), p_adj) %>% mutate(rank = c(1:nrow(.))) FMR1_RBNS_bottom <- FMR1_RBNS %>% ungroup() %>% filter(log2FC < 0) %>% arrange(desc(log2FC), desc(p_adj)) %>% mutate(rank = c((nrow(FMR1_RBNS_top)+1):(nrow(.)+nrow(FMR1_RBNS_top)))) FMR1_RBNS <- rbind(FMR1_RBNS_top, FMR1_RBNS_bottom) FMR1_motif_top <- FMR1_motif %>% ungroup() %>% filter(log2FC > 0) %>% arrange(desc(log2FC), p_adj) %>% mutate(rank = c(1:nrow(.))) FMR1_motif_bottom <- FMR1_motif %>% ungroup() %>% filter(log2FC < 0) %>% arrange(desc(log2FC), desc(p_adj)) %>% mutate(rank = c((nrow(FMR1_motif_top)+1):(nrow(.)+nrow(FMR1_motif_top)))) FMR1_motif <- rbind(FMR1_motif_top, FMR1_motif_bottom) full_join(FMR1_RBNS, FMR1_RBNS_estimate, by = "motif") %>% ggplot(aes(x = rank, y = est_rank)) + geom_point() + geom_smooth(aes(x = rank, y = est_rank), method = lm, se = FALSE, inherit.aes = FALSE) + theme_cowplot() + ggpubr::stat_cor(method = "spearman") FMR1_cisbpRNA_estimate %>% separate(motif, into = c("RBP_name", "motif", "extra"), sep = "_") %>% unite(motif, extra, col = "motif") %>% full_join(., FMR1_motif, by = "motif") %>% ggplot(aes(x = rank, y = est_rank)) + geom_point() + geom_smooth(aes(x = rank, y = est_rank), method = lm, se = FALSE, inherit.aes = FALSE) + theme_cowplot() + ggpubr::stat_cor(method = "spearman") ##directly comparing adjusted p values FMR1_cisbpRNA_estimate %>% separate(motif, into = c("RBP", "motif"), extra = "merge") %>% dplyr::select(RBP, motif, p_val, p_adj) %>% rename(p_val = "estimated_p_val", p_adj = "estimated_p_adj") %>% full_join(., FMR1_motif) %>% ggplot(aes(x = -log(estimated_p_adj), y = -log(p_adj))) + geom_point() + geom_smooth(aes(x = -log(estimated_p_adj), y = -log(p_adj)), method = lm, se = FALSE, inherit.aes = FALSE) + theme_cowplot() + ggpubr::stat_cor(method = "spearman") FMR1_RBNS_estimate %>% dplyr::select(motif, p_val, p_adj) %>% rename(p_val = "estimated_p_val", p_adj = "estimated_p_adj") %>% full_join(., FMR1_RBNS) %>% ggplot(aes(x = -log(estimated_p_adj), y = -log(p_adj))) + geom_point() + geom_smooth(aes(x = -log(estimated_p_adj), y = -log(p_adj)), method = lm, se = FALSE, inherit.aes = FALSE) + theme_cowplot() + ggpubr::stat_cor(method = "spearman") ##phyper overlap est_hits <- FMR1_RBNS_estimate %>% filter(p_adj < 0.05) %>% pull(., motif) length(est_hits) scan_hits <- FMR1_RBNS %>% filter(p_adj < 0.05, log2FC > 0) %>% pull(., motif) length(scan_hits) est_hits %in% scan_hits %>% sum() phyper(sum(est_hits %in% scan_hits)-1, length(scan_hits), nrow(FMR1_RBNS)-length(scan_hits), length(est_hits), lower.tail = FALSE) est_hits <- FMR1_cisbpRNA_estimate %>% separate(motif, into = c("RBP_name", "motif"), extra = "merge") %>% filter(p_adj < 0.05) %>% pull(., motif) length(est_hits) scan_hits <- FMR1_motif %>% filter(p_adj < 0.05, log2FC > 0) %>% pull(., motif) length(scan_hits) est_hits %in% scan_hits %>% sum() phyper(sum(est_hits %in% scan_hits-1), length(scan_hits), nrow(FMR1_motif)-length(scan_hits), length(est_hits), lower.tail = FALSE)
consensusMatrix(DNAStringSet(pos_FMR1_kmer), as.prob = TRUE) %>% .[1:4,] %>% seqLogo::seqLogo() d <- adist(as.character(pos_FMR1_kmer)) #this is Levenshtein distance rownames(d) <- as.character(pos_FMR1_kmer) hc <- hclust(as.dist(d)) plot(hc) rect.hclust(hc, k=3, border = "red") df <- data.frame(kmer = as.character(pos_FMR1_kmer),group = cutree(hc,k=3)) %>% as_tibble() consensusMatrix(DNAStringSet(pull(filter(df, group == 1), kmer)), as.prob = TRUE) %>% .[1:4,] %>% seqLogo::seqLogo() consensusMatrix(DNAStringSet(pull(filter(df, group == 2), kmer)), as.prob = TRUE) %>% .[1:4,] %>% seqLogo::seqLogo() consensusMatrix(DNAStringSet(pull(filter(df, group == 3), kmer)), as.prob = TRUE) %>% .[1:4,] %>% seqLogo::seqLogo() pairwiseAlignment(DNAStringSet(pull(filter(df, group == 1), kmer)), subject = as.character(pull(filter(df, group == 1), kmer))[1], type = "overlap") %>% Views() %>% consensusMatrix(as.prob = TRUE, shift = start(.)-1) %>% .[1:4,] %>% seqLogo::seqLogo() pairwiseAlignment(DNAStringSet(pull(filter(df, group == 2), kmer)), subject = as.character(pull(filter(df, group == 2), kmer))[1], type = "overlap") %>% Views() %>% consensusMatrix(as.prob = TRUE, shift = start(.)-1) %>% .[1:4,] %>% seqLogo::seqLogo() pairwiseAlignment(DNAStringSet(pull(filter(df, group == 3), kmer)), subject = as.character(pull(filter(df, group == 3), kmer))[1], type = "overlap") %>% Views() %>% consensusMatrix(as.prob = TRUE, shift = start(.)-1) %>% .[1:4,] %>% seqLogo::seqLogo() pairwiseAlignment(DNAStringSet(pull(filter(df, group == 2), kmer))[1], DNAStringSet(pull(filter(df, group == 2), kmer))[2], type = "overlap") %>% consensusMatrix(as.prob = TRUE) %>% .[1:4,2:6] %>% seqLogo::seqLogo() ##cm is a consensus matrix from aligned() (with -) cm <- pairwiseAlignment(DNAStringSet(pull(filter(df, group == 3), kmer)), subject = as.character(pull(filter(df, group == 3), kmer))[1], type = "overlap") %>% aligned() %>% consensusMatrix() cm %>% t() %>% as_tibble(rownames = "pos") %>% dplyr::select(pos, A, C, G, T, '-') %>% mutate(N = `-`, N = N/4) %>% dplyr::select(pos,A,C,G,T,N) %>% mutate(A = A+N, C = C+N, G = G+N, T = T+N ) %>% dplyr::select(-pos, -N) %>% mutate(rowsum = rowSums(dplyr::select(., A,C,G,T)), A = A/rowsum, C = C/rowsum, G = G/rowsum, T = T/rowsum) %>% dplyr::select(-rowsum) %>% t() %>% as.matrix() %>% seqLogo::seqLogo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.