knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(Biostrings)
library(cowplot)
#MT_FMR1 <- readDNAStringSet("mydata/FMR1dat/SD3deltaLRGenes.longest3UTR.fa")
#MT_FMR1_ctrl <- readDNAStringSet("mydata/FMR1dat/UnchangeddeltaLRGenes.longest3UTR.fa")

#FMR1_genes <- substr(names(MT_FMR1), 1, 18)
#FMR1_genes_ctrl <- substr(names(MT_FMR1_ctrl), 1, 18)

#FMR1_UTR3_tx <- gene2Tx(longest_mm, FMR1_genes, "UTR3") #1s
#FMR1_UTR3_tx_ctrl <- gene2Tx(longest_mm, FMR1_genes_ctrl, "UTR3") #1s

#write_Sequence(mm_f_gff, FMR1_UTR3_tx, "UTR3", "mydata/FMR1dat/longest_3UTR_FMR1", "fa") 
#write_Sequence(mm_f_gff, FMR1_UTR3_tx_ctrl, "UTR3", "mydata/FMR1dat/longest_3UTR_FMR1_ctrl", "fa")

longest_3UTR_FMR1 <- readDNAStringSet("mydata/FMR1dat/longest_3UTR_FMR1.fa") #30s
longest_3UTR_FMR1_ctrl <- readDNAStringSet("mydata/FMR1dat/longest_3UTR_FMR1_ctrl.fa") #30s
#This function simply returns the full counts of each kmer for each gene in a set
#access to raw data at a gene level.

kmer_by_gene <- function(DNAStringSet, k){

   print("counting kmers...", quote = FALSE)
   kmer_counts <- Biostrings::oligonucleotideFrequency(DNAStringSet, width = k) %>% 
     dplyr::as_tibble() %>% 
     dplyr::mutate(gene = names(DNAStringSet)) %>% 
     dplyr::select(gene, dplyr::everything())

   print("counting complete.", quote= FALSE)

   return(kmer_counts)
}


#This function compares kmer content between two DNAStringSets and returns a table with statistics for frequency of each kmer
#it doesnt take terribly long and checks that case sequences are excluded from control sequences.
#we could probably add options for pvalue correction

kmer_compare <- function(caseDNAStringSet, ctrlDNAStringSet, k){

   if (any(names(caseDNAStringSet) %in% names(ctrlDNAStringSet))){
     warning("some sequences in case set are also in the control set. This is not recommended.")
   }

   print("counting kmers...", quote = FALSE)
   case_kmer <- Biostrings::oligonucleotideFrequency(caseDNAStringSet, width = k) %>% 
     colSums() %>% 
     data.frame(kmer = names(.), case = .) %>% 
     dplyr::as_tibble()
   ctrl_kmer <- Biostrings::oligonucleotideFrequency(ctrlDNAStringSet, width = k) %>% 
     colSums() %>%
     data.frame(kmer = names(.), ctrl = .) %>% 
     dplyr::as_tibble()
   print("counting complete.", quote= FALSE)

  #compare kmers between case and ctrl takes ~30s

  fisher <- function(a, b, c, d){
  mat <- matrix(c(a, b, c, d), nr = 2)
  fisher.test(mat, alternative = "two.sided")$p.value
  } 

  print("calculating kmer statistics...", print = FALSE)

  kmer_stats <- dplyr::left_join(ctrl_kmer, case_kmer) %>% 
    na.omit() %>% 
    dplyr::mutate(ctrl_freq = ctrl / sum(ctrl),
           case_freq = case / sum(case),
           log2FC = log2(case_freq/ctrl_freq),
           ctrl_tot = sum(ctrl)-ctrl,
           case_tot = sum(case)-case) %>% 
    dplyr::rowwise() %>% 
   dplyr:: mutate(pval = fisher(case, ctrl, case_tot, ctrl_tot),
           p_adj = p.adjust(pval, method = "BH", 4^k)) %>% 
    dplyr::arrange(p_adj)

  print("calculations complete.", quote = FALSE)

  return(kmer_stats)
}

#kmer_by_gene(longest_3UTR_FMR1, 6)
#FMR1_kmer <- kmer_compare(longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl, 6)
#saveRDS(FMR1_kmer, "mydata/FMR1dat/kmer_FMR1_compare.txt")
FMR1_kmer <- readRDS("mydata/FMR1dat/kmer_FMR1_compare.txt")
FMR1_kmer %>% arrange(p_adj)

##for later
pos_FMR1_kmer <- FMR1_kmer %>% filter(p_adj < 0.05, log2FC > 0) %>% pull(., kmer)
#this returns length of each sequence in a nice tibble

get_length <- function(DNAStringSet){
  dplyr::as_tibble(data.frame(gene = names(DNAStringSet),
                              length = Biostrings::width(DNAStringSet)))
}


#This compares sequence length between case and control DNAStringSets
#I am really uncertain of what the output should be...
#The table right now includes means, wilcox.test results and cliffs delta effect size metrics...
#we could add options as to what test to perform rather than just wilcox.test
#feedback is welcome

length_compare <- function(caseDNAStringSet, ctrlDNAStringSet){

  if (any(names(caseDNAStringSet) %in% names(ctrlDNAStringSet))){
     warning("some sequences in case set are also in the control set. This is not recommended.")
  }

  wilcox.p <- wilcox.test(Biostrings::width(caseDNAStringSet), Biostrings::width(ctrlDNAStringSet))$p.value
  mean_case <- mean(Biostrings::width(caseDNAStringSet))
  mean_ctrl <- mean(Biostrings::width(ctrlDNAStringSet))
  mean_FC <- mean_case/mean_ctrl
  CliffDelta <- effsize::cliff.delta(Biostrings::width(caseDNAStringSet), Biostrings::width(ctrlDNAStringSet))$estimate
  lowerCD <- effsize::cliff.delta(Biostrings::width(caseDNAStringSet), Biostrings::width(ctrlDNAStringSet))$conf.int[1]
  upperCD <- effsize::cliff.delta(Biostrings::width(caseDNAStringSet), Biostrings::width(ctrlDNAStringSet))$conf.int[2]

  data.frame(wilcox.p, mean_case, mean_ctrl, mean_FC, CliffDelta, lowerCD, upperCD)

}

get_length(longest_3UTR_FMR1)
length_compare(longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl)
#this returns GC of each sequence in a nice tibble

get_GC <- function(DNAStringSet){
  dplyr::as_tibble(data.frame(gene = names(DNAStringSet),
                       GC = Biostrings::letterFrequency(DNAStringSet, "GC")/ BioStrings::width(DNAStringSet)))
}

#This is similar to the last function but with GC content. I again am uncertain of the output.

GC_compare <- function(caseDNAStringSet, ctrlDNAStringSet){

  if (any(names(caseDNAStringSet) %in% names(ctrlDNAStringSet))){
  warning("some sequences in case set are also in the control set. This is not recommended.")
  }

  GC_case <- Biostrings::letterFrequency(caseDNAStringSet, "GC") / Biostrings::width(caseDNAStringSet)
  GC_ctrl <- Biostrings::letterFrequency(ctrlDNAStringSet, "GC") / Biostrings::width(ctrlDNAStringSet)
  wilcox.p <- wilcox.test(GC_case, GC_ctrl)$p.value
  mean_case <- mean(GC_case)
  mean_ctrl <- mean(GC_ctrl)
  mean_FC <- mean_case/mean_ctrl
  CliffDelta <- effsize::cliff.delta(GC_case, GC_ctrl)$estimate
  lowerCD <- effsize::cliff.delta(GC_case, GC_ctrl)$conf.int[1]
  upperCD <- effsize::cliff.delta(GC_case, GC_ctrl)$conf.int[2]

  data.frame(wilcox.p, mean_case, mean_ctrl, mean_FC, CliffDelta, lowerCD, upperCD)
}

get_GC(longest_3UTR_FMR1)
GC_compare(longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl)
#This function simply returns every motif counts for each gene
#begin with folder containing PWM for motifs of intrest (downloaded from CisBPRNA called "pwms_all_motifs")
#also need RBP info for the motifs (dowloaded from CisBPRNA called "RBP_Information_all_motifs")
cisBPRNA_by_gene <- function(motif_path, RBPinfo, DNAStringset){

  #get paths to each motif pwm
  print("getting Motif and RBP data...", quote = FALSE)

  motif_paths <- list.files(path = motif_path, full.names = TRUE)
  motif_info <- file.info(motif_paths)
  motif_info <- motif_info[motif_info$size != 0, ]
  motifs <- motif_info %>% 
    dplyr::as_tibble(rownames = "PATH") %>% 
    dplyr::mutate(motif = stringr::str_match(PATH, "motifs/(.*?).txt")[,2]) %>% 
    dplyr::select(PATH, motif)

  #merge motif paths with RBP info

  RBP_info <- read.table(RBPinfo, header = TRUE, sep = "\t")
  RBP_info <- RBP_info %>% 
    dplyr::as_tibble() %>% 
    dplyr::select(Motif_ID, RBP_Name) %>% 
    dplyr::filter(Motif_ID != ".") %>% 
    dplyr::group_by(Motif_ID) %>%
    dplyr::summarise(RBP_name = dplyr::first(RBP_Name))

  motifs <- dplyr::left_join(motifs, RBP_info, by = c("motif" = "Motif_ID"))

  print("Getting PWM data", quote = FALSE)

  motifs <- motifs %>%
    dplyr::mutate(PWM = lapply(PATH, function(x) t(read.table(x, header = TRUE, row.names = 1, col.names = c("pos", "A", "C", "G", "T")))))

  print("Counting motif occurances...", quote = FALSE)

  counts_list <- lapply(motifs$PWM, function(x) lapply(DNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "gene"))
  names(counts_list) <- motifs$motif

  print("Counting motif occurances complete.", quote = FALSE)

  motif_by_gene <- dplyr::bind_rows(counts_list, .id = "motif") %>% 
    tidyr::spread(gene, value = value) %>% 
    dplyr::left_join(., RBP_info, by = c("motif" = "Motif_ID")) %>% 
    dplyr::select(motif, RBP_name, dplyr::everything())            

  return(motif_by_gene)
}


#This function compare the freqency of motifs in a case and control DNAStringSet
#begin with folder containing PWM for motifs of intrest (downloaded from CisBPRNA called "pwms_all_motifs")
#also need RBP info for the motifs (dowloaded from CisBPRNA called "RBP_Information_all_motifs")

cisBPRNA_compare <- function(motif_path, RBPinfo, caseDNAStringset, ctrlDNAStringSet){

  #get paths to each motif pwm

  print("getting Motif and RBP data...", quote = FALSE)

  motif_paths <- list.files(path = motif_path, full.names = TRUE)
  motif_info <- file.info(motif_paths)
  motif_info <- motif_info[motif_info$size != 0, ]
  motifs <- motif_info %>% 
    dplyr::as_tibble(rownames = "PATH") %>% 
    dplyr::mutate(motif = stringr::str_match(PATH, "motifs/(.*?).txt")[,2]) %>% 
    dplyr::select(PATH, motif)

  #merge motif paths with RBP info

  RBP_info <- read.table(RBPinfo, header = TRUE, sep = "\t")
  RBP_info <- RBP_info %>% 
    dplyr::as_tibble() %>% 
    dplyr::select(Motif_ID, RBP_Name) %>% 
    dplyr::filter(Motif_ID != ".") %>% 
    dplyr::group_by(Motif_ID) %>%
    dplyr::summarise(RBP_name = dplyr::first(RBP_Name))

  motifs <- dplyr::left_join(motifs, RBP_info, by = c("motif" = "Motif_ID"))

  fisher <- function(a, b, c, d){
  mat <- matrix(c(a, b, c, d), nr = 2)
  fisher.test(mat, alternative = "two.sided")$p.value
  }

  print("Counting motif occurances and calculating statistics...", quote = FALSE)

  motifs <- motifs %>%
    mutate(PWM = lapply(PATH, function(x) t(read.table(x, header = TRUE, row.names = 1, col.names = c("pos", "A", "C", "G", "T")))), 
           case = suppressWarnings(unlist(lapply(PWM, function(x) lapply(caseDNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), 
           ctrl = suppressWarnings(unlist(lapply(PWM, function(x) lapply(ctrlDNAStringSet, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), 
           case_freq = case / sum(Biostrings::width(caseDNAStringset)), 
           ctrl_freq = ctrl / sum(Biostrings::width(ctrlDNAStringSet)), 
           log2FC = log2(case_freq/ctrl_freq),
           case_tot = sum(case)-case,
           ctrl_tot = sum(ctrl)-ctrl) %>% 
    dplyr::rowwise() %>% 
    dplyr::mutate(pval = fisher(case, ctrl, case_tot, ctrl_tot),
             p_adj = p.adjust(pval, method = "BH", nrow(motifs))) %>% 
    dplyr::select(RBP_name, motif, case, ctrl, case_freq, ctrl_freq, log2FC, case_tot, ctrl_tot, pval, p_adj)

  return(motifs)
}

#cisBPRNA_by_gene("mydata/CisBPRNAdat/mm/pwms_all_motifs", "mydata/CisBPRNAdat/mm/RBP_Information_all_motifs.txt", longest_3UTR_FMR1)

#FMR1_motif <- cisBPRNA_compare("mydata/CisBPRNAdat/mm/pwms_all_motifs", "mydata/CisBPRNAdat/mm/RBP_Information_all_motifs.txt", longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl)
#saveRDS(FMR1_motif, "mydata/FMR1dat/cisbpMotif_FMR1_compare.txt")
FMR1_motif <- readRDS("mydata/FMR1dat/cisbpMotif_FMR1_compare.txt")
FMR1_motif %>% arrange(p_adj)
#This function simply returns every motif counts for each gene
#begin with folder containing PWM for motifs of intrest (from D.Dominguez called "RBNS_PWMs")

RBNS_by_gene <- function(motif_path, DNAStringset){

  #get paths to each motif pwm

  print("getting Motif and RBP data...", quote = FALSE)

  motif_paths <- list.files(path = motif_path, full.names = TRUE)
  motif_info <- file.info(path = motif_paths)
  motifs <- motif_info %>% 
    dplyr::as_tibble(rownames = "PATH") %>% 
    dplyr::mutate(motif = stringr::str_match(PATH, "PWMs/(.*?).PWM")[,2]) %>% 
    dplyr::select(PATH, motif)

  print("Getting PWM data", quote = FALSE)

  motifs <- motifs %>%
    dplyr::mutate(PWM = lapply(PATH, function(x) t(read.table(x, skip = 1, row.names = 1, header = TRUE, col.names = c("pos","A", "C", "G", "T")))))

  print("Counting motif occurances...", quote = FALSE)

  counts_list <- lapply(motifs$PWM, function(x) lapply(DNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "gene"))
  names(counts_list) <- motifs$motif

  print("Counting motif occurance complete.", quote = FALSE)

  motif_by_gene <- dplyr::bind_rows(counts_list, .id = "motif") %>% tidyr::spread(gene, value = value) 

  return(motif_by_gene)
}


#This function compare the freqency of motifs in a case and control DNAStringSet
#begin with folder containing PWM for motifs of intrest (from D.Dominguez called "RBNS_PWMs")

RBNS_compare <- function(motif_path, caseDNAStringset, ctrlDNAStringSet){

  #get paths to each motif pwm

  print("getting Motif and RBP data...", quote = FALSE)

  motif_paths <- list.files(path = motif_path, full.names = TRUE)
  motif_info <- file.info(path = motif_paths)
  motifs <- motif_info %>% 
    dplyr::as_tibble(rownames = "PATH") %>% 
    dplyr::mutate(motif = stringr::str_match(PATH, "PWMs/(.*?).PWM")[,2]) %>% 
    dplyr::select(PATH, motif)

  fisher <- function(a, b, c, d){
  mat <- matrix(c(a, b, c, d), nr = 2)
  fisher.test(mat, alternative = "two.sided")$p.value
  }

  print("Counting motif occurances and calculating statistics...", quote = FALSE)


  motifs <- motifs %>%
    mutate(PWM = lapply(PATH, function(x) t(read.table(x, skip = 1, row.names = 1, header = TRUE, col.names = c("pos","A", "C", "G", "T")))),
           case = suppressWarnings(unlist(lapply(PWM, function(x) lapply(caseDNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), 
           ctrl = suppressWarnings(unlist(lapply(PWM, function(x) lapply(ctrlDNAStringSet, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), 
           case_freq = case / sum(Biostrings::width(caseDNAStringset)), 
           ctrl_freq = ctrl / sum(Biostrings::width(ctrlDNAStringSet)), 
           log2FC = log2(case_freq/ctrl_freq),
           case_tot = sum(case)-case,
           ctrl_tot = sum(ctrl)-ctrl) %>% 
    dplyr::rowwise() %>% 
    dplyr::mutate(pval = fisher(case, ctrl, case_tot, ctrl_tot),
             p_adj = p.adjust(pval, method = "BH", nrow(motifs))) %>% 
    dplyr::select(motif, case, ctrl, case_freq, ctrl_freq, log2FC, case_tot, ctrl_tot, pval, p_adj)

  return(motifs)

}



#RBNS_by_gene("mydata/RBNSdat/RBNS_PWMs", longest_3UTR_FMR1)

#FMR1_RBNS <- RBNS_compare("mydata/RBNSdat/RBNS_PWMs", longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl)
#saveRDS(FMR1_RBNS, "mydata/FMR1dat/RBNSMotif_FMR1_compare.txt")
FMR1_RBNS <- readRDS("mydata/FMR1dat/RBNSMotif_FMR1_compare.txt")
FMR1_RBNS %>% arrange(p_adj)
#custom PWM lists should contain a list of named matrices where the name is unique, and the matrix columns represent positions, and the rows are the probability of each base "A", "C", "G", and "T".

Motif_by_gene <- function(PWM_list, DNAStringset){

  print("Checking PWM data", quote = FALSE)

  if (typeof(PWM_list) != "list"){
    stop("PWM_list must be a list")
  }

  if (any(as.numeric(lapply(PWM_list, nrow)) != 4)){
    stop("Error: Ensure all PWM matricies have exactly 4 rows (\"A\", \"C\", \"G\" and \"T\")")
  }
  motifs <- names(PWM_list) %>% dplyr::as_tibble() %>% dplyr::rename("motif" = value) %>% dplyr::mutate(PWM = unname(PWM_list))

  print("Counting motif occurances...", quote = FALSE)

  counts_list <- lapply(motifs$PWM, function(x) lapply(DNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "gene"))
  names(counts_list) <- motifs$motif

  print("Counting motif occurance complete.", quote = FALSE)

  motif_by_gene <- dplyr::bind_rows(counts_list, .id = "motif") %>% tidyr::spread(gene, value = value) 

  return(motif_by_gene)
}

Motif_compare <- function(PWM_list, caseDNAStringset, ctrlDNAStringSet){

  print("Checking PWM data", quote = FALSE)

  if (typeof(PWM_list) != "list"){
    stop("PWM_list must be a list")
  }

  if (any(as.numeric(lapply(PWM_list, nrow)) != 4)){
    stop("Error: Ensure all PWM matricies have exactly 4 rows (\"A\", \"C\", \"G\" and \"T\")")
  }
  motifs <- names(PWM_list) %>% dplyr::as_tibble() %>% dplyr::rename("motif" = value) %>% dplyr::mutate(PWM = unname(PWM_list))

  fisher <- function(a, b, c, d){
  mat <- matrix(c(a, b, c, d), nr = 2)
  fisher.test(mat, alternative = "two.sided")$p.value
  }

  print("Counting motif occurances and calculating statistics...", quote = FALSE)


  motifs <- motifs %>%
    mutate(case = suppressWarnings(unlist(lapply(PWM, function(x) lapply(caseDNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), 
           ctrl = suppressWarnings(unlist(lapply(PWM, function(x) lapply(ctrlDNAStringSet, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))), 
           case_freq = case / sum(Biostrings::width(caseDNAStringset)), 
           ctrl_freq = ctrl / sum(Biostrings::width(ctrlDNAStringSet)), 
           log2FC = log2(case_freq/ctrl_freq),
           case_tot = sum(case)-case,
           ctrl_tot = sum(ctrl)-ctrl) %>% 
    dplyr::rowwise() %>% 
    dplyr::mutate(pval = fisher(case, ctrl, case_tot, ctrl_tot),
             p_adj = p.adjust(pval, method = "BH", nrow(motifs))) %>% 
    dplyr::select(motif, case, ctrl, case_freq, ctrl_freq, log2FC, case_tot, ctrl_tot, pval, p_adj) %>% 
    dplyr::arrange(p_adj)

  return(motifs)

  print("All Finished.", quote = FALSE)
}

custom_PWM_list <- readRDS("mydata/example_custom_PWM_list")
custom_PWM_by_gene(custom_PWM_list, longest_3UTR_FMR1)
custom_PWM_compare(custom_PWM_list, longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl)

RBNS_PWM <- FeatureReachR::RBNS_PWM
CISBPRNA_mm_PWM <- FeatureReachR::CISBPRNA_PWM
cisbp_new_res <- custom_PWM_compare(CISBPRNA_PWM, longest_3UTR_FMR1, longest_3UTR_FMR1_ctrl)
##This function will create the logical matrix needed to estimate RBP motif occurance from kmers
##We will supply 4-7 mers related to both CisBPRNA and RBNS motifs
##These should be used unless a k outside that range is desired
##This will create logical matrices for those odd cases.

relate_Kmer_RBNS <- function(k, motif_path){

    ##make DNAStringSets for every kmer
  x <- c("T", "A", "C", "G")

  kmers <- do.call(expand.grid, rep(list(x), k)) %>% 
    tidyr::unite(kmer, sep = "") %>% 
    dplyr::pull(., kmer) %>% 
    Biostrings::DNAStringSet()

  names(kmers) <- do.call(expand.grid, rep(list(x), k)) %>% 
    tidyr::unite(kmer, sep = "") %>% 
    dplyr::pull(., kmer)

  #get paths to each motif pwm
  print("getting Motif and RBP data...", quote = FALSE)

  motif_paths <- list.files(path = motif_path, full.names = TRUE)
  motif_info <- file.info(path = motif_paths)
  motifs <- motif_info %>% 
    dplyr::as_tibble(rownames = "PATH") %>% 
    dplyr::mutate(motif = stringr::str_match(PATH, "PWMs/(.*?).PWM")[,2]) %>% 
    dplyr::select(PATH, motif)

  print("Getting PWM data", quote = FALSE)

  motifs <- motifs %>%
    mutate(PWM = lapply(PATH, function(x) t(read.table(x, skip = 1, row.names = 1, header = TRUE, col.names = c("pos","A", "C", "G", "T")))))


  PWM_list <- motifs$PWM
  names(PWM_list) <- motifs$motif

  #this function makes every PWM the same length as k or shorter by tiling across longer PWM matrices
  mattiles <- function(Mat, k) {

    if(ncol(Mat[[1]]) > k){  
      x <- c(1:(ncol(Mat[[1]])-(k-1)))-1
      list <- lapply(x, function(x) Mat[[1]][,(1+x):(k+x)])
      names(list) <- lapply(x, function(x) paste(names(Mat), "_", x + 1, sep = ""))
      list
    }
    else
      Mat
  }

  x <- 1:length(PWM_list)
  tiled_motifs <- unlist(lapply(x, function(x) mattiles(PWM_list[x], k)), recursive = FALSE)


  print("Counting motif occurances...", quote = FALSE)

  counts_list <- lapply(tiled_motifs, function(x) 
    lapply(kmers, function(y) 
      Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "kmer"))

  print("Counting motif occurances complete.", quote = FALSE)

  motif_by_kmer <- dplyr::bind_rows(counts_list, .id = "motif") %>% 
    tidyr::spread(kmer, value = value) %>% 
    dplyr::select(motif, dplyr::everything())            

  motif_by_kmer <- motif_by_kmer %>% dplyr::as_tibble((. > 0) + 0) %>% dplyr::mutate(motif = .$motif)
  return(motif_by_kmer)
}

relate_Kmer_CisBPRNA <- function(k, motif_path, RBPinfo){

    ##make DNAStringSets for every kmer
  x <- c("T", "A", "C", "G")

  kmers <- do.call(expand.grid, rep(list(x), k)) %>% 
    tidyr::unite(kmer, sep = "") %>% 
    dplyr::pull(., kmer) %>% 
    Biostrings::DNAStringSet()

  names(kmers) <- do.call(expand.grid, rep(list(x), k)) %>% 
    tidyr::unite(kmer, sep = "") %>% 
    dplyr::pull(., kmer)

  #get paths to each motif pwm
  print("getting Motif and RBP data...", quote = FALSE)

  motif_paths <- list.files(path = motif_path, full.names = TRUE)
  motif_info <- file.info(motif_paths)
  motif_info <- motif_info[motif_info$size != 0, ]
  motifs <- motif_info %>% 
    dplyr::as_tibble(rownames = "PATH") %>% 
   dplyr:: mutate(motif = stringr::str_match(PATH, "motifs/(.*?).txt")[,2]) %>% 
    dplyr::select(PATH, motif)

  #merge motif paths with RBP info

  RBP_info <- read.table(RBPinfo, header = TRUE, sep = "\t")
  RBP_info <- RBP_info %>% 
    dplyr::as_tibble() %>% 
    dplyr::select(Motif_ID, RBP_Name) %>% 
    dplyr::filter(Motif_ID != ".") %>% 
    dplyr::group_by(Motif_ID) %>%
    dplyr::summarise(RBP_name = dplyr::first(RBP_Name))

  motifs <- dplyr::left_join(motifs, RBP_info, by = c("motif" = "Motif_ID"))

  print("Getting PWM data", quote = FALSE)

  motifs <- motifs %>%
    dplyr::mutate(PWM = lapply(PATH, function(x) t(read.table(x, header = TRUE, row.names = 1, col.names = c("pos", "A", "C", "G", "T")))))


  PWM_list <- motifs$PWM
  names(PWM_list) <- paste(motifs$RBP_name, "_", motifs$motif, sep = "")

  mattiles <- function(Mat, k) {

    if(ncol(Mat[[1]]) > k){  
      x <- c(1:(ncol(Mat[[1]])-(k-1)))-1
      list <- lapply(x, function(x) Mat[[1]][,(1+x):(k+x)])
      names(list) <- lapply(x, function(x) paste(names(Mat), "_", x + 1, sep = ""))
      list
    }
    else
      Mat
  }

  x <- 1:length(PWM_list)
  tiled_motifs <- unlist(lapply(x, function(x) mattiles(PWM_list[x], k)), recursive = FALSE)


  print("Counting motif occurances...", quote = FALSE)

  counts_list <- lapply(tiled_motifs, function(x) 
    lapply(kmers, function(y) 
      Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "kmer"))

  print("Counting motif occurances complete.", quote = FALSE)

  motif_by_kmer <- dplyr::bind_rows(counts_list, .id = "motif") %>% 
    tidyr::spread(kmer, value = value) %>%
    dplyr::select(motif, dplyr::everything())            

  motif_by_kmer <- motif_by_kmer %>% dplyr::as_tibble((. > 0) + 0) %>% dplyr::mutate(motif = .$motif)
  return(motif_by_kmer)
}

relate_kmer_PWM <- function(k, PWM_list){

    ##make DNAStringSets for every kmer
  x <- c("T", "A", "C", "G")

  kmers <- do.call(expand.grid, rep(list(x), k)) %>% 
    tidyr::unite(kmer, sep = "") %>% 
    dplyr::pull(., kmer) %>% 
    Biostrings::DNAStringSet()

  names(kmers) <- do.call(expand.grid, rep(list(x), k)) %>% 
    tidyr::unite(kmer, sep = "") %>% 
    dplyr::pull(., kmer)

  #Check PWM_list
  print("Checking PWM data", quote = FALSE)

  if (any(as.numeric(lapply(PWM_list, nrow)) != 4)){
    stop("Error: Ensure all PWM matricies have exactly 4 rows (\"A\", \"C\", \"G\" and \"T\")")
  }

  #this function makes every PWM the same length as k or shorter by tiling across longer PWM matrices
  mattiles <- function(Mat, k) {

    if(ncol(Mat[[1]]) > k){  
      x <- c(1:(ncol(Mat[[1]])-(k-1)))-1
      list <- lapply(x, function(x) Mat[[1]][,(1+x):(k+x)])
      names(list) <- lapply(x, function(x) paste(names(Mat), "_", x + 1, sep = ""))
      list
    }
    else
      Mat
  }

  x <- 1:length(PWM_list)
  tiled_motifs <- unlist(lapply(x, function(x) mattiles(PWM_list[x], k)), recursive = FALSE)


  print("Counting motif occurances...", quote = FALSE)

  counts_list <- lapply(tiled_motifs, function(x) 
    lapply(kmers, function(y) 
      Biostrings::countPWM(x, y)) %>% unlist() %>% dplyr::as_tibble(rownames = "kmer"))

  print("Counting motif occurances complete.", quote = FALSE)

  motif_by_kmer <- dplyr::bind_rows(counts_list, .id = "motif") %>% 
    tidyr::spread(kmer, value = value) %>% 
    dplyr::select(motif, dplyr::everything())            

  motif_by_kmer <- motif_by_kmer %>% dplyr::mutate_if(is.numeric, ~1 * (. > 0))

  return(motif_by_kmer)
}


#fourmer_mm_cisbpRNA <- relate_Kmer_PWM(4, CISBPRNA_mm_PWM) #3min
#fivemer_mm_cisbpRNA <- relate_Kmer_PWM(5, CISBPRNA_mm_PWM) #8min
#sixmer_mm_cisbpRNA <- relate_Kmer_PWM(6, CISBPRNA_mm_PWM) #24min
#sevenmer_mm_cisbpRNA <- relate_Kmer_PWM(7, CISBPRNA_mm_PWM) #60min

#fourmer_hs_cisbpRNA <- relate_Kmer_PWM(4, CISBPRNA_hs_PWM) #3min
#fivemer_hs_cisbpRNA <- relate_Kmer_PWM(5, CISBPRNA_hs_PWM) #9min
#sixmer_hs_cisbpRNA <- relate_Kmer_PWM(6, CISBPRNA_hs_PWM) #25min
#sevenmer_hs_cisbpRNA <- relate_Kmer_PWM(7, CISBPRNA_hs_PWM) #60min

#fourmer_RBNS <- relate_Kmer_PWM(4, RBNS_PWM) #1min
#fivemer_RBNS <- relate_Kmer_PWM(5, RBNS_PWM) #3min
#sixmer_RBNS <- relate_Kmer_PWM(6, RBNS_PWM) #8min
#sevenmer_RBNS <- relate_Kmer_PWM(7, RBNS_PWM) #28min


#write.table(fourmer_mm_cisbpRNA, file = "mydata/KmerMotifRelations/fourmer_mm_cisbpRNA.txt")
#write.table(fivemer_mm_cisbpRNA, file = "mydata/KmerMotifRelations/fivemer_mm_cisbpRNA.txt")
#write.table(sixmer_mm_cisbpRNA, file = "mydata/KmerMotifRelations/sixmer_mm_cisbpRNA.txt")
#write.table(sevenmer_mm_cisbpRNA, file = "mydata/KmerMotifRelations/sevenmer_mm_cisbpRNA.txt")

#write.table(fourmer_hs_cisbpRNA, file = "mydata/KmerMotifRelations/fourmer_hs_cisbpRNA.txt")
#write.table(fivemer_hs_cisbpRNA, file = "mydata/KmerMotifRelations/fivemer_hs_cisbpRNA.txt")
#write.table(sixmer_hs_cisbpRNA, file = "mydata/KmerMotifRelations/sixmer_hs_cisbpRNA.txt")
#write.table(sevenmer_hs_cisbpRNA, file = "mydata/KmerMotifRelations/sevenmer_hs_cisbpRNA.txt")

#write.table(fourmer_RBNS, file = "mydata/KmerMotifRelations/fourmer_RBNS.txt")
#write.table(fivemer_RBNS, file = "mydata/KmerMotifRelations/fivemer_RBNS.txt")
#write.table(sixmer_RBNS, file = "mydata/KmerMotifRelations/sixmer_RBNS.txt")
#write.table(sevenmer_RBNS, file = "mydata/KmerMotifRelations/sevenmer_RBNS.txt")

#fourmer_mm_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/fourmer_mm_cisbpRNA.txt")
#fivemer_mm_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/fivemer_mm_cisbpRNA.txt")
#sixmer_mm_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/sixmer_mm_cisbpRNA.txt")
#sevenmer_mm_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/sevenmer_mm_cisbpRNA.txt")

#fourmer_hs_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/fourmer_hs_cisbpRNA.txt")
#fivemer_hs_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/fivemer_hs_cisbpRNA.txt")
#sixmer_hs_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/sixmer_hs_cisbpRNA.txt")
#sevenmer_hs_cisbpRNA <- read.table(file = "mydata/KmerMotifRelations/sevenmer_hs_cisbpRNA.txt")

#fourmer_RBNS <- read.table(file = "mydata/KmerMotifRelations/fourmer_RBNS.txt")
#fivemer_RBNS <- read.table(file = "mydata/KmerMotifRelations/fivemer_RBNS.txt")
#sixmer_RBNS <- read.table(file = "mydata/KmerMotifRelations/sixmer_RBNS.txt")
#sevenmer_RBNS <- read.table(file = "mydata/KmerMotifRelations/sevenmer_RBNS.txt")

#usethis::use_data(fourmer_mm_cisbpRNA, fivemer_mm_cisbpRNA, sixmer_mm_cisbpRNA, sevenmer_mm_cisbpRNA, fourmer_hs_cisbpRNA, fivemer_hs_cisbpRNA, sixmer_hs_cisbpRNA, sevenmer_hs_cisbpRNA, fourmer_RBNS, fivemer_RBNS, sixmer_RBNS, sevenmer_RBNS, internal = TRUE, overwrite = TRUE)
##this should be faster than the full motif search by quite a lot.


estimate_motif_from_kmer <- function(kmer_list, motif_set, custom_motif_by_kmer_matrix = NULL){
  #infer k from kmer_list list.
  if (length(unique(nchar(as.character(kmer_list)))) != 1){
    warning("kmers in kmer list are not the same length using the shortest kmer as k")
    k = min(unique(nchar(as.character(kmer_list))))
  }
  else
    k = unique(nchar(as.character(kmer_list)))

  #check motif_set input
  if (motif_set != "CISBPRNA" & motif_set != "RBNS" & motif_set != "custom"){
    stop(paste("motif_set must be either \"CISBPRNA\", \"RBNS\" or \"custom\"", quote = FALSE))
  }

  #get appropriate motif_by_kmer
  if (k == 4 & motif_set == "CISBPRNA"){
    motif_by_kmer <- read.table("mydata/KmerMotifRelations/fourmer_cisbpRNA.txt")
  }
  else if (k == 4 & motif_set == "RBNS"){
    motif_by_kmer <- read.table("mydata/KmerMotifRelations/fourmer_RBNS.txt")
    #motif_by_kmer <- RNAreachr:::fourmer_RBNS
  }
  else if (k == 5 & motif_set == "CISBPRNA"){
    motif_by_kmer <- read.table("mydata/KmerMotifRelations/fivemer_cisbpRNA.txt")
  }
  else if (k == 5 & motif_set == "RBNS"){
    motif_by_kmer <- read.table("mydata/KmerMotifRelations/fivemer_RBNS.txt")
  }
  else if (k == 6 & motif_set == "CISBPRNA"){
    motif_by_kmer <- read.table("mydata/KmerMotifRelations/sixmer_cisbpRNA.txt")
  }
  else if (k == 6 & motif_set == "RBNS"){
    motif_by_kmer <- read.table("mydata/KmerMotifRelations/sixmer_RBNS.txt")
    #motif_by_kmer <- RNAreachr:::sixmer_RBNS
  }
  else if (k == 7 & motif_set == "CISBPRNA"){
    motif_by_kmer <- read.table("mydata/KmerMotifRelations/sevenmer_cisbpRNA.txt")
  }
  else if (k == 7 & motif_set == "RBNS"){
    motif_by_kmer <- read.table("mydata/KmerMotifRelations/sevenmer_RBNS.txt")
  }

  if (motif_set == "custom" & is.null(custom_motif_by_kmer_matrix) == TRUE){
    stop("A custom motif_by_kmer_matrix is required for estimating custom motif occurance from kmers \n
         use ouput from relate_kmer_PWM()")
  } else if (motif_set == "custom" & is.null(custom_motif_by_kmer_matrix) == FALSE){
    motif_by_kmer <- custom_motif_by_kmer_matrix
  }

  ##calculate probablility of matching...

  x <- c(1:nrow(motif_by_kmer))
  motif_estimate <- motif_by_kmer %>% 
    dplyr::as_tibble() %>% 
    dplyr:: mutate(input_kmer = rowSums(dplyr::select(., kmer_list)), 
                   all_kmer = rowSums(dplyr::select(., -motif)),
                   tile = as.character(lapply(x, function(x) strsplit(as.character(motif_by_kmer$motif[x]), "_(?=[^_]+$)", perl=TRUE)[[1]][2])),
                   motif = as.character(lapply(x, function(x) strsplit(as.character(motif_by_kmer$motif[x]), "_(?=[^_]+$)", perl=TRUE)[[1]][1]))) %>% 
    dplyr::group_by(motif) %>% 
    dplyr::summarize(input_kmer = sum(all_kmer), all_kmer = sum(input_kmer)) %>% 
    dplyr::mutate(p_val = phyper(input_kmer-1, length(kmer_list), (4^k)-length(kmer_list), all_kmer, lower.tail = FALSE), 
                  p_adj = p.adjust(p_val, method = "BH", n = nrow(.)))  %>% 
    dplyr::arrange(p_adj) 

  return(motif_estimate)

}

FMR1_RBNS_estimate <- estimate_Motif_from_Kmer(pos_FMR1_kmer, "RBNS") #1sec
FMR1_cisbpRNA_estimate <- estimate_Motif_from_Kmer(pos_FMR1_kmer, "CISBPRNA") #1sec

ALL a work in progress from here down

no more defined functions

FMR1_RBNS_estimate <- FMR1_RBNS_estimate %>% arrange(p_adj) %>% mutate(est_rank = c(1:nrow(.)))
FMR1_cisbpRNA_estimate <- FMR1_cisbpRNA_estimate %>% arrange(p_adj) %>% mutate(est_rank = c(1:nrow(.)))

##compare to direct motif scanning output:

##ranking by pvalue and log2FC (only for the motif scan)
FMR1_RBNS_top <- FMR1_RBNS %>% ungroup() %>%  filter(log2FC > 0) %>% arrange(desc(log2FC), p_adj) %>% mutate(rank = c(1:nrow(.)))
FMR1_RBNS_bottom <- FMR1_RBNS %>% ungroup() %>%  filter(log2FC < 0) %>% arrange(desc(log2FC), desc(p_adj)) %>% mutate(rank = c((nrow(FMR1_RBNS_top)+1):(nrow(.)+nrow(FMR1_RBNS_top))))
FMR1_RBNS <- rbind(FMR1_RBNS_top, FMR1_RBNS_bottom)

FMR1_motif_top <- FMR1_motif %>% ungroup() %>%  filter(log2FC > 0) %>% arrange(desc(log2FC), p_adj) %>% mutate(rank = c(1:nrow(.)))
FMR1_motif_bottom <- FMR1_motif %>% ungroup() %>%  filter(log2FC < 0) %>% arrange(desc(log2FC), desc(p_adj)) %>% mutate(rank = c((nrow(FMR1_motif_top)+1):(nrow(.)+nrow(FMR1_motif_top))))
FMR1_motif <- rbind(FMR1_motif_top, FMR1_motif_bottom)

full_join(FMR1_RBNS, FMR1_RBNS_estimate, by = "motif") %>% ggplot(aes(x = rank, y = est_rank)) + geom_point() + geom_smooth(aes(x = rank, y = est_rank), method = lm, se = FALSE, inherit.aes = FALSE) + theme_cowplot() + ggpubr::stat_cor(method = "spearman") 

FMR1_cisbpRNA_estimate %>% separate(motif, into = c("RBP_name", "motif", "extra"), sep = "_") %>% unite(motif, extra, col = "motif") %>% full_join(., FMR1_motif, by = "motif") %>% ggplot(aes(x = rank, y = est_rank)) + geom_point() + geom_smooth(aes(x = rank, y = est_rank), method = lm, se = FALSE, inherit.aes = FALSE) + theme_cowplot() + ggpubr::stat_cor(method = "spearman") 

##directly comparing adjusted p values

FMR1_cisbpRNA_estimate %>% separate(motif, into = c("RBP", "motif"), extra = "merge") %>% dplyr::select(RBP, motif, p_val, p_adj) %>% rename(p_val = "estimated_p_val", p_adj = "estimated_p_adj") %>% full_join(., FMR1_motif) %>% ggplot(aes(x = -log(estimated_p_adj), y = -log(p_adj))) + geom_point() + geom_smooth(aes(x = -log(estimated_p_adj), y = -log(p_adj)), method = lm, se = FALSE, inherit.aes = FALSE) + theme_cowplot() + ggpubr::stat_cor(method = "spearman") 

FMR1_RBNS_estimate %>% dplyr::select(motif, p_val, p_adj) %>% rename(p_val = "estimated_p_val", p_adj = "estimated_p_adj") %>% full_join(., FMR1_RBNS) %>% ggplot(aes(x = -log(estimated_p_adj), y = -log(p_adj))) + geom_point() + geom_smooth(aes(x = -log(estimated_p_adj), y = -log(p_adj)), method = lm, se = FALSE, inherit.aes = FALSE) + theme_cowplot() + ggpubr::stat_cor(method = "spearman") 


##phyper overlap
est_hits <- FMR1_RBNS_estimate %>%  filter(p_adj < 0.05) %>% pull(., motif)
length(est_hits)
scan_hits <- FMR1_RBNS %>% filter(p_adj < 0.05, log2FC > 0) %>% pull(., motif)
length(scan_hits)
est_hits %in% scan_hits %>% sum()

phyper(sum(est_hits %in% scan_hits)-1, length(scan_hits), nrow(FMR1_RBNS)-length(scan_hits), length(est_hits), lower.tail = FALSE)



est_hits <- FMR1_cisbpRNA_estimate %>% separate(motif, into = c("RBP_name", "motif"), extra = "merge") %>% filter(p_adj < 0.05) %>% pull(., motif)
length(est_hits)
scan_hits <- FMR1_motif %>% filter(p_adj < 0.05, log2FC > 0) %>% pull(., motif)
length(scan_hits)
est_hits %in% scan_hits %>% sum()

phyper(sum(est_hits %in% scan_hits-1), length(scan_hits), nrow(FMR1_motif)-length(scan_hits), length(est_hits), lower.tail = FALSE)
consensusMatrix(DNAStringSet(pos_FMR1_kmer), as.prob = TRUE) %>% .[1:4,] %>% seqLogo::seqLogo()

d  <- adist(as.character(pos_FMR1_kmer)) #this is Levenshtein distance 
rownames(d) <- as.character(pos_FMR1_kmer)
hc <- hclust(as.dist(d))
plot(hc)
rect.hclust(hc, k=3, border = "red")
df <- data.frame(kmer = as.character(pos_FMR1_kmer),group = cutree(hc,k=3)) %>% as_tibble()


consensusMatrix(DNAStringSet(pull(filter(df, group == 1), kmer)), as.prob = TRUE) %>% .[1:4,] %>% seqLogo::seqLogo()
consensusMatrix(DNAStringSet(pull(filter(df, group == 2), kmer)), as.prob = TRUE) %>% .[1:4,] %>% seqLogo::seqLogo()
consensusMatrix(DNAStringSet(pull(filter(df, group == 3), kmer)), as.prob = TRUE) %>% .[1:4,] %>% seqLogo::seqLogo()

pairwiseAlignment(DNAStringSet(pull(filter(df, group == 1), kmer)), subject = as.character(pull(filter(df, group == 1), kmer))[1], type = "overlap") %>% Views() %>% consensusMatrix(as.prob = TRUE, shift = start(.)-1) %>% .[1:4,] %>% seqLogo::seqLogo()
pairwiseAlignment(DNAStringSet(pull(filter(df, group == 2), kmer)), subject = as.character(pull(filter(df, group == 2), kmer))[1], type = "overlap") %>% Views() %>% consensusMatrix(as.prob = TRUE, shift = start(.)-1) %>% .[1:4,] %>% seqLogo::seqLogo()
pairwiseAlignment(DNAStringSet(pull(filter(df, group == 3), kmer)), subject = as.character(pull(filter(df, group == 3), kmer))[1], type = "overlap") %>% Views() %>% consensusMatrix(as.prob = TRUE, shift = start(.)-1) %>% .[1:4,] %>% seqLogo::seqLogo()

pairwiseAlignment(DNAStringSet(pull(filter(df, group == 2), kmer))[1], DNAStringSet(pull(filter(df, group == 2), kmer))[2], type = "overlap") %>% consensusMatrix(as.prob = TRUE) %>% .[1:4,2:6] %>% seqLogo::seqLogo()



##cm is a consensus matrix from aligned() (with -)
cm <- pairwiseAlignment(DNAStringSet(pull(filter(df, group == 3), kmer)), subject = as.character(pull(filter(df, group == 3), kmer))[1], type = "overlap") %>% aligned() %>% consensusMatrix()

cm %>% t() %>%  as_tibble(rownames = "pos") %>% dplyr::select(pos, A, C, G, T, '-') %>% mutate(N = `-`, N = N/4) %>% dplyr::select(pos,A,C,G,T,N) %>% mutate(A = A+N, C = C+N, G = G+N, T = T+N ) %>% dplyr::select(-pos, -N) %>% mutate(rowsum = rowSums(dplyr::select(., A,C,G,T)), A = A/rowsum, C = C/rowsum, G = G/rowsum, T = T/rowsum) %>% dplyr::select(-rowsum) %>% t() %>% as.matrix() %>% seqLogo::seqLogo()


TaliaferroLab/FeatureReachR documentation built on Aug. 15, 2021, 2:21 p.m.