R/Motif_compare.R

Defines functions motif_compare

Documented in motif_compare

#' Compare Motif occurances between two DNAStringSets
#'
#' This function compares motif occurances between between case and control
#' DNAStringSets.
#'
#' @param PWM_list a named list of position weight matrices. A list of named
#'   matrices where the name is unique, and the matrix columns represent
#'   positions, and the rows are the probability of each base "A", "C", "G", and
#'   "T". Each column should sum to 1. FeatureReachR has three PWM_lists build in:
#'   "CISBPRNA_mm_PWM", "CISBPRNA_hs_PWM", "JASPAR_mm_PWM", "JASPAR_hs_PWM", and "RBNS_PWM".
#' @param caseDNAStringSet A DNAStringSet Object. Use
#'   \code{\link[Biostrings]{readDNAStringSet}} on a fasta file to create.
#' @param ctrlDNAStringSet A DNAStringSet Object. Use
#'   \code{\link[Biostrings]{readDNAStringSet}} on a fasta file to create.
#' @return a summary table including the pvalue as calculated by
#'   \code{wilcox.test}, then corrected by \code{p.adjust} with the \code{method = "BH"}.
#'   The mean values for each DNAStringSet, and the log2 fold
#'   change between those means.
#' @seealso \code{\link{write_Sequence}}, \code{\link[Biostrings]{readDNAStringSet}},
#'   \code{\link[Biostrings]{countPWM}}, \code{\link{motif_by_gene}}
#' @examples
#' case_fasta <- Biostrings::readDNAStringSet("example_case.fa")
#' ctrl_fasta <- Biostrings::readDNAStringSet("example_ctrl.fa")
#'
#' motif_compare(CISBPRNA_mm_PWM, case_fasta, ctrl_fasta)
#' motif_compare(RBNS_PWM, case_fasta, ctrl_fasta)
#'
#' @export
motif_compare <- function(PWM_list, caseDNAStringset, ctrlDNAStringSet){

  print("Checking PWM data", quote = FALSE)

  if (any(names(caseDNAStringset) %in% names(ctrlDNAStringSet))){
    warning("some sequences in case set are also in the control set. This is not recommended.")
  }

  if (typeof(PWM_list) != "list"){
    stop("PWM_list must be a list")
  }

  if (any(as.numeric(lapply(PWM_list, nrow)) != 4)){
    stop("Error: Ensure all PWM matricies have exactly 4 rows (\"A\", \"C\", \"G\" and \"T\")")
  }
  motifs <- names(PWM_list) %>% dplyr::as_tibble() %>% dplyr::rename("motif" = value) %>% dplyr::mutate(PWM = unname(PWM_list))

  fisher <- function(a, b, c, d){
    mat <- matrix(c(a, b, c, d), nr = 2)
    fisher.test(mat, alternative = "two.sided")$p.value
  }

  print("Counting motif occurances and calculating statistics...", quote = FALSE)


  motifs <- motifs %>%
    dplyr::mutate(case = suppressWarnings(unlist(lapply(PWM, function(x) lapply(caseDNAStringset, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))),
           ctrl = suppressWarnings(unlist(lapply(PWM, function(x) lapply(ctrlDNAStringSet, function(y) Biostrings::countPWM(x, y)) %>% unlist() %>% sum()))),
           case_freq = case / sum(Biostrings::width(caseDNAStringset)),
           ctrl_freq = ctrl / sum(Biostrings::width(ctrlDNAStringSet)),
           log2FC = log2(case_freq/ctrl_freq),
           case_tot = sum(case)-case,
           ctrl_tot = sum(ctrl)-ctrl) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(pval = fisher(case, ctrl, case_tot, ctrl_tot)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(p_adj = p.adjust(pval, method = "BH")) %>%
    dplyr::select(motif, case, ctrl, case_freq, ctrl_freq, log2FC, case_tot, ctrl_tot, pval, p_adj) %>%
    dplyr::arrange(p_adj)

  print("All Finished.", quote = FALSE)

  return(motifs)

}
TaliaferroLab/FeatureReachR documentation built on Aug. 15, 2021, 2:21 p.m.