R/dreme.R

Defines functions get_probability_matrix dreme_to_pfm dreme_get_background_freq dreme_motif_stats dreme_help parseDreme prepareDremeFlags runDreme.default runDreme.BStringSetList runDreme.list

#' @export
#' @noRd
runDreme.list <- function(input, control, outdir = "auto", meme_path = NULL, silent = TRUE, ...){

  x <- sequence_input_control_list(input, control)
  input <- x$input
  control <- x$control

  res <- purrr::map(input, runDreme.default,
             control = control,
             outdir = outdir,
             meme_path = meme_path,
             silent = silent,
             ...
             )
  return(res)
}

#' @export
#' @noRd
runDreme.BStringSetList <- function(input, control, outdir = "auto", meme_path = NULL, silent = TRUE, ...){
  runDreme.list(as.list(input), control, outdir, meme_path, silent, ...)
}

#' @export
#' @noRd
runDreme.default <- function(input, control, outdir = "auto", meme_path = NULL, silent = TRUE, ...){

  # Handle multiple input types by multiple dispatch
  # input & control will be coerced to file paths
  input <- sequence_input(input)
  control <- sequence_input(control)

  if (outdir == "auto") {outdir <- outdir_name(input, control)}

  flags <- prepareDremeFlags(input = input, control = control, outdir = outdir, ...)

  command <- search_meme_path(path = meme_path, util = "dreme")
  ps_out <- processx::run(command, flags, spinner = TRUE, error_on_status = FALSE)

  ps_out %>%
    process_check_error(help_fun = ~{dreme_help(command)},
                        user_flags = cmdfun::cmd_help_parse_flags(flags),
                        default_help_fun = TRUE)

  print_process_stdout(ps_out, silent = silent)

  n_motifs <- dreme_nmotifs_found(ps_out)

  if (n_motifs == 0) {return(NULL)}

  dreme_out <- cmdfun::cmd_file_expect("dreme", c("txt", "html", "xml"), outdir = outdir)

  dreme_results <- parseDreme(dreme_out$xml)

  return(dreme_results)
}

#' Prepare flags for Dreme
#'
#' @param input input file path
#' @param control control file path
#' @param outdir output directory
#' @param ... additional flags defined by dreme. Pass flag and resulting value
#'   (if any) as arg = value. If the flag has no value, set arg = TRUE.
#'
#' @return vector of flags for system2 or processx
#'
#' @importFrom magrittr %>%
#'
#' @noRd
prepareDremeFlags <- function(input, control, outdir, ...){
  argDict <- c(nmotifs = "m",
               sec = "t",
               evalue = "e",
               seed = "s",
               input = "p",
               control = "n",
               outdir = "oc",
               ngen = "g")

  flags <- cmdfun::cmd_args_all() %>%
    cmdfun::cmd_list_interp(argDict) %>%
    cmdfun::cmd_list_drop(c("n" = "shuffle")) %>%
    cmdfun::cmd_list_to_flags()

  return(flags)

}

#' Import dreme output to R
#'
#'
#' @param xml path to dreme_out/dreme.xml
#'
#' @return data.frame with `motif` column containing universalmotif object representation of each DREME motif.
#'
#' parseDreme("dreme_out/dreme.xml")
#' @noRd
parseDreme <- function(xml){
  dreme_stats <- dreme_motif_stats(xml) %>% 
    # Don't need pvalue or evalue cols anymore 
    # since they're added by universalmotif as 
    # pval and eval
    dplyr::select(-"pvalue", -"evalue")

  pfms <- dreme_to_pfm(xml)

  dreme_stats$motif <- pfms
  # Convert to motif_df format
  # suppressing messages about adding empty motif slots
  suppressMessages(
    universalmotif::update_motifs(dreme_stats, extrainfo = TRUE)
    )
}

#' Returns Dreme help lines
#'
#' @param command path to ame. output of search_meme_path(util = "ame")
#'
#' @return
#'
#' @noRd
dreme_help <- function(command){
  processx::run(command, "-h", error_on_status = FALSE)$stderr
}

#' Return statistics of DREME results
#'
#' @param dreme_xml_path path to dreme.xml
#'
#' @return data.frame of dreme result statistics
#'
#' Columns:
#'  - id: motif id. Number represents order of discovery.
#'  - alt: alternative id. Number represents order of discovery.
#'  - seq: IUPAC sequence of matched motif
#'  - length: basepair length of discovered motif
#'  - nsites: number of times motif is discovered in reference sequence (can be more than 1 per entry)
#'  - positive_hits: number of positive sequences with at least 1 site
#'  - negative_hits: number of negative sequences with at least 1 site
#'  - pvalue: pvalue
#'  - evalue: evalue
#'  - unerased_evalue
#'  - positive_total: total number of sequences in positives (ie number of fasta entries)
#'  - negative_total: total number of sequences in negatives
#'  - pos_frac/neg_frac: fraction of positive or negative sites with a match
#'
#' @importFrom magrittr %>%
#' @importFrom magrittr %<>%
#' @importFrom rlang .data
#'
#' @noRd
dreme_motif_stats <- function(dreme_xml_path) {
  # Extract metadata about number of matches for each motif, etc.

  dreme_xml <- xml2::read_xml(dreme_xml_path)

  # Get information about # of positive & # negative regions
  pos_info <- xml2::xml_children(dreme_xml)[1] %>%
    xml2::xml_find_all("//positives") %>%
    attrs_to_df() %>%
    dplyr::mutate(count = as.numeric(as.character(.data$count)))

  neg_info <- xml2::xml_children(dreme_xml)[1] %>%
    xml2::xml_find_all("//negatives") %>%
    attrs_to_df() %>%
    dplyr::mutate(count = as.numeric(as.character(.data$count)))

  # Extract statistics & motif counts for each dreme motif
  motif_stats <- xml2::xml_children(dreme_xml)[2] %>%
    xml2::xml_children() %>%
    attrs_to_df()
  dbl_cols <- grep("[^id|alt|seq]", names(motif_stats), value = TRUE)
  motif_stats %<>%
    dplyr::mutate_if(is.factor, as.character) %>%
    dplyr::mutate_at(dbl_cols, as.numeric) %>%
    dplyr::mutate_at(c("length", "nsites",
                       "p", "n"), as.integer)

  # append info about positive / negative regions
  # compute some useful statistics
  motif_stats_final <- motif_stats %>%
    dplyr::rename("positive_hits" = "p",
                  "negative_hits" = "n") %>%
    dplyr::mutate("positive_total" = pos_info$count %>% as.integer,
                  "negative_total" = neg_info$count %>% as.integer,
                  "pos_frac" = .data$positive_hits/.data$positive_total,
                  "neg_frac" = .data$negative_hits/.data$negative_total) %>%
    dplyr::mutate(rank = gsub("^m", "", .data$id) %>% as.integer(),
                  id = paste0(.data$id, "_", .data$seq)) %>%
    dplyr::select("rank", dplyr::everything()) %>%
    # Finally, change id and alt to "name" and "altname"
    # for compatibility with universalmotif
    dplyr::rename("name" = "id",
                  "altname" = "alt")

  return(motif_stats_final)
}


#' Return named vector of letter frequencies from DREME run
#'
#' @param dreme_run_info `xml_nodeset` of the dreme run info
#'
#' @return named vector of letter frequencies, suitable as input to `bkg` in
#'   universalmotif::create_motif
#'
#' @importFrom magrittr %>%
#'
#' @noRd
dreme_get_background_freq <- function(dreme_run_info){
  background_entry <- xml2::xml_find_all(dreme_run_info, "//background")

  background_df <- background_entry %>%
    attrs_to_df() %>%
    dplyr::select(-"from") %>%
    lapply(function(x) as.character(x) %>% as.numeric) %>%
    dplyr::bind_cols()

  background_freq <- as.numeric(background_df) %>%
    magrittr::set_names(names(background_df))

  return(background_freq)
}

#' return list of universalmotif PCM objects
#'
#' @param dreme_xml_path path to dreme.xml output
#'
#' @return list of PCM output in Universalmotif format. Appended with metadata
#'   from DREME output.
#'
#' @importFrom magrittr %>%
#'
#' @noRd
dreme_to_pfm <- function(dreme_xml_path){
  dreme_xml <- xml2::read_xml(dreme_xml_path)

  dreme_run_info <- xml2::xml_children(dreme_xml)[1] %>%
    xml2::xml_children()

  background_freq <- dreme_get_background_freq(dreme_run_info)

  motifs_matrix <- xml2::xml_children(dreme_xml)[2] %>%
    xml2::xml_children() %>%
    purrr::map(get_probability_matrix) %>%
    purrr::map(t)

  motif_stats_list <- dreme_motif_stats(dreme_xml_path) %>%
    split(.$name)

  pfmList <- purrr::map2(motif_stats_list, motifs_matrix, ~{
    universalmotif::create_motif(.y,
                                 type = "PCM",
                                 name = .x$name,
                                 altname = .x$altname,
                                 bkg = background_freq,
                                 pval = .x$pvalue,
                                 nsites = .x$nsites,
                                 bkgsites = .x$negative_total,
                                 eval = .x$evalue)

  })

  return(pfmList)
}

#' Return probability matrix for dreme motif
#'
#' Takes a <motif></motif> XML entry to return the probability matrix
#'
#' @param motif_xml_entry
#'
#' @return position probability matrix
#'
#' @noRd
get_probability_matrix <- function(motif_xml_entry){
  # takes a <motif></motif> XML entry to return the probability matrix
  # WARNING: matrix is a character matrix (NOT NUMERIC)
  # need to do the lapply(matrix, function(x)
  # as.character(x) %>% as.numeric()) %>% bind_cols(.)
  # trick for numeric matrix
  motif_attr <- attrs_to_df(motif_xml_entry, stringsAsFactors = FALSE)

  if (!("length" %in% names(motif_attr))){
    # STREME
    nsites <- motif_attr$width %>%
       as.character() %>%
       as.integer()
  } else {
    # DREME
    nsites <- motif_attr$length %>%
       as.character() %>%
       as.integer()
  }

  freqs <- motif_xml_entry %>%
    xml2::xml_children(.) %>%
    .[seq_len(nsites)]

  freq_table <- lapply(freqs, attrs_to_df, stringsAsFactors = FALSE) %>%
    dplyr::bind_rows()

  freq_matrix <- lapply(freq_table, function(x) as.character(x) %>% as.numeric) %>%
                    dplyr::bind_rows(.) %>%
                    as.matrix(.)

  return(freq_matrix)
}

#' Return line reporting number of motifs passing cutoff in DREME stdout
#'
#' @param stdout stdout from processx
#'
#' @return
#'
#' @noRd
dreme_nmotifs_line <- function(stdout){
  lines <- strsplit(stdout, "\n") %>%
    .[[1]]

  matchLine <- grep("\\d motifs with E-value <", lines, value = TRUE)
  return(matchLine)
}

#' Grab number of discovered motifs from stdout line
#'
#' @param line output of dreme_nmotifs_line
#'
#' @return
#'
#' @noRd
dreme_nmotifs <- function(line){
  nmotifs <- gsub("(^\\d+).+", "\\1", line)
  return(as.integer(nmotifs))
}

#' Return number of discovered DREME motifs
#'
#' @param processx_out output of processx run
#'
#' @return `integer(1)` of number of motifs passing threshold
#'
#' @noRd
dreme_nmotifs_found <- function(processx_out){
  processx_out$stdout %>%
    dreme_nmotifs_line() %>%
    dreme_nmotifs()
}
snystrom/dremeR documentation built on Oct. 13, 2024, 10:48 p.m.