R/fimo.R

Defines functions prepareFimoFlags runFimo

Documented in runFimo

#' Find instances of motifs using FIMO
#'
#' FIMO scans input sequences to identify the positions of matches to each input
#' motif. FIMO has no sequence length or motif number restrictions.
#'
#' @param sequences path to fasta file, or stringset input.
#' @param motifs path to .meme format file, or universalmotif/universalmotif list input.
#' @param bfile path to background file, or special values: "motif" to use
#'   0-order frequencies contained in the motif, or "uniform" to use a uniform
#'   letter distribution. (default: "motif")
#' @param outdir output directory location. Only used if text = FALSE. Default:
#'   "auto" to autogenerate directory name. Note: if not using a fasta path as
#'   input, this will be a temporary location unless explicity set.
#' @param parse_genomic_coord `logical(1)` whether to parse genomic position
#'   from fasta headers. Fasta headers must be UCSC format positions (ie
#'   "chr:start-end"), but base 1 indexed (GRanges format). If names of fasta
#'   entries are genomic coordinates and parse_genomic_coord == TRUE, results
#'   will contain genomic coordinates of motif matches, otherwise FIMO will return
#'   relative coordinates (i.e. positions from 1 to length of the fasta entry).
#' @param skip_matched_sequence `logical(1)` whether or not to include the DNA
#'   sequence of the match. Default: `FALSE`. Note: jobs will complete faster if
#'   set to `TRUE`. `add_sequence()` can be used to lookup the sequence after data import if
#'   `parse_genomic_coord` is `TRUE`, so setting this flag is not strictly needed.
#' @param max_strand if match is found on both strands, only report strand with
#'   best match (default: TRUE).
#' @param text `logical(1)` (default: `TRUE`). No output files will be created
#'   on the filesystem. The results are unsorted and no q-values are computed.
#'   This setting allows fast searches on very large inputs. When set to `FALSE`
#'   FIMO will discard 50% of the lower significance matches if >100,000 matches are
#'   detected. `text = FALSE` will also incur a performance penalty because it
#'   must first read a file to disk, then read it into memory. For these reasons,
#'   I suggest keeping `text = TRUE`.
#' @param meme_path path to `meme/bin/` (optional). Defaut: `NULL`, searches
#'   "MEME_PATH" environment variable or "meme_path" option for path to "meme/bin/".
#' @param silent `logical(1)` whether to suppress stdout/stderr printing to
#'   console (default: TRUE). If the command is failing or giving unexpected
#'   output, setting `silent = FALSE` can aid troubleshooting.
#' @param ... additional commandline arguments to fimo. See the FIMO Flag table below.
#'
#' @return GRanges object containing positions of each match. Note: if
#'   `parse_genomic_coord = FALSE`, each `seqnames` entry will be the full fasta
#'   header, and start/end will be the relative position within that sequence of the
#'   match. The GRanges object has the following additional `mcols`:
#'     * motif_id = primary name of matched motif
#'     * motif_alt_id = alternate name of matched motif
#'     * score = score of match (higher score is a better match)
#'     * pvalue = pvalue of the match
#'     * qvalue = qvalue of the match
#'     * matched_sequence = sequence that matches the motif
#' @export
#'
#' @details Additional arguments passed to `...`. See: [Fimo web manual](http://meme-suite.org/doc/fimo.html?man_type=web)
#'   for a complete description of FIMO flags.
#'
#'
#' | FIMO Flag         | allowed values | default | description                |
#' |:-----------------:|:--------------:|:-------:|:---------------------------|
#' | alpha             | `numeric(1)`   | 1       | alpha for calculating position-specific priors. Represents fraction of sites that are binding sites of TF of interest. Used in conjunction with `psp` |
#' | bfile             | "motif", "motif-file", "uniform", file path, | "motif" | If "motif" or "motif-file", use 0-order letter frequencies from motif. "uniform" sets uniform letter frequencies. |
#' | max_stored_scores | `integer(1)`   | NULL    | maximum number of scores to be stored for computing q-values. used when `text = FALSE`, see FIMO webpage for details |
#' | motif_pseudo      | `numeric(1)`   | 0.1     | pseudocount added to motif matrix |
#' | no_qvalue         | `logical(1)`   | FALSE   | only needed when `text = FALSE`, do not compute q-value for each p-value |
#' | norc              | `logical(1)`   | FALSE   | Do not score reverse complement strand |
#' | prior_dist        | file path      | NULL    | file containing binned distribution of priors |
#' | psp               | file path      | NULL    | file containing position specific priors. Requires `prior_dist` |
#' | qv_thresh         | `logical(1)`   | FALSE   | use q-values for the output threshold |
#' | thresh            | `numeric(1)`   | `1e-4`  | output threshold for returning a match, only matches with values less than `thresh` are returned. |
#'
#'
#' @details # Citation
#' If you use `runFimo()` in your analysis, please cite:
#'
#' Charles E. Grant, Timothy L. Bailey, and William Stafford Noble, "FIMO:
#' Scanning for occurrences of a given motif", Bioinformatics, 27(7):1017-1018,
#' 2011. [full text](http://bioinformatics.oxfordjournals.org/content/early/2011/02/16/bioinformatics.btr064.full)
#'
#' @details ## Licensing
#' The MEME Suite is free for non-profit use, but for-profit users should purchase a
#' license. See the [MEME Suite Copyright Page](http://meme-suite.org/doc/copyright.html) for details.
#'
#' @md
#'
#' @examples
#' if (meme_is_installed()){
#' # Generate some example input sequences
#' seq <- universalmotif::create_sequences()
#' # sequences must have values in their fasta headers
#' names(seq) <- seq_along(seq)
#' # Create random example motif to search for
#' motif <- universalmotif::create_motif()
#'
#' # Search for motif in sequences
#' # parse_genomic_coord set to FALSE since fasta headers aren't in "chr:start-end" format.
#' runFimo(seq, motif, parse_genomic_coord = FALSE)
#' }
runFimo <- function(sequences, motifs, bfile = "motif",
                    outdir = "auto",
                    parse_genomic_coord = TRUE,
                    skip_matched_sequence = FALSE,
                    max_strand = TRUE,
                    text = TRUE,
                    meme_path = NULL,
                    silent = TRUE,
                    ...){

  sequences <- sequence_input(sequences)
  motifs <- motif_input(motifs)

  if (outdir == "auto") outdir <- paste0(outdir_name(sequences, motifs$path), "_fimo")

  user_flags <- prepareFimoFlags(bfile = bfile,
                            parse_genomic_coord = parse_genomic_coord,
                            skip_matched_sequence = skip_matched_sequence,
                            max_strand = max_strand,
                            text = text,
                            outdir = outdir,
                            ...)

  flags <- c(user_flags, motifs$path, sequences)

  command <- search_meme_path(path = meme_path, util = "fimo")

  ps_out <- processx::run(command, flags, error_on_status = FALSE)

  ps_out %>%
    process_check_error(help_fun = ~{fimoHelp(command)},
                        user_flags = cmdfun::cmd_help_parse_flags(user_flags) %>%
                          # filter out special inputs to bfile
                          grep("--$", ., invert = TRUE, value = TRUE),
                        flags_fun = ~{gsub("-", "_", .x)},
                        default_help_fun = TRUE)

  print_process_stdout(ps_out, silent = silent)
  print_process_stderr(ps_out, silent = silent)

  if (!is.na(text)) {
    if (text){
      fimo_res <- ps_out$stdout %>%
        I() %>%
        parseFimo()
      return(fimo_res)
    }
  }


  fimo_out <- cmdfun::cmd_file_combn("fimo", "tsv", outdir = outdir)

  #print(fimo_out$tsv)
  fimo_out$tsv %>%
    parseFimo()
}

#' Parse flags for FIMO input
#'
#' @param bfile background file
#' @param parse_genomic_coord parse genomic coordinates from name
#' @param skip_matched_sequence skip matched
#' @param max_strand max strand
#' @param text text
#' @param outdir outdir
#' @param ... ...
#'
#' @return
#'
#' @noRd
prepareFimoFlags <- function(bfile, parse_genomic_coord, skip_matched_sequence, max_strand, text, outdir = outdir, ...){

  argsDict <- c("outdir" = "oc")

  flags <- cmdfun::cmd_args_all() %>%
    cmdfun::cmd_list_interp(argsDict) %>%
    purrr::set_names(~{gsub("_", "-", .x)})

  if (!is.null(bfile)){
    if (file.exists(bfile) & bfile %in% c("motif", "uniform")) {
      message(paste0("Working directory contains a file named: ", bfile, " that will be used as the `bfile` argument.",
      "To force use of the FIMO keyword, pass argument as: `bfile = --", bfile, "--`"))
    }
    if (!file.exists(bfile) & bfile %in% c("motif", "uniform")){
      # if bfile isn't a path, user is probably inputting special keywords which
      # get wrapped in '--', but first drop all "-" in bfile input in case user
      # added '--' already.
      flags$bfile %<>% gsub("-", "", .) %>%
        gsub("(.+)", "--\\1--", .)
    }
  }

  flags %>%
    cmdfun::cmd_list_to_flags(prefix = "--")

}

#' Returns help string for FIMO
#'
#' @param command
#'
#' @return
#'
#' @noRd
fimoHelp <- function(command){
  processx::run(command, error_on_status = FALSE)$stderr
}

#' Import fimo matches as GRanges object.
#'
#' @param fimo_tsv path to fimo.tsv output
#'
#' @return GenomicRanges object for each match position. Note: if
#'   parse_genomic_coord == FALSE, each `seqnames` entry will be the fasta
#'   header, and start/end will be the position within that sequence of the
#'   match.
#'
#' @noRd
parseFimo <- function(fimo_tsv){

  fimo_lines <- tryCatch(readr::read_tsv(fimo_tsv,
                                  comment = "#",
                                  col_types = c("motif_id" = "c",
                                                "motif_alt_id" = "c",
                                                "sequence_name" = "c",
                                                "start" = "i",
                                                "stop" = "i",
                                                "strand" = "c",
                                                "score" = "d",
                                                "p-value" = "d",
                                                "q-value" = "d",
                                                "matched_sequence" = "c")
                                  ), 
                         error = function(e) {
                           # move the empty string check down
                           if (fimo_tsv == ""){
                             return(NULL)
                           }
                           stop(e)
                           }, 
                         warning = function(w) {
                           # If the above file import fails w/ warning
                           # (usually because fimo.tsv is empty)
                           # Double check that the file is actually empty
                           # (no other lines except for comments & empty lines)
                           # Then return NULL if that's the case.
                           lines <- readr::read_lines(fimo_tsv) %>%
                              grep("^#", ., invert = TRUE, value = TRUE) 
                           line_lengths <- vapply(lines, nchar, integer(1))
                           
                           if (any(line_lengths) > 0){
                             stop(paste("Error reading file:", fimo_tsv))
                           }
                           
                           return(NULL)
                           }
                         )
 
  # NULL is only returned above if fimo_tsv is empty, therefore no matches
  if (is.null(fimo_lines)) {
    message("No matches were detected")
    return(NULL)
  }
  
  fimo_matches <- fimo_lines %>%
    dplyr::rename_all(~{gsub("-", "", .)}) %>%
    dplyr::rename("seqnames" = "sequence_name") %>%
    # NOTE: FIMO uses 1-based coordinates, so no need to shift for GRanges conversion
    GenomicRanges::GRanges()
    # Compute q-value?
    #dplyr::group_by(motif_alt_id) %>%
    #dplyr::mutate(q.value = p.value/n())
}

#' Import FIMO results
#'
#' @param fimo_tsv path to fimo.tsv output file
#'
#' @return GenomicRanges object for each match position. Note unless coordinates
#'   are genomic positions, each `seqnames` entry will be the fasta header, and
#'   start/end will be the position within that sequence of the match.
#' @export
#'
#' @examples
#' fimo_tsv <- system.file("extdata", "fimo.tsv", package = "memes")
#' importFimo(fimo_tsv)
importFimo <- function(fimo_tsv){
  parseFimo(fimo_tsv)
}
snystrom/dremeR documentation built on May 1, 2024, 7:14 a.m.