R/find_motifs_instances.R

Defines functions find_motifs_instances

Documented in find_motifs_instances

#' Find Instances of Motifs Across Regions
#'
#' Calls \code{findMotifsGenome.pl} with the \code{-find} option set
#' to run a search for instances of specified motifs across a given
#' set of regions. Results are then saved to a specified file. This
#' function does not do motif finding, but rather uses a motif file
#' to search for and return the exact regions in which motifs are found.
#' Motifs are provided as matrices as generated by \code{find_motifs_genome}
#' and read in by \code{read_*_results}. 
#' 
#' @param x \code{data.frame} with the first three columns being
#'   chromosome, start, and end coordinates, with a fourth column
#'   corresponding to the region identifier; extra columns may be kept;
#'   x may alternately be a path to an existing bed file
#' @param path path of file to save motif instances results to
#' @param genome ID of installed genome; check installed genomes using
#'   \code{list_homer_packages()}; examples include "hg38" and "mm10";
#'   add an 'r' at the end to mask repeats, e.g. "mm10r"
#' @param motif_file path to file containing all instances of motifs
#'   to be scanned for; can be written be \code{write_homer_motif}
#' @param scan_size size of sequence to scan; this can be a numeric to
#'   specify the number of bases to scan centered on the region, or
#'   alternately can be set to "given" to scan the entire region;
#'   if using "given", will use the "-chopify" option to cut large
#'   background sequences to average of target sequence size
#'   [default: \code{given}]
#' @param cores number of cores to use [default: all cores available]
#' @param cache number in MB to use as cache to store sequences in memory
#'   [default: calculates free memory and divides by 4]
#' @return Nothing; called for its side-effect of producing HOMER results
#' 
#' @export
#' @seealso \code{\link{read_known_results}}, \code{\link{read_denovo_results}}

## @param motif_pwm vector of motif_pwm's either derived from \code{find_motifs_genome}
## #' and read in by \code{read_*_results}, formatted with columns
## #' 'A', 'C', 'G', 'T', and rows as nucleotide probabilities such that
## #' each row sum equal to 1
## find_instances("inst/extdata/test_regions.bed", "inst/extdata/test_inst.txt", "mm10r",
##                "inst/extdata/test.motif",
##                scan_size = "given")

find_motifs_instances <- function(x, path, genome,
                           motif_file, 
                           scan_size = 'given',
                           cores = parallel::detectCores(),
                           cache = .calc_free_mem() / 4) {

    ## Error checking -----------------------------------------------------
    ## File format for `x` - write bed files if data.frame; assign if a character
    if ("data.frame" %in% class(x)) {
        target_bed <- tempfile("target_")
        .write_bed(x, path = target_bed)
    } else {
        if (file.exists(x) != TRUE) {
            stop("Check that your bed file for `x` exists")
        }        
        target_bed <- x
    }

    ## Run findMotifsGenome.pl ---------------------------------------------
    ## Make HOMER results output dir
    ## Construct and run command
    homer_base <- get_homer_bin()
    
    cmd <- paste(
        paste0(homer_base, "findMotifsGenome.pl"),
        target_bed, genome, tempfile(pattern = "fi_"),
        "-find", motif_file,
        "-size", scan_size,        
        "-p", cores,
        "-cache", cache,
        ">", path
    )
    system(cmd)

    if (scan_size == "given") {
        cmd <- paste(cmd, "-chopify")
    }

}
robertamezquita/marge documentation built on Sept. 30, 2020, 6:15 a.m.