# R/filterdup.R

# Defines functions filterdup

# Documented in filterdup

#' filterdup
#'
#' @param ifile Alignment file. If multiple files are given as a character vector, e.g. c('A', 'B', 'C'), then they will all be read and combined. REQUIRED.
#' @param gsize Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.
#' @param format Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let 'macs3 filterdup' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"
#' @param tsize Tag size. This will override the auto detected tag
#'     size. DEFAULT: Not set
#' @param pvalue Pvalue cutoff for binomial distribution
#'     test. DEFAULT:1e-5.
#' @param keepduplicates It controls the 'macs3 filterdup' behavior towards duplicate tags/pairs at the exact same location -- the same coordinates and the same strand. The 'auto' option makes 'macs3 filterdup' calculate the maximum tags at the exact same location based on binomial distribution using the given pvalue as cutoff; and the 'all' option keeps all tags (useful if you only want to convert formats). If an integer is given, at most this number of tags will be kept at the same location. Note, MACS3 callpeak function uses KEEPDUPLICATES=1 as default. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. Default: auto
#' @param outputfile Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout
#' @param outdir The output directory.
#' @param verbose Set verbose level of runtime message. 0: only show
#'     critical message, 1: show additional warning message, 2: show
#'     process information, 3: show debug messages.  DEFAULT: 2.
#' @param buffer_size Buffer size for incrementally increasing
#'     internal array size to store reads alignment information. In
#'     most cases, you don't have to change this parameter. However,
#'     if there are large number of chromosomes/contigs/scaffolds in
#'     your alignment, it's recommended to specify a smaller buffer
#'     size in order to decrease memory usage (but it will take longer
#'     time to read alignment files). Minimum memory requested for
#'     reading an alignment file is about # of CHROMOSOME *
#'     BUFFER_SIZE * 8 Bytes. DEFAULT: 100000.
#' @param dryrun When set, filterdup will only output numbers instead
#'     of writing output files, including maximum allowable
#'     duplicates, total number of reads before filtering, total
#'     number of reads after filtering, and redundant rate. Default:
#'     not set.
#' @param log Whether to capture logs.
#' @importFrom utils read.table
#' @return `macsList` object.
#' @export
#' @examples
#' eh <- ExperimentHub::ExperimentHub()
#' CHIP <- eh[["EH4558"]]
#' res <- filterdup(ifile = CHIP, outputfile = "test.bed", outdir = tempdir())
filterdup <- function(ifile, gsize = "hs", format = "AUTO",
                      tsize = NULL, pvalue = 1e-5, keepduplicates = "auto",
                      outputfile = character(), outdir = ".", verbose = 2L,
                      buffer_size = 10000, dryrun = FALSE, log = TRUE){
    # NOTE(review): the roxygen text for `buffer_size` states
    # "DEFAULT: 100000" while the signature default is 10000 -- confirm
    # which one is intended. The signature is left unchanged here.

    # Character input is normalized to absolute paths and wrapped in a
    # list before being handed to Python; any other input is forwarded
    # untouched.
    if(is.character(ifile)){
        ifile <- as.list(normalizePath(ifile))
    }

    cl <- basiliskStart(env_macs)
    # add = TRUE so this cleanup never replaces another exit handler.
    on.exit(basiliskStop(cl), add = TRUE)

    # Worker executed through basiliskRun(). Every value it uses is a
    # formal argument: basilisk does not guarantee that the function can
    # see the calling environment, so nothing is captured by closure.
    run_filterdup <- function(.namespace, ifile, gsize, format, tsize,
                              pvalue, keepduplicates, outputfile, outdir,
                              verbose, buffer_size, dryrun, log){
        # Build the option object that is passed to the MACS3 command.
        opts <- .namespace()$Namespace(gsize = gsize,
                                       tsize = tsize,
                                       pvalue = pvalue,
                                       format = format,
                                       keepduplicates = keepduplicates,
                                       verbose = verbose,
                                       outputfile = outputfile,
                                       outdir = outdir,
                                       ifile = ifile,
                                       buffer_size = buffer_size,
                                       dryrun = dryrun)
        .filterdup <- reticulate::import("MACS3.Commands.filterdup_cmd")
        if(log){
            # Return the captured Python output as the worker's result.
            reticulate::py_capture_output(.filterdup$run(opts))
        }else{
            # Without capturing, the result is whatever run() returns.
            .filterdup$run(opts)
        }
    }

    res <- basiliskRun(cl, run_filterdup,
                       .namespace = .namespace,
                       ifile = ifile,
                       gsize = gsize,
                       format = format,
                       tsize = tsize,
                       pvalue = pvalue,
                       keepduplicates = keepduplicates,
                       outputfile = outputfile,
                       outdir = outdir,
                       verbose = verbose,
                       buffer_size = buffer_size,
                       dryrun = dryrun,
                       log = log)

    # Echo the captured log to the R console.
    if(log){
        message(res)
    }

    ofile <- file.path(outdir, outputfile)
    # Record the call as issued (function name followed by the supplied
    # arguments) alongside the output path and the log.
    args <- as.list(match.call())
    macsList(arguments = args, outputs = ofile, log = res)
}
# macs3-project/MACSr documentation built on Sept. 24, 2024, 11:09 p.m.