R/GenBackground.R

Defines functions randomSampleOnGenome genBackground

Documented in genBackground

#' @importFrom methods new
#' @importFrom GenomeInfoDb seqlengths
#' @importFrom BiocGenerics start end
#' @importFrom BiocGenerics start<- end<-
setClass(Class = "GenBackground",
         contains = "EnrichStep"
)

setMethod(
    f = "init",
    signature = "GenBackground",
    definition = function(.Object,prevSteps = list(),...){
        allparam <- list(...)
        inputForegroundBed <- allparam[["inputForegroundBed"]]
        param(.Object)$inputBackgroundBed <- allparam[["inputBackgroundBed"]]
        genome <- allparam[["genome"]]
        outputForegroundBed <- allparam[["outputForegroundBed"]]
        outputBackgroundBed <- allparam[["outputBackgroundBed"]]
        outputRegionBed <- allparam[["outputRegionBed"]]
        regionLen <- allparam[["regionLen"]]
        sampleNumb <- allparam[["sampleNumb"]]
        if(length(prevSteps)>0){
            prevStep <- prevSteps[[1]]
            foregroundBed <- getParam(prevStep,"bedOutput")
            input(.Object)$inputForegroundBed <- foregroundBed
        }

        if(!is.null(inputForegroundBed)){
            input(.Object)$inputForegroundBed <- inputForegroundBed
        }

        if(is.null(outputForegroundBed)){
            output(.Object)$outputForegroundBed <-
                getAutoPath(.Object,originPath =
                                input(.Object)[["inputForegroundBed"]],
                            regexSuffixName = "bed",suffix = "foreground.bed")
        }else{
            output(.Object)$outputForegroundBed <- outputForegroundBed
        }

        if(is.null(outputBackgroundBed)){
            output(.Object)$outputBackgroundBed <-
                getAutoPath(.Object,originPath =
                                input(.Object)[["inputForegroundBed"]],
                            regexSuffixName = "bed",suffix = "background.bed")
        }else{
            output(.Object)$outputBackgroundBed <- outputBackgroundBed
        }

        if(is.null(outputRegionBed)){
            output(.Object)$outputRegionBed <-
                getAutoPath(.Object,originPath =
                                input(.Object)[["inputForegroundBed"]],
                            regexSuffixName = "bed",suffix = "allregion.bed")
        }else{
            output(.Object)$outputRegionBed <- outputRegionBed
        }
        if(is.null(genome)){
            if(getGenome() == "testgenome"){
                param(.Object)$bsgenome <- BSgenome::getBSgenome("hg19")
            }else{
                param(.Object)$bsgenome <- BSgenome::getBSgenome(getGenome())
            }
        }else{
            if(genome == "testgenome"){
                genome <- "hg19"
            }
            param(.Object)$bsgenome <- BSgenome::getBSgenome(genome = genome)


        }
        param(.Object)$regionLen <- regionLen
        if(is.null(sampleNumb)){
            param(.Object)$sampleNumb<- 0
        }else{
            param(.Object)$sampleNumb <- sampleNumb
        }

        .Object
    }
)


# randomSampleOnGenome<-function(regionLen, sampleNumber,bsgenome){
#     chrlens <-seqlengths(bsgenome)
#     selchr <- grep("_|M",names(chrlens),invert=TRUE)
#     chrlens <- chrlens[selchr]
#     startchrlens <- chrlens - regionLen
#     totallen <- sum(startchrlens)
#     spnb <- floor(runif(sampleNumber) * totallen) + 1
#     acclen <- startchrlens
#     for(i in 2:length(acclen)){
#         acclen[i] <- acclen[i-1] + acclen[i]
#     }
#     acclen <- c(0,acclen)
#     gr <- GRanges()
#     for(i in 1:(length(acclen)-1)){
#         sel <- spnb[(spnb>acclen[i]) & (spnb<=acclen[i+1])]
#         sel <- sel - acclen[i]
#         gr <- c(gr,GRanges(seqnames = names(acclen)[i+1], ranges = IRanges(start = sel, width = 1000)))
#     }
#     return(sort(gr,ignore.strand=TRUE))
# }

randomSampleOnGenome<-function(regionLen, sampleNumber,bsgenome,bgbed = NULL){
    if(is.null(bgbed)){
        chrlens <-seqlengths(bsgenome)
        if(getGenome() == "testgenome"){
            chrlens<-chrlens['chr1']
        }
        selchr <- grep("_|M",names(chrlens),invert=TRUE)
        chrlens <- chrlens[selchr]
        startchrlens <- chrlens - regionLen
        spchrs <- sample(x = names(startchrlens),size =  sampleNumber,
                         replace = TRUE, prob = startchrlens / sum(startchrlens))
        #gr <- GRanges()
        # for(chr in names(startchrlens)){
        #     if(sum(spchrs == chr) == 0){
        #         next
        #     }
        #     startpt <- sample(x = 1:startchrlens[chr],size = sum(spchrs == chr),replace = FALSE)
        #     #non overlapped method:
        #     #startpt <- sample(x = 1:(startchrlens[chr] - sum(spchrs == chr) * regionLen),size = sum(spchrs == chr),replace = FALSE)
        #     #startpt <- startpt + 0:(sum(spchrs == chr)-1) * regionLen
        #     gr <- c(gr,GRanges(seqnames = chr, ranges = IRanges(start = startpt, width = 1000)))
        # }
        gr <- lapply(names(startchrlens), function(chr){
            if(sum(spchrs == chr) == 0){
                return(NULL)
            }
            startpt <- sample(x = seq_len(startchrlens[chr]),
                              size = sum(spchrs == chr),replace = FALSE)
            #non overlapped method:
            #startpt <- sample(x = 1:(startchrlens[chr] - sum(spchrs == chr) * regionLen),size = sum(spchrs == chr),replace = FALSE)
            #startpt <- startpt + 0:(sum(spchrs == chr)-1) * regionLen
            return(GRanges(seqnames = chr,
                           ranges = IRanges(start = startpt, width = regionLen)))
        })
        gr <- do.call("c",gr)
        return(sort(gr,ignore.strand=TRUE))
    }else{
        bgbed <- import.bed(bgbed)
        w <- width(bgbed)
        w <- w/sum(w)
        idx <- sample(1:length(bgbed), size = sampleNumber, prob = w, replace = TRUE)
        tb<-as.data.frame(table(idx))

        moresel <- tb[tb[,2]>1,]
        lesssel <- as.integer(as.character(tb[tb[,2]==1,1]))

        lst <- lapply(1:nrow(moresel), function(x){
            i <- as.integer(as.character(moresel[x,1]))
            nb <- as.integer(moresel[x,2])
            print(nb)
            chr <- as.character(seqnames(bgbed[i]))
            st <- start(bgbed[i]) - regionLen
            ed <- end(bgbed[i])
            sts <-sample(st:ed, size = nb, replace = TRUE)
            return(GRanges(seqnames=Rle(chr, nb), IRanges(sts, width=(regionLen+1))))
        })
        moregr <- do.call(c,lst)
        lessgr <- bgbed[lesssel]
        middle <- round((start(lessgr) + end(lessgr))/2)
        start(lessgr) <- middle - round(regionLen/2)
        end(lessgr) <- middle + round(regionLen/2)
        gr <-  c(moregr, lessgr)
        score(gr) <- NULL
        mcols(gr)$name <-NULL
        return(sort(gr,ignore.strand=TRUE))
    }
}

setMethod(
    f = "processing",
    signature = "GenBackground",
    definition = function(.Object,...){
        inputForegroundBed <- getParam(.Object,"inputForegroundBed")
        inputBackgroundBed <- getParam(.Object,"inputBackgroundBed")
        bsgenome <- getParam(.Object,"bsgenome")
        outputForegroundBed <- getParam(.Object,"outputForegroundBed")
        outputBackgroundBed <- getParam(.Object,"outputBackgroundBed")
        outputRegionBed <- getParam(.Object,"outputRegionBed")
        regionLen <- getParam(.Object,"regionLen")
        sampleNumb <- getParam(.Object,"sampleNumb")

        foregroundgr <- import(con = inputForegroundBed,format = "bed")
        midpoint <- (start(foregroundgr) + end(foregroundgr))/2
        start(foregroundgr) <- floor(midpoint - regionLen/2)
        end(foregroundgr) <- floor(midpoint + regionLen/2)
        foregroundgr <- sort(foregroundgr,ignore.strand=TRUE)
        mcols(foregroundgr)$name <-  seq_len(length(foregroundgr))
        export.bed(object = foregroundgr, con = outputForegroundBed)
        if(sampleNumb == 0){
            sampleNumb = length(foregroundgr)
        }
        backgroundgr <- randomSampleOnGenome(regionLen, sampleNumb, bsgenome, inputBackgroundBed)
        mcols(backgroundgr)$name <-
            (length(foregroundgr) + 1) : (length(foregroundgr) + length(backgroundgr))
        export.bed(object = backgroundgr, con = outputBackgroundBed)
        regiongr <- c(foregroundgr,backgroundgr)
        score(regiongr) <- 0
        export.bed(object = regiongr, con = outputRegionBed)
        .Object
    }
)


setMethod(
    f = "genReport",
    signature = "GenBackground",
    definition = function(.Object, ...){
        .Object
    }
)



#' @name GenBackground
#' @importFrom rtracklayer import
#' @importFrom rtracklayer import.bed
#' @title Generate background regions and reset the size of
#' foreground regions
#' @description
#' Use uniform distribution to generate background
#' sequence regions from genome.
#' The size of foreground regions will be unified into the
#' length specified in argument.
#' @param prevStep \code{\link{Step-class}} object scalar.
#' It needs to be the return value of upstream process
#' from other packages, such as esATAC.
#' @param inputForegroundBed \code{Character} scalar.
#' The directory of foreground BED file.
#' @param inputBackgroundBed \code{Character} scalar.
#' The directory of backgroud BED file.
#' Default: NULL to generate random region from genome.
#' @param genome \code{Character} scalar.
#' Bioconductor supported genome such as "hg19", "mm10", etc.
#' Default: NULL (e.g. after library (enrichTF),
#' you can call function \code{setGenome("hg19")})
#' @param outputForegroundBed \code{Character} scalar.
#' The BED file directory of reshaped foreground regions.
#' Default: NULL (generated base on inputForegroundBed)
#' @param outputBackgroundBed \code{Character} scalar.
#' The BED file directory of reshaped background regions.
#' Default: NULL (generated base on inputForegroundBed)
#' @param outputRegionBed \code{Character} scalar.
#' Foreground and background merged BED files.
#' Default: NULL (generated base on inputForegroundBed)
#' @param regionLen \code{Character} scalar.
#' It sets the length of forground sequence regions.
#' Default: 1000
#' @param sampleNumb \code{numeric} scalar.
#' It sets the number of background regions that
#' will be sampled. Default: 10000
#' @param ... Additional arguments, currently unused.
#' @details
#' Use uniform distribution to generate background sequence
#' regions from genome.
#' The size of foreground regions will be unified into the
#'  length specified in argument.
#' @return An invisible \code{\link{EnrichStep-class}}
#' object (\code{\link{Step-class}} based) scalar for downstream analysis.
#' @author Zheng Wei
#' @seealso
#' \code{\link{regionConnectTargetGene}}
#' \code{\link{findMotifsInRegions}}
#' \code{\link{tfsEnrichInRegions}}

#' @examples
#' setGenome("testgenome") #Use "hg19","hg38",etc. for your application
#' foregroundBedPath <- system.file(package = "enrichTF", "extdata","testregion.bed")
#' gen <- genBackground(inputForegroundBed = foregroundBedPath)


setGeneric("enrichGenBackground",
           function(prevStep,
                    inputForegroundBed = NULL,
                    inputBackgroundBed = NULL,
                    genome = NULL,
                    outputForegroundBed = NULL,
                    outputBackgroundBed = NULL,
                    outputRegionBed = NULL,
                    regionLen = 1000,
                    sampleNumb = 10000,
                    ...) standardGeneric("enrichGenBackground"))



#' @rdname GenBackground
#' @aliases enrichGenBackground
#' @export
setMethod(
    f = "enrichGenBackground",
    signature = "Step",
    definition = function(prevStep,
                          inputForegroundBed = NULL,
                          inputBackgroundBed = NULL,
                          genome = NULL,
                          outputForegroundBed = NULL,
                          outputBackgroundBed = NULL,
                          outputRegionBed = NULL,
                          regionLen = 1000,
                          sampleNumb = NULL,
                          ...){
        allpara <- c(list(Class = "GenBackground",
                          prevSteps = list(prevStep)),
                     as.list(environment()),list(...))
        step <- do.call(new,allpara)
        invisible(step)
    }
)
#' @rdname GenBackground
#' @aliases genBackground
#' @export
genBackground <- function(inputForegroundBed,
                          inputBackgroundBed = NULL,
                          genome = NULL,
                          outputForegroundBed = NULL,
                          outputBackgroundBed = NULL,
                          outputRegionBed = NULL,
                          regionLen = 1000,
                          sampleNumb = NULL,
                          ...){
    allpara <- c(list(Class = "GenBackground",
                      prevSteps = list()),
                 as.list(environment()),list(...))
    step <- do.call(new,allpara)
    invisible(step)
}
wzthu/enrichTF documentation built on March 30, 2020, 1:51 p.m.