R/recoup.R

Defines functions toOutput applySelectors recoup

Documented in recoup

recoup <- function(
    input,
    design=NULL,
    region=c("genebody","tss","tes","utr3","custom"),
    type=c("chipseq","rnaseq"),
    signal=c("coverage","rpm"),
    genome=c("hg18","hg19","hg38","mm9","mm10","rn5","rn6","dm3","dm6",
        "danrer7","danrer10","pantro4","pantro5","susscr3","susscr11",
        "ecucab2","tair10"),
    version="auto",
    refdb=c("ensembl","ucsc","refseq"),
    flank=c(2000,2000),
    onFlankFail=c("drop","trim"),
    fraction=1,
    orderBy=list(
        what=c("none","suma","sumn","maxa","maxn","avga","avgn","hcn"),
        order=c("descending","ascending"),
        custom=NULL
    ),
    #orderBy=getDefaultListArgs("orderBy"),
    binParams=list(
        flankBinSize=0,
        regionBinSize=0,
        sumStat=c("mean","median"),
        interpolation=c("auto","spline","linear","neighborhood"),
        binType=c("variable","fixed"),
        forceHeatmapBinning=TRUE,
        forcedBinSize=c(50,200),
        chunking=FALSE#,
        #seed=42
    ),
    #binParams=getDefaultListArgs("binParams"),
    selector=NULL,
    #selector=getDefaultListArgs("selector"),
    preprocessParams=list(
        fragLen=NA,
        cleanLevel=c(0,1,2,3),
        normalize=c("none","linear","downsample","sampleto"),
        sampleTo=1e+6,
        spliceAction=c("split","keep","remove"),
        spliceRemoveQ=0.75,
        #bedGenome=ifelse(genome %in% c("hg18","hg19","hg38","mm9","mm10","rn5",
        #    "dm3","danrer7","pantro4","susscr3"),genome,NA),
        bedGenome=NA#,
        #seed=42
    ),
    #preprocessParams=getDefaultListArgs("preprocessParams"),
    plotParams=list(
        plot=TRUE,
        profile=TRUE,
        heatmap=TRUE,
        correlation=TRUE,
        signalScale=c("natural","log2"),
        heatmapScale=c("common","each"),
        heatmapFactor=1,
        corrScale=c("normalized","each"),
        sumStat=c("mean","median"),
        smooth=TRUE,
        corrSmoothPar=ifelse(is.null(design),0.1,0.5),
        singleFacet=c("none","wrap","grid"),
        multiFacet=c("wrap","grid"),
        singleFacetDirection=c("horizontal","vertical"),
        conf=TRUE,
        device=c("x11","png","jpg","tiff","bmp","pdf","ps"),
        outputDir=".",
        outputBase=NULL
    ),
    #plotParams=getDefaultListArgs("plotParams",design),
    saveParams=list(
        ranges=TRUE,
        coverage=TRUE,
        profile=TRUE,
        profilePlot=TRUE,
        heatmapPlot=TRUE,
        correlationPlot=TRUE
    ),
    #saveParams=getDefaultListArgs("saveParams"),
    kmParams=list(
        k=0, # Do not perform kmeans
        nstart=20,
        algorithm=c("Hartigan-Wong","Lloyd","Forgy","MacQueen"),
        iterMax=20,
        reference=NULL#,
        #seed=42
    ),
    #kmParams=getDefaultListArgs("kmParams"),
    strandedParams=list(
        strand=NULL,
        ignoreStrand=TRUE
    ),
    #strandedParams=getDefaultListArgs("strandedParams"),
    ggplotParams=list(
        title=element_text(size=12),
        axis.title.x=element_text(size=10,face="bold"),
        axis.title.y=element_text(size=10,face="bold"),
        axis.text.x=element_text(size=9,face="bold"),
        axis.text.y=element_text(size=10,face="bold"),
        strip.text.x=element_text(size=10,face="bold"),
        strip.text.y=element_text(size=10,face="bold"),
        legend.position="bottom",
        panel.spacing=grid::unit(1,"lines")
    ),
    #ggplotParams=getDefaultListArgs("ggplotParams"),
    complexHeatmapParams=list(
        main=list(
            cluster_rows=ifelse(length(grep("hc",orderBy$what))>0,TRUE,FALSE),
            cluster_columns=FALSE,
            column_title_gp=grid::gpar(fontsize=10,font=2),
            show_row_names=FALSE,
            show_column_names=FALSE,
            heatmap_legend_param=list(
                color_bar="continuous"
            )
        ),
        group=list(
            cluster_rows=ifelse(length(grep("hc",orderBy$what))>0,TRUE,FALSE),
            cluster_columns=FALSE,
            column_title_gp=grid::gpar(fontsize=10,font=2),
            show_row_names=FALSE,
            show_column_names=FALSE,
            row_title_gp=grid::gpar(fontsize=8,font=2),
            gap=unit(5,"mm"),
            heatmap_legend_param=list(
                color_bar="continuous"
            )
        )
    ),
    #complexHeatmapParams=getDefaultListArgs("complexHeatmapParams"),
    bamParams=NULL,
    onTheFly=FALSE, # Directly from BAM w/o Ranges storing, also N/A with BED,
    #localDb=file.path(path.expand("~"),".recoup"),
    localDb=file.path(system.file(package="recoup"),"annotation.sqlite"),
    rc=NULL
) {
    # Simple check to throw error if user us using the old databasing system
    if (any(dir.exists(file.path(dirname(localDb),refdb))))
        stop("Possible old recoup database system detected. Please rebuild ",
            "using the new system or download a pre-built database. See also ",
            "the vignettes.")
    
    ############################################################################
    # Begin of simple parameter checking for a new or a restored object
    if (is.list(input) && !is.null(input$data)) { # Refeeding recoup object
        message("Detected a previous recoup output as input. Any existing ",
            "data requiring\nlengthy calculations (short reads, coverages and ",
            "profile matrices will be\nreused depending on other parameters. ",
            "If you want a complete recalculation,\neither input a fresh list ",
            "of arguments or remove any of the above with the\nremoveData ",
            "function.\n")
        prevCallParams <- input$callopts
        refRanges <- input$refRanges
        input <- input$data
    }
    else
        prevCallParams <- refRanges <- NULL
        
    if (!is.list(input) && file.exists(input))
        input <- readConfig(input)
    checkInput(input)
    if (is.null(names(input)))
        names(input) <- vapply(input,function(x) return(x$id),character(1))
    
    # Check if there are any mispelled or invalid parameters and throw a warning
    checkMainArgs(as.list(match.call()))
    
    # Check rest of arguments
    region <- tolower(region[1])
    refdb <- tolower(refdb[1])
    type <- tolower(type[1])
    onFlankFail <- tolower(onFlankFail[1])
    signal <- tolower(signal[1])
    if (!is.null(design) && !is.data.frame(design))
        checkFileArgs("design",design)
    if (!is.data.frame(genome) && !is.list(genome) && file.exists(genome))
        checkFileArgs("genome",genome)
    else if (is.character(genome)) {
        if (is.character(localDb) && file.exists(localDb)) {
            if (!.userOrg(genome,localDb))
                checkTextArgs("genome",genome,getSupportedOrganisms(),
                    multiarg=FALSE)
        }
    }
    if (is.character(localDb) && file.exists(localDb)) {
        if (!.userRefdb(refdb,localDb))
            checkTextArgs("refdb",refdb,getSupportedRefDbs(),multiarg=FALSE)
    }
    checkTextArgs("type",type,c("chipseq","rnaseq"),multiarg=FALSE)
    checkTextArgs("onFlankFail",onFlankFail,c("drop","trim"),multiarg=FALSE)
    checkTextArgs("signal",signal,c("coverage","rpm"),multiarg=FALSE)
    checkNumArgs("fraction",fraction,"numeric",c(0,1),"botheq")
    if (any(flank<0))
        stop("The minimum flanking allowed is 0 bp")
    if (any(flank>50000))
        stop("The maximum flanking allowed is 50000 bp")
    # Check the version argument
    version <- version[1]
    if (is.character(version)) {
        version <- tolower(version)
        checkTextArgs("version",version,c("auto"),multiarg=FALSE)
    }
    else
        checkNumArgs("version",version,"numeric")
    # If type is rnaseq, only genebody plots are valid
    if (type=="rnaseq" && region!="genebody") {
        warning("When type is \"rnaseq\", plots can be created only on ",
            "genebodies! Switching to genebody regions...",immediate.=TRUE)
        region <- "genebody"
    }
    
    # If type is rnaseq, read extension is illegal because of potential splicing
    if (type=="rnaseq" && !is.null(preprocessParams$fragLen)
        && !is.na(preprocessParams$fragLen)) {
        warning("When type is \"rnaseq\", read extension/reduction should not ",
            "happen because of potential splicing! Ignoring...",immediate.=TRUE)
        preprocessParams$fragLen <- NA
    }
    
    # If type is rnaseq, the only allowed genomes are the ones supported by
    # recoup for the time being. In the future, a custom RNA-Seq genome may be
    # constructed from a GFF or like...
    if (type=="rnaseq" && is.character(localDb) && file.exists(localDb) 
        && !.userOrg(genome,localDb) && !(genome %in% getSupportedOrganisms()))
        stop("When type is \"rnaseq\", only the supported or user imported ",
            "genomes are allowed!")

    # annotation must be a list to be fed to buildCustomAnnotation
    if (is.list(genome) && !is.data.frame(genome)) {
        # members are checked by buildCustomAnnotation if required
        # We only need to check that the gtfFile exists here
        if (!("gtf" %in% names(genome)))
            stop("A gtf field must be provided with an existing GTF file ",
                "when providing a list with custom annotation elements!")
        if ("gtf" %in% names(genome) && is.character(genome$gtf)
            && !file.exists(genome$gtf))
            stop("An existing GTF file must be provided when providing a ",
                "list with custom annotation elements!")
    }
        
    # Check what is going on with genome, first check if file, then localDb
    if (!is.data.frame(genome) && !is.list(genome) && is.character(genome)
        && !file.exists(genome) && is.character(localDb) 
        && file.exists(localDb)) {
        if (!.userOrg(genome,localDb))
            checkTextArgs("genome",genome,c("hg18","hg19","hg38","mm9","mm10",
                "rn5","rn6","dm3","dm6","danrer7","danrer10","pantro4",
                "pantro5","susscr3","susscr11","ecucab2","tair10"),
                multiarg=FALSE)
        if (!.userRefdb(refdb,localDb))
            checkTextArgs("refdb",refdb,c("ensembl","ucsc","refseq"),
                multiarg=FALSE)
    }
    
    # and list arguments
    orderByDefault <- getDefaultListArgs("orderBy")
    binParamsDefault <- getDefaultListArgs("binParams")
    selectorDefault <- getDefaultListArgs("selector")
    preprocessParamsDefault <- getDefaultListArgs("preprocessParams",
        genome=genome)
    plotParamsDefault <- getDefaultListArgs("plotParams",design=design)
    saveParamsDefault <- getDefaultListArgs("saveParams")
    kmParamsDefault <- getDefaultListArgs("kmParams")
    strandedParamsDefault <- getDefaultListArgs("strandedParams")
    
    orderBy <- setArg(orderByDefault,orderBy)
    binParams <- setArg(binParamsDefault,binParams)
    if (!is.null(selector))
        selector <- setArg(selectorDefault,selector)
    preprocessParams <- setArg(preprocessParamsDefault,preprocessParams)
    plotParams <- setArg(plotParamsDefault,plotParams)
    saveParams <- setArg(saveParamsDefault,saveParams)
    kmParams <- setArg(kmParamsDefault,kmParams)
    strandedParams <- setArg(strandedParamsDefault,strandedParams)
    
    orderBy <- validateListArgs("orderBy",orderBy)
    binParams <- validateListArgs("binParams",binParams)
    if (!is.null(selector))
        selector <- validateListArgs("selector",selector)
    preprocessParams <- validateListArgs("preprocessParams",preprocessParams)
    plotParams <- validateListArgs("plotParams",plotParams)
    saveParams <- validateListArgs("saveParams",saveParams)
    kmParams <- validateListArgs("kmParams",kmParams)
    strandedParams <- validateListArgs("strandedParams",strandedParams)
    
    if (is.null(plotParams$outputBase))
        plotParams$outputBase <- paste(vapply(input,function(x) return(x$id),
            character(1)),collapse="-")
    
    bbb <- vapply(input,function(x) return(tolower(x$format)=="bed"),logical(1))
    if (any(bbb) && !(preprocessParams$bedGenome %in% c("hg18","hg19","hg38",
            "mm9","mm10","rn5","dm3","danrer7","pantro4","susscr3")))
        stop("When short read files are in BED format, either the genome ",
            "parameter should be one\nof the supported organisms, or ",
            "preprocessParams$bedGenome must be specified as one of them.")
   
    # End of simple parameter checking for a new or a restored object
    ############################################################################
    
    ############################################################################
    # Begin of complex parameter storage procedure and parameter recall from a
    # previous call
    
    # Store all parameters to an obect for later reference to a next call, after
    # checking if something has changed (e.g. using setr)...
    thisCall <- as.list(match.call())[-1]
    if (!is.null(prevCallParams)) {
        callParams <- list(
            region=if (is.null(thisCall$region)) prevCallParams$region else
                region,
            type=if (is.null(thisCall$type)) prevCallParams$type else type,
            onFlankFail=if (is.null(thisCall$onFlankFail)) 
                prevCallParams$onFlankFail else onFlankFail,
            signal=if (is.null(thisCall$signal)) prevCallParams$signal 
                else signal,
            genome=if (is.null(thisCall$genome)) prevCallParams$genome else
                genome,
            refdb=if (is.null(thisCall$refdb)) prevCallParams$refdb else
                refdb,
            version=if (is.null(thisCall$version)) prevCallParams$version else
                version,
            flank=if (is.null(thisCall$flank)) prevCallParams$flank else flank,
            fraction=if (is.null(thisCall$fraction)) prevCallParams$fraction
                else fraction,
            orderBy=if (is.null(thisCall$orderBy)) prevCallParams$orderBy else
                orderBy,
            binParams=if (is.null(thisCall$binParams)) prevCallParams$binParams
                else binParams,
            selector=selector, # selector is a special case
            preprocessParams=if (is.null(thisCall$preprocessParams))
                prevCallParams$preprocessParams else preprocessParams,
            plotParams=if (is.null(thisCall$plotParams)) 
                prevCallParams$plotParams else plotParams,
            saveParams=if (is.null(thisCall$saveParams))
                prevCallParams$saveParams else saveParams,
            kmParams=if (is.null(thisCall$kmParams))
                prevCallParams$kmParams else kmParams,
            strandedParams=if (is.null(thisCall$strandedParams))
                prevCallParams$strandedParams else strandedParams,
            ggplotParams=if (is.null(thisCall$ggplotParams))
                prevCallParams$ggplotParams else ggplotParams,
            complexHeatmapParams=if (is.null(thisCall$complexHeatmapParams))
                prevCallParams$complexHeatmapParams else complexHeatmapParams,
            #bamParams=if (is.null(thisCall$bamParams),
            #    prevCallParams$bamParams,bamParams), ## Unused
            onTheFly=if (is.null(thisCall$onTheFly)) prevCallParams$onTheFly 
                else onTheFly,
            localDb=if (is.null(thisCall$localDb)) 
                prevCallParams$localDb else localDb,
            rc=if (is.null(thisCall$rc)) prevCallParams$rc else rc,
            customIsBase=NULL # Additional placeholder
        )
    }
    else {
        callParams <- list(
            region=region,
            type=type,
            onFlankFail=onFlankFail,
            signal=signal,
            genome=genome,
            version=version,
            refdb=refdb,
            flank=flank,
            fraction=fraction,
            orderBy=orderBy,
            binParams=binParams,
            selector=selector,
            preprocessParams=preprocessParams,
            plotParams=plotParams,
            saveParams=saveParams,
            kmParams=kmParams,
            strandedParams=strandedParams,
            ggplotParams=ggplotParams,
            complexHeatmapParams=complexHeatmapParams,
            bamParams=bamParams,
            onTheFly=onTheFly,
            localDb=localDb,
            rc=rc,
            customIsBase=NULL # Additional placeholder
        )
    }
    
    # ...and check if there has been a previous call and decide on big
    # recalculations...
    input <- decideChanges(input,callParams,prevCallParams)
    
    # Redefine all final arguments for this call
    region <- callParams$region
    type <- callParams$type
    onFlankFail <- callParams$onFlankFail
    signal <- callParams$signal
    genome <- callParams$genome
    version <- callParams$version
    refdb <- callParams$refdb
    flank <- callParams$flank
    fraction <- callParams$fraction
    orderBy <- callParams$orderBy
    binParams <- callParams$binParams
    selector <- callParams$selector
    preprocessParams <- callParams$preprocessParams
    plotParams <- callParams$plotParams
    saveParams <- callParams$saveParams
    kmParams <- callParams$kmParams
    strandedParams <- callParams$strandedParams
    ggplotParams <- callParams$ggplotParams
    complexHeatmapParams <- callParams$complexHeatmapParams
    bamParams <- callParams$bamParams
    onTheFly <- callParams$onTheFly
    localDb <- callParams$localDb
    rc <- callParams$rc
    # End of complex parameter storage procedure and parameter recall from a
    # previous call
    ############################################################################
    
    # Continue with actual work
    # If annotation already here and not removed from the input object because
    # of change in region, flank or type, no need to reload it and thus, recoup
    # object become even more mobile and portable as localDb is not needed.
    if (!is.null(refRanges)) {
        genomeRanges <- refRanges$genomeRanges
        helperRanges <- refRanges$helperRanges
    }
    else {
        # Annotation case #1: provided as a BED-like data frame with loci
        if (is.data.frame(genome)) {
            if (type=="rnaseq")
                stop("When type=\"rnaseq\", only the usage of a supported or ",
                    "user-imported organism from GTF file is allowed!")
            rownames(genome) <- as.character(genome[,4])
            genomeRanges <- makeGRangesFromDataFrame(
                df=genome,
                keep.extra.columns=TRUE
            )
            helperRanges <- NULL
        }
        # Annotation case #2: provided strictly as a BED-like file with loci
        else if (!is.list(genome) && !is.data.frame(genome) 
            && is.character(genome)) {
            if (file.exists(genome)) {
                if (type=="rnaseq")
                    stop("When type=\"rnaseq\", only the usage of a supported ",
                        "or user-imported organism from GTF file is allowed!")
                genome <- read.delim(genome)
                rownames(genome) <- as.character(genome[,4])
                genomeRanges <- makeGRangesFromDataFrame(
                    df=genome,
                    keep.extra.columns=TRUE
                )
                # Need to assigne Seqinfo...
                sf <- .chromInfoToSeqInfoDf(.chromInfoFromBAM(input[[1]]$file),
                    asSeqinfo=TRUE)
                seqlevels(genomeRanges) <- seqlevels(sf)
                seqinfo(genomeRanges) <- sf
                
                helperRanges <- NULL
            }
            # Annotation case #3: use local database or automatically on-the-fly
            else {
                if (type=="chipseq") {
                    if (region == "utr3")
                        genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
                            type="utr",version=version,summarized=TRUE,
                            db=localDb,rc=rc),
                            error=function(e) {
                                tryCatch({
                                    gtfFile <- genome$gtf
                                    metadata <- genome
                                    metadata$gtf <- NULL
                                    genomeRanges <- importCustomAnnotation(
                                        gtfFile,metadata,"utr")
                                },error=function(e) {
                                    message("Error ",e)
                                    stop("Please provide an existing organism ",
                                        "or a list with annotation metadata ",
                                        "and GTF file!")
                                },finally="")
                            },finally="")
                    else
                        genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
                            type="gene",version=version,db=localDb,rc=rc),
                            error=function(e) {
                                tryCatch({
                                    gtfFile <- genome$gtf
                                    metadata <- genome
                                    metadata$gtf <- NULL
                                    geneData <- importCustomAnnotation(gtfFile,
                                        metadata,"gene")
                                },error=function(e) {
                                    message("Error ",e)
                                    stop("Please provide an existing organism ",
                                        "or a list with annotation metadata ",
                                        "and GTF file!")
                                },finally="")
                            },finally="")

                    helperRanges <- NULL
                }
                else if (type=="rnaseq") {
                    genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
                        type="exon",version=version,summarized=TRUE,db=localDb,
                        rc=rc),
                        error=function(e) {
                            tryCatch({
                                gtfFile <- genome$gtf
                                metadata <- genome
                                metadata$gtf <- NULL
                                genomeRanges <- importCustomAnnotation(gtfFile,
                                    metadata,"exon")
                            },error=function(e) {
                                message("Error ",e)
                                stop("Please provide an existing organism or ",
                                        "a list with annotation metadata and ",
                                        "GTF file!")
                            },finally="")
                        },finally="")
                    helperRanges <- tryCatch(loadAnnotation(genome,refdb,
                        type="gene",version=version,db=localDb,rc=rc),
                        error=function(e) {
                            tryCatch({
                                gtfFile <- genome$gtf
                                metadata <- genome
                                metadata$gtf <- NULL
                                geneData <- importCustomAnnotation(gtfFile,
                                    metadata,"gene")
                            },error=function(e) {
                                message("Error ",e)
                                stop("Please provide an existing organism or ",
                                    "a list with annotation metadata and GTF ",
                                    "file!")
                            },finally="")
                        },finally="")
                    if (!is(genomeRanges,"GRangesList"))
                        genomeRanges <- split(genomeRanges,genomeRanges$gene_id)
                    helperRanges <- helperRanges[names(genomeRanges)]
                }
            }
        }
        
        # For saving...
        refRanges <- list(genomeRanges=genomeRanges,helperRanges=helperRanges)
    }
    
    # After genome read, check if we have a valid custom orderer
    if (!is.null(orderBy$custom)) {
        if (length(orderBy$custom) != length(genomeRanges)) {
            warning("The custom orderer provided with orderBy parameter does ",
                "not have length equal to the number\nof elements in the ",
                "interrogated genomic regions and will be ignored!",
                immediate.=TRUE)
            orderBy$custom <- NULL
        }
    }
    
    # Read and check design compatibilities. Check if k-means is requested and
    # message accordingly. If k-means is requested it will be added to the 
    # design data frame
    #hasDesign <- FALSE
    if (!is.null(design)) {
        if (!is.data.frame(design))
            design <- read.delim(design,row.names=1)
        nfac <- ncol(design)
        if (length(input)>1 && nfac>2)
            stop("When more than one files are provided for coverage ",
                "generation, the maximum number of allowed design factors is 2")
        if (length(input)>1 && nfac>1 && kmParams$k>0)
            stop("When more than one files are provided for coverage ",
                "generation and k-means clustering is also requested, the ",
                "maximum number of allowed design factors is 1")
        if (length(input)==1 && nfac>3)
            stop("The maximum number of allowed design factors is 3")
        if (length(input)==1 && nfac>2 && kmParams$k>0)
            stop("The maximum number of allowed design factors when k-means ",
                "clustering is requested is 2")
        if (length(input)==1 && nfac>2 && plotParams$singleFacet!="none")
            stop("The maximum number of allowed design factors whith one ",
                "sample but wishing a gridded profile layout is 2")
        if (length(input)==1 && nfac>1 && kmParams$k>0 
            && plotParams$singleFacet!="none")
            stop("The maximum number of allowed design factors whith one ",
                "sample but wishing for k-means clustering and gridded ",
                "profile layout is 1")
        # Reduce the genomeRanges according to design or the other way
        if (nrow(design)>length(genomeRanges))
            design <- tryCatch({
                design[names(genomeRanges),,drop=FALSE]
            },error=function(e) {
                stop("Unexpected error occured! Are you sure that element ",
                    "(row) names in the design file are of the same type as ",
                    "the genome file?")
            },finally={})
        else if (nrow(design)<=length(genomeRanges)) {
            genomeRanges <- tryCatch({
                genomeRanges[rownames(design)]
            },error=function(e) {
                stop("Unexpected error occured! Are you sure that element ",
                    "(row) names in the design file are of the same type as ",
                    "the genome file?")
            },finally={})
            if (type=="rnaseq")
                helperRanges <- tryCatch({
                    helperRanges[rownames(design)]
                },error=function(e) {
                    stop("Unexpected error occured! Are you sure that element ",
                        "(row) names in the design file are of the same type ",
                        "as the genome file?")
                },finally={})
        }
        # ...but maybe the names are incompatible
        if (length(genomeRanges)==0)
            stop("No ranges left after using the identifiers provided with ",
                "the design file. Are you sure the identifiers between the ",
                "two files are compatible?")
        if (nrow(design)==0)
            stop("No design elements left after using the identifiers ",
                "provided with the genome file. Are you sure the identifiers ",
                "between the two files are compatible?")
    }
    
    # Apply the rest of the filters if any to reduce later computational burden
    if (!is.null(selector)) {
        if (type=="chipseq")
            genomeRanges <- applySelectors(genomeRanges,selector,rc=rc)
        if (type=="rnaseq") {
            helperRanges <- applySelectors(helperRanges,selector)
            # Filter and align names if we have helperRanges around
            genomeRanges <- genomeRanges[names(helperRanges)]
        }
    }
    
    # Now we must follow two paths according to region type, genebody and custom
    # areas with equal/unequal widths and utr3 or tss, tes and 1-width custom 
    # areas.
    callParams$customIsBase <- FALSE
    if (region=="custom" && all(width(genomeRanges)==1)) {
        if (all(flank==0)) {
            warning("Flanking cannot be zero bp in both directions when the ",
                "reference region is only 1bp! Setting to default ",
                "(2000,2000)...",immediate.=TRUE)
            flank <- c(2000,2000)
            callParams$flank <- flank
        }
        callParams$customIsBase <- TRUE
    }
    if (region %in% c("tss","tes")) {
        if (all(flank==0)) {
            warning("Flanking cannot be zero bp in both directions when the ",
                "reference region is \"tss\" or \"tes\"! Setting to default ",
                "(2000,2000)...", immediate.=TRUE)
            flank <- c(2000,2000)
            callParams$flank <- flank
        }
    }
    
    # Here we must determine the final ranges to pass to preprocessRanges and
    # then work with these later on
    intermRanges <- getMainRanges(genomeRanges,helperRanges=helperRanges,type,
        region,flank,onFlankFail,rc=rc)
    # It is very important that all the Ranges have Seqinfo objects attached as
    # they have to be trimmed for potential extensions (due to flanking regions)
    # beyond chromosome lengths. The built-in annotations do provide this option
    # otherwise when a custom genome/areas is/are provided, the genome of 
    # interest should be provided too. Or if not provided, the user will be
    # responsible for any crash.
    # TODO: When input ranges are custom, genome should be given to make Seqinfo
    
    # Should not be needed anymore with the new onFlankFail option
    #if (type == "chipseq") # Otherwise, throw error with GRangesList
    #    mainRanges <- trim(intermRanges$mainRanges)
    #else if (type == "rnaseq")
    #    mainRanges <- intermRanges$mainRanges
    #bamRanges <- trim(intermRanges$bamRanges)
    
    mainRanges <- intermRanges$mainRanges
    bamRanges <- intermRanges$bamRanges
    hadOffBound <- intermRanges$offBound
    # We must also correct the design in case of drop
    if (!is.null(design) && onFlankFail == "drop" && hadOffBound)
        design <- design[names(mainRanges),,drop=FALSE]
    # Say something if off bound detected
    if (hadOffBound) {
        if (onFlankFail == "drop")
        warning("Off bound regions detected after flanking! Offending ",
            "regions will be dropped from the reference\n  and the design if ",
            "provided...",immediate.=TRUE)
    else if (onFlankFail == "trim")
        warning("Off bound regions detected after flanking! Offending ",
            "reference regions will be trimmed\n  and the flanking regions ",
            "will be merged with the main...",immediate.=TRUE)
    }
    
    # Here we must write code for the reading and normalization of bam files
    # The preprocessRanges function looks if there is a valid (not null) ranges
    # field in input
    if (!onTheFly)
        input <- preprocessRanges(input,preprocessParams,genome,bamRanges,
            bamParams=bamParams,rc=rc)
    # At this point we must apply the fraction parameter if <1. We choose this
    # point in order not to restrict later usage of the read ranges and since it
    # does not take much time to load into memory. In addition, this operation
    # cannot be applied when streaming BAMs. In that case, user must subsample
    # outside recoup.
    if (!onTheFly && fraction<1) {
        newSize <- round(fraction*length(mainRanges))
        #set.seed(preprocessParams$seed)
        refIndex <- sort(sample(length(mainRanges),newSize))
        mainRanges <- mainRanges[refIndex]
        for (i in seq_len(length(input))) {
            if (!is.null(input[[i]]$ranges)) {
                newSize <- round(fraction*length(input[[i]]$ranges))
                #set.seed(preprocessParams$seed)
                fracIndex <- sort(sample(length(input[[i]]$ranges),newSize))
                input[[i]]$ranges <- input[[i]]$ranges[fracIndex]
            }
            if (!is.null(input[[i]]$coverage))
                input[[i]]$coverage <- input[[i]]$coverage[refIndex]
            if (!is.null(input[[i]]$profile))
                input[[i]]$profile <- input[[i]]$profile[refIndex,]
        }
    }

    # Remove unwanted seqnames from reference ranges (if not on the fly) and
    # properly subset ranges if not all chromosomes are provided with BAM
    if (!onTheFly) {
        chrs <- unique(unlist(lapply(input,function(x) {
            if (x$format %in% c("bam","bed"))
                return(as.character(runValue(seqnames(x$ranges))))
            else if (x$format=="bigwig") {
                if (!requireNamespace("rtracklayer"))
                    stop("R package rtracklayer is required to read and ",
                        "import BigWig files!")
                return(as.character(seqnames(seqinfo(BigWigFile(x$file)))))
            }
        })))
        if (type=="chipseq") {
            #keep <- which(as.character(seqnames(mainRanges)) %in% chrs)
            #mainRanges <- mainRanges[keep]
            mainRanges <- .subsetGRangesByChrs(mainRanges,chrs)
        }
        else if (type=="rnaseq") {
            #keeph <- which(as.character(seqnames(helperRanges)) %in% chrs)
            #helperRanges <- helperRanges[keeph]
            #mainRanges <- mainRanges[names(helperRanges)]
            helperRanges <- .subsetGRangesByChrs(helperRanges,chrs)
			mainRanges <- .subsetGRangesByChrs(mainRanges,chrs)
        }
    }
    
    if(signal == "coverage")
        input <- coverageRef(input,mainRanges,strandedParams,rc=rc)
    else if (signal == "rpm")
        input <- rpMatrix(input,mainRanges,flank,binParams,strandedParams,rc=rc)
    
    # If normalization method is linear, we must adjust the coverages
    # TODO: Check for onTheFly in the beginning
    if (preprocessParams$normalize == "linear" && !onTheFly) {
        message("Calculating normalization factors...")
        linFac <- calcLinearFactors(input,preprocessParams)
        for (n in names(input)) {
            message("  normalizing ",n)
            if (linFac[n]==1)
                next
            if (signal == "coverage")
                input[[n]]$coverage <- cmclapply(input[[n]]$coverage,
                function(x,f) {
                    return(x*f)
                },linFac[n])
            else if (signal == "rpm")
                input[[n]]$profile <- apply(input[[n]]$profile,1,function(x,f) {
                    return(x*f)
                },linFac[n])
        }
    }
    
    # Now we must summarize and create the matrices. If genebody, utr3 or 
    # unequal custom lengths, bin is a must, else we leave to user
    mustBin <- FALSE
    if (region=="genebody" || region=="utr3")
        mustBin <- TRUE
    if (region=="custom") {
        w <- width(mainRanges)
        if (any(w!=w[1]))
            mustBin <- TRUE
    }
    if (mustBin) {
        if (binParams$regionBinSize==0) {
            warning("Central region bin size not set for a region that must ",
                "be binned! Setting to 200...",immediate.=TRUE)
            binParams$regionBinSize <- 200
        }
    }
    
    # Chunking?
    if (signal == "coverage") { # Otherwise, matrix calculate by rpm
        if (binParams$chunking) {
            if (type=="chipseq")
                binParams$chunks <- split(seq_len(length(genomeRanges)),
                    as.character(seqnames(genomeRanges)))
            else if (type=="rnaseq")
                binParams$chunks <- split(seq_len(length(helperRanges)),
                    as.character(seqnames(helperRanges)))
        }
        # hadOffBounds must be passed on input at some point
        fe0 <- FALSE
        if (onFlankFail == "trim" && hadOffBound)
            fe0 <- TRUE
        input <- profileMatrix(input,flank,binParams,rc,.feNoSplit=fe0)
    }
    
    # In some strange glimpses, we are getting very few NaNs in profile matrix
    # which I was unable to reproduce on a gene by gene basis. If no NaNs are
    # detected, no action is performed in the input object. Also, these NaNs
    # seem to appear only on zero count gene profiles, so it's safe to impute
    # by zero.
    #input <- imputeProfile(input,method="simple",rc)
    input <- imputeZero(input)
    
    # Perform the k-means clustering if requested and append to design (which
    # has been checked, if we are allowed to do so)
    if (kmParams$k>0)
        design <- kmeansDesign(input,design,kmParams)
    
    # Coverages and profiles calculated... Now depending on plot option, we go 
    # further or return the enriched input object for saving
    if (!plotParams$profile && !plotParams$heatmap && !plotParams$correlation) {
        recoupObj <- toOutput(input,design,saveParams,callParams=callParams,
            refRanges=refRanges)
        return(recoupObj)
    }
    else {
        recoupObj <- toOutput(input,design,list(ranges=TRUE,coverage=TRUE,
            profile=TRUE),callParams=callParams,refRanges=refRanges)
    }
            
    ## Our plot objects
    recoupPlots <- list()
    
    # We must pass the matrices to plotting function
    if (plotParams$profile) {
        message("Constructing genomic coverage profile curve(s)")
        #theProfile <- recoupProfile(recoupObj,rc=rc)
        recoupObj <- recoupProfile(recoupObj,rc=rc)
        theProfile <- getr(recoupObj,"profile")
        recoupPlots$profilePlot <- theProfile
    }
    
    # Some default binning MUST be applied for the heatmap... Otherwise it could
    # take insanely long time and space to draw/store
    if (plotParams$heatmap) {
        # Inform the user about enforced binning (or not)
        if (region %in% c("tss","tes") || callParams$customIsBase) {
            if (binParams$regionBinSize==0 && binParams$forceHeatmapBinning) {
                message("The resolution of the requested profiles will be ",
                    "lowered to avoid\nincreased computation time and/or ",
                    "storage space for heatmap profiles...")
                
            }
            else if (binParams$regionBinSize==0
                && !binParams$forceHeatmapBinning)
                warning("forceHeatmapBinning is turned off for high ",
                    "resolution plotting. Be prepared for\nlong computational ",
                    "times and big figures!",immediate.=TRUE)
        }
        else {
            if ((binParams$regionBinSize==0 || binParams$flankBinSize==0)
                && binParams$forceHeatmapBinning) {
                message("The resolution of the requested profiles will be ",
                    "lowered to avoid\nincreased computation time and/or ",
                    "storage space for heatmap profiles...")
                
            }
            else if ((binParams$regionBinSize==0 || binParams$flankBinSize==0)
                && !binParams$forceHeatmapBinning)
                warning("forceHeatmapBinning is turned off for high ",
                    "resolution plotting. Be prepared for\nlong computational ",
                    "times and big figures!",immediate.=TRUE)
        }
        
        if (binParams$forceHeatmapBinning 
            && (binParams$regionBinSize==0 || binParams$flankBinSize==0)) {
            helpInput <- recoupObj$data
            if (region %in% c("tss","tes") || callParams$customIsBase) {
                for (n in names(helpInput)) {
                    message("Calculating ",region," profile for ",
                        helpInput[[n]]$name)
                    helpInput[[n]]$profile <- 
                        binCoverageMatrix(helpInput[[n]]$coverage,
                            binSize=binParams$forcedBinSize[2],
                            stat=binParams$sumStat,rc=rc)
                    helpInput[[n]]$profile <- helpInput[[n]]$profile
                }
            }
            else {
                for (n in names(helpInput)) {
                    message("Calculating ",region," profile for ",
                        helpInput[[n]]$name)
                    message(" center")
                    #center <- binCoverageMatrix(helpInput[[n]]$coverage$center,
                    #    binSize=binParams$forcedBinSize[2],
                    #    stat=binParams$sumStat,rc=rc)
                    center <- binCoverageMatrix(
                        input[[n]]$coverage,binSize=binParams$forcedBinSize[2],
                        stat=binParams$sumStat,
                        interpolation=binParams$interpolation,
                        flank=flank,where="center",rc=rc
                    )
                    message(" upstream")
                    #left <- binCoverageMatrix(helpInput[[n]]$coverage$upstream,
                    #    binSize=binParams$forcedBinSize[1],
                    #    stat=binParams$sumStat,rc=rc)
                    left <- binCoverageMatrix(
                        input[[n]]$coverage,binSize=binParams$forcedBinSize[1],
                        stat=binParams$sumStat,
                        interpolation=binParams$interpolation,flank=flank,
                        where="upstream",rc=rc
                    )
                    message(" downstream")
                    #right <- binCoverageMatrix(
                    #    helpInput[[n]]$coverage$downstream,
                    #    binSize=binParams$forcedBinSize[1],
                    #    stat=binParams$sumStat,rc=rc)
                    right <- binCoverageMatrix(
                        input[[n]]$coverage,binSize=binParams$forcedBinSize[1],
                        stat=binParams$sumStat,
                        interpolation=binParams$interpolation,flank=flank,
                        where="downstream",rc=rc
                    )
                    helpInput[[n]]$profile <- cbind(left,center,right)
                    rownames(helpInput[[n]]$profile) <-
                        names(input[[n]]$coverage)
                    helpInput[[n]]$profile <- helpInput[[n]]$profile
                }
            }
        }
        else
            helpInput <- recoupObj$data
        
        helpObj <- recoupObj
        helpObj$data <- helpInput
        message("Constructing genomic coverage heatmap(s)")
        #theHeatmap <- recoupHeatmap(helpObj,rc=rc)
        helpObj <- recoupHeatmap(helpObj,rc=rc)
        theHeatmap <- getr(helpObj,"heatmap")
        recoupObj <- setr(recoupObj,"heatmap",theHeatmap)
        recoupPlots$heatmapPlot <- theHeatmap
        
        # Derive the main heatmap in case of hierarchical clustering        
        mainh <- 1
        if (length(grep("hc",orderBy$what))>0) {
            nc <- nchar(orderBy$what)
            mh <- suppressWarnings(as.numeric(substr(orderBy$what,nc,nc)))
            if (is.na(mh))
                warning("Reference profile for hierarchical clustering order ",
                    "not recognized! Using the 1st...",immediate.=TRUE)
            else if (mh > length(input)) {
                warning("Reference profile (",mh,") for hierarchical ",
                    "clustering order does not exist (the input has only ",
                    length(input)," sources! Using the last...",
                    immediate.=TRUE)
                    mainh <- length(input)
            }
            else
                mainh <- mh
        }
    }
    
    # We must pass the matrices to plotting function
    if (plotParams$correlation) {
        message("Constructing coverage correlation profile curve(s)")
        recoupObj <- recoupCorrelation(recoupObj,rc=rc)
        theCorrelation <- getr(recoupObj,"correlation")
        recoupPlots$correlationPlot <- theCorrelation
    }
    
    # Overwrite final object so as to return it
    recoupObj <- toOutput(input,design,saveParams,recoupPlots,callParams,
        refRanges)
    
    # Make any plots asked
    if (plotParams$plot) {
        what <- character(0)
        if (plotParams$profile)
            what <- c(what,"profile")
        if (plotParams$heatmap)
            what <- c(what,"heatmap")
        if (plotParams$correlation)
            what <- c(what,"correlation")
        if (length(what)>0)
            recoupPlot(recoupObj,what,plotParams$device,plotParams$outputDir,
                plotParams$outputBase,mainh)
    }
    
    # Return the enriched input object according to save options
    return(recoupObj)
}

applySelectors <- function(ranges,selector,rc=NULL) {
    if (!is.null(selector$id)) {
        ranges <- ranges[selector$ids]
        if (length(ranges)==0)
            stop("No ranges left after using the identifiers provided with ",
                "the selector field. Are you sure the identifiers between the ",
                "two files are compatible?")
    }
    if (!is.null(selector$biotype) && !is.null(ranges$biotype)) {
        good <- which(ranges$biotype %in% selector$biotype)
        ranges <- ranges[good]
        if (length(ranges)==0)
            stop("No ranges left after using the biotypes provided with the ",
                "selector field. Are you sure the identifiers between the two ",
                "files are compatible?")
    }
    if (!is.null(selector$exonType) && !is.null(ranges$exon_type)) {
        good <- which(ranges$exonType %in% selector$exonType)
        ranges <- ranges[good]
        if (length(ranges)==0)
            stop("No ranges left after using the exon types provided with ",
                "the selector field. Are you sure the identifiers between the ",
                "two files are compatible?")
    }
    return(ranges)
}

toOutput <- function(input,design=NULL,saveParams,plotObjs=NULL,
    callParams=NULL,refRanges=NULL) {
    if (!saveParams$ranges)
        input <- removeData(input,"ranges")
    if (!saveParams$coverage)
        input <- removeData(input,"coverage")
    if (!saveParams$profile)
        input <- removeData(input,"profile")
    plots <- list(profile=NULL,heatmap=NULL,correlation=NULL)
    if (!is.null(plotObjs) && saveParams$profilePlot)
        plots$profile <- plotObjs$profilePlot
    if (!is.null(plotObjs) && saveParams$heatmapPlot)
        plots$heatmap <- plotObjs$heatmapPlot
    if (!is.null(plotObjs) && saveParams$correlationPlot)
        plots$correlation <- plotObjs$correlationPlot
    return(list(
        data=input,
        design=design,
        plots=plots,
        callopts=callParams,
        refRanges=refRanges
    ))
}
pmoulos/recoup documentation built on March 14, 2024, 5:21 p.m.