R/Methods.R

Defines functions atacRepsPipe2 getBinsReadsCount atacRepsPipe checkFilePathExist atacPipe2 atacPipe getSuffixlessFileName0 getSuffix0 getRshow getVMRShow getVMShow getVMR getVM getf getPer

Documented in atacPipe atacPipe2 atacRepsPipe atacRepsPipe2

getPer<- function(per){
    per <- as.numeric(per)*100
    return(paste0(sprintf("%.1f",per),"%"))
}
getf<- function(per){
    per <- as.numeric(per)
    return(sprintf("%.2f",per))
}
getVM <- function(readSize){
    readSize <- as.numeric(readSize)
    return(c(readSize,readSize/1e6))
}
getVMR <- function(readSize,total){
    readSize <- as.character(readSize)
    total <- as.integer(total)
    vm <- getVM(readSize)
    return(c(vm, 100*vm[1]/total,total))
}
getVMShow <- function(readSize,detail = FALSE){
    vm <- getVM(readSize)
    if(detail){
        return(sprintf("%.1fM (%d)",vm[2],vm[1]))
    }else{
        return(sprintf("%.1fM",vm[2]))
    }

}
getVMRShow <- function(readSize,total,detail = FALSE){
    vmr <- getVMR(readSize,total)
    total <- as.numeric(total)
    if(detail){
        return(sprintf("%.1fM (%.2f%s, %d / %d)",vmr[2],vmr[3],"%",vmr[1],vmr[4]))
    }else{
        return(sprintf("%.1fM (%.2f%s)",vmr[2],vmr[3],"%"))
    }

}
getRshow <- function(readSize,total,detail = FALSE){
    vmr <- getVMR(readSize,total)
    total <- as.numeric(total)
    if(detail){
        return(sprintf("%.2f, %d / %d",vmr[3]/100,vmr[1],vmr[4]))
    }else{
        return(sprintf("%.2f",vmr[3]/100))
    }

}


getSuffix0 <- function(filePath){
    filename<-basename(filePath)
    lst=strsplit(filename,"\\.")[[1]]
    if(length(lst)==1){
        return(NULL)
    }else{
        return(lst[length(lst)])
    }
}
getSuffixlessFileName0 <- function(filePath){
    sfx=getSuffix0(filePath)
    if(is.null(sfx)){
        return(basename(filePath))
    }else {
        return(strsplit(basename(filePath),paste0(".",sfx)))
    }
}


#' @docType package
#' @name esATAC-package
#' @details
#' See packageDescription('esATAC') for package details.
#'
#' @title An Easy-to-use Systematic pipeline for ATACseq data analysis
#' @description
#' This package provides a framework and complete preset pipeline for
#' the quantification and analysis of ATAC-seq Reads. It covers raw sequencing
#' reads preprocessing (FASTQ files), reads alignment (Rbowtie2), aligned reads
#' file operation (SAM, BAM, and BED files), peak calling (fseq), genome
#' annotations (Motif, GO, SNP analysis) and quality control report. The package
#' is managed by dataflow graph. It is easy for user to pass variables seamlessly
#' between processes and understand the workflow. Users can process FASTQ files
#' through end-to-end preset pipeline which produces a pretty HTML report for
#' quality control and preliminary statistical results, or customize workflow
#' starting from any intermediate stages with esATAC functions easily and flexibly.
#'
#' Preset pipeline for single replicate case study is shown below.
#'
#' For multi-replicates case study, see \code{\link{atacRepsPipe}}.
#'
#' For single replicate case-control study, see \code{\link{atacPipe2}}.
#'
#' For multi-replicates case-control study, see \code{\link{atacRepsPipe2}}.
#'
#'
#' NOTE:
#' Build bowtie index in the function may take some time.
#' If you already have bowtie2 index files or
#' you want to download(\url{ftp://ftp.ccb.jhu.edu/pub/data/bowtie2_indexes})
#' instead of building,
#' you can let esATAC skip the steps by renaming them following the format
#' (genome+suffix) and put them in reference installation path (refdir).
#' Example: hg19 bowtie2 index files
#'
#' \itemize{
#' \item hg19.1.bt2
#' \item hg19.2.bt2
#' \item hg19.3.bt2
#' \item hg19.4.bt2
#' \item hg19.rev.1.bt2
#' \item hg19.rev.2.bt2
#' }
#'
#' For single end reads FASTQ files,
#' The required parameters are fastqInput1 and adapter1.
#' For paired end reads non-interleaved FASTQ files (interleave=FALSE,defualt),
#' The required parameters are fastqInput1 and fastqInput2.
#' Otherwise, parameter fastqInput2 is not required (interleave=TRUE)
#'
#' The paths of sequencing data replicates can be a \code{Character} vector.
#' For example:
#'
#' fastqInput1=c("file_1.rep1.fastq","file_1.rep2.fastq")
#'
#' fastqInput2=c("file_2.rep1.fastq","file_2.rep2.fastq")
#'
#' The result will be return by the function.
#' An HTML report file will be created for paired end reads.
#' Intermediate files will be save at tmpdir path (default is ./)
#'
#' @param genome \code{Character} scalar. The genome(like hg19, mm10, etc.) reference data in "refdir" to be used in the pipeline.
#' @param fastqInput1 \code{Character} vector. For single-end sequencing,
#' it contains sequence file paths.
#' For paired-end sequencing, it can be file paths with #1 mates paired
#' with file paths in fastqInput2
#' And it can also be interleaved file paths when argument
#' interleaved=\code{TRUE}
#' @param fastqInput2 \code{Character} vector. It contains file paths with #2
#' mates paired with file paths in fastqInput1.
#' For single-end sequencing files and interleaved paired-end sequencing
#' files(argument interleaved=\code{TRUE}),
#' it must be \code{NULL}.
#' @param refdir \code{Character} scalar. The path for reference data being installed to and storage.
#' @param tmpdir \code{Character} scalar. The temporary file storage path.
#' @param threads \code{Integer} scalar. The max threads allowed to be created.
#' @param adapter1 \code{Character} scalar. It is an adapter sequence for file1.
#' For single end data, it is requied.
#' @param adapter2 \code{Character} scalar. It is an adapter sequence for file2.
#' @param interleave \code{Logical} scalar. Set \code{TRUE} when files are
#' interleaved paired-end sequencing data.
#' @param basicAnalysis \code{Logical} scalar. If it is TRUE, the pipeline will skip the time consuming steps
#' like GO annoation and motif analysis
#' @param createReport \code{Logical} scalar. If the HTML report file will be created.
#' @param motifs either\code{\link{PFMatrix}}, \code{\link{PFMatrixList}},
#' \code{\link{PWMatrix}}, \code{\link{PWMatrixList}}, default: vertebrates motif from JASPAR.
#' @param pipelineName \code{Character} scalar. Temporary file prefix for identifying files
#' when multiple pipeline generating file in the same tempdir.
#' @param chr Which chromatin the program will processing. It must be identical
#' with the filename of cut site information files or subset of .
#' Default:c(1:22, "X", "Y").
#' @param p.cutoff p-value cutoff for returning motifs, default: 1e-6.
#' @param ... Additional arguments, currently unused.
#' @return \code{List} scalar. It is a list that save the result of the pipeline.
#' Slot "filelist": the input file paths.
#' Slot "wholesummary": a dataframe that for quality control summary
#' Slot "atacProcs": \code{\link{ATACProc-class}} objects generated by each process in the pipeline.
#' Slot "filtstat": a dataframe that summary the reads filted in each process.
#'
#' @author Zheng Wei and Wei Zhang
#' @seealso
#' \code{\link{printMap}},
#' \code{\link{atacPipe2}},
#' \code{\link{atacRenamer}},
#' \code{\link{atacRemoveAdapter}},
#' \code{\link{atacBowtie2Mapping}},
#' \code{\link{atacPeakCalling}},
#' \code{\link{atacMotifScan}},
#' \code{\link{atacRepsPipe}},
#' \code{\link{atacRepsPipe2}}
#' @examples
#' \dontrun{
#' ## These codes are time consuming so they will not be run and
#' ## checked by bioconductor checker.
#'
#'
#' # call pipeline
#' # for a quick example(only CTCF and BATF3 will be processing)
#' conclusion <-
#'   atacPipe(
#'        # MODIFY: Change these paths to your own case files!
#'        # e.g. fastqInput1 = "your/own/data/path.fastq"
#'        fastqInput1 = system.file(package="esATAC", "extdata", "chr20_1.1.fq.gz"),
#'        fastqInput2 = system.file(package="esATAC", "extdata", "chr20_2.1.fq.gz"),
#'        # MODIFY: Set the genome for your data
#'        genome = "hg19",
#'        motifs = getMotifInfo(motif.file = system.file("extdata", "CustomizedMotif.txt", package="esATAC")))
#'
#' # call pipeline
#' # for overall example(all vertebrates motif in JASPAR will be processed)
#' conclusion <-
#'   atacPipe(
#'        # MODIFY: Change these paths to your own case files!
#'        # e.g. fastqInput1 = "your/own/data/path.fastq"
#'        fastqInput1 = system.file(package="esATAC", "extdata", "chr20_1.1.fq.gz"),
#'        fastqInput2 = system.file(package="esATAC", "extdata", "chr20_2.1.fq.gz"),
#'        # MODIFY: Set the genome for your data
#'        genome = "hg19")
#' }
#' @export

atacPipe <- function(genome, 
                     fastqInput1, 
                     fastqInput2=NULL, 
                     tmpdir = file.path(getwd(),"esATAC-pipeline"), 
                     refdir = file.path(tmpdir,"refdir"), 
                     threads = 2, 
                     adapter1 = NULL, 
                     adapter2 = NULL,
                     interleave = FALSE,  
                     basicAnalysis = FALSE, 
                     createReport = TRUE, 
                     motifs = NULL, 
                     pipelineName = "pipe",
                     chr = c(1:22, "X", "Y"), 
                     p.cutoff = 1e-6, ...){ #saveTmp = TRUE,
    
    prefix = pipelineName
    
    if(is.null(fastqInput2)&&!interleave&&is.null(adapter1)){
        stop("adapter1 should not be NULL for single end sequencing data")
    }
    if(!is.null(fastqInput1)){
        fastqInput1 = normalizePath(fastqInput1)
    }
    if(!is.null(fastqInput2)){
        fastqInput2 = normalizePath(fastqInput2)
    }
    
    dir.create(tmpdir,recursive = TRUE)
    setTmpDir(tmpdir)
    
    setJobName(pipelineName)
    
    setPipeName(pipelineName)
    
    setRefDir(refdir)
    
    setGenome(genome)
    setThreads(threads)
    unzipAndMerge <- atacUnzipAndMerge(fastqInput1 = fastqInput1,fastqInput2 = fastqInput2,interleave = interleave, ...)
    atacQC <- atacQCReport(atacProc = unzipAndMerge, ...)
    renamer <- atacRenamer(unzipAndMerge, ...)
    if(is.null(adapter1) || is.null(adapter2)){
        findAdapterObj <-atacFindAdapter(renamer, ...)
        removeAdapter <- atacRemoveAdapter(findAdapterObj, ...)
    }else{
        removeAdapter <- atacRemoveAdapter(renamer, adapter1 = adapter1, adapter2 = adapter2, ...)
    }
    bowtie2Mapping <- atacBowtie2Mapping(removeAdapter, ...)
    libComplexQC <- atacLibComplexQC(bowtie2Mapping, ...)
    sam2Bed <-atacSamToBed(bowtie2Mapping,maxFragLen = 2000, ...)
    bedToBigWig <- atacBedToBigWig(sam2Bed, ...)
    tssqc100 <-atacTSSQC(sam2Bed,fragLenRange = c(0,100), newStepType = "TSSQCNFR", ...)
    
    if(is.null(fastqInput2)&&!interleave){
        peakCalling <- atacPeakCalling(sam2Bed, ...)
        DHSQC <- atacPeakQC(peakCalling,qcbedInput = "DHS", ...)
        fripQC <- atacFripQC(atacProc = sam2Bed,atacProcPeak = peakCalling, ...)
    }else{
        tssqc180_247 <- atacTSSQC(sam2Bed,fragLenRange = c(180,247),  newStepType = "TSSQCneucleosome", ...)
        fragLenDistr <- atacFragLenDistr(sam2Bed, ...)
        shortBed <- atacBedUtils(sam2Bed,maxFragLen = 100, chrFilterList = NULL, ...)
        peakCalling <- atacPeakCalling(shortBed, ...)
        DHSQC <- atacPeakQC(peakCalling,qcbedInput = "DHS",newStepType = "PeakQCDHS", ...)
        blacklistQC <- atacPeakQC(peakCalling,qcbedInput = "blacklist", newStepType = "PeakQCblacklist", ...)
        fripQC <- atacFripQC(atacProc = shortBed,atacProcPeak = peakCalling, ...)
        
        Peakanno <- atacPeakAnno(atacProc = peakCalling, ...)
        if(!basicAnalysis){
            # GO term
            goAna <- atacGOAnalysis(atacProc = Peakanno, ont = "BP", pvalueCutoff = 0.01, ...)
            
            # set default motif
            if(is.null(motifs)){
                opts <- list()
                opts[["tax_group"]] <- "vertebrates"
                pfm <- getMatrixSet(JASPAR2018::JASPAR2018, opts)
                names(pfm) <- TFBSTools::name(pfm)
                names(pfm) <- gsub(pattern = "[^a-zA-Z0-9]", replacement = "", x = TFBSTools::name(pfm), perl = TRUE)
            }else{
                pfm <- motifs
            }
            output_motifscan <- atacMotifScan(atacProc = peakCalling, motifs = pfm, p.cutoff = p.cutoff, prefix = prefix, ...)
            
            cs_output <- atacExtractCutSite(atacProc = sam2Bed, prefix = prefix, ...)
            footprint <- atacCutSiteCount(atacProcCutSite = cs_output, atacProcMotifScan = output_motifscan,
                                          strandLength = 100, prefix = prefix, chr = chr, ...)
            robj <- atacSingleRepReport(footprint, ...)
        }
    }

    reportDir <- file.path(getTmpDir(), "report")
    dir.create(reportDir)
    file.copy(from = output(peakCalling)$bedOutput, to=file.path(reportDir,"peak.bed"))
    file.copy(from = output(sam2Bed)$bedOutput, to=file.path(reportDir, "frag.bed"))

    


}



#' @name atacPipe2
#' @title Pipeline for single replicate case-control paired-end sequencing data
#' @description
#' The preset pipeline to process case control study sequencing data.
#' An HTML report file, result files(e.g. BED, BAM files) and
#' conclusion list will generated. See detail for usage.
#' @param genome \code{Character} scalar. The genome(like hg19, mm10, etc.) reference data in "refdir" to be used in the pipeline.
#' @param case \code{List} scalar. Input for case sample. \code{fastqInput1},
#' the path(s) of the mate 1 fastq file(s), is required. \code{fastqInput2},
#' the path(s) of the mate 2 fastq file(s), is required, when \code{interleave=FALSE}.
#' \code{adapter1} and \code{adapter2} are optional.
#' @param control \code{List} scalar. Input for control sample. \code{fastqInput1},
#' the path(s) of the mate 1 fastq file(s), is required. \code{fastqInput2},
#' the path(s) of the mate 2 fastq file(s), is required, when \code{interleave=FALSE}.
#' \code{adapter1} and \code{adapter2} are optional.
#' @param refdir \code{Character} scalar. The path for reference data being installed to and storage.
#' @param tmpdir \code{Character} scalar. The temporary file storage path.
#' @param threads \code{Integer} scalar. The max threads allowed to be created.
#' @param interleave \code{Logical} scalar. Set \code{TRUE} when files are
#' interleaved paired-end sequencing data.
#' @param createReport \code{Logical} scalar. If the HTML report file will be created.
#' @param motifs either\code{\link{PFMatrix}}, \code{\link{PFMatrixList}},
#' \code{\link{PWMatrix}}, \code{\link{PWMatrixList}}, default: vertebrates motif from JASPAR.
#' @param chr Which chromatin the program will processing. It must be identical
#' with the filename of cut site information files or subset of .
#' Default:c(1:22, "X", "Y").
#' @param p.cutoff p-value cutoff for returning motifs, default: 1e-6.
#' @param ... Additional arguments, currently unused.
#' @return \code{List} scalar. It is a list that save the result of the pipeline.
#' Slot "wholesummary": a dataframe for quality control summary of  case and control data
#' Slot "caselist" and "ctrlist": Each of them is a list that save the result for case or control data.
#' Slots of "caselist" and "ctrllist":
#' Slot "filelist": the input file paths.
#' Slot "wholesummary": a dataframe for quality control summary of case or control data
#' Slot "atacProcs": \code{\link{ATACProc-class}} objects generated by each process in the pipeline.
#' Slot "filtstat": a dataframe that summary the reads filted in each process.
#' @details
#' NOTE:
#' Build bowtie index in this function may take some time. If you already have bowtie2 index files or you want to download(ftp://ftp.ccb.jhu.edu/pub/data/bowtie2_indexes) instead of building, you can let esATAC skip the steps by renaming them following the format (genome+suffix) and put them in reference installation path (refdir).
#' Example: hg19 bowtie2 index files
#'
#' \itemize{
#' \item hg19.1.bt2
#' \item hg19.2.bt2
#' \item hg19.3.bt2
#' \item hg19.4.bt2
#' \item hg19.rev.1.bt2
#' \item hg19.rev.2.bt2
#' }
#'
#' For single end reads FASTQ files,
#' The required parameters are fastqInput1 and adapter1.
#' For paired end reads non-interleaved FASTQ files (interleave=FALSE,defualt),
#' The required parameters are fastqInput1 and fastqInput2.
#' Otherwise, parameter fastqInput2 is not required (interleave=TRUE)
#'
#' The paths of sequencing data replicates can be a \code{Character} vector.
#' For example:
#'
#' fastqInput1=c("file_1.rep1.fastq","file_1.rep2.fastq")
#'
#' fastqInput2=c("file_2.rep1.fastq","file_2.rep2.fastq")
#'
#' The result will be return by the function.
#' An HTML report file will be created for paired end reads.
#' Intermediate files will be save at tmpdir path (default is ./)
#' @author Zheng Wei and Wei Zhang
#' @seealso
#' \code{\link{atacPipe}}
#' @import JASPAR2018
#' @importFrom TFBSTools getMatrixSet
#' @importFrom TFBSTools name
#' @examples
#' \dontrun{
#' ## These codes are time consuming so they will not be run and
#' ## checked by bioconductor checker.
#'
#'
#' # call pipeline
#' # for a quick example(only CTCF and BATF3 will be processed)
#' conclusion <-
#'    atacPipe2(
#'        # MODIFY: Change these paths to your own case files!
#'        # e.g. fastqInput1 = "your/own/data/path.fastq"
#'        case=list(fastqInput1 = system.file(package="esATAC", "extdata", "chr20_1.1.fq.gz"),
#'                 fastqInput2 = system.file(package="esATAC", "extdata", "chr20_2.1.fq.gz")),
#'        # MODIFY: Change these paths to your own control files!
#'        # e.g. fastqInput1 = "your/own/data/path.fastq"
#'        control=list(fastqInput1 = system.file(package="esATAC", "extdata", "chr20_1.2.fq.bz2"),
#'                     fastqInput2 = system.file(package="esATAC", "extdata", "chr20_2.2.fq.bz2")),
#'        # MODIFY: Set the genome for your data
#'        genome = "hg19",
#'        motifs = getMotifInfo(motif.file = system.file("extdata", "CustomizedMotif.txt", package="esATAC")))
#'
#' # call pipeline
#' # for overall example(all vertebrates motif in JASPAR will be processed)
#' conclusion <-
#'    atacPipe2(
#'        # MODIFY: Change these paths to your own case files!
#'        # e.g. fastqInput1 = "your/own/data/path.fastq"
#'        case=list(fastqInput1 = system.file(package="esATAC", "extdata", "chr20_1.1.fq.gz"),
#'                 fastqInput2 = system.file(package="esATAC", "extdata", "chr20_2.1.fq.gz")),
#'        # MODIFY: Change these paths to your own control files!
#'        # e.g. fastqInput1 = "your/own/data/path.fastq"
#'        control=list(fastqInput1 = system.file(package="esATAC", "extdata", "chr20_1.2.fq.bz2"),
#'                     fastqInput2 = system.file(package="esATAC", "extdata", "chr20_2.2.fq.bz2")),
#'        # MODIFY: Set the genome for your data
#'        genome = "hg19")
#'}
#' @export
#'
atacPipe2 <- function(genome, case = list(fastqInput1="paths/To/fastq1",fastqInput2="paths/To/fastq2", adapter1 = NULL, adapter2 = NULL),
                      control =list(fastqInput1="paths/To/fastq1",fastqInput2="paths/To/fastq2", adapter1 = NULL, adapter2 = NULL),
                      refdir=NULL, tmpdir=NULL, threads=2, interleave = FALSE, createReport = TRUE, motifs = NULL,
                      chr = c(1:22, "X", "Y"), p.cutoff = 1e-6, ...){ #saveTmp = TRUE,
    stop("this function is still under developing")

}


checkFilePathExist <- function(afilePaths){
    if(FALSE %in% file.exists(afilePaths)){
        stop(sprintf("file '%s' does not exist",afilePaths))
    }
}

#' @name atacRepsPipe
#' @title Pipeline for multi-replicates case paired-end sequencing data
#' @description
#' The preset pipeline to process multi-replicates case study sequencing data.
#' HTML report files, result files(e.g. BED, BAM files) and
#' conclusion list will generated. See detail for usage.
#' @param genome \code{Character} scalar. The genome(like hg19, mm10, etc.) reference data in "refdir" to be used in the pipeline.
#' @param fastqInput1 \code{List} scalar. For single-end sequencing,
#' it contains sequence file paths.
#' For paired-end sequencing, it can be file paths with #1 mates paired
#' with file paths in fastqInput2
#' And it can also be interleaved file paths when argument
#' interleaved=\code{TRUE}.
#' Each element in the fastqInput1 \code{List} is for a replicate
#' It can be a \code{Character} vector of FASTQ files paths to be merged.
#' @param fastqInput2 \code{List} scalar. It contains file paths with #2
#' mates paired with file paths in fastqInput1.
#' For single-end sequencing files and interleaved paired-end sequencing
#' files(argument interleaved=\code{TRUE}),
#' it must be \code{NULL}.
#' Each element in the fastqInput1 \code{List} is for a replicate
#' It can be a \code{Character} vector of FASTQ files paths to be merged.
#' @param refdir \code{Character} scalar. The path for reference data being installed to and storage.
#' @param tmpdir \code{Character} scalar. The temporary file storage path.
#' @param threads \code{Integer} scalar. The max threads allowed to be created.
#' @param adapter1 \code{Character} scalar. It is an adapter sequence for file1.
#' For single end data, it is requied.
#' @param adapter2 \code{Character} scalar. It is an adapter sequence for file2.
#' @param interleave \code{Logical} scalar. Set \code{TRUE} when files are
#' interleaved paired-end sequencing data.
#' @param createReport \code{Logical} scalar. If the HTML report file will be created.
#' @param motifs either\code{\link{PFMatrix}}, \code{\link{PFMatrixList}},
#' \code{\link{PWMatrix}}, \code{\link{PWMatrixList}}, default: vertebrates motif from JASPAR.
#' @param prefix \code{Character} scalar. Temporary file prefix for identifying files
#' when multiple pipeline generating file in the same tempdir.
#' @param chr Which chromatin the program will processing. It must be identical
#' with the filename of cut site information files or subset of .
#' Default:c(1:22, "X", "Y").
#' @param p.cutoff p-value cutoff for returning motifs, default: 1e-6.
#' @param ... Additional arguments, currently unused.
#' @return \code{List} scalar. It is a list that save the result of the pipeline.
#' Slot "filelist": the input file paths.
#' Slot "wholesummary": a dataframe that for quality control summary
#' Slot "atacProcs": \code{\link{ATACProc-class}} objects generated by each process in the pipeline.
#' Slot "filtstat": a dataframe that summary the reads filted in each process.
#'
#' @author Zheng Wei and Wei Zhang
#' @seealso
#' \code{\link{printMap}},
#' \code{\link{atacPipe2}},
#' \code{\link{atacRenamer}},
#' \code{\link{atacRemoveAdapter}},
#' \code{\link{atacBowtie2Mapping}},
#' \code{\link{atacPeakCalling}},
#' \code{\link{atacMotifScan}}
#' @examples
#' \dontrun{
#' ## These codes are time consuming so they will not be run and
#' ## checked by bioconductor checker.
#'
#'
#' # call pipeline
#' # for a quick example(only CTCF and BATF3 will be processing)
#' conclusion <-
#'   atacRepsPipe(
#'        # MODIFY: Change these paths to your own case files!
#'        # e.g. fastqInput1 = "your/own/data/path.fastq"
#'        fastqInput1 = list(system.file(package="esATAC", "extdata", "chr20_1.1.fq.gz"),
#'                           system.file(package="esATAC", "extdata", "chr20_1.2.fq.bz2")),
#'        fastqInput2 = list(system.file(package="esATAC", "extdata", "chr20_2.1.fq.gz"),
#'                           system.file(package="esATAC", "extdata", "chr20_2.2.fq.bz2")),
#'        # MODIFY: Set the genome for your data
#'        genome = "hg19",
#'        motifs = getMotifInfo(motif.file = system.file("extdata", "CustomizedMotif.txt", package="esATAC")))
#'
#' # call pipeline
#' # for overall example(all vertebrates motif in JASPAR will be processed)
#' conclusion <-
#'   atacRepsPipe(
#'        # MODIFY: Change these paths to your own case files!
#'        # e.g. fastqInput1 = "your/own/data/path.fastq"
#'        fastqInput1 = list(system.file(package="esATAC", "extdata", "chr20_1.1.fq.gz"),
#'                           system.file(package="esATAC", "extdata", "chr20_1.2.fq.bz2")),
#'        fastqInput2 = list(system.file(package="esATAC", "extdata", "chr20_2.1.fq.gz"),
#'                           system.file(package="esATAC", "extdata", "chr20_2.2.fq.bz2")),
#'        # MODIFY: Set the genome for your data
#'        genome = "hg19")
#' }
#' @importFrom VennDiagram venn.diagram
#' @importFrom corrplot corrplot
#' @importFrom grDevices colorRampPalette
#' @export
#'
atacRepsPipe <- function(genome, fastqInput1, fastqInput2 = NULL, refdir = NULL, tmpdir = NULL, threads = 2,
                         adapter1 = NULL, adapter2 = NULL, interleave = FALSE,  createReport = TRUE,
                         motifs = NULL, prefix = NULL, chr = c(1:22, "X", "Y"), p.cutoff = 1e-6, ...){
    stop("this function is still under developing")

}

#' @importFrom GenomeInfoDb seqlengths
#' @importFrom rtracklayer import.bed

getBinsReadsCount <- function(bedInput,bsgenome,binsize = 1000){
    abedfile <- import.bed(con = bedInput)
    chrominfo<-seqinfo(bsgenome)
    chroms <- seqnames(chrominfo)
    chromsize <- GenomeInfoDb::seqlengths(chrominfo)
    binnumb <- as.integer(chromsize/binsize)+1
    readsCountsList <- list()
    pos<-as.integer((end(ranges(abedfile))+start(ranges(abedfile)))/2/binsize) + 1
    for(i in 1:length(chroms)){
        readsCountsList[[chroms[i]]]<-rep(0,binnumb[i])
        posadd <- pos[as.character(seqnames(abedfile))==chroms[i]]
        readsCountsNumber<-table(posadd)
        readsCountsList[[chroms[i]]][as.integer(names(readsCountsNumber))] <- readsCountsList[[chroms[i]]][as.integer(names(readsCountsNumber))] + as.numeric(readsCountsNumber)
    }
    return(AnnotationDbi::unlist2(readsCountsList))
    #return(readsCountsList)
}



#' @name atacRepsPipe2
#' @title Pipeline for multi-replicates case-control paired-end sequencing data
#' @description
#' The preset pipeline to process multi-replicates case control study sequencing data.
#' HTML report files, result files(e.g. BED, BAM files) and
#' conclusion list will generated. See detail for usage.
#' @param genome \code{Character} scalar. The genome(like hg19, mm10, etc.) reference data in "refdir" to be used in the pipeline.
#' @param caseFastqInput1 \code{List} scalar. Input for case samples.
#' For single-end sequencing,
#' it contains sequence file paths.
#' For paired-end sequencing, it can be file paths with #1 mates paired
#' with file paths in fastqInput2
#' And it can also be interleaved file paths when argument
#' interleaved=\code{TRUE}.
#' Each element in the caseFastqInput1 \code{List} is for a replicate
#' It can be a \code{Character} vector of FASTQ files paths to be merged.
#' @param caseFastqInput2 \code{List} scalar. Input for case samples.
#' It contains file paths with #2
#' mates paired with file paths in caseFastqInput1
#' For single-end sequencing files and interleaved paired-end sequencing
#' files(argument interleaved=\code{TRUE}),
#' it must be \code{NULL}.
#' Each element in the caseFastqInput2 \code{List} is for a replicate
#' @param ctrlFastqInput1 \code{List} scalar. Input for control samples.
#' For single-end sequencing,
#' it contains sequence file paths.
#' For paired-end sequencing, it can be file paths with #1 mates paired
#' with file paths in ctrlFastqInput2
#' And it can also be interleaved file paths when argument
#' interleaved=\code{TRUE}.
#' Each element in the ctrlFastqInput1 \code{List} is for a replicate
#' It can be a \code{Character} vector of FASTQ files paths to be merged.
#' @param ctrlFastqInput2 \code{List} scalar. Input for control samples.
#' It contains file paths with #2
#' mates paired with file paths in fastqInput1.
#' For single-end sequencing files and interleaved paired-end sequencing
#' files(argument interleaved=\code{TRUE}),
#' it must be \code{NULL}.
#' Each element in the ctrlFastqInput1 \code{List} is for a replicate
#' @param caseAdapter1 \code{Character} scalar. Adapter for caseFastqInput1.
#' @param caseAdapter2 \code{Character} scalar. Adapter for caseFastqInput2.
#' @param ctrlAdapter1 \code{Character} scalar. Adapter for ctrlFastqInput1.
#' @param ctrlAdapter2 \code{Character} scalar. Adapter for ctrlFastqInput2.
#' @param refdir \code{Character} scalar. The path for reference data being installed to and storage.
#' @param tmpdir \code{Character} scalar. The temporary file storage path.
#' @param threads \code{Integer} scalar. The max threads allowed to be created.
#' @param interleave \code{Logical} scalar. Set \code{TRUE} when files are
#' interleaved paired-end sequencing data.
#' @param createReport \code{Logical} scalar. If the HTML report file will be created.
#' @param motifs either\code{\link{PFMatrix}}, \code{\link{PFMatrixList}},
#' \code{\link{PWMatrix}}, \code{\link{PWMatrixList}}, default: vertebrates motif from JASPAR.
#' @param chr Which chromatin the program will processing. It must be identical
#' with the filename of cut site information files or subset of .
#' Default:c(1:22, "X", "Y").
#' @param p.cutoff p-value cutoff for returning motifs, default: 1e-6.
#' @param ... Additional arguments, currently unused.
#' @return \code{List} scalar. It is a list that save the result of the pipeline.
#' Slot "caselist" and "ctrlist": Each of them is a list that save the result for case or control data.
#' Slot "comp_result": compare analysis result for case and control data
#' @details
#' NOTE:
#' Build bowtie index in this function may take some time. If you already have bowtie2 index files or you want to download(ftp://ftp.ccb.jhu.edu/pub/data/bowtie2_indexes) instead of building, you can let esATAC skip the steps by renaming them following the format (genome+suffix) and put them in reference installation path (refdir).
#' Example: hg19 bowtie2 index files
#'
#' \itemize{
#' \item hg19.1.bt2
#' \item hg19.2.bt2
#' \item hg19.3.bt2
#' \item hg19.4.bt2
#' \item hg19.rev.1.bt2
#' \item hg19.rev.2.bt2
#' }
#'
#' For single end reads FASTQ files,
#' The required parameters are fastqInput1 and adapter1.
#' For paired end reads non-interleaved FASTQ files (interleave=FALSE,defualt),
#' The required parameters are fastqInput1 and fastqInput2.
#' Otherwise, parameter fastqInput2 is not required (interleave=TRUE)
#'
#' The paths of sequencing data replicates can be a \code{Character} vector.
#' For example:
#'
#' fastqInput1=c("file_1.rep1.fastq","file_1.rep2.fastq")
#'
#' fastqInput2=c("file_2.rep1.fastq","file_2.rep2.fastq")
#'
#' The result will be return by the function.
#' An HTML report file will be created for paired end reads.
#' Intermediate files will be save at tmpdir path (default is ./)
#' @author Zheng Wei and Wei Zhang
#' @seealso
#' \code{\link{atacPipe}}
#' @import JASPAR2018
#' @importFrom TFBSTools getMatrixSet
#' @importFrom TFBSTools toPWM
#' @importFrom TFBSTools name
#' @examples
#' \dontrun{
#' ## These codes are time consuming so they will not be run and
#' ## checked by bioconductor checker.
#'
#'
#' # call pipeline
#' # for a quick example(only CTCF will be processed)
#' conclusion <-
#'     atacRepsPipe2(
#'         # MODIFY: Change these paths to your own case files!
#'         # e.g. fastqInput1 = "your/own/data/path.fastq"
#'      caseFastqInput1=list(system.file(package="esATAC", "extdata", "chr20_1.1.fq.gz"),
#'                           system.file(package="esATAC", "extdata", "chr20_1.1.fq.gz")),
#'      # MODIFY: Change these paths to your own case files!
#'      # e.g. fastqInput1 = "your/own/data/path.fastq"
#'      caseFastqInput2=list(system.file(package="esATAC", "extdata", "chr20_2.1.fq.gz"),
#'                           system.file(package="esATAC", "extdata", "chr20_2.1.fq.gz")),
#'      # MODIFY: Change these paths to your own control files!
#'      # e.g. fastqInput1 = "your/own/data/path.fastq"
#'      ctrlFastqInput1=list(system.file(package="esATAC", "extdata", "chr20_1.2.fq.bz2"),
#'                           system.file(package="esATAC", "extdata", "chr20_1.2.fq.bz2")),
#'      # MODIFY: Change these paths to your own control files!
#'      # e.g. fastqInput1 = "your/own/data/path.fastq"
#'      ctrlFastqInput2=list(system.file(package="esATAC", "extdata", "chr20_2.2.fq.bz2"),
#'                           system.file(package="esATAC", "extdata", "chr20_2.2.fq.bz2")),
#'      # MODIFY: Set the genome for your data
#'      genome = "hg19",
#'      motifs = getMotifInfo(motif.file = system.file("extdata", "CustomizedMotif.txt", package="esATAC")))
#'
#'
#' # call pipeline
#' # for overall example(all human motif in JASPAR will be processed)
#' conclusion <-
#'     atacRepsPipe2(
#'         # MODIFY: Change these paths to your own case files!
#'         # e.g. fastqInput1 = "your/own/data/path.fastq"
#'      caseFastqInput1=list(system.file(package="esATAC", "extdata", "chr20_1.1.fq.gz"),
#'                           system.file(package="esATAC", "extdata", "chr20_1.1.fq.gz")),
#'      # MODIFY: Change these paths to your own case files!
#'      # e.g. fastqInput1 = "your/own/data/path.fastq"
#'      caseFastqInput2=list(system.file(package="esATAC", "extdata", "chr20_2.1.fq.gz"),
#'                           system.file(package="esATAC", "extdata", "chr20_2.1.fq.gz")),
#'      # MODIFY: Change these paths to your own control files!
#'      # e.g. fastqInput1 = "your/own/data/path.fastq"
#'      ctrlFastqInput1=list(system.file(package="esATAC", "extdata", "chr20_1.2.fq.bz2"),
#'                           system.file(package="esATAC", "extdata", "chr20_1.2.fq.bz2")),
#'      # MODIFY: Change these paths to your own control files!
#'      # e.g. fastqInput1 = "your/own/data/path.fastq"
#'      ctrlFastqInput2=list(system.file(package="esATAC", "extdata", "chr20_2.2.fq.bz2"),
#'                           system.file(package="esATAC", "extdata", "chr20_2.2.fq.bz2")),
#'      # MODIFY: Set the genome for your data
#'      genome = "hg19"
#'      )
#'}
#' @export
#'
atacRepsPipe2 <- function(genome, caseFastqInput1,caseFastqInput2, ctrlFastqInput1, ctrlFastqInput2,
                          caseAdapter1 = NULL, caseAdapter2 = NULL, ctrlAdapter1 = NULL, ctrlAdapter2 = NULL,
                          refdir=NULL, tmpdir=NULL, threads=2, interleave = FALSE, createReport = TRUE, motifs = NULL,
                          chr = c(1:22, "X", "Y"), p.cutoff = 1e-6, ...){ #saveTmp = TRUE,

    stop("this function is still under developing")

}

Try the esATAC package in your browser

Any scripts or data that you put into this service are public.

esATAC documentation built on Nov. 8, 2020, 6:58 p.m.