R/core_in_WEXAC.R

Defines functions alignSTAR mkSTARgenome

Documented in alignSTAR mkSTARgenome

#' Make STAR genome
#'
#' Wrapper for STAR genome generation
#'
#' @param fastaGenome
#' @param bedAnnotation
#' @param outDir
#' @param nCores
#' @param maxReadLength
#'
#' @return
#' @export
#'
#' @examples
mkSTARgenome <- function(fastaGenome, bedAnnotation = NULL, outDir = NULL,
                         nCores = 2, maxReadLength = 101){
    outName <- paste0(strsplit(fastaGenome, split = "/") %>% unlist %>% utils::tail(1),
                      ".STAR")
    if(is.null(outDir)){
        mkTmpDir()
        outDir <- file.path(getwd(), "STORMtmp_dir", outName)
    }else if(!is.null(outDir)){
        outDir <- outDir
    }
    genome <- txtools::tx_load_genome(fastaGenome)
    lGen <- sum(Biostrings::width(genome))
    nRef <- length(Biostrings::width(genome))
    if(nRef > 1000){
        genomChrBinNbits <- floor(min(18,log2(max(lGen/nRef, maxReadLength))))
    }else{
        genomChrBinNbits <- 18
    }
    genomeindexNb <- floor(min(14, log2(lGen)/2 - 1))
    if(is.null(bedAnnotation)){
        com <- paste("module load STAR/2.7.5c &&",
                     "/apps/RH7U2/general/STAR/2.7.5c/bin/Linux_x86_64/STAR",
                     "--runMode genomeGenerate",
                     "--runThreadN", nCores,
                     "--genomeDir", outDir,
                     "--genomeFastaFiles", fastaGenome,
                     "--genomeChrBinNbits", genomChrBinNbits,
                     "--genomeSAindexNbases", genomeindexNb)
        system(com)
    }else{
        tmpF <- tempfile() %>% strsplit(split = "/") %>% unlist %>% utils::tail(1)
        tmpGTF <- file.path(getwd(), "STORMtmp_dir", tmpF)
        mkGTF(bedAnnotation, tmpGTF)
        com <- paste("module load STAR/2.7.5c &&",
                     "/apps/RH7U2/general/STAR/2.7.5c/bin/Linux_x86_64/STAR",
                     "--runMode genomeGenerate",
                     "--runThreadN", nCores,
                     "--genomeDir", outDir,
                     "--genomeFastaFiles", fastaGenome,
                     "--sjdbOverhang", maxReadLength - 1,
                     "--sjdbGTFfile", tmpGTF,
                     "--genomeChrBinNbits", genomChrBinNbits,
                     "--genomeSAindexNbases", genomeindexNb)
        system(com)
        invisible(file.remove(tmpGTF))
    }
    outDir
}

#' STAR alignment
#'
#' Wrapper to perform alignment of sequencing reads to a reference genome
#' using STAR (Dobin) and sorting and indexing using Samtools.
#'
#' @param read1Files character. Path to R1 FASTQ files
#' @param STARgenomeDir character. Path to STAR genome to map to
#' @param pairedEnd logical. Indicating if to expect a Read2 FASTQ file
#' File will be automatically looked for but both need to include "_R1", and "_R2"
#' in their respective file names.
#' @param zipped logical. TRUE as default for zipped FASTQ files, will be read with zcat.
#' If set to FALSE FASTQ files will be read with cat, as not zipped.
#' @param nCores numeric. Number of cores used for alignment and sorting processes.
#' @param outFilterMultimapNmax numeric. Threshold for which a read will have multiple
#' mappings. Default is 10. For unique alignment change the value to 1.
#' @param outDir character. Path to output directory
#' @param alignIntronMax numeric. maximum intron size, if 0, max intron size will be determined
#' by default as (2ˆwinBinNbits)*winAnchorDistNbins (See STAR manual for more info)
#' @param alignEndsType character. type of read ends alignment (See STAR manual for more info)
#' @param otherSTARparams character. Additional parameters not covered by the arguments
#' of this function can be added here, separated by spaces.
#' @param dry logical. If set to TRUE the alignment is not performed, only output
#' are the paths to expected output files.
#'
#' @return
#' @export
alignSTAR <- function(read1Files, STARgenomeDir, pairedEnd = TRUE, zipped = TRUE,
                      nCores = 4, alignEndsType = "Local",
                      alignIntronMax = 0, outFilterMultimapNmax = 10,
                      outDir, otherSTARparams = "", dry = FALSE){
    if(!all(grepl(patt = "_R1", read1Files))){stop("All read1Files must contain the string '_R1'")}
    rootNames <- lapply(read1Files, function(x){strsplit(x, split = "/") %>% unlist %>%
            tail(1)}) %>% unlist %>% gsub(pattern = "_R1(.)+", replacement = "")
    bamFiles <- file.path("STORMtmp_dir", paste0(rootNames, "_Aligned.out.bam"))
    BAM <- c(gsub(bamFiles, pattern = ".bam$", replacement = ".sorted.bam"),
             gsub(bamFiles, pattern = ".bam$", replacement = ".sorted.bam.bai"))
    outBAM <- lapply(gsub(bamFiles, pattern = ".bam$", replacement = ".sorted.bam"),
                     function(x){strsplit(x, split = "/") %>% unlist %>%
                             tail(1)}) %>% unlist %>% file.path(outDir, .)
    if(dry){return(outBAM)}
    mkTmpDir()
    if(!dir.exists(outDir)){dir.create(outDir)}
    if(zipped){rFCom <- "zcat"}else if(!zipped){rFCom <- "cat"}
    for(read1F in read1Files){
        if(pairedEnd){
            read2F <- gsub(read1F, pattern = "_R1", replacement = "_R2")
            if(!file.exists(read2F)){stop(read2F, "does not exist.")}
        }else if(!pairedEnd){
            read2F <- ""
        }else{stop("pairedEnd must be logical either TRUE or FALSE")}
        outFPrefix <- strsplit(read1F, split = "/") %>% unlist %>% tail(1) %>%
            gsub(pattern = "R1.fastq.gz", replacement = "") %>%
            gsub(read1F, pattern = "R1.fastq", replacement = "")
        outFPrefix <- file.path(getwd(), "STORMtmp_dir", outFPrefix)
        com <- paste0("module load STAR/2.7.5c &&",
                      "/apps/RH7U2/general/STAR/2.7.5c/bin/Linux_x86_64/STAR",
                      " --runMode alignReads",
                      " --runThreadN ", nCores,
                      " --genomeDir ", STARgenomeDir,
                      " --readFilesCommand ", rFCom,
                      " --readFilesIn ", read1F, " ", read2F,
                      " --outFileNamePrefix ", outFPrefix,
                      " --outSAMtype ", "BAM Unsorted",
                      " --outFilterMultimapNmax ", outFilterMultimapNmax,
                      " --alignEndsType ", alignEndsType,
                      " --alignIntronMax ", alignIntronMax,
                      " ", otherSTARparams)
        system(com)
    }
    # Alignment Summary Report
    logFiles <- file.path("STORMtmp_dir", paste0(rootNames, "_Log.final.out"))
    # all(file.exists(logFiles))
    RES <- lapply(logFiles, function(x){
        utils::read.delim(file = x, header = FALSE, stringsAsFactors = FALSE)
    })
    # Merge in one table
    summary <- lapply(seq_along(RES), function(x){
        RES[[x]][,2]
    }) %>% do.call(what = cbind)
    rownames(summary) <- RES[[1]][,1]
    colnames(summary) <- rootNames
    outReport <- file.path(outDir, "mappingSummary.txt")
    if(file.exists(outReport)){ # Add columns to existing summary report
        tmp <- data.table::fread(outReport, header = TRUE) %>%
            tibble::column_to_rownames("V1")
        utils::write.table(x = cbind(tmp, summary), file = outReport,
                           sep = "\t", quote = F, col.names = NA)
    }else{
        utils::write.table(x = summary, file = outReport, sep = "\t", quote = F,
                           col.names = NA)
    }
    # Sort and index with samtools
    for(file in bamFiles){
        system(paste0("module load samtools/1.9 && ",
                      "/apps/RH7U2/gnu/samtools/1.9/bin/samtools sort -o ",
                      gsub(file, pattern = ".bam$", replacement = ".sorted.bam"),
                      " ", file, " -@", nCores))
        system(paste0("module load samtools/1.9 && ",
                      "/apps/RH7U2/gnu/samtools/1.9/bin/samtools index ",
                      gsub(file, pattern = ".bam$", replacement = ".sorted.bam")))
        system(paste0("rm ", file))
    }
    # Remove all garbage
    garbage <- file.path("STORMtmp_dir",
                         lapply(rootNames, function(x){
                             paste0(x, c("_SJ.out.tab", "_Log.progress.out", "_Log.out",
                                         "_Log.final.out"))
                         }) %>% unlist)
    invisible(file.remove(garbage))
    # Move files to output dir
    for(file in BAM){
        system(paste("mv", file, outDir))
    }
    outBAM
}

#' STORM object constructor from META table
#'
#'
#'
#' @param META data.frame Experimental design
#' @param genome
#' @param geneAnnot
#' @param nCores
#'
#' @return
#' @export
#'
#' @examples
storm_STORM <- function(META, genome = NULL, geneAnnot = NULL, nCores = 1){
    if(hasDups(META$id)){
        stop("Not allowed duplicated id variables in META")
    }
    DTL <- lapply(META$RDS, function(x) readRDS(x)) %>% magrittr::set_names(META$id)# load RDS files
    # Check if data is in data.table or list format
    if(all(lapply(DTL, class) %>% unlist %>% magrittr::equals("data.frame"))){
        DTL <- lapply(DTL, data.table::data.table) %>% magrittr::set_names(META$id)
    }else if(all(lapply(DTL, class) %>% unlist %>% magrittr::equals("list"))){
        DTL <- lapply(DTL, function(x){
            do.call(x, what = rbind) %>% data.table::data.table()
        }) %>% magrittr::set_names(META$id)
    }
    # Have reference sequence, if not add it
    reqRefSeq <- lapply(DTL, function(x){"refSeq" %in% names(x)}) %>% unlist %>% magrittr::not()
    if(sum(reqRefSeq) > 0 & is.null(genome) | is.null(geneAnnot)){
        stop("Data requires reference sequence, genome and geneAnnot arguments must be provided")
    }
    if(sum(reqRefSeq) > 0){
        DTL[reqRefSeq] <- parallel::mclapply(mc.cores = nCores, DTL[reqRefSeq], function(DT){
            txtools::tx_split_DT(DT) %>% lapply(function(x){
                txtools::tx_add_refSeqDT(DT = x, genome = genome, geneAnnot = geneAnnot)
            }) %>% txtools::tx_merge_DT()
        })
    }
    # Check uniformity of DTs, if unequal equalize
    tmpPos <- lapply(DTL, function(DT){paste(DT$gene, DT$txcoor, sep = ":")})
    identCoors <- lapply(tmpPos[-1], function(x){identical(tmpPos[[1]], x)}) %>% unlist() %>% all()
    if(!identCoors){
        if(is.null(geneAnnot) | is.null(genome)){
            stop("Data needs compatibility adjustment, this requires geneAnnot and
                 genome arguments to be provided")
        }
        tmpGenes <- lapply(DTL, function(x) as.character(x$gene)) %>% unlist() %>% unique()
        tmpGA <- geneAnnot[match(tmpGenes, geneAnnot$name)]
        DTL <- lapply(DTL, function(DT){
            txtools::tx_complete_DT(DT, tmpGA, genome, nCores)
        })
    }
    if(!("pos" %in% names(DTL[[1]]))){
        DTL <- lapply(DTL, function(x) txtools::tx_add_pos(x))
    }
    STORM <- list(META = META,
                  DATA = DTL,
                  RES  = NULL)
    return(STORM)
}

#' Confussion matrix plot
#'
#' @param STORM
#' @param title
#' @param pointAlpha
#'
#' @return
#' @export
#'
storm_plot_confMat <- function(STORM, title = "", pointAlpha = 0.5){
    df <- storm_confMatrix4plot(STORM)
    ggplot(df, aes(x = outcome, y = RNAmod)) +
        geom_jitter(alpha = 0.5, width = 0.2, height = 0.2) +
        theme_bw() + ggtitle(title)
}
SchwartzLab/brainSTORM documentation built on May 14, 2022, 5:14 p.m.