#' Make STAR genome
#'
#' Wrapper for STAR genome generation
#'
#' @param fastaGenome
#' @param bedAnnotation
#' @param outDir
#' @param nCores
#' @param maxReadLength
#'
#' @return
#' @export
#'
#' @examples
mkSTARgenome <- function(fastaGenome, bedAnnotation = NULL, outDir = NULL,
nCores = 2, maxReadLength = 101){
outName <- paste0(strsplit(fastaGenome, split = "/") %>% unlist %>% utils::tail(1),
".STAR")
if(is.null(outDir)){
mkTmpDir()
outDir <- file.path(getwd(), "STORMtmp_dir", outName)
}else if(!is.null(outDir)){
outDir <- outDir
}
genome <- txtools::tx_load_genome(fastaGenome)
lGen <- sum(Biostrings::width(genome))
nRef <- length(Biostrings::width(genome))
if(nRef > 1000){
genomChrBinNbits <- floor(min(18,log2(max(lGen/nRef, maxReadLength))))
}else{
genomChrBinNbits <- 18
}
genomeindexNb <- floor(min(14, log2(lGen)/2 - 1))
if(is.null(bedAnnotation)){
com <- paste("module load STAR/2.7.5c &&",
"/apps/RH7U2/general/STAR/2.7.5c/bin/Linux_x86_64/STAR",
"--runMode genomeGenerate",
"--runThreadN", nCores,
"--genomeDir", outDir,
"--genomeFastaFiles", fastaGenome,
"--genomeChrBinNbits", genomChrBinNbits,
"--genomeSAindexNbases", genomeindexNb)
system(com)
}else{
tmpF <- tempfile() %>% strsplit(split = "/") %>% unlist %>% utils::tail(1)
tmpGTF <- file.path(getwd(), "STORMtmp_dir", tmpF)
mkGTF(bedAnnotation, tmpGTF)
com <- paste("module load STAR/2.7.5c &&",
"/apps/RH7U2/general/STAR/2.7.5c/bin/Linux_x86_64/STAR",
"--runMode genomeGenerate",
"--runThreadN", nCores,
"--genomeDir", outDir,
"--genomeFastaFiles", fastaGenome,
"--sjdbOverhang", maxReadLength - 1,
"--sjdbGTFfile", tmpGTF,
"--genomeChrBinNbits", genomChrBinNbits,
"--genomeSAindexNbases", genomeindexNb)
system(com)
invisible(file.remove(tmpGTF))
}
outDir
}
#' STAR alignment
#'
#' Wrapper to perform alignment of sequencing reads to a reference genome
#' using STAR (Dobin) and sorting and indexing using Samtools.
#'
#' @param read1Files character. Path to R1 FASTQ files
#' @param STARgenomeDir character. Path to STAR genome to map to
#' @param pairedEnd logical. Indicating if to expect a Read2 FASTQ file
#' File will be automatically looked for but both need to include "_R1", and "_R2"
#' in their respective file names.
#' @param zipped logical. TRUE as default for zipped FASTQ files, will be read with zcat.
#' If set to FALSE FASTQ files will be read with cat, as not zipped.
#' @param nCores numeric. Number of cores used for alignment and sorting processes.
#' @param outFilterMultimapNmax numeric. Threshold for which a read will have multiple
#' mappings. Default is 10. For unique alignment change the value to 1.
#' @param outDir character. Path to output directory
#' @param alignIntronMax numeric. maximum intron size, if 0, max intron size will be determined
#' by default as (2ˆwinBinNbits)*winAnchorDistNbins (See STAR manual for more info)
#' @param alignEndsType character. type of read ends alignment (See STAR manual for more info)
#' @param otherSTARparams character. Additional parameters not covered by the arguments
#' of this function can be added here, separated by spaces.
#' @param dry logical. If set to TRUE the alignment is not performed, only output
#' are the paths to expected output files.
#'
#' @return
#' @export
alignSTAR <- function(read1Files, STARgenomeDir, pairedEnd = TRUE, zipped = TRUE,
nCores = 4, alignEndsType = "Local",
alignIntronMax = 0, outFilterMultimapNmax = 10,
outDir, otherSTARparams = "", dry = FALSE){
if(!all(grepl(patt = "_R1", read1Files))){stop("All read1Files must contain the string '_R1'")}
rootNames <- lapply(read1Files, function(x){strsplit(x, split = "/") %>% unlist %>%
tail(1)}) %>% unlist %>% gsub(pattern = "_R1(.)+", replacement = "")
bamFiles <- file.path("STORMtmp_dir", paste0(rootNames, "_Aligned.out.bam"))
BAM <- c(gsub(bamFiles, pattern = ".bam$", replacement = ".sorted.bam"),
gsub(bamFiles, pattern = ".bam$", replacement = ".sorted.bam.bai"))
outBAM <- lapply(gsub(bamFiles, pattern = ".bam$", replacement = ".sorted.bam"),
function(x){strsplit(x, split = "/") %>% unlist %>%
tail(1)}) %>% unlist %>% file.path(outDir, .)
if(dry){return(outBAM)}
mkTmpDir()
if(!dir.exists(outDir)){dir.create(outDir)}
if(zipped){rFCom <- "zcat"}else if(!zipped){rFCom <- "cat"}
for(read1F in read1Files){
if(pairedEnd){
read2F <- gsub(read1F, pattern = "_R1", replacement = "_R2")
if(!file.exists(read2F)){stop(read2F, "does not exist.")}
}else if(!pairedEnd){
read2F <- ""
}else{stop("pairedEnd must be logical either TRUE or FALSE")}
outFPrefix <- strsplit(read1F, split = "/") %>% unlist %>% tail(1) %>%
gsub(pattern = "R1.fastq.gz", replacement = "") %>%
gsub(read1F, pattern = "R1.fastq", replacement = "")
outFPrefix <- file.path(getwd(), "STORMtmp_dir", outFPrefix)
com <- paste0("module load STAR/2.7.5c &&",
"/apps/RH7U2/general/STAR/2.7.5c/bin/Linux_x86_64/STAR",
" --runMode alignReads",
" --runThreadN ", nCores,
" --genomeDir ", STARgenomeDir,
" --readFilesCommand ", rFCom,
" --readFilesIn ", read1F, " ", read2F,
" --outFileNamePrefix ", outFPrefix,
" --outSAMtype ", "BAM Unsorted",
" --outFilterMultimapNmax ", outFilterMultimapNmax,
" --alignEndsType ", alignEndsType,
" --alignIntronMax ", alignIntronMax,
" ", otherSTARparams)
system(com)
}
# Alignment Summary Report
logFiles <- file.path("STORMtmp_dir", paste0(rootNames, "_Log.final.out"))
# all(file.exists(logFiles))
RES <- lapply(logFiles, function(x){
utils::read.delim(file = x, header = FALSE, stringsAsFactors = FALSE)
})
# Merge in one table
summary <- lapply(seq_along(RES), function(x){
RES[[x]][,2]
}) %>% do.call(what = cbind)
rownames(summary) <- RES[[1]][,1]
colnames(summary) <- rootNames
outReport <- file.path(outDir, "mappingSummary.txt")
if(file.exists(outReport)){ # Add columns to existing summary report
tmp <- data.table::fread(outReport, header = TRUE) %>%
tibble::column_to_rownames("V1")
utils::write.table(x = cbind(tmp, summary), file = outReport,
sep = "\t", quote = F, col.names = NA)
}else{
utils::write.table(x = summary, file = outReport, sep = "\t", quote = F,
col.names = NA)
}
# Sort and index with samtools
for(file in bamFiles){
system(paste0("module load samtools/1.9 && ",
"/apps/RH7U2/gnu/samtools/1.9/bin/samtools sort -o ",
gsub(file, pattern = ".bam$", replacement = ".sorted.bam"),
" ", file, " -@", nCores))
system(paste0("module load samtools/1.9 && ",
"/apps/RH7U2/gnu/samtools/1.9/bin/samtools index ",
gsub(file, pattern = ".bam$", replacement = ".sorted.bam")))
system(paste0("rm ", file))
}
# Remove all garbage
garbage <- file.path("STORMtmp_dir",
lapply(rootNames, function(x){
paste0(x, c("_SJ.out.tab", "_Log.progress.out", "_Log.out",
"_Log.final.out"))
}) %>% unlist)
invisible(file.remove(garbage))
# Move files to output dir
for(file in BAM){
system(paste("mv", file, outDir))
}
outBAM
}
#' STORM object constructor from META table
#'
#'
#'
#' @param META data.frame Experimental design
#' @param genome
#' @param geneAnnot
#' @param nCores
#'
#' @return
#' @export
#'
#' @examples
storm_STORM <- function(META, genome = NULL, geneAnnot = NULL, nCores = 1){
if(hasDups(META$id)){
stop("Not allowed duplicated id variables in META")
}
DTL <- lapply(META$RDS, function(x) readRDS(x)) %>% magrittr::set_names(META$id)# load RDS files
# Check if data is in data.table or list format
if(all(lapply(DTL, class) %>% unlist %>% magrittr::equals("data.frame"))){
DTL <- lapply(DTL, data.table::data.table) %>% magrittr::set_names(META$id)
}else if(all(lapply(DTL, class) %>% unlist %>% magrittr::equals("list"))){
DTL <- lapply(DTL, function(x){
do.call(x, what = rbind) %>% data.table::data.table()
}) %>% magrittr::set_names(META$id)
}
# Have reference sequence, if not add it
reqRefSeq <- lapply(DTL, function(x){"refSeq" %in% names(x)}) %>% unlist %>% magrittr::not()
if(sum(reqRefSeq) > 0 & is.null(genome) | is.null(geneAnnot)){
stop("Data requires reference sequence, genome and geneAnnot arguments must be provided")
}
if(sum(reqRefSeq) > 0){
DTL[reqRefSeq] <- parallel::mclapply(mc.cores = nCores, DTL[reqRefSeq], function(DT){
txtools::tx_split_DT(DT) %>% lapply(function(x){
txtools::tx_add_refSeqDT(DT = x, genome = genome, geneAnnot = geneAnnot)
}) %>% txtools::tx_merge_DT()
})
}
# Check uniformity of DTs, if unequal equalize
tmpPos <- lapply(DTL, function(DT){paste(DT$gene, DT$txcoor, sep = ":")})
identCoors <- lapply(tmpPos[-1], function(x){identical(tmpPos[[1]], x)}) %>% unlist() %>% all()
if(!identCoors){
if(is.null(geneAnnot) | is.null(genome)){
stop("Data needs compatibility adjustment, this requires geneAnnot and
genome arguments to be provided")
}
tmpGenes <- lapply(DTL, function(x) as.character(x$gene)) %>% unlist() %>% unique()
tmpGA <- geneAnnot[match(tmpGenes, geneAnnot$name)]
DTL <- lapply(DTL, function(DT){
txtools::tx_complete_DT(DT, tmpGA, genome, nCores)
})
}
if(!("pos" %in% names(DTL[[1]]))){
DTL <- lapply(DTL, function(x) txtools::tx_add_pos(x))
}
STORM <- list(META = META,
DATA = DTL,
RES = NULL)
return(STORM)
}
#' Confussion matrix plot
#'
#' @param STORM
#' @param title
#' @param pointAlpha
#'
#' @return
#' @export
#'
storm_plot_confMat <- function(STORM, title = "", pointAlpha = 0.5){
df <- storm_confMatrix4plot(STORM)
ggplot(df, aes(x = outcome, y = RNAmod)) +
geom_jitter(alpha = 0.5, width = 0.2, height = 0.2) +
theme_bw() + ggtitle(title)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.