#' @title Run Arriba on RNA-Seq samples using structural-variants information derived from WGS.
#'
#' @param data.SV (GRanges): GRanges of SV derived from WGS.
#' @param minTAF (integer): Min. TAF of SV to take into the analysis, set to zero (0) for no filtering.
#' @param bamFolder (character): Path to folder containing the RNA-Seq BAM files (STAR).
#' @param outputFolder (character): Path to folder to write Arriba output.
#'
#' @return (chr) Command to run Arriba.
#' @examples
#' \donttest{
#'
#' runArriba(
#' data.SV = DR71.CohortWGS$structuralVariants,
#' minTAF = 0.05,
#' bamFolder = '/mnt/data2/hartwig/DR71/Oct2020/dataHMF/RNASeq/BAM/',
#' outputFolder = '/mnt/data2/hartwig/DR71/Oct2020/dataHMF/RNASeq/Arriba/'
#' )
#'
#' }
#' @author Job van Riet \email{j.vanriet@erasmusmc.nl}
#' @family RNA-Seq
#' @export
runArriba <- function(data.SV, minTAF = 0.05, bamFolder, outputFolder){
# Input validation --------------------------------------------------------
checkmate::checkClass(data.SV, 'GRanges')
checkmate::checkNumeric(minTAF)
checkmate::checkAccess(bamFolder, access = 'r')
checkmate::checkAccess(outputFolder, access = 'rw')
base::sprintf('Performing fusion-gene detection using Arriba on %s samples.', dplyr::n_distinct(data.SV$sample)) %>% ParallelLogger::logInfo()
# Convert SV to Arriba-friendly format ------------------------------------
base::sprintf('\t- Converting to Arriba-format.') %>% ParallelLogger::logInfo()
# Remove single-ends.
data.SV <- data.SV[data.SV$SVTYPE != 'SINGLE',]
# Clean remaining factors.
S4Vectors::mcols(data.SV) <- base::droplevels(S4Vectors::mcols(data.SV))
# Loop per sample
arriba.SV <- pbapply::pblapply(base::split(data.SV, data.SV$sample), function(x){
# Only retain partnered SV.
x <- x[!is.na(base::match(x$partner, base::gsub('.*\\.', '', base::names(x)))),]
# Format to Arriba standards.
x.SV <- data.frame(
'A' = sprintf('%s:%s', base::gsub('chr', '', base::as.character(GenomeInfoDb::seqnames(x))), as.numeric(GenomicRanges::start(x))),
'B' = sprintf('%s:%s', base::gsub('.*(\\[|\\])', '', base::gsub(':.*', '', x$ALT)), as.integer(base::gsub('(\\[|\\]).*', '', base::gsub('.*:', '', x$ALT)))),
'A.strand' = as.character(strand(x)),
'B.strand' = as.character(BiocGenerics::strand(x[base::match(x$partner, base::gsub('.*\\.', '', base::names(x))),])),
TAF = x$TAF
)
# Filter on min. TAF (if requested)
x.SV <- x.SV[x.SV$TAF >= minTAF,]
# Filter out SV which had no proper strand orientation.
x.SV <- x.SV %>% dplyr::filter(A.strand != '*', B.strand != '*')
return(x.SV)
}, cl = 1)
# Run Arriba --------------------------------------------------------------
base::sprintf('\t- Generating the command to run Arriba.') %>% ParallelLogger::logInfo()
# Write Arriba-friendly SVs to temp. dir.
base::lapply(base::names(arriba.SV), function(x){
z.SV <- arriba.SV[base::names(arriba.SV) == x][[1]]
write.table(
x = z.SV,
file = base::file.path(base::tempdir(), base::sprintf('%s_SV.txt', x)),
row.names = FALSE, quote = FALSE, sep = '\t', col.names = FALSE
)
})
# Retrieve the BAM files for which SV are present.
bamFiles <- data.frame(BAM = list.files(path = bamFolder, pattern = 'markDup.bam$', full.names = TRUE)) %>% dplyr::mutate(sample = gsub('_Aligned.*bam', '', basename(as.character(BAM))))
bamFiles <- bamFiles %>% dplyr::filter(sample %in% base::names(arriba.SV))
# Generate the Bash commands and write to tmp. file.
z <- base::sprintf('/mnt/onco0002/repository/software/arriba_v2.1.0/arriba -x %s -g /mnt/onco0002/repository/software/ensembl-vep/Plugins/GRCh37/noChrPrefix_gencode.v38lift37.annotation.gtf -a /mnt/onco0002/repository/general/genomes/hsapiens/hg19_HMF/Homo_sapiens.GRCh37.GATK.illumina.fasta -b /mnt/onco0002/repository/software/arriba_v2.1.0/database/blacklist_hg19_hs37d5_GRCh37_v2.1.0.tsv.gz -k /mnt/onco0002/repository/general/annotation/ChimerDB_4.0/knownRecurrentFusions.txt -o %s -d %s -s reverse', bamFiles$BAM, paste0(outputFolder, '/', bamFiles$sample, '_Arriba_fusions.tsv'), paste0(tempdir(), '/', bamFiles$sample,'_SV.txt'))
outputFile <- base::tempfile()
utils::write.table(z, outputFile, row.names = FALSE, quote = FALSE, sep = '\t', col.names = FALSE)
# Return statement --------------------------------------------------------
base::sprintf('Run the following command in Bash:\ncat %s | parallel -j <threads>', outputFile) %>% ParallelLogger::logInfo()
return(outputFile)
}
#' @title Import Arriba results.
#'
#' @param inputFolder (character): Path to folder containing Arriba output.
#'
#' @return (tibble) Returns a tibble of Arriba-findings.
#' @examples
#' \donttest{
#'
#' importArriba('/mnt/data2/hartwig/DR71/Oct2020/dataHMF/RNASeq/Arriba/')
#'
#' }
#' @author Job van Riet \email{j.vanriet@erasmusmc.nl}
#' @family RNA-Seq
#' @export
importArriba <- function(inputFolder){
# Input validation --------------------------------------------------------
checkmate::checkAccess(inputFolder, access = 'r')
base::sprintf('Importing Arriba results from: %s', inputFolder) %>% ParallelLogger::logInfo()
# Import results ----------------------------------------------------------
files.Arriba <- list.files(inputFolder, pattern = '_Arriba_fusions.tsv$', full.names = TRUE)
results.Arriba <- pbapply::pblapply(files.Arriba, function(x){
# Import.
z <- readr::read_tsv(x, col_types = 'cccccccccdddddcccccccccccccccc')
# Add sample-identifier.
z$sample <- base::gsub('_.*', '', base::basename(x))
# Clean columns.
z <- z %>% dplyr::mutate(
gene1 = `#gene1`,
`#gene1` = NULL,
fusion = sprintf('%s-%s', gene1, gene2),
read_identifiers = NULL
)
# Return.
return(z)
})
results.Arriba <- dplyr::bind_rows(results.Arriba)
# Return statement --------------------------------------------------------
return(results.Arriba)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.