R/STAR_utils.R

Defines functions .STAR_twopassGenome STAR_mappability STAR_loadGenomeGTF STAR_buildGenome .validate_STAR_trim_sequence .validate_STAR_fastq_samples .STAR_clean_temp_FASTA_GTF .STAR_get_GTF .STAR_get_FASTA .validate_OTF_STAR_reference .validate_STAR_reference .validate_STAR_version STAR_alignReads STAR_alignExperiment STAR_buildRef STAR_version

Documented in STAR_alignExperiment STAR_alignReads STAR_buildGenome STAR_buildRef STAR_loadGenomeGTF STAR_mappability STAR_version

#' STAR wrappers for building reference for STAR, and aligning RNA-sequencing
#'
#' These STAR helper / wrapper functions allow users to (1) create a STAR
#' genome reference (with or without GTF), (2) align one or more RNA-seq
#' samples, and (3) calculate regions of low mappability. STAR references
#' can be created using one-step (genome and GTF), or two-step (genome first,
#' then on-the-fly with injected GTF) approaches.
#'
#' @details
#' **Pre-requisites**
#'
#' `STAR_buildRef()` and `STAR_buildGenome()` require prepared genome
#' and gene annotation reference retrieved using [getResources], which is run
#' internally by [buildRef]
#'
#' `STAR_loadGenomeGTF()` requires the above, and additionally a STAR genome 
#' created using `STAR_buildGenome()` 
#' 
#' `STAR_alignExperiment()`, `STAR_alignReads()`, and `STAR_mappability()`: 
#' requires a `STAR` genome, which can be built using `STAR_buildRef()` or
#' `STAR_buildGenome()` followed by `STAR_loadGenomeGTF()`
#'
#' **Function Description**
#'
#' For `STAR_buildRef`: this function will create a `STAR` genome reference 
#'   using the same genome FASTA and gene annotation GTF used to create
#'   the SpliceWiz reference. Optionally, it will run `STAR_mappability`
#'   if `also_generate_mappability` is set to `TRUE`
#'
#' For `STAR_alignExperiment`: aligns a set of FASTQ or paired FASTQ files
#'   using the given
#'   `STAR` genome using the `STAR` aligner.
#'   A data.frame specifying sample names and corresponding FASTQ files are
#'   required
#'
#' For `STAR_alignReads`: aligns a single or pair of FASTQ files to the given
#'   `STAR` genome using the `STAR` aligner.
#' 
#' For `STAR_buildGenome`: Creates a STAR genome reference, using ONLY the
#'   FASTA file used to create the SpliceWiz reference. This allows users to
#'   create a single STAR reference for use with multiple transcriptome (GTF)
#'   references (on different occasions). Optionally, it will run 
#'   `STAR_mappability` if `also_generate_mappability` is set to `TRUE`
#'
#' For `STAR_loadGenomeGTF`: Creates an "on-the-fly" STAR genome, injecting GTF
#' from the given SpliceWiz `reference_path`, setting `sjdbOverhang` setting,
#' and (optionally) any spike-ins via the `extraFASTA` parameter.
#' This allows users to create a single STAR reference for use with multiple 
#' transcriptome (GTF) references, with different sjdbOverhang settings,
#' and/or spike-ins (on different occasions or for different projects).
#'
#' For `STAR_mappability`: this function will first
#'   will run [generateSyntheticReads], then use the given `STAR` genome to 
#'   align the synthetic reads using `STAR`. The aligned BAM file will then be
#'   processed using [calculateMappability] to calculate the
#'   lowly-mappable genomic regions,
#'   producing the `MappabilityExclusion.bed.gz` output file.
#'
#' @param reference_path The path to the reference.
#'    [getResources] must first be run using this path
#'    as its `reference_path`
#' @param STAR_ref_path (Default - the "STAR" subdirectory under
#'    `reference_path`) The directory containing the STAR reference to be
#'    used or to contain the newly-generated STAR reference
#' @param also_generate_mappability Whether `STAR_buildRef()` and 
#'   `STAR_buildGenome()` also calculate Mappability Exclusion regions.
#' @param map_depth_threshold (Default `4`) The depth of mapped reads
#'   threshold at or below which Mappability exclusion regions are defined. See
#'   [Mappability-methods]. Ignored if `also_generate_mappability = FALSE`
#' @param sjdbOverhang (Default = 100) A STAR setting indicating the length of
#'   the donor / acceptor sequence on each side of the junctions. Ideally equal
#'   to (mate_length - 1). See the STAR aligner manual for details.
#' @param n_threads The number of threads to run the STAR aligner.
#' @param additional_args A character vector of additional arguments to be
#'   parsed into STAR. See examples below.
#' @param Experiment A two or three-column data frame with the columns denoting
#'   sample names, forward-FASTQ and reverse-FASTQ files. This can be
#'   conveniently generated using [findFASTQ]
#' @param BAM_output_path The path under which STAR outputs the aligned BAM
#'   files. In `STAR_alignExperiment()`, STAR will output aligned
#'   BAMS inside subdirectories of this folder, named by sample names. In
#'   `STAR_alignReads()`, STAR will output directly into this path.
#' @param trim_adaptor The sequence of the Illumina adaptor to trim via STAR's
#'   `--clip3pAdapterSeq` option
#' @param two_pass Whether to use two-pass mapping. In
#'   `STAR_alignExperiment()`, STAR first-pass will align every sample
#'   to generate a list of splice junctions but not BAM files. The junctions
#'   are then given to STAR to generate a temporary genome containing
#'   information about novel junctions, thereby improving novel junction
#'   detection. In `STAR_alignReads()`, STAR will use `--twopassMode Basic`
#' @param fastq_1,fastq_2 In STAR_alignReads: character vectors giving the
#'   path(s) of one or more FASTQ (or FASTA) files to be aligned.
#'   If single reads are to be aligned, omit \code{fastq_2}
#' @param memory_mode The parameter to be parsed to \code{--genomeLoad}; either
#'   \code{NoSharedMemory} or \code{LoadAndKeep} are used.
#' @param overwrite (default `FALSE`) If BAM file(s) already exist from a
#'   previous run, whether these would be overwritten.
#' @param sparsity (default `1`) Sets STAR's `--genomeSAsparseD` option. For
#'   human (and mouse) genomes, set this to `2` to allow STAR to perform
#'   genome generation and mapping using < 16 Gb of RAM, albeit with slightly
#'   lower mapping rate (~ 0.1% lower, according to STAR's author). Setting
#'   this to higher values is experimental (and not tested)
#' @param overwrite (default `FALSE`)
#'   For `STAR_buildRef`, `STAR_buildGenome` and `STAR_loadGenomeGTF` - 
#'     if STAR genome already exists, should it be overwritten.
#'   For `STAR_alignExperiment` and `STAR_alignReads` - if BAM file already
#'     exists, should it be overwritten.
#' @param ... Additional arguments to be parsed into
#'   \code{generateSyntheticReads()}. See \link{Mappability-methods}.
#' @examples
#' # 0) Check that STAR is installed and compatible with SpliceWiz
#'
#' STAR_version()
#' \dontrun{
#'
#' # The below workflow illustrates
#' # 1) Getting the reference resource
#' # 2) Building the STAR Reference, including Mappability Exclusion calculation
#' # 3) Building the SpliceWiz Reference, using the Mappability Exclusion file
#' # 4) Aligning (a) one or (b) multiple raw sequencing samples.
#'
#'
#' # 1) Reference generation from Ensembl's FTP links
#'
#' FTP <- "ftp://ftp.ensembl.org/pub/release-94/"
#'
#' getResources(
#'     reference_path = "Reference_FTP",
#'     fasta = paste0(FTP, "fasta/homo_sapiens/dna/",
#'         "Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"),
#'     gtf = paste0(FTP, "gtf/homo_sapiens/",
#'         "Homo_sapiens.GRCh38.94.chr.gtf.gz")
#' )
#'
#' # 2) Generates STAR genome within the SpliceWiz reference. Also generates
#' # mappability exclusion gzipped BED file inside the "Mappability/" sub-folder
#'
#' STAR_buildRef(
#'     reference_path = "Reference_FTP",
#'     STAR_ref_path = file.path("Reference_FTP", "STAR"),
#'     n_threads = 8,
#'     also_generate_mappability = TRUE
#' )
#'
#' # 2a) Generates STAR genome of the example SpliceWiz genome.
#' #     This demonstrates using custom STAR parameters, as the example 
#' #     SpliceWiz genome is ~100k in length, 
#' #     so --genomeSAindexNbases needs to be
#' #     adjusted to be min(14, log2(GenomeLength)/2 - 1)
#'
#' getResources(
#'     reference_path = "Reference_chrZ",
#'     fasta = chrZ_genome(),
#'     gtf = chrZ_gtf()
#' )
#'
#' STAR_buildRef(
#'     reference_path = "Reference_chrZ",
#'     STAR_ref_path = file.path("Reference_chrZ", "STAR"),
#'     n_threads = 8,
#'     additional_args = c("--genomeSAindexNbases", "7"),
#'     also_generate_mappability = TRUE
#' )
#'
#' # 3) Build SpliceWiz reference using the newly-generated 
#' #    Mappability exclusions
#'
#' #' NB: also specifies to use the hg38 nonPolyA resource
#'
#' buildRef(reference_path = "Reference_FTP", genome_type = "hg38")
#'
#' # 4a) Align a single sample using the STAR reference
#'
#' STAR_alignReads(
#'     fastq_1 = "sample1_1.fastq", fastq_2 = "sample1_2.fastq",
#'     STAR_ref_path = file.path("Reference_FTP", "STAR"),
#'     BAM_output_path = "./bams/sample1",
#'     n_threads = 8
#' )
#'
#' # 4b) Align multiple samples, using two-pass alignment
#'
#' Experiment <- data.frame(
#'     sample = c("sample_A", "sample_B"),
#'     forward = file.path("raw_data", c("sample_A", "sample_B"),
#'         c("sample_A_1.fastq", "sample_B_1.fastq")),
#'     reverse = file.path("raw_data", c("sample_A", "sample_B"),
#'         c("sample_A_2.fastq", "sample_B_2.fastq"))
#' )
#'
#' STAR_alignExperiment(
#'     Experiment = Experiment,
#'     STAR_ref_path = file.path("Reference_FTP", "STAR"),
#'     BAM_output_path = "./bams",
#'     n_threads = 8,
#'     two_pass = TRUE
#' )
#'
#' # - Building a STAR genome (only) reference, and injecting GTF as a
#' #   subsequent step
#' #
#' #   This is useful for users who want to create a single STAR genome, for
#' #   experimentation with different GTF files.
#' #   It is important to note that the chromosome names of the genome (FASTA)
#' #   file and the GTF file needs to be identical. Thus, Ensembl and Gencode
#' #   GTF files should not be mixed (unless the chromosome GTF names have
#' #   been fixed)
#'
#' # - also set sparsity = 2 to build human genome so that it will fit in
#' #   16 Gb RAM. NB: this step's RAM usage can be set using the
#' #   `--limitGenomeGenerateRAM` parameter
#'
#' STAR_buildGenome(
#'     reference_path = "Reference_FTP",
#'     STAR_ref_path = file.path("Reference_FTP", "STAR_genomeOnly"),
#'     n_threads = 8, sparsity = 2,
#'     additional_args = c("--limitGenomeGenerateRAM", "16000000000")
#' )
#'
#' # - Injecting a GTF into a genome-only STAR reference
#' #
#' #   This creates an on-the-fly STAR genome, using a GTF file 
#' #   (derived from a SpliceWiz reference) into a new location.
#' #   This allows a single STAR reference to use multiple GTFs
#' #   on different occasions.
#'
#' STAR_new_ref <- STAR_loadGenomeGTF(
#'     reference_path = "Reference_FTP",
#'     STAR_ref_path = file.path("Reference_FTP", "STAR_genomeOnly"),
#'     STARgenome_output = file.path(tempdir(), "STAR"),
#'     n_threads = 4,
#'     sjdbOverhang = 100
#' )
#' 
#' # This new reference can then be used to align your experiment:
#'
#' STAR_alignExperiment(
#'     Experiment = Experiment,
#'     STAR_ref_path = STAR_new_ref,
#'     BAM_output_path = "./bams",
#'     n_threads = 8,
#'     two_pass = TRUE
#' )
#'
#' # Typically, one should `clean up` the on-the-fly STAR reference (as it is
#' #   large!). If it is in a temporary directory, it will be cleaned up
#' #   when the current R session ends; otherwise this needs to be done manually:
#'
#' unlink(file.path(tempdir(), "STAR"), recursive = TRUE)
#'
#' }
#' @name STAR-methods
#' @aliases
#' STAR_buildRef STAR_alignExperiment STAR_alignReads
#' @seealso
#' [Build-Reference-methods] [findSamples] [Mappability-methods]\cr\cr
#' [The latest STAR documentation](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf)
#' @md
NULL

#' @describeIn STAR-methods Checks whether STAR is installed, and its version
#' @return For `STAR_version()`: The STAR version
#' @export
STAR_version <- function() .validate_STAR_version(type = "message")

#' @describeIn STAR-methods Creates a STAR genome reference, using both FASTA
#'   and GTF files used to create the SpliceWiz reference
#' @return For `STAR_buildRef()`: None
#' @export
STAR_buildRef <- function(
        reference_path,
        STAR_ref_path = file.path(reference_path, "STAR"),
        n_threads = 4,
        overwrite = FALSE,
        sjdbOverhang = 100,
        sparsity = 1,
        also_generate_mappability = FALSE,
        map_depth_threshold = 4,
        additional_args = NULL,
        ...
) {
    .validate_reference_resource(reference_path)
    .validate_STAR_version()
    .validate_path(STAR_ref_path)
    # Unzip reference files
    genome.fa <- .STAR_get_FASTA(reference_path)
    transcripts.gtf <- .STAR_get_GTF(reference_path)
    # Build STAR using defaults
    .log(paste("Building STAR genome from", reference_path), type = "message")

    args <- NULL
    if (!("--runMode" %in% additional_args)) args <- c(
        "--runMode", "genomeGenerate")

    if(dir.exists(STAR_ref_path)) {
        if(
            file.exists(file.path(STAR_ref_path, "genomeParameters.txt")) &&
            !overwrite
        ) {
            .log(paste(
                STAR_ref_path, "already exists.",
                "Set overwrite = TRUE to override"
            ))
        } else if(
            file.exists(file.path(STAR_ref_path, "genomeParameters.txt"))
        ) {
            .log(
                paste(
                    "Overwriting STAR reference in", STAR_ref_path
                ), "message"
            )
            unlink(list.files(STAR_ref_path), recursive = FALSE)
        }
    }

    .validate_path(STAR_ref_path)
    if (!("--genomeDir" %in% additional_args)) args <- c(args,
        "--genomeDir", STAR_ref_path)

    if (!("--genomeFastaFiles" %in% additional_args)) args <- c(args,
        "--genomeFastaFiles", genome.fa)

    if (!("--sjdbGTFfile" %in% additional_args)) args <- c(args,
        "--sjdbGTFfile", transcripts.gtf)

    if (!("--sjdbOverhang" %in% additional_args)) args <- c(args,
        "--sjdbOverhang", sjdbOverhang)

    if (!("--runThreadN" %in% additional_args)) args <- c(args,
        "--runThreadN", .validate_threads(n_threads, as_BPPARAM = FALSE))

    # sanity check sparsity
    if(is.na(as.numeric(sparsity))) sparsity <- 1
    sparsity <- floor(sparsity)
    sparsity <- max(sparsity, 1)
    args <- c(args, "--genomeSAsparseD", sparsity)

    if (!is.null(additional_args) && all(is.character(additional_args))) {
        args <- c(args, additional_args)
    }
    system2(command = "STAR", args = args)

    if (also_generate_mappability) {
        STAR_mappability(
            reference_path = reference_path,
            STAR_ref_path = STAR_ref_path,
            map_depth_threshold = map_depth_threshold,
            n_threads = n_threads,
            ...
        )
    }

    # Clean up
    .STAR_clean_temp_FASTA_GTF(reference_path)
}

#' @describeIn STAR-methods Aligns multiple sets of FASTQ files, belonging to
#'   multiple samples
#' @return For `STAR_alignExperiment()`: None
#' @export
STAR_alignExperiment <- function(
    Experiment, 
    STAR_ref_path, 
    BAM_output_path,
    n_threads = 4,
    overwrite = FALSE,
    two_pass = FALSE, 
    trim_adaptor = "AGATCGGAAG",
    additional_args = NULL
) {

    .validate_STAR_version()
    STAR_ref_path <- .validate_STAR_reference(STAR_ref_path)
    BAM_output_path <- .validate_path(BAM_output_path)

    # Dissect Experiment:
    if (ncol(Experiment) < 2 || ncol(Experiment) > 3) {
        .log(paste("Experiment must be a 2- or 3- column data frame,",
            "with the columns denoting sample name, fastq file (forward),",
            "and (optionally) fastq file (reverse)"))
    } else if (ncol(Experiment) == 2) {
        colnames(Experiment) <- c("sample", "forward")
        fastq_1 <- Experiment[, "forward"]
        fastq_2 <- NULL
        .validate_STAR_fastq_samples(fastq_1)
        paired <- FALSE
    } else if (ncol(Experiment) == 3) {
        colnames(Experiment) <- c("sample", "forward", "reverse")
        fastq_1 <- Experiment[, "forward"]
        fastq_2 <- Experiment[, "reverse"]
        .validate_STAR_fastq_samples(fastq_1, fastq_2)
        paired <- TRUE
    }
    gzipped <- all(grepl(paste0("\\", ".gz", "$"), fastq_1)) &&
        (!paired || all(grepl(paste0("\\", ".gz", "$"), fastq_2)))
    if (is_valid(trim_adaptor)) .validate_STAR_trim_sequence(trim_adaptor)

    samples <- unique(Experiment[, "sample"])
    SJ.files <- NULL
    two_pass_genome <- NULL
    loaded_ref <- NULL
    for (pass in seq_len(ifelse(two_pass, 2, 1))) {
        if (two_pass && pass == 1) message("STAR - first pass")
        if (two_pass && pass == 2) message("STAR - second pass")
        if (pass == 1) {
            ref <- STAR_ref_path
            system2(command = "STAR", args = c(
                "--genomeLoad", "LoadAndExit", "--genomeDir", ref,
                "--outFileNamePrefix", tempdir()
            ))
            loaded_ref <- ref
        }
        for (i in seq_len(length(samples))) {
            # Generate two-pass genome using spoof reads
            if(pass == 2 && is.null(two_pass_genome) && !is.null(SJ.files)) {
                two_pass_genome <- .STAR_twopassGenome(
                    STAR_ref_path, SJ_files = SJ.files$path,
                    n_threads = n_threads
                )
                system2(command = "STAR", args = c(
                    "--genomeLoad", "LoadAndExit", "--genomeDir", ref,
                    "--outFileNamePrefix", tempdir()
                ))
                loaded_ref <- ref <- two_pass_genome
            }

            sample <- samples[i]
            Expr_sample <- Experiment[Experiment[, "sample"] == sample, ]
            if (!paired) {
                fastq_1 <- Expr_sample[, "forward"]
                fastq_2 <- NULL
            } else {
                fastq_1 <- Expr_sample[, "forward"]
                fastq_2 <- Expr_sample[, "reverse"]
            }
            memory_mode <- "LoadAndKeep"
            if (two_pass && pass == 1) {
                additional_args_use <- c(additional_args,
                    "--outSAMtype", "None")
            } else {
                additional_args_use <- additional_args
            }

            .log(paste("Aligning", sample, "using STAR"), "message")
            STAR_alignReads(
                STAR_ref_path = ref,
                BAM_output_path = file.path(BAM_output_path, sample),
                fastq_1 = fastq_1, fastq_2 = fastq_2,
                trim_adaptor = trim_adaptor,
                memory_mode = memory_mode,
                additional_args = additional_args_use,
                n_threads = n_threads,
                overwrite = overwrite
            )

        } # end of FOR loop

        if (two_pass && pass == 1) {
            SJ.files <- findSamples(BAM_output_path, suffix = ".out.tab")
            if (nrow(SJ.files) == 0) {
                .log(paste("In STAR two-pass,",
                    "no SJ.out.tab files were found"))
            }
        }
        .log(paste("Unloading STAR reference:", loaded_ref), "message")
        system2(command = "STAR", args = c(
            "--genomeLoad", "Remove", "--genomeDir", loaded_ref,
                "--outFileNamePrefix", tempdir()
        ))
        loaded_ref <- NULL
        
        if(pass == 2 && !is.null(two_pass_genome)) {
            unlink(file.path(tempdir(), "STAR_twopass"), recursive = TRUE)
        }
    }
}

#' @describeIn STAR-methods Aligns a single sample (with single or paired FASTQ
#'   or FASTA files)
#' @return For `STAR_alignReads()`: None
#' @export
STAR_alignReads <- function(
        fastq_1 = c("./sample_1.fastq"), 
        fastq_2 = NULL,
        STAR_ref_path, 
        BAM_output_path,
        n_threads = 4,
        overwrite = FALSE,
        two_pass = FALSE,
        trim_adaptor = "AGATCGGAAG",
        memory_mode = "NoSharedMemory",
        additional_args = NULL
) {
    expectedBAM <- file.path(BAM_output_path, "Aligned.out.bam")
    if(!overwrite) {
        if(file.exists(expectedBAM)) {
            .log(paste(
                expectedBAM, 
                "already exists. Set overwrite = TRUE to overwrite"
                ), "warning"
            )
            message("") # for \n
            return()
        }
    } else {
        .log(
            paste(
                "Overwriting", expectedBAM
            ), "message"
        )
        unlink(expectedBAM)
    }

    .validate_STAR_version()
    STAR_ref_path <- .validate_STAR_reference(STAR_ref_path)
    .validate_STAR_fastq_samples(fastq_1, fastq_2)

    paired <- (length(fastq_1) == length(fastq_2))
    gzipped <- all(grepl(paste0("\\", ".gz", "$"), fastq_1)) &&
        (!paired || all(grepl(paste0("\\", ".gz", "$"), fastq_2)))
    if (is_valid(trim_adaptor)) .validate_STAR_trim_sequence(trim_adaptor)

    BAM_output_path <- .validate_path(BAM_output_path)
    # Load STAR reference

    # Remove duplication:
    args <- NULL
    if (!("--genomeLoad" %in% additional_args)) 
        args <- c("--genomeLoad", memory_mode)
    if (!("--runThreadN" %in% additional_args)) 
        args <- c(args,
            "--runThreadN", .validate_threads(n_threads, as_BPPARAM = FALSE))
    if (!("--genomeDir" %in% additional_args)) 
        args <- c(args, "--genomeDir", STAR_ref_path)
    if (!("--outFileNamePrefix" %in% additional_args))
        args <- c(args, "--outFileNamePrefix",
            paste0(BAM_output_path, "/"))

    if (!("--outStd" %in% additional_args)) args <- c(args, "--outStd", "Log")
    if (!("--outBAMcompression" %in% additional_args)) args <- c(args, 
        "--outBAMcompression", "6")

    if (!("--outSAMstrandField" %in% additional_args)) args <- c(args, 
        "--outSAMstrandField", "intronMotif")

    if (!("--outSAMunmapped" %in% additional_args)) 
        args <- c(args, "--outSAMunmapped", "None")

    if (!("--outFilterMultimapNmax" %in% additional_args))
        args <- c(args, "--outFilterMultimapNmax", "1")

    if (!("--outSAMtype" %in% additional_args))
        args <- c(args, "--outSAMtype", "BAM", "Unsorted")

    if (two_pass) args <- c(args, "--twopassMode", "Basic")

    args <- c(args, "--readFilesIn", paste(fastq_1, collapse = ","))

    if (paired) args <- c(args, paste(fastq_2, collapse = ","))
    if (gzipped) args <- c(args, "--readFilesCommand", shQuote("gzip -dc"))
    if (is_valid(trim_adaptor)) {
        if(.validate_STAR_version(silent = TRUE) >= "2.7.8a" && paired) {
            args <- c(args, 
                "--clip3pAdapterSeq", trim_adaptor, trim_adaptor,
                "--clip3pAdapterMMp", "0.1", "0.1"
            )      
        } else {
            args <- c(args, "--clip3pAdapterSeq", trim_adaptor)        
        }
    }

    if (!is.null(additional_args) && all(is.character(additional_args)))
        args <- c(args, additional_args)

    system2(command = "STAR", args = args)
}

.validate_STAR_version <- function(type = "error", silent = FALSE) {
    if (!(Sys.info()["sysname"] %in% c("Linux", "Darwin"))) {
        .log("STAR is only supported on Linux or MacOS", type = type)
        return(NULL)
    }
    # Super-safe checking if STAR is available
    which_star <- NULL
    tryCatch({
        which_star <- system2("which", "STAR", stdout = TRUE)
    }, error = function(e) {
        which_star <- NULL
    }, warning = function(e) {
        which_star <- NULL
    })
    if (is.null(which_star)) {
        .log("STAR is not installed", type = type)
        return(NULL)
    }
    star_version <- NULL
    tryCatch({
        star_version <- system2("STAR", "--version", stdout = TRUE)
    }, error = function(e) {
        star_version <- NULL
    })
    if (is.null(star_version)) {
        .log("STAR is not installed", type = type)
    }
    if (!is.null(star_version) && star_version < "2.5.0") {
        .log(paste("STAR version < 2.5.0 is not supported;",
            "current version:", star_version), type = type)
    }
    if (!is.null(star_version) && star_version >= "2.5.0" && !silent) {
        .log(paste("STAR version", star_version), type = "message")
    }
    return(star_version)
}

.validate_STAR_reference <- function(STAR_ref_path) {
    if (!file.exists(file.path(STAR_ref_path, "genomeParameters.txt")))
        .log(paste(
            STAR_ref_path, "does not appear to be a valid STAR reference"))
    return(normalizePath(STAR_ref_path))
}

.validate_OTF_STAR_reference <- function(STAR_ref_path) {
    if (!file.exists(file.path(STAR_ref_path, "genomeParameters.txt"))) {
        .log(paste(STAR_ref_path, 
            "does not appear to be a valid STAR reference"))    
    } else {
        .log(paste(STAR_ref_path, "on-the-fly STAR genome created"), "message")
    }
    return(normalizePath(STAR_ref_path))
}

# Creates a temporary FASTA file from locally-stored TwoBit
.STAR_get_FASTA <- function(reference_path) {
    genome.fa <- file.path(reference_path, "resource", "genome.fa")
    if (!file.exists(genome.fa)) {
        genome.fa <- paste0(genome.fa, ".temp")
        genome.2bit <- file.path(reference_path, "resource", "genome.2bit")
        if (!file.exists(genome.2bit)) {
            .log(paste(genome.2bit, "not found"))
        }
        .log("Extracting temp genome FASTA from TwoBit file", "message")
        tmp <- rtracklayer::import(TwoBitFile(genome.2bit))
        rtracklayer::export(
            tmp,
            genome.fa, "fasta"
        )
        rm(tmp)
        gc()
    }
    return(genome.fa)
}

# Creates a temporary unzipped GTF for STAR
.STAR_get_GTF <- function(reference_path) {
    transcripts.gtf <- file.path(reference_path, "resource", "transcripts.gtf")
    if (!file.exists(transcripts.gtf)) {
        if (!file.exists(paste0(transcripts.gtf, ".gz"))) {
            .log(paste(paste0(transcripts.gtf, ".gz"), "not found"))
        }
        .log("Extracting temp Annotation GTF from GZ file", "message")
        R.utils::gunzip(paste0(transcripts.gtf, ".gz"), remove = FALSE,
            overwrite = TRUE)
        file.rename(transcripts.gtf, paste0(transcripts.gtf, ".temp"))
        transcripts.gtf <- paste0(transcripts.gtf, ".temp")
    }
    return(transcripts.gtf)
}

.STAR_clean_temp_FASTA_GTF <- function(reference_path) {
    .log("Cleaning temp genome / gene annotation files", "message")
    genome.fa <- file.path(reference_path, "resource", "genome.fa.temp")
    transcripts.gtf <- file.path(reference_path,
        "resource", "transcripts.gtf.temp")
    if (file.exists(genome.fa)) file.remove(genome.fa)
    if (file.exists(transcripts.gtf)) file.remove(transcripts.gtf)
}

.validate_STAR_fastq_samples <- function(fastq_1, fastq_2) {
    if (!is_valid(fastq_2)) {
        # assume single
        if (!all(file.exists(fastq_1))) {
            .log("Some fastq files were not found")
        }
    } else {
        if (length(fastq_2) != length(fastq_1)) {
            .log(paste("There must be equal numbers of",
                "forward and reverse fastq samples"))
        }
        if (!all(file.exists(fastq_1)) || !all(file.exists(fastq_2))) {
            .log("Some fastq files were not found")
        }
    }
    paired <- (length(fastq_1) == length(fastq_2))
    gzipped <- all(grepl(paste0("\\", ".gz", "$"), fastq_1)) &&
        (!paired || all(grepl(paste0("\\", ".gz", "$"), fastq_2)))
    if (!gzipped &&
        (
            any(grepl(paste0("\\", ".gz", "$"), fastq_1)) ||
            (paired && any(grepl(paste0("\\", ".gz", "$"), fastq_2)))
        )
    ) {
        .log(paste("A mixture of gzipped and uncompressed",
            "fastq files found.", "You must supply either all",
            "gzipped or all uncompressed fastq files"))
    }
}

.validate_STAR_trim_sequence <- function(sequence) {
    if (length(sequence) != 1) {
        .log("Multiple adaptor sequences are not supported")
    }
    tryCatch({
        ACGT_sum <- sum(Biostrings::letterFrequency(
            Biostrings::DNAString(sequence),
            letters = "AGCT", OR = 0))
    }, error = function(e) ACGT_sum <- 0)
    if (nchar(sequence) != ACGT_sum) {
        .log("Adaptor sequence can only contain A, C, G or T")
    }
}

# New approach to STAR utilities
#
# Functions
# - STAR_buildGenome: builds transcriptome-agnostic genome STAR reference
# - STAR_loadGenome: loads STAR genome into shared memory
#   - adding extra features as required, including:
#     - transcriptome GTF file
#     - sjdbOverhang settings
#     - any spike-ins (as FASTA files)
# - STAR_alignReads: aligns single or paired FASTQ reads using either
#     a given STAR genome, or the loaded STAR genome
# - STAR_alignExperiment: pipeline for STAR analysis with multiple samples

#' @describeIn STAR-methods Creates a STAR genome reference, using ONLY the
#'   FASTA file used to create the SpliceWiz reference
#' @return For `STAR_buildGenome()`: None
#' @export
STAR_buildGenome <- function(
    reference_path,
    STAR_ref_path = file.path(reference_path, "STAR"),
    n_threads = 4,
    overwrite = FALSE,
    sparsity = 1,
    also_generate_mappability = FALSE,
    map_depth_threshold = 4,
    additional_args = NULL,
    ...
) {
    .validate_reference_resource(reference_path)
    .validate_STAR_version()

    genome.fa <- .STAR_get_FASTA(reference_path)

    .log(paste("Building STAR genome from", reference_path), type = "message")

    args <- NULL
    if (!("--runMode" %in% additional_args)) args <- c(
        "--runMode", "genomeGenerate")

    if(dir.exists(STAR_ref_path)) {
        if(
            file.exists(file.path(STAR_ref_path, "genomeParameters.txt")) &&
            !overwrite
        ) {
            .log(paste(
                STAR_ref_path, "already exists.",
                "Set overwrite = TRUE to override"
            ))
        } else if(
            file.exists(file.path(STAR_ref_path, "genomeParameters.txt"))
        ) {
            .log(
                paste(
                    "Overwriting STAR reference in", STAR_ref_path
                ), "message"
            )
            unlink(list.files(STAR_ref_path), recursive = FALSE)
        }
    }

    .validate_path(STAR_ref_path)
    if (!("--genomeDir" %in% additional_args)) args <- c(args,
        "--genomeDir", STAR_ref_path)


    if (!("--genomeFastaFiles" %in% additional_args)) args <- c(args,
        "--genomeFastaFiles", genome.fa)

    if (!("--runThreadN" %in% additional_args)) args <- c(args,
        "--runThreadN", .validate_threads(n_threads, as_BPPARAM = FALSE))

    # sanity check sparsity
    if(is.na(as.numeric(sparsity))) sparsity <- 1
    sparsity <- floor(sparsity)
    sparsity <- max(sparsity, 1)
    
    args <- c(args, "--genomeSAsparseD", sparsity)

    if (!is.null(additional_args) && all(is.character(additional_args))) {
        args <- c(args, additional_args)
    }
    system2(command = "STAR", args = args)

    # Perform STAR_mappability by inserting GTF from SpliceWiz reference
    if (also_generate_mappability) {
        tmpGenome <- STAR_loadGenomeGTF(
            STAR_ref_path,
            reference_path = reference_path,
            sjdbOverhang = 100,
            STARgenome_output = file.path(tempdir(), "STARmap"),
            overwrite = TRUE
        )
        
        STAR_mappability(
            reference_path = reference_path,
            STAR_ref_path = tmpGenome,
            map_depth_threshold = map_depth_threshold,
            n_threads = n_threads,
            ...
        )
        
        unlink(file.path(tempdir(), "STARmap"), recursive = TRUE)
    }

    # Clean up
    .STAR_clean_temp_FASTA_GTF(reference_path)
}


#' @describeIn STAR-methods Creates an "on-the-fly" STAR genome, injecting GTF
#' from the given SpliceWiz `reference_path`, setting `sjdbOverhang` setting,
#' and (optionally) any spike-ins as `extraFASTA`
#' @param extraFASTA (default `""`) One or more FASTA files containing spike-in
#'   genome sequences (e.g. ERCC, Sequins), as required.
#' @param STARgenome_output The output path of the created on-the-fly genome
#' @return For `STAR_loadGenomeGTF()`:
#'   The path of the on-the-fly STAR genome, typically in the subdirectory
#'   "_STARgenome" within the given `STARgenome_output` directory
#' @export
STAR_loadGenomeGTF <- function(
    reference_path,
    STAR_ref_path,
    STARgenome_output = file.path(tempdir(), "STAR"),
    n_threads = 4,
    overwrite = FALSE,
    sjdbOverhang = 100,
    extraFASTA = "",
    additional_args = NULL
) {
    .validate_reference_resource(reference_path)
    .validate_STAR_version()
    .validate_STAR_reference(STAR_ref_path)

    # Prepare GTF
    transcripts.gtf <- .STAR_get_GTF(reference_path)

    .log(paste(
        "Loading STAR genome from", STAR_ref_path, "with injected GTF"),
        type = "message"
    )

    args <- NULL

    if (!("--genomeDir" %in% additional_args)) args <- c(args,
        "--genomeDir", STAR_ref_path)

    if (!("--genomeFastaFiles" %in% additional_args)) {
        if(all(file.exists(extraFASTA))) {
            args <- c(args, "--genomeFastaFiles", extraFASTA)
        } else if(length(extraFASTA) != 1 || extraFASTA != "") {
            .log(paste("Some files in extraFASTA do not exist... aborting"))
        }
    }

    if (!("--sjdbGTFfile" %in% additional_args)) args <- c(args,
        "--sjdbGTFfile", transcripts.gtf)

    if (!("--sjdbOverhang" %in% additional_args)) {
        # sanity check overhangs
        if(is.na(as.numeric(sjdbOverhang))) sjdbOverhang <- 100
        sjdbOverhang <- floor(sjdbOverhang)
        sjdbOverhang <- max(sjdbOverhang, 25)
        args <- c(args, "--sjdbOverhang", sjdbOverhang)
    }

    if (!("--runThreadN" %in% additional_args)) args <- c(args,
        "--runThreadN", .validate_threads(n_threads, as_BPPARAM = FALSE))

    nch <- nchar(STARgenome_output)
    if(substr(STARgenome_output, nch,nch) == "/")
        STARgenome_output <- substr(STARgenome_output, 1, nch-1)
    # create output directory (destroy old STAR ref if already exists)
    if(dir.exists(STARgenome_output)) {
        if(
            file.exists(file.path(STARgenome_output, "genomeParameters.txt")) &&
            !overwrite
        ) {
            .log(paste(
                STARgenome_output, "already exists.",
                "Set overwrite = TRUE to override"
            ))
        } else if(
            file.exists(file.path(STARgenome_output, "genomeParameters.txt"))
        ) {
            # Likely location of STAR ref, remove before generating STAR ref
            unlink(STARgenome_output, recursive = FALSE)
        }
    }
    .validate_path(STARgenome_output)    
    args <- c(args, "--outFileNamePrefix", paste0(STARgenome_output,"/"))

    if (!is.null(additional_args) && all(is.character(additional_args))) {
        args <- c(args, additional_args)
    }

    args <- c(args, "--sjdbInsertSave", "All")
    args <- c(args, "--readFilesIn", 
        system.file(
            "extdata/spoof_1.fq",
            package = "SpliceWiz"
        ),
        system.file(
            "extdata/spoof_1.fq",
            package = "SpliceWiz"
        )
    )

    system2(command = "STAR", args = args)
   
    .STAR_clean_temp_FASTA_GTF(reference_path)

    return(.validate_OTF_STAR_reference(
        file.path(STARgenome_output, "_STARgenome")
    ))
}

#' @describeIn STAR-methods Calculates lowly-mappable genomic regions using STAR
#' @return For `STAR_mappability()`: None
#' @export
STAR_mappability <- function(
        reference_path,
        STAR_ref_path = file.path(reference_path, "STAR"),
        map_depth_threshold = 4,
        n_threads = 4,
        ...
) {
    .validate_reference_resource(reference_path)
    .validate_STAR_version()
    STAR_ref_path <- .validate_STAR_reference(STAR_ref_path)
    mappability_reads_fasta <- file.path(
        reference_path, "Mappability", "Reads.fa")
    generateSyntheticReads(reference_path, ...)

    .log(paste("Aligning genome fragments back to the genome, from:",
        mappability_reads_fasta), type = "message")
    aligned_bam <- file.path(reference_path, "Mappability",
        "Aligned.out.bam")
    STAR_alignReads(
        fastq_1 = mappability_reads_fasta,
        fastq_2 = NULL,
        STAR_ref_path = STAR_ref_path,
        BAM_output_path = dirname(aligned_bam),
        n_threads = n_threads,
        trim_adaptor = "",
        additional_args = c(
            "--outSAMstrandField", "None",
            "--outSAMattributes", "None"
        )
    )
    if (file.exists(aligned_bam)) {
        # Cleaan up fasta
        if (file.exists(mappability_reads_fasta))
            file.remove(mappability_reads_fasta)

        .log(paste("Calculating Mappability from:", aligned_bam),
            type = "message")
        calculateMappability(
            reference_path = reference_path,
            aligned_bam = aligned_bam,
            threshold = map_depth_threshold,
            n_threads = n_threads
        )
    } else {
        .log("STAR failed to align mappability reads", "warning")
    }
    if (file.exists(file.path(reference_path, "Mappability",
            "MappabilityExclusion.bed.gz"))) {
        message("Mappability Exclusion calculations complete")
        # Clean up BAM
        if (file.exists(aligned_bam)) file.remove(aligned_bam)
    } else {
        .log("Mappability Exclusion calculations not performed", "warning")
    }
}

.STAR_twopassGenome <- function(
    STAR_ref_path,
    STARgenome_output = file.path(tempdir(), "STAR_twopass"),
    SJ_files = "",
    n_threads = 4,
    overwrite = TRUE
) {
    .validate_STAR_version()
    .validate_STAR_reference(STAR_ref_path)

    .log("Loading STAR two-pass genome", type = "message")
    args <- c("--genomeDir", STAR_ref_path)

    args <- c(args, "--sjdbFileChrStartEnd",
        paste(SJ_files, collapse = " "),
        "--sjdbInsertSave", "All"
    )

    args <- c(args, "--runThreadN", 
        .validate_threads(n_threads, as_BPPARAM = FALSE))

    nch <- nchar(STARgenome_output)
    if(substr(STARgenome_output, nch,nch) == "/")
        STARgenome_output <- substr(STARgenome_output, 1, nch-1)
    # create output directory (destroy old STAR ref if already exists)
    if(dir.exists(STARgenome_output)) {
        if(
            file.exists(file.path(STARgenome_output, "genomeParameters.txt")) &&
            !overwrite
        ) {
            .log(paste(
                STARgenome_output, "already exists.",
                "Set overwrite = TRUE to override"
            ))
        } else if(
            file.exists(file.path(STARgenome_output, "genomeParameters.txt"))
        ) {
            # Likely location of STAR ref, remove before generating STAR ref
            unlink(STARgenome_output, recursive = FALSE)
        }
    }
    .validate_path(STARgenome_output)    
    args <- c(args, "--outFileNamePrefix", paste0(STARgenome_output,"/"))

    args <- c(args, "--readFilesIn", 
        system.file(
            "extdata/spoof_1.fq",
            package = "SpliceWiz"
        ),
        system.file(
            "extdata/spoof_1.fq",
            package = "SpliceWiz"
        )
    )

    system2(command = "STAR", args = args)

    return(.validate_OTF_STAR_reference(
        file.path(STARgenome_output, "_STARgenome")
    ))
}
alexchwong/SpliceWiz documentation built on March 17, 2024, 3:16 a.m.