R/05_04_assembly_quantification_stringtie.R

Defines functions stringtie2se stringtie_quantify taco_merge taco_clean stringtie_assemble

Documented in stringtie2se stringtie_assemble stringtie_quantify taco_merge

#' Assemble transcripts with StringTie
#' 
#' @param sample_info Data frame of sample metadata created with the
#' functions \code{create_sample_info} and \code{infer_strandedness}.
#' The function \code{infer_strandedness} adds a column named "Orientation" 
#' with library strandedness information, which is mandatory.
#' @param qc_table Data frame of fastp summary statistics as returned
#' by \code{summary_stats_fastp()}.
#' @param mappingdir Directory where .bam files are stored.
#' @param gff_path Path to GFF/GTF file with annotations.
#' @param stringtiedir Directory where StringTie output files will be stored.
#' @param threads Number of threads to use. Default: 1.
#' 
#' @return A 2-column data frame with samples in the first column and status
#' in the second column, with "OK" if transcripts were assembled, and 
#' NA otherwise.
#' @export
#' @rdname stringtie_assemble
#' @examples
#' data(sample_info)
#' qc_table <- summary_stats_fastp(system.file("extdata", package = "bears"))
#' mappingdir <- system.file("extdata", package="bears")
#' gff_path <- system.file("extdata", "Homo_sapiens.GRCh37.75_subset.gtf",
#'                         package="bears")
#' stringtiedir <- tempdir()
#' if(stringtie_is_installed()) {
#'     a <- stringtie_assemble(sample_info, qc_table, mappingdir, 
#'                             gff_path, stringtiedir)
#' }
stringtie_assemble <- function(
    sample_info = NULL, qc_table = NULL,
    mappingdir = "results/04_read_mapping",
    gff_path = NULL,
    stringtiedir = "results/05_quantification/stringtie",
    threads = 1) {
    
    if(!stringtie_is_installed()) { stop("Unable to find StringTie in PATH.") }
    if(!dir.exists(stringtiedir)) { dir.create(stringtiedir, recursive = TRUE) }
    
    bamfiles <- list.files(mappingdir, pattern = ".bam", full.names = FALSE)
    bamfiles <- gsub("Aligned.*", "", bamfiles)
    sample_meta <- sample_info[!duplicated(sample_info$BioSample), ]
    sample_meta <- sample_meta[sample_meta$BioSample %in% bamfiles, ]
    
    t <- lapply(seq_len(nrow(sample_meta)), function(x) {
        var <- var2list(sample_meta, index = x)
        infile <- paste0(mappingdir, "/", 
                         var$biosample, "Aligned.sortedByCoord.out.bam")
        outfile <- paste0(stringtiedir, "/assembly/", var$biosample, ".gff")
        orientation <- sample_meta[x, "Orientation"]
        lib <- translate_strandedness(orientation, var$layout)$stringtie
        read_len <- qc_table[qc_table$Sample == var$biosample, 
                                 "after_meanlength"]
        anchor_len <- read_len / 4
        args <- c(infile, "-G", gff_path, "-o", outfile, 
                  "-p", threads, "-j 5 -c 10 -a", anchor_len, lib)
        system2("stringtie", args = args)
    })
    gff_files <- list.files(paste0(stringtiedir, "/assembly"), full.names=FALSE)
    gff_files <- gsub(".gff", "", gff_files)
    df_status <- data.frame(sample = sample_meta$BioSample)
    df_status$status <- ifelse(df_status$sample %in% gff_files, "OK", NA)
    return(df_status)
}


#' Clean directories after running TACO
#' 
#' @param out2dir Directory where final assembly.gtf file is.
#' @param final_dir Directory where final assembly.gtf file will be stored.
#' 
#' @return A NULL object.
#' @importFrom fs file_move dir_delete file_delete
#' @noRd
taco_clean <- function(out2dir = NULL, final_dir = NULL) {
    
    move <- fs::file_move(paste0(out2dir, "/assembly.gtf"),
                          paste0(final_dir, "/final_assembly.gtf"))
    dir_list <- list.dirs(final_dir, recursive = FALSE, full.names=TRUE)
    dir_list <- dir_list[grepl("biosample_", dir_list)]
    del <- fs::dir_delete(dir_list)
    del <- fs::dir_delete(out2dir)
    txtfiles <- list.files(final_dir, pattern = ".txt", full.names = TRUE)
    txtdel <- fs::file_delete(txtfiles)
    return(NULL)
}

#' Merge assembled transcripts with TACO
#'
#' This function merges GFF files from StringTie assembly into a single GFF.
#' 
#' @param sample_info Data frame of sample metadata created with the
#' function \code{create_sample_info}.
#' @param stringtiedir Directory where StringTie output files will be stored.
#' @param threads Number of threads to use. Default: 1.
#'
#' @details The GFF files for each BioSample are first merged into 
#' BioProject-level GFF files. Then, these Bioproject-level GFF files are
#' merged into a single file that represents the whole set.
#'
#' @return A 2-column data frame with the TACO run status, with "OK" if the
#' program merged transcripts successfully, and NA otherwise.
#' @export
#' @rdname taco_merge
#' @examples
#' data(sample_info)
#' qc_table <- summary_stats_fastp(system.file("extdata", package = "bears"))
#' mappingdir <- system.file("extdata", package="bears")
#' gff_path <- system.file("extdata", "Homo_sapiens.GRCh37.75_subset.gtf",
#'                         package="bears")
#' stringtiedir <- tempdir()
#' if(stringtie_is_installed()) {
#'     a <- stringtie_assemble(sample_info, qc_table, mappingdir, 
#'                             gff_path, stringtiedir)
#' }
#' if(taco_is_installed()) {
#'     taco_merge(sample_info, stringtiedir)
#' }
taco_merge <- function(
    sample_info = NULL, 
    stringtiedir = "results/05_quantification/stringtie",
    threads = 1) {
    
    if(!taco_is_installed()) { stop("Unable to find TACO in PATH.") }
    assemblydir <- paste0(stringtiedir, "/assembly")
    mergeddir <- paste0(assemblydir, "/merged_assembly")
    if(!dir.exists(mergeddir)) { dir.create(mergeddir, recursive = TRUE ) }
    
    gffs <- list.files(assemblydir, pattern = ".gff", full.names = FALSE)
    gffs <- gsub(".gff", "", gffs)
    sample_meta <- sample_info[!duplicated(sample_info$BioSample), ]
    sample_meta <- sample_meta[sample_meta$BioSample %in% gffs, ]
    
    # Create a .gtf file for each bioproject
    bioproject_list <- split(sample_meta$BioSample, sample_meta$BioProject)
    file_list <- lapply(seq_along(bioproject_list), function(x) {
        files <- paste0(assemblydir, "/", bioproject_list[[x]], ".gff")
        txt_file1 <- paste0(mergeddir, "/", names(bioproject_list)[x], ".txt")
        writeLines(files, txt_file1)
        outdir1 <- paste0(mergeddir, "/biosample_", names(bioproject_list)[x])
        args1 <- c("--num-processes", threads, "-o", outdir1, 
                   "--gtf-expr-attr TPM", txt_file1)
        system2("taco_run", args = args1)
    })
    # Merge everything into a single .gtf file
    bp_gffs <- paste0(mergeddir, "/biosample_", names(bioproject_list),
                      "/assembly.gtf")
    txt_file2 <- paste0(mergeddir, "/all_bioprojects.txt")
    writeLines(bp_gffs, txt_file2)
    outdir2 <- paste0(mergeddir, "/final_assembly")
    args2 <- c("--num-processes", threads, "-o", outdir2, 
               "--gtf-expr-attr expr", txt_file2)
    system2("taco_run", args = args2)
    
    # Cleaning up
    c <- taco_clean(outdir2, mergeddir)
    df_status <- data.frame(file = "final_assembly.gtf",
                            status = NA)
    if(file.exists(paste0(mergeddir, "/final_assembly.gtf"))) {
        df_status$status <- "OK"
    }
    return(df_status)
}


#' Quantify expression in TPM with StringTie
#' 
#' @param sample_info Data frame of sample metadata created with the
#' functions \code{create_sample_info} and \code{infer_strandedness}.
#' The function \code{infer_strandedness} adds a column named "Orientation" 
#' with library strandedness information, which is mandatory.
#' @param qc_table Data frame of fastp summary statistics as returned
#' by \code{summary_stats_fastp()}.
#' @param mappingdir Directory where .bam files are stored.
#' @param gff_path Path to GFF/GTF file with annotations.
#' @param stringtiedir Directory where StringTie output files will be stored.
#' @param threads Number of threads to use. Default: 1.
#' 
#' @return A 2-column data frame with samples in the first column and status
#' in the second column, with "OK" if expression in TPM was obtained, and 
#' NA otherwise.
#' @export
#' @rdname stringtie_quantify
#' @examples
#' data(sample_info)
#' qc_table <- summary_stats_fastp(system.file("extdata", package = "bears"))
#' mappingdir <- system.file("extdata", package="bears")
#' gff_path <- system.file("extdata", "Homo_sapiens.GRCh37.75_subset.gtf",
#'                         package="bears")
#' stringtiedir <- tempdir()
#' if(stringtie_is_installed()) {
#'     a <- stringtie_quantify(sample_info, qc_table, mappingdir, 
#'                             gff_path, stringtiedir)
#' }
stringtie_quantify <- function(
    sample_info = NULL, qc_table = NULL,
    mappingdir = "results/04_read_mapping",
    gff_path = NULL,
    stringtiedir = "results/05_quantification/stringtie",
    threads = 1) {
    
    if(!stringtie_is_installed()) { stop("Unable to find StringTie in PATH.") }
    if(!dir.exists(stringtiedir)) { dir.create(stringtiedir, recursive = TRUE) }
    quantdir <- paste0(stringtiedir, "/quant/")
    if(!dir.exists(quantdir)) { dir.create(quantdir, recursive = TRUE) }
    
    bamfiles <- list.files(mappingdir, pattern = ".bam", full.names = FALSE)
    bamfiles <- gsub("Aligned.*", "", bamfiles)
    sample_meta <- sample_info[!duplicated(sample_info$BioSample), ]
    sample_meta <- sample_meta[sample_meta$BioSample %in% bamfiles, ]
    
    t <- lapply(seq_len(nrow(sample_meta)), function(x) {
        var <- var2list(sample_meta, index = x)
        infile <- paste0(mappingdir, "/", 
                         var$biosample, "Aligned.sortedByCoord.out.bam")
        outgtf <- paste0(quantdir, var$biosample, ".gtf")
        outbg <- paste0(quantdir, var$biosample, ".bw")
        outabund <- paste0(quantdir, var$biosample, ".gene_abund.out")
        outcov <- paste0(quantdir, var$biosample, ".cov")
        orientation <- sample_meta[x, "Orientation"]
        lib <- translate_strandedness(orientation, var$layout)$stringtie
        read_len <- qc_table[qc_table$Sample == var$biosample, 
                                 "after_meanlength"]
        anchor_len <- read_len / 4
        args <- c(infile, "-G", gff_path, "-o", outgtf, "-p", threads, 
                  "-e -b", outbg, "-A", outabund, "-C", outcov, 
                  "-j 5 -c 10 -a", anchor_len, lib)
        system2("stringtie", args = args)
    })
    ab_files <- list.files(paste0(stringtiedir, "/quant"), pattern=".out",
                            full.names=FALSE)
    ab_files <- gsub(".gene_abund.out", "", ab_files)
    df_status <- data.frame(sample = sample_meta$BioSample)
    df_status$status <- ifelse(df_status$sample %in% ab_files, "OK", NA)
    return(df_status)
}


#' Create a SummarizedExperiment object from StringTie output
#' 
#' @param sample_info Data frame of sample metadata created with the
#' functions \code{create_sample_info}
#' @param stringtiedir Directory where StringTie output files are stored.
#' @param level Character indicating to which level expression must be 
#' quantified in the SE object. One of "gene" (default), "transcript", 
#' or "both". For "both", the SE object will have two assays named "transcript"
#' and "gene".
#' @param tx2gene Data frame of correspondence between genes and transcripts, 
#' with gene IDs in the first column and transcript IDs in the second column.
#' Only required if level = 'gene' or 'both'. 
#' @param countsFromAbundance Exactly as in \code{tximport::tximport},
#' whether ot not to generate estimated counts using abundance estimates.
#' One of "lengthScaledTPM" (default), "scaledTPM", "no", or "dtuScaledTPM".
#' See ?tximport for details.
#' 
#' @return A SummarizedExperiment object with gene/transcript expression
#' levels and sample metadata.
#' @importFrom tximport tximport summarizeToGene
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @export
#' @rdname stringtie2se
#' @examples 
#' data(sample_info)
#' qc_table <- summary_stats_fastp(system.file("extdata", package = "bears"))
#' data(tx2gene)
#' mappingdir <- system.file("extdata", package="bears")
#' gff_path <- system.file("extdata", "Homo_sapiens.GRCh37.75_subset.gtf",
#'                         package="bears")
#' stringtiedir <- tempdir()
#' if(stringtie_is_installed()) {
#'     a <- stringtie_quantify(sample_info, qc_table, mappingdir, 
#'                             gff_path, stringtiedir)
#'     se_gene <- stringtie2se(sample_info, stringtiedir, tx2gene = tx2gene)
#' }
stringtie2se <- function(sample_info = NULL,
                         stringtiedir = "results/05_quantification/stringtie",
                         level = "gene",
                         tx2gene = NULL,
                         countsFromAbundance = "lengthScaledTPM") {
    
    quantdir <- paste0(stringtiedir, "/quant")
    dirs <- list.dirs(quantdir, full.names = TRUE, recursive=FALSE)
    dirs <- dirs[grepl("SAMN", dirs)]
    samples <- gsub(paste0(quantdir, "/"), "", dirs)
    samples <- gsub("\\.bw", "", samples)
    files <- paste0(dirs, "/t_data.ctab")
    names(files) <- samples
    sample_meta <- sample_info[!duplicated(sample_info$BioSample), ]
    sample_meta <- sample_meta[sample_meta$BioSample %in% samples, ]
    coldata <- data.frame(row.names = sample_meta$BioSample)
    coldata <- cbind(coldata, sample_meta[, !names(sample_meta) == "BioSample"])
    
    if(level == "gene") {
        exp <- tximport::tximport(
            files, type = "stringtie", tx2gene = tx2gene,
            countsFromAbundance = countsFromAbundance
        )
        final <- SummarizedExperiment::SummarizedExperiment(
            assays = list(gene_TPM = exp$abundance),
            colData = coldata
        )
    } else if(level == "transcript") {
        exp <- tximport::tximport(
            files, type = "stringtie", txOut = TRUE,
            countsFromAbundance = countsFromAbundance
        )
        final <- SummarizedExperiment::SummarizedExperiment(
            assays = list(tx_TPM = exp$abundance),
            colData = coldata
        )
    } else if(level == "both") {
        exp_tx <- tximport::tximport(
            files, type = "stringtie", txOut = TRUE,
            countsFromAbundance = countsFromAbundance
        )
        exp_gene <- tximport::summarizeToGene(exp_tx, tx2gene)
        se_gene <- SummarizedExperiment::SummarizedExperiment(
            assays = list(gene_TPM = exp_gene$abundance),
            colData = coldata
        )
        se_tx <- SummarizedExperiment::SummarizedExperiment(
            assays = list(tx_TPM = exp_tx$abundance),
            colData = coldata
        )
        final <- list(gene = se_gene, transcript = se_tx)
    } else {
        stop("Invalid parameter for the 'level' argument.")
    }
    return(final)
}
almeidasilvaf/gead documentation built on April 26, 2023, 3:47 p.m.