R/readAlevinQC.R
In alevinQC: Generate QC Reports For Alevin Output

Documented in readAlevinQC

# Make a summary table for a selection of barcodes in cbtable
# colName should be the name of a logical column in cbtable, indicating
# inclusion/exclusion with respect to the barcode collection
# cbName will be used for display purposes, to identify the barcode collection
.makeSummaryTable <- function(cbtable, colName, cbName = "", quantmat = NULL) {
    stopifnot(colName %in% colnames(cbtable))
    stopifnot(is.logical(cbtable[[colName]]))

    df <- list()
    df[[paste0("Number of barcodes", cbName)]] <-
        as.character(sum(cbtable[[colName]], na.rm = TRUE))
    df[[paste0("Number of barcodes with quantification", cbName)]] <-
        as.character(sum(cbtable[[colName]] & !is.na(cbtable$mappingRate),
                         na.rm = TRUE))
    df[[paste0("Fraction reads in barcodes", cbName)]] <-
        as.character(paste0(signif(
            100 * sum(cbtable$collapsedFreq[cbtable[[colName]]],
                      na.rm = TRUE) /
                sum(cbtable$originalFreq, na.rm = TRUE), 4), "%"))
    df[[paste0("Mean number of reads per cell", cbName)]] <-
        as.character(round(mean(
            cbtable$collapsedFreq[cbtable[[colName]]],
            na.rm = TRUE)))
    df[[paste0("Median number of reads per cell", cbName)]] <-
        as.character(round(stats::median(
            cbtable$collapsedFreq[cbtable[[colName]]],
            na.rm = TRUE)))
    df[[paste0("Mean number of detected genes per cell", cbName)]] <-
        as.character(round(mean(
            cbtable$nbrGenesAboveZero[cbtable[[colName]]],
            na.rm = TRUE)))
    df[[paste0("Median number of detected genes per cell", cbName)]] <-
        as.character(round(stats::median(
            cbtable$nbrGenesAboveZero[cbtable[[colName]]],
            na.rm = TRUE)))
    if (!is.null(quantmat)) {
        df[[paste0("Total number of detected genes", cbName)]] <-
            as.character(sum(
                rowSums(quantmat[, colnames(quantmat) %in%
                                     cbtable$CB[cbtable[[colName]]],
                                 drop = FALSE]) > 0))
    }
    df[[paste0("Mean UMI count per cell", cbName)]] <-
        as.character(round(mean(
            cbtable$totalUMICount[cbtable[[colName]]],
            na.rm = TRUE)))
    df[[paste0("Median UMI count per cell", cbName)]] <-
        as.character(round(stats::median(
            cbtable$totalUMICount[cbtable[[colName]]],
            na.rm = TRUE)))
    df <- t(data.frame(df,
                       stringsAsFactors = FALSE,
                       check.names = FALSE))
    df
}


#' Read alevin data required to generate summary report
#'
#' Read all alevin output files required to generate the summary report or shiny
#' app.
#'
#' @param baseDir Path to the output directory from the alevin run (should be
#'   the directory containing the \code{alevin} directory).
#' @param customCBList Named list with custom set(s) of barcodes to provide
#'   summary statistics/plots for, in addition to the whitelists generated by
#'   alevin.
#'
#' @author Charlotte Soneson
#'
#' @export
#'
#' @importFrom utils read.delim
#' @import dplyr
#' @importFrom rjson fromJSON
#' @importFrom tximport tximport
#'
#' @return A list collecting all necessary information for generating the
#'   summary report/shiny app.
#'
#' @examples
#' alevin <- readAlevinQC(system.file("extdata/alevin_example_v0.14",
#'                        package = "alevinQC"))
#'
readAlevinQC <- function(baseDir, customCBList = list()) {
    if (!is.list(customCBList)) {
        stop("'customCBList' must be a (possibly empty) list")
    }
    if (length(customCBList) > 0 && (any(is.null(names(customCBList))) ||
                                     any(names(customCBList) == ""))) {
        stop("'customCBList' must be a named list")
    }

    ## Check that all required files are available, stop if not
    infversion <- checkAlevinInputFiles(baseDir)

    ## Depending on the inferred version, read alevin output files
    if (infversion == "pre0.14") {
        ## pre-v0.14
        .readAlevinQC_pre0.14(baseDir = baseDir, customCBList = customCBList)
    } else if (infversion == "v0.14") {
        ## v0.14 or newer, final whitelist inferred from data
        .readAlevinQC_v0.14(baseDir = baseDir, customCBList = customCBList,
                            type = "standard")
    } else if (infversion == "v0.14extwl") {
        ## v0.14 or newer, external whitelist provided
        .readAlevinQC_v0.14(baseDir = baseDir, customCBList = customCBList,
                            type = "extwl")
    } else if (infversion == "v0.14nowl") {
        ## v0.14 or newer, no whitelist.txt file but no external whitelist
        .readAlevinQC_v0.14(baseDir = baseDir, customCBList = customCBList,
                            type = "nowl")
    } else {
        stop("Unidentifiable alevin output")
    }
}

.readAlevinQC_pre0.14 <- function(baseDir, customCBList = list()) {

    alevinDir <- file.path(baseDir, "alevin")

    ## Raw CB frequencies (assumed to be in descending order)
    rawcbfreq <- utils::read.delim(file.path(alevinDir, "raw_cb_frequency.txt"),
                                   header = FALSE, as.is = TRUE) %>%
        dplyr::rename(CB = V1, originalFreq = V2) %>%
        dplyr::mutate(ranking = seq_len(length(CB)))

    ## First set of whitelisted CBs (quantified)
    filtcbfreq <- utils::read.delim(file.path(alevinDir,
                                              "filtered_cb_frequency.txt"),
                                    header = FALSE, as.is = TRUE) %>%
        dplyr::rename(CB = V1, collapsedFreq = V2)

    ## FeatureDump
    ## dedupRate = nbr deduplicated UMI counts/nbr mapped reads
    ## nbrGenesAboveMean = nbr genes with count > mean gene count
    featuredump <- utils::read.delim(file.path(alevinDir, "featureDump.txt"),
                                     header = FALSE, as.is = TRUE) %>%
        dplyr::rename(CB = V1, mappingRate = V2, duplicationRate = V3,
                      dedupRate = V4, nbrGenesAboveMean = V5)

    ## Mapped UMI
    mappedumi <- utils::read.delim(file.path(alevinDir, "MappedUmi.txt"),
                                   header = FALSE, as.is = TRUE) %>%
        dplyr::rename(CB = V1, nbrMappedUMI = V2)

    ## Final set of whitelisted CBs
    finalwhitelist <- utils::read.delim(file.path(alevinDir, "whitelist.txt"),
                                        header = FALSE, as.is = TRUE)$V1

    ## Quantification
    quantmat <- tximport::tximport(file.path(baseDir, "alevin/quants_mat.gz"),
                                   type = "alevin")$counts
    quants <- data.frame(CB = colnames(quantmat),
                         totalUMICount = colSums(quantmat),
                         nbrGenesAboveZero = colSums(quantmat > 0),
                         stringsAsFactors = FALSE)

    ## Merge information about quantified CBs
    quantbcs <- filtcbfreq %>%
        dplyr::full_join(featuredump, by = "CB") %>%
        dplyr::full_join(mappedumi, by = "CB") %>%
        dplyr::full_join(quants, by = "CB") %>%
        dplyr::mutate(inFinalWhiteList = CB %in% finalwhitelist) %>%
        dplyr::mutate(inFirstWhiteList = TRUE)

    cbtable <- dplyr::full_join(
        rawcbfreq,
        quantbcs
    ) %>% dplyr::mutate(inFirstWhiteList = replace(inFirstWhiteList,
                                                   is.na(inFirstWhiteList),
                                                   FALSE),
                        inFinalWhiteList = replace(inFinalWhiteList,
                                                   is.na(inFinalWhiteList),
                                                   FALSE))

    ## Add information from custom barcode sets
    customCBsummary <- list()
    for (i in seq_along(customCBList)) {
        nm <- paste0("customCB__", names(customCBList)[i])
        cbtable[[nm]] <- cbtable$CB %in% customCBList[[i]]
        efr <- signif(100 * sum(cbtable[[nm]])/length(customCBList[[i]]), 4)
        qfr <- signif(100 * sum(cbtable[[nm]] & !is.na(cbtable$mappingRate))/
                          length(customCBList[[i]]), 4)
        message(efr, "% of barcodes in custom barcode set ",
                names(customCBList)[i], " were found in the data set (",
                qfr, "% were quantified)")
        customCBsummary[[nm]] <- .makeSummaryTable(
            cbtable = cbtable,
            colName = nm,
            cbName = paste0(" (", gsub("customCB__", "", nm), ")"),
            quantmat = quantmat)
    }

    ## Meta information and command information
    metainfo <- rjson::fromJSON(file = file.path(baseDir,
                                                 "aux_info/meta_info.json"))
    cmdinfo <- rjson::fromJSON(file = file.path(baseDir, "cmd_info.json"))

    ## Create "version info" table
    versiontable <- t(data.frame(
        `Start time` = metainfo$start_time,
        `Salmon version` = metainfo$salmon_version,
        `Index` = cmdinfo$index,
        `R1file` = paste(cmdinfo$mates1,
                         collapse = ", "),
        `R2file` = paste(cmdinfo$mates2,
                         collapse = ", "),
        `tgMap` = cmdinfo$tgMap,
        `Library type` = metainfo$library_types,
        stringsAsFactors = FALSE,
        check.names = FALSE
    ))

    ## Create summary tables
    summarytable_full <- t(data.frame(
        `Total number of processed reads` =
            as.character(metainfo$num_processed),
        `Number of reads with valid cell barcode (no Ns)` =
            as.character(round(sum(rawcbfreq$originalFreq, na.rm = TRUE))),
        `Number of mapped reads` = metainfo$num_mapped,
        `Percent mapped` = metainfo$percent_mapped,
        `Total number of observed cell barcodes` =
            as.character(length(unique(cbtable$CB))),
        stringsAsFactors = FALSE,
        check.names = FALSE
    ))

    summarytable_initialwl <- .makeSummaryTable(
        cbtable = cbtable,
        colName = "inFirstWhiteList",
        cbName = " (initial whitelist)",
        quantmat = quantmat
    )

    summarytable_finalwl <- .makeSummaryTable(
        cbtable = cbtable,
        colName = "inFinalWhiteList",
        cbName = " (final whitelist)",
        quantmat = quantmat
    )

    ## Return
    list(cbTable = cbtable, versionTable = versiontable,
         summaryTables = c(list(fullDataset = summarytable_full,
                                initialWhitelist = summarytable_initialwl,
                                finalWhitelist = summarytable_finalwl),
                           customCBsummary),
         type = "pre0.14"
    )
}

.readAlevinQC_v0.14 <- function(baseDir, customCBList = list(), type = "standard") {

    alevinDir <- file.path(baseDir, "alevin")

    ## Raw CB frequencies (assumed to be in descending order)
    rawcbfreq <- utils::read.delim(file.path(alevinDir, "raw_cb_frequency.txt"),
                                   header = FALSE, as.is = TRUE) %>%
        dplyr::rename(CB = V1, originalFreq = V2) %>%
        dplyr::mutate(ranking = seq_len(length(CB)))
    if (!all(diff(rawcbfreq$originalFreq) <= 0)) {
        warning("The raw CB frequencies are not sorted in decreasing order")
    }

    ## FeatureDump
    featuredump <- try({
        utils::read.delim(file.path(alevinDir, "featureDump.txt"),
                          header = TRUE, as.is = TRUE, sep = "\t")
    }, silent = TRUE)
    if (is(featuredump, "try-error")) {
        if (grepl("more columns than column names", featuredump)) {
            warning("The 'featureDump.txt' file could not be cleanly read. ",
                    "Please check results carefully (e.g. by reading the ",
                    "input manually and checking the 'cbTable' output.")
            ## In alevin 1.0.0 (fixed in 1.1.0), with certain flags,
            ## the ArborescenceCount column could be split over multiple columns
            tmpcn <- unlist(utils::read.delim(
                file.path(alevinDir, "featureDump.txt"),
                header = FALSE, as.is = TRUE, sep = "\t", nrows = 1
            ))
            tmpdt <- utils::read.delim(
                file.path(alevinDir, "featureDump.txt"),
                header = FALSE, as.is = TRUE, sep = "\t", skip = 1
            )
            if (length(tmpcn) != ncol(tmpdt)) {
                tmpdt <- tmpdt[, seq_len(length(tmpcn)), drop = FALSE]
            }
            colnames(tmpdt) <- tmpcn
            featuredump <- tmpdt
        } else {
            stop("The 'featureDump.txt' file could not be read.")
        }
    }
    featuredump <- featuredump %>%
        dplyr::rename(mappingRate = MappingRate,
                      collapsedFreq = CorrectedReads,
                      dedupRate = DedupRate,
                      nbrGenesAboveMean = NumGenesOverMean,
                      nbrMappedUMI = MappedReads,
                      totalUMICount = DeduplicatedReads,
                      nbrGenesAboveZero = NumGenesExpressed)

    if (type == "standard") {
        ## Final set of whitelisted CBs
        finalwhitelist <- utils::read.delim(
            file.path(alevinDir, "whitelist.txt"),
            header = FALSE, as.is = TRUE)$V1
    }

    ## Meta information and command information
    metainfo <- rjson::fromJSON(file = file.path(baseDir,
                                                 "aux_info/meta_info.json"))
    cmdinfo <- rjson::fromJSON(file = file.path(baseDir, "cmd_info.json"))
    alevinmetainfo <- rjson::fromJSON(
        file = file.path(baseDir, "aux_info/alevin_meta_info.json"))

    ## Merge information about quantified CBs
    cbtable <- dplyr::full_join(
        rawcbfreq,
        featuredump,
        by = "CB"
    )
    if (type == "standard") {
        ## we have a whitelist.txt file representing the final whitelist
        cbtable <- cbtable %>%
            dplyr::mutate(inFinalWhiteList = CB %in% finalwhitelist) %>%
            dplyr::mutate(
                inFirstWhiteList = ranking <= alevinmetainfo$initial_whitelist
            )
        ## Check if there is any barcode that is not in the first whitelist,
        ## but which has an original ranking lower than any barcode that is
        ## in the first whitelist, and remove it.
        toremove <-
            !cbtable$inFirstWhiteList &
            cbtable$ranking <= max(cbtable$ranking[cbtable$inFirstWhiteList])
        if (any(toremove)) {
            warning("Excluding ", sum(toremove), " unquantified barcode",
                    ifelse(sum(toremove) > 1, "s", ""),
                    " with higher original frequency than barcodes ",
                    "included in the first whitelist: ",
                    paste0(cbtable$CB[toremove], collapse = ", "))
            cbtable <- cbtable[!toremove, ]
        }
    } else if (type == "nowl") {
        ## we have an indication of the size of the initial whitelist, but
        ## no final whitelist (and no whitelist.txt file).
        ## the final number of considered CBs should be those in the initial
        ## whitelist
        cbtable <- cbtable %>%
            dplyr::mutate(
                inFirstWhiteList = ranking <= alevinmetainfo$initial_whitelist
            ) %>%
            dplyr::mutate(inFinalWhiteList = inFirstWhiteList)
    } else if (type == "extwl") {
        ## the CBs included in the featureDump file are the ones included
        ## in the external whitelist. There is no initial whitelisting step
        cbtable <- cbtable %>%
            dplyr::mutate(inFirstWhiteList = CB %in% featuredump$CB) %>%
            dplyr::mutate(inFinalWhiteList = inFirstWhiteList)
    }

    ## Add information from custom barcode sets
    customCBsummary <- list()
    for (i in seq_along(customCBList)) {
        nm <- paste0("customCB__", names(customCBList)[i])
        cbtable[[nm]] <- cbtable$CB %in% customCBList[[i]]
        efr <- signif(100 * sum(cbtable[[nm]])/length(customCBList[[i]]), 4)
        qfr <- signif(100 * sum(cbtable[[nm]] & !is.na(cbtable$mappingRate))/
                          length(customCBList[[i]]), 4)
        message(efr, "% of barcodes in custom barcode set ",
                names(customCBList)[i], " were found in the data set (",
                qfr, "% were quantified)")
        customCBsummary[[nm]] <- .makeSummaryTable(
            cbtable = cbtable,
            colName = nm,
            cbName = paste0(" (", gsub("customCB__", "", nm), ")"),
            quantmat = NULL)
    }

    ## Create "version info" table
    versiontable <- t(data.frame(
        `Start time` = metainfo$start_time,
        `Salmon version` = metainfo$salmon_version,
        `Index` = cmdinfo$index,
        `R1file` = paste(cmdinfo$mates1,
                         collapse = ", "),
        `R2file` = paste(cmdinfo$mates2,
                         collapse = ", "),
        `tgMap` = cmdinfo$tgMap,
        `Library type` = metainfo$library_types,
        stringsAsFactors = FALSE,
        check.names = FALSE
    ))
    if (type == "extwl") {
        versiontable <- rbind(
            versiontable,
            t(data.frame(
                `External whitelist` = cmdinfo$whitelist,
                stringsAsFactors = FALSE,
                check.names = FALSE
            ))
        )
    }
    if ("expectCells" %in% names(cmdinfo)) {
        versiontable <- rbind(
            versiontable,
            t(data.frame(
                `Expected number of cells` = cmdinfo$expectCells,
                stringsAsFactors = FALSE,
                check.names = FALSE
            ))
        )
    }
    if ("forceCells" %in% names(cmdinfo)) {
        versiontable <- rbind(
            versiontable,
            t(data.frame(
                `Forced number of cells` = cmdinfo$forceCells,
                stringsAsFactors = FALSE,
                check.names = FALSE
            ))
        )
    }

    ## Create summary tables
    summarytable_full <- t(data.frame(
        `Total number of processed reads` =
            as.character(alevinmetainfo$total_reads),
        `Number of reads with Ns` =
            as.character(alevinmetainfo$reads_with_N),
        `Number of reads with valid cell barcode (no Ns)` =
            as.character(round(sum(rawcbfreq$originalFreq, na.rm = TRUE))),
        `Number of mapped reads` = alevinmetainfo$reads_in_eqclasses,
        `Percent mapped (of all reads)` =
            paste0(signif(as.numeric(alevinmetainfo$mapping_rate), 4), "%"),
        `Number of noisy CB reads` =
            as.character(alevinmetainfo$noisy_cb_reads),
        `Number of noisy UMI reads` =
            as.character(alevinmetainfo$noisy_umi_reads),
        `Total number of observed cell barcodes` =
            as.character(length(unique(cbtable$CB))),
        stringsAsFactors = FALSE,
        check.names = FALSE
    ))

    ## If used_reads is reported, add actual mapping rate
    if (!is.null(alevinmetainfo$used_reads)) {
        summarytable_full <- rbind(
            summarytable_full,
            t(data.frame(
                `Number of used reads` = alevinmetainfo$used_reads,
                `Percent mapped (of used reads)` =
                    paste0(signif(100 * alevinmetainfo$reads_in_eqclasses /
                    alevinmetainfo$used_reads, 4), "%"),
                stringsAsFactors = FALSE,
                check.names = FALSE
            ))
        )
    }

    summarytable_initialwl <- .makeSummaryTable(
        cbtable = cbtable,
        colName = "inFirstWhiteList",
        cbName = " (initial whitelist)",
        quantmat = NULL
    )

    summarytable_finalwl <- .makeSummaryTable(
        cbtable = cbtable,
        colName = "inFinalWhiteList",
        cbName = " (final whitelist)",
        quantmat = NULL
    )

    ## Return
    list(cbTable = cbtable, versionTable = versiontable,
         summaryTables = c(list(fullDataset = summarytable_full,
                                initialWhitelist = summarytable_initialwl,
                                finalWhitelist = summarytable_finalwl),
                           customCBsummary),
         type = type
    )
}