R/SequencingSummary.R

Defines functions SequencingSummaryFlowCellID SequencingSummaryExtractRuntime handleSeqSumCache SequenceSummaryBasicInfoPlot SequenceSummaryExecutiveSummary SequencingSummaryActiveChannelPlot SequencingSummarySpeedPlot SequencingSummaryCumulativeReads SequencingSummaryT50 SequencingSummaryBase50 SequencingSummaryCumulativeBases SequencingSummaryTemporalThroughput getTemporalDataset SequencingSummaryReadLengthQualityDensity SequencingSummaryReadQualityHistogram SequencingSummaryReadLengthHistogram SequencingSummaryWeightedReadLength getBinBreaks getBinAssignments roundUpNice SequencingSummaryGetPlatform SequencingSummaryChannelActivity SequencingSummaryPassGauge SequencingSummaryHasBarcodeInfo importSequencingSummary handleCompression

Documented in importSequencingSummary SequenceSummaryBasicInfoPlot SequenceSummaryExecutiveSummary SequencingSummaryActiveChannelPlot SequencingSummaryBase50 SequencingSummaryChannelActivity SequencingSummaryCumulativeBases SequencingSummaryCumulativeReads SequencingSummaryExtractRuntime SequencingSummaryFlowCellID SequencingSummaryGetPlatform SequencingSummaryHasBarcodeInfo SequencingSummaryPassGauge SequencingSummaryReadLengthHistogram SequencingSummaryReadLengthQualityDensity SequencingSummaryReadQualityHistogram SequencingSummarySpeedPlot SequencingSummaryT50 SequencingSummaryTemporalThroughput SequencingSummaryWeightedReadLength

handleCompression <- function(filename, skip=TRUE) {
    if (R.utils::isBzipped(filename)) {
        cat(paste0(paste("bunzip2",filename),"\n"))
        return(R.utils::bunzip2(filename, temporary=TRUE, remove=FALSE, skip=skip, overwrite=!skip))
    }
    if (R.utils::isGzipped(filename)) {
        cat(paste0(paste("gunzip",filename),"\n"))
        return(R.utils::gunzip(filename, temporary=TRUE, remove=FALSE, skip=skip, overwrite=!skip))
    }
    return(filename)
}



#' load a sequencing_summary.txt file into memory
#'
#' importSequencingSummary loads a sequencing_summary.txt file into
#' memory and performs basic sanity checking to ensure that the file
#' is cleaned of potential duplicate headers from aggregation
#'
#' @importFrom R.utils bunzip2
#' @importFrom R.utils gunzip
#' @importFrom LaF laf_open_csv
#' @param seqsum is a path to a file
#' @param cache whether the object created should be cached in env
#' @param chunksize the number of reads to parse per iteration block
#' @param skip whether to skip uncompression if file already exists - may
#' cause collisions due to standard sequencing_summary nomenclature
#' @return data.frame of observations from the sequencing_summary.txt file
#' provided
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#'
#' @export
importSequencingSummary <- function(seqsum, cache=TRUE, chunksize=1000000, skip=FALSE) {
    seqsum <- handleCompression(seqsum, skip=skip)
    # identify the available columns ...
    con <- file(seqsum, "r")
    sample_lines <- readLines(con, n=2)
    close(con)
    columns <- strsplit(sample_lines[1], "\t")[[1]]

    model <- LaF::detect_dm_csv(filename=seqsum, header=TRUE, sep="\t")
    dat <- LaF::laf_open(model, ignore_failed_conversion=TRUE)

    select_columns <- c(
        "read_id", "channel", "start_time", "duration", "passes_filtering",
        "sequence_length_template", "mean_qscore_template", "barcode_arrangement")
    cids <- as.integer(na.omit(match(select_columns, columns)))

    seqsumdata <- data.frame()
    LaF::begin(dat)
    while (TRUE) {
        #cat(paste0(dim(seqsumdata), "\n"))
        d <- LaF::next_block(dat, columns=cids, nrows=chunksize)
        if ("passes_filtering" %in% columns) {
            d$passes_filtering <- as.logical(d$passes_filtering)
        }
        if (nrow(d) == 0) break;
        seqsumdata <- rbind(seqsumdata, d)
    }
    # remove the redundant headers from merged files
    if (length(which(seqsumdata$read_id == "read_id")) > 0) {
        seqsumdata <- seqsumdata[-which(seqsumdata$read_id == "read_id"), ]
    }

    if (!"passes_filtering" %in% colnames(seqsumdata)) {
        # set all of the reads to pass? apply a cutoff?
        seqsumdata$passes_filtering <- TRUE
    }

    if (cache) {
        setCachedObject("seqsumdata", seqsumdata)
    }
    return(invisible(seqsumdata))
}




#' does the SequencingSummary information define a barcode?
#'
#' does the SequencingSummary information define a barcode?
#'
#' @return logical
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' SequencingSummaryHasBarcodeInfo()
#'
#' @export
SequencingSummaryHasBarcodeInfo <- function() {
    seqsum <- handleSeqSumCache(NA)
    if ("barcode_arrangement" %in% names(seqsum)) {
        return(TRUE)
    } else if (hasCachedObject("barcodedata")) {
        return(TRUE)
    }
    return(FALSE)
}



#' prepare a gauge plot of sequencing_summary reads passing QC
#'
#' plots the basic eye-candy gauge plot of reads passing QC threshold
#'
#' @importFrom dplyr mutate
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return ggplot2 gauge plot
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryPassGauge(seqsum)
#'
#' @export
SequencingSummaryPassGauge <- function(seqsum = NA) {

    seqsum <- handleSeqSumCache(seqsum)

    df <- data.frame(matrix(nrow = 1, ncol = 3))

    names(df) <- c("variable", "percentage", "label")
    df$variable <- c("pass")
    df$percentage <- c(round(
        length(which(seqsum$passes_filtering == TRUE))/nrow(seqsum), 3))

    df <- df %>% dplyr::mutate(
        group = ifelse(df$percentage < 0.6, "red", ifelse(df$percentage >= 0.6 &
        df$percentage < 0.8, "orange", "green")),
        label = paste0(df$percentage * 100, "%"))

    title = "Percentage of reads\npassing QC filter"

    gaugePlot <- ggplot(df, aes_string(fill = "group", ymax = "percentage",
        ymin = 0, xmax = 2, xmin = 1)) + geom_rect(aes(ymax = 1, ymin = 0, xmax
        = 2, xmin = 1), fill = "#ece8bd") + geom_rect() + coord_polar(theta =
        "y", start = -pi/2) + xlim(c(0, 2)) + ylim(c(0, 2)) + guides(fill =
        FALSE) + guides(colour = FALSE) + theme_void() + theme(strip.background
        = element_blank(), strip.text.x = element_blank()) + geom_text(
        aes_string(x = 0, y = 0, label = "label"), size = 13) + geom_text(
        aes_string(x = 1.5, y = 1.5, label = "title"), size = 11) +
        scale_fill_manual(values = c(red = "#C9146C", orange = "#DA9112",
        green = "#129188")) + scale_colour_manual(values = c(red = "#C9146C",
        orange = "#DA9112", green = "#129188"))

    return(ggplot2handler(gaugePlot))
}






#' prepare a channel activity plot from sequencing_summary reads file
#'
#' plots the basic eye-candy gauge channel activity plot of reads against
#' channel of origin
#'
#' @usage SequencingSummaryChannelActivity(
#'     seqsum=NA, platform=NA, showcount=FALSE)
#' @importFrom reshape2 acast
#' @importFrom grDevices colorRamp colorRampPalette
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param platform is the nanopore platform [MinION/Flongle/PromethION]
#' @param showcount logical - show read counts per channel
#' @return ggplot2 channel activity plot
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryChannelActivity(seqsum)
#'
#' @export
SequencingSummaryChannelActivity <- function(
    seqsum=NA, platform=NA, showcount=FALSE) {

    seqsum <- handleSeqSumCache(seqsum)

    if (is.na(platform)) {
        platform <- SequencingSummaryGetPlatform(seqsum)
    }
    channelMap <- SequencingSummaryGetChannelMap(platform)

    hm.palette <- colorRampPalette(brewer.pal(9, "Blues"), space = "Lab")
    #RdPu, Oranges, Greens, YlOrRd, Purples

    channelCounts <-
        as.data.frame(matrix(rep(0, max(channelMap$channel)), ncol = 1))
    channelCountRaw <-
        as.data.frame(table(unlist(seqsum[, "channel"])), row.names = 1)
    channelCounts[row.names(channelCountRaw), ] <- channelCountRaw[, 1]

    channelMap <- merge(channelMap, channelCounts, by.x = "channel", by.y = 0)
    colnames(channelMap)[4] <- "count"
    channelMapMatrix <-
        reshape2::acast(channelMap, col ~ row, value.var = "count")

    theme_update(plot.title = element_text(hjust = 0.5))

    activityPlot <- ggplot(
        channelMap, aes_string(x = "col", y = "row", fill = "count")) +
        geom_tile() + scale_x_discrete(breaks = NULL) + scale_y_discrete(
        breaks = NULL) + coord_equal() + scale_fill_gradientn(colours =
        hm.palette(100)) + scale_color_gradient2(low = hm.palette(100),
        high = hm.palette(1)) + theme(axis.text.x = element_text(angle = 90,
        hjust = 1)) + labs(title =
        "Channel activity plot showing number of reads per flowcell channel") +
        theme(panel.border = element_blank(), panel.grid.major=element_blank(),
        panel.grid.minor = element_blank(), axis.title.x = element_blank(),
        axis.title.y = element_blank(), legend.position = "bottom",
        legend.key.width = unit(5.6, "cm"))
    if (showcount) {
        activityPlot <- activityPlot + geom_text(data = channelMap, aes_string(
            x = "col", y = "row", label = "count", color = "count"),
            show.legend = FALSE, size = 2.5)
    }
    return(ggplot2handler(activityPlot))
}


#' identify the most likely sequencing platform used to create
#' the summary data
#'
#' when provided with the seqsum object from a Sequencing_summary.txt file
#' identify the
#' most likely sequencing platform used
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return platform is the nanopore platform [MinION/Flongle/PromethION]
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' platform <- SequencingSummaryGetPlatform(seqsum)
#' platform
#'
#' @export
SequencingSummaryGetPlatform <- function(seqsum = NA) {

    if (!is.data.frame(seqsum) && is.na(seqsum)) {
        oname <- "seqsumdata"
        if (hasCachedObject(oname)) {
            seqsum <- getCachedObject(oname)
        }
    }

    platform <- "MinION"

    if (max(seqsum$channel) < 130) {
        # this is likely to be a Flongle ...
        platform <- "Flongle"
    }

    if (max(seqsum$channel) > 1000) {
        # this is likely to be a PromethION
        platform <- "PromethION"
    }
    return(platform)
}







# https://stackoverflow.com/questions/6461209/
# how-to-round-up-to-the-nearest-10-or-100-or-x
roundUpNice <- function(x, nice = seq(from = 1, to = 10, by = 0.25)) {
    if (length(x) != 1)
        stop("'x' must be of length 1")
    10^floor(log10(x)) * nice[[which(x <= 10^floor(log10(x)) * nice)[[1]]]]
}


getBinAssignments <- function(seqsum, breaks) {
    binAssignments <- cut(seqsum$sequence_length_template, breaks,
        include.lowest = TRUE, right = FALSE)
    return(binAssignments)
}

#' @importFrom stats quantile
getBinBreaks <- function(seqsum) {
    # pick a friendly upper limit to render sequence lengths into a histogram
    # here we're aiming for a  robustly rounded up 97.5 quantile of the data
    # (skip a few outliers ...)
    upperLimit <- roundUpNice(as.numeric(quantile(
        x = seqsum$sequence_length_template, probs = c(0.975))))
    # an ideal histogram will have 40 or so bins
    histogramBinCount <- 40
    breakVal = roundUpNice(upperLimit/histogramBinCount)
    breaks <- seq(0, to = upperLimit, by = breakVal)
    return(breaks)
}


#' plot a weighted histogram of sequence read lengths
#'
#' plots the histogram of read lengths, weighted, and shaded by pass/fail status
#'
#' @importFrom dplyr last
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return ggplot2 showing weighted read length distribution
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryWeightedReadLength(seqsum)
#'
#' @export
SequencingSummaryWeightedReadLength <- function(seqsum=NA) {
    seqsum <- handleSeqSumCache(seqsum)
    breaks <- getBinBreaks(seqsum)
    breakVal <- breaks[2]  # assuming that the range is 0 based
    upperLimit <- dplyr::last(breaks)
    binAssignments <- getBinAssignments(seqsum, breaks)

    passedSeqs <- seqsum[which(seqsum$passes_filtering), ]
    N50 <- ncalc(passedSeqs$sequence_length_template, 0.5)
    passedMeanLength = round(
        mean(passedSeqs$sequence_length_template), digits = 0)

    scrapeBinnedBases <- function(level, qcpass, binAssignments, seqsum) {
        candidates <- which(binAssignments == level)
        if (length(candidates) > 0) {
            data <- seqsum[candidates, ]
            candidates2 <- which(data$passes_filtering == qcpass)
            if (length(candidates2) > 0) {
                return(sum(data[candidates2, "sequence_length_template"]))
            }
        }
        return(0)
    }

    passedBinnedBases <-unlist(lapply(levels(binAssignments), scrapeBinnedBases,
        qcpass = TRUE, binAssignments = binAssignments,seqsum = seqsum))
    failedBinnedBases <-unlist(lapply(levels(binAssignments), scrapeBinnedBases,
        qcpass = FALSE, binAssignments = binAssignments,seqsum = seqsum))

    binnedBaseDist <- data.frame(length = head(breaks, -1), pass =
        passedBinnedBases, fail = failedBinnedBases)
    binnedBaseMelt <- reshape2::melt(binnedBaseDist, id.vars = c("length"))

    weightedReadLengths <- ggplot(binnedBaseMelt, aes_string(x = "length",
        fill = "variable", y = "value")) + geom_bar(stat = "identity") +
        xlab("Read length\n") + ylab("Number of bases sequenced\n") +
        scale_fill_manual("QC", values = c(fail = brewer.pal(6, "Paired")[1],
        pass = brewer.pal(6, "Paired")[2])) + scale_x_continuous(limits = c(
        -breakVal,upperLimit),breaks=pretty(passedSeqs$sequence_length_template,
        n = 40)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        labs(title = paste0("Histogram showing the number of sequenced bases a",
        "gainst sequence length"), fill = "QV filter") + geom_vline(xintercept =
        N50, size = 1) + annotate("text", x = N50, y = max(passedBinnedBases +
        failedBinnedBases), label = " N50", hjust = 0, colour = "SteelBlue") +
        geom_vline(xintercept = passedMeanLength, size = 1) + annotate("text",
        x = passedMeanLength, y = max(passedBinnedBases + failedBinnedBases),
        label = " Mean", hjust = 0, colour = "SteelBlue")

    return(ggplot2handler(weightedReadLengths))
}







#' plot a histogram of sequence read lengths
#'
#' plots the histogram of read lengths shaded by pass/fail status
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return ggplot2 showing read length distribution
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryReadLengthHistogram(seqsum)
#'
#' @export
SequencingSummaryReadLengthHistogram <- function(seqsum=NA) {

    seqsum <- handleSeqSumCache(seqsum)

    breaks <- getBinBreaks(seqsum)
    breakVal <- breaks[2]  # assuming that the range is 0 based
    upperLimit <- dplyr::last(breaks)
    binAssignments <- getBinAssignments(seqsum, breaks)

    passedSeqs <- seqsum[which(seqsum$passes_filtering), ]
    N50 <- ncalc(passedSeqs$sequence_length_template, 0.5)
    passedMeanLength = round(mean(
        passedSeqs$sequence_length_template), digits = 0)

    scrapeBinnedReads <- function(level, qcpass) {
        # length(subset(seqsum[which(binAssignments == level), ],
        # `passes_filtering`==qcpass)$sequence_length_template)
        length(which(seqsum[which(
            binAssignments == level), ]$passes_filtering == qcpass))
    }

    passedBinnedReads <- unlist(lapply(levels(
        binAssignments), scrapeBinnedReads, qcpass = TRUE))
    failedBinnedReads <- unlist(lapply(levels(
        binAssignments), scrapeBinnedReads, qcpass = FALSE))

    binnedReadDist <- data.frame(length = head(
        breaks, -1), pass = passedBinnedReads, fail = failedBinnedReads)
    binnedReadMelt <- reshape2::melt(binnedReadDist, id.vars = c("length"))

    lengthHistogram <- ggplot(binnedReadMelt, aes_string(x = "length", fill =
        "variable", y = "value")) + geom_bar(stat = "identity") + xlab(
        "Read length\n") + ylab("Number of reads\n") + scale_fill_manual("QC",
        values = c(fail = brewer.pal(6, "Paired")[1], pass = brewer.pal(6,
        "Paired")[2])) + scale_x_continuous(limits = c(-breakVal, upperLimit),
        breaks = pretty(passedSeqs$sequence_length_template, n = 40)) +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(title =
        paste0("Histogram showing distribution of read lengths across quality ",
        "passing sequences"), fill = "QV filter") + geom_vline(xintercept = N50,
        size = 1) + annotate("text", x = N50, y = max(passedBinnedReads +
        failedBinnedReads), label = " N50", hjust = 0, colour = "SteelBlue") +
        geom_vline(xintercept = passedMeanLength, size = 1) + annotate("text",
        x = passedMeanLength, y = max(passedBinnedReads + failedBinnedReads),
        label = " Mean", hjust = 0, colour = "SteelBlue")

    return(ggplot2handler(lengthHistogram))
}




#' plot a histogram of sequence quality scores
#'
#' plots the histogram of read mean quality scores shaded by pass/fail status
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return ggplot2 showing read quality distribution
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryReadQualityHistogram(seqsum)
#'
#' @export
SequencingSummaryReadQualityHistogram <- function(seqsum=NA) {

    seqsum <- handleSeqSumCache(seqsum)

    qdist <- ggplot(seqsum, aes_string(x = "mean_qscore_template", fill =
        "passes_filtering")) + geom_histogram(breaks = seq(from = 0,
        to = 15, by = 0.1)) + scale_fill_manual(name = "QC", values = c(`TRUE`=
        brewer.pal(6, "Paired")[2], `FALSE` = brewer.pal(6, "Paired")[1]),
        labels = c("pass", "fail"), breaks = c("TRUE", "FALSE")) + labs(title =
        "Plot showing distribution of quality scores across all reads") +
        xlab("Mean Q score of read") + ylab("Number of reads")

    return(ggplot2handler(qdist))
}






#' plot a density map of sequence lengths and quality scores
#'
#' plots the density plot of read length against read mean quality scores
#'
#' @usage SequencingSummaryReadLengthQualityDensity(seqsum=NA,
#'     binFilter = 2,
#'     qcThreshold = 7
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param binFilter is the minimum number of reads to include a cell in plot
#' (removes speckle)
#' @param qcThreshold is the QC threshold used
#' @return ggplot2 showing densities of read length and quality distribution
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryReadLengthQualityDensity(seqsum)
#'
#' @export
SequencingSummaryReadLengthQualityDensity <- function(
    seqsum=NA, binFilter=2, qcThreshold=7) {
    seqsum <- handleSeqSumCache(seqsum)

    # prepare the density plot, but do not render
    lq_dens <- ggplot(seqsum, aes(log10(seqsum$sequence_length_template),
        seqsum$mean_qscore_template)) + geom_bin2d(bins = 100)
    # extract the density map from the plot
    lq_dens_counts <- ggplot_build(lq_dens)$data[[1]]
    if (binFilter > 0) {
        # remove the bins from the density map that do not contain sequence
        # count above threshold
        lq_dens_counts <- lq_dens_counts[-which(
            lq_dens_counts$count <= binFilter), ]
    }
    # directly plot this modified density map (stat=='identity')
    qldensityplot <- ggplot(lq_dens_counts) + geom_bin2d(aes_string("x", "y",
        fill = "count"), stat = "identity") + scale_fill_distiller(palette =
        "Blues", trans = "reverse") + geom_hline(yintercept = qcThreshold,
        size = 1) + xlab("log10(read length)") + ylab("read mean quality") +
        scale_x_continuous(breaks = c(1, 2, 3, 4, 5), labels = c("10", "100",
        "1000", "10,000", "100,000")) + annotation_logticks(base = 10, sides =
        "b", scaled = TRUE) + labs(title = paste0("Contour Plot showing distri",
        "bution of quality scores against log10 read lengths (all reads)"))
    return(ggplot2handler(qldensityplot))
}



getTemporalDataset <- function(seqsum, sampleHours, sampleIntervalMinutes) {

    breaks = seq(0, sampleHours * 60 * 60, by = 60 * sampleIntervalMinutes)

    # has this object already been created?
    objectName <- paste0(
        "seqsum.temporaldata.",sampleIntervalMinutes,".",length(breaks))
    if (hasCachedObject(objectName)) {
        return(getCachedObject(objectName))
    }
    binass <- findInterval(seqsum$start_time, breaks)

    mergeItPerHour <- function(interval, binnedAssignments, filter) {
        totalbases = 0
        if (length(which(binnedAssignments == interval)) > 0) {
            subset <- seqsum[which(binnedAssignments == interval), ]
            if (length(which(subset$passes_filtering == filter)) > 0) {
                totalbases = sum(subset[which(subset$passes_filtering ==
                    filter), "sequence_length_template"])
            }
        }
        # need to scale what is being returned - totalbases value is total
        # bases within an interval
        # (sampleIntervalMinutes)
        return(totalbases/1e+09/sampleIntervalMinutes * 60)
    }

    binnedTemporalDataPerHour <- data.frame(cbind(time = breaks, pass = unlist(
        lapply(seq(breaks), mergeItPerHour, binnedAssignments = binass,
        filter = TRUE)), fail = unlist(lapply(seq(breaks), mergeItPerHour,
        binnedAssignments = binass, filter = FALSE))))

    binnedTemporalDataPerHour$time <- binnedTemporalDataPerHour$time/60/60
    setCachedObject(objectName, binnedTemporalDataPerHour)
    return(binnedTemporalDataPerHour)
}



#' plot a sequence throughput against time for specified
#' sequencing_summary run
#'
#' plots a ggplot2 graph of performance against time for run and separates
#' passed and failed sequence reads
#'
#' @usage SequencingSummaryTemporalThroughput(seqsum,
#'     sampleHours = NA,
#'     sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param sampleHours is the number of hours to plot data for
#' @param sampleIntervalMinutes is the resolution to plot data at
#' @return ggplot2 showing temporal performance
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryTemporalThroughput(seqsum)
#'
#' @export
SequencingSummaryTemporalThroughput <- function(
    seqsum=NA, sampleHours = NA, sampleIntervalMinutes = 60) {

    seqsum <- handleSeqSumCache(seqsum)
    if (is.na(sampleHours)) {
        sampleHours <- SequencingSummaryExtractRuntime()
    }

    binnedTemporalDataPerHour <- getTemporalDataset(
        seqsum, sampleHours, sampleIntervalMinutes)

    plot <- ggplot(binnedTemporalDataPerHour, aes_string("time")) + geom_line(
        aes(y = binnedTemporalDataPerHour$fail, colour = "fail"), size = 1) +
        geom_line(aes(y = binnedTemporalDataPerHour$pass, colour = "pass"),
        size = 1) + scale_color_manual(name = "QV", values = c(fail =
        brewer.pal(6, "Paired")[1], pass = brewer.pal(6, "Paired")[2])) +
        xlab("Time (hours)") + ylab("Gigabases sequenced per hour") +
        labs(title = "Plot showing sequence throughput against time")

    return(ggplot2handler(plot))
}


#' plot cumulative volumes of sequence bases
#'
#' plots a ggplot2 graph of accumulated sequenced bases against time for run
#' and separates bases called from passed and failed sequence reads
#'
#' @usage SequencingSummaryCumulativeBases(seqsum,
#'     sampleHours = NA,
#'     sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param sampleHours is the number of hours to plot data for
#' @param sampleIntervalMinutes is the resolution to plot data at
#' @return ggplot2 showing temporal performance
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryCumulativeBases(seqsum)
#'
#' @export
SequencingSummaryCumulativeBases <- function(seqsum=NA, sampleHours = NA,
    sampleIntervalMinutes = 60) {

    seqsum <- handleSeqSumCache(seqsum)
    if (is.na(sampleHours)) {
        sampleHours <- SequencingSummaryExtractRuntime()
    }

    binnedTemporalDataPerHour <- getTemporalDataset(
        seqsum, sampleHours, sampleIntervalMinutes)

    # binnedTemporalDataPerHour is scaled to Gbp per hour - rescale to raw for
    # cumulative plotting
    binnedTemporalDataPerHour$pass <- binnedTemporalDataPerHour$pass/60 *
        sampleIntervalMinutes
    binnedTemporalDataPerHour$fail <- binnedTemporalDataPerHour$fail/60 *
        sampleIntervalMinutes

    base50 <- SequencingSummaryBase50(seqsum, b = 0.5)
    base90 <- SequencingSummaryBase50(seqsum, b = 0.9)
    T50 <- SequencingSummaryT50(seqsum, t = 0.5, sampleHours = sampleHours,
        sampleIntervalMinutes = sampleIntervalMinutes)
    T90 <- SequencingSummaryT50(seqsum, t = 0.9, sampleHours = sampleHours,
        sampleIntervalMinutes = sampleIntervalMinutes)


    cumulativePlot <- ggplot(binnedTemporalDataPerHour, aes_string("time")) +
        geom_line(aes(y = cumsum(binnedTemporalDataPerHour$fail), colour =
        "fail"), size = 1) + geom_line(aes(y = cumsum(
        binnedTemporalDataPerHour$pass), colour = "pass"), size = 1) +
        scale_color_manual(name = "QV", values = c(fail = brewer.pal(6,
        "Paired")[1], pass = brewer.pal(6, "Paired")[2])) + geom_segment(x =
        T50$minimum, y = 0, xend = T50$minimum, yend = base50, colour =
        "darkgray", size = 1) + geom_segment(x = 0, y = base50, xend =
        T50$minimum, yend = base50, colour = "darkgray", size = 1) +
        annotate("text", x = T50$minimum, y = base50, label = " T50", vjust =1,
        hjust = 0, colour = "SteelBlue") + geom_segment(x = T90$minimum, y = 0,
        xend = T90$minimum, yend = base90, colour = "darkgray", size = 1) +
        geom_segment(x = 0, y = base90, xend = T90$minimum, yend = base90,
        colour = "darkgray", size = 1) + annotate("text", x = T90$minimum,
        y = base90, label = " T90", vjust = 1, hjust = 0, colour = "SteelBlue")+
        xlab("Time (hours)") + ylab("Number of bases sequenced (Gigabases)") +
        labs(title = "Plot showing cumulative bases sequenced against time")

    return(ggplot2handler(cumulativePlot))

}




#' calculates the fractional number of bases according to
#' supplied b parameter
#'
#' an accessory method for various logicals; simple fractional base calculator
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param b is a fractional point through run against which time will be
#' calculated
#' @return a numeric value expressed in gigabases
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' Base50 <- SequencingSummaryBase50(seqsum)
#' Base50
#'
#' @export
SequencingSummaryBase50 <- function(seqsum, b = 0.5) {
    passedSeqs <- seqsum[which(seqsum$passes_filtering), ]
    base50 <- sum(passedSeqs$sequence_length_template)/1e+09 * b
    return(base50)
}




#' calculates the timepoint within a sequencing run where
#' 50percent of the total data is produced
#'
#' an accessory method for identifying a timepoint where a given amount of data
#' has been produced
#'
#' @importFrom stats approxfun
#' @importFrom stats optimize
#' @usage SequencingSummaryT50(seqsum=NA,
#'     t = 0.5,
#'     sampleHours = NA,
#'     sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param t is a fractional point through run against which time will be
#' calculated
#' @param sampleHours is the number of hours to consider
#' @param sampleIntervalMinutes is the resolution of the plot in minutes
#' @return a numeric value expressed in hours
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' T50 <- SequencingSummaryT50(seqsum)
#' T50
#'
#' @export
SequencingSummaryT50 <- function(
    seqsum=NA, t = 0.5, sampleHours = NA, sampleIntervalMinutes = 60) {

    seqsum <- handleSeqSumCache(seqsum)
    if (is.na(sampleHours)) {
        sampleHours <- SequencingSummaryExtractRuntime()
    }

    binnedTemporalDataPerHour <- getTemporalDataset(
        seqsum, sampleHours, sampleIntervalMinutes)

    # binnedTemporalDataPerHour is scaled to Gbp per hour - rescale to raw
    # for cumulative plotting
    binnedTemporalDataPerHour$pass <- binnedTemporalDataPerHour$pass/60 *
        sampleIntervalMinutes

    # https://stackoverflow.com/questions/31404679/
    # can-ggplot2-find-the-intersections-or-is-there-any-other-neat-way
    acquireTimePoints <- which(binnedTemporalDataPerHour$pass > 0)
    targetInterpolate <- approxfun(x = binnedTemporalDataPerHour[
        acquireTimePoints, "time"], y = cumsum(binnedTemporalDataPerHour[
        acquireTimePoints, "pass"]))

    base50 <- SequencingSummaryBase50(seqsum, b = t)
    T50 <- optimize(function(t0) abs(targetInterpolate(t0) - base50),
        interval = range(binnedTemporalDataPerHour[acquireTimePoints, "time"]))

    return(T50)
}




#' plot cumulative volumes of sequence reads
#'
#' plots a ggplot2 graph of accumulated sequence reads against time for run and
#' separates passed and failed sequence reads
#'
#' @usage SequencingSummaryCumulativeReads(seqsum,
#'     sampleHours = NA,
#'     sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param sampleHours is the number of hours to plot data for
#' @param sampleIntervalMinutes is the resolution to plot data at
#' @return ggplot2 showing temporal performance
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryCumulativeReads(seqsum)
#'
#' @export
SequencingSummaryCumulativeReads <- function(seqsum=NA, sampleHours = NA,
    sampleIntervalMinutes = 60) {

    seqsum <- handleSeqSumCache(seqsum)
    if (is.na(sampleHours)) {
        sampleHours <- SequencingSummaryExtractRuntime()
    }

    breaks = seq(0, sampleHours * 60 * 60, by = 60 * sampleIntervalMinutes)
    binass <- findInterval(seqsum$start_time, breaks)

    mergeItReadsPerHour <- function(interval, binnedAssignments, filter) {
        totalreads = 0
        if (length(which(binnedAssignments == interval)) > 0) {
            subset <- seqsum[which(binnedAssignments == interval), ]
            if (length(which(subset$passes_filtering == filter)) > 0) {
                totalreads = nrow(subset[which(
                    subset$passes_filtering == filter), ])
            }
        }
        # scale results to mean millions of reads per hour
        return(totalreads/1e+06/sampleIntervalMinutes * 60)
    }

    binnedTemporalDataReadsPerHour <- data.frame(cbind(time = breaks, pass =
        unlist(lapply(seq(breaks), mergeItReadsPerHour, binnedAssignments =
        binass, filter = TRUE)), fail = unlist(lapply(seq(breaks),
        mergeItReadsPerHour, binnedAssignments = binass, filter = FALSE))))

    binnedTemporalDataReadsPerHour$time <-
        binnedTemporalDataReadsPerHour$time/60/60
    # binnedTemporalDataReadsPerHour is scaled to Gbp per hour - rescale to raw
    # for cumulative plotting
    binnedTemporalDataReadsPerHour$pass <-
        binnedTemporalDataReadsPerHour$pass/60 * sampleIntervalMinutes
    binnedTemporalDataReadsPerHour$fail <-
        binnedTemporalDataReadsPerHour$fail/60 * sampleIntervalMinutes

    cumulativePlot <- ggplot(binnedTemporalDataReadsPerHour,aes_string("time"))+
        geom_line(aes(y = cumsum(binnedTemporalDataReadsPerHour$fail),
        colour = "fail"), size = 1) + geom_line(aes(y = cumsum(
        binnedTemporalDataReadsPerHour$pass), colour = "pass"), size = 1) +
        scale_color_manual(name = "QV", values = c(fail = brewer.pal(6,
        "Paired")[1], pass = brewer.pal(6, "Paired")[2]))+xlab("Time (hours)")+
        ylab("Number of reads sequenced (Millions)") + labs(title =
        "Plot showing cumulative reads sequenced against time")

    return(ggplot2handler(cumulativePlot))
}





#' plot speed of sequencing against time (bases per second distribution)
#'
#' plots a ggplot2 box-and-whisker plot for the distribution of sequencing
#' speeds against time
#'
#' @usage SequencingSummarySpeedPlot(seqsum = NA,
#'     sampleHours = NA,
#'     sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param sampleHours is the number of hours to plot data for
#' @param sampleIntervalMinutes is the resolution to plot data at
#' @return ggplot2 showing temporal performance
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummarySpeedPlot()
#'
#' @export
SequencingSummarySpeedPlot <- function(
    seqsum = NA, sampleHours = NA, sampleIntervalMinutes = 60) {

    seqsum <- handleSeqSumCache(seqsum)
    if (is.na(sampleHours)) {
        sampleHours <- SequencingSummaryExtractRuntime()
    }

    breaks = seq(0, sampleHours * 60 * 60, by = 60 * sampleIntervalMinutes)
    binass <- findInterval(seqsum$start_time, breaks)

    speedTime <- data.frame(segment = binass, rate =
        seqsum$sequence_length_template/(seqsum$duration))

    speedplot <- ggplot(speedTime, aes_string(x = "segment", y = "rate", group=
        "segment")) + geom_boxplot(fill = "steelblue", outlier.shape = NA) +
        scale_x_continuous(name = "Time (hours)") +
        ylab("Sequencing rate (bases per second)") + labs(title =
        "boxplot showing distribution of translocation speed against time")

    return(ggplot2handler(speedplot))
}




#' plot number of observed channels actively producing data against time
#'
#' plots a ggplot2 plot of active channels against time
#'
#' @usage SequencingSummaryActiveChannelPlot(seqsum,
#'     sampleHours = NA,
#'     sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param sampleHours is the number of hours to plot data for
#' @param sampleIntervalMinutes is the resolution to plot data at
#' @return ggplot2 showing temporal performance
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryActiveChannelPlot(seqsum)
#'
#' @export
SequencingSummaryActiveChannelPlot <- function(
    seqsum=NA, sampleHours = NA, sampleIntervalMinutes = 60) {

    seqsum <- handleSeqSumCache(seqsum)
    if (is.na(sampleHours)) {
        sampleHours <- SequencingSummaryExtractRuntime()
    }

    breaks = seq(0, sampleHours * 60 * 60, by = 60 * sampleIntervalMinutes)
    binass <- findInterval(seqsum$start_time, breaks)

    mergeActiveChannels <- function(interval, binnedAssignments) {
        totalChannels = 0
        if (length(which(binnedAssignments == interval)) > 0) {
            subset <- seqsum[which(binnedAssignments == interval), ]
            totalChannels = length(unique(subset$channel))
        }
        return(totalChannels)
    }

    binnedTemporalChannels <- data.frame(time = breaks, channels =unlist(lapply(
        seq(breaks), mergeActiveChannels, binnedAssignments = binass)))

    binnedTemporalChannels$time <- binnedTemporalChannels$time/60/60

    activityPlot <- ggplot(binnedTemporalChannels, aes_string("time")) +
        geom_step(aes_string(y = "channels"), size = 1, colour = "Steelblue") +
        xlab("Time (hours)") + ylab("Number of channels producing data") +
        labs(title = "Plot showing number of functional channels against time")
    return(ggplot2handler(activityPlot))
}








#' present an infographic styled executive summary of sequence_summary.txt
#' content
#'
#' present an infographic styled executive summary of sequence_summary.txt
#' content
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param flowcellId is a label for the plot
#' @return file path to ggplot2 format file
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequenceSummaryExecutiveSummary(seqsum)
#'
#' @export
SequenceSummaryExecutiveSummary <- function(
    seqsum=NA, flowcellId = "undefined") {
    # calculate some basic, but key, metrics

    seqsum <- handleSeqSumCache(seqsum)

    passedSeqs <- seqsum[which(seqsum$passes_filtering), ]

    readCount <- formatC(nrow(seqsum), big.mark = ",")
    totalBases = sum(seqsum$sequence_length_template, na.rm = TRUE)/10^9
    passedBases = sum(passedSeqs$sequence_length_template, na.rm = TRUE)/10^9
    gigabases <- round(totalBases, 2)

    # render an info-graphic-like plot for these observations

    infoFile1 <- infoGraphicPlot3(identifier = "ExecutiveSummaryValueBoxes",
        panelA = c(value = flowcellId, key = "flowcell", icon = "fa-qrcode"),
        panelB = c(value = readCount, key = "Reads produced", icon="fa-filter"),
        panelC = c(value = gigabases, key = "gigabases called", icon =
        "fa-file-text-o"))

    return(infoFile1)
}



#' present an infographic styled basic characteristics plot of
#' sequence_summary.txt content
#'
#' present an infographic styled basic characteristics plot of
#' sequence_summary.txt content
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return file path to ggplot2 format file
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequenceSummaryBasicInfoPlot(seqsum)
#'
#' @export
SequenceSummaryBasicInfoPlot <- function(seqsum=NA) {

    seqsum <- handleSeqSumCache(seqsum)

    passedSeqs <- seqsum[which(seqsum$passes_filtering), ]
    failedSeqs <- seqsum[which(!seqsum$passes_filtering), ]

    passedMeanLength = round(mean(
        passedSeqs$sequence_length_template), digits = 0)

    N50 <- ncalc(passedSeqs$sequence_length_template, 0.5)

    passedMeanQ = round(mean(passedSeqs$mean_qscore_template), digits = 1)
    failedMeanQ = round(mean(failedSeqs$mean_qscore_template), digits = 1)
    longestRead <- (scales::comma_format())(max(
        passedSeqs$sequence_length_template))

    # N50 length is the length of the shortest contig such that the sum of
    # contigs of equal length oe longer is at least 50% of the total length of
    # all contigs

    infoFile2 <- infoGraphicPlot5(identifier="SequenceCharacteristicValueBoxes",
        panelA = c(value = (scales::comma_format())(passedMeanLength), key =
        "Mean Read Length (nt)", icon = "fa-bar-chart"), panelB = c(value =
        (scales::comma_format())(N50), key = "N50", icon = "fa-play"), panelC =
        c(value = passedMeanQ, key = "Mean Read Quality (QV)", icon =
        "fa-area-chart"), panelD = c(value = failedMeanQ,key="Mean Failed QV",
        icon = "fa-bug"), panelE = c(value = longestRead, key = "Longest Read",
        icon = "fa-sort"))
    return(infoFile2)
}




handleSeqSumCache <- function(seqsum) {
    if (!is.data.frame(seqsum) && is.na(seqsum)) {
        oname <- "seqsumdata"
        if (hasCachedObject(oname)) {
            seqsum <- getCachedObject(oname)
        }
    }
    return(seqsum)
}



#' from sequencing_summary.txt return integer describing runtime in hours
#'
#' from provided sequencing_summary.txt file return an integer best describing
#' the runtime in hours
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return integer of runtime in hours
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' importSequencingSummary(seqsumFile)
#' SequencingSummaryExtractRuntime()
#'
#' @export
SequencingSummaryExtractRuntime <- function(seqsum=NA) {
    seqsum <- handleSeqSumCache(seqsum)
    runtime <- max(seqsum[,"start_time"]) / 60 / 60
    expectedRuntimes <- c(4,8,12,24,36,48,64,72,96)
    temporaldistance <- sqrt((expectedRuntimes - runtime)^2)
    rruntime <- expectedRuntimes[[which(
        temporaldistance==min(temporaldistance))]]
    return(rruntime)
}




#' get flowcell id from sequencing_summary file
#'
#' The sequencing summary file contains a load of additional information;
#' this method will quickly extract and return the flowcell id
#'
#' @return character vector of flowcell id
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#'     "sequencing_summary.txt.bz2", package = "nanopoRe")
#' FCid <- SequencingSummaryFlowCellID(seqsumFile)
#'
#' @export
SequencingSummaryFlowCellID <- function(seqsumFile) {
    con <- file(seqsumFile, "r")
    sample_lines <- readLines(con, n=2)
    close(con)
    columns <- strsplit(sample_lines[1], "\t")[[1]]
    values <- strsplit(sample_lines[2], "\t")[[1]]
    if ("filename" %in% columns) {
        filename <- values[na.omit(match(columns, "filename"))]
        filenamesplit <- strsplit(filename, "_")[[1]]
        if (length(filenamesplit)==3) {
            return(filenamesplit[[1]])
        }
        if (length(filenamesplit)==15) {
            return(filenamesplit[[3]])
        }
    }
    return("RuleMissing")
}
sagrudd/nanopoRe documentation built on June 7, 2020, 10:20 p.m.