R/plotMethods.R

Defines functions plotVolcano.FRASER

#' 
#' Visualization functions for FRASER
#'
#' The FRASER package provides mutliple functions to visualize
#' the data and the results of a full data set analysis.
#'
#' This is the list of all plotting function provided by FRASER:
#' \itemize{
#'   \item plotAberrantPerSample()
#'   \item plotVolcano()
#'   \item plotExpression()
#'   \item plotQQ()
#'   \item plotExpectedVsObservedPsi()
#'   \item plotCountCorHeatmap()
#'   \item plotFilterExpression()
#'   \item plotFilterVariability()
#'   \item plotEncDimSearch()
#' }
#'
#' For a detailed description of each plot function please see the details.
#' Most of the functions share the same parameters.
#'
#### Data specific parameters
#' @param object,fds An \code{\link{FraserDataSet}} object.
#' @param type The psi type: either psi5, psi3 or theta (for SE).
#' @param sampleID A sample ID which should be plotted. Can also be a vector.
#'             Integers are treated as indices.
#' @param idx,site A junction site ID or gene ID or one of both, which
#'             should be plotted. Can also be a vector. Integers are treated
#'             as indices.
#' @param padjCutoff,zScoreCutoff,deltaPsiCutoff Significance, Z-score or delta
#'             psi cutoff to mark outliers
#' @param global Flag to plot a global Q-Q plot, default FALSE
#' @param normalized If TRUE, the normalized psi values are used, the default,
#'             otherwise the raw psi values
#' @param aggregate If TRUE, the pvalues are aggregated by gene (default), 
#'             otherwise junction level pvalues are used (default for Q-Q plot).
#' @param result The result table to be used by the method.
#' @param label Indicates the genes or samples that will be labelled in the 
#'             plot (only for \code{basePlot=TRUE}). Setting 
#'             \code{label="aberrant"} will label all aberrant genes or 
#'             samples. Labelling can be turned off by setting 
#'             \code{label=NULL}. The user can also provide a custom 
#'             list of gene symbols or sampleIDs.
#' @param BPPARAM BiocParallel parameter to use.
#' @param Ncpus Number of cores to use.
#' @param plotType The type of plot that should be shown as character string. 
#'             For plotEncDimSearch, it has to be either \code{"auc"} 
#'             for a plot of the area under the curve (AUC) or 
#'             \code{"loss"} for the model loss. For the correlation heatmap,
#'             it can be either \code{"sampleCorrelation"} for a
#'             sample-sample correlation heatmap or \code{"junctionSample"}
#'             for a junction-sample correlation heatmap.
#' @param onlyVariableIntrons Logical value indicating whether to show only 
#'              introns that also pass the variability filter. Defaults to 
#'              FALSE.
#' @param onlyExpressedIntrons Logical value indicating whether to show only 
#'              introns that also pass the expression filter. Defaults to 
#'              FALSE.
#'              
#### Graphical parameters
#' @param main Title for the plot, if missing a default title will be used.
#' @param colGroup Group of samples that should be colored.
#' @param basePlot if \code{TRUE} (default), use the R base plot version, else
#'             use the plotly framework.
#' @param conf.alpha If set, a confidence interval is plotted, defaults to 0.05
#' @param samplingPrecision Plot only non overlapping points in Q-Q plot to 
#'             reduce number of points to plot. Defines the digits to round to. 
#' @param logit If TRUE, the default, psi values are plotted in logit space.
#' @param nClust Number of clusters to show in the row and
#'             column dendrograms.
#' @param sampleClustering A clustering of the samples that should be used as an
#'             annotation of the heatmap.
#' @param show_rownames,show_colnames Logical value indicating whether to show
#'             row or column names on the heatmap axes.
#' @param annotation_col,annotation_row Row or column annotations that should be
#'             plotted on the heatmap.
#' @param topN,topJ Top x most variable junctions that should be used in the
#'             heatmap. TopN is used for sample-sample correlation heatmaps and
#'             topJ for junction-sample correlation heatmaps.
#' @param minMedian,minCount,minDeltaPsi Minimal median (\eqn{m \ge 1}), 
#'             delta psi (\eqn{|\Delta\psi| > 0.1}), read count (\eqn{n \ge 10})
#'             value of a junction to be considered for the correlation heatmap.
#' @param border_color Sets the border color of the heatmap
#' @param plotMeanPsi,plotCov If \code{TRUE}, then the heatmap is annotated with
#'             the mean psi values or the junction coverage.
#' @param bins Set the number of bins to be used in the histogram.
#' @param legend.position Set legend position (x and y coordinate), defaults to
#'             the top right corner.
#'
#### Additional ... parameter
#' @param ... Additional parameters passed to plot() or plot_ly() if not stated
#'             otherwise in the details for each plot function
#'
#' @details
#'
#' \code{plotAberrantPerSample}: The number of aberrant events per sample are
#' plotted sorted by rank. The ... parameters are passed on to the
#' \code{\link{aberrant}} function.
#'
#' \code{plotVolcano}: the volcano plot is sample-centric. It plots for a given
#' sample and psi type the negative log10 nominal P-values against the delta psi
#' values for all splice sites or aggregates by gene if requested.
#'
#' \code{plotExpression}: This function plots for a given site the
#' read count at this site (i.e. K) against the total coverage (i.e. N) for the
#' given psi type (\eqn{\psi_5, \psi_3, or \theta}{\psi5, \psi3, or \theta}
#' (SE)) for all samples.
#'
#' \code{plotQQ}: the quantile-quantile plot for a given gene or if
#' \code{global} is set to \code{TRUE} over the full data set. Here the
#' observed P-values are plotted against the expected ones in the negative
#' log10 space.
#'
#' \code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi
#' against the predicted psi for a given site.
#'
#' \code{plotCountCorHeatmap}: The correlation heatmap of the count data either
#' of the full data set (i.e. sample-sample correlations) or of the top x most
#' variable junctions (i.e. junction-sample correlations). By default the values
#' are log transformed and row centered. The ... arguments are passed to the
#' \code{\link[pheatmap]{pheatmap}} function.
#'
#' \code{plotFilterExpression}: The distribution of FPKM values. If the 
#' FraserDataSet object contains the \code{passedFilter} column, it will plot 
#' both FPKM distributions for the expressed introns and for the filtered 
#' introns.
#' 
#' \code{plotFilterVariability}: The distribution of maximal delta Psi values. 
#' If the FraserDataSet object contains the \code{passedFilter} column, 
#' it will plot both maximal delta Psi distributions for the variable 
#' introns and for the filtered (i.e. non-variable) introns.
#'
#' \code{plotEncDimSearch}: Visualization of the hyperparameter optimization.
#' It plots the encoding dimension against the achieved loss (area under the
#' precision-recall curve). From this plot the optimum should be choosen for
#' the \code{q} in fitting process.
#'
#' @return If base R graphics are used nothing is returned else the plotly or
#'             the gplot object is returned.
#'
#' @name plotFunctions
#' @rdname plotFunctions
#' @aliases plotFunctions plotAberrantPerSample plotVolcano plotQQ 
#'             plotExpression plotCountCorHeatmap plotFilterExpression 
#'             plotExpectedVsObservedPsi plotEncDimSearch
#' @examples
#' # create full FRASER object 
#' fds <- makeSimulatedFraserDataSet(m=40, j=200)
#' fds <- calculatePSIValues(fds)
#' fds <- filterExpressionAndVariability(fds, filter=FALSE)
#' # this step should be done for all splicing metrics and more dimensions
#' fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25))
#' fds <- FRASER(fds)
#' 
#' # QC plotting
#' plotFilterExpression(fds)
#' plotFilterVariability(fds)
#' plotCountCorHeatmap(fds, "theta")
#' plotCountCorHeatmap(fds, "theta", normalized=TRUE)
#' plotEncDimSearch(fds, type="psi5")
#' 
#' # extract results 
#' plotAberrantPerSample(fds)
#' plotVolcano(fds, "sample1", "psi5")
#' 
#' # dive into gene/sample level results
#' res <- results(fds)
#' res
#' plotExpression(fds, result=res[1])
#' plotQQ(fds, result=res[1])
#' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
#' 
#'
NULL


plotVolcano.FRASER <- function(object, sampleID, 
                    type=c("psi3", "psi5", "theta"), basePlot=TRUE, 
                    aggregate=FALSE, main=NULL, label=NULL,
                    deltaPsiCutoff=0.3, padjCutoff=0.1, ...){
    
    type <- match.arg(type)

    dt <- getPlottingDT(object, axis="col", type=type, idx=sampleID,
            aggregate=aggregate, deltaPsiCutoff=deltaPsiCutoff, 
            padjCutoff=padjCutoff, ...)
    
    g <- ggplot(dt, aes(x=deltaPsi, y=-log10(pval), color=aberrant, 
                        label=featureID, text=paste0(
            "SampleID: ", sampleID, "<br>",
            "featureID: ", featureID, "<br>",
            "Raw count (K): ", k, "<br>",
            "Raw total count (N): ", n, "<br>",
            "p value: ", signif(pval, 5), "<br>",
            "delta Psi: ", round(deltaPsi, 2), "<br>",
            "Type: ", type))) +
        geom_point(aes(alpha=ifelse(aberrant == TRUE, 1, 0.8))) +
        xlab(as.expression(
                bquote(paste(Delta, .(ggplotLabelPsi(type)[[1]]) ))
            )) +
        ylab(expression(paste(-log[10], "(P value)"))) +
        theme_bw() +
        theme(legend.position="none") +
        scale_color_manual(values=c("gray40", "firebrick"))
    
    if(!is.na(deltaPsiCutoff)){
        g <- g + 
            geom_vline(xintercept=c(-deltaPsiCutoff, deltaPsiCutoff),
                    color="firebrick", linetype=2)
    }
    
    if(!is.na(padjCutoff)){
        if(dt[,any(padj < padjCutoff)]){
            padj_line <- min(dt[padj < padjCutoff, -log10(pval)])
        }
        if(!"padj_line" %in% ls() || padj_line > 10 || is.na(padj_line)){
            padj_line <- 6
        }
        g <- g + 
            geom_hline(yintercept=padj_line, color="firebrick", linetype=4)
    }
    
    
    if(isFALSE(basePlot)){
        g <- g + xlab(paste("delta", 
                            ggplotLabelPsi(type, asCharacter=TRUE)[[1]])) +
            ylab("-log[10](P value)")
        if(is.null(main)){
            main <- paste0("Volcano plot: ", sampleID, ", ", 
                            ggplotLabelPsi(type, asCharacter=TRUE)[[1]])
        }
    } else{
        if(!is.null(label)){
            if(isScalarCharacter(label) && label == "aberrant"){
                if(nrow(dt[aberrant == TRUE,]) > 0){
                    g <- g + geom_text_repel(data=dt[aberrant == TRUE,],
                                        fontface='bold', hjust=-.2, vjust=-.2)
                }
            }
            else{
                if(nrow(dt[featureID %in% label]) > 0){
                    g <- g + geom_text_repel(data=
                                        subset(dt, featureID %in% label),
                                    fontface='bold', hjust=-.2, vjust=-.2)
                }
                if(any(!(label %in% dt[,featureID]))){
                    warning("Did not find gene(s) ", 
                            paste(label[!(label %in% dt[,featureID])], 
                                collapse=", "), " to label.")
                }
            }
        }
        
        if(is.null(main)){
            main <- as.expression(bquote(paste(
                bold("Volcano plot: "), .(sampleID), ", ",
                .(ggplotLabelPsi(type)[[1]]))))
        }  
    }
    g <- g + ggtitle(main)
    
    plotBasePlot(g, basePlot)
}

#'
#' Volcano plot
#'
#' Plots the p values over the delta psi values, known as volcano plot.
#' Visualizes per sample the outliers. By type and aggregate by
#' gene if requested.
#'
#' @rdname plotFunctions
#' @export
setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER)


plotAberrantPerSample.FRASER <- function(object, main, 
                    type=c("psi3", "psi5", "theta"),
                    padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.3,
                    aggregate=TRUE, BPPARAM=bpparam(), ...){

    type <- match.arg(type, several.ok=TRUE)

    if(missing(main)){
        main <- paste('Aberrant events per sample')
        if(isTRUE(aggregate)){
            main <- paste(main, "by gene")
        }
    }

    # extract outliers
    outliers <- bplapply(type, aberrant, object=object, by="sample",
            padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff,
            deltaPsiCutoff=deltaPsiCutoff, ..., BPPARAM=BPPARAM)
    dt2p <- rbindlist(lapply(seq_along(outliers), function(idx){
        vals <- outliers[[idx]]
        data.table(type=type[idx], value=sort(vals), median=median(vals),
                rank=seq_along(vals))
    }))

    # plot them
    g <- ggplot(dt2p, aes(x=rank, y=value, color=type)) +
        geom_line() +
        geom_hline(aes(yintercept=median, color=type, lty="Median")) +
        theme_bw() +
        theme_cowplot() +
        ggtitle(main) +
        xlab("Sample rank") +
        ylab("Number of outliers") +
        scale_color_brewer(palette="Dark2", name=element_blank(),
                labels=ggplotLabelPsi(dt2p[,unique(type)])) +
        scale_linetype_manual(name="", values=2, labels="Median")

    if(!all(dt2p[,value] == 0)){
        g <- g + scale_y_log10()
    }
    
    g
}

#'
#' Number of aberrant events per sample
#'
#' Plot the number of aberrant events per samples
#'
#' @rdname plotFunctions
#' @export
setMethod("plotAberrantPerSample", signature="FraserDataSet",
        plotAberrantPerSample.FRASER)


#'
#' Junction expression plot
#'
#' Plots the observed split reads of the junction of interest over all reads
#' coming from the given donor/acceptor.
#'
#' @rdname plotFunctions
#' @export
plotExpression <- function(fds, type=c("psi5", "psi3", "theta"),
                    site=NULL, result=NULL, colGroup=NULL, 
                    basePlot=TRUE, main=NULL, label="aberrant", ...){
    if(!is.null(result)){
        type <- as.character(result$type)
        site <- getIndexFromResultTable(fds, result)
    } else {
        type <- match.arg(type)
    }

    dt <- getPlottingDT(fds, axis="row", type=type, idx=site, ...)
    if(!is.null(colGroup)){
        if(all(colGroup %in% samples(fds))){
            colGroup <- samples(fds) %in% colGroup
        }
        dt[colGroup,aberrant:=TRUE]
    }
    dt[,aberrant:=factor(aberrant, levels=c("TRUE", "FALSE"))]

    if(is.null(main)){
        if(isTRUE(basePlot)){
            main <- as.expression(bquote(bold(paste(
                .(ggplotLabelPsi(type)[[1]]), " expression plot: ",
                bolditalic(.(as.character(dt[,unique(featureID)]))),
                " (site ", .(as.character(dt[,unique(idx)])), ")"))))
        } else{
            main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]], 
                        " expression plot: ", dt[,unique(featureID)], 
                        " (site ", dt[,unique(idx)], ")")
        }
    }

    g <- ggplot(dt, aes(x=n + 2, y=k + 1, color=aberrant, label=sampleID, 
                        text=paste0(
            "Sample: ", sampleID, "<br>",
            "Counts (K): ", k, "<br>",
            "Total counts (N): ", n, "<br>",
            "p value: ", signif(pval, 5), "<br>",
            "padjust: ", signif(padj, 5), "<br>",
            "Observed Psi: ", round(obsPsi, 2), "<br>",
            "Predicted mu: ", round(predPsi, 2), "<br>"))) +
        geom_point(alpha=ifelse(as.character(dt$aberrant) == "TRUE", 1, 0.7)) +
        scale_x_log10() +
        scale_y_log10() +
        theme_bw() +
        theme(legend.position="none", title=) +
        xlab("Total junction coverage + 2 (N)") +
        ylab("Junction count + 1 (K)") +
        ggtitle(main) +
        annotation_logticks(sides='bl')
    
    if(isTRUE(basePlot) && !is.null(label)){
        if(isScalarCharacter(label) && label == "aberrant"){
            if(nrow(dt[aberrant == TRUE,]) > 0){
                g <- g + geom_text_repel(data=dt[aberrant == TRUE,], 
                                        aes(col=aberrant),
                                        fontface='bold', hjust=-.2, vjust=-.2)
            }
        }
        else{
            if(nrow(dt[sampleID %in% label]) > 0){
                g <- g + geom_text_repel(data=subset(dt, sampleID %in% label), 
                                        aes(col=aberrant),
                                        fontface='bold', hjust=-.2, vjust=-.2)
            }
            if(any(!(label %in% dt[,sampleID]))){
                warning("Did not find sample(s) ", 
                        paste(label[!(label %in% dt[,sampleID])], 
                            collapse=", "), " to label.")
            }
        }
    }

    if(is.null(colGroup)){
        g <- g + scale_colour_manual(
                values=c("FALSE"="gray70", "TRUE"="firebrick"))
    }

    plotBasePlot(g, basePlot)
}


#'
#' Expected over Overserved plot
#'
#' Plots the expected psi value over the observed psi value of the given
#' junction.
#'
#' @rdname plotFunctions
#' @export
plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"),
                    idx=NULL, result=NULL, colGroup=NULL, main=NULL,
                    basePlot=TRUE, label="aberrant", ...){
    type <- match.arg(type)
    
    # get plotting data
    dt   <- getPlottingDT(fds, axis="row", type=type, result=result, 
            idx=idx, ...)
    type <- as.character(unique(dt$type))
    idx  <- unique(dt$idx)
    
    if(is.null(main)){
        if(isTRUE(basePlot)){
            main <- as.expression(bquote(bold(paste(
                .(ggplotLabelPsi(type)[[1]]), 
                " observed expression vs prediction plot: ",
                bolditalic(.(as.character(dt[,unique(featureID)]))),
                " (site ", .(as.character(idx)), ")"))))
        } else{
            main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]], 
                            " observed expression vs prediction plot: ", 
                            dt[,unique(featureID)], " (site ", idx, ")")
        }
    }
    
    if(!is.null(colGroup)){
        if(is.logical(colGroup)){
            dt[colGroup, aberrant:=TRUE]
        } else {
            warning("not implemented yet!")
        }
    }

    if(isTRUE(basePlot)){
        ylab <- bquote("Observed " ~ .(ggplotLabelPsi(type)[[1]]))
        xlab <- bquote("Predicted " ~ .(ggplotLabelPsi(type)[[1]]))
    } else{
        ylab <- paste("Observed", ggplotLabelPsi(type, asCharacter=TRUE)[[1]])
        xlab <- paste("Predicted", ggplotLabelPsi(type, asCharacter=TRUE)[[1]])
    }
    
    g <- ggplot(dt, aes(y=obsPsi, x=predPsi, label=sampleID, color=aberrant, 
            text=paste0(
            "Sample: ", sampleID, "<br>",
            "Counts (K): ", k, "<br>",
            "Total counts (N): ", n, "<br>",
            "p value: ", signif(pval, 5), "<br>",
            "padjust: ", signif(padj, 5), "<br>",
            "Observed Psi: ", round(obsPsi, 2), "<br>",
            "Predicted mu: ", round(predPsi, 2), "<br>"))) +
        geom_point(alpha=ifelse(dt$aberrant, 1, 0.5),
                color=c("gray70", "firebrick")[dt$aberrant + 1]) +
        geom_abline(intercept = 0, slope=1) +
        theme_bw() +
        theme(legend.position="none") +
        xlab(xlab) +
        ylab(ylab) +
        ggtitle(main)
    
    if(isTRUE(basePlot) && !is.null(label)){
        if(isScalarCharacter(label) && label == "aberrant"){
            if(nrow(dt[aberrant == TRUE,]) > 0){
                g <- g + geom_text_repel(data=dt[aberrant == TRUE,], 
                                        aes(col=aberrant),
                                        fontface='bold', hjust=-.2, vjust=-.2)
            }
        }
        else{
            if(nrow(dt[sampleID %in% label]) > 0){
                g <- g + geom_text_repel(data=subset(dt, sampleID %in% label), 
                                    aes(col=aberrant),
                                    fontface='bold', hjust=-.2, vjust=-.2)
            }
            if(any(!(label %in% dt[,sampleID]))){
                warning("Did not find sample(s) ", 
                        paste(label[!(label %in% dt[,sampleID])], 
                                collapse=", "), " to label.")
            }
        }
    }    

    if(is.null(colGroup)){
        g <- g + scale_colour_manual(
            values=c("FALSE"="gray70", "TRUE"="firebrick"))
    }
    
    plotBasePlot(g, basePlot)
}


plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, 
                    aggregate=FALSE, global=FALSE, main=NULL, conf.alpha=0.05,
                    samplingPrecision=3, basePlot=TRUE, label="aberrant",
                    Ncpus=min(3, getDTthreads()), ...){

    # check parameters
    if(is.null(aggregate)){
        aggregate <- isTRUE(global)
    } else if(!(is.logical(aggregate) |
                all(aggregate %in% colnames(mcols(object))))){
        stop("Please provide TRUE/FALSE or a ",
            "charactor matching a column name in mcols.")
    }

    if(isTRUE(global)){
        if(is.null(type)){
            type <- psiTypes
        }
        dt <- rbindlist(bplapply(type, getPlottingDT, fds=object, axis="col",
                idx=TRUE, aggregate=aggregate, Ncpus=Ncpus, ...))
        # remove duplicated entries donor/acceptor sites if not aggregated 
        # by a feature
        if(isFALSE(aggregate)){
            dt <- dt[!duplicated(dt, by=c("type", "spliceID", "sampleID"))]
        }
    } else {
        dots <- list(...)
        if(!"pvalLevel" %in% names(dots)){
            dots[["pvalLevel"]] <- "junction"
        }
        dots <- append(list(fds=object, axis="row", type=type, idx=idx, 
                            result=result, aggregate=aggregate), 
                        dots)
        dt <- do.call(getPlottingDT, args=dots)
    }
    if(is.null(main)){
        if(isTRUE(global)){
            main <- "Global QQ plot"
        } else {
            type <- as.character(dt[,unique(type)])
            featureID <- as.character(dt[,unique(featureID)])
            if(isTRUE(basePlot)){
                main <- as.expression(bquote(bold(paste(
                        .(ggplotLabelPsi(type)[[1]]),
                        " Q-Q plot: ", bolditalic(.(featureID)),
                        " (site ", .(as.character(dt[,unique(idx)])), ")"))))
            } else{
                main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]],
                                " Q-Q plot: ", featureID, 
                                " (site ", dt[,unique(idx)], ")")
            }
        }
    }


    # points
    dt2p <- dt[order(type, pval)]
    dt2p[,obs:=-log10(pval)]
    dt2p[is.na(obs), obs:=0]
    dt2p[is.infinite(obs), obs:=dt2p[is.finite(obs),max(obs)]]
    if(dt2p[,length(unique(obs))] < 2 | nrow(dt2p) < 2){
        warning("There are no pvalues or all are NA or 1!")
        return(FALSE)
    }
    dt2p[,exp:=-log10(ppoints(.N)),by=type]

    # down sample if requested
    dt2p[,plotPoint:=TRUE]
    if(isTRUE(samplingPrecision) | isScalarNumeric(samplingPrecision)){
        if(is.logical(samplingPrecision)){
            samplingPrecision <- 3
        }
        mypoints <- !duplicated(dt2p[,.(obs=round(obs, samplingPrecision), 
                        exp=round(exp, samplingPrecision), type)], 
                by=c("obs", "exp", "type"))
        dt2p[,plotPoint:=mypoints]
    }

    # create qq-plot
    g <- ggplot(dt2p[plotPoint == TRUE,], aes(x=exp, y=obs, col=aberrant, 
            label=sampleID,
            text=paste(
                "<br>SampleID: ", sampleID, "<br>K: ", k, "<br>N: ", n))) +
        geom_point() +
        theme_bw() +
        theme(legend.position="none") +
        ggtitle(main) 
    
    if(isTRUE(basePlot)){
        g <- g +
            xlab(expression(-log[10]~"(expected P)")) +
            ylab(expression(-log[10]~"(observed P)"))
    } else{
        g <- g +
            xlab("-log[10] (expected P)") +
            ylab("-log[10] (observed P)")
    }

    # Set color scale for global/local
    if(isFALSE(global)){
        g <- g + scale_color_manual(values=c("black", "firebrick"),
                name="Aberrant")
    } else {
        g$mapping$colour <- quote(type)
        g <- g + scale_color_brewer(palette="Dark2", name="Splice metric",
                labels=ggplotLabelPsi(unique(dt2p$type)))
    }


    # add confidence band if requesded
    # http://genome.sph.umich.edu/wiki/Code_Sample:_Generating_QQ_Plots_in_R
    if(is.numeric(conf.alpha)){
        dt2p[,rank:=seq_len(.N), by=type]
        dt2p[plotPoint == TRUE,upper:=-log10(
                qbeta(conf.alpha/2, rank, max(rank) - rank)), by=type]
        dt2p[plotPoint == TRUE,lower:=-log10(
                qbeta(1-conf.alpha/2, rank, max(rank) - rank)), by=type]
        # only plot one psiType if multiple are existing
        if(length(unique(dt2p$type)) > 1){
            typeOrder <- c("theta", "psi5", "psi3")
            type2take <- min(which(typeOrder %in% unique(dt2p$type)))
            dt2p[type != typeOrder[type2take], 
                    c("upper", "lower"):=list(NA, NA)]
        }
        g <- g + geom_ribbon(data=dt2p[plotPoint == TRUE & !is.na(upper),],
                aes(x=exp, ymin=lower, ymax=upper, text=NULL),
                alpha=0.2, color="gray")
    }
    
    # add labels if requested
    if(isFALSE(global) && isTRUE(basePlot) && !is.null(label)){
        if(isScalarCharacter(label) && label == "aberrant"){
            if(nrow(dt2p[aberrant == TRUE,]) > 0){
                g <- g + geom_text_repel(data=dt2p[aberrant == TRUE,], 
                                        aes(col=aberrant),
                                        fontface='bold', hjust=-.2, vjust=-.2)
            }
        }
        else{
            if(nrow(dt2p[sampleID %in% label]) > 0){
                g <- g + geom_text_repel(data=subset(dt2p, sampleID %in% 
                                                                        label), 
                                        aes(col=aberrant),
                                        fontface='bold', hjust=-.2, vjust=-.2)
            }
            if(any(!(label %in% dt2p[,sampleID]))){
                warning("Did not find sample(s) ", 
                        paste(label[!(label %in% dt2p[,sampleID])], 
                                collapse=", "), " to label.")
            }
        }
    }   

    # add abline in the end
    g <- g + geom_abline(intercept=0, slope=1, col="firebrick")
    if(isFALSE(global)){
        return(plotBasePlot(g, basePlot))
    }
    g
}

#'
#' Q-Q plot
#'
#' Plots the quantile-quantile plot
#'
#' @rdname plotFunctions
#' @export
setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER)


plotEncDimSearch.FRASER <- function(object, type=c("psi3", "psi5", "theta"), 
                    plotType=c("auc", "loss")){
    type <- match.arg(type)
    plotType <- match.arg(plotType)
    data <- hyperParams(object, type=type, all=TRUE)
    if (is.null(data)) {
        warning(paste("no hyperparameters were estimated for", type, 
                        "\nPlease use `optimHyperParams` to compute them."))
        return(NULL)
    }
    if(!"nsubset" %in% colnames(data)){
        data[,nsubset:="NA"]
    }
    data[,noise:=as.factor(noise)]
    data[,nsubset:=as.factor(nsubset)]

    if(plotType == "auc"){
        
        g1 <- ggplot(data, aes(q, aroc, col=nsubset, linetype=noise)) +
            geom_point() +
            geom_smooth(method="loess", formula=y~x) +
            ggtitle("Q estimation") +
            xlab("Estimated q") +
            ylab("Area under the PR curve") +
            theme_bw(base_size=16)
        g1
    }
    else{

        g2 <- ggplot(data, aes(q, eval, col=nsubset, linetype=noise)) +
            geom_point() +
            geom_smooth() +
            ggtitle("Q estimation") +
            xlab("Estimated q") +
            ylab("Model loss") +
            theme_bw(base_size=16)
        g2
    }

}

#'
#' Plots the results from the hyperparamter optimization.
#'
#' @rdname plotFunctions
#' @export
setMethod("plotEncDimSearch", signature="FraserDataSet", 
        plotEncDimSearch.FRASER)


#'
#' Plot filter expression
#'
#' Histogram of the geometric mean per junction based on the filter status
#'
#' @rdname plotFunctions
#' @export
plotFilterExpression <- function(fds, bins=200, legend.position=c(0.8, 0.8),
                                    onlyVariableIntrons=FALSE){
    
    # check that expression filter has been calculated
    if(!("passedExpression" %in% colnames(mcols(fds, type="j")))){
        stop("Please calculate the expression filter values first with the ",
                "filterExpression function.")
    }
    
    # get mean count for all junctions in the fds object
    cts    <- K(fds, "psi5")
    rowlgm <- exp(rowMeans(log(cts + 1)))

    dt <- data.table(
            value=rowlgm,
            passed=mcols(fds, type="j")[['passedExpression']])
    if(isTRUE(onlyVariableIntrons)){
        dt[,passed:=mcols(fds, type="j")[['passed']]]
    }
    dt[,passed:=factor(passed, levels=c(TRUE, FALSE))]
    
    colors <- brewer.pal(3, "Dark2")[seq_len(2)]
    ggplot(dt, aes(value, fill=passed)) +
        geom_histogram(bins=bins) +
        scale_x_log10() +
        scale_y_log10() +
        scale_fill_manual(values=colors, name="Passed",
                labels=c("True", "False")) +
        xlab("Mean Junction Expression") +
        ylab("Count") +
        ggtitle("Expression filtering") +
        theme_bw() +
        theme(legend.position=legend.position)
}

#'
#' Plot filter variability
#'
#' Histogram of minimal delta psi per junction
#'
#' @rdname plotFunctions
#' @export
plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8),
                                    onlyExpressedIntrons=FALSE){
    
    # check that expression filter has been calculated
    if(!("passedVariability" %in% colnames(mcols(fds, type="j")))){
        stop("Please calculate the expression filter values first with the ",
                "filterVariability function.")
    }
    
    # get plotting data
    dt <- data.table(
        value=pmax(mcols(fds, type="j")[['maxDPsi3']], 
                    mcols(fds, type="j")[['maxDPsi5']],
                    mcols(fds, type="j")[['maxDThetaDonor']],
                    mcols(fds, type="j")[['maxDThetaAcceptor']]),
        passed=mcols(fds, type="j")[['passedVariability']])
    if(isTRUE(onlyExpressedIntrons)){
        dt[,passed:=mcols(fds, type="j")[['passed']]]
    }
    
    # check if file with removed counts exists and add them when it exists
    nonVarDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds),
                            "nonVariableJunctions")

    if(dir.exists(nonVarDir)){
        nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarDir) 
        nonVar_dt <- data.table(
            value=pmax(mcols(nV_stored)[['maxDPsi3']], 
                        mcols(nV_stored)[['maxDPsi5']],
                        mcols(nV_stored)[['maxDThetaDonor']],
                        mcols(nV_stored)[['maxDThetaAcceptor']]),
            passed=FALSE)
        dt <- rbind(dt, nonVar_dt)
    }
    
    dt[,passed:=factor(passed, levels=c(TRUE, FALSE))]
    colors <- brewer.pal(3, "Dark2")[seq_len(2)]
    ggplot(dt, aes(value, fill=passed)) +
        geom_histogram(bins=bins) +
        scale_y_log10() + 
        scale_fill_manual(values=colors, name="Passed",
                            labels=c("True", "False")) +
        xlab(bquote("Maximal Junction" ~ Delta*Psi[5] ~ "or" ~ Delta*Psi[3])) +
        ylab("Count") +
        ggtitle("Variability filtering") +
        theme_bw() +
        theme(legend.position=legend.position)
}


plotCountCorHeatmap.FRASER <- function(object,
                    type=c("psi5", "psi3", "theta"), logit=FALSE, topN=50000, 
                    topJ=5000, minMedian=1, minCount=10, 
                    main=NULL, normalized=FALSE, show_rownames=FALSE,
                    show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA,
                    annotation_row=NA, border_color=NA, nClust=5,
                    plotType=c("sampleCorrelation", "junctionSample"),
                    sampleClustering=NULL, plotMeanPsi=TRUE, plotCov=TRUE, ...){

    type <- match.arg(type)
    plotType <- match.arg(plotType)

    # use counts as matrix, otherwise x(fds,...) does not work later on
    counts(object, type=type, side="other", HDF5=FALSE)      <-
        as.matrix(counts(object, type=type, side="other"))
    counts(object, type=type, side="ofInterest", HDF5=FALSE) <-
        as.matrix(counts(object, type=type, side="ofInterest"))

    kmat <- K(object, type=type)
    nmat <- N(object, type=type)

    expRowsMedian <- rowMedians(kmat) >= minMedian
    expRowsMax    <- rowMax(kmat)     >= minCount
    table(expRowsMax & expRowsMedian)

    skmat <- kmat[expRowsMax & expRowsMedian,]
    snmat <- nmat[expRowsMax & expRowsMedian,]

    xmat <- (skmat + 1)/(snmat + 2)
    if(isTRUE(logit)){
        xmat <- qlogisWithCap(xmat)
    }
    xmat[snmat < minCount] <- NA
    xmat_rc    <- xmat - rowMeans(xmat, na.rm=TRUE)

    xmat_rc_sd <- rowSds(xmat_rc, na.rm=TRUE)
    plotIdx <- rank(xmat_rc_sd) >= length(xmat_rc_sd) - topN
    xmat_rc_2_plot <- xmat_rc[plotIdx,]
    cormatS <- cor(xmat_rc_2_plot, use="pairwise", method="spearman")
    if(isTRUE(normalized)){
        pred_mu <- as.matrix(predictedMeans(object, type=type)[
            expRowsMax & expRowsMedian,][plotIdx,])
        if(isTRUE(logit)){
            pred_mu <- qlogisWithCap(pred_mu)
        }
        pred_mu[(snmat < minCount)[plotIdx,]] <- NA
        lpred_mu_rc <- pred_mu - rowMeans(pred_mu, na.rm=TRUE)
        xmat_rc_2_plot <- xmat_rc_2_plot - lpred_mu_rc
    }
    cormat <- cor(xmat_rc_2_plot, use="pairwise", method="spearman")
    breaks <- seq(-1, 1, length.out=50)

    if(plotType == "junctionSample"){

        if(isTRUE(normalized)){
            pred_mu <- as.matrix(predictedMeans(object, type=type)[
                expRowsMax & expRowsMedian,])
            if(isTRUE(logit)){
                pred_mu <- qlogisWithCap(pred_mu)
            }
            lpred_mu_rc <- pred_mu - rowMeans(pred_mu)
            xmat_rc <- xmat_rc - lpred_mu_rc
        }

        object <- object[expRowsMax & expRowsMedian,,by=type]
        j2keepVa <- variableJunctions(object, type, minDeltaPsi)
        j2keepDP <- rowQuantiles(kmat[expRowsMax & expRowsMedian,],
                                    probs=0.75) >= 10
        j2keep <- j2keepDP & j2keepVa
        xmat_rc_2_plot <- xmat_rc[j2keep,]
        mostVarKeep <- subsetKMostVariableJunctions(object[j2keep,,by=type],
                                                    type, topJ)
        xmat_rc_2_plot <- xmat_rc_2_plot[mostVarKeep,]
        rownames(xmat_rc_2_plot) <- seq_len(nrow(xmat_rc_2_plot))
        breaks <- seq(-5, 5, length.out=50)

    }


    if(is.character(annotation_col)){
        annotation_col <- getColDataAsDFFactors(object, annotation_col)
    }
    if(is.character(annotation_row)){
        annotation_row <- getColDataAsDFFactors(object, annotation_row)
    }

    # annotate with sample clusters
    if(is.null(sampleClustering)){
        # annotate samples with clusters from sample correlation heatmap
        nClust <- min(nClust, nrow(cormatS))
        clusters <- as.factor(cutree(hclust(dist(cormatS)), k=nClust))
    } else if(!is.na(sampleClustering)){
        clusters <- sampleClustering
    }

    if(!isTRUE(is.na(sampleClustering))){
        if(!is.null(nrow(annotation_col))){
            annotation_col$sampleCluster <- clusters
        } else {
            annotation_col <- data.frame(sampleCluster=clusters)
        }
    }


    if(plotType == "junctionSample"){

        # annotate junctions with meanPsi and meanCoverage
        xmat <- xmat[j2keep,]
        xmat <- xmat[mostVarKeep,]
        meanPsi <- if(isTRUE(logit)){
            rowMeans(plogis(xmat))
        } else{
            rowMeans(xmat)
        }
        meanPsiBins <- cut(meanPsi, breaks = c(0, 0.33, 0.66, 1),
                            include.lowest=TRUE)
        if(isTRUE(plotMeanPsi)){
            if(!is.null(nrow(annotation_row))){
                annotation_row$meanPsi <- meanPsiBins
            } else{
                annotation_row <- data.frame(meanPsi=meanPsiBins)
            }
        }

        snmat <- snmat[j2keep,]
        snmat <- snmat[mostVarKeep,]
        meanCoverage <- rowMeans(snmat)
        cutpoints <- sort(unique(round(log10(meanCoverage))))
        if(max(cutpoints) < ceiling(log10(max(meanCoverage)))){
            cutpoints <- c(cutpoints, ceiling(log10(max(meanCoverage))))
        }
        meanCoverage <- cut(meanCoverage, breaks=10^(cutpoints),
                            include.lowest=TRUE)

        if(isTRUE(plotCov)){
            annotation_row$meanCoverage <- meanCoverage
        }
        if(isTRUE(nrow(annotation_row) > 0)){
            rownames(annotation_row) <- rownames(xmat_rc_2_plot)
        }
        cormat <- xmat_rc_2_plot
    }

    if(is.null(main)){
        main <- ifelse(normalized, "Normalized intron-centered ", 
                        "Raw intron-centered ")
        if(plotType == "sampleCorrelation"){
            if(isTRUE(logit)){
                main <- paste0(main, "Logit(PSI) correlation (", type, ")")
            } else {
                main <- paste0(main, "PSI correlation (", type, ")")
            }
        } else {
            if(isTRUE(logit)){
                main <- paste0(main, "Logit(PSI) data (", type, ", top ", topJ, 
                                ")")
            } else {
                main <- paste0(main, "PSI data (", type, ", top ", topJ, ")")
            }
        }
    }

    pheatmap(cormat, show_rownames=show_rownames, show_colnames=show_colnames,
            main=main, annotation_col=annotation_col, breaks=breaks,
            annotation_row=annotation_row, ..., border_color=border_color,
            color=colorRampPalette(colors=rev(brewer.pal(11, "RdBu")))(50)
    )
}

#'
#' Plot count correlation
#'
#' Count correlation heatmap function
#'
#' @rdname plotFunctions
#' @export
setMethod("plotCountCorHeatmap", signature="FraserDataSet", 
        plotCountCorHeatmap.FRASER)

#'
#' helper function to get the annotation as data frame from the col data object
#'
#' @noRd
getColDataAsDFFactors <- function(fds, names){
    tmpDF <- data.frame(colData(fds)[,names])
    colnames(tmpDF) <- names
    for(i in names){
        if(any(is.na(tmpDF[, i]))){
            tmpDF[,i] <- as.factor(paste0("", tmpDF[,i]))
        }
        if(is.integer(tmpDF[,i]) && length(levels(as.factor(tmpDF[,i]))) <= 10){
            tmpDF[,i] <- as.factor(paste0("", tmpDF[,i]))
        }
    }
    rownames(tmpDF) <- rownames(colData(fds))
    return(tmpDF)
}

#' 
#' used to cap the qlogis for the correlation heatmap
#' 
#' @noRd
qlogisWithCap <- function(x, digits=2){
    x <- round(x, digits)
    x <- pmin(pmax(x, 10^-digits), 1-10^-digits)
    ans <- qlogis(x)
    ans[is.infinite(ans)] <- NA
    rowm <- rowMaxs(ans, na.rm=TRUE)
    idx <- which(is.na(ans), arr.ind=TRUE)
    ans[idx] <- rowm[idx[,"row"]]
    return(ans)
}

#'
#' Helper to get nice Splice metric labels in ggplot
#'
#' @noRd
ggplotLabelPsi <- function(type, asCharacter=FALSE){
    type <- as.character(type)
    if(isFALSE(asCharacter)){
        vapply(type, FUN=function(x)
            switch (x,
                    psi5 = c(bquote(psi[5])),
                    psi3 = c(bquote(psi[3])),
                    theta = c(bquote(theta))),
            FUN.VALUE=c(bquote(psi[3])))
    } else{
        vapply(type, FUN=function(x)
            switch (x,
                    psi5 = "psi[5]",
                    psi3 = "psi[3]",
                    theta = "theta"),
            FUN.VALUE=character(1))
    }
}

Try the FRASER package in your browser

Any scripts or data that you put into this service are public.

FRASER documentation built on Feb. 3, 2021, 2:01 a.m.