R/tx_plots.R

Defines functions tx_plot_metageneExons tx_plot_metageneRegions tx_plot_metageneAtCDS tx_plot_ggseqlogo tx_plot_numeric tx_plot_staEndCov tx_plot_nucFreq

Documented in tx_plot_ggseqlogo tx_plot_metageneAtCDS tx_plot_metageneExons tx_plot_metageneRegions tx_plot_nucFreq tx_plot_numeric tx_plot_staEndCov

#' Nucleotide frequency plot
#'
#' Generates a ggplot barplot using the nucleotide frequency counts in
#' data.tables generated by tx_covNucFreqDT().
#'
#' @param DT data.table. Input data from which to generate the plot
#' @param gene character. Name of the gene in DT which wants to be plotted.
#' @param txRange integer. Range in data to be used, 'txcoor' column is used to
#' delimit this range in the data.table.
#' @param removeInsert logical. Removes counts from insert reads, which do not
#' call for a nucleotide, are just the gap between read1 and read2.
#' @param show_yLabels logical. If set to FALSE hides the y axis labels.
#' @param bar_border logical. If set to FALSE removes the border of the bars.
#' @param showLegend logical. If set to FALSE does not renger a legend.
#'
#' @return ggplot
#' @export
tx_plot_nucFreq <- function(DT,
                            gene,
                            txRange = 1:nrow(DT),
                            removeInsert = TRUE,
                            show_yLabels = TRUE,
                            bar_border = TRUE,
                            showLegend = TRUE){
    check_refSeq(DT)
    DT <- check_DT(DT)
    allCols <- all(c("refSeq", "A", "C", "G", "T", "-", "N") %in% names(DT))
    if(!allCols){
        stop("DT must containg the reference sequence column 'refSeq' along ",
             "nucleotide frequency columns 'A', 'C', 'G', 'T', 'N', and '-'")
    }
    if(!(gene %in% DT$gene)){stop("gene not found in DT object")}
    DT <- DT[DT$gene == gene,]
    DT <- DT[DT$txcoor %in% txRange,]
    DT$REF <- 0
    nucsInRef <- unique(DT$refSeq)
    for(i in nucsInRef){
        DT[which(DT$refSeq == i),"REF"] <- DT[which(DT$refSeq == i), i,  with = F]
        DT[which(DT$refSeq == i), i] <- 0
    }
    DT$pos <- paste(DT$txcoor, DT$refSeq, sep = "-")
    DT$pos <- factor(DT$pos, levels = DT$pos)
    tmpData <- tidyr::pivot_longer(DT, cols = c(txtools::IUPAC_code_simpl, "REF"),
                                   values_to = "counts", names_to = "nuc") %>% data.table::data.table()
    if(removeInsert){
        tmpData <- tmpData[tmpData$nuc != ".",]
        tmpData$nuc <- factor(tmpData$nuc, levels = c("REF", "A", "C", "G", "T", "-", "N"))
        tmpGG <- ggplot2::ggplot(tmpData, ggplot2::aes(x = .data$pos,
                                                       y = .data$counts,
                                                       fill = .data$nuc)) +
            ggplot2::theme_minimal() + scale_fill_txBrowser_2() +
            ggplot2::ylab("Frequency") + ggplot2::xlab("Transcriptome coordinate") +
            ggplot2::theme(legend.position="bottom") +
            ggplot2::guides(fill= ggplot2::guide_legend(nrow = 1, byrow=TRUE, title = "")) +
            ggplot2::ggtitle(gene)
    }else{
        tmpData$nuc <- factor(tmpData$nuc, levels = c(".", "REF", "A", "C", "G", "T", "-", "N"))
        tmpGG <- ggplot2::ggplot(tmpData, ggplot2::aes(x = .data$pos,
                                                       y = .data$counts,
                                                       fill = .data$nuc)) +
            ggplot2::theme_minimal() + scale_fill_txBrowser() +
            ggplot2::ylab("Frequency") + ggplot2::xlab("Transcriptome coordinate") +
            ggplot2::theme(legend.position="bottom") +
            ggplot2::guides(fill= ggplot2::guide_legend(nrow = 1, byrow = TRUE, title = "")) +
            ggplot2::ggtitle(gene)
    }
    if(bar_border){
        tmpGG <- tmpGG + ggplot2::geom_bar(stat = "identity", colour = "black", size = 0.3)
    }else{
        tmpGG <- tmpGG + ggplot2::geom_bar(stat = "identity")
    }
    if(show_yLabels & length(txRange) < 200){
        nucCols <- txBrowser_pal()(6)[-1:-2][as.numeric(factor(DT$refSeq))]
        tmpGG <- suppressWarnings(
            tmpGG +
                ggplot2::theme(axis.text.x =
                                   ggplot2::element_text(angle = 90, hjust = 1,
                                                         vjust = 0.5,
                                                         colour = nucCols,
                                                         face = "bold")))
    }else{
        tmpGG <- tmpGG + ggplot2::theme(axis.text.x = ggplot2::element_blank())
    }
    if(!showLegend){
        tmpGG <- tmpGG + ggplot2::theme(legend.position = "none")
    }
    tmpGG
}




#' Transcript coverage plot highlighting read-starts and read-ends counts
#'
#' @param DT data.table. A table as output by the \code{\link{tx_makeDT_coverage}}(),
#' \code{\link{tx_makeDT_nucFreq}}() or \code{\link{tx_makeDT_covNucFreq}}() functions.
#' @param gene character. Name of the gene in DT which wants to be plotted.
#' @param txRange integer. Range in data to be used, 'txcoor' column is used to
#' delimit this range in the data.table.
#' @param removeCov logical. If set to TRUE remove coverage counts.
#' @param show_yLabels logical. If set to FALSE hides the y axis labels.
#' @param bar_border logical. If set to FALSE removes the border of the bars.
#' @param showLegend logical. If set to FALSE does not renger a legend.
#'
#' @return ggplot
#' @export
tx_plot_staEndCov <- function(
        DT, gene, txRange = 1:nrow(DT), removeCov = FALSE,
        show_yLabels = TRUE, bar_border = TRUE, showLegend = TRUE){
    check_refSeq(DT)
    DT <- check_DT(DT)
    if(!(gene %in% DT$gene)){stop("gene not found in DT object")}
    DT <- DT[DT$gene == gene,]
    DT <- DT[DT$txcoor %in% txRange,]
    DT$pos <- paste(DT$txcoor, DT$refSeq, sep = "-")
    DT$pos <- factor(DT$pos, levels = DT$pos)
    DT$cov <- DT$cov - DT$start_5p - DT$end_3p
    if(removeCov){DT$cov <- 0}
    tmpData <- tidyr::pivot_longer(DT, cols = c("start_5p", "end_3p", "cov"),
                                   values_to = "counts", names_to = "coverage") %>%
        data.table::data.table()
    tmpData$coverage <- factor(tmpData$coverage, levels = c("cov",
                                                            "start_5p",
                                                            "end_3p"))
    tmpGG <- ggplot2::ggplot(tmpData,
                             ggplot2::aes(x = .data$pos,
                                          y = .data$counts,
                                          fill = .data$coverage)) +
        ggplot2::theme_minimal() +
        ggplot2::scale_fill_manual(values = c("#c2c2c2", "#5b54a0", "#f1876d")) +
        ggplot2::ylab("Frequency") + ggplot2::xlab("Transcriptome coordinate") +
        ggplot2::theme(legend.position="bottom") +
        ggplot2::guides(fill = ggplot2::guide_legend(nrow=1, byrow=TRUE, title = "")) +
        ggplot2::ggtitle(gene)
    if(bar_border){
        tmpGG <- tmpGG + ggplot2::geom_bar(stat = "identity",
                                           colour = "black",
                                           size = 0.3)
    }else{
        tmpGG <- tmpGG + ggplot2::geom_bar(stat = "identity")
    }
    if(show_yLabels & length(txRange) < 200){
        nucCols <- txBrowser_pal()(6)[-1:-2][as.numeric(factor(DT$refSeq))]
        tmpGG <- suppressWarnings(
            tmpGG +
                ggplot2::theme(axis.text.x =
                                   ggplot2::element_text(angle = 90,
                                                         hjust = 1,
                                                         vjust = 0.5,
                                                         colour = nucCols,
                                                         face = "bold")))
    }else{
        tmpGG <- tmpGG + ggplot2::theme(axis.text.x = ggplot2::element_blank())
    }
    if(!showLegend){
        tmpGG <- tmpGG + ggplot2::theme(legend.position = "none")
    }
    tmpGG
}

#' Numeric plot
#'
#' Plot numeric variables in a txDT by gene and in a specified range
#'
#' If the range exceeds 250 bp the labels in the x-axis are not shown by default
#' unless force_xAxis is set to TRUE.
#'
#' @param DT data.table. Input data from which to generate the plot
#' @param gene character
#' @param colVars character. Names of columns for which values will be extracted
#' @param txRange integer. Range in data to be used, 'txcoor' column is used to
#' delimit this range in the data.table.
#' @param plot_type character. Type of plot to be output, either "lineplot" or "barplot".
#' @param lwd numeric. Width of line (lineplot)
#' @param addPoints logical. Add points to line (lineplot)
#' @param scales character. Scales the y axis accordingly. Either 'fixed',
#'  by default, or 'free_y' (barplot)
#' @param show_yLabels logical. If set to FALSE hides the y axis labels.
#' @param showLegend logical. If set to FALSE does not renger a legend.
#'
#' @return ggplot
#' @export
#'
#' @examples
#' # Barplot using data from the Use-case-#3
#' tx_plot_numeric(DT = sc_txDTL[[1]], gene = "18s", txRange = 100:150,
#' colVars = c("start_5p", "end_3p", "cov"), plot_type = "barplot", scales = "free")
#'
#' # Lineplot using data from the Use-case-#3
#' tx_plot_numeric(DT = sc_txDTL[[1]], gene = "18s", txRange = 100:150,
#' colVars = c("start_5p", "end_3p", "cov"), plot_type = "lineplot")
tx_plot_numeric <- function(DT, gene, colVars, txRange = 1:nrow(DT),
                            plot_type = "lineplot", lwd = 1, addPoints = TRUE,
                            scales = "fixed", show_yLabels = TRUE, showLegend = TRUE){
    check_refSeq(DT)
    DT <- check_DT(DT)
    DT <- as.data.frame(DT)
    isNum <- unlist(lapply(colVars, function(x) is.numeric(DT[[x]])))
    if(!all(isNum)){
        stop("Selected column(s): ", paste(colVars[!isNum], collapse = ", ") ," are not numeric")
    }
    if(!(gene %in% DT$gene)){stop("gene not found in DT object")}
    DT <- DT[DT$gene == gene, ]
    DT <- DT[DT$txcoor %in% txRange,]
    DT$pos <- paste(DT$txcoor, DT$refSeq, sep = "-")
    DT$pos <- factor(DT$pos, levels = DT$pos)
    tmpData <- tidyr::pivot_longer(DT, cols = tidyr::all_of(colVars),
                                   values_to = "counts",
                                   names_to = "variable") %>%
        data.table::data.table()
    tmpData$variable <- factor(tmpData$variable, levels = colVars)
    if(plot_type == "barplot"){
        tmpGG <- ggplot2::ggplot(tmpData,
                                 ggplot2::aes(x = tmpData$pos,
                                              y = tmpData$counts,
                                              fill = tmpData$variable)) +
            ggplot2::geom_bar(stat = "identity") +
            ggplot2::guides(fill = ggplot2::guide_legend(nrow = 1, byrow = TRUE, title = "")) +
            ggplot2::scale_fill_brewer(palette = "Set1")
        if(length(colVars) > 1){
            tmpGG <- tmpGG + ggplot2::facet_grid(.data$variable~., scales = scales)
        }
    }else if(plot_type == "lineplot"){
        tmpGG <- ggplot2::ggplot(tmpData,
                                 ggplot2::aes(x = tmpData$pos,
                                              y = tmpData$counts,
                                              colour = tmpData$variable,
                                              group = tmpData$variable)) +
            ggplot2::geom_line(lwd = lwd) +
            ggplot2::guides(color = ggplot2::guide_legend(nrow = 1, byrow = TRUE, title = ""))
        ggplot2::scale_color_brewer(palette = "Set1")
        if(addPoints){tmpGG <- tmpGG + ggplot2::geom_point(alpha = 0.5)}
    }else{stop("plot_type must be either lineplot or barplot")}
    tmpGG <- tmpGG +
        ggplot2::theme_bw() +
        ggplot2::xlab("Transcriptome coordinate") +
        ggplot2::theme(legend.position="top") +
        ggplot2::ggtitle(gene) +
        ggplot2::ylab("Value")
    if(show_yLabels & length(txRange) < 250){
        nucCols <- txBrowser_pal()(6)[-1:-2][as.numeric(factor(DT$refSeq))]
        tmpGG <- suppressWarnings(
            tmpGG +
                ggplot2::theme(axis.text.x =
                                   ggplot2::element_text(angle = 90,
                                                         hjust = 1,
                                                         vjust = 0.5,
                                                         colour = nucCols,
                                                         face = "bold")))
    }else{
        tmpGG <- tmpGG + ggplot2::theme(axis.text.x = ggplot2::element_blank())
    }
    if(!showLegend){
        tmpGG <- tmpGG + ggplot2::theme(legend.position = "none")
    }
    tmpGG
}

#' Plot motif centered in logical annotation
#'
#' Plots a motif of the sequence surrounding sites marked as TRUE in a logical
#' vector in a
#'
#' @param DT data.table. A table as output by the \code{\link{tx_makeDT_coverage}}(),
#' \code{\link{tx_makeDT_nucFreq}}() or \code{\link{tx_makeDT_covNucFreq}}() functions.
#' @param logi_col character. Name of column of logical class, which indicates queried sites
#' @param upFlank numeric. Up-stream flank length
#' @param doFlank numeric. Down-stream flank length
#' @param method character. Height method, can be either "bits" or "probability". Default = "bits".
#'
#' @return ggplot
#' @export
 <- function(DT, logi_col, upFlank, doFlank, method = "bits"){
    tmpO <- tx_get_flankSequence(DT = DT, logi_col = logi_col, upFlank = upFlank, doFlank = doFlank)
    ggOUT <- ggseqlogo::ggseqlogo(tmpO, method = method) + ggplot2::theme_minimal() +
        ggplot2::ggtitle(paste0("SegLogo at '", logi_col, "' sites"), paste("n =", length(tmpO)))
    if(method == "bits"){
        ggOUT + ggplot2::ylim(0,2)
    }else{
        ggOUT
    }
}


#' Plot metagene at CDS
#'
#' @param txDT data.table. A table as output by the \code{\link{tx_makeDT_coverage}}(),
#' \code{\link{tx_makeDT_nucFreq}}() or \code{\link{tx_makeDT_covNucFreq}}() functions.
#' @param geneAnnot GRanges. Gene annotation as loaded by \code{\link{tx_load_bed}}().
#' @param colVars character. Names of columns for which values will be extracted
#' @param CDS_align character. Either "start", "end", or "spliceSite" depending on the desired
#' alignment, either to CDS start, CDS end, or splicing sites, respectively.
#' @param upFlank numeric. Up-stream flank length
#' @param doFlank numeric. Down-stream flank length
#' @param summ_fun character. Summing function either "sum" or "mean". Default: "mean"
#' @param roll_fun character. Rolling function either "sum" or "mean". Default: NULL
#' @param roll_n numeric. Window size for rolling function
#' @param roll_align character. Either "center" (default), "left" or "right"
#' @param roll_fill vector. Either an empty vector (no fill), or a vector
#' (recycled to) length 3 giving left, middle and right fills. NA by default
#' @param smooth logical. Set to FALSE for not smoothing line.
#' @param spar numeric. Smoothing parameter, typically (but not necessarily) in (0,1].
#' @param na.rm logical. Omit all NAs from computations. Default: TRUE
#' @param normalize logical. If set to TRUE, values are normalized so that the
#' area bellow the curve approximates to 1 for each variable.
#' @param tick_by numeric. Distance between ticks in plot. Default: upFlank/2
#'
#' @seealso Smoothing function: \code{\link[stats]{smooth.spline}}().
#'
#' @return ggplot
#' @export
tx_plot_metageneAtCDS <- function(txDT, geneAnnot, colVars, CDS_align, upFlank,
                                  doFlank, summ_fun = "mean", roll_fun = NULL, roll_n = 100,
                                  roll_align = "center", roll_fill = NA, smooth = TRUE, spar  = 0.3,
                                  na.rm = TRUE, normalize = FALSE, tick_by = NULL){
    tmpO <- tx_get_metageneAtCDS(txDT = txDT, geneAnnot = geneAnnot, colVars = colVars,
                                 CDS_align = CDS_align, upFlank = upFlank, doFlank = doFlank)
    tmpDF <- lapply(names(tmpO), function(x){
        if(summ_fun == "sum"){
            tmp2 <- colSums(tmpO[[x]], na.rm = na.rm)
        }else if(summ_fun == "mean"){
            tmp2 <- colMeans(tmpO[[x]], na.rm = na.rm)
        }else{stop("Argument 'summ_fun' has to be either sum or mean")}
        if(is.null(roll_fun)){
            tmp3 <- tmp2
        }else if(roll_fun == "sum"){
            tmp3 <- RcppRoll::roll_sum(tmp2, n = roll_n, align = roll_align, fill = roll_fill, na.rm = na.rm)
        }else if(roll_fun == "mean"){
            tmp3 <- RcppRoll::roll_mean(tmp2, n = roll_n, align = roll_align, fill = roll_fill, na.rm = na.rm)
        }else{stop("Argument 'roll_fun' has to be either sum or mean")}
        tmpDF <- data.frame(value = tmp3, position = factor(names(tmp2), levels = names(tmp2)), group = x)
        if(smooth){
            tmpDF[!is.na(tmpDF$value), ]$value <- stats::smooth.spline(tmpDF[!is.na(tmpDF$value), ]$value, spar = spar)$y
        }
        if(normalize){
            tmpDF$value <- tmpDF$value / sum(tmpDF$value, na.rm = TRUE) * sum(upFlank, doFlank, 1)
        }
        tmpDF
    }) %>% do.call(what = "rbind") %>% data.table::data.table()
    if(is.null(tick_by)){
        tick_by <- upFlank / 2
    }
    tmpGG <- ggplot2::ggplot(tmpDF, ggplot2::aes(x = .data$position, y = .data$value,
                                                 group = .data$group, colour = .data$group)) +
        ggplot2::geom_line(ggplot2::aes(x = .data$position), size = 1) +
        ggplot2::scale_x_discrete(limits = unique(tmpDF$position),
                                  breaks = unique(tmpDF$position)[seq(1, length(unique(tmpDF$position)), by = tick_by)]) +
        ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) +
        ggplot2::xlab("Relative position") + ggplot2::ylab("Value")
    if(CDS_align == "end"){
        tmpGG <- tmpGG + ggplot2::geom_vline(xintercept = "CDS_end", col = "black", linetype = "dashed") +
            ggplot2::ggtitle("Metagene aligned at CDS_end")
    }else if(CDS_align == "start"){
        tmpGG <- tmpGG + ggplot2::geom_vline(xintercept = "CDS_start", col = "black", linetype = "dashed") +
            ggplot2::ggtitle("Metagene aligned at CDS_start")
    }else if(CDS_align == "spliceSite"){
        tmpGG <- tmpGG + ggplot2::geom_vline(xintercept = "spliceSite", col = "black", linetype = "dashed") +
            ggplot2::ggtitle("Metagene aligned at spliceSite")
    }
    tmpGG
}


#' Plot metagene by regions
#'
#' @param txDT data.table. A table as output by the \code{\link{tx_makeDT_coverage}}(),
#' \code{\link{tx_makeDT_nucFreq}}() or \code{\link{tx_makeDT_covNucFreq}}() functions.
#' @param geneAnnot GRanges. Gene annotation as loaded by \code{\link{tx_load_bed}}().
#' @param colVars character. Names of columns for which values will be extracted
#' @param nBins_5UTR integer. Number of bins into which allocate data on 5'UTR regions
#' @param nBins_CDS integer. Number of bins into which allocate data on CDS regions
#' @param nBins_3UTR  integer. Number of bins into which allocate data on 3'UTR regions
#' @param summ_fun character. Summarizing function either "sum" or "mean". Default: "mean"
#' @param smooth logical. Set to FALSE for not smoothing line.
#' @param spar numeric. Smoothing parameter, typically (but not necessarily) in (0,1].
#' @param nCores integer. Number of cores to run the function with. Multicore
#' capability not available in Windows OS. @param plot_type character. Type of plot to be output, either "lineplot" or
#' "boxplot".
#' @param plot_type character. Type of plot to be output, either "lineplot" or "boxplot".
#' @param normalize logical. If set to TRUE, values are normalized so that the
#' area bellow the curve approximates to 1 for each variable.
#'
#' @return ggplot
#' @export
tx_plot_metageneRegions <- function(txDT, geneAnnot, colVars, nBins_5UTR,
                                    nBins_CDS = NULL, nBins_3UTR = NULL, summ_fun = "mean",
                                    smooth = TRUE, spar = 0.5, nCores = 1, plot_type = "lineplot",
                                    normalize = FALSE){
    if(!all(colVars %in% names(txDT))){
        stop("colVars: ", paste(colVars[!colVars %in% names(txDT)],
                                collapse = ", "), " are not present in txDT")
    }
    if(is.null(nBins_CDS)){nBins_CDS <- nBins_5UTR * 3}
    if(is.null(nBins_3UTR)){nBins_3UTR <- nBins_5UTR * 3}
    metaGeneMatrix <- tx_get_metageneRegions(txDT = txDT, geneAnnot = geneAnnot,
                                             colVars = colVars, nBins_5UTR = nBins_5UTR,
                                             nBins_CDS = nBins_CDS,
                                             nBins_3UTR = nBins_3UTR, nCores = nCores)
    hlp_plot_metageneRegions(metaGeneMatrix = metaGeneMatrix, colVars = colVars,
                             nBins_5UTR = nBins_5UTR, nBins_CDS = nBins_CDS,
                             nBins_3UTR = nBins_3UTR, summ_fun = summ_fun,
                             smooth = smooth, spar = spar, plot_type = plot_type,
                             normalize = normalize)
}

#' Plot metagene exons
#'
#' @param txDT data.table. A table as output by the \code{\link{tx_makeDT_coverage}}(),
#' \code{\link{tx_makeDT_nucFreq}}() or \code{\link{tx_makeDT_covNucFreq}}() functions.
#' @param colVars character. Names of columns for which values will be extracted
#' @param nBins integer. Number of bins into which exon data will be allocated.
#' @param geneAnnot GRanges. Gene annotation as loaded by \code{\link{tx_load_bed}}().
#' @param nCores integer. Number of cores to run the function with.
#' @param summ_fun character. Summarizing function either "sum" or "mean". Default: "mean".
#' @param smooth logical. Set to FALSE for not smoothing line.
#' @param spar numeric. Smoothing parameter, typically (but not necessarily) in (0,1].
#' @param plot_type character. Type of plot to be output, either "lineplot" or "boxplot".
#' @param xLabelJump Number of bins to be skipped for label plotting
#' @param normalize logical. If set to TRUE, values are normalized so that the
#' area bellow the curve approximates to 1 for each variable.
#'
#' @return ggplot
#' @export
tx_plot_metageneExons <- function(txDT, colVars, nBins, geneAnnot = NULL,
                                  nCores = 1, summ_fun = "mean",
                                  smooth = TRUE, spar = 0.5, plot_type = "lineplot",
                                  xLabelJump = 10, normalize = FALSE){
    tmpMG <- tx_get_metageneExons(txDT = txDT, colVars = colVars, nBins = nBins, geneAnnot = geneAnnot, nCores = nCores)
    metageneBinNames <- paste("exonBin", seq(nBins), sep = " ")
    metageneBinLevls <- paste("exonBin", seq(nBins), sep = "_")
    if(plot_type == "lineplot"){
        tmpO <- lapply(colVars, function(k){
            tmpY <- apply(tmpMG[[k]], 2, FUN = summ_fun, na.rm = TRUE)
            if(smooth){
                tmpY[!is.na(tmpY)] <- stats::smooth.spline(tmpY[!is.na(tmpY)], spar = spar)$y
            }
            if(normalize){
                tmpY <- tmpY / sum(tmpY, na.rm = TRUE) * nBins
            }
            data.frame(value = tmpY,
                       bin = factor(metageneBinNames, levels = metageneBinNames),
                       var = k)
        }) %>% do.call(what = "rbind") %>% magrittr::set_rownames(NULL)
        ggplot2::ggplot(tmpO, ggplot2::aes(x = .data$bin, y = .data$value, group = .data$var, colour = .data$var)) +
            ggplot2::geom_line(ggplot2::aes(x = .data$bin), size = 1) +
            ggplot2::scale_x_discrete(limits = metageneBinNames,
                                      breaks = metageneBinNames[c(1, seq(0, length(metageneBinNames), xLabelJump))]) +
            ggplot2::theme_minimal() +
            ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) +
            ggplot2::xlab("Summarizing Bins") +
            ggplot2::ylab("Value") +
            ggplot2::geom_vline(xintercept = "CDS start", col = "black", linetype = "dashed") +
            ggplot2::geom_vline(xintercept = "CDS end", col = "black", linetype = "dashed") +
            ggplot2::ggtitle("Metagene exons") +
            ggplot2::theme(legend.position="bottom")
    }else if(plot_type == "boxplot"){
        tmpO <- lapply(colVars, function(k){
            tidyr::pivot_longer(data = data.frame(tmpMG[[k]]), names_to = "bin",
                                values_to = "value", cols = dplyr::everything()) %>% cbind(var = k)
        }) %>% do.call(what = "rbind") %>% data.frame()
        tmpO$bin <- factor(tmpO$bin, levels = metageneBinLevls)
        ggplot2::ggplot(tmpO, ggplot2::aes(x = .data$bin, y = .data$value, colour = .data$var)) +
            ggplot2::geom_boxplot(outlier.colour = NA) +
            ggplot2::facet_grid(tmpO$var ~ .) +
            ggplot2::scale_x_discrete(limits = levels(tmpO$bin),
                                      breaks = c("CDS start", "CDS end")) +
            ggplot2::theme_bw() +
            ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) +
            ggplot2::xlab("Summarizing Bins") +
            ggplot2::ylab("Value") +
            ggplot2::geom_vline(xintercept = "CDS start", col = "black", linetype = "dashed") +
            ggplot2::geom_vline(xintercept = "CDS end", col = "black", linetype = "dashed") +
            ggplot2::ggtitle("Metagene codifying genes") +
            ggplot2::theme(legend.position="bottom")
    }else{
        stop("'plot_type' argument must be either 'lineplot' or 'boxplot'")
    }
}

# Helpers ###############

#' Plot an extracted metaGeneMatrix
#'
#' It plots an extracted metagene matrix from tx_get_metageneRegions(). Useful
#' when modifying the metagene matrix, like filtering genes out.
#'
#' Note It must have the same nBins arguments as the ones used to generate the
#' metagene matrix.
#'
#' @param metaGeneMatrix matrix. As obtained by \code{\link{tx_get_metageneRegions}}()
#' @param colVars character. Names of columns for which values will be extracted
#' @param nBins_5UTR integer. Number of bins into which allocate data on 5'UTR regions
#' @param nBins_CDS integer. Number of bins into which allocate data on CDS regions
#' @param nBins_3UTR  integer. Number of bins into which allocate data on 3'UTR regions
#' @param summ_fun character. Summarizing function either "sum" or "mean". Default: "mean"
#' @param smooth logical. Set to FALSE for not smoothing line.
#' @param spar numeric. Smoothing parameter, typically (but not necessarily) in (0,1].
#' @param plot_type character. Type of plot to be output, either "lineplot" or "boxplot".
#' @param normalize logical. If set to TRUE, values are normalized so that the
#' area bellow the curve approximates to 1 for each variable.
#'
#' @return ggplot
#' @export
#'
hlp_plot_metageneRegions <- function(metaGeneMatrix, colVars, nBins_5UTR,
                                     nBins_CDS, nBins_3UTR, summ_fun = "mean",
                                     smooth = TRUE, spar = 0.5, plot_type = "lineplot",
                                     normalize = FALSE){
    metageneBinNames <- c(paste("UTR5", seq(nBins_5UTR), sep = "_"),
                          paste("CDS", seq(nBins_CDS), sep = "_"),
                          paste("UTR3", seq(nBins_3UTR), sep = "_"))
    metageneBinLevls <- c(paste("UTR5", seq(nBins_5UTR), sep = "_"), "CDS start",
                          paste("CDS", seq(nBins_CDS), sep = "_"), "CDS end",
                          paste("UTR3", seq(nBins_3UTR), sep = "_"))
    if(plot_type == "lineplot"){
        tmpO <- lapply(colVars, function(k){
            tmpY <- apply(metaGeneMatrix[[k]], 2, FUN = summ_fun, na.rm = TRUE)
            if(smooth){
                tmpY[!is.na(tmpY)] <- stats::smooth.spline(tmpY[!is.na(tmpY)], spar = spar)$y
            }
            if(normalize){
                tmpY <- tmpY / sum(tmpY, na.rm = TRUE) * sum(nBins_5UTR, nBins_CDS, nBins_3UTR)
            }
            data.frame(value = tmpY,
                       bin = factor(metageneBinNames, levels = metageneBinLevls),
                       var = k)
        }) %>% do.call(what = "rbind") %>% magrittr::set_rownames(NULL)

        tmpP <- lapply(colVars, function(k){
            data.frame(value = NA, bin = factor(c("CDS start", "CDS end"),
                                                levels = metageneBinLevls), var = k)
        }) %>% do.call(what = "rbind") %>% rbind(tmpO)

        ggplot2::ggplot(tmpP, ggplot2::aes(x = .data$bin, y = .data$value, group = .data$var, colour = .data$var)) +
            ggplot2::geom_line(ggplot2::aes(x = .data$bin), size = 1) +
            ggplot2::scale_x_discrete(limits = levels(tmpP$bin),
                                      breaks = c("CDS start", "CDS end")) +
            ggplot2::theme_minimal() +
            ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) +
            ggplot2::xlab("Summarizing Bins") +
            ggplot2::ylab("Value") +
            ggplot2::geom_vline(xintercept = "CDS start", col = "black", linetype = "dashed") +
            ggplot2::geom_vline(xintercept = "CDS end", col = "black", linetype = "dashed") +
            ggplot2::ggtitle("Metagene codifying genes") +
            ggplot2::theme(legend.position = "bottom")
    }else if(plot_type == "boxplot"){
        tmpO <- lapply(colVars, function(k){
            tidyr::pivot_longer(data = data.table::data.table(metaGeneMatrix[[k]]), names_to = "bin",
                                values_to = "value", cols = dplyr::everything()) %>% cbind(var = k)
        }) %>% do.call(what = "rbind") %>% data.frame()
        tmpO$bin <- factor(tmpO$bin, levels = metageneBinLevls)
        ggplot2::ggplot(tmpO, ggplot2::aes(x = .data$bin, y = .data$value, colour = .data$var)) +
            ggplot2::geom_boxplot(outlier.colour = NA) +
            ggplot2::facet_grid(tmpO$var ~ .) +
            ggplot2::scale_x_discrete(limits = levels(tmpO$bin),
                                      breaks = c("CDS start", "CDS end")) +
            ggplot2::theme_bw() +
            ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) +
            ggplot2::xlab("Summarizing Bins") +
            ggplot2::ylab("Value") +
            ggplot2::geom_vline(xintercept = "CDS start", col = "black", linetype = "dashed") +
            ggplot2::geom_vline(xintercept = "CDS end", col = "black", linetype = "dashed") +
            ggplot2::ggtitle("Metagene codifying genes") +
            ggplot2::theme(legend.position = "bottom")
    }else{
        stop("'plot_type' argument must be either 'lineplot' or 'boxplot'")
    }
}
AngelCampos/tx_tools documentation built on April 8, 2024, 9:46 p.m.