prolfqua: Proteomics Label Free Quantification

# ContrastsPlotter ----
#' plot contrasts
#'
#' @export
#' @family modelling
#' @family plotting
#' @examples
#'
#' istar <- sim_lfq_data_peptide_config(Nprot = 100)
#' modelName <- "Model"
#' modelFunction <-
#'   strategy_lmer("abundance  ~ group_ + (1 | peptide_Id) + (1|sample)",
#'    model_name = modelName)
#' pepIntensity <- istar$data
#' config <- istar$config
#'
#' mod <- build_model(
#'   pepIntensity,
#'   modelFunction,
#'   modelName = modelName,
#'   subject_Id = config$table$hierarchy_keys_depth())
#'
#'  Contr <- c("group_A_vs_Ctrl" = "group_A - group_Ctrl",
#'  "group_B_vs_Ctrl" = "group_B - group_Ctrl"
#'  )
#' contrast <- prolfqua::Contrasts$new(mod,
#'   Contr)
#' contr <- contrast$get_contrasts()
#' cp <- ContrastsPlotter$new(contr,
#'   contrast$subject_Id,
#'   volcano = list(list(score = "FDR", thresh = 0.1)),
#'   histogram = list(list(score = "p.value", xlim = c(0,1,0.05)),
#'                  list(score = "FDR", xlim = c(0,1,0.05))),
#'   score =list(list(score = "statistic",  thresh = 5))
#'   )
#'
#' stopifnot("plotly" %in% class(cp$volcano_plotly()$FDR))
#' stopifnot("ggplot" %in% class(cp$score_plot(legend=FALSE)$statistic))
#' p <- cp$histogram()
#' stopifnot("ggplot" %in% class(p$FDR))
#' stopifnot("ggplot" %in% class(p$p.value))
#' p <- cp$histogram_estimate()
#' stopifnot("ggplot" %in% class(p))
#' res <- cp$volcano()
#' stopifnot("ggplot" %in% class(res$FDR))
#' respltly <- cp$volcano_plotly()
#' stopifnot("plotly" %in% class(respltly$FDR))
#' stopifnot("ggplot" %in%  class(cp$ma_plot()))
#' stopifnot("plotly" %in%   class(cp$ma_plotly(rank=TRUE)))
#' res  <- cp$barplot_threshold()
#' stopifnot("ggplot" %in% class(res$FDR$plot))
#' stopifnot("ggplot" %in%  class(cp$histogram_diff()))
#'
ContrastsPlotter <- R6::R6Class(
  "ContrastsPlotter",
  public = list(
    #' @field contrastDF data frame with contrasts
    contrastDF = NULL,
    #' @field modelName of column with model name
    modelName =  character(),
    #' @field subject_Id hierarchy key columns
    subject_Id = character(),
    #' @field prefix default Contrasts - used to generate file names
    prefix = "Contrasts",
    #' @field diff column with fold change differences
    diff = "diff",
    #' @field contrast column with contrasts names, default "contrast"
    contrast = "contrast",
    #' @field volcano_spec volcano plot specification
    volcano_spec = NULL,
    #' @field score_spec score plot specification
    score_spec = NULL,
    #' @field histogram_spec plot specification
    histogram_spec = NULL,
    #' @field fcthresh fold change threshold
    fcthresh = 1,
    #' @field avg.abundance name of column containing avg abundance values.
    avg.abundance = character(),
    #' @field protein_annot protein annotation
    protein_annot = NULL,
    #' @description
    #' create Crontrast_Plotter
    #' @param contrastDF frame with contrast data
    #' @param subject_Id columns containing subject Identifier
    #' @param volcano which score to plot and which ablines to add.
    #' @param histogram which scores to plot and which range (x) should be shown.
    #' @param score score parameters
    #' @param fcthresh default 1 (log2 FC threshold)
    #' @param modelName name of column with model names
    #' @param diff fold change (difference) diff column
    #' @param contrast contrast column
    #' @param avg.abundance name of column with average abundance
    #' @param protein_annot add protein annotation (optional)
    initialize = function(contrastDF,
                          subject_Id,
                          volcano = list(list(score = "FDR", thresh = 0.1)),
                          histogram = list(list(score = "p.value", xlim = c(0,1,0.05)),
                                           list(score = "FDR", xlim = c(0,1,0.05))),
                          score = list(list(score = "statistic",  thresh = NULL)),
                          fcthresh = 1,
                          modelName = "modelName",
                          diff = "diff",
                          contrast = "contrast",
                          avg.abundance = "avgAbd",
                          protein_annot = NULL
    ){
      self$contrastDF <- tidyr::unite(
        contrastDF,
        "subject_Id", subject_Id, sep = "~", remove = FALSE)

      self$modelName  = modelName
      self$subject_Id = subject_Id
      self$diff = diff
      self$volcano_spec = volcano
      self$score_spec = score
      self$histogram_spec = histogram
      self$fcthresh = fcthresh
      self$contrast = contrast
      self$avg.abundance = avg.abundance
      self$protein_annot = protein_annot
    },
    #' @description
    #' plot histogram of selected scores (e.g. p-value, FDR, t-statistics)
    histogram = function(){
      res <- list()
      if (length(self$histogram_spec) > 0) {
        for (spec in self$histogram_spec) {
          fig <- private$.histogram(score = spec )
          res[[spec$score]] <- fig
        }
        return(res)
      }
    },
    #' @description
    #' plot histogram of effect size - difference between groups
    #' @param binwidth with of bin in histogram
    histogram_estimate = function(binwidth = 0.05){
      re <- range(self$contrastDF[[self$diff]], na.rm = TRUE)
      re[1] <- floor(re[1])
      re[2] <- ceiling(re[2])

      fig <- ggplot(self$contrastDF, aes(x = !!sym(self$diff))) +
        geom_histogram(breaks = seq(from = re[1], to = re[2], by = binwidth)) +
        geom_vline(aes(xintercept = median(!!sym( self$diff ), na.rm = T)),   # Ignore NA values for mean
                   color = "red", linetype = "dashed", size = 1) +
        geom_vline(xintercept = 0, col = "green" , size = 1) +
        facet_wrap(vars(!!sym(self$contrast)))

      return(fig)
    },
    #' @description
    #' plot histogram of differences (diff) fold change
    #' @param binwidth with of bin in histogram
    histogram_diff = function(binwidth = 0.05){
      self$histogram_estimate(binwidth = binwidth)
    },
    #' @description
    #' volcano plots (fold change vs FDR)
    #' @param colour column name with color information default modelName
    #' @param legend default TRUE
    #' @param scales default fixed \code{\link{facet_wrap}}, scales argument
    #' @param min_score replace p.values or FDR's smaller then min_score with min_score (default 0.0001).
    volcano = function(colour,
                       legend = TRUE,
                       scales = c("fixed","free","free_x","free_y"),
                       min_score = 0.0001){
      if (missing(colour)) {
        colour <- self$modelName
      }
      scales <- match.arg(scales)
      fig <- private$.volcano(self$contrastDF,
                              self$volcano_spec,
                              colour = colour,
                              legend = legend,
                              scales = scales,
                              min_score = min_score)
      return(fig)
    },
    #' @description
    #' plotly volcano plots
    #' @param colour column in contrast matrix with colour coding
    #' @return list of ggplots
    #' @param legend default TRUE
    #' @param scales default fixed \code{\link{facet_wrap}}, scales argument
    #' @param min_score replace p.values or FDR's smaller then min_score with min_score (default 0.0001).
    volcano_plotly = function(colour,
                              legend = TRUE,
                              scales = c("fixed","free","free_x","free_y"),
                              min_score = 0.0001){
      if (missing(colour)) {
        colour <- self$modelName
      }
      scales <- match.arg(scales)
      res <- private$.volcano(self$contrastDF,
                              self$volcano_spec,
                              colour = colour,
                              legend = legend,
                              scales = scales,
                              plotly = TRUE,
                              min_score = min_score)
      return(res)
    },
    #' @description
    #' ma plot
    #'
    #' MA plot displays the effect size estimate as a function
    #' of the mean protein intensity across groups.
    #' Each dot represents an observed protein.
    #' Red horizontal lines represent the fold-change threshold.
    #'
    #' Sometimes measured effects sizes (differences between samples groups)
    #' are biased by the signal intensity (here protein abundance).
    #' Such systematic effects can be explored using MA-plots.
    #'
    #' @param fc fold change abline
    #' @param colour column in contrast matrix with colour coding
    #' @param legend enable legend default TRUE
    #' @param rank default FALSE, if TRUE then rank of avgAbd is used.
    #' @return ggplot
    ma_plot = function(fc, colour, legend = TRUE, rank = TRUE){
      if ( missing(fc))
        fc <- self$fcthresh
      if (missing(colour)) {
        colour <- self$modelName
      }
      contrastDF <- self$contrastDF
      if (!is.null(contrastDF[[self$avg.abundance]])) {
        # pdf version
        if (rank) {
          rankcol <- paste0("rank_", self$avg.abundance)

          contrastDF <- contrastDF |>
            dplyr::group_by(!!sym(self$contrast)) |>
            mutate(!!rankcol := rank(!!sym(self$avg.abundance)))

          #contrastDF[[ rankcol ]] <- rank(contrastDF[[self$avg.abundance]])
          fig <- private$.ma_plot(
            contrastDF,
            rankcol,
            self$diff,
            self$contrast,
            fc,
            colour = colour,
            legend = legend )
        }else{
          fig <- private$.ma_plot(
            contrastDF,
            self$avg.abundance,
            self$diff,
            self$contrast,
            fc,
            colour = colour,
            legend = legend)
        }
      }else{
        warning("no group_1 group_2 columns can't generate MA")
        fig <- NULL
      }
      return(fig)
    },
    #' @description
    #' ma plotly
    #' @param fc horizontal lines
    #' @param colour column in contrast matrix with colour coding.
    #' @param legend enable legend default TRUE
    #' @param rank default FALSE, if TRUE then rank of avgAbd is used.
    #' @return list of ggplots
    ma_plotly = function(fc, colour, legend = TRUE, rank = FALSE){
      # html version
      if (missing(fc))
        fc <- self$fcthresh
      if (missing(colour))
        colour <- self$modelName
      contrastDF  <- self$contrastDF
      if (!is.null(contrastDF[[self$avg.abundance]])) {
        if (rank) {
          rankcol <- paste0("rank_", self$avg.abundance)
          contrastDF <- contrastDF |>
            dplyr::group_by(!!sym(self$contrast)) |>
            mutate(!!rankcol := rank(!!sym(self$avg.abundance)))

          fig <- private$.ma_plot(
            contrastDF,
            rankcol,
            self$diff,
            self$contrast,
            fc,
            colour = colour,
            legend = legend
          )
        } else {
          contrastDF  <- contrastDF |>
            plotly::highlight_key(~subject_Id)
          fig <- private$.ma_plot(
            contrastDF,
            self$avg.abundance,
            self$diff,
            self$contrast,
            fc,
            colour = colour,
            legend = legend
          )
        }

        fig_plotly <- fig |>
          plotly::ggplotly(tooltip = "subject_Id")

        return(fig_plotly)
      }else{
        return(NULL)
      }
    },
    #' @description
    #' plot a score against the log2 fc e.g. t-statistic
    #' @param scorespec list(score="statistics", fcthres = 2, thresh = 5)
    #' @param colour column with colour coding
    #' @param legend enable legend default TRUE
    #' @return list of ggplots
    score_plot = function(scorespec,  colour, legend = TRUE ){
      if (!missing(scorespec)) {
        self$score_spec[[scorespec$score]] <- scorespec
      }
      if (missing(colour))
        colour <- self$modelName
      res <- list()
      if (length(self$score_spec) > 0) {
        res <- private$.score_plot(
          self$contrastDF,
          self$score_spec,
          colour = colour,
          legend = legend )
      }
      return(res)
    },
    #' @description
    #' plot a score against the log2 fc e.g. t-statistic
    #' @param scorespec list(score="statistics", fcthres = 2, thresh = 5)
    #' @param colour column with colour coding
    #' @param legend enable legend default TRUE
    #' @return list of ggplots
    score_plotly = function(scorespec,
                            colour,
                            legend = TRUE ){
      if (!missing(scorespec)) {
        self$score_spec[[scorespec$score]] <- scorespec
      }
      if (missing(colour))
        colour <- self$modelName
      contrastDF <- self$contrastDF |> plotly::highlight_key( ~subject_Id )
      res <- private$.score_plot(
        contrastDF,
        self$score_spec,
        colour = colour,
        legend = legend )

      for (i in seq_along(res)) {
        res[[i]] <- plotly::ggplotly(res[[i]], tooltip = "subject_Id")
      }
      return(res)
    },
    #' @description
    #' shows the number of significant proteins per contrasts
    #' @return list which contains ggplots and summary tables
    barplot_threshold = function(){
      resBar <- list()
      for (i in seq_along(self$volcano_spec) ) {
        scN <- self$volcano_spec[[i]]$score
        scT <- self$volcano_spec[[i]]$thresh
        filt <- dplyr::filter(
          self$contrastDF ,
          !is.na(!!sym(scN)) & !!sym(scN)  < scT)
        if (is.numeric(self$fcthresh)) {
          filt <-  dplyr::filter(filt, abs(!!sym(self$diff)) > self$fcthresh)
        }
        sumC <- group_by(filt, !!sym(self$contrast), !!sym(self$modelName)) |>
          dplyr::summarize(n = n())
        p <- ggplot(sumC,
                    aes(x = !!sym(self$contrast), y = n, fill = !!sym(self$modelName))) +
          geom_bar(position = "stack", stat = "identity")
        resBar[[scN]] <- list(plot = p, summary = sumC)
      }
      return(resBar)
    }
  ),
  private = list(
    .volcano = function(contrasts,
                        scores,
                        colour = NULL,
                        legend = TRUE,
                        scales = "free_y",
                        plotly = FALSE,
                        min_score = 0.0001 ){
      fig <- list()
      for (score in scores) {
        column <- score$score
        contrasts2 <- contrasts |>
          dplyr::filter(!is.na(!!sym(self$diff))) |>
          dplyr::filter(!is.na(!!sym(column))) |>
          dplyr::mutate(!! column := case_when(!!sym(column) < min_score ~ min_score, TRUE ~ !!sym(column)))
        if (plotly) {
          contrasts2 <- contrasts2 |> plotly::highlight_key(~subject_Id)
        }
        p <- prolfqua:::.multigroup_volcano(
          contrasts2,
          effect = self$diff,
          significance = column,
          contrast = self$contrast,
          text = "subject_Id",
          xintercept = if (is.numeric(self$fcthresh)) { c(-self$fcthresh, self$fcthresh) } else {NULL},
          yintercept = score$thresh,
          colour = colour,
          scales = scales)
        if (!legend) {
          p <- p + guides(colour = "none")
        }
        if (plotly) {
          p <-  plotly::ggplotly(p, tooltip = "subject_Id")
        }
        fig[[column]] <- p
      }
      return(fig)
    },
    .histogram  = function(score){
      xlim = score$xlim
      score = score$score
      plot <- self$contrastDF |> ggplot(aes(x = !!sym(score))) +
        geom_histogram(breaks = seq(from = xlim[1], to = xlim[2], by = xlim[3])) +
        facet_wrap(vars(!!sym(self$contrast)))
      return(plot)
    },
    .ma_plot = function(x, avg.abundance, diff, contrast, fc, colour = NULL, legend = TRUE){
      p <- ggplot(x , aes(x = !!sym(avg.abundance),
                          y = !!sym(diff),
                          text = !!sym("subject_Id"),
                          colour = !!sym(colour))) +
        geom_point(alpha = 0.5) +
        scale_colour_manual(values = c("black", "green")) +
        facet_wrap(vars(!!sym(contrast)))
      if (FALSE) { ylab("log fold change (M)") + xlab("mean log intensities (A)") } else { NULL }
      if ( is.numeric(fc) ) {
        p <- p + geom_hline(yintercept = c(-fc, fc), linetype = "dashed", colour = "red")
      }

      if (!legend) {
        p <- p + guides(colour = "none")
      }
      return(p)
    },
    .score_plot = function(x, scores, colour, legend = TRUE){
      fig <- list()
      for (score in scores) {
        xlim = self$fcthresh
        ylim = score$thresh
        score = score$score
        scoreVal <- if ("data.frame" %in% class(x)) {
          x[[score]]
        } else {
          x$data()[[score]]
        }

        ylims <- c( sign(min(scoreVal, na.rm = TRUE)) * ylim, sign( max(scoreVal, na.rm = TRUE)) * ylim)
        p <- ggplot(x, aes(x = !!sym(self$diff),
                           y = !!sym(score),
                           text = !!sym("subject_Id"),
                           colour = !!sym(colour))) +
          scale_colour_manual(values = c("black", "green")) +
          geom_point(alpha = 0.5) +
          facet_wrap(vars(!!sym(self$contrast))) +
          geom_hline(yintercept = c(0), colour = 1) +
          geom_vline(xintercept = c(0), colour = 1 ) +
          geom_hline(yintercept = ylims, colour = 2, linetype = "dashed")

        if (is.numeric(xlim)) {
          p <- p + geom_vline(xintercept = c(-xlim, xlim), colour = 2, linetype = "dashed" )
        }

        if (!legend) {
          p <- p + guides(colour = "none")
        }
        fig[[score]] <- p
      }
      return(fig)
    }
  )
)