R/viz.R

Defines functions tweedie_index_plot association_plots save_heatmap Tweedieverse_heatmap theme_omicsEye

###############################################################################
# Tweedieverse visualizations

# Copyright (c) 2020 Himel Mallick and Ali Rahnavard

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
###############################################################################

# Author: Ali Rahnavard
# Email: gholamali.rahnavard@gmail.com
# This script includes functions for visualizing overall output of Tweedieverse and
# individual associations as scatterplot and boxplot
#  Visualization results are provided as pdf and RDS format to be used with manuscript quality.

# Load libraries
for (lib in c('ggplot2', "grid", 'pheatmap', 'cowplot')) {
  suppressPackageStartupMessages(require(lib, character.only = TRUE))
}


#' @export
theme_omicsEye <- function() list(
  cowplot::theme_cowplot(),
  ggplot2::theme(
    text               = ggplot2::element_text(size = 6),
    axis.text          = ggplot2::element_text(size = 5),
    axis.title.x       = ggplot2::element_text(margin=ggplot2::margin(1, 0, 0.5, 0)),
    axis.title.x.top   = ggplot2::element_text(margin=ggplot2::margin(0, 0, 2, 0)),
    axis.title.y       = ggplot2::element_text(margin=ggplot2::margin(0, 1, 0, 0.5)),
    axis.title.y.right = ggplot2::element_text(margin=ggplot2::margin(0, 0, 0, 2)),
    axis.text.x        = ggplot2::element_text(margin=ggplot2::margin(1, 0, 0, 0)),
    axis.text.x.top    = ggplot2::element_text(margin=ggplot2::margin(0, 0, 1, 0)),
    axis.text.y        = ggplot2::element_text(margin=ggplot2::margin(0, 1, 0, 0)),
    axis.text.y.right  = ggplot2::element_text(margin=ggplot2::margin(0, 0, 0, 1)),
    axis.ticks         = ggplot2::element_line(size=0.3),
    axis.ticks.length  = ggplot2::unit(2, "pt"),
    axis.line          = ggplot2::element_line(size=0.3),
    axis.line.x        = ggplot2::element_line(size=0.3),
    axis.line.y        = ggplot2::element_line(size=0.3),
    line               = ggplot2::element_line(size=0.3),
    legend.margin      = ggplot2::margin(4, 4, 4, 4),
    legend.key.size    = ggplot2::unit(8, "pt"),
    legend.box.spacing = ggplot2::unit(4, "pt"),
    panel.spacing      = ggplot2::unit(1.5, "pt"),
    plot.title         = ggplot2::element_text(size=8),
    plot.margin        = ggplot2::margin(1, 1, 1, 1),
    strip.background   = ggplot2::element_blank(),
    strip.text         = ggplot2::element_text(size=6),
    strip.text.x       = ggplot2::element_text(margin= ggplot2::margin(3, 0, 3, 0)),
    strip.text.y       = ggplot2::element_text(margin= ggplot2::margin(0, 3, 0, 3))
  )
)



# Tweedieverse heatmap function for overall view of associations
#' @export
Tweedieverse_heatmap <-
  function(output_results,
           title = NA,
           cell_value = 'qval',
           data_label = 'data',
           metadata_label = 'metadata',
           border_color = 'grey93',
           color = colorRampPalette(c("darkblue", "grey90", "darkred")),
           col_rotate = 90,
           first_n = 50,
           write_to = NA) {
    # read Tweedieverse output
    if (is.character(output_results)) {
      df <- read.table(
        output_results,
        header = TRUE,
        sep = "\t",
        fill = TRUE,
        comment.char = "" ,
        check.names = FALSE
      )
    } else {
      data <- output_results
    }

    title_additional <- ""

    title_additional <- ""
    if (!is.na(first_n) & first_n > 0 & first_n < dim(df)[1]) {
      if (cell_value == 'coef') {
        df <- df[order(-abs(df[cell_value])) ,]
      } else{
        df <- df[order(df[cell_value]),]
      }
      # get the top n features with significant associations
      df_sub <- df[1:first_n, ]
      for (first_n_index in seq(first_n, dim(df)[1]))
      {
        if (length(unique(df_sub$feature)) == first_n)
        {
          break
        }
        df_sub <- df[1:first_n_index, ]
      }
      # get all rows that have the top N features
      df <- df[which(df$feature %in% df_sub$feature), ]
      title_additional <- paste("Top", first_n, sep = " ")
    }

    if (dim(df)[1] < 2) {
      print('There are no associations to plot!')
      return(NULL)
    }

    metadata <- df$metadata
    data <- df$feature
    dfvalue <- df$value
    value <- NA

    # values to use for coloring the heatmap
    # and set the colorbar boundaries
    if (cell_value == "pval") {
      value <- -log(df$pval) * sign(df$coef)
      value <- pmax(-20, pmin(20, value))
      if (is.null(title))
        title <- "(-log(pval)*sign(coeff))"
    } else if (cell_value == "qval") {
      value <- -log(df$qval) * sign(df$coef)
      value <- pmax(-20, pmin(20, value))
      if (is.null(title))
        title <- "(-log(qval)*sign(coeff))"
    } else if (cell_value == "coef") {
      value <- df$coef
      if (is.null(title))
        title <- "(coeff)"
    }

    if (title_additional != "") {
      title <-
        paste(title_additional,
              "features with significant associations",
              title,
              sep = " ")
    } else {
      title <- paste("Significant associations", title, sep = " ")
    }

    # identify variables with more than one level present
    verbose_metadata <- c()
    metadata_multi_level <- c()
    for (i in unique(metadata)) {
      levels <- unique(df$value[df$metadata == i])
      if (length(levels) > 1) {
        metadata_multi_level <- c(metadata_multi_level, i)
        for (j in levels) {
          verbose_metadata <- c(verbose_metadata, paste(i, j))
        }
      } else {
        verbose_metadata <- c(verbose_metadata, i)
      }
    }

    n <- length(unique(data))
    m <- length(unique(verbose_metadata))

    if (n < 2) {
      print(
        paste(
          "There is not enough features in the associations",
          "to create a heatmap plot.",
          "Please review the associations in text output file."
        )
      )
      return(NULL)
    }

    if (m < 2) {
      print(
        paste(
          "There is not enough metadata in the associations",
          "to create a heatmap plot.",
          "Please review the associations in text output file."
        )
      )
      return(NULL)
    }

    a = matrix(0, nrow = n, ncol = m)
    a <- as.data.frame(a)

    rownames(a) <- unique(data)
    colnames(a) <- unique(verbose_metadata)

    for (i in seq_len(dim(df)[1])) {
      current_metadata <- metadata[i]
      if (current_metadata %in% metadata_multi_level) {
        current_metadata <- paste(metadata[i], dfvalue[i])
      }
      if (abs(a[as.character(data[i]),
                as.character(current_metadata)]) > abs(value[i]))
        next
      a[as.character(data[i]), as.character(current_metadata)] <-
        value[i]
    }

    # get the range for the colorbar
    max_value <- ceiling(max(a))
    min_value <- ceiling(min(a))
    range_value <- max(c(abs(max_value), abs(min_value)))
    breaks <- seq(-1 * range_value, range_value, by = 1)

    p <- NULL
    tryCatch({
      p <-
        pheatmap::pheatmap(
          a,
          cellwidth = 5,
          cellheight = 5,
          # changed to 3
          main = title,
          fontsize = 6,
          kmeans_k = NA,
          border = TRUE,
          show_rownames = TRUE,
          show_colnames = TRUE,
          scale = "none",
          cluster_rows = FALSE,
          cluster_cols = TRUE,
          clustering_distance_rows = "euclidean",
          clustering_distance_cols = "euclidean",
          legend = TRUE,
          border_color = border_color,
          color = color(range_value * 2),
          breaks = breaks,
          treeheight_row = 0,
          treeheight_col = 0,
          display_numbers = matrix(ifelse(a > 0.0, "+", ifelse(a < 0.0, "-", "")), nrow(a))
        )
    }, error = function(err) {
      logging::logerror("Unable to plot heatmap")
      logging::logerror(err)
    })
    if (!is.na(write_to))
      saveRDS(p, file = paste(write_to, "/gg_heatmap.RDS", sep = ""))
    return(p)
  }

save_heatmap <-
  function(results_file,
           heatmap_file,
           figures_folder,
           title = NULL,
           cell_value = "qval",
           data_label = 'data',
           metadata_label = 'metadata',
           border_color = "grey93",
           color = colorRampPalette(c("blue", "grey90", "red")),
           first_n = 50) {
    # generate a heatmap and save it to a pdf and as a jpg
    heatmap <-
      Tweedieverse_heatmap(
        results_file,
        title,
        cell_value,
        data_label,
        metadata_label,
        border_color,
        color,
        first_n,
        figures_folder
      )

    if (!is.null(heatmap)) {
      pdf(heatmap_file)
      print(heatmap)
      dev.off()

      jpg_file <- file.path(figures_folder, "heatmap.jpg")
      jpeg(jpg_file,
           res = 150,
           height = 800,
           width = 1100)
      print(heatmap)
      dev.off()
    }

  }

association_plots <-
  function(metadata,
           features,
           output_results,
           write_to = './',
           figures_folder = './figures/',
           max_jpgs = 3)
  {
    #Tweedieverse scatter plot function and theme

    # combine the data and metadata to one datframe using common rows
    # read Tweedieverse output
    if (is.character(features)) {
      features <-
        data.frame(
          read.delim::fread(features, header = TRUE, sep = '\t'),
          header = TRUE,
          fill = T,
          comment.char = "" ,
          check.names = F,
          row.names = 1
        )
      if (nrow(data) == 1) {
        # read again to get row name
        features <- read.delim(
          features,
          header = TRUE,
          fill = T,
          comment.char = "" ,
          check.names = F,
          row.names = 1
        )
      }
    } else {
      features <- features
    }
    if (is.character(metadata)) {
      metadata <-
        data.frame(
          read.delim::fread(input_metadata, header = TRUE, sep = '\t'),
          header = TRUE,
          fill = T,
          comment.char = "" ,
          check.names = F,
          row.names = 1
        )
      if (nrow(metadata) == 1) {
        metadata <- read.delim(
          input_metadata,
          header = TRUE,
          fill = T,
          comment.char = "" ,
          check.names = F,
          row.names = 1
        )
      }
    } else {
      metadata <- metadata
    }
    common_rows <-
      intersect(rownames(features), rownames(metadata))
    input_df_all <-
      cbind(features[common_rows, , drop = FALSE],
            metadata[common_rows, , drop = FALSE])

    # read Tweedieverse output
    if (is.character(output_results)) {
      output_df_all <- read.table(
        output_results,
        header = TRUE,
        row.names = NULL,
        sep = "\t",
        fill = FALSE,
        comment.char = "" ,
        check.names = FALSE
      )
    } else {
      output_df_all <- output_results
    }

    if (dim(output_df_all)[1] < 1) {
      print('There are no associations to plot!')
      return(NULL)
    }

    logging::loginfo(
      paste(
        "Plotting associations from most",
        "to least significant,",
        "grouped by metadata"
      )
    )
    metadata_types <- unlist(output_df_all[, 'metadata'])
    metadata_labels <-
      unlist(metadata_types[!duplicated(metadata_types)])
    metadata_number <- 1
    if (is.na(max_jpgs))
      max_jpgs <- dim(output_df_all)[1]
    saved_plots <- vector('list', max_jpgs)
    for (label in metadata_labels) {
      # for file name replace any non alphanumeric with underscore
      plot_file <-
        paste(write_to,
              "/",
              gsub("[^[:alnum:]_]", "_", label),
              ".pdf",
              sep = "")
      data_index <- which(label == metadata_types)
      saved_ggs <- vector('list', length(data_index))
      logging::loginfo("Plotting data for metadata number %s, %s",
                       metadata_number,
                       label)
      pdf(plot_file,
          width = 2.65,
          height = 2.5,
          onefile = TRUE)

      x <- NULL
      y <- NULL
      count <- 1
      for (i in data_index) {
        x_label <- as.character(output_df_all[i, 'metadata'])
        y_label <- as.character(output_df_all[i, 'feature'])
        results_value <-
          as.character(output_df_all[i, 'value'])
        qval <- as.numeric(output_df_all[i, 'qval'])
        coef_val <- as.numeric(output_df_all[i, 'coef'])
        input_df <- input_df_all[c(x_label, y_label)]
        colnames(input_df) <- c("x", "y")

        # if Metadata is continuous generate a scatter plot
        # Continuous is defined as numerical with more than
        # 2 values (to exclude binary data)
        temp_plot <- NULL
        if (is.numeric(input_df[1, 'x']) &
            length(unique(input_df[['x']])) > 2) {
          logging::loginfo("Creating scatter plot for continuous data, %s vs %s",
                           x_label,
                           y_label)
          temp_plot <- ggplot2::ggplot(data = input_df,
                                       ggplot2::aes(
                                         as.numeric(as.character(x)),
                                         as.numeric(as.character(y))
                                       )) +
            ggplot2::geom_point(
              fill = 'darkolivegreen4',
              color = 'black',
              alpha = .5,
              shape = 21,
              size = 1,
              stroke = 0.15

            ) +
            ggplot2::scale_x_continuous(limits = c(min(input_df['x']), max(input_df['x']))) +
            ggplot2::scale_y_continuous(limits = c(min(input_df['y']), max(input_df['y']))) +
            ggplot2::stat_smooth(
              method = "glm",
              size = 0.25,
              color = 'blue',
              na.rm = TRUE
            ) +
            ggplot2::guides(alpha = 'none') +
            ggplot2::labs("") +
            ggplot2::xlab(x_label) +
            ggplot2::ylab(y_label) +
            theme_omicsEye() +
            ggplot2::annotate(
              geom = "text",
              x = Inf,
              y = Inf,
              hjust = 1,
              vjust = 1,
              label = sprintf(
                "FDR: %s\nCoefficient: %s\nN: %s",
                formatC(qval, format = "e", digits = 3),
                formatC(coef_val,
                        format = "e",
                        digits = 2),
                formatC(length(input_df[, 'x']))
              ) ,
              color = "black",
              size = 2,
              fontface = "italic"
            ) + ggplot2::scale_y_log10()
        } else{
          # if Metadata is categorical generate a boxplot
          ### check if the variable is categorical

          logging::loginfo("Creating boxplot for categorical data, %s vs %s",
                           x_label,
                           y_label)
          input_df['x'] <-
            lapply(input_df['x'], as.character)

          # count the Ns for each group
          x_axis_label_names <- unique(input_df[['x']])
          renamed_levels <-
            as.character(levels(metadata[, x_label]))
          if (length(renamed_levels) == 0) {
            renamed_levels <- x_axis_label_names
          }
          for (name in x_axis_label_names) {
            total <- length(which(input_df[['x']] == name))
            new_n <-
              paste(name, " (n=", total, ")", sep = "")
            input_df[which(input_df[['x']] == name), 'x'] <-
              new_n
            renamed_levels <-
              replace(renamed_levels, renamed_levels == name, new_n)
          }
          input_df$xnames <-
            factor(input_df[['x']], levels = renamed_levels)
          temp_plot <-
            ggplot2::ggplot(data = input_df, ggplot2::aes(xnames, y)) +
            ggplot2::geom_boxplot(
              ggplot2::aes(fill = xnames),
              outlier.alpha = 0.0,
              na.rm = TRUE,
              alpha = .5,
              show.legend = FALSE
            ) +
            ggplot2::geom_point(
              ggplot2::aes(fill = xnames),
              alpha = 0.75 ,
              size = 1,
              shape = 21,
              stroke = 0.15,
              color = 'black',
              show.legend = FALSE,
              position = ggplot2::position_jitterdodge()
            ) +
            ggplot2::scale_fill_brewer(palette = "Spectral", direction = -1)

          # format the figure to default nature format
          # remove legend, add x/y labels
          temp_plot <- temp_plot +
            theme_omicsEye() +
            ggplot2::theme(
              panel.grid.major = ggplot2::element_blank(),
              panel.grid.minor = ggplot2::element_blank(),
              panel.background = ggplot2::element_blank(),
              axis.line = ggplot2::element_line(colour = "black")
            ) +
            ggplot2::xlab("") + #x_label
            ggplot2::ylab(y_label) +
            ggplot2::annotate(
              geom = "text",
              x = Inf,
              y = Inf,
              hjust = 1,
              vjust = 1,
              label = sprintf(
                "FDR: %s\nCoefficient: %s\nValue: %s",
                formatC(qval, format = "e", digits = 3),
                formatC(coef_val,
                        format = "e",
                        digits = 2),
                results_value
              ) ,
              color = "black",
              size = 2,
              fontface = "italic"
            ) + ggplot2::scale_y_log10()
        }
        stdout <-
          capture.output(print(temp_plot), type = "message")
        if (length(stdout) > 0)
          logging::logdebug(stdout)
        if (count < max_jpgs + 1)
          saved_plots[[count]] <- temp_plot
        saved_ggs[[count]] <- temp_plot
        count <- count + 1
      }

      dev.off()
      # print the saved figures
      for (plot_number in seq(1, max_jpgs)) {
        jpg_file <- file.path(figures_folder,
                              paste0(substr(
                                basename(plot_file), 1, nchar(basename(plot_file)) - 4
                              ),
                              "_",
                              plot_number,
                              ".jpg"))
        jpeg(jpg_file,
             res = 300,
             width = 960,
             height = 960)
        stdout <-
          capture.output(print(saved_plots[[plot_number]]))
        dev.off()
      }
      saveRDS(saved_ggs,
              file = paste(figures_folder,
                           "/" ,
                           label,
                           "_gg_associations.RDS",
                           sep = ""))
      metadata_number <- metadata_number + 1
    }

  }

tweedie_index_plot <-
  function(output_results,
           figures_folder = './figures/')
  {
    # read Tweedieverse output
    if (is.character(output_results)) {
      output_df_all <- read.table(
        output_results,
        header = TRUE,
        row.names = NULL,
        sep = "\t",
        fill = FALSE,
        comment.char = "" ,
        check.names = FALSE
      )
    } else {
      output_df_all <- output_results
    }

    if (dim(output_df_all)[1] < 1) {
      print('There are no associations to plot!')
      return(NULL)
    }

    logging::loginfo(paste("Plotting tweedie.index ",
                           "colored by metadata"))
    plot_file <-
      paste(figures_folder,
            "/tweedie_index_plot.pdf",
            sep = "")

    logging::loginfo("Plotting tweedie.index")
    pdf(plot_file,
        width = 2.65,
        height = 1.5,
        onefile = TRUE)

    temp_plot <- NULL

    temp_plot <- ggplot2::ggplot(output_df_all,
      ggplot2::aes(x=tweedie.index))+
      ggplot2::geom_histogram(position = "identity", alpha = 0.8)  +
      ggplot2::xlab("Tweedie index") + ggplot2::ylab("Number of omics features") +
      theme_omicsEye() +
      ggplot2::theme(legend.justification = c(0, 0),
                     legend.position = c(.3, .5))

    # print the saved figures
    tdout <-
      capture.output(print(temp_plot), type = "message")
    dev.off()
    jpg_file <- paste(figures_folder,
                      "/tweedie_index_plot.jpg",
                      sep = "")
    jpeg(jpg_file,
         res = 300,
         width = 960,
         height = 600)
    stdout <-
      capture.output(print(temp_plot))
    dev.off()
    saveRDS(temp_plot,
            file = paste(figures_folder,
                         "/gg_tweedie_index_plot.RDS",
                         sep = ""))

  }
himelmallick/tweedieverse documentation built on Aug. 13, 2024, 5:48 a.m.