idepGolem: Integrated Differential Expression and Pathway Analysis

Documented in chromosome_plotly

#' fct_07_genome.R This file holds all of the main data analysis functions
#' associated with eighth tab of the iDEP website.
#'
#'
#' @section fct_07_genome.R functions:
#'
#'
#' @name fct_07_genome.R
NULL

#' Plotly of chromosome position
#'
#' Calculate a plotly that shows the chromosome position of significant genes
#' and significantly enriched regions.
#'
#' @param limma Return from \code{limma_value} function
#' @param select_contrast DEG contrast to examine
#' @param all_gene_info Gene information return from \code{get_gene_info}
#' @param ignore_non_coding When TRUE only use protein coding genes
#' @param limma_p_val_viz Adjusted p-value to use for significant genes
#' @param limma_fc_viz Minimum fold-change value to filter with
#' @param label_gene_symbol Paste the gene symbol label on the plot
#' @param ma_window_size Moving average window size for a chromosome
#'   (1, 2, 4, 6, 8, 10, 15, 20)
#' @param ma_window_steps Number of moving average window steps (1, 2, 3, 4)
#' @param ch_region_p_val P-value to use for finding significant chromosome
#'  region enrichment
#' @param hide_patches Boolean to indicate to only keep within 2 MAD from the
#'  median (TRUE/FALSE)
#' @param hide_chr Boolean to indicate if chromosomes with less than 100 genes
#'  are excluded (TRUE/FALSE)
#'
#' @export
#' @return Plotly visualization of chromosomes and significantly enriched genes
chromosome_plotly <- function(limma,
                              select_contrast,
                              all_gene_info,
                              ignore_non_coding,
                              limma_p_val_viz,
                              limma_fc_viz,
                              label_gene_symbol,
                              ma_window_size,
                              ma_window_steps,
                              ch_region_p_val,
                              hide_patches,
                              hide_chr) {
  # Default plot
  fake <- data.frame(a = 1:3, b = 1:3)
  p <- ggplot2::ggplot(fake, ggplot2::aes(x = a, y = b)) +
    ggplot2::geom_blank() +
    ggplot2::ggtitle("No genes with position info.") +
    ggplot2::theme(
      axis.title.x = ggplot2::element_blank(),
      axis.title.y = ggplot2::element_blank()
    )

  if (length(limma$comparisons) == 1) {
    top_1 <- limma$top_genes[[1]]
  } else {
    top <- limma$top_genes
    ix <- match(select_contrast, names(top))
    if (is.na(ix)) {
      return(plotly::ggplotly(p))
    }
    top_1 <- top[[ix]]
  }
  if (dim(top_1)[1] == 0) {
    return(plotly::ggplotly(p))
  }

  # Species in STRING-db do not have chr. location data
  if (sum(!is.na(all_gene_info$start_position)) < 5) {
    return(p)
  }

  colnames(top_1) <- c("Fold", "FDR")

  x <- merge(
    top_1,
    all_gene_info,
    by.x = "row.names",
    by.y = "ensembl_gene_id"
  )

  colnames(x)[which(colnames(x) == "Row.names")] <- "ensembl_gene_id"

  # only coding genes?
  if (ignore_non_coding) {
    x <- subset(x, gene_biotype == "protein_coding")
  }

  # If no chromosomes found. For example if user do not convert gene IDs.
  if (dim(x)[1] > 5) {
    x <- x[order(x$chromosome_name, x$start_position), ]
    x$ensembl_gene_id <- as.character(x$ensembl_gene_id)
    # If symbol is missing use Ensembl id
    x$symbol <- as.character(x$symbol)
    ix <- which(is.na(x$symbol))
    ix2 <- which(nchar(as.character(x$symbol)) <= 2)
    ix3 <- which(duplicated(x$symbol))
    ix <- unique(c(ix, ix2, ix3))
    x$symbol[ix] <- x$ensembl_gene_id[ix]

    x <- x[!is.na(x$chromosome_name), ]
    x <- x[!is.na(x$start_position), ]

    # Only keep significant genes
    ix <- which(
      (x$FDR < as.numeric(limma_p_val_viz)) &
        (abs(x$Fold) > log2(as.numeric(limma_fc_viz)))
    )

    if (length(ix) < 5) {
      return(NULL)
    }
    # Remove non-significant / not selected genes
    # Keep a copy
    x0 <- x
    x <- x[ix, ]

    # gather info on chrosomes
    tem <- sort(table(x$chromosome_name), decreasing = TRUE)
    # ch with less than 100 genes are excluded
    if (hide_chr) {
      ch <- names(tem[tem >= 5])
    } else {
      ch <- names(tem[tem >= 1])
    }

    if (length(ch) > 50) {
      # At most 50 ch
      ch <- ch[1:50]
    }

    if (hide_patches) {
      tem <- nchar(ch)
      # only keep within 2 MAD from the median
      ix <- which(tem <= median(tem) + 2 * mad(tem) + 0.0001)
      ch <- ch[ix]
      # ch. name less than 10 characters
      #    ch <- ch[nchar(ch) <= 12]
    }
    ch <- ch[order(as.numeric(ch))]
    tem <- ch
    # The numbers are continous from 1 to length(ch)
    ch <- 1:(length(ch))
    # The names are real chr. names
    names(ch) <- tem
    x <- x[which(x$chromosome_name %in% names(ch)), ]
    x <- droplevels(x)
    # Numeric encoding
    x$chNum <- 1
    x$chNum <- ch[x$chromosome_name]

    # Use max position as chr. length before filtering
    ch_length_table <- aggregate(start_position ~ chromosome_name, data = x, max)
    # Add chr. numer
    ch_length_table$chNum <- ch[ch_length_table$chromosome_name]
    ch_length_table <- ch_length_table[!is.na(ch_length_table$chNum), ]
    ch_length_table <- ch_length_table[order(ch_length_table$chNum), c(3, 2)]
    ch_length_table <- ch_length_table[order(ch_length_table$chNum), ]
    ch_length_table$start_position <- ch_length_table$start_position / 1e6

    # Prepare coordinates
    # Mbp
    x$start_position <- x$start_position / 1000000
    # Distance between chs.
    chD <- 30
    # Max log2 fold
    fold_cutoff <- 4

    # log2 fold within -5 to 5
    x$Fold[which(x$Fold > fold_cutoff)] <- fold_cutoff
    x$Fold[which(x$Fold < -1 * fold_cutoff)] <- -1 * fold_cutoff
    x$Fold <- 4 * x$Fold

    x$y <- x$chNum * chD + x$Fold
    ch_total <- dim(ch_length_table)[1]
    x$R <- as.factor(sign(x$Fold))

    colnames(x)[which(colnames(x) == "start_position")] <- "x"
    x$R <- as.character(x$R)
    x$R[x$R == "-1"] <- "Down"
    x$R[x$R == "1"] <- "Up"
    x$R <- as.factor(x$R)
    # remove chromosomes with no genes left
    x <- droplevels(x)
    # Don't define x and y, so that we could plot use two datasets
    p <- ggplot2::ggplot() +
      ggplot2::geom_point(
        data = x,
        ggplot2::aes(
          x = x,
          y = y,
          color = R,
          text = paste0(
            "Symbol: ",
            symbol,
            "\nRegulation: ",
            R,
            "\nChr Pos: ",
            x
          )
        ),
        shape = 20,
        size = 0.2
      )

    if (label_gene_symbol) {
      p <- p + ggplot2::geom_text(
        data = x,
        ggplot2::aes(x = x, y = y, label = symbol),
        check_overlap = FALSE,
        angle = 45,
        size = 2,
        vjust = 0,
        nudge_y = 4
      )
    }
    # Label y with ch names
    p <- p + ggplot2::scale_y_continuous(
      labels = paste(
        "chr",
        names(ch[ch_length_table$chNum]),
        sep = ""
      ),
      breaks = chD * (1:ch_total),
      limits = c(0, chD * (ch_total + 1) + 5)
    )

    # Draw horizontal lines for each ch.
    for (i in 1:dim(ch_length_table)[1]) {
      p <- p + ggplot2::annotate(
        "segment",
        x = 0,
        xend = ch_length_table$start_position[i],
        y = ch_length_table$chNum[i] * chD,
        yend = ch_length_table$chNum[i] * chD
      )
    }
    # Change legend  http://ggplot2.tidyverse.org/reference/scale_manual.html
    # Customize legend text
    p <- p + ggplot2::scale_color_manual(
      name = "",
      values = c("red", "blue"),
      breaks = c("Up", "Down"),
      labels = c("Up", "Dn")
    )
    p <- p + ggplot2::xlab("Position on chrs. (Mbp)") +
      ggplot2::theme(axis.title.y = ggplot2::element_blank())
    p <- p + ggplot2::theme(legend.position = "none")


    x0 <- x0[x0$chromosome_name %in% unique(x$chromosome_name), ]
    # Numeric encoding
    x0$chNum <- 1
    x0$chNum <- ch[x0$chromosome_name]
    # Mbp
    x0$start_position <- x0$start_position / 1e6

    window_size <- as.numeric(ma_window_size)
    # Step size is then windowSize / steps
    steps <- as.numeric(ma_window_steps)

    # Centering
    x0$Fold <- x0$Fold - mean(x0$Fold)

    for (i in 0:(steps - 1)) {
      # Step size is windowSize/steps
      # If windowSize=10 and steps = 2; then step size is 5Mb
      # 1.3 becomes 5, 11.2 -> 15 for step 1
      # 1.3 -> -5
      x0$x <- (
        floor((x0$start_position - i * window_size / steps) / window_size)
        + 0.5 + i / steps
      ) * window_size

      moving_average_start <- x0 |>
        dplyr::select(chNum, x, Fold) |>
        # Beginning bin can be negative for first bin in the 2nd step
        dplyr::filter(x >= 0) |>
        dplyr::group_by(chNum, x) |>
        dplyr::summarize(
          ma = mean(Fold),
          n = dplyr::n(),
          pval = ifelse(
            dplyr::n() >= 3 && sd(Fold) > 0, stats::t.test(Fold)$p.value, 0
          )
        ) |>
        # na when only 1 data point?
        dplyr::filter(!is.na(pval))

      if (i == 0) {
        moving_average <- moving_average_start
      } else {
        moving_average <- rbind(moving_average, moving_average_start)
      }
    }
    # Translate fold to y coordinates
    moving_average <- moving_average |>
      dplyr::filter(n >= 3) |>
      dplyr::mutate(pval = stats::p.adjust(pval, method = "fdr")) |>
      dplyr::filter(pval < as.numeric(ch_region_p_val)) |>
      dplyr::mutate(y = ifelse(ma > 0, 1, -1)) |>
      dplyr::mutate(y = chNum * chD + 3 * y) |>
      dplyr::mutate(ma = ifelse(ma > 0, 1, -1)) |>
      dplyr::mutate(ma = as.factor(ma))

    moving_average$ma <- as.character(moving_average$ma)
    moving_average$ma[moving_average$ma == "-1"] <- "Down"
    moving_average$ma[moving_average$ma == "1"] <- "Up"
    moving_average$ma <- as.factor(moving_average$ma)

    # Significant regions are marked as horizontal error bars
    if (dim(moving_average)[1] > 0) {
      p <- p + ggplot2::geom_errorbarh(
        data = moving_average,
        ggplot2::aes(
          x = x,
          y = y,
          xmin = x - window_size / 2,
          xmax = x + window_size / 2,
          colour = ma,
          text = paste0(
            paste0(
              "Window: ",
              x - window_size / 2,
              " - ",
              x + window_size / 2,
              "\nRegulation: ",
              ma,
              "\nChr Pos: ",
              x
            )
          )
        ),
        size = 2,
        height = 15
      )

      # Label significant regions
      sig_ch <- sort(table(moving_average$chNum), decreasing = TRUE)
      sig_ch <- names(ch)[as.numeric(names(sig_ch))]
      if (length(sig_ch) <= 5) {
        # More than 5 just show 5
        sig_ch <- paste0("chr", sig_ch, collapse = ", ")
      } else {
        sig_ch <- sig_ch[1:5]
        sig_ch <- paste0("chr", sig_ch, collapse = ", ")
        sig_ch <- paste0(sig_ch, ", ...")
      }

      sig_ch <- paste(
        dim(moving_average)[1],
        " enriched regions \n(",
        round(
          sum(ch_length_table$start_position) /
            window_size * steps * as.numeric(ch_region_p_val),
          2
        ),
        " expected)  detected on:\n ",
        sig_ch
      )

      p <- p + ggplot2::annotate(
        geom = "text",
        x = max(x$x) * 0.70,
        y = max(x$y) * 0.90,
        label = sig_ch
      )
    }
  }
  plotly::ggplotly(p, tooltip = "text")
}