R/DEG.R

Defines functions EIF4F_DEG_analysis .plot_boxgraph_RNAratio_TCGA .plot_boxgraph_RNAseq_TCGA .RNAratio_tumortype .RNAratio_boxplot .RNAratio_selection .RNAratio_calculation .violinplot .RNAseq_tumortype .RNAseq_boxplot .RNAseq_ind_gene .RNAseq_grouped_boxplot .RNAseq_all_gene .get_TCGA_GTEX_RNAseq initialize_RNAseq_data

Documented in EIF4F_DEG_analysis .get_TCGA_GTEX_RNAseq initialize_RNAseq_data .plot_boxgraph_RNAratio_TCGA .plot_boxgraph_RNAseq_TCGA .RNAratio_boxplot .RNAratio_calculation .RNAratio_selection .RNAratio_tumortype .RNAseq_all_gene .RNAseq_boxplot .RNAseq_grouped_boxplot .RNAseq_ind_gene .RNAseq_tumortype .violinplot

# Differential gene expression (ratio) analysis of EIF4F genes in TCGA
# This R script contains four sections.
#
# (1) RNAseq data preparation
#
# (2) analyze differential mRNA gene (ratio) expression and plotting
#
# (3) composite functions to execute a pipeline of functions to select related
#  RNAseq data  with supply of EIF4F gene names for analysis and plotting.
#
# (4) wrapper function to call all master functions with inputs
#

## Wrapper function for data initialization of RNA-seq related datasets ========

#' @noRd
## due to NSE notes in R CMD check
TCGA_GTEX_RNAseq_sampletype <- NULL


#' @title Read all RNA-seq related datasets from TCGA and GTEx
#'
#' @description
#'
#' A wrapper function reads RNA-seq related datasets from TCGA and GTEx.
#'
#' @details
#'
#' Its side effects is the global variable `TCGA_GTEX_RNAseq_sampletype`, which
#'  was merged from two internal data frames:
#'
#' (1) `.TCGA_GTEX_RNAseq`: the recomputed RNAseq data from both TCGA and GTEx
#'  generated by [.get_TCGA_GTEX_RNAseq()], which imports the dataset
#'  `TcgaTargetGtex_RSEM_Hugo_norm_count`.
#'
#' (2) `.TCGA_GTEX_sampletype` annotates the feature for each sample from
#'  TCGA and GTEx. The data frame imports the `TcgaTargetGTEX_phenotype.txt`
#'  dataset and performed basic data cleaning steps including removal of
#'  duplicates and NAs.
#'
#'  To reduce the data size, we only select the following four relevant columns
#'  out of `TcgaTargetGTEX_phenotype.txt` to construct `.TCGA_GTEX_sampletype`.
#'
#'   * `sample.type` column that annotates  malignant of normal tissues
#'   * `primary.disease` column that annotates cancer types for each sample
#'   * `primary.site` column that annotates the tissue types
#'   * `study` column that annotates the cohort “TCGA” or “GTEx”
#'
#' `TCGA_GTEX_RNAseq_sampletype` was stored as `TCGA_GTEX_RNAseq_sampletype.csv` in
#'  `~/Documents/EIF_output/ProcessedData` folder.
#'
#' @family wrapper function for data initialization
#'
#' @importFrom dplyr distinct filter select select_if mutate mutate_at
#' summarise rename group_by all_of
#'
#' @importFrom readr read_tsv
#'
#' @importFrom tibble remove_rownames column_to_rownames
#'
#' @importFrom data.table fread transpose
#'
#' @export
#'
#' @examples \dontrun{
#' initialize_RNAseq_data()
#' }
#'
initialize_RNAseq_data <- function() {
  rlang::env_binding_unlock(parent.env(environment()), nms = NULL)

  if (!file.exists(file.path(
    output_directory,
    "ProcessedData",
    "TCGA_GTEX_RNAseq_sampletype.csv"
  ))) {
    .TCGA_GTEX_RNAseq <- .get_TCGA_GTEX_RNAseq()

    .TCGA_GTEX_sampletype <- readr::read_tsv(
      file.path(
        data_file_directory,
        "TcgaTargetGTEX_phenotype.txt"
      ),
      show_col_types = FALSE
    ) %>%
      tibble::as_tibble() %>%
      dplyr::distinct(.data$sample, .keep_all = TRUE) %>%
      tibble::remove_rownames() %>%
      tibble::column_to_rownames(var = "sample") %>%
      dplyr::select(
        "_sample_type",
        "primary disease or tissue",
        "_primary_site",
        "_study"
      ) %>%
      dplyr::rename(
        "sample.type" = "_sample_type",
        "primary.disease" = "primary disease or tissue",
        "primary.site" = "_primary_site",
        "study" = "_study"
      )

    assign("TCGA_GTEX_RNAseq_sampletype",
           merge(.TCGA_GTEX_RNAseq,
                 .TCGA_GTEX_sampletype,
                 by    = "row.names",
                 all.x = TRUE
           ) %>%
             tibble::remove_rownames() %>%
             tibble::column_to_rownames(var = "Row.names"),
           envir = parent.env(environment()))

    data.table::fwrite(TCGA_GTEX_RNAseq_sampletype, file.path(
      output_directory,
      "ProcessedData",
      "TCGA_GTEX_RNAseq_sampletype.csv"
    ),row.names = TRUE)

    } else {
      assign("TCGA_GTEX_RNAseq_sampletype",
             data.table::fread(file.path(
               output_directory,
               "ProcessedData",
               "TCGA_GTEX_RNAseq_sampletype.csv"),data.table = FALSE
             ) %>%
               tibble::as_tibble() %>%
               #tibble::remove_rownames() %>%
               tibble::column_to_rownames(var = "V1"),
             envir = parent.env(environment()))
      }
  rlang::env_binding_lock(parent.env(environment()), nms = NULL)

  return(NULL)
}


#' @title Read recomputed RNAseq data from both TCGA and GTEx
#'
#' @description
#'
#' A helper function reads the dataset `TcgaTargetGtex_RSEM_Hugo_norm_count`,
#'  which contains the recomputed RNAseq data from both TCGA and GTEx.
#'
#' @details
#'
#' The function also removes possible duplicated tumor samples and samples with
#'  NAs in the dataset. It should not be used directly, only inside
#'  [initialize_RNAseq_data()] function.
#'
#' @family helper function for data initialization
#'
#' @return
#'
#' a data frame that contains the recomputed RNAseq data from both TCGA and GTEx
#'
#' @examples \dontrun{
#' .get_TCGA_GTEX_RNAseq()
#' }
#'
#' @keywords internal
#'
.get_TCGA_GTEX_RNAseq <- function() {
  .TCGA_pancancer <- data.table::fread(
    file.path(
      data_file_directory,
      "TcgaTargetGtex_RSEM_Hugo_norm_count"
    ),
    data.table = FALSE
  ) %>%
    tibble::as_tibble() %>%
    dplyr::distinct(.data$sample, .keep_all = TRUE) %>%
    stats::na.omit() %>%
    tibble::remove_rownames() %>%
    tibble::column_to_rownames(var = "sample")
  # transpose function from the data.table keeps numeric values as numeric.
  .TCGA_pancancer_transpose <- data.table::transpose(.TCGA_pancancer)
  # get row and colnames in order
  rownames(.TCGA_pancancer_transpose) <- colnames(.TCGA_pancancer)
  colnames(.TCGA_pancancer_transpose) <- rownames(.TCGA_pancancer)

  return(.TCGA_pancancer_transpose)
}


## Differential expression analysis and plotting ===============================

#' @title Compares expressions among genes in tumor samples
#'
#' @description
#'
#' This function selects the RNAseq data from TCGA samples, excludes
#'  samples with label of `Solid Tissue Normal` and ranks the genes by their
#'  mRNA expression.
#' It should not be used directly, only inside [.plot_boxgraph_RNAseq_TCGA()]
#'  function.
#'
#' @family helper function for differential expression analysis
#'
#' @param df
#'
#' `.TCGA_GTEX_RNAseq_sampletype_subset` generated
#'  inside [.plot_boxgraph_RNAseq_TCGA()]
#'
#' @return
#'
#' a data frame ranking genes by their mRNA expressions in lung adenocarcinoma
#'
#' @importFrom dplyr pull
#'
#' @importFrom forcats fct_reorder
#'
#' @examples \dontrun{
#' .RNAseq_all_gene(.TCGA_GTEX_RNAseq_sampletype_subset)
#' }
#'
#' @keywords internal
#'
.RNAseq_all_gene <- function(df) {
  df <- df %>%
    dplyr::filter(.data$study == "TCGA" & .data$sample.type != "Solid Tissue Normal")
  # order the EIF genes according to the order of the expression means
  order <- df %>%
    # dplyr::filter out category equal to 'Lung Adenocarcinoma'
    dplyr::filter(.data$primary.disease == "Lung Adenocarcinoma") %>%
    # use the same groups as in the ggplot
    dplyr::group_by(.data$variable) %>%
    # calculate the means
    dplyr::summarise(mean_RNAseq = mean(.data$RNAseq)) %>%
    dplyr::mutate(variable = forcats::fct_reorder(.data$variable,
                                                  .data$mean_RNAseq)) %>%
    dplyr::pull(.data$variable) %>%
    levels()

  return(df %>%
    dplyr::mutate(variable = factor(.data$variable, levels = rev(order))))
}

#' Grouped box plots of RNA expression across tumors
#'
#' @description
#'
#' This function takes the output of [.RNAseq_all_gene()] function and generate
#'  box plots of differential gene expression across all TCGA cancer types
#' This function should not be used directly,
#'  only inside [.plot_boxgraph_RNAseq_TCGA()]function.
#'
#' Side effect: box plots on screen and as pdf file to compare relative
#'  expression of EIF genes across all TCGA cancer type
#'
#' @family helper function for differential expression plotting
#'
#' @param df
#'
#' output dataset from `.RNAseq_all_gene(.TCGA_GTEX_RNAseq_sampletype_subset)`,
#'  which was generated inside [.plot_boxgraph_RNAseq_TCGA()]function.
#'
#' @keywords internal
#'
.RNAseq_grouped_boxplot <- function(df) {
  p1 <- ggplot(
    data = df,
    aes_(
      x = ~primary.disease,
      y = ~ 2**RNAseq
    )
  ) +
    scale_y_continuous(
      trans = log2_trans(),
      # limits = c(2**4, 2**17),
      labels = label_comma()
    ) +
    stat_n_text(
      size = 5,
      fontface = "bold",
      angle = 90,
      hjust = 0
    ) +
    geom_boxplot(aes_(
      # colour = factor(variable),
      colour = ~variable,
    ),
    outlier.shape = NA,
    position = position_dodge(width = 1)
    ) +
    labs(
      x = "primary disease",
      y = paste("Normalized expression (RNA-Seq counts)")
    ) +
    theme_bw() +
    theme(
      plot.title = black_bold_12,
      axis.title.x = element_blank(),
      axis.title.y = black_bold_12,
      axis.text.x = black_bold_12_45,
      axis.text.y = black_bold_12,
      panel.grid = element_blank(),
      legend.title = element_blank(),
      legend.text = black_bold_12,
      legend.position = c(0, 0.95),
      legend.direction = "horizontal",
      legend.justification = c(0, 1),
      legend.box = "horizontal",
      strip.text = black_bold_12
    )
  print(p1)

  ggplot2::ggsave(
    path = file.path(output_directory, "DEG"),
    filename = "EIF RNAseq Grouped Boxplot.pdf",
    plot = p1,
    width = 16.5,
    height = 8,
    useDingbats = FALSE
  )

  return(NULL)
}


#' @title Analyzes differential gene expression in tumors vs adjacent normal tissues
#'
#' @description
#'
#' A helper function selects the RNAseq data of each EIF genes from TCGA tumors
#'  and solid tissue normal samples in `.TCGA_GTEX_RNAseq_sampletype_subset`.
#' It should not be used directly, only inside [.plot_boxgraph_RNAseq_TCGA()]
#' function.
#'
#' @family helper function for differential expression analysis
#'
#' @param df
#'
#' `.TCGA_GTEX_RNAseq_sampletype_subset` generated
#'  inside [.plot_boxgraph_RNAseq_TCGA()]
#'
#' @param gene
#'
#' one gene from the input argument of [.plot_boxgraph_RNAseq_TCGA()]
#'
#' @return
#'
#' a data frame of differential gene expression in tumors vs adjacent
#'  normal tissues from individual TCGA cancer types
#'
#' @importFrom dplyr case_when
#'
#' @importFrom forcats fct_rev
#'
#' @keywords internal
#'
.RNAseq_ind_gene <- function(df, gene) {
  df <- df %>%
    dplyr::filter(.data$study == "TCGA") %>%
    droplevels() %>%
    dplyr::mutate(sample.type = dplyr::case_when(
      sample.type != "Solid Tissue Normal" ~ "Tumor",
      sample.type == "Solid Tissue Normal" ~ "Normal"
    )) %>%
    dplyr::filter(.data$variable == gene) %>%
    dplyr::mutate(primary.disease = forcats::fct_rev(.data$primary.disease))

  return(list(df, gene))
}

#' Box plots of differential gene expression across tumors
#'
#' @description
#'
#' This function takes the output of [.RNAseq_ind_gene()] and generate box plots
#'  for differential gene expression in tumors vs NATs in TCGA cancer types
#' This function should not be used directly,
#'  only inside [.plot_boxgraph_RNAseq_TCGA()]function.
#'
#' Side effects:
#'
#' (1) box plots on screen and as pdf file to show differential
#'  gene expression in tumors vs NATs in TCGA cancer types
#'
#' @family helper function for differential expression plotting
#'
#' @param df
#'
#' output dataset generated from [.RNAseq_ind_gene()] function
#'
#' @importFrom scales log2_trans label_comma
#'
#' @keywords internal
#'
.RNAseq_boxplot <- function(df) {
  p1 <- ggplot(
    data = df[[1]],
    aes_(
      x = ~primary.disease,
      y = ~ 2**RNAseq,
      color = ~sample.type
    )
  ) +
    scale_y_continuous(
      trans = scales::log2_trans(),
      # limits = c(2**11, 2**17),# for 4g
      # limits = c(2**7, 2**14),# for eif4E
      labels = scales::label_comma()
    ) +
    stat_n_text(
      size = 5,
      fontface = "bold",
      hjust = 0
    ) +
    geom_boxplot(
      # alpha = .1,
      # fill = sample.type,
      outlier.shape = NA,
      # size = .75,
      # width = 1,
      position = "dodge"
    ) +
    scale_color_manual(
      values = c(
        "Tumor" = "#CC79A7",
        "Normal" = "#0072B2"
      ),
      breaks = c("Tumor", "Normal"),
      labels = c("Tumor\n", "Normal\n")
    ) +
    labs(
      x = "primary disease",
      y = paste(df[[2]], "expression (RNA-Seq counts)")
    ) +
    coord_flip() +
    theme_bw() +
    theme(
      plot.title = black_bold_12,
      axis.title.x = black_bold_12,
      axis.title.y = element_blank(),
      axis.text.x = black_bold_12,
      axis.text.y = black_bold_12,
      panel.grid = element_blank(),
      legend.title = element_blank(),
      legend.text = black_bold_12,
      legend.position = "top",
      legend.justification = "left",
      legend.box = "horizontal",
      strip.text = black_bold_12
    )
  # geom_signif(comparisons=list(c("Tumor", "Normal")))
  print(p1)
  ggplot2::ggsave(
    path = file.path(output_directory, "DEG"),
    filename = paste(df[[2]], "pancancer RNAseq.pdf"),
    plot = p1,
    width = 7.5,
    height = 9,
    useDingbats = FALSE
  )

  return(NULL)
}


#' @title Select gene expression data in primary, metastatic tumors vs adjacent normal
#'  tissues from all TCGA cancer types combined
#'
#' @description
#'
#' This function selects the RNAseq data from TCGA samples including
#'  tumor samples that are labeled as `metastatic`, `primary tumor`,
#'  and `solid tissue normal` for comparison.
#' It should not be used directly, only inside [.plot_boxgraph_RNAseq_TCGA()]
#' function.
#'
#' @family helper function for differential expression analysis
#'
#' @param df
#'
#' `.TCGA_GTEX_RNAseq_sampletype_subset` generated inside
#' [.plot_boxgraph_RNAseq_TCGA()]
#'
#' @return
#'
#' a data frame of differential gene expression in tumors vs adjacent
#'  normal tissues in individual TCGA cancer types
#'
#' @keywords internal
#'
.RNAseq_tumortype <- function(df) {
  return(df %>%
    dplyr::filter(.data$study == "TCGA") %>%
    dplyr::mutate_if(is.character, as.factor) %>%
    dplyr::filter(.data$sample.type %in% c(
      "Metastatic",
      "Primary Tumor",
      "Solid Tissue Normal"
    )))
}

#' @title Violin plots of differential gene expression/ratio in primary,
#'  metastatic tumors vs adjacent normal tissues
#'
#' @description
#'
#' This function should not be used directly, only inside
#'  [.plot_boxgraph_RNAseq_TCGA()] or [.plot_boxgraph_RNAratio_TCGA()] function.
#'
#'  Side effect:
#'  (1) the violin plots on screen and as pdf file to show
#'   differential gene expression/ratio in primary, metastatic tumors vs
#'   adjacent normal tissues from all combined TCGA cancer types
#'
#' @family helper function for differential expression plotting
#'
#' @param df
#'
#' output dataset generated from [.RNAseq_tumortype()] or
#'  [.RNAratio_tumortype()] function
#'
#' @param y.axis.title the title name for y axis
#'
#' @param y.axis.break the break mark on the y axis
#'
#' @param dashline the position of horizontal reference line
#'
#' @importFrom EnvStats stat_n_text
#'
#' @keywords internal
#'
.violinplot <- function(df, y.axis.title, y.axis.break, dashline) {
  p1 <- ggplot(
    data = df,
    # data = EIF.TCGA.RNAseq.anno.subset.long,
    aes_(
      x = ~sample.type,
      y = ~ 2**RNAseq,
      color = ~sample.type,
      fill = ~sample.type
    )
  ) +
    EnvStats::stat_n_text(
      size = 6,
      fontface = "bold",
      angle = 90,
      hjust = 0
    ) +
    ggplot2::facet_grid(. ~ variable,
      scales = "free",
      space  = "free"
    ) +
    # facet_wrap(~ variable, ncol = 6) +
    geom_violin(trim = TRUE) +
    geom_boxplot(
      alpha = .01,
      width = .25,
      color = "black",
      position = position_dodge(width = .9)
    ) +
    labs(
      x = "sample type",
      y = y.axis.title
      # y = "normalized RNA counts"
    ) +
    scale_x_discrete(labels = c(
      "Metastatic Tumor",
      "Primary Tumor",
      # "Normal Tissue",
      "Adjacent Normal"
    )) +
    scale_y_continuous(
      trans = scales::log2_trans(),
      labels = scales::label_comma(),
      breaks = y.axis.break,
      # breaks = c(1, 128, 2048, 32768),
      # labels = c("1","8","64","512"),
      position = "left"
    ) +
    geom_hline(yintercept = dashline, linetype = "dashed") +
    # for color-blind palettes
    scale_color_manual(values = c("#56B4E9", "#009E73", "#D55E00")) +
    # for color-blind palettes
    scale_fill_manual(values = c("#56B4E9", "#009E73", "#D55E00")) +
    theme_bw() +
    theme(
      plot.title = black_bold_16,
      axis.title.x = element_blank(),
      axis.title.y = black_bold_16_right,
      axis.text.x = black_bold_16_90,
      axis.text.y = black_bold_16_90,
      axis.line.x = element_line(color = "black"),
      axis.line.y = element_line(color = "black"),
      panel.grid = element_blank(),
      legend.position = "none",
      strip.text = black_bold_16
    ) +
    ggpubr::stat_compare_means(
      comparisons = list(
        c("Metastatic", "Solid Tissue Normal"),
        c("Primary Tumor", "Solid Tissue Normal"),
        c("Metastatic", "Primary Tumor")
      ),
      method = "t.test",
      label = "p.signif",
      size = 6
    )
  print(p1)
  ggplot2::ggsave(
    path = file.path(output_directory, "DEG"),
    filename = paste(y.axis.title, "violin.pdf"),
    plot = p1,
    width = 18,
    height = 9,
    useDingbats = FALSE
  )

  return(NULL)
}


#' @title Calculate RNA ratios
#'
#' @description A helper function calculates the RNA ratios between genes
#'
#' @details This function
#'
#'  * select the of RNAseq data of input genes from all TCGA samples, using the
#'   data frame `TCGA_GTEX_RNAseq_sampletype` prepared by the function
#'   [initialize_RNAseq_data()].
#'  * calculates the RNA ratio data in each TCGA samples including
#'  tumors and solid tissue normal samples for comparison.
#'
#' It should not be used directly, only inside [.plot_boxgraph_RNAratio_TCGA()]
#'  function.
#'
#' @family helper function for differential expression analysis
#'
#' @param gene01
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene02
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene03
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene04
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene05
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene06
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene07
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene08
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene09
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene10
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @importFrom rlang :=
#'
#' @return
#'
#' a data frame of differential RNA ratios in tumors vs adjacent normal tissues
#'  from individual TCGA cancer types
#'
#' @keywords internal
#'
.RNAratio_calculation <- function(gene01 = "EIF4E",
                                  gene02 = "EIF4E2",
                                  gene03 = "EIF4E3",
                                  gene04 = "EIF4EBP1",
                                  gene05 = "EIF4G1",
                                  gene06 = "EIF4G2",
                                  gene07 = "EIF4G3",
                                  gene08 = "EIF3D",
                                  gene09 = "EIF4A1",
                                  gene10 = "EIF4A2") {
  .genes_names <- c(
    gene01, gene02, gene03, gene04,
    gene05, gene06, gene07, gene08,
    gene09, gene10
  )

  return(TCGA_GTEX_RNAseq_sampletype %>%
           dplyr::select(
             dplyr::all_of(.genes_names),
             "sample.type",
             "primary.disease",
             "primary.site",
             "study"
           ) %>%
           tibble::as_tibble() %>%
           dplyr::filter(gene01 != 0 & !is.na(.data$primary.site)) %>%
           # calculate the ratio of mRNA counts
           dplyr::mutate(
             (!!paste0(gene01, "+", gene04)) :=
               log2(2**(!!as.name(gene01)) + 2**(!!as.name(gene04)) - 1),
             # ":", "\n" split the final ratio name in two lines
             # for the plotting purpose.
             (!!paste0(gene09, ":", "\n", gene01)) :=
               (!!as.name(gene09)) - (!!as.name(gene01)),
             (!!paste0(gene09, ":", "\n", gene02)) :=
               (!!as.name(gene09)) - (!!as.name(gene02)),
             (!!paste0(gene10, ":", "\n", gene01)) :=
               (!!as.name(gene10)) - (!!as.name(gene01)),
             (!!paste0(gene10, ":", "\n", gene02)) :=
               (!!as.name(gene10)) - (!!as.name(gene02)),
             (!!paste0(gene05, ":", "\n", gene01)) :=
               (!!as.name(gene05)) - (!!as.name(gene01)),
             (!!paste0(gene05, ":", "\n", gene02)) :=
               (!!as.name(gene05)) - (!!as.name(gene02)),
             (!!paste0(gene07, ":", "\n", gene01)) :=
               (!!as.name(gene07)) - (!!as.name(gene01)),
             (!!paste0(gene07, ":", "\n", gene02)) :=
               (!!as.name(gene07)) - (!!as.name(gene02)),
             (!!paste0(gene09, ":", "\n", gene05)) :=
               (!!as.name(gene09)) - (!!as.name(gene05)),
             (!!paste0(gene10, ":", "\n", gene05)) :=
               (!!as.name(gene10)) - (!!as.name(gene05)),
             (!!paste0(gene09, ":", "\n", gene06)) :=
               (!!as.name(gene09)) - (!!as.name(gene06)),
             (!!paste0(gene10, ":", "\n", gene06)) :=
               (!!as.name(gene10)) - (!!as.name(gene06)),
             (!!paste0(gene01, ":", "\n", gene04)) :=
               (!!as.name(gene01)) - (!!as.name(gene04)),
             (!!paste0(gene02, ":", "\n", gene01)) :=
               (!!as.name(gene02)) - (!!as.name(gene01)),
             (!!paste0(gene06, ":", "\n", gene05)) :=
               (!!as.name(gene06)) - (!!as.name(gene05)),
             (!!paste0(gene05, ":", "\n", gene07)) :=
               (!!as.name(gene05)) - (!!as.name(gene07)),
             (!!paste0(gene09, ":", "\n", gene10)) :=
               (!!as.name(gene09)) - (!!as.name(gene10)),
             (!!paste0(gene05, ":", "\n", gene01, "+", gene04)) :=
               (!!as.name(gene05)) -
               log2(2**(!!as.name(gene01)) + 2**(!!as.name(gene04)) - 1),
             (!!paste0(gene09, ":", "\n", gene01, "+", gene04)) :=
               (!!as.name(gene09)) -
               log2(2**(!!as.name(gene01)) + 2**(!!as.name(gene04)) - 1)
           ) %>%
           dplyr::select(
             (!!paste0(gene09, ":", "\n", gene01)),
             (!!paste0(gene09, ":", "\n", gene02)),
             (!!paste0(gene10, ":", "\n", gene01)),
             (!!paste0(gene10, ":", "\n", gene02)),
             (!!paste0(gene05, ":", "\n", gene01)),
             (!!paste0(gene05, ":", "\n", gene02)),
             (!!paste0(gene07, ":", "\n", gene01)),
             (!!paste0(gene07, ":", "\n", gene02)),
             (!!paste0(gene09, ":", "\n", gene05)),
             (!!paste0(gene10, ":", "\n", gene05)),
             (!!paste0(gene09, ":", "\n", gene06)),
             (!!paste0(gene10, ":", "\n", gene06)),
             (!!paste0(gene01, ":", "\n", gene04)),
             (!!paste0(gene02, ":", "\n", gene01)),
             (!!paste0(gene06, ":", "\n", gene05)),
             (!!paste0(gene05, ":", "\n", gene07)),
             (!!paste0(gene09, ":", "\n", gene10)),
             (!!paste0(gene05, ":", "\n", gene01, "+", gene04)),
             (!!paste0(gene09, ":", "\n", gene01, "+", gene04)),
             "sample.type",
             "primary.disease",
             "primary.site",
             "study"
           ) %>%
           dplyr::mutate_if(is.character, as.factor) %>%
           stats::na.omit())
}

#' @title Select RNA ratio data for plotting
#'
#' @description
#'
#' This function should not be used directly,
#'  only inside [.plot_boxgraph_RNAratio_TCGA()]function.
#'
#' @family helper function for differential expression analysis
#'
#' @param df
#'
#' output dataset generated from [.RNAratio_calculation()] function
#'
#' @param gene_ratio gene ratio name
#'
#' input argument for selected variables generated
#'  from [.plot_boxgraph_RNAratio_TCGA()] function
#'
#' @keywords internal
#'
.RNAratio_selection <- function(df, gene_ratio) {
  return(df %>%
           dplyr::filter(.data$study == "TCGA") %>%
           droplevels() %>%
           dplyr::mutate(sample.type = dplyr::case_when(
             sample.type != "Solid Tissue Normal" ~ "Tumor",
             sample.type == "Solid Tissue Normal" ~ "NAT"
           )) %>%
           dplyr::select(
             dplyr::all_of(gene_ratio),
             "sample.type",
             "primary.disease",
             "primary.site",
             "study"
           ) %>%
           reshape2::melt(
             id = c(
               "sample.type",
               "primary.disease",
               "primary.site",
               "study"
             ),
             value.name = "RNAseq"
           ) %>%
           dplyr::mutate_if(is.character, as.factor) %>%
           dplyr::mutate(
             primary.disease = forcats::fct_rev(.data$primary.disease)))
}


#' @title Box plots of differential RNA ratios across tumors
#'
#' @description
#'
#' This function should not be used directly,
#'  only inside [.plot_boxgraph_RNAratio_TCGA()]function.
#'
#' Side effects:
#' (1)the box plots on screen and as pdf files to show differential
#'  RNA ratios across tumors
#'
#' @family helper function for differential expression plotting
#'
#' @param df
#'
#' output dataset generated from [.RNAratio_selection()] function
#'
#' @param dashline the position of horizontal reference line
#'
#' @param ylimit the lower and upper limit of the axis scale
#'
#' @param filename the name for the output file
#'
#' @keywords internal
#'
.RNAratio_boxplot <- function(df, dashline, ylimit, filename) {
  p1 <- ggplot(
    data = df,
    aes_(
      x = ~primary.disease,
      # x = f.ordered1,
      y = ~ 2**RNAseq,
      # fill  = variable,
      color = ~sample.type
    )
  ) +
    geom_boxplot(
      # alpha = .1,
      # fill = sample.type,
      outlier.shape = NA,
      # size = .75,
      # width = 1,
      position = "dodge"
    ) +
    geom_hline(
      # data = df,
      aes(yintercept = dashline),
      linetype = "dashed"
    ) +
    scale_color_manual(
      values = c("Tumor" = "#CC79A7", "NAT" = "#0072B2"),
      breaks = c("Tumor", "NAT")
    ) +
    # for color-blind palettes
    facet_wrap(~variable, scales = "free_x") +
    ggplot2::facet_grid(~variable, scales = "free_x", space = "free") +
    guides(colour = guide_legend(nrow = 1)) +
    labs(
      x = "primary disease",
      y = "Ratio of RNA counts"
    ) +
    coord_flip(ylim = ylimit) +
    theme_bw() +
    theme(
      plot.title = black_bold_12,
      axis.title.x = black_bold_12,
      axis.title.y = element_blank(),
      axis.text.x = black_bold_12,
      axis.text.y = black_bold_12,
      panel.grid = element_blank(),
      legend.title = element_blank(),
      legend.position = "top",
      legend.justification = "left",
      legend.box = "horizontal",
      legend.text = black_bold_12,
      # strip.background = element_blank(),
      strip.text.x = black_bold_12
    )
  print(p1)
  ggplot2::ggsave(
    path = file.path(output_directory, "DEG"),
    filename = filename,
    plot = p1,
    width = 18,
    height = 8,
    useDingbats = FALSE
  )

  return(NULL)
}


#' @title Analyzes differential RNA ratios in primary, metastatic tumors vs
#'  adjacent normal tissues from all TCGA cancer types combined
#'
#' @description
#'
#' This function selects the RNA ratio data from TCGA samples including
#'  tumor samples that are labeled as `metastatic`, `primary tumor`,
#'  and `solid tissue normal` for comparison.
#'
#' It should not be used directly, only
#'  inside [.plot_boxgraph_RNAratio_TCGA()] function.
#'
#' @family helper function for differential expression analysis
#'
#' @param df
#'
#' `.RNAratio_tumortype(.RNAratio_data)` generated
#'  inside [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene_ratio
#'
#' @return
#'
#' a data frame of differential RNA ratios in tumors vs adjacent normal tissues
#' from individual TCGA cancer types
#'
#' @keywords internal
#'
.RNAratio_tumortype <- function(df, gene_ratio) {
  .RNAratio_data <- df %>%
    dplyr::filter(.data$sample.type %in% c(
      "Metastatic",
      "Primary Tumor",
      "Solid Tissue Normal"
    )) %>%
    droplevels() %>%
    dplyr::select(
      dplyr::all_of(gene_ratio),
      "sample.type",
      "primary.disease",
      "primary.site",
      "study"
    ) %>%
    reshape2::melt(
      id = c(
        "sample.type",
        "primary.disease",
        "primary.site",
        "study"
      ),
      value.name = "RNAseq"
    ) %>%
    dplyr::mutate_if(is.character, as.factor) %>%
    dplyr::mutate(primary.disease = forcats::fct_rev(.data$primary.disease))

  return(.RNAratio_data)
}


## Composite functions to call DEG gene analysis and plotting ==================

#' @title Compare the expression of EIF4F genes
#'
#' @description
#'
#' This function generates a summary box plot to compare the expression of all
#'  EIF4F genes across TCGA cancer types, box plots for differential gene
#'  expression in tumors tumors vs adjacent normal tissues for each  gene.
#'  and a violin plot to compare differential gene expression in primary,
#'  metastatic tumors vs adjacent normal tissues from all TCGA cancer types
#'  combined.
#'
#' @family composite function to call DEG gene analysis and plotting
#'
#' @param gene_list
#'
#' gene names in a vector of characters
#'
#' @details This function
#'
#' * selects RNASeq of input genes and columns including `sample.type`,
#'  `primary.disease`, and `primary.site`,from the data frame
#'  `TCGA_GTEX_RNAseq_sampletype` that was prepared by the function
#'  [initialize_RNAseq_data()] to produce a subset data frame called
#'  `.TCGA_GTEX_RNAseq_sampletype_subset`
#' * with the subset data, calls the functions [.RNAseq_all_gene()] to compare
#'   relative mRNA expression of all inquired genes and calls the functions
#'   [.RNAseq_grouped_boxplot()] plot the results as a grouped box plot.
#' * performs differential expression analysis for each gene across all
#'  tumors types with the function [.RNAseq_ind_gene()] and plots with
#'  [.RNAseq_boxplot()].
#' * performs differential expression analysis for each gene in
#'  primary, metastatic tumors vs adjacent normal tissues from all TCGA cancer
#'  types combined with the function [.RNAseq_tumortype()] and plots with
#'  [.violinplot()].
#'
#' This function is not accessible to the user and will not show at the users'
#'  workspace. It can only be called by the exported [EIF4F_DEG_analysis()]
#'  function.
#'
#' Side effects:
#'
#'  (1) box plots on screen and as pdf file to compare relative gene
#'  expression in across all TCGA cancer types
#'
#'  (2) box plots on screen and as pdf file to show differential
#'  gene expression in tumors vs NATs in TCGA cancer types
#'
#'  (3) violin plots on screen and as pdf file to show
#'   differential gene expression in primary, metastatic tumors vs
#'   adjacent normal tissues from all combined TCGA cancer types
#'
#' @keywords internal
#'
#' @examples \dontrun{
#' .plot_boxgraph_RNAseq_TCGA(c(
#'   "EIF4G1", "EIF4G2", "EIF4G3",
#'   "PABPC1", "EIF4A1", "EIF4A2", "EIF4B", "EIF4H", "EIF4E",
#'   "EIF4E2", "EIF4E3", "EIF4EBP1", "EIF3D"
#' ))
#' }
#'
.plot_boxgraph_RNAseq_TCGA <- function(gene_list) {
  .TCGA_GTEX_RNAseq_sampletype_subset <- TCGA_GTEX_RNAseq_sampletype %>%
    dplyr::select(
      dplyr::all_of(gene_list),
      "sample.type",
      "primary.disease",
      "primary.site",
      "study"
    ) %>%
    tibble::as_tibble() %>%
    reshape2::melt(
      id = c(
        "sample.type",
        "primary.disease",
        "primary.site",
        "study"
      ),
      value.name = "RNAseq"
    ) %>%
    dplyr::filter(!is.na(.data$primary.site)) %>%
    # na.omit(.$primary.site) %>%
    # filter(RNAseq != 0) %>%
    dplyr::mutate_if(is.character, as.factor)

  # boxplot to compare relative abundance of genes across tumors
  .RNAseq_all_gene(.TCGA_GTEX_RNAseq_sampletype_subset) %>%
    .RNAseq_grouped_boxplot()

  # boxplot to compare RNA-seq of one gene in tumor vs adjacent normal
  RNAseq.ind.gene.df <- lapply(gene_list, .RNAseq_ind_gene,
    df = .TCGA_GTEX_RNAseq_sampletype_subset
  )
  lapply(RNAseq.ind.gene.df, .RNAseq_boxplot)

  # violin plot to compare  expression in primary, metastatic tumors vs NATs
  .RNAseq_tumortype(.TCGA_GTEX_RNAseq_sampletype_subset) %>%
    .violinplot(
      y.axis.title = "normalized RNA counts",
      y.axis.break = c(128, 2048, 32768),
      dashline = NULL
    )

  return(NULL)
}

#' @title Compare the RNA ratios between EIF4F genes
#'
#' @description
#'
#' This function generates box plots to compare the RNA ratios between EIF4F
#'  genes in tumors tumors vs adjacent normal tissues across TCGA cancer types,
#'  and a violin plot to compare RNA ratios in primary, metastatic tumors vs
#'  adjacent normal tissues from all TCGA cancer types combined.
#'
#' @family composite functions to call DEG gene analysis and plotting
#'
#' @details This function
#'
#' * calculates RNA ratio across individual tumor types from all cancers with
#'  the function [.RNAratio_calculation()],
#' * selects the ratio data with [.RNAratio_selection()] and make boxplot with
#'  [.RNAratio_boxplot()].
#' * compares RNA ratio in primary, metastatic tumors vs adjacent
#'  normal tissues from all TCGA cancer types combined with the function
#'  [.RNAratio_tumortype()] and plots with [.violinplot()].
#'
#' This function is not accessible to the user and will not show at the users'
#'  workspace. It can only be called by the exported [EIF4F_DEG_analysis()]
#'  function.
#'
#' Side effects:
#'
#'  (1) box plots on screen and as pdf file to show differential
#'  RNA ratios in tumors vs NATs in TCGA cancer type
#'
#'  (2) violin plots on screen and as pdf file to show
#'   differential gene ratios in primary, metastatic tumors vs
#'   adjacent normal tissues from all combined TCGA cancer types
#'
#' @param gene01
#' gene name as a string
#'
#' @param gene02
#' gene name as a string
#'
#' @param gene03
#' gene name as a string
#'
#' @param gene04
#' gene name as a string
#'
#' @param gene05
#' gene name as a string
#'
#' @param gene06
#' gene name as a string
#'
#' @param gene07
#' gene name as a string
#'
#' @param gene08
#' gene name as a string
#'
#' @param gene09
#' gene name as a string
#'
#' @param gene10
#' gene name as a string
#'
#'
#' @examples \dontrun{
#' .plot_boxgraph_RNAratio_TCGA(
#'   EIF4E = "EIF4E", EIF4E2 = "EIF4E2",
#'   EIF4E3 = "EIF4E3", EIF4EBP1 = "EIF4EBP1", EIF4G1 = "EIF4G1", EIF4G2 = "EIF4G2",
#'   EIF4G3 = "EIF4G3", EIF3D = "EIF3D", EIF4A1 = "EIF4A1", EIF4A2 = "EIF4A2"
#' )
#' }
#'
#' @keywords internal
#'
.plot_boxgraph_RNAratio_TCGA <- function(gene01, gene02, gene03, gene04,
                                         gene05, gene06, gene07, gene08,
                                         gene09, gene10) {
  RNAratio.data <- .RNAratio_calculation(
    gene01, gene02, gene03, gene04,
    gene05, gene06, gene07, gene08,
    gene09, gene10
  )

  .RNAratio_boxplot(
    df = .RNAratio_selection(RNAratio.data, c(
      (paste0(gene05, ":", "\n", gene01)), (paste0(gene09, ":", "\n", gene01)),
      (paste0(gene10, ":", "\n", gene01)), (paste0(gene07, ":", "\n", gene01)),
      (paste0(gene07, ":", "\n", gene02)), (paste0(gene05, ":", "\n", gene07))
    )),
    dashline = 1,
    ylimit = c(0, 25),
    filename = "EIF4F RNA ratio plot1.pdf"
  )

  .RNAratio_boxplot(
    df = .RNAratio_selection(RNAratio.data, c(
      (paste0(gene06, ":", "\n", gene05)), (paste0(gene02, ":", "\n", gene01)),
      (paste0(gene09, ":", "\n", gene10)), (paste0(gene01, ":", "\n", gene04)),
      (paste0(gene05, ":", "\n", gene01, "+", gene04)),
      (paste0(gene09, ":", "\n", gene01, "+", gene04))
    )),
    dashline = 4,
    ylimit = c(0, 25),
    filename = "EIF4F RNA ratio plot2.pdf"
  )

  .RNAratio_boxplot(
    df = .RNAratio_selection(RNAratio.data, c(
      (paste0(gene07, ":", "\n", gene01)), (paste0(gene07, ":", "\n", gene02)),
      (paste0(gene06, ":", "\n", gene05)), (paste0(gene02, ":", "\n", gene01)),
      (paste0(gene09, ":", "\n", gene10)), (paste0(gene01, ":", "\n", gene04))
    )),
    dashline = 1,
    ylimit = c(0, 5),
    filename = "EIF4F RNA ratio plot3.pdf"
  )

  .RNAratio_tumortype(RNAratio.data, c(
    (paste0(gene05, ":", "\n", gene01)), (paste0(gene09, ":", "\n", gene01)),
    (paste0(gene10, ":", "\n", gene01)), (paste0(gene07, ":", "\n", gene01)),
    (paste0(gene07, ":", "\n", gene02)), (paste0(gene05, ":", "\n", gene07)),
    (paste0(gene09, ":", "\n", gene10)), (paste0(gene01, ":", "\n", gene04)),
    (paste0(gene05, ":", "\n", gene01, "+", gene04)),
    (paste0(gene09, ":", "\n", gene01, "+", gene04))
  )) %>%
    .violinplot(
      y.axis.title = "Ratio of RNA counts",
      # y.axis.break = c(1, 128, 2048, 32768)
      y.axis.break = c(0.125, 1, 4, 8, 64, 512),
      dashline = c(1, 4)
    )

  return(NULL)
}


## Wrapper function to call all composite functions with inputs ================

#' @title Perform differential gene expression or ratio analysis and
#'  generate plots
#'
#' @description
#'
#' A wrapper function to call all composite functions for differential gene
#'  expression or ratio analysis with inputs
#'
#' @details
#'
#' This function run two composite functions together, with inputs:
#'
#'  * [.plot_boxgraph_RNAseq_TCGA()]
#'  * [.plot_boxgraph_RNAratio_TCGA()]
#'
#' Side effects:
#'
#' (1) box plots on screen and as pdf file to compare relative gene
#'   expression in across all TCGA cancer types
#'
#' (2) box plots on screen and as pdf file to show differential
#'  gene expression in tumors vs NATs in TCGA cancer types
#'
#' (3) violin plots on screen and as pdf file to show
#'   differential gene expression in primary, metastatic tumors vs
#'   adjacent normal tissues from all combined TCGA cancer types
#'
#' (4) box plots on screen and as pdf file to show differential
#'  RNA ratios in tumors vs NATs in TCGA cancer type
#'
#' (5) violin plots on screen and as pdf file to show
#'   differential gene ratios in primary, metastatic tumors vs
#'   adjacent normal tissues from all combined TCGA cancer types
#'
#' @family wrapper function to call all composite functions with inputs
#'
#' @export
#'
#' @examples \dontrun{
#' EIF4F_DEG_analysis()
#' }
#'
EIF4F_DEG_analysis <- function() {
  .plot_boxgraph_RNAseq_TCGA(c(
    "EIF4G1", "EIF4G2", "EIF4G3", "PABPC1", "EIF4A1", "EIF4A2",
    "EIF4B", "EIF4H", "EIF4E", "EIF4E2", "EIF4E3", "EIF4EBP1", "EIF3D"
  ))

  .plot_boxgraph_RNAratio_TCGA(
    gene01 = "EIF4E", gene02 = "EIF4E2", gene03 = "EIF4E3", gene04 = "EIF4EBP1",
    gene05 = "EIF4G1", gene06 = "EIF4G2", gene07 = "EIF4G3", gene08 = "EIF3D",
    gene09 = "EIF4A1", gene10 = "EIF4A2"
  )

  return(NULL)
}
a3609640/eIF4F.analysis documentation built on Jan. 2, 2023, 11:19 p.m.