# Differential gene expression (ratio) analysis of EIF4F genes in TCGA
# This R script contains four sections.
#
# (1) RNAseq data preparation
#
# (2) analyze differential mRNA gene (ratio) expression and plotting
#
# (3) composite functions to execute a pipeline of functions to select related
# RNAseq data with supply of EIF4F gene names for analysis and plotting.
#
# (4) wrapper function to call all master functions with inputs
#
## Wrapper function for data initialization of RNA-seq related datasets ========
#' @noRd
## due to NSE notes in R CMD check
TCGA_GTEX_RNAseq_sampletype <- NULL
#' @title Read all RNA-seq related datasets from TCGA and GTEx
#'
#' @description
#'
#' A wrapper function reads RNA-seq related datasets from TCGA and GTEx.
#'
#' @details
#'
#' Its side effects is the global variable `TCGA_GTEX_RNAseq_sampletype`, which
#' was merged from two internal data frames:
#'
#' (1) `.TCGA_GTEX_RNAseq`: the recomputed RNAseq data from both TCGA and GTEx
#' generated by [.get_TCGA_GTEX_RNAseq()], which imports the dataset
#' `TcgaTargetGtex_RSEM_Hugo_norm_count`.
#'
#' (2) `.TCGA_GTEX_sampletype` annotates the feature for each sample from
#' TCGA and GTEx. The data frame imports the `TcgaTargetGTEX_phenotype.txt`
#' dataset and performed basic data cleaning steps including removal of
#' duplicates and NAs.
#'
#' To reduce the data size, we only select the following four relevant columns
#' out of `TcgaTargetGTEX_phenotype.txt` to construct `.TCGA_GTEX_sampletype`.
#'
#' * `sample.type` column that annotates malignant of normal tissues
#' * `primary.disease` column that annotates cancer types for each sample
#' * `primary.site` column that annotates the tissue types
#' * `study` column that annotates the cohort “TCGA” or “GTEx”
#'
#' `TCGA_GTEX_RNAseq_sampletype` was stored as `TCGA_GTEX_RNAseq_sampletype.csv` in
#' `~/Documents/EIF_output/ProcessedData` folder.
#'
#' @family wrapper function for data initialization
#'
#' @importFrom dplyr distinct filter select select_if mutate mutate_at
#' summarise rename group_by all_of
#'
#' @importFrom readr read_tsv
#'
#' @importFrom tibble remove_rownames column_to_rownames
#'
#' @importFrom data.table fread transpose
#'
#' @export
#'
#' @examples \dontrun{
#' initialize_RNAseq_data()
#' }
#'
initialize_RNAseq_data <- function() {
rlang::env_binding_unlock(parent.env(environment()), nms = NULL)
if (!file.exists(file.path(
output_directory,
"ProcessedData",
"TCGA_GTEX_RNAseq_sampletype.csv"
))) {
.TCGA_GTEX_RNAseq <- .get_TCGA_GTEX_RNAseq()
.TCGA_GTEX_sampletype <- readr::read_tsv(
file.path(
data_file_directory,
"TcgaTargetGTEX_phenotype.txt"
),
show_col_types = FALSE
) %>%
tibble::as_tibble() %>%
dplyr::distinct(.data$sample, .keep_all = TRUE) %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "sample") %>%
dplyr::select(
"_sample_type",
"primary disease or tissue",
"_primary_site",
"_study"
) %>%
dplyr::rename(
"sample.type" = "_sample_type",
"primary.disease" = "primary disease or tissue",
"primary.site" = "_primary_site",
"study" = "_study"
)
assign("TCGA_GTEX_RNAseq_sampletype",
merge(.TCGA_GTEX_RNAseq,
.TCGA_GTEX_sampletype,
by = "row.names",
all.x = TRUE
) %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "Row.names"),
envir = parent.env(environment()))
data.table::fwrite(TCGA_GTEX_RNAseq_sampletype, file.path(
output_directory,
"ProcessedData",
"TCGA_GTEX_RNAseq_sampletype.csv"
),row.names = TRUE)
} else {
assign("TCGA_GTEX_RNAseq_sampletype",
data.table::fread(file.path(
output_directory,
"ProcessedData",
"TCGA_GTEX_RNAseq_sampletype.csv"),data.table = FALSE
) %>%
tibble::as_tibble() %>%
#tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "V1"),
envir = parent.env(environment()))
}
rlang::env_binding_lock(parent.env(environment()), nms = NULL)
return(NULL)
}
#' @title Read recomputed RNAseq data from both TCGA and GTEx
#'
#' @description
#'
#' A helper function reads the dataset `TcgaTargetGtex_RSEM_Hugo_norm_count`,
#' which contains the recomputed RNAseq data from both TCGA and GTEx.
#'
#' @details
#'
#' The function also removes possible duplicated tumor samples and samples with
#' NAs in the dataset. It should not be used directly, only inside
#' [initialize_RNAseq_data()] function.
#'
#' @family helper function for data initialization
#'
#' @return
#'
#' a data frame that contains the recomputed RNAseq data from both TCGA and GTEx
#'
#' @examples \dontrun{
#' .get_TCGA_GTEX_RNAseq()
#' }
#'
#' @keywords internal
#'
.get_TCGA_GTEX_RNAseq <- function() {
.TCGA_pancancer <- data.table::fread(
file.path(
data_file_directory,
"TcgaTargetGtex_RSEM_Hugo_norm_count"
),
data.table = FALSE
) %>%
tibble::as_tibble() %>%
dplyr::distinct(.data$sample, .keep_all = TRUE) %>%
stats::na.omit() %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "sample")
# transpose function from the data.table keeps numeric values as numeric.
.TCGA_pancancer_transpose <- data.table::transpose(.TCGA_pancancer)
# get row and colnames in order
rownames(.TCGA_pancancer_transpose) <- colnames(.TCGA_pancancer)
colnames(.TCGA_pancancer_transpose) <- rownames(.TCGA_pancancer)
return(.TCGA_pancancer_transpose)
}
## Differential expression analysis and plotting ===============================
#' @title Compares expressions among genes in tumor samples
#'
#' @description
#'
#' This function selects the RNAseq data from TCGA samples, excludes
#' samples with label of `Solid Tissue Normal` and ranks the genes by their
#' mRNA expression.
#' It should not be used directly, only inside [.plot_boxgraph_RNAseq_TCGA()]
#' function.
#'
#' @family helper function for differential expression analysis
#'
#' @param df
#'
#' `.TCGA_GTEX_RNAseq_sampletype_subset` generated
#' inside [.plot_boxgraph_RNAseq_TCGA()]
#'
#' @return
#'
#' a data frame ranking genes by their mRNA expressions in lung adenocarcinoma
#'
#' @importFrom dplyr pull
#'
#' @importFrom forcats fct_reorder
#'
#' @examples \dontrun{
#' .RNAseq_all_gene(.TCGA_GTEX_RNAseq_sampletype_subset)
#' }
#'
#' @keywords internal
#'
.RNAseq_all_gene <- function(df) {
df <- df %>%
dplyr::filter(.data$study == "TCGA" & .data$sample.type != "Solid Tissue Normal")
# order the EIF genes according to the order of the expression means
order <- df %>%
# dplyr::filter out category equal to 'Lung Adenocarcinoma'
dplyr::filter(.data$primary.disease == "Lung Adenocarcinoma") %>%
# use the same groups as in the ggplot
dplyr::group_by(.data$variable) %>%
# calculate the means
dplyr::summarise(mean_RNAseq = mean(.data$RNAseq)) %>%
dplyr::mutate(variable = forcats::fct_reorder(.data$variable,
.data$mean_RNAseq)) %>%
dplyr::pull(.data$variable) %>%
levels()
return(df %>%
dplyr::mutate(variable = factor(.data$variable, levels = rev(order))))
}
#' Grouped box plots of RNA expression across tumors
#'
#' @description
#'
#' This function takes the output of [.RNAseq_all_gene()] function and generate
#' box plots of differential gene expression across all TCGA cancer types
#' This function should not be used directly,
#' only inside [.plot_boxgraph_RNAseq_TCGA()]function.
#'
#' Side effect: box plots on screen and as pdf file to compare relative
#' expression of EIF genes across all TCGA cancer type
#'
#' @family helper function for differential expression plotting
#'
#' @param df
#'
#' output dataset from `.RNAseq_all_gene(.TCGA_GTEX_RNAseq_sampletype_subset)`,
#' which was generated inside [.plot_boxgraph_RNAseq_TCGA()]function.
#'
#' @keywords internal
#'
.RNAseq_grouped_boxplot <- function(df) {
p1 <- ggplot(
data = df,
aes_(
x = ~primary.disease,
y = ~ 2**RNAseq
)
) +
scale_y_continuous(
trans = log2_trans(),
# limits = c(2**4, 2**17),
labels = label_comma()
) +
stat_n_text(
size = 5,
fontface = "bold",
angle = 90,
hjust = 0
) +
geom_boxplot(aes_(
# colour = factor(variable),
colour = ~variable,
),
outlier.shape = NA,
position = position_dodge(width = 1)
) +
labs(
x = "primary disease",
y = paste("Normalized expression (RNA-Seq counts)")
) +
theme_bw() +
theme(
plot.title = black_bold_12,
axis.title.x = element_blank(),
axis.title.y = black_bold_12,
axis.text.x = black_bold_12_45,
axis.text.y = black_bold_12,
panel.grid = element_blank(),
legend.title = element_blank(),
legend.text = black_bold_12,
legend.position = c(0, 0.95),
legend.direction = "horizontal",
legend.justification = c(0, 1),
legend.box = "horizontal",
strip.text = black_bold_12
)
print(p1)
ggplot2::ggsave(
path = file.path(output_directory, "DEG"),
filename = "EIF RNAseq Grouped Boxplot.pdf",
plot = p1,
width = 16.5,
height = 8,
useDingbats = FALSE
)
return(NULL)
}
#' @title Analyzes differential gene expression in tumors vs adjacent normal tissues
#'
#' @description
#'
#' A helper function selects the RNAseq data of each EIF genes from TCGA tumors
#' and solid tissue normal samples in `.TCGA_GTEX_RNAseq_sampletype_subset`.
#' It should not be used directly, only inside [.plot_boxgraph_RNAseq_TCGA()]
#' function.
#'
#' @family helper function for differential expression analysis
#'
#' @param df
#'
#' `.TCGA_GTEX_RNAseq_sampletype_subset` generated
#' inside [.plot_boxgraph_RNAseq_TCGA()]
#'
#' @param gene
#'
#' one gene from the input argument of [.plot_boxgraph_RNAseq_TCGA()]
#'
#' @return
#'
#' a data frame of differential gene expression in tumors vs adjacent
#' normal tissues from individual TCGA cancer types
#'
#' @importFrom dplyr case_when
#'
#' @importFrom forcats fct_rev
#'
#' @keywords internal
#'
.RNAseq_ind_gene <- function(df, gene) {
df <- df %>%
dplyr::filter(.data$study == "TCGA") %>%
droplevels() %>%
dplyr::mutate(sample.type = dplyr::case_when(
sample.type != "Solid Tissue Normal" ~ "Tumor",
sample.type == "Solid Tissue Normal" ~ "Normal"
)) %>%
dplyr::filter(.data$variable == gene) %>%
dplyr::mutate(primary.disease = forcats::fct_rev(.data$primary.disease))
return(list(df, gene))
}
#' Box plots of differential gene expression across tumors
#'
#' @description
#'
#' This function takes the output of [.RNAseq_ind_gene()] and generate box plots
#' for differential gene expression in tumors vs NATs in TCGA cancer types
#' This function should not be used directly,
#' only inside [.plot_boxgraph_RNAseq_TCGA()]function.
#'
#' Side effects:
#'
#' (1) box plots on screen and as pdf file to show differential
#' gene expression in tumors vs NATs in TCGA cancer types
#'
#' @family helper function for differential expression plotting
#'
#' @param df
#'
#' output dataset generated from [.RNAseq_ind_gene()] function
#'
#' @importFrom scales log2_trans label_comma
#'
#' @keywords internal
#'
.RNAseq_boxplot <- function(df) {
p1 <- ggplot(
data = df[[1]],
aes_(
x = ~primary.disease,
y = ~ 2**RNAseq,
color = ~sample.type
)
) +
scale_y_continuous(
trans = scales::log2_trans(),
# limits = c(2**11, 2**17),# for 4g
# limits = c(2**7, 2**14),# for eif4E
labels = scales::label_comma()
) +
stat_n_text(
size = 5,
fontface = "bold",
hjust = 0
) +
geom_boxplot(
# alpha = .1,
# fill = sample.type,
outlier.shape = NA,
# size = .75,
# width = 1,
position = "dodge"
) +
scale_color_manual(
values = c(
"Tumor" = "#CC79A7",
"Normal" = "#0072B2"
),
breaks = c("Tumor", "Normal"),
labels = c("Tumor\n", "Normal\n")
) +
labs(
x = "primary disease",
y = paste(df[[2]], "expression (RNA-Seq counts)")
) +
coord_flip() +
theme_bw() +
theme(
plot.title = black_bold_12,
axis.title.x = black_bold_12,
axis.title.y = element_blank(),
axis.text.x = black_bold_12,
axis.text.y = black_bold_12,
panel.grid = element_blank(),
legend.title = element_blank(),
legend.text = black_bold_12,
legend.position = "top",
legend.justification = "left",
legend.box = "horizontal",
strip.text = black_bold_12
)
# geom_signif(comparisons=list(c("Tumor", "Normal")))
print(p1)
ggplot2::ggsave(
path = file.path(output_directory, "DEG"),
filename = paste(df[[2]], "pancancer RNAseq.pdf"),
plot = p1,
width = 7.5,
height = 9,
useDingbats = FALSE
)
return(NULL)
}
#' @title Select gene expression data in primary, metastatic tumors vs adjacent normal
#' tissues from all TCGA cancer types combined
#'
#' @description
#'
#' This function selects the RNAseq data from TCGA samples including
#' tumor samples that are labeled as `metastatic`, `primary tumor`,
#' and `solid tissue normal` for comparison.
#' It should not be used directly, only inside [.plot_boxgraph_RNAseq_TCGA()]
#' function.
#'
#' @family helper function for differential expression analysis
#'
#' @param df
#'
#' `.TCGA_GTEX_RNAseq_sampletype_subset` generated inside
#' [.plot_boxgraph_RNAseq_TCGA()]
#'
#' @return
#'
#' a data frame of differential gene expression in tumors vs adjacent
#' normal tissues in individual TCGA cancer types
#'
#' @keywords internal
#'
.RNAseq_tumortype <- function(df) {
return(df %>%
dplyr::filter(.data$study == "TCGA") %>%
dplyr::mutate_if(is.character, as.factor) %>%
dplyr::filter(.data$sample.type %in% c(
"Metastatic",
"Primary Tumor",
"Solid Tissue Normal"
)))
}
#' @title Violin plots of differential gene expression/ratio in primary,
#' metastatic tumors vs adjacent normal tissues
#'
#' @description
#'
#' This function should not be used directly, only inside
#' [.plot_boxgraph_RNAseq_TCGA()] or [.plot_boxgraph_RNAratio_TCGA()] function.
#'
#' Side effect:
#' (1) the violin plots on screen and as pdf file to show
#' differential gene expression/ratio in primary, metastatic tumors vs
#' adjacent normal tissues from all combined TCGA cancer types
#'
#' @family helper function for differential expression plotting
#'
#' @param df
#'
#' output dataset generated from [.RNAseq_tumortype()] or
#' [.RNAratio_tumortype()] function
#'
#' @param y.axis.title the title name for y axis
#'
#' @param y.axis.break the break mark on the y axis
#'
#' @param dashline the position of horizontal reference line
#'
#' @importFrom EnvStats stat_n_text
#'
#' @keywords internal
#'
.violinplot <- function(df, y.axis.title, y.axis.break, dashline) {
p1 <- ggplot(
data = df,
# data = EIF.TCGA.RNAseq.anno.subset.long,
aes_(
x = ~sample.type,
y = ~ 2**RNAseq,
color = ~sample.type,
fill = ~sample.type
)
) +
EnvStats::stat_n_text(
size = 6,
fontface = "bold",
angle = 90,
hjust = 0
) +
ggplot2::facet_grid(. ~ variable,
scales = "free",
space = "free"
) +
# facet_wrap(~ variable, ncol = 6) +
geom_violin(trim = TRUE) +
geom_boxplot(
alpha = .01,
width = .25,
color = "black",
position = position_dodge(width = .9)
) +
labs(
x = "sample type",
y = y.axis.title
# y = "normalized RNA counts"
) +
scale_x_discrete(labels = c(
"Metastatic Tumor",
"Primary Tumor",
# "Normal Tissue",
"Adjacent Normal"
)) +
scale_y_continuous(
trans = scales::log2_trans(),
labels = scales::label_comma(),
breaks = y.axis.break,
# breaks = c(1, 128, 2048, 32768),
# labels = c("1","8","64","512"),
position = "left"
) +
geom_hline(yintercept = dashline, linetype = "dashed") +
# for color-blind palettes
scale_color_manual(values = c("#56B4E9", "#009E73", "#D55E00")) +
# for color-blind palettes
scale_fill_manual(values = c("#56B4E9", "#009E73", "#D55E00")) +
theme_bw() +
theme(
plot.title = black_bold_16,
axis.title.x = element_blank(),
axis.title.y = black_bold_16_right,
axis.text.x = black_bold_16_90,
axis.text.y = black_bold_16_90,
axis.line.x = element_line(color = "black"),
axis.line.y = element_line(color = "black"),
panel.grid = element_blank(),
legend.position = "none",
strip.text = black_bold_16
) +
ggpubr::stat_compare_means(
comparisons = list(
c("Metastatic", "Solid Tissue Normal"),
c("Primary Tumor", "Solid Tissue Normal"),
c("Metastatic", "Primary Tumor")
),
method = "t.test",
label = "p.signif",
size = 6
)
print(p1)
ggplot2::ggsave(
path = file.path(output_directory, "DEG"),
filename = paste(y.axis.title, "violin.pdf"),
plot = p1,
width = 18,
height = 9,
useDingbats = FALSE
)
return(NULL)
}
#' @title Calculate RNA ratios
#'
#' @description A helper function calculates the RNA ratios between genes
#'
#' @details This function
#'
#' * select the of RNAseq data of input genes from all TCGA samples, using the
#' data frame `TCGA_GTEX_RNAseq_sampletype` prepared by the function
#' [initialize_RNAseq_data()].
#' * calculates the RNA ratio data in each TCGA samples including
#' tumors and solid tissue normal samples for comparison.
#'
#' It should not be used directly, only inside [.plot_boxgraph_RNAratio_TCGA()]
#' function.
#'
#' @family helper function for differential expression analysis
#'
#' @param gene01
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene02
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene03
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene04
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene05
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene06
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene07
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene08
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene09
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene10
#' gene name, same input from [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @importFrom rlang :=
#'
#' @return
#'
#' a data frame of differential RNA ratios in tumors vs adjacent normal tissues
#' from individual TCGA cancer types
#'
#' @keywords internal
#'
.RNAratio_calculation <- function(gene01 = "EIF4E",
gene02 = "EIF4E2",
gene03 = "EIF4E3",
gene04 = "EIF4EBP1",
gene05 = "EIF4G1",
gene06 = "EIF4G2",
gene07 = "EIF4G3",
gene08 = "EIF3D",
gene09 = "EIF4A1",
gene10 = "EIF4A2") {
.genes_names <- c(
gene01, gene02, gene03, gene04,
gene05, gene06, gene07, gene08,
gene09, gene10
)
return(TCGA_GTEX_RNAseq_sampletype %>%
dplyr::select(
dplyr::all_of(.genes_names),
"sample.type",
"primary.disease",
"primary.site",
"study"
) %>%
tibble::as_tibble() %>%
dplyr::filter(gene01 != 0 & !is.na(.data$primary.site)) %>%
# calculate the ratio of mRNA counts
dplyr::mutate(
(!!paste0(gene01, "+", gene04)) :=
log2(2**(!!as.name(gene01)) + 2**(!!as.name(gene04)) - 1),
# ":", "\n" split the final ratio name in two lines
# for the plotting purpose.
(!!paste0(gene09, ":", "\n", gene01)) :=
(!!as.name(gene09)) - (!!as.name(gene01)),
(!!paste0(gene09, ":", "\n", gene02)) :=
(!!as.name(gene09)) - (!!as.name(gene02)),
(!!paste0(gene10, ":", "\n", gene01)) :=
(!!as.name(gene10)) - (!!as.name(gene01)),
(!!paste0(gene10, ":", "\n", gene02)) :=
(!!as.name(gene10)) - (!!as.name(gene02)),
(!!paste0(gene05, ":", "\n", gene01)) :=
(!!as.name(gene05)) - (!!as.name(gene01)),
(!!paste0(gene05, ":", "\n", gene02)) :=
(!!as.name(gene05)) - (!!as.name(gene02)),
(!!paste0(gene07, ":", "\n", gene01)) :=
(!!as.name(gene07)) - (!!as.name(gene01)),
(!!paste0(gene07, ":", "\n", gene02)) :=
(!!as.name(gene07)) - (!!as.name(gene02)),
(!!paste0(gene09, ":", "\n", gene05)) :=
(!!as.name(gene09)) - (!!as.name(gene05)),
(!!paste0(gene10, ":", "\n", gene05)) :=
(!!as.name(gene10)) - (!!as.name(gene05)),
(!!paste0(gene09, ":", "\n", gene06)) :=
(!!as.name(gene09)) - (!!as.name(gene06)),
(!!paste0(gene10, ":", "\n", gene06)) :=
(!!as.name(gene10)) - (!!as.name(gene06)),
(!!paste0(gene01, ":", "\n", gene04)) :=
(!!as.name(gene01)) - (!!as.name(gene04)),
(!!paste0(gene02, ":", "\n", gene01)) :=
(!!as.name(gene02)) - (!!as.name(gene01)),
(!!paste0(gene06, ":", "\n", gene05)) :=
(!!as.name(gene06)) - (!!as.name(gene05)),
(!!paste0(gene05, ":", "\n", gene07)) :=
(!!as.name(gene05)) - (!!as.name(gene07)),
(!!paste0(gene09, ":", "\n", gene10)) :=
(!!as.name(gene09)) - (!!as.name(gene10)),
(!!paste0(gene05, ":", "\n", gene01, "+", gene04)) :=
(!!as.name(gene05)) -
log2(2**(!!as.name(gene01)) + 2**(!!as.name(gene04)) - 1),
(!!paste0(gene09, ":", "\n", gene01, "+", gene04)) :=
(!!as.name(gene09)) -
log2(2**(!!as.name(gene01)) + 2**(!!as.name(gene04)) - 1)
) %>%
dplyr::select(
(!!paste0(gene09, ":", "\n", gene01)),
(!!paste0(gene09, ":", "\n", gene02)),
(!!paste0(gene10, ":", "\n", gene01)),
(!!paste0(gene10, ":", "\n", gene02)),
(!!paste0(gene05, ":", "\n", gene01)),
(!!paste0(gene05, ":", "\n", gene02)),
(!!paste0(gene07, ":", "\n", gene01)),
(!!paste0(gene07, ":", "\n", gene02)),
(!!paste0(gene09, ":", "\n", gene05)),
(!!paste0(gene10, ":", "\n", gene05)),
(!!paste0(gene09, ":", "\n", gene06)),
(!!paste0(gene10, ":", "\n", gene06)),
(!!paste0(gene01, ":", "\n", gene04)),
(!!paste0(gene02, ":", "\n", gene01)),
(!!paste0(gene06, ":", "\n", gene05)),
(!!paste0(gene05, ":", "\n", gene07)),
(!!paste0(gene09, ":", "\n", gene10)),
(!!paste0(gene05, ":", "\n", gene01, "+", gene04)),
(!!paste0(gene09, ":", "\n", gene01, "+", gene04)),
"sample.type",
"primary.disease",
"primary.site",
"study"
) %>%
dplyr::mutate_if(is.character, as.factor) %>%
stats::na.omit())
}
#' @title Select RNA ratio data for plotting
#'
#' @description
#'
#' This function should not be used directly,
#' only inside [.plot_boxgraph_RNAratio_TCGA()]function.
#'
#' @family helper function for differential expression analysis
#'
#' @param df
#'
#' output dataset generated from [.RNAratio_calculation()] function
#'
#' @param gene_ratio gene ratio name
#'
#' input argument for selected variables generated
#' from [.plot_boxgraph_RNAratio_TCGA()] function
#'
#' @keywords internal
#'
.RNAratio_selection <- function(df, gene_ratio) {
return(df %>%
dplyr::filter(.data$study == "TCGA") %>%
droplevels() %>%
dplyr::mutate(sample.type = dplyr::case_when(
sample.type != "Solid Tissue Normal" ~ "Tumor",
sample.type == "Solid Tissue Normal" ~ "NAT"
)) %>%
dplyr::select(
dplyr::all_of(gene_ratio),
"sample.type",
"primary.disease",
"primary.site",
"study"
) %>%
reshape2::melt(
id = c(
"sample.type",
"primary.disease",
"primary.site",
"study"
),
value.name = "RNAseq"
) %>%
dplyr::mutate_if(is.character, as.factor) %>%
dplyr::mutate(
primary.disease = forcats::fct_rev(.data$primary.disease)))
}
#' @title Box plots of differential RNA ratios across tumors
#'
#' @description
#'
#' This function should not be used directly,
#' only inside [.plot_boxgraph_RNAratio_TCGA()]function.
#'
#' Side effects:
#' (1)the box plots on screen and as pdf files to show differential
#' RNA ratios across tumors
#'
#' @family helper function for differential expression plotting
#'
#' @param df
#'
#' output dataset generated from [.RNAratio_selection()] function
#'
#' @param dashline the position of horizontal reference line
#'
#' @param ylimit the lower and upper limit of the axis scale
#'
#' @param filename the name for the output file
#'
#' @keywords internal
#'
.RNAratio_boxplot <- function(df, dashline, ylimit, filename) {
p1 <- ggplot(
data = df,
aes_(
x = ~primary.disease,
# x = f.ordered1,
y = ~ 2**RNAseq,
# fill = variable,
color = ~sample.type
)
) +
geom_boxplot(
# alpha = .1,
# fill = sample.type,
outlier.shape = NA,
# size = .75,
# width = 1,
position = "dodge"
) +
geom_hline(
# data = df,
aes(yintercept = dashline),
linetype = "dashed"
) +
scale_color_manual(
values = c("Tumor" = "#CC79A7", "NAT" = "#0072B2"),
breaks = c("Tumor", "NAT")
) +
# for color-blind palettes
facet_wrap(~variable, scales = "free_x") +
ggplot2::facet_grid(~variable, scales = "free_x", space = "free") +
guides(colour = guide_legend(nrow = 1)) +
labs(
x = "primary disease",
y = "Ratio of RNA counts"
) +
coord_flip(ylim = ylimit) +
theme_bw() +
theme(
plot.title = black_bold_12,
axis.title.x = black_bold_12,
axis.title.y = element_blank(),
axis.text.x = black_bold_12,
axis.text.y = black_bold_12,
panel.grid = element_blank(),
legend.title = element_blank(),
legend.position = "top",
legend.justification = "left",
legend.box = "horizontal",
legend.text = black_bold_12,
# strip.background = element_blank(),
strip.text.x = black_bold_12
)
print(p1)
ggplot2::ggsave(
path = file.path(output_directory, "DEG"),
filename = filename,
plot = p1,
width = 18,
height = 8,
useDingbats = FALSE
)
return(NULL)
}
#' @title Analyzes differential RNA ratios in primary, metastatic tumors vs
#' adjacent normal tissues from all TCGA cancer types combined
#'
#' @description
#'
#' This function selects the RNA ratio data from TCGA samples including
#' tumor samples that are labeled as `metastatic`, `primary tumor`,
#' and `solid tissue normal` for comparison.
#'
#' It should not be used directly, only
#' inside [.plot_boxgraph_RNAratio_TCGA()] function.
#'
#' @family helper function for differential expression analysis
#'
#' @param df
#'
#' `.RNAratio_tumortype(.RNAratio_data)` generated
#' inside [.plot_boxgraph_RNAratio_TCGA()]
#'
#' @param gene_ratio
#'
#' @return
#'
#' a data frame of differential RNA ratios in tumors vs adjacent normal tissues
#' from individual TCGA cancer types
#'
#' @keywords internal
#'
.RNAratio_tumortype <- function(df, gene_ratio) {
.RNAratio_data <- df %>%
dplyr::filter(.data$sample.type %in% c(
"Metastatic",
"Primary Tumor",
"Solid Tissue Normal"
)) %>%
droplevels() %>%
dplyr::select(
dplyr::all_of(gene_ratio),
"sample.type",
"primary.disease",
"primary.site",
"study"
) %>%
reshape2::melt(
id = c(
"sample.type",
"primary.disease",
"primary.site",
"study"
),
value.name = "RNAseq"
) %>%
dplyr::mutate_if(is.character, as.factor) %>%
dplyr::mutate(primary.disease = forcats::fct_rev(.data$primary.disease))
return(.RNAratio_data)
}
## Composite functions to call DEG gene analysis and plotting ==================
#' @title Compare the expression of EIF4F genes
#'
#' @description
#'
#' This function generates a summary box plot to compare the expression of all
#' EIF4F genes across TCGA cancer types, box plots for differential gene
#' expression in tumors tumors vs adjacent normal tissues for each gene.
#' and a violin plot to compare differential gene expression in primary,
#' metastatic tumors vs adjacent normal tissues from all TCGA cancer types
#' combined.
#'
#' @family composite function to call DEG gene analysis and plotting
#'
#' @param gene_list
#'
#' gene names in a vector of characters
#'
#' @details This function
#'
#' * selects RNASeq of input genes and columns including `sample.type`,
#' `primary.disease`, and `primary.site`,from the data frame
#' `TCGA_GTEX_RNAseq_sampletype` that was prepared by the function
#' [initialize_RNAseq_data()] to produce a subset data frame called
#' `.TCGA_GTEX_RNAseq_sampletype_subset`
#' * with the subset data, calls the functions [.RNAseq_all_gene()] to compare
#' relative mRNA expression of all inquired genes and calls the functions
#' [.RNAseq_grouped_boxplot()] plot the results as a grouped box plot.
#' * performs differential expression analysis for each gene across all
#' tumors types with the function [.RNAseq_ind_gene()] and plots with
#' [.RNAseq_boxplot()].
#' * performs differential expression analysis for each gene in
#' primary, metastatic tumors vs adjacent normal tissues from all TCGA cancer
#' types combined with the function [.RNAseq_tumortype()] and plots with
#' [.violinplot()].
#'
#' This function is not accessible to the user and will not show at the users'
#' workspace. It can only be called by the exported [EIF4F_DEG_analysis()]
#' function.
#'
#' Side effects:
#'
#' (1) box plots on screen and as pdf file to compare relative gene
#' expression in across all TCGA cancer types
#'
#' (2) box plots on screen and as pdf file to show differential
#' gene expression in tumors vs NATs in TCGA cancer types
#'
#' (3) violin plots on screen and as pdf file to show
#' differential gene expression in primary, metastatic tumors vs
#' adjacent normal tissues from all combined TCGA cancer types
#'
#' @keywords internal
#'
#' @examples \dontrun{
#' .plot_boxgraph_RNAseq_TCGA(c(
#' "EIF4G1", "EIF4G2", "EIF4G3",
#' "PABPC1", "EIF4A1", "EIF4A2", "EIF4B", "EIF4H", "EIF4E",
#' "EIF4E2", "EIF4E3", "EIF4EBP1", "EIF3D"
#' ))
#' }
#'
.plot_boxgraph_RNAseq_TCGA <- function(gene_list) {
.TCGA_GTEX_RNAseq_sampletype_subset <- TCGA_GTEX_RNAseq_sampletype %>%
dplyr::select(
dplyr::all_of(gene_list),
"sample.type",
"primary.disease",
"primary.site",
"study"
) %>%
tibble::as_tibble() %>%
reshape2::melt(
id = c(
"sample.type",
"primary.disease",
"primary.site",
"study"
),
value.name = "RNAseq"
) %>%
dplyr::filter(!is.na(.data$primary.site)) %>%
# na.omit(.$primary.site) %>%
# filter(RNAseq != 0) %>%
dplyr::mutate_if(is.character, as.factor)
# boxplot to compare relative abundance of genes across tumors
.RNAseq_all_gene(.TCGA_GTEX_RNAseq_sampletype_subset) %>%
.RNAseq_grouped_boxplot()
# boxplot to compare RNA-seq of one gene in tumor vs adjacent normal
RNAseq.ind.gene.df <- lapply(gene_list, .RNAseq_ind_gene,
df = .TCGA_GTEX_RNAseq_sampletype_subset
)
lapply(RNAseq.ind.gene.df, .RNAseq_boxplot)
# violin plot to compare expression in primary, metastatic tumors vs NATs
.RNAseq_tumortype(.TCGA_GTEX_RNAseq_sampletype_subset) %>%
.violinplot(
y.axis.title = "normalized RNA counts",
y.axis.break = c(128, 2048, 32768),
dashline = NULL
)
return(NULL)
}
#' @title Compare the RNA ratios between EIF4F genes
#'
#' @description
#'
#' This function generates box plots to compare the RNA ratios between EIF4F
#' genes in tumors tumors vs adjacent normal tissues across TCGA cancer types,
#' and a violin plot to compare RNA ratios in primary, metastatic tumors vs
#' adjacent normal tissues from all TCGA cancer types combined.
#'
#' @family composite functions to call DEG gene analysis and plotting
#'
#' @details This function
#'
#' * calculates RNA ratio across individual tumor types from all cancers with
#' the function [.RNAratio_calculation()],
#' * selects the ratio data with [.RNAratio_selection()] and make boxplot with
#' [.RNAratio_boxplot()].
#' * compares RNA ratio in primary, metastatic tumors vs adjacent
#' normal tissues from all TCGA cancer types combined with the function
#' [.RNAratio_tumortype()] and plots with [.violinplot()].
#'
#' This function is not accessible to the user and will not show at the users'
#' workspace. It can only be called by the exported [EIF4F_DEG_analysis()]
#' function.
#'
#' Side effects:
#'
#' (1) box plots on screen and as pdf file to show differential
#' RNA ratios in tumors vs NATs in TCGA cancer type
#'
#' (2) violin plots on screen and as pdf file to show
#' differential gene ratios in primary, metastatic tumors vs
#' adjacent normal tissues from all combined TCGA cancer types
#'
#' @param gene01
#' gene name as a string
#'
#' @param gene02
#' gene name as a string
#'
#' @param gene03
#' gene name as a string
#'
#' @param gene04
#' gene name as a string
#'
#' @param gene05
#' gene name as a string
#'
#' @param gene06
#' gene name as a string
#'
#' @param gene07
#' gene name as a string
#'
#' @param gene08
#' gene name as a string
#'
#' @param gene09
#' gene name as a string
#'
#' @param gene10
#' gene name as a string
#'
#'
#' @examples \dontrun{
#' .plot_boxgraph_RNAratio_TCGA(
#' EIF4E = "EIF4E", EIF4E2 = "EIF4E2",
#' EIF4E3 = "EIF4E3", EIF4EBP1 = "EIF4EBP1", EIF4G1 = "EIF4G1", EIF4G2 = "EIF4G2",
#' EIF4G3 = "EIF4G3", EIF3D = "EIF3D", EIF4A1 = "EIF4A1", EIF4A2 = "EIF4A2"
#' )
#' }
#'
#' @keywords internal
#'
.plot_boxgraph_RNAratio_TCGA <- function(gene01, gene02, gene03, gene04,
gene05, gene06, gene07, gene08,
gene09, gene10) {
RNAratio.data <- .RNAratio_calculation(
gene01, gene02, gene03, gene04,
gene05, gene06, gene07, gene08,
gene09, gene10
)
.RNAratio_boxplot(
df = .RNAratio_selection(RNAratio.data, c(
(paste0(gene05, ":", "\n", gene01)), (paste0(gene09, ":", "\n", gene01)),
(paste0(gene10, ":", "\n", gene01)), (paste0(gene07, ":", "\n", gene01)),
(paste0(gene07, ":", "\n", gene02)), (paste0(gene05, ":", "\n", gene07))
)),
dashline = 1,
ylimit = c(0, 25),
filename = "EIF4F RNA ratio plot1.pdf"
)
.RNAratio_boxplot(
df = .RNAratio_selection(RNAratio.data, c(
(paste0(gene06, ":", "\n", gene05)), (paste0(gene02, ":", "\n", gene01)),
(paste0(gene09, ":", "\n", gene10)), (paste0(gene01, ":", "\n", gene04)),
(paste0(gene05, ":", "\n", gene01, "+", gene04)),
(paste0(gene09, ":", "\n", gene01, "+", gene04))
)),
dashline = 4,
ylimit = c(0, 25),
filename = "EIF4F RNA ratio plot2.pdf"
)
.RNAratio_boxplot(
df = .RNAratio_selection(RNAratio.data, c(
(paste0(gene07, ":", "\n", gene01)), (paste0(gene07, ":", "\n", gene02)),
(paste0(gene06, ":", "\n", gene05)), (paste0(gene02, ":", "\n", gene01)),
(paste0(gene09, ":", "\n", gene10)), (paste0(gene01, ":", "\n", gene04))
)),
dashline = 1,
ylimit = c(0, 5),
filename = "EIF4F RNA ratio plot3.pdf"
)
.RNAratio_tumortype(RNAratio.data, c(
(paste0(gene05, ":", "\n", gene01)), (paste0(gene09, ":", "\n", gene01)),
(paste0(gene10, ":", "\n", gene01)), (paste0(gene07, ":", "\n", gene01)),
(paste0(gene07, ":", "\n", gene02)), (paste0(gene05, ":", "\n", gene07)),
(paste0(gene09, ":", "\n", gene10)), (paste0(gene01, ":", "\n", gene04)),
(paste0(gene05, ":", "\n", gene01, "+", gene04)),
(paste0(gene09, ":", "\n", gene01, "+", gene04))
)) %>%
.violinplot(
y.axis.title = "Ratio of RNA counts",
# y.axis.break = c(1, 128, 2048, 32768)
y.axis.break = c(0.125, 1, 4, 8, 64, 512),
dashline = c(1, 4)
)
return(NULL)
}
## Wrapper function to call all composite functions with inputs ================
#' @title Perform differential gene expression or ratio analysis and
#' generate plots
#'
#' @description
#'
#' A wrapper function to call all composite functions for differential gene
#' expression or ratio analysis with inputs
#'
#' @details
#'
#' This function run two composite functions together, with inputs:
#'
#' * [.plot_boxgraph_RNAseq_TCGA()]
#' * [.plot_boxgraph_RNAratio_TCGA()]
#'
#' Side effects:
#'
#' (1) box plots on screen and as pdf file to compare relative gene
#' expression in across all TCGA cancer types
#'
#' (2) box plots on screen and as pdf file to show differential
#' gene expression in tumors vs NATs in TCGA cancer types
#'
#' (3) violin plots on screen and as pdf file to show
#' differential gene expression in primary, metastatic tumors vs
#' adjacent normal tissues from all combined TCGA cancer types
#'
#' (4) box plots on screen and as pdf file to show differential
#' RNA ratios in tumors vs NATs in TCGA cancer type
#'
#' (5) violin plots on screen and as pdf file to show
#' differential gene ratios in primary, metastatic tumors vs
#' adjacent normal tissues from all combined TCGA cancer types
#'
#' @family wrapper function to call all composite functions with inputs
#'
#' @export
#'
#' @examples \dontrun{
#' EIF4F_DEG_analysis()
#' }
#'
EIF4F_DEG_analysis <- function() {
.plot_boxgraph_RNAseq_TCGA(c(
"EIF4G1", "EIF4G2", "EIF4G3", "PABPC1", "EIF4A1", "EIF4A2",
"EIF4B", "EIF4H", "EIF4E", "EIF4E2", "EIF4E3", "EIF4EBP1", "EIF3D"
))
.plot_boxgraph_RNAratio_TCGA(
gene01 = "EIF4E", gene02 = "EIF4E2", gene03 = "EIF4E3", gene04 = "EIF4EBP1",
gene05 = "EIF4G1", gene06 = "EIF4G2", gene07 = "EIF4G3", gene08 = "EIF3D",
gene09 = "EIF4A1", gene10 = "EIF4A2"
)
return(NULL)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.