# CNV analyses of EIF genes in TCGA data
#
# This R script contains four sections:
#
# (1) CNV data preparation
#
# (2) CNV data analyses and plotting
#
# (3) composite functions to execute a pipeline of functions to select related
# CNV data with supply of EIF4F gene names for analysis and plotting.
#
# (4) wrapper function to call all composite functions with inputs
#
## Internal function for data initialization of CNV related datasets ============
#' @noRd
## due to NSE notes in R CMD check
TCGA_CNV_value <- TCGA_CNV_sampletype <- TCGA_CNVratio_sampletype <- NULL
#' @title Read all CNV related datasets from TCGA
#'
#' @description
#'
#' A wrapper function reads all CNV related datasets from TCGA and its side effects
#' are three global variables.
#'
#' @details Side effects:
#'
#' (1) `TCGA_CNV_value`: the unthreshold CNV value data generated as the output
#' [.get_TCGA_CNV_value()], which imports the download dataset
#' `Gistic2_CopyNumber_Gistic2_all_data_by_genes`.
#'
#' (2) `TCGA_CNV_sampletype`: the merged dataset from
#' * `.TCGA.CNV` that contains the threshold CNV data from the download dataset
#' `Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes` and was generated
#' from [.get_TCGA_CNV()],
#' * `.TCGA_sampletype` that contains the annotation data from
#' `TCGA_phenotype_denseDataOnlyDownload.tsv` with the selection of
#' `sample.type` and `primary.disease` columns. `Solid Tissue Normal` samples
#' are excluded.
#'
#' (3) `TCGA_CNVratio_sampletype`: the merged dataset from
#' * `.TCGA_CNV_ratio` that contains the CNV ratio data from the download
#' dataset `broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.gene.xena` and
#' was generated by [.get_TCGA_CNV_ratio()],
#' * `.TCGA_sampletype`.
#'
#' `TCGA_CNV_value`, `TCGA_CNV_sampletype` and `TCGA_CNVratio_sampletype` are
#' stored as `TCGA_CNV_value.csv`, `TCGA_CNV_sampletype.csv` and
#' `TCGA_CNVratio_sampletype.csv` in `~/Documents/EIF_output/ProcessedData` folder.
#'
#' @family wrapper function for data initialization
#'
#' @importFrom data.table fread transpose
#'
#' @importFrom dplyr distinct filter select select_if mutate mutate_at
#' summarise rename group_by
#'
#' @importFrom readr write_csv
#'
#' @importFrom stats na.omit
#'
#' @importFrom tibble remove_rownames column_to_rownames
#'
#' @export
#'
#' @examples \dontrun{
#' initialize_cnv_data()
#' }
#'
initialize_cnv_data <- function() {
rlang::env_binding_unlock(parent.env(environment()), nms = NULL)
.path <- file.path(output_directory, "ProcessedData","TCGA_CNV_value.csv")
if (!file.exists(.path)) {
data.table::fwrite(.get_TCGA_CNV_value(), .path, row.names = TRUE)
}
assign("TCGA_CNV_value",
data.table::fread(.path ,data.table = FALSE) %>%
tibble::as_tibble() %>%
#tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "V1"),
envir = parent.env(environment())
)
.TCGA_sampletype <- readr::read_tsv(file.path(
data_file_directory,
"TCGA_phenotype_denseDataOnlyDownload.tsv"
)) %>%
as.data.frame() %>%
dplyr::distinct(.data$sample, .keep_all = TRUE) %>%
stats::na.omit() %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "sample") %>%
dplyr::select("sample_type", "_primary_disease") %>%
dplyr::rename(
"sample.type" = "sample_type",
"primary.disease" = "_primary_disease"
)
if (!file.exists(file.path(
output_directory,
"ProcessedData",
"TCGA_CNV_sampletype.csv"
))) {
.TCGA_CNV <- .get_TCGA_CNV()
assign("TCGA_CNV_sampletype",
merge(.TCGA_CNV,
.TCGA_sampletype,
by = "row.names",
all.x = TRUE
) %>%
dplyr::filter(.data$sample.type != "Solid Tissue Normal") %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "Row.names"),
envir = parent.env(environment())
)
data.table::fwrite(TCGA_CNV_sampletype, file.path(
output_directory,
"ProcessedData",
"TCGA_CNV_sampletype.csv"
),row.names = TRUE)
} else {
assign("TCGA_CNV_sampletype",
data.table::fread(file.path(
output_directory,
"ProcessedData",
"TCGA_CNV_sampletype.csv"),data.table = FALSE
) %>%
tibble::as_tibble() %>%
#tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "V1"),
envir = parent.env(environment())
)
}
if (!file.exists(file.path(
output_directory,
"ProcessedData",
"TCGA_CNVratio_sampletype.csv"
))) {
.TCGA_CNV_ratio <- .get_TCGA_CNV_ratio()
assign("TCGA_CNVratio_sampletype",
merge(.TCGA_CNV_ratio,
.TCGA_sampletype,
by = "row.names",
all.x = TRUE
) %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "Row.names"),
envir = parent.env(environment())
)
data.table::fwrite(TCGA_CNVratio_sampletype, file.path(
output_directory,
"ProcessedData",
"TCGA_CNVratio_sampletype.csv"
),row.names = TRUE)
} else {
assign("TCGA_CNVratio_sampletype",
data.table::fread(file.path(
output_directory,
"ProcessedData",
"TCGA_CNVratio_sampletype.csv"),data.table = FALSE
) %>%
tibble::as_tibble() %>%
#tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "V1"),
envir = parent.env(environment())
)
}
rlang::env_binding_lock(parent.env(environment()), nms = NULL)
return(NULL)
}
#' @title Read CNV threshold dataset from TCGA
#'
#' @description
#'
#' This function reads the threshold CNV data
#' `Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes`, which grouped the
#' CNV values with thresholds 3+, 3, 2, 1, 0, to represent high-level copy
#' number gain, low-level copy number gain, diploid, shallow deletion, or
#' deep deletion.
#'
#' @details
#'
#' The function also removes possible duplicated tumor samples and
#' samples with NAs in the dataset.
#'
#' It should not be used directly, only inside [initialize_cnv_data()]
#' function.
#'
#' @family helper function for data initialization
#'
#' @importFrom tibble as_tibble
#'
#' @return a data frame that contains CNV threshold data for all TCGA tumor
#'
#' @examples \dontrun{
#' .get_TCGA_CNV()
#' }
#'
#' @keywords internal
#'
.get_TCGA_CNV <- function() {
.TCGA_pancancer <- data.table::fread(
file.path(
data_file_directory,
"Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes"
),
data.table = FALSE
) %>%
tibble::as_tibble() %>%
# as.data.frame(.) %>%
dplyr::distinct(.data$Sample, .keep_all = TRUE) %>%
na.omit() %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "Sample")
# transpose function from the data.table keeps numeric values as numeric.
.TCGA_pancancer_transpose <- data.table::transpose(.TCGA_pancancer)
# get row and column names in order
rownames(.TCGA_pancancer_transpose) <- colnames(.TCGA_pancancer)
colnames(.TCGA_pancancer_transpose) <- rownames(.TCGA_pancancer)
return(.TCGA_pancancer_transpose)
}
#' @title Read CNV value dataset from TCGA
#'
#' @description
#'
#' This function reads the unthreshold CNV value data from TCGA
#' `Gistic2_CopyNumber_Gistic2_all_data_by_genes`.
#'
#' @details
#'
#' The function also removes possible duplicated tumor samples and
#' samples with NAs in the dataset.
#'
#' It should not be used directly, only inside [initialize_cnv_data()]
#' function.
#'
#' @family helper function for data initialization
#'
#' @return
#'
#' a data frame that contains CNV value data for each tumor
#'
#' @examples \dontrun{
#' .get_TCGA_CNV_value()
#' }
#'
#' @keywords internal
#'
.get_TCGA_CNV_value <- function() {
.TCGA_pancancer <- data.table::fread(
file.path(
data_file_directory,
"Gistic2_CopyNumber_Gistic2_all_data_by_genes"
),
data.table = FALSE
) %>%
tibble::as_tibble() %>%
# as.data.frame(.) %>%
dplyr::distinct(.data$Sample, .keep_all = TRUE) %>%
stats::na.omit() %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "Sample")
# transpose function keeps numeric values as numeric.
.TCGA_pancancer_transpose <- data.table::transpose(.TCGA_pancancer)
# get row and colnames in order
rownames(.TCGA_pancancer_transpose) <- colnames(.TCGA_pancancer)
colnames(.TCGA_pancancer_transpose) <- rownames(.TCGA_pancancer)
return(.TCGA_pancancer_transpose)
}
#' @title Read CNV ratio dataset from TCGA
#'
#' @description
#'
#' This function reads the CNV ratio data
#' `broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.gene.xena`, in which
#' CNV ratios were calculated by dividing, for each gene, the estimated
#' gene-level CNV values in malignant tumors to the average CNV value in
#' normal adjacent tissues (NATs) of the same cancer type.
#'
#' @details
#'
#' The function also removes possible duplicated tumor samples and
#' samples with NAs in the dataset.
#'
#' It should not be used directly, only inside [initialize_cnv_data()]
#' function.
#'
#' @family helper function for data initialization
#'
#' @return
#'
#' a data frame that contains CNV ratio data for each tumor
#'
#' @examples \dontrun{
#' .get_TCGA_CNV_ratio()
#' }
#'
#' @keywords internal
#'
.get_TCGA_CNV_ratio <- function() {
.TCGA_pancancer <- data.table::fread(
file.path(
data_file_directory,
"broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.gene.xena"
),
data.table = FALSE
) %>%
tibble::as_tibble() %>%
# as.data.frame(.) %>%
dplyr::distinct(.data$sample, .keep_all = TRUE) %>%
stats::na.omit() %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames(var = "sample")
# transpose function keeps numeric values as numeric.
.TCGA_pancancer_transpose <- data.table::transpose(.TCGA_pancancer)
# get row and colnames in order
rownames(.TCGA_pancancer_transpose) <- colnames(.TCGA_pancancer)
colnames(.TCGA_pancancer_transpose) <- rownames(.TCGA_pancancer)
return(.TCGA_pancancer_transpose)
}
## CNV data analysis and plotting ==============================================
#' @title Calculates the frequency of CNV status in all TCGA cancer types combined
#'
#' @description
#'
#' This function calculates the frequency of each CNV status
#' across tumors in all TCGA cancer types for every EIF4F gene.
#'
#' @details
#'
#' It should not be used directly, only inside
#' [.plot_bargraph_CNV_TCGA()] function.
#'
#' @family helper function for CNV data analysis
#'
#' @param df
#'
#' `.TCGA_CNV_sampletype_subset` generated inside
#' [.plot_bargraph_CNV_TCGA()]
#'
#' @return
#'
#' a summary table ranking the EIF4F gene by the frequencies of
#' duplication (CNV threshold labeled as “1” in the dataset)
#'
#' @importFrom reshape2 dcast melt
#'
#' @examples \dontrun{
#' .CNV_all_cancer(.TCGA_CNV_sampletype_subset)
#' }
#'
#' @keywords internal
#'
.CNV_all_cancer <- function(df) {
.TCGA_CNV_anno_subset_long <- reshape2::melt(df,
id = c(
"sample.type",
"primary.disease"
),
value.name = "CNV"
) %>%
dplyr::mutate_if(is.character, as.factor)
.CNV_sum <-
table(.TCGA_CNV_anno_subset_long[, c("CNV", "variable")]) %>%
# tibble(.) %>%
as.data.frame() %>%
dplyr::mutate(CNV = factor(.data$CNV,
levels = c("-2", "-1", "0", "1", "2")))
# reorder stack bars by the frequency of duplication.
Freq.sum <- reshape2::dcast(.CNV_sum, variable ~ CNV, mean)
.CNV_sum$variable <- factor(.CNV_sum$variable,
levels = Freq.sum[order(Freq.sum$`1`), ]$variable
)
return(.CNV_sum)
}
#' @title Stacked bar plots of the CNV status summary
#'
#' @description
#'
#' This function generates the stacked bar plots to the summary
#' of CNV statuses
#'
#' @details
#'
#' This plot function uses dataset generated from
#' [.CNV_all_cancer()] function
#'
#' It should not be used directly, only inside
#' [.plot_bargraph_CNV_TCGA()]function.
#'
#' Side effect: stacked bar plots on screen and as pdf file to show the summary
#' table of the CNV statuses
#'
#' @family helper function for CNV data plotting
#'
#' @param data summary table from
#' `.CNV_all_cancer(.TCGA_CNV_sampletype_subset)`
#'
#' @examples \dontrun{
#' .CNV_sum_barplot(.CNV_all_cancer(.TCGA_CNV_sampletype_subset))
#' }
#'
#' @keywords internal
#'
.CNV_sum_barplot <- function(data) {
p1 <- ggplot2::ggplot(
data,
aes_(
fill = ~CNV,
y = ~Freq,
x = ~variable
)
) +
geom_bar(
stat = "identity",
position = "fill"
) +
geom_col() +
geom_text(aes(label = paste0(.data$Freq / 100, "%")),
position = position_stack(vjust = 0.5),
size = 4
) +
# scale_y_continuous(labels = scales::percent_format())+
labs(
x = "Tumor types (TCGA pan cancer atlas 2018)",
y = "All TCGA tumors combined"
) +
coord_flip() +
theme_bw() +
theme(
plot.title = black_bold_16,
axis.title.x = black_bold_16,
axis.title.y = element_blank(),
axis.text.x = black_bold_16,
axis.text.y = black_bold_16,
panel.grid = element_blank(),
legend.title = element_blank(),
legend.text = black_bold_16,
legend.position = "top",
legend.justification = "left",
legend.box = "horizontal",
strip.text = black_bold_16
) +
# Flip ordering of legend without altering ordering in plot
guides(fill = guide_legend(reverse = TRUE)) +
scale_fill_manual(
name = "Copy number variation",
breaks = c("-2", "-1", "0", "1", "2"),
labels = c(
"Deep del\n 0",
"Shallow del\n 1",
"Diploid\n 2",
"Gain\n 3",
"Amp\n 3+"
),
values = c("darkblue", "blue", "lightgreen", "red", "darkred")
)
print(p1)
ggplot2::ggsave(
path = file.path(output_directory, "CNV"),
filename = "EIF CNV summary.pdf",
plot = p1,
width = 9,
height = 9,
useDingbats = FALSE
)
return(NULL)
}
#' @title Calculates the frequency of CNV status in individual TCGA cancer types
#'
#' @description
#'
#' a data analysis function that calculates the frequency of
#' CNV status for each EIF4F gene in individual TCGA cancer types.
#'
#' @details
#'
#' It should not be used directly, only inside
#' [.plot_bargraph_CNV_TCGA()] function.
#'
#' @param df `.TCGA_CNV_sampletype_subset` generated inside
#' [.plot_bargraph_CNV_TCGA()]
#'
#' @param gene one gene from the input argument of
#' [.plot_bargraph_CNV_TCGA()]
#'
#' @family helper function for CNV data analysis
#'
#' @return
#'
#' a list with the summary table of CNV in individual
#' TCGA cancer types and gene name
#'
#' @importFrom forcats fct_rev
#'
#' @importFrom tidyselect all_of any_of
#'
#' @examples \dontrun{
#' lapply(EIF, .CNV_ind_cancer, df = .TCGA_CNV_sampletype_subset)
#' }
#'
#' @keywords internal
#'
.CNV_ind_cancer <- function(df, gene) {
.TCGA_CNV_anno_subset_long <- df %>%
dplyr::select(
dplyr::all_of(gene),
"sample.type",
"primary.disease"
) %>%
reshape2::melt(
id = c(
"sample.type",
"primary.disease"
),
value.name = "CNV"
) %>%
dplyr::mutate_if(is.character, as.factor)
.CNV_sum <-
table(.TCGA_CNV_anno_subset_long[, c("CNV", "primary.disease")]) %>%
# tibble(.) %>%
as.data.frame() %>%
dplyr::mutate(CNV = factor(.data$CNV,
levels = c("-2", "-1", "0", "1", "2"))) %>%
dplyr::mutate(primary.disease = forcats::fct_rev(.data$primary.disease))
return(list(.CNV_sum, gene))
}
#' @title Stacked bar plots of the CNV status
#'
#' @description
#'
#' This function generates stacked bar plots for CNV status of
#' each gene in an individual cancer type.
#'
#' @details
#'
#' This plot function uses dataset generated from
#' [.CNV_ind_cancer()] function
#'
#' It should not be used directly, only inside
#' [.plot_bargraph_CNV_TCGA()] function.
#'
#' Side effect: stacked bar plots on screen and as saved pdf files to show
#' CNV status of each gene in an individual cancer type.
#'
#' @family helper function for CNV data plotting
#'
#' @param df `.EIF_CNV_ind_cancer` generated inside
#' [.plot_bargraph_CNV_TCGA()]
#'
#' @examples \dontrun{
#' lapply(.EIF_CNV_ind_cancer, .CNV_barplot)
#' }
#'
#' @keywords internal
#'
.CNV_barplot <- function(df) {
p1 <- ggplot(
df[[1]],
aes_(
fill = ~CNV,
order = ~ as.numeric(CNV),
y = ~Freq,
x = ~primary.disease
)
) +
geom_bar(stat = "identity", position = "fill") +
labs(
x = "Tumor types (TCGA pan cancer atlas 2018)",
y = paste0("Percentages of ", df[[2]], " CNVs")
) +
coord_flip() +
theme_bw() +
theme(
plot.title = black_bold_12,
axis.title.x = black_bold_12,
axis.title.y = element_blank(),
axis.text.x = black_bold_12,
axis.text.y = black_bold_12,
panel.grid = element_blank(),
legend.title = element_blank(),
legend.text = black_bold_12,
legend.position = "top",
legend.justification = "left",
legend.box = "horizontal",
strip.text = black_bold_12
) +
scale_y_continuous(labels = scales::percent_format()) +
# Flip ordering of legend without altering ordering in plot
guides(fill = guide_legend(reverse = TRUE)) +
scale_fill_manual(
name = "Copy number variation",
breaks = c("-2", "-1", "0", "1", "2"),
labels = c(
"Deep del\n 0",
"Shallow del\n 1",
"Diploid\n 2",
"Gain\n 3",
"Amp\n 3+"
),
values = c(
"darkblue", "blue",
"lightgreen", "red",
"darkred"
)
)
print(p1)
ggplot2::ggsave(
path = file.path(output_directory, "CNV"),
filename = paste(df[[2]], "pancancer CNV.pdf"),
plot = p1,
width = 7.5,
height = 9,
useDingbats = FALSE
)
return(NULL)
}
#' @title Correlation matrix for gene pairs
#'
#' @description
#'
#' This function calculates the correlation coefficients
#' between every two genes and plot the correlation matrix with the function.
#'
#' @details
#'
#' This plot function uses dataset `TCGA_CNV_value` generated
#' from [initialize_cnv_data()] function.
#' It should not be used directly, only inside
#' [.plot_matrix_CNVcorr_TCGA()] function.
#'
#' Side effects:
#'
#' (1) the correlation matrix plot on screen and as saved pdf files
#'
#' @family helper function for CNV data plotting
#'
#' @param df output of `TCGA_CNV_value \%>\% select(all_of(EIF))`
#' generated inside [.plot_matrix_CNVcorr_TCGA()]
#'
#' @importFrom corrplot cor.mtest corrplot
#'
#' @importFrom grDevices dev.off pdf
#'
#' @importFrom stats cor cor.test setNames
#'
#' @examples \dontrun{
#' TCGA_CNV_value %>%
#' dplyr::select(all_of(gene_list)) %>%
#' .matrix_plot()
#' }
#'
#' @keywords internal
#'
.matrix_plot <- function(df) {
#M <- stats::cor(df)
testRes <- corrplot::cor.mtest(df, conf.level = 0.95)
p1 <- corrplot::corrplot(
stats::cor(df),
method = "color",
cl.pos = "n", # remove color legend
tl.cex = 1,
number.cex = 1,
addgrid.col = "gray",
addCoef.col = "black",
tl.col = "black",
type = "lower",
order = "FPC",
tl.srt = 45,
p.mat = testRes$p,
sig.level = 0.05, # insig = "blank"
)
print(p1) # print correlation matrix on the screen
# save correlation plot as a pdf file
pdf(
file.path(output_directory, "CNV", "EIF CNV Corrmatrix.pdf"),
width = 9,
height = 9,
useDingbats = FALSE
)
corrplot::corrplot(
stats::cor(df),
method = "color",
cl.pos = "n", # remove color legend
tl.cex = 1,
number.cex = 1,
addgrid.col = "gray",
addCoef.col = "black",
tl.col = "black",
type = "lower",
order = "FPC",
tl.srt = 45,
p.mat = testRes$p,
sig.level = 0.05, # insig = "blank"
)
dev.off()
return(NULL)
}
#' @title Calculates the frequency of CNV status in all TCGA cancer types combined
#'
#' @description
#'
#' This function selects the CNV ratio data in tumors vs
#' adjacent normals from individual TCGA cancer types for each input gene.
#'
#' @details
#'
#' It should not be used directly, only inside
#' [.plot_boxgraph_CNVratio_TCGA()] function.
#'
#' @family helper function for CNV data analysis
#'
#' @param df `.TCGA_CNVratio_sampletype_subset` generated inside
#' [.plot_boxgraph_CNVratio_TCGA()]
#'
#' @param gene one gene from the input argument of [.plot_boxgraph_CNVratio_TCGA()]
#'
#' @return
#'
#' a list with the data frame of CNV ratio of the input gene
#' in individual TCGA cancer types and gene name
#'
#' @examples \dontrun{
#' lapply(EIF, .CNVratio_tumor, df = .TCGA_CNVratio_sampletype_subset)
#' }
#'
#' @keywords internal
#'
.CNVratio_tumor <- function(df, gene) {
.CNVratio_data <- df %>%
dplyr::select(dplyr::all_of(gene), "sample.type", "primary.disease") %>%
reshape2::melt(
id = c("sample.type", "primary.disease"),
value.name = "CNV"
) %>%
dplyr::mutate_if(is.character, as.factor) %>%
dplyr::mutate(primary.disease = forcats::fct_rev(.data$primary.disease))
return(list(.CNVratio_data, gene))
}
#' @title Box plots of the CNV ratios in tumors vs adjacent normals
#'
#' @description
#'
#' This function generates boxplot for CNV ratios in tumors
#' vs adjacent normals from individual TCGA cancer types.
#'
#' @details
#'
#' This plot function uses dataset `.TCGA_CNVratio_sampletype_subset`
#' generated from [.plot_boxgraph_CNVratio_TCGA()] function.
#' It should not be used directly, only inside [.plot_boxgraph_CNVratio_TCGA()]
#' function.
#'
#' Side effects:
#'
#' (1) boxplot on screen and as saved pdf files to show CNV ratios
#' in tumors vs adjacent normals from individual TCGA cancer types.
#'
#' @family helper function for CNV data plotting
#'
#' @param df `.EIF_CNVratio_ind_cancer` generated
#' inside [.plot_boxgraph_CNVratio_TCGA()]
#'
#' @examples \dontrun{
#' lapply(.EIF_CNVratio_ind_cancer, .CNVratio_boxplot)
#' }
#'
#' @keywords internal
#'
.CNVratio_boxplot <- function(df) {
p1 <- ggplot(
data = df[[1]],
aes_(
y = ~ 2**CNV,
x = ~primary.disease,
# x = f.ordered1,
color = ~primary.disease
)
) +
ylim(0, 3) +
geom_hline(yintercept = 1, linetype = "dashed") +
stat_n_text(
size = 5,
fontface = "bold",
hjust = 0
) +
geom_boxplot(
alpha = .01,
outlier.colour = NA,
# size = .75,
# width = 1,
position = position_dodge(width = .9)
) +
labs(
x = "primary disease",
y = paste(df[[2]], "CNV ratio", "(tumor/normal)")
) +
# scale_color_manual(values = col_vector) +
coord_flip() +
theme_bw() +
theme(
plot.title = black_bold_12,
axis.title.x = black_bold_12,
axis.title.y = element_blank(),
axis.text.x = black_bold_12,
axis.text.y = black_bold_12,
panel.grid = element_blank(),
legend.title = element_blank(),
legend.text = black_bold_12,
legend.position = "none",
legend.justification = "left",
legend.box = "horizontal",
strip.text = black_bold_12
)
print(p1)
ggplot2::ggsave(
path = file.path(output_directory, "CNV"),
filename = paste0(df[[2]], "pancancer CNVratio.pdf"),
plot = p1,
width = 7,
height = 9,
useDingbats = FALSE
)
return(NULL)
}
## Composite function to call CNV data analysis and plotting ===================
#' @title Summary of CNV statuses in bar plots
#'
#' @description
#'
#' Provides the summary of CNV statuses of EIF4F genes in tumors from all TCGA
#' cancer types combined and in tumors from individual TCGA cancer types
#'
#' @param gene_list gene names in a vector of characters
#'
#' @details This function
#'
#' * selects a subset of CNV and sample type data of only EIF4F gene from the
#' data frame `TCGA_CNV_sampletype` - a global variable generated from
#' [initialize_cnv_data()].
#' * uses the subset data `.TCGA_CNV_sampletype_subset` to perform the
#' CNV status analysis of all inquired genes with the internal functions
#' [.CNV_all_cancer()] and plot the results as a bar plot
#' with the internal function [.CNV_sum_barplot()].
#' * uses the same subset data `.TCGA_CNV_sampletype_subset` to perform CNV
#' status analysis for each gene across all tumors types by the internal
#' function [.CNV_ind_cancer()] and plots the CNV results by the internal
#' function [.CNV_barplot()].
#'
#' This function is not accessible to the user and will not show at the users'
#' workspace. It can only be called by the exported [EIF4F_CNV_analysis()] function.
#'
#' Side effects:
#'
#' (1) the stacked bar plots on screen and as pdf files to show the summary
#' table of the CNV statuses of all `gene_list` in TCGA tumors
#'
#' (2) stacked bar plots on screen and as saved pdf files to show
#' CNV status of each gene in `gene_list` from an individual cancer type
#'
#' @family composite function to call CNV data analysis and plotting
#'
#' @examples \dontrun{
#' plot_bargraph_CNV_TCGA(c(
#' "TP53", "EIF4A1", "EIF4A2", "EIF4E",
#' "EIF4E2", "EIF4E3", "MYC", "EIF3D", "EIF4EBP1", "EIF4G1", "EIF4G2",
#' "EIF4G3", "PABPC1", "MKNK1", "MKNK2"
#' ))
#' }
#'
#' @keywords internal
#'
.plot_bargraph_CNV_TCGA <- function(gene_list) {
.TCGA_CNV_sampletype_subset <- NULL
# combine CNV data with annotation data
.TCGA_CNV_sampletype_subset <- TCGA_CNV_sampletype %>%
dplyr::select(dplyr::all_of(gene_list), "sample.type", "primary.disease")
# stacked bar plots for CNV status in combined TCGA tumors
.CNV_all_cancer(.TCGA_CNV_sampletype_subset) %>%
.CNV_sum_barplot()
# stacked bar plots for CNV status in each TCGA tumor type
.EIF_CNV_ind_cancer <- lapply(gene_list,
.CNV_ind_cancer,
df = .TCGA_CNV_sampletype_subset
)
lapply(.EIF_CNV_ind_cancer, .CNV_barplot)
return(NULL)
}
#' @title Correlation matrix for CNV values
#'
#' @description
#'
#' generates correlation matrix for eIF4F CNV in tumors
#' from all TCGA cancer type combined
#'
#' @param gene_list gene names in a vector of characters
#'
#' @details This function
#'
#' * selects a subset of unthreshold CNV values of only EIF4F gene
#' from the data frame `TCGA_CNV_value` - a global variable generated from [initialize_cnv_data()].
#' * uses the subset data to calculate the correlation coefficients
#' between every two genes and plot the correlation matrix with the function
#' [.matrix_plot()]
#'
#' This function is not accessible to the user and will not show at the users'
#' workspace. It can only be called by the exported [EIF4F_CNV_analysis()] function.
#'
#' side effect: the correlation matrix plot on screen and as saved pdf files
#' to show co-occurrence of `gene_list` CNV status in TCGA tumors
#'
#' @family Composite function to call CNV data analysis and plotting
#'
#' @examples \dontrun{
#' plot_matrix_CNVcorr_TCGA(c("EIF4A1", "EIF4E", "EIF4EBP1", "EIF4G1"))
#' }
#'
#' @keywords internal
#'
.plot_matrix_CNVcorr_TCGA <- function(gene_list) {
TCGA_CNV_value %>%
dplyr::select(dplyr::all_of(gene_list)) %>%
.matrix_plot()
return(NULL)
}
#' @title CNV ratios in tumors vs adjacent normal tissues across tumor types
#'
#' @description
#'
#' This function generates boxplot for CNV ratios in tumors vs adjacent normals
#' from individual TCGA cancer types.
#'
#' @param gene_list gene names in a vector of characters
#'
#' @details This function
#'
#' * selects a subset of CNV and sample type data of only EIF4F gene from the
#' data frame `TCGA_CNVratio_sampletype` - a global variable generated from
#' [initialize_cnv_data()].
#' * uses the subset data `.TCGA_CNVratio_sampletype_subset` to perform CNV
#' status analysis for each gene across all tumors types with the internal
#' function [.CNVratio_tumor()] and plots the CNV results with the internal
#' function [.CNVratio_boxplot()].
#'
#' This function is not accessible to the user and will not show at the users'
#' workspace. It can only be called by the exported [EIF4F_CNV_analysis()] function.
#'
#' Side effects:
#'
#' (1) the box plots on screen and as saved pdf files to show CNV ratio of
#' `gene_list` in TCGA tumors vs normals
#'
#' @family composite function to call CNV data analysis and plotting
#'
#' @examples \dontrun{
#' plot_boxgraph_CNVratio_TCGA(c("EIF4A1", "EIF4E", "EIF4EBP1", "EIF4G1"))
#' }
#'
#' @keywords internal
#'
.plot_boxgraph_CNVratio_TCGA <- function(gene_list) {
.TCGA_CNVratio_sampletype_subset <- NULL
.TCGA_CNVratio_sampletype_subset <- TCGA_CNVratio_sampletype %>%
dplyr::select(
dplyr::all_of(gene_list),
"sample.type",
"primary.disease"
)
.EIF_CNVratio_ind_cancer <- lapply(gene_list,
.CNVratio_tumor,
df = .TCGA_CNVratio_sampletype_subset
)
lapply(.EIF_CNVratio_ind_cancer, .CNVratio_boxplot)
return(NULL)
}
## Wrapper function to call all internal composite functions with provided inputs =======
#' @title Perform all CNV related analysis and generate plots
#'
#' @description
#'
#' A wrapper function to call all internal composite functions for CNV analysis with
#' provided inputs
#'
#' @details
#'
#' This function run three internal composite functions together with inputs:
#'
#' * [.plot_bargraph_CNV_TCGA()]
#' * [.plot_matrix_CNVcorr_TCGA()]
#' * [.plot_boxgraph_CNVratio_TCGA()]
#'
#' Side effect:
#'
#' (1) stacked bar plots on screen and as pdf file to show the summary
#' table of the CNV statuses of all EIF genes in TCGA tumors
#'
#' (2) stacked bar plots on screen and as saved pdf files to show
#' CNV status of each EIF gene from an individual cancer type
#'
#' (3) correlation matrix plot on screen and as saved pdf files to show
#' co-occurrence of EIF genes CNV status in TCGA tumors
#'
#' (4) box plots on screen and as saved pdf files to show CNV ratio
#' of EIF genes in TCGA tumors vs normals
#'
#' @family wrapper function to call all composite functions with inputs
#'
#' @export
#'
#' @examples \dontrun{
#' EIF4F_CNV_analysis()
#' }
#'
EIF4F_CNV_analysis <- function() {
.plot_bargraph_CNV_TCGA(c(
"TP53", "EIF4A1", "EIF4A2",
"EIF4E", "EIF4E2", "EIF4E3",
"MYC", "EIF3D", "EIF4EBP1",
"EIF4G1", "EIF4G2", "EIF4G3",
"EIF4H", "EIF4B"
))
.plot_matrix_CNVcorr_TCGA(c(
"TP53", "EIF4A1", "EIF4A2",
"EIF4E", "EIF4E2", "EIF4E3",
"MYC", "EIF3D", "EIF4EBP1",
"EIF4G1", "EIF4G2", "EIF4G3",
"EIF4H", "EIF4B"
))
.plot_boxgraph_CNVratio_TCGA(c(
"EIF4G1", "EIF4E", "EIF4A1", "EIF4EBP1", "EIF4G2", "EIF4G3",
"EIF4E2", "EIF4E3", "EIF4A2", "EIF3D", "EIF4H", "EIF4B"
))
return(NULL)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.