# Correlation between RNA and protein levels for EIF4F
# This R script contains four sections.
#
# (1) CCLE and LUAD proteomics data preparation
#
# (2) selection of RNA and protein expression data and plotting
#
# (3) composite functions to execute a pipeline of functions to select related
# expression with supply of EIF4F gene names for correlation analysis and
# plotting.
#
# (4) wrapper function to call all master functions with inputs
#
## Wrapper function for data initialization of RNA and proteomics datasets =====
#' @noRd
## due to NSE notes in R CMD check
CCLE_RNAseq <- CCLE_Anno <- CCLE_Proteomics <- CPTAC_LUAD_Proteomics <- NULL
CPTAC_LUAD_RNAseq <- NULL
#' @title Read all proteomics related datasets from CCLE and CPTAC LUAD
#'
#' @description
#'
#' A wrapper function reads all proteomics related datasets from CCLE and
#' CPTAC LUAD.
#'
#' @details Side effects:
#'
#' (1) `CCLE_RNAseq`: the RNAseq data of CCLE from the download dataset
#' `CCLE_expression_full.csv`.
#' It is stored as `CCLE_RNAseq.csv` in
#' `~/Documents/EIF_output/ProcessedData` folder.
#'
#' (2) `CCLE_Anno`: the annotation data of CCLE from `sample_info.csv`.
#' It is stored as `CCLE_Anno.csv` in
#' `~/Documents/EIF_output/ProcessedData` folder.
#'
#' (3) `CCLE_Proteomics`: the proteomics data of CCLE from
#' `protein_quant_current_normalized.csv`.
#' It is stored as `CCLE_Proteomics.csv` in
#' `~/Documents/EIF_output/ProcessedData` folder.
#'
#' (4) `CPTAC_LUAD_Proteomics`: the proteomics data with annotation of CPTAC
#' LUAD study from the download data file `Protein.xlsx`.
#' It is stored as `CPTAC_LUAD_Proteomics.csv` in
#' `~/Documents/EIF_output/ProcessedData` folder.
#'
#' (5) `CPTAC_LUAD_RNAseq`: the RNAseq data of CPTAC LUAD samples from
#' the download data file `RNA.xlsx`.
#' It is stored as `CPTAC_LUAD_RNAseq.csv` in
#' `~/Documents/EIF_output/ProcessedData` folder.
#'
#' @family wrapper function for data initialization
#'
#' @importFrom readxl read_excel
#'
#' @export
#'
#' @examples \dontrun{
#' initialize_proteomics_data()
#' }
#'
initialize_proteomics_data <- function() {
rlang::env_binding_unlock(parent.env(environment()), nms = NULL)
if (!file.exists(file.path(
output_directory,
"ProcessedData",
"CCLE_RNAseq.csv"
))) {
assign("CCLE_RNAseq",
fread(
file.path(
data_file_directory,
"CCLE_expression_full.csv"
),
data.table = FALSE
),
envir = parent.env(environment())
)
data.table::fwrite(CCLE_RNAseq, file.path(
output_directory,
"ProcessedData",
"CCLE_RNAseq.csv"
))
} else {
assign("CCLE_RNAseq",
data.table::fread(file.path(
output_directory,
"ProcessedData",
"CCLE_RNAseq.csv"),data.table = FALSE
) %>%
tibble::as_tibble(),
envir = parent.env(environment())
)
}
if (!file.exists(file.path(
output_directory,
"ProcessedData",
"CCLE_Anno.csv"
))) {
assign("CCLE_Anno",
fread(
file.path(
data_file_directory,
"sample_info.csv"
),
data.table = FALSE
) %>% dplyr::select(1, 2),
envir = parent.env(environment())
)
data.table::fwrite(CCLE_Anno, file.path(
output_directory,
"ProcessedData",
"CCLE_Anno.csv"
))
} else {
assign("CCLE_Anno",
data.table::fread(file.path(
output_directory,
"ProcessedData",
"CCLE_Anno.csv"),data.table = FALSE
) %>%
tibble::as_tibble(),
envir = parent.env(environment())
)
}
if (!file.exists(file.path(
output_directory,
"ProcessedData",
"CCLE_Proteomics.csv"
))) {
assign("CCLE_Proteomics",
fread(
file.path(
data_file_directory,
"protein_quant_current_normalized.csv"
),
data.table = FALSE
),
envir = parent.env(environment())
)
data.table::fwrite(CCLE_Proteomics, file.path(
output_directory,
"ProcessedData",
"CCLE_Proteomics.csv"
))
} else {
assign("CCLE_Proteomics",
data.table::fread(file.path(
output_directory,
"ProcessedData",
"CCLE_Proteomics.csv"),data.table = FALSE
) %>%
tibble::as_tibble(),
envir = parent.env(environment())
)
}
if (!file.exists(file.path(
output_directory,
"ProcessedData",
"CPTAC_LUAD_Proteomics.csv"
))) {
assign("CPTAC_LUAD_Proteomics",
readxl::read_excel(
file.path(data_file_directory, "Protein.xlsx"),
col_names = FALSE
) %>%
# as.data.frame(.) %>%
dplyr::mutate(...1 = make.unique(.data$...1)) %>% # relabel the duplicates
tibble::column_to_rownames(var = "...1") %>%
t() %>%
tibble::as_tibble() %>%
dplyr::mutate_at(
dplyr::vars(-.data$Type, -.data$Sample),
# exclude two columns convert character to number
~ suppressWarnings(as.numeric(.))
),
envir = parent.env(environment())
)
data.table::fwrite(CPTAC_LUAD_Proteomics, file.path(
output_directory,
"ProcessedData",
"CPTAC_LUAD_Proteomics.csv"
))
} else {
assign("CPTAC_LUAD_Proteomics",
data.table::fread(file.path(
output_directory,
"ProcessedData",
"CPTAC_LUAD_Proteomics.csv"),data.table = FALSE
) %>%
tibble::as_tibble(),
envir = parent.env(environment())
)
}
if (!file.exists(file.path(
output_directory,
"ProcessedData",
"CPTAC_LUAD_RNAseq.csv"
))) {
assign("CPTAC_LUAD_RNAseq",
readxl::read_excel(
file.path(data_file_directory, "RNA.xlsx"),
col_names = FALSE
) %>%
# as_tibble(.) %>%
dplyr::mutate(...1 = make.unique(.data$...1)) %>% # relabel the duplicates
tibble::column_to_rownames(var = "...1") %>%
t() %>%
tibble::as_tibble() %>%
dplyr::mutate_at(
dplyr::vars(-.data$Type, -.data$Sample),
# exclude two columns convert character to number
~ suppressWarnings(as.numeric(.))
),
envir = parent.env(environment())
)
data.table::fwrite(CPTAC_LUAD_RNAseq, file.path(
output_directory,
"ProcessedData",
"CPTAC_LUAD_RNAseq.csv"
))
} else {
assign("CPTAC_LUAD_RNAseq",
data.table::fread(file.path(
output_directory,
"ProcessedData",
"CPTAC_LUAD_RNAseq.csv"),data.table = FALSE
) %>%
tibble::as_tibble() ,
envir = parent.env(environment())
)
}
rlang::env_binding_lock(parent.env(environment()), nms = NULL)
}
## Select EIF RNA/pro data and plotting ================================
#' @title Select subset of CCLE RNAseq data
#'
#' @description An internal helper function selects the data of input gene
#' `EIF` from the data frame `CCLE_RNAseq` - a global variable generated from
#' [initialize_proteomics_data].
#'
#' @details The function should not be used directly, only inside
#' [.plot_scatter_RNApro_CCLE()] function.
#'
#' @param gene_name gene name
#'
#' @return a data frame of CCLE RNAseq data from the input `gene_name` genes
#'
#' @family helper function for RNA protein correlation analysis
#'
#' @importFrom dplyr starts_with
#'
#' @examples \dontrun{
#' .get_CCLE_RNAseq_subset()
#' }
#'
#' @keywords internal
#'
.get_CCLE_RNAseq_subset <- function(gene_name) {
return(CCLE_RNAseq %>%
rename("DepMap_ID" = "V1") %>%
dplyr::select("DepMap_ID", dplyr::starts_with((!!paste(gene_name, "(ENSG")))))
}
#' @title Select subset of CCLE proteomics data
#'
#' @description A helper function selects the data of input gene `gene_name`.
#' from the data frame `CCLE_Proteomics` - a global variable generated from
#' [initialize_proteomics_data].
#'
#' @details The function should not be used directly, only inside
#' [.plot_scatter_RNApro_CCLE()] function.
#'
#' @param gene_name gene name
#'
#' @return a data frame of CCLE proteomics data from the input `gene_name`
#'
#' @family helper function for RNA protein correlation analysis
#'
#' @importFrom dplyr across
#'
#' @importFrom tidyselect contains vars_select_helpers
#'
#' @examples \dontrun{
#' .get_CCLE_Proteomics_subset()
#' }
#'
#' @keywords internal
#'
.get_CCLE_Proteomics_subset <- function(gene_name) {
.CCLE_Proteomics_subset <- CCLE_Proteomics %>%
dplyr::filter(.data$Gene_Symbol == gene_name)
if (nrow(.CCLE_Proteomics_subset) > 1) {
df <- .CCLE_Proteomics_subset %>%
dplyr::select(contains("_Peptides")) %>%
dplyr::mutate(sum = rowSums(
dplyr::across(tidyselect::vars_select_helpers$where(is.numeric))))
name <- rownames(df[df$sum == max(df$sum), ]) # for the maximum value
.CCLE_Proteomics_subset <- .CCLE_Proteomics_subset %>%
dplyr::filter(row.names(.CCLE_Proteomics_subset) == name)
} else {
TRUE
}
.CCLE_Proteomics_subset <- .CCLE_Proteomics_subset %>%
tibble::column_to_rownames(var = "Gene_Symbol") %>%
dplyr::select(contains("TenPx"), -contains("_Peptides")) %>%
t() %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "rn") %>%
dplyr::mutate(
Celline = sub("\\_.*", "", .data$rn),
Type = sub(".*_ *(.*?) *_.*", "\\1", .data$rn)
) %>%
dplyr::select(-.data$rn) %>%
dplyr::mutate_if(is.character, as.factor) %>%
# select(!!paste0(EIF,".pro") := EIF)
rename(!!paste0(gene_name, ".pro") := gene_name) # rename with dynamic variables
return(.CCLE_Proteomics_subset)
}
#' @title Scatter plots of correlation between RNAseq and proteomics data
#'
#' @description A helper function plots RNAseq and proteomics data in scatter
#' plot and analyzes the correlation
#'
#' @details This function should not be used directly, only inside
#' [.plot_scatter_RNApro_CCLE()] or [.plot_scatter_RNApro_LUAD()] function.
#'
#' Side effects:
#'
#' (1) scatter plots to show correlation between RNA and protein
#' expressions
#'
#' @param df dataframe of both RNAseq and proteomics generated inside
#'
#' @param gene_name protein or gene name
#'
#' @param cohort database name. `CCLE` or `LUAD`
#'
#' @family helper function for RNA protein correlation analysis
#'
#' @importFrom ggpubr ggscatter
#'
#' @keywords internal
#'
.RNApro_scatterplot <- function(df, gene_name, cohort) {
p1 <- ggscatter(df,
x = paste0(gene_name, ".pro"),
y = paste0(gene_name, ".rna"), # color = "Type",
add = "reg.line", # conf.int = TRUE,
cor.coef = TRUE,
cor.method = "pearson",
title = paste(gene_name, "(", cohort, ")"),
xlab = "Protein expresion",
ylab = "RNA expression"
) +
theme_bw() +
theme(
plot.title = black_bold_12,
axis.title.x = black_bold_12,
axis.title.y = black_bold_12,
axis.text.x = black_bold_12,
axis.text.y = black_bold_12,
panel.grid = element_blank(),
legend.position = "none",
strip.text = black_bold_12,
strip.background = element_rect(fill = "white")
)
print(p1)
ggplot2::ggsave(
path = file.path(output_directory, "RNApro"),
filename = paste(cohort, gene_name, "cor", ".pdf"),
plot = p1,
#width = 3,
#height = 3,
width = 6,
height = 6,
useDingbats = FALSE
)
return(NULL)
}
## Composite function to perform RNA protein correlation =======================
#' @title Correlation between CCLE RNAseq and proteomics data
#'
#' @description
#'
#' An internal composite function generates correlation scatter plot for eIF4F RNAseq and
#' proteomics data from CCLE.
#'
#' @details This function
#'
#' * merges the CCLE RNAseq values of EIF4F genes in the data frame prepared
#' from [.get_CCLE_RNAseq_subset()], the annotation data of CCLE `CCLE_Anno`,
#' and proteomics data of the same EIF4F protein in the data frame prepared
#' from [.get_CCLE_Proteomics_subset()].
#'
#' * uses the combined data to calculate the correlation coefficients
#' between protein and RNA levels, and plot the result with the function
#' [.RNApro_scatterplot()]
#'
#' This function is not accessible to the user and will not show at the users'
#' workspace. It can only be called by the exported [EIF4F_RNA_pro_correlation()]
#' function.
#'
#' Side effects:
#'
#' (1) scatter plot to show correlation between RNA and protein of
#' input `gene_name` expressions in CCLE samples
#'
#' @param gene_name gene name
#'
#' @family composite function to call RNA protein correlation analysis and
#' plotting
#'
#' @keywords internal
#'
#' @examples \dontrun{
#' lapply(c("EIF4G1", "EIF4A1", "EIF4E"), .plot_scatter_RNApro_CCLE)
#' }
#'
.plot_scatter_RNApro_CCLE <- function(gene_name) {
.df <- .get_CCLE_RNAseq_subset(gene_name) %>%
dplyr::full_join(CCLE_Anno, by = "DepMap_ID") %>%
stats::na.omit() %>%
dplyr::select(-.data$DepMap_ID) %>%
dplyr::rename(
"Celline" = "stripped_cell_line_name",
!!paste0(gene_name, ".rna") := tidyselect::contains(gene_name)
) %>%
dplyr::full_join(.get_CCLE_Proteomics_subset(gene_name), by = "Celline") %>%
stats::na.omit()
.RNApro_scatterplot(df = .df, gene_name = gene_name, cohort = "CCLE")
return(NULL)
}
#' @title Correlation between LUAD RNAseq and proteomics data
#'
#' @description
#' A composite function generates correlation scatter plot for eIF4F RNAseq and
#' proteomics data from LUAD.
#'
#' @details This function
#'
#' * merges the LUAD RNAseq data of EIF4F genes prepared from
#' `CPTAC_LUAD_Proteomics` and proteomics data of the EIF4F proteins
#' prepared from `CPTAC_LUAD_RNAseq`.
#' * uses the combined data to calculate the correlation coefficients between
#' protein and RNA levels, and plot the result with the function
#' [.RNApro_scatterplot()]
#'
#' This function is not accessible to the user and will not show at the users'
#' workspace. It can only be called by the exported [EIF4F_RNA_pro_correlation()]
#' function.
#'
#' Side effects:
#'
#' (1) scatter plot to show correlation between RNA and protein
#' expressions of input `gene_name` in CPTAC LUAD samples
#'
#' @param gene_name gene name
#'
#' @family composite function to call RNA protein correlation analysis and
#' plotting
#'
#' @keywords internal
#'
#' @examples \dontrun{
#' lapply(c("EIF4G1", "EIF4A1", "EIF4E"), .plot_scatter_RNApro_LUAD)
#' }
#'
.plot_scatter_RNApro_LUAD <- function(gene_name) {
.EIF_LUAD_Proteomics <- CPTAC_LUAD_Proteomics %>%
dplyr::select(dplyr::all_of(gene_name), "Type", "Sample") %>%
dplyr::filter(.data$Type == "Tumor")
.EIF_LUAD_RNAseq <- CPTAC_LUAD_RNAseq %>%
dplyr::select(dplyr::all_of(gene_name), "Type", "Sample") %>%
dplyr::filter(.data$Type == "Tumor")
df <- merge(.EIF_LUAD_Proteomics,
.EIF_LUAD_RNAseq,
by = c("Sample", "Type"),
suffixes = c(".pro", ".rna")
)
.RNApro_scatterplot(df = df, gene_name = gene_name, cohort = "LUAD")
return(NULL)
}
## Wrapper function to call all composite functions with inputs ================
#' @title Perform RNA protein correlation and generate scatter plots
#'
#' @description
#'
#' A wrapper function to call all composite functions for RNA and protein
#' correlation with inputs.
#'
#' @details
#'
#' This function run the internal composite functions [.plot_scatter_RNApro_CCLE()] and
#' [.plot_scatter_RNApro_LUAD()] with EIF4F gene name as inputs.
#'
#' Side effects:
#'
#' (1) scatter plot to show correlation between EIF4F RNA and protein
#' expressions in CCLE samples
#'
#' (2) scatter plot to show correlation between EIF4F RNA and protein
#' expressions in CPTAC LUAD samples
#'
#' @family wrapper function to call all composite functions with inputs
#'
#' @export
#'
#' @examples \dontrun{
#' EIF4F_RNA_pro_correlation()
#' }
#'
EIF4F_RNA_pro_correlation <- function() {
lapply(c("EIF4G1", "EIF4A1", "EIF4E"), .plot_scatter_RNApro_CCLE)
lapply(c("EIF4G1", "EIF4A1", "EIF4E"), .plot_scatter_RNApro_LUAD)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.