R/CNV.R

Defines functions EIF4F_CNV_analysis .plot_boxgraph_CNVratio_TCGA .plot_matrix_CNVcorr_TCGA .plot_bargraph_CNV_TCGA .CNVratio_boxplot .CNVratio_tumor .matrix_plot .CNV_barplot .CNV_ind_cancer .CNV_sum_barplot .CNV_all_cancer .get_TCGA_CNV_ratio .get_TCGA_CNV_value .get_TCGA_CNV initialize_cnv_data

Documented in .CNV_all_cancer .CNV_barplot .CNV_ind_cancer .CNVratio_boxplot .CNVratio_tumor .CNV_sum_barplot EIF4F_CNV_analysis .get_TCGA_CNV .get_TCGA_CNV_ratio .get_TCGA_CNV_value initialize_cnv_data .matrix_plot .plot_bargraph_CNV_TCGA .plot_boxgraph_CNVratio_TCGA .plot_matrix_CNVcorr_TCGA

# CNV analyses of EIF genes in TCGA data
#
# This R script contains four sections:
#
# (1) CNV data preparation
#
# (2) CNV data analyses and plotting
#
# (3) composite functions to execute a pipeline of functions to select related
#  CNV data with supply of EIF4F gene names for analysis and plotting.
#
# (4) wrapper function to call all composite functions with inputs
#

## Internal function for data initialization of CNV related datasets ============

#' @noRd
## due to NSE notes in R CMD check
TCGA_CNV_value <- TCGA_CNV_sampletype <- TCGA_CNVratio_sampletype <- NULL



#' @title Read all CNV related datasets from TCGA
#'
#' @description
#'
#' A wrapper function reads all CNV related datasets from TCGA and its side effects
#'  are three global variables.
#'
#' @details Side effects:
#'
#' (1) `TCGA_CNV_value`: the unthreshold CNV value data generated as the output
#'  [.get_TCGA_CNV_value()], which imports the download dataset
#'  `Gistic2_CopyNumber_Gistic2_all_data_by_genes`.
#'
#' (2) `TCGA_CNV_sampletype`: the merged dataset from
#'   * `.TCGA.CNV` that contains the threshold CNV data from the download dataset
#'  `Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes` and was generated
#'  from [.get_TCGA_CNV()],
#'   * `.TCGA_sampletype` that contains the annotation data from
#'  `TCGA_phenotype_denseDataOnlyDownload.tsv` with the selection of
#'  `sample.type` and `primary.disease` columns. `Solid Tissue Normal` samples
#'  are excluded.
#'
#' (3) `TCGA_CNVratio_sampletype`: the merged dataset from
#'   * `.TCGA_CNV_ratio` that contains the CNV ratio data from the download
#'  dataset `broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.gene.xena` and
#'  was generated by [.get_TCGA_CNV_ratio()],
#'   * `.TCGA_sampletype`.
#'
#' `TCGA_CNV_value`, `TCGA_CNV_sampletype` and `TCGA_CNVratio_sampletype` are
#' stored as `TCGA_CNV_value.csv`, `TCGA_CNV_sampletype.csv` and
#' `TCGA_CNVratio_sampletype.csv` in `~/Documents/EIF_output/ProcessedData` folder.
#'
#' @family wrapper function for data initialization
#'
#' @importFrom data.table fread transpose
#'
#' @importFrom dplyr distinct filter select select_if mutate mutate_at
#'  summarise rename group_by
#'
#' @importFrom readr write_csv
#'
#' @importFrom stats na.omit
#'
#' @importFrom tibble remove_rownames column_to_rownames
#'
#' @export
#'
#' @examples \dontrun{
#'  initialize_cnv_data()
#'  }
#'
initialize_cnv_data <- function() {
  rlang::env_binding_unlock(parent.env(environment()), nms = NULL)

  .path <- file.path(output_directory, "ProcessedData","TCGA_CNV_value.csv")

  if (!file.exists(.path)) {
    data.table::fwrite(.get_TCGA_CNV_value(), .path, row.names = TRUE)
  }

  assign("TCGA_CNV_value",
         data.table::fread(.path ,data.table = FALSE) %>%
           tibble::as_tibble() %>%
           #tibble::remove_rownames() %>%
           tibble::column_to_rownames(var = "V1"),
         envir = parent.env(environment())
  )

  .TCGA_sampletype <- readr::read_tsv(file.path(
    data_file_directory,
    "TCGA_phenotype_denseDataOnlyDownload.tsv"
  )) %>%
    as.data.frame() %>%
    dplyr::distinct(.data$sample, .keep_all = TRUE) %>%
    stats::na.omit() %>%
    tibble::remove_rownames() %>%
    tibble::column_to_rownames(var = "sample") %>%
    dplyr::select("sample_type", "_primary_disease") %>%
    dplyr::rename(
      "sample.type" = "sample_type",
      "primary.disease" = "_primary_disease"
    )

  if (!file.exists(file.path(
    output_directory,
    "ProcessedData",
    "TCGA_CNV_sampletype.csv"
  ))) {
    .TCGA_CNV <- .get_TCGA_CNV()

    assign("TCGA_CNV_sampletype",
      merge(.TCGA_CNV,
        .TCGA_sampletype,
        by    = "row.names",
        all.x = TRUE
      ) %>%
        dplyr::filter(.data$sample.type != "Solid Tissue Normal") %>%
        tibble::remove_rownames() %>%
        tibble::column_to_rownames(var = "Row.names"),
      envir = parent.env(environment())
    )

    data.table::fwrite(TCGA_CNV_sampletype, file.path(
      output_directory,
      "ProcessedData",
      "TCGA_CNV_sampletype.csv"
    ),row.names = TRUE)

  } else {
    assign("TCGA_CNV_sampletype",
      data.table::fread(file.path(
        output_directory,
        "ProcessedData",
        "TCGA_CNV_sampletype.csv"),data.table = FALSE
      ) %>%
        tibble::as_tibble() %>%
        #tibble::remove_rownames() %>%
        tibble::column_to_rownames(var = "V1"),
      envir = parent.env(environment())
    )
  }

  if (!file.exists(file.path(
    output_directory,
    "ProcessedData",
    "TCGA_CNVratio_sampletype.csv"
  ))) {
    .TCGA_CNV_ratio <- .get_TCGA_CNV_ratio()

    assign("TCGA_CNVratio_sampletype",
      merge(.TCGA_CNV_ratio,
        .TCGA_sampletype,
        by    = "row.names",
        all.x = TRUE
      ) %>%
        tibble::remove_rownames() %>%
        tibble::column_to_rownames(var = "Row.names"),
      envir = parent.env(environment())
    )

    data.table::fwrite(TCGA_CNVratio_sampletype, file.path(
      output_directory,
      "ProcessedData",
      "TCGA_CNVratio_sampletype.csv"
    ),row.names = TRUE)

  } else {
    assign("TCGA_CNVratio_sampletype",
           data.table::fread(file.path(
             output_directory,
             "ProcessedData",
             "TCGA_CNVratio_sampletype.csv"),data.table = FALSE
           ) %>%
             tibble::as_tibble() %>%
             #tibble::remove_rownames() %>%
             tibble::column_to_rownames(var = "V1"),
      envir = parent.env(environment())
    )
  }

  rlang::env_binding_lock(parent.env(environment()), nms = NULL)

  return(NULL)
}

#' @title Read CNV threshold dataset from TCGA
#'
#' @description
#'
#' This function reads the threshold CNV data
#'  `Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes`, which grouped the
#'  CNV values with thresholds 3+, 3, 2, 1, 0, to represent high-level copy
#'  number gain, low-level copy number gain, diploid, shallow deletion, or
#'  deep deletion.
#'
#' @details
#'
#' The function also removes possible duplicated tumor samples and
#'  samples with NAs in the dataset.
#'
#' It should not be used directly, only inside [initialize_cnv_data()]
#'  function.
#'
#' @family helper function for data initialization
#'
#' @importFrom tibble as_tibble
#'
#' @return a data frame that contains CNV threshold data for all TCGA tumor
#'
#' @examples \dontrun{
#' .get_TCGA_CNV()
#' }
#'
#' @keywords internal
#'
.get_TCGA_CNV <- function() {
  .TCGA_pancancer <- data.table::fread(
    file.path(
      data_file_directory,
      "Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes"
    ),
    data.table = FALSE
  ) %>%
    tibble::as_tibble() %>%
    # as.data.frame(.) %>%
    dplyr::distinct(.data$Sample, .keep_all = TRUE) %>%
    na.omit() %>%
    tibble::remove_rownames() %>%
    tibble::column_to_rownames(var = "Sample")

  # transpose function from the data.table keeps numeric values as numeric.
  .TCGA_pancancer_transpose <- data.table::transpose(.TCGA_pancancer)
  # get row and column names in order
  rownames(.TCGA_pancancer_transpose) <- colnames(.TCGA_pancancer)
  colnames(.TCGA_pancancer_transpose) <- rownames(.TCGA_pancancer)

  return(.TCGA_pancancer_transpose)
}

#' @title Read CNV value dataset from TCGA
#'
#' @description
#'
#' This function reads the unthreshold CNV value data from TCGA
#'  `Gistic2_CopyNumber_Gistic2_all_data_by_genes`.
#'
#' @details
#'
#' The function also removes possible duplicated tumor samples and
#'  samples with NAs in the dataset.
#'
#' It should not be used directly, only inside [initialize_cnv_data()]
#'  function.
#'
#' @family helper function for data initialization
#'
#' @return
#'
#' a data frame that contains CNV value data for each tumor
#'
#' @examples \dontrun{
#' .get_TCGA_CNV_value()
#' }
#'
#' @keywords internal
#'
.get_TCGA_CNV_value <- function() {
  .TCGA_pancancer <- data.table::fread(
    file.path(
      data_file_directory,
      "Gistic2_CopyNumber_Gistic2_all_data_by_genes"
    ),
    data.table = FALSE
  ) %>%
    tibble::as_tibble() %>%
    # as.data.frame(.) %>%
    dplyr::distinct(.data$Sample, .keep_all = TRUE) %>%
    stats::na.omit() %>%
    tibble::remove_rownames() %>%
    tibble::column_to_rownames(var = "Sample")

  # transpose function keeps numeric values as numeric.
  .TCGA_pancancer_transpose <- data.table::transpose(.TCGA_pancancer)
  # get row and colnames in order
  rownames(.TCGA_pancancer_transpose) <- colnames(.TCGA_pancancer)
  colnames(.TCGA_pancancer_transpose) <- rownames(.TCGA_pancancer)

  return(.TCGA_pancancer_transpose)
}

#' @title Read CNV ratio dataset from TCGA
#'
#' @description
#'
#' This function reads the CNV ratio data
#'  `broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.gene.xena`, in which
#'  CNV ratios were calculated by dividing, for each gene, the estimated
#'  gene-level CNV values in malignant tumors to the average CNV value in
#'  normal adjacent tissues (NATs) of the same cancer type.
#'
#' @details
#'
#' The function also removes possible duplicated tumor samples and
#'  samples with NAs in the dataset.
#'
#' It should not be used directly, only inside [initialize_cnv_data()]
#'  function.
#'
#' @family helper function for data initialization
#'
#' @return
#'
#' a data frame that contains CNV ratio data for each tumor
#'
#' @examples \dontrun{
#' .get_TCGA_CNV_ratio()
#' }
#'
#' @keywords internal
#'
.get_TCGA_CNV_ratio <- function() {
  .TCGA_pancancer <- data.table::fread(
    file.path(
      data_file_directory,
      "broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.gene.xena"
    ),
    data.table = FALSE
  ) %>%
    tibble::as_tibble() %>%
    # as.data.frame(.) %>%
    dplyr::distinct(.data$sample, .keep_all = TRUE) %>%
    stats::na.omit() %>%
    tibble::remove_rownames() %>%
    tibble::column_to_rownames(var = "sample")

  # transpose function keeps numeric values as numeric.
  .TCGA_pancancer_transpose <- data.table::transpose(.TCGA_pancancer)
  # get row and colnames in order
  rownames(.TCGA_pancancer_transpose) <- colnames(.TCGA_pancancer)
  colnames(.TCGA_pancancer_transpose) <- rownames(.TCGA_pancancer)

  return(.TCGA_pancancer_transpose)
}


## CNV data analysis and plotting ==============================================


#' @title Calculates the frequency of CNV status in all TCGA cancer types combined
#'
#' @description
#'
#' This function calculates the frequency of each CNV status
#' across tumors in all TCGA cancer types for every EIF4F gene.
#'
#' @details
#'
#' It should not be used directly, only inside
#' [.plot_bargraph_CNV_TCGA()] function.
#'
#' @family helper function for CNV data analysis
#'
#' @param df
#'
#' `.TCGA_CNV_sampletype_subset` generated inside
#' [.plot_bargraph_CNV_TCGA()]
#'
#' @return
#'
#' a summary table ranking the EIF4F gene by the frequencies of
#' duplication (CNV threshold labeled as “1” in the dataset)
#'
#' @importFrom reshape2 dcast melt
#'
#' @examples \dontrun{
#' .CNV_all_cancer(.TCGA_CNV_sampletype_subset)
#' }
#'
#' @keywords internal
#'
.CNV_all_cancer <- function(df) {
  .TCGA_CNV_anno_subset_long <- reshape2::melt(df,
    id = c(
      "sample.type",
      "primary.disease"
    ),
    value.name = "CNV"
  ) %>%
    dplyr::mutate_if(is.character, as.factor)

  .CNV_sum <-
    table(.TCGA_CNV_anno_subset_long[, c("CNV", "variable")]) %>%
    # tibble(.) %>%
    as.data.frame() %>%
    dplyr::mutate(CNV = factor(.data$CNV,
                               levels = c("-2", "-1", "0", "1", "2")))

  # reorder stack bars by the frequency of duplication.
  Freq.sum <- reshape2::dcast(.CNV_sum, variable ~ CNV, mean)
  .CNV_sum$variable <- factor(.CNV_sum$variable,
    levels = Freq.sum[order(Freq.sum$`1`), ]$variable
  )

  return(.CNV_sum)
}

#' @title Stacked bar plots of the CNV status summary
#'
#' @description
#'
#' This function generates the stacked bar plots to the summary
#' of CNV statuses
#'
#' @details
#'
#' This plot function uses dataset generated from
#' [.CNV_all_cancer()] function
#'
#' It should not be used directly, only inside
#' [.plot_bargraph_CNV_TCGA()]function.
#'
#' Side effect: stacked bar plots on screen and as pdf file to show the summary
#'  table of the CNV statuses
#'
#' @family helper function for CNV data plotting
#'
#' @param data summary table from
#' `.CNV_all_cancer(.TCGA_CNV_sampletype_subset)`
#'
#' @examples \dontrun{
#' .CNV_sum_barplot(.CNV_all_cancer(.TCGA_CNV_sampletype_subset))
#' }
#'
#' @keywords internal
#'
.CNV_sum_barplot <- function(data) {
  p1 <- ggplot2::ggplot(
    data,
    aes_(
      fill = ~CNV,
      y = ~Freq,
      x = ~variable
    )
  ) +
    geom_bar(
      stat = "identity",
      position = "fill"
    ) +
    geom_col() +
    geom_text(aes(label = paste0(.data$Freq / 100, "%")),
      position = position_stack(vjust = 0.5),
      size = 4
    ) +
    # scale_y_continuous(labels = scales::percent_format())+
    labs(
      x = "Tumor types (TCGA pan cancer atlas 2018)",
      y = "All TCGA tumors combined"
    ) +
    coord_flip() +
    theme_bw() +
    theme(
      plot.title = black_bold_16,
      axis.title.x = black_bold_16,
      axis.title.y = element_blank(),
      axis.text.x = black_bold_16,
      axis.text.y = black_bold_16,
      panel.grid = element_blank(),
      legend.title = element_blank(),
      legend.text = black_bold_16,
      legend.position = "top",
      legend.justification = "left",
      legend.box = "horizontal",
      strip.text = black_bold_16
    ) +
    # Flip ordering of legend without altering ordering in plot
    guides(fill = guide_legend(reverse = TRUE)) +
    scale_fill_manual(
      name = "Copy number variation",
      breaks = c("-2", "-1", "0", "1", "2"),
      labels = c(
        "Deep del\n 0",
        "Shallow del\n 1",
        "Diploid\n 2",
        "Gain\n 3",
        "Amp\n 3+"
      ),
      values = c("darkblue", "blue", "lightgreen", "red", "darkred")
    )
  print(p1)
  ggplot2::ggsave(
    path = file.path(output_directory, "CNV"),
    filename = "EIF CNV summary.pdf",
    plot = p1,
    width = 9,
    height = 9,
    useDingbats = FALSE
  )

  return(NULL)
}

#' @title Calculates the frequency of CNV status in individual TCGA cancer types
#'
#' @description
#'
#' a data analysis function that calculates the frequency of
#' CNV status for each EIF4F gene in individual TCGA cancer types.
#'
#' @details
#'
#' It should not be used directly, only inside
#' [.plot_bargraph_CNV_TCGA()] function.
#'
#' @param df `.TCGA_CNV_sampletype_subset` generated inside
#' [.plot_bargraph_CNV_TCGA()]
#'
#' @param gene one gene from the input argument of
#' [.plot_bargraph_CNV_TCGA()]
#'
#' @family helper function for CNV data analysis
#'
#' @return
#'
#' a list with the summary table of CNV in individual
#' TCGA cancer types and gene name
#'
#' @importFrom forcats fct_rev
#'
#' @importFrom tidyselect all_of any_of
#'
#' @examples \dontrun{
#' lapply(EIF, .CNV_ind_cancer, df = .TCGA_CNV_sampletype_subset)
#' }
#'
#' @keywords internal
#'
.CNV_ind_cancer <- function(df, gene) {
  .TCGA_CNV_anno_subset_long <- df %>%
    dplyr::select(
      dplyr::all_of(gene),
      "sample.type",
      "primary.disease"
    ) %>%
    reshape2::melt(
      id = c(
        "sample.type",
        "primary.disease"
      ),
      value.name = "CNV"
    ) %>%
    dplyr::mutate_if(is.character, as.factor)

  .CNV_sum <-
    table(.TCGA_CNV_anno_subset_long[, c("CNV", "primary.disease")]) %>%
    # tibble(.) %>%
    as.data.frame() %>%
    dplyr::mutate(CNV = factor(.data$CNV,
                               levels = c("-2", "-1", "0", "1", "2"))) %>%
    dplyr::mutate(primary.disease = forcats::fct_rev(.data$primary.disease))

  return(list(.CNV_sum, gene))
}

#' @title Stacked bar plots of the CNV status
#'
#' @description
#'
#' This function generates stacked bar plots for CNV status of
#'  each gene in an individual cancer type.
#'
#' @details
#'
#' This plot function uses dataset generated from
#'  [.CNV_ind_cancer()] function
#'
#' It should not be used directly, only inside
#'  [.plot_bargraph_CNV_TCGA()] function.
#'
#' Side effect: stacked bar plots on screen and as saved pdf files to show
#'  CNV status of each gene in an individual cancer type.
#'
#' @family helper function for CNV data plotting
#'
#' @param df `.EIF_CNV_ind_cancer` generated inside
#'  [.plot_bargraph_CNV_TCGA()]
#'
#' @examples \dontrun{
#' lapply(.EIF_CNV_ind_cancer, .CNV_barplot)
#' }
#'
#' @keywords internal
#'
.CNV_barplot <- function(df) {
  p1 <- ggplot(
    df[[1]],
    aes_(
      fill = ~CNV,
      order = ~ as.numeric(CNV),
      y = ~Freq,
      x = ~primary.disease
    )
  ) +
    geom_bar(stat = "identity", position = "fill") +
    labs(
      x = "Tumor types (TCGA pan cancer atlas 2018)",
      y = paste0("Percentages of ", df[[2]], " CNVs")
    ) +
    coord_flip() +
    theme_bw() +
    theme(
      plot.title = black_bold_12,
      axis.title.x = black_bold_12,
      axis.title.y = element_blank(),
      axis.text.x = black_bold_12,
      axis.text.y = black_bold_12,
      panel.grid = element_blank(),
      legend.title = element_blank(),
      legend.text = black_bold_12,
      legend.position = "top",
      legend.justification = "left",
      legend.box = "horizontal",
      strip.text = black_bold_12
    ) +
    scale_y_continuous(labels = scales::percent_format()) +
    # Flip ordering of legend without altering ordering in plot
    guides(fill = guide_legend(reverse = TRUE)) +
    scale_fill_manual(
      name = "Copy number variation",
      breaks = c("-2", "-1", "0", "1", "2"),
      labels = c(
        "Deep del\n 0",
        "Shallow del\n 1",
        "Diploid\n 2",
        "Gain\n 3",
        "Amp\n 3+"
      ),
      values = c(
        "darkblue", "blue",
        "lightgreen", "red",
        "darkred"
      )
    )
  print(p1)
  ggplot2::ggsave(
    path = file.path(output_directory, "CNV"),
    filename = paste(df[[2]], "pancancer CNV.pdf"),
    plot = p1,
    width = 7.5,
    height = 9,
    useDingbats = FALSE
  )

  return(NULL)
}

#' @title Correlation matrix for gene pairs
#'
#' @description
#'
#' This function calculates the correlation coefficients
#'  between every two genes and plot the correlation matrix with the function.
#'
#' @details
#'
#' This plot function uses dataset `TCGA_CNV_value` generated
#'  from [initialize_cnv_data()] function.
#' It should not be used directly, only inside
#'  [.plot_matrix_CNVcorr_TCGA()] function.
#'
#' Side effects:
#'
#' (1) the correlation matrix plot on screen and as saved pdf files
#'
#' @family helper function for CNV data plotting
#'
#' @param df output of `TCGA_CNV_value \%>\% select(all_of(EIF))`
#' generated inside [.plot_matrix_CNVcorr_TCGA()]
#'
#' @importFrom corrplot cor.mtest corrplot
#'
#' @importFrom grDevices dev.off pdf
#'
#' @importFrom stats cor cor.test setNames
#'
#' @examples \dontrun{
#' TCGA_CNV_value %>%
#'   dplyr::select(all_of(gene_list)) %>%
#'   .matrix_plot()
#' }
#'
#' @keywords internal
#'
.matrix_plot <- function(df) {
  #M <- stats::cor(df)
  testRes <- corrplot::cor.mtest(df, conf.level = 0.95)

  p1 <- corrplot::corrplot(
    stats::cor(df),
    method      = "color",
    cl.pos      = "n", # remove color legend
    tl.cex      = 1,
    number.cex  = 1,
    addgrid.col = "gray",
    addCoef.col = "black",
    tl.col      = "black",
    type        = "lower",
    order       = "FPC",
    tl.srt      = 45,
    p.mat       = testRes$p,
    sig.level   = 0.05, # insig = "blank"
  )
  print(p1) # print correlation matrix on the screen
  # save correlation plot as a pdf file
  pdf(
    file.path(output_directory, "CNV", "EIF CNV Corrmatrix.pdf"),
    width = 9,
    height = 9,
    useDingbats = FALSE
  )
  corrplot::corrplot(
    stats::cor(df),
    method      = "color",
    cl.pos      = "n", # remove color legend
    tl.cex      = 1,
    number.cex  = 1,
    addgrid.col = "gray",
    addCoef.col = "black",
    tl.col      = "black",
    type        = "lower",
    order       = "FPC",
    tl.srt = 45,
    p.mat       = testRes$p,
    sig.level   = 0.05, # insig = "blank"
  )
  dev.off()

  return(NULL)
}

#' @title Calculates the frequency of CNV status in all TCGA cancer types combined
#'
#' @description
#'
#' This function selects the CNV ratio data in tumors vs
#'  adjacent normals from individual TCGA cancer types for each input gene.
#'
#' @details
#'
#' It should not be used directly, only inside
#' [.plot_boxgraph_CNVratio_TCGA()] function.
#'
#' @family helper function for CNV data analysis
#'
#' @param df `.TCGA_CNVratio_sampletype_subset` generated inside
#'  [.plot_boxgraph_CNVratio_TCGA()]
#'
#' @param gene one gene from the input argument of [.plot_boxgraph_CNVratio_TCGA()]
#'
#' @return
#'
#' a list with the data frame of CNV ratio of the input gene
#'  in individual TCGA cancer types and gene name
#'
#' @examples \dontrun{
#' lapply(EIF, .CNVratio_tumor, df = .TCGA_CNVratio_sampletype_subset)
#' }
#'
#' @keywords internal
#'
.CNVratio_tumor <- function(df, gene) {
  .CNVratio_data <- df %>%
    dplyr::select(dplyr::all_of(gene), "sample.type", "primary.disease") %>%
    reshape2::melt(
      id = c("sample.type", "primary.disease"),
      value.name = "CNV"
    ) %>%
    dplyr::mutate_if(is.character, as.factor) %>%
    dplyr::mutate(primary.disease = forcats::fct_rev(.data$primary.disease))

  return(list(.CNVratio_data, gene))
}

#' @title Box plots of the CNV ratios in tumors vs adjacent normals
#'
#' @description
#'
#' This function generates boxplot for CNV ratios in tumors
#' vs adjacent normals from individual TCGA cancer types.
#'
#' @details
#'
#' This plot function uses dataset `.TCGA_CNVratio_sampletype_subset`
#'  generated from [.plot_boxgraph_CNVratio_TCGA()] function.
#' It should not be used directly, only inside [.plot_boxgraph_CNVratio_TCGA()]
#'  function.
#'
#' Side effects:
#'
#' (1) boxplot on screen and as saved pdf files to show CNV ratios
#'  in tumors vs adjacent normals from individual TCGA cancer types.
#'
#' @family helper function for CNV data plotting
#'
#' @param df `.EIF_CNVratio_ind_cancer` generated
#' inside [.plot_boxgraph_CNVratio_TCGA()]
#'
#' @examples \dontrun{
#' lapply(.EIF_CNVratio_ind_cancer, .CNVratio_boxplot)
#' }
#'
#' @keywords internal
#'
.CNVratio_boxplot <- function(df) {
  p1 <- ggplot(
    data = df[[1]],
    aes_(
      y = ~ 2**CNV,
      x = ~primary.disease,
      # x = f.ordered1,
      color = ~primary.disease
    )
  ) +
    ylim(0, 3) +
    geom_hline(yintercept = 1, linetype = "dashed") +
    stat_n_text(
      size = 5,
      fontface = "bold",
      hjust = 0
    ) +
    geom_boxplot(
      alpha = .01,
      outlier.colour = NA,
      # size     = .75,
      # width    = 1,
      position = position_dodge(width = .9)
    ) +
    labs(
      x = "primary disease",
      y = paste(df[[2]], "CNV ratio", "(tumor/normal)")
    ) +
    # scale_color_manual(values = col_vector) +
    coord_flip() +
    theme_bw() +
    theme(
      plot.title = black_bold_12,
      axis.title.x = black_bold_12,
      axis.title.y = element_blank(),
      axis.text.x = black_bold_12,
      axis.text.y = black_bold_12,
      panel.grid = element_blank(),
      legend.title = element_blank(),
      legend.text = black_bold_12,
      legend.position = "none",
      legend.justification = "left",
      legend.box = "horizontal",
      strip.text = black_bold_12
    )
  print(p1)
  ggplot2::ggsave(
    path = file.path(output_directory, "CNV"),
    filename = paste0(df[[2]], "pancancer CNVratio.pdf"),
    plot = p1,
    width = 7,
    height = 9,
    useDingbats = FALSE
  )

  return(NULL)
}



## Composite function to call CNV data analysis and plotting ===================

#' @title Summary of CNV statuses in bar plots
#'
#' @description
#'
#' Provides the summary of CNV statuses of EIF4F genes in tumors from all TCGA
#' cancer types combined and in tumors from individual TCGA cancer types
#'
#' @param gene_list gene names in a vector of characters
#'
#' @details  This function
#'
#' * selects a subset of CNV and sample type data of only EIF4F gene from the
#'  data frame `TCGA_CNV_sampletype` - a global variable generated from
#'  [initialize_cnv_data()].
#' * uses the subset data `.TCGA_CNV_sampletype_subset` to perform the
#'  CNV status analysis of all inquired genes with the internal functions
#'  [.CNV_all_cancer()] and plot the results as a bar plot
#'  with the internal function [.CNV_sum_barplot()].
#' * uses the same subset data `.TCGA_CNV_sampletype_subset` to perform CNV
#'  status analysis for each gene across all tumors types by the internal
#'  function [.CNV_ind_cancer()] and plots the CNV results by the internal
#'  function [.CNV_barplot()].
#'
#' This function is not accessible to the user and will not show at the users'
#' workspace. It can only be called by the exported [EIF4F_CNV_analysis()] function.
#'
#' Side effects:
#'
#' (1) the stacked bar plots on screen and as pdf files to show the summary
#'  table of the CNV statuses of all `gene_list` in TCGA tumors
#'
#' (2) stacked bar plots on screen and as saved pdf files to show
#'  CNV status of each gene in `gene_list` from an individual cancer type
#'
#' @family composite function to call CNV data analysis and plotting
#'
#' @examples \dontrun{
#' plot_bargraph_CNV_TCGA(c(
#'   "TP53", "EIF4A1", "EIF4A2", "EIF4E",
#'   "EIF4E2", "EIF4E3", "MYC", "EIF3D", "EIF4EBP1", "EIF4G1", "EIF4G2",
#'   "EIF4G3", "PABPC1", "MKNK1", "MKNK2"
#' ))
#' }
#'
#' @keywords internal
#'
.plot_bargraph_CNV_TCGA <- function(gene_list) {
  .TCGA_CNV_sampletype_subset <- NULL
  # combine CNV data with annotation data
  .TCGA_CNV_sampletype_subset <- TCGA_CNV_sampletype %>%
    dplyr::select(dplyr::all_of(gene_list), "sample.type", "primary.disease")

  # stacked bar plots for CNV status in combined TCGA tumors
  .CNV_all_cancer(.TCGA_CNV_sampletype_subset) %>%
    .CNV_sum_barplot()

  # stacked bar plots for CNV status in each TCGA tumor type
  .EIF_CNV_ind_cancer <- lapply(gene_list,
    .CNV_ind_cancer,
    df = .TCGA_CNV_sampletype_subset
  )

  lapply(.EIF_CNV_ind_cancer, .CNV_barplot)

  return(NULL)
}


#' @title Correlation matrix for CNV values
#'
#' @description
#'
#' generates correlation matrix for eIF4F CNV in tumors
#' from all TCGA cancer type combined
#'
#' @param gene_list gene names in a vector of characters
#'
#' @details This function
#'
#' * selects a subset of unthreshold CNV values of only EIF4F gene
#' from the data frame `TCGA_CNV_value` - a global variable generated from [initialize_cnv_data()].
#' * uses the subset data to calculate the correlation coefficients
#' between every two genes and plot the correlation matrix with the function
#' [.matrix_plot()]
#'
#' This function is not accessible to the user and will not show at the users'
#' workspace. It can only be called by the exported [EIF4F_CNV_analysis()] function.
#'
#' side effect: the correlation matrix plot on screen and as saved pdf files
#'  to show co-occurrence of `gene_list` CNV status in TCGA tumors
#'
#' @family Composite function to call CNV data analysis and plotting
#'
#' @examples \dontrun{
#' plot_matrix_CNVcorr_TCGA(c("EIF4A1", "EIF4E", "EIF4EBP1", "EIF4G1"))
#' }
#'
#' @keywords internal
#'
.plot_matrix_CNVcorr_TCGA <- function(gene_list) {
  TCGA_CNV_value %>%
    dplyr::select(dplyr::all_of(gene_list)) %>%
    .matrix_plot()

  return(NULL)
}


#' @title CNV ratios in tumors vs adjacent normal tissues across tumor types
#'
#' @description
#'
#' This function generates boxplot for CNV ratios in tumors vs adjacent normals
#' from individual TCGA cancer types.
#'
#' @param gene_list gene names in a vector of characters
#'
#' @details This function
#'
#' * selects a subset of CNV and sample type data of only EIF4F gene from the
#'  data frame `TCGA_CNVratio_sampletype` - a global variable generated from
#'  [initialize_cnv_data()].
#' * uses the subset data `.TCGA_CNVratio_sampletype_subset` to perform CNV
#'  status analysis for each gene across all tumors types with the internal
#'  function [.CNVratio_tumor()] and plots the CNV results with the internal
#'  function [.CNVratio_boxplot()].
#'
#' This function is not accessible to the user and will not show at the users'
#' workspace. It can only be called by the exported [EIF4F_CNV_analysis()] function.
#'
#' Side effects:
#'
#' (1) the box plots on screen and as saved pdf files to show CNV ratio of
#'  `gene_list` in TCGA tumors vs normals
#'
#' @family composite function to call CNV data analysis and plotting
#'
#' @examples \dontrun{
#' plot_boxgraph_CNVratio_TCGA(c("EIF4A1", "EIF4E", "EIF4EBP1", "EIF4G1"))
#' }
#'
#' @keywords internal
#'
.plot_boxgraph_CNVratio_TCGA <- function(gene_list) {
  .TCGA_CNVratio_sampletype_subset <- NULL
  .TCGA_CNVratio_sampletype_subset <- TCGA_CNVratio_sampletype %>%
    dplyr::select(
      dplyr::all_of(gene_list),
      "sample.type",
      "primary.disease"
    )

  .EIF_CNVratio_ind_cancer <- lapply(gene_list,
    .CNVratio_tumor,
    df = .TCGA_CNVratio_sampletype_subset
  )

  lapply(.EIF_CNVratio_ind_cancer, .CNVratio_boxplot)

  return(NULL)
}


## Wrapper function to call all internal composite functions with provided inputs =======

#' @title Perform all CNV related analysis and generate plots
#'
#' @description
#'
#' A wrapper function to call all internal composite functions for CNV analysis with
#' provided inputs
#'
#' @details
#'
#' This function run three internal composite functions together with inputs:
#'
#'  * [.plot_bargraph_CNV_TCGA()]
#'  * [.plot_matrix_CNVcorr_TCGA()]
#'  * [.plot_boxgraph_CNVratio_TCGA()]
#'
#' Side effect:
#'
#' (1) stacked bar plots on screen and as pdf file to show the summary
#'  table of the CNV statuses of all EIF genes in TCGA tumors
#'
#' (2) stacked bar plots on screen and as saved pdf files to show
#'  CNV status of each EIF gene from an individual cancer type
#'
#' (3) correlation matrix plot on screen and as saved pdf files to show
#'   co-occurrence of EIF genes CNV status in TCGA tumors
#'
#' (4) box plots on screen and as saved pdf files to show CNV ratio
#'   of EIF genes in TCGA tumors vs normals
#'
#' @family wrapper function to call all composite functions with inputs
#'
#' @export
#'
#' @examples \dontrun{
#' EIF4F_CNV_analysis()
#' }
#'
EIF4F_CNV_analysis <- function() {
  .plot_bargraph_CNV_TCGA(c(
    "TP53", "EIF4A1", "EIF4A2",
    "EIF4E", "EIF4E2", "EIF4E3",
    "MYC", "EIF3D", "EIF4EBP1",
    "EIF4G1", "EIF4G2", "EIF4G3",
    "EIF4H", "EIF4B"
  ))

  .plot_matrix_CNVcorr_TCGA(c(
    "TP53", "EIF4A1", "EIF4A2",
    "EIF4E", "EIF4E2", "EIF4E3",
    "MYC", "EIF3D", "EIF4EBP1",
    "EIF4G1", "EIF4G2", "EIF4G3",
    "EIF4H", "EIF4B"
  ))

  .plot_boxgraph_CNVratio_TCGA(c(
    "EIF4G1", "EIF4E", "EIF4A1", "EIF4EBP1", "EIF4G2", "EIF4G3",
    "EIF4E2", "EIF4E3", "EIF4A2", "EIF3D", "EIF4H", "EIF4B"
  ))

  return(NULL)
}
a3609640/eIF4F.analysis documentation built on Jan. 2, 2023, 11:19 p.m.