R/EdgeRAnalyze.R

Defines functions edgeR_analyze

Documented in edgeR_analyze

#' Differential Gene Expression Analysis using 'edgeR'
#'
#' This function performs differential gene expression analysis using the 'edgeR' package.
#' It reads tumor and normal expression data, merges them, filters low-expressed genes,
#' normalizes the data, performs edgeR analysis, and outputs the results along with information
#' on gene expression changes.
#'
#' @importFrom tibble column_to_rownames
#' @importFrom dplyr mutate
#' @importFrom edgeR DGEList cpm calcNormFactors estimateGLMCommonDisp estimateGLMTrendedDisp estimateGLMTagwiseDisp glmFit glmLRT topTags
#' @importFrom stats model.matrix
#' @param tumor_file Path to the tumor data file (RDS format).
#' @param normal_file Path to the normal data file (RDS format).
#' @param output_file Path to save the output DEG data (RDS format).
#' @param logFC_threshold Threshold for log fold change for marking up/down-regulated genes.
#' @param p_value_threshold Threshold for p-value for filtering significant genes.
#' @return A data frame of differential expression results.
#' @references
#' edgeR: Differential analysis of sequence read count data.
#' For more information, visit the edgeR Bioconductor page:
#' https://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
#' @export
#'
#' @examples
#' # Define file paths for tumor and normal data from the data folder
#' tumor_file <- system.file("extdata",
#'                          "removebatch_SKCM_Skin_TCGA_exp_tumor_test.rds",
#'                          package = "TransProR")
#' normal_file <- system.file("extdata",
#'                            "removebatch_SKCM_Skin_Normal_TCGA_GTEX_count_test.rds",
#'                            package = "TransProR")
#' output_file <- file.path(tempdir(), "DEG_edgeR.rds")
#'
#' DEG_edgeR <- edgeR_analyze(
#'   tumor_file = tumor_file,
#'   normal_file = normal_file,
#'   output_file = output_file,
#'   2.5,
#'   0.01
#' )
#'
#' # View the top 5 rows of the result
#' head(DEG_edgeR, 5)
#'
edgeR_analyze <- function(tumor_file, normal_file, output_file, logFC_threshold = 2.5, p_value_threshold = 0.01) {
  tumor <- readRDS(tumor_file)
  normal <- readRDS(normal_file)

  # Merge the datasets, ensuring both have genes as row names
  all_count_exp <- merge(tumor, normal, by = "row.names")
  all_count_exp <- tibble::column_to_rownames(all_count_exp, var = "Row.names")

  # Define groups for tumor and normal samples
  group <- c(rep('tumor', ncol(tumor)), rep('normal', ncol(normal)))
  group <- factor(group, levels = c("normal", "tumor"))
  group_table <- table(group)

  message("Group Table:")
  message(paste(names(group_table), group_table, sep = ": ", collapse = "\n"))
  # Add a space after the output for separation
  message(" ")

  # Create DGEList object for gene expression data and group information
  d <- edgeR::DGEList(counts = all_count_exp, group = group)

  # Filter lowly expressed genes based on CPM values
  keep <- rowSums(edgeR::cpm(d) > 1) >= 2
  d <- d[keep, , keep.lib.sizes = FALSE]

  # Update library size information in the samples
  d$samples$lib.size <- colSums(d$counts)

  # Normalize the data using the TMM method
  d <- edgeR::calcNormFactors(d)

  # Create design matrix for differential analysis model
  design <- stats::model.matrix(~0 + factor(group))
  rownames(design) <- colnames(d)
  colnames(design) <- levels(factor(group))

  # Estimate dispersions - common dispersion, trended dispersion, tagwise dispersion
  d <- edgeR::estimateGLMCommonDisp(d, design)
  d <- edgeR::estimateGLMTrendedDisp(d, design)
  d <- edgeR::estimateGLMTagwiseDisp(d, design)

  # Fit the model using Generalized Linear Model (GLM)
  fit <- edgeR::glmFit(d, design)

  # Perform differential expression analysis using Likelihood Ratio Test (LRT)
  lrt <- edgeR::glmLRT(fit, contrast = c(-1, 1)) # Note that the 'contrast' here is different from 'DESeq2'. Here, we only need to input c(-1, 1): -1 corresponds to normal, 1 corresponds to tumor.

  # Retrieve top differentially expressed genes
  nrDEG <- edgeR::topTags(lrt, n = nrow(d))
  DEG_edgeR <- as.data.frame(nrDEG)

  # Add 'change' column to mark up/down-regulated genes
  k1 <- (DEG_edgeR$PValue < p_value_threshold) & (DEG_edgeR$logFC < -logFC_threshold)
  k2 <- (DEG_edgeR$PValue < p_value_threshold) & (DEG_edgeR$logFC > logFC_threshold)
  DEG_edgeR <- dplyr::mutate(DEG_edgeR, change = ifelse(k1, "down", ifelse(k2, "up", "stable")))

  change_table <- table(DEG_edgeR$change)

  message("Change Table:")
  message(paste(names(change_table), change_table, sep = ": ", collapse = "\n"))
  # Add a space after the output for separation
  message(" ")

  # Save results to the specified output file
  saveRDS(DEG_edgeR, file = output_file)

  return(DEG_edgeR)
}

Try the TransProR package in your browser

Any scripts or data that you put into this service are public.

TransProR documentation built on April 4, 2025, 3:16 a.m.