R/limma.R

Defines functions plotVolcano limmaFit dprocess_dgeList dgeList

Documented in dgeList dprocess_dgeList limmaFit plotVolcano

#' Construct a DGEList Object
#'
#' This function creates a `DGEList` object from a count matrix, sample
#' information, and feature information. It is designed to facilitate the
#' analysis of differential gene expression using the edgeR package.
#'
#' @param count A numeric matrix where rows represent features (e.g., genes) and
#'   columns represent samples. Row names should correspond to feature identifiers,
#'   and column names should correspond to sample identifiers.
#' @param sample.info A data frame containing information about the samples. The
#'   number of rows should match the number of columns in the `count` matrix.
#' @param feature.info A data frame containing information about the features. The
#'   number of rows should match the number of rows in the `count` matrix.
#'
#' @return A `DGEList` object as defined by the edgeR package, which includes the
#'   count data, sample information, and feature information.
#' @import data.table
#' @export
dgeList <- function(count, sample.info, feature.info) {
  stopifnot(rownames(count) == rownames(feature.info))
  stopifnot(colnames(count) == rownames(sample.info))
  x <- edgeR::DGEList(count, samples = sample.info, genes = feature.info)

  x
}


#' Filter Low-Expressed Genes and Normalize DGEList Data
#'
#' This function filters out low-expressed genes from a `DGEList` object and
#' normalizes the count data. It also provides diagnostic plots for raw and
#' filtered data.
#'
#' @param x A `DGEList` object containing raw count data.
#' @param group.column The name of the column in `x$samples` that contains the
#'   grouping information for the samples.
#' @param min.count The minimum number of counts required for a gene to be
#'   considered expressed. Genes with counts below this threshold in any group
#'   will be filtered out. Defaults to 10.
#'
#' @return The function returns a `DGEList` object with low-expressed genes
#'   filtered out and normalization factors calculated.
#' @import data.table
#' @import grDevices
#' @import graphics
#' @import stats
#' @import utils
#' @export
dprocess_dgeList <- function(x, group.column, min.count = 10) {
  lcpm <- edgeR::cpm(x, log = TRUE, prior.count = 2)
  # filter low expressed genes
  keep_exprs <- edgeR::filterByExpr(x, group = x$samples[[group.column]], min.count = min.count)
  x <- x[keep_exprs, , keep.lib.sizes = FALSE]

  nsamples <- ncol(x)
  if (nsamples > 10) nsamples <- sample(ncol(x), 10)

  oldpar <- par(no.readonly = TRUE)
  on.exit(par(oldpar))

  par(mfrow = c(1, 2))
  plot(density(lcpm[, nsamples[[1]]]), lwd = 2, ylim = c(0, 1), las = 2, main = "", xlab = "")
  title(main = "A. Raw data", xlab = "Log-cpm")
  for (i in nsamples) {
    den <- density(lcpm[, i])
    lines(den$x, den$y, lwd = 2, col = sample(colors(), 1))
  }
  lcpm <- edgeR::cpm(x, log = TRUE)
  plot(density(lcpm[, nsamples[[1]]]), lwd = 2, ylim = c(0, 1), las = 2, main = "", xlab = "")
  title(main = "B. Filtered data", xlab = "Log-cpm")
  for (i in nsamples) {
    den <- density(lcpm[, i])
    lines(den$x, den$y, lwd = 2, col = sample(colors(), 1))
  }


  # Normalize the data
  x <- edgeR::calcNormFactors(x)
  lcpm <- edgeR::cpm(x, log = TRUE)
  boxplot(lcpm, las = 2)
  title("Normalized data")
  limma::plotMDS(lcpm,
    label = x$samples[[group.column]],
    col = as.integer((x$samples[[group.column]])), dim = c(1, 2)
  )
  title("MDS")

  x
}


#' Fit a Linear Model for RNA-seq data using limma
#'
#' This function fits a linear model to processed `DGEList` data using the
#' `limma` package. It defines contrasts between groups and performs
#' differential expression analysis.
#'
#' @param x A processed `DGEList` object containing normalized count data.
#' @param group.column The name of the column in `x$samples` that contains
#'   the grouping information for the samples.
#'
#' @return An `eBayes` object containing the fitted linear model and
#'   results of the differential expression analysis.
#' @import data.table
#' @importFrom limma makeContrasts
#' @export
limmaFit <- function(x, group.column) {
  oldpar <- par(no.readonly = TRUE)
  on.exit(par(oldpar))

  design <- model.matrix(~ 0 + x$samples[[group.column]])
  colnames(design) <- gsub(".*\\]\\]", "", colnames(design))

  all_vs <- utils::combn(unique(x$samples[[group.column]]), 2, simplify = TRUE)
  all_vs2 <- str2expression(paste0(all_vs[1, ], "-", all_vs[2, ]))
  all_vs2 <- setNames(as.list(all_vs2), paste0(all_vs[1, ], "vs", all_vs[2, ]))
  contr_matrix <- do.call("makeContrasts", c(all_vs2, levels = list(colnames(design))))

  par(mfrow = c(1, 2))
  v <- limma::voom(x, design, plot = TRUE)
  vfit <- limma::lmFit(v, design)
  vfit <- limma::contrasts.fit(vfit, contrasts = contr_matrix)
  efit <- limma::eBayes(vfit)
  limma::plotSA(efit, main = "Final model: Mean-variance trend")

  setattr(efit, name = "design", design)
  setattr(efit, name = "contrast", contr_matrix)

  efit
}


#' Plot Volcano Plot for Differentially Expressed Genes
#'
#' This function generates a volcano plot for differentially expressed genes
#' (DEGs) using `ggplot2`. It allows for customization of the plot with
#' different aesthetic parameters.
#'
#' @param data A data frame containing the DEGs result.
#' @param data.text A data frame containing labeled data for text annotation.
#' @param x variable representing the aesthetic for the x-axis.
#' @param y variable representing the aesthetic for the y-axis.
#' @param color variable representing the column name for the color aesthetic.
#' @param label variable representing the column name for the text label aesthetic.
#'
#' @return A `ggplot` object representing the volcano plot.
#' @import ggplot2
#' @export
plotVolcano <- function(data, data.text, x, y, color, label) {
  p <- ggplot(data, aes(x = {{ x }}, y = {{ y }})) +
    geom_point(aes(color = {{ color }})) +
    theme_classic() +
    scale_color_manual(
      values = c("#414788FF", "darkgray", "#22A884FF"),
      guide = guide_legend(override.aes = list(size = 4))
    )

  if (!missing(data.text)) {
    p + ggrepel::geom_text_repel(
      data = data.text, aes(label = {{ label }}),
      arrow = arrow(length = unit(0.1, "cm")),
      max.overlaps = 20,
      size = 4,
      box.padding = unit(0.5, "lines"),
      min.segment.length = 0,
      point.padding = unit(0.8, "lines"),
      segment.color = "black",
      show.legend = FALSE
    )
  }

  p
}

Try the easybio package in your browser

Any scripts or data that you put into this service are public.

easybio documentation built on Sept. 17, 2024, 1:08 a.m.