R/feature_selection.R
In IOBR: Immune Oncology Biological Research

Documented in feature_select limma.dif

#' Feature Selection via Correlation or Differential Expression
#'
#' Selects informative features using either correlation with a quantitative
#' response or differential expression (limma) for binary/continuous
#' responses.
#'
#' @param x Numeric matrix. Features (rows) by samples (columns).
#' @param y Numeric or factor. Response vector (quantitative or binary).
#' @param method Character. "cor" (correlation) or "dif" (differential expression). Default c("cor","dif").
#' @param family Character. Correlation method if method = "cor": "spearman" or "pearson".
#' @param cutoff Numeric. Absolute correlation (for cor) or |log2FC| (for dif) threshold.
#' @param padjcut Numeric. Adjusted p-value cutoff.
#'
#' @import dplyr
#' @import tibble
#' @import tidyr
#' @import purrr
#' @import stringr
#' @import glmnet
#'
#' @return Character vector of selected feature names.
#' @export
#'
#' @examples
#' # Simulate data
#' set.seed(123)
#' sim_eset <- matrix(rnorm(100 * 50), 100, 50)
#' rownames(sim_eset) <- c("PDCD1", paste0("Gene", 2:100))
#' colnames(sim_eset) <- paste0("Sample", 1:50)
#' pd1 <- as.numeric(sim_eset["PDCD1", ])
#' group <- ifelse(pd1 > mean(pd1), "high", "low")
#' 
#' # Correlation method
#' pd1_cor <- feature_select(
#'   x = sim_eset, y = pd1, method = "cor",
#'   family = "pearson", padjcut = 0.05, cutoff = 0.5
#' )
#' 
#' # Differential expression method
#' pd1_dif <- feature_select(
#'   x = sim_eset, y = pd1, method = "dif",
#'   padjcut = 0.05, cutoff = 2
#' )
#' pd1_dif_2 <- feature_select(
#'   x = sim_eset, y = group,
#'   method = "dif", padjcut = 0.05, cutoff = 2
#' )
feature_select <- function(x, y, method = c("cor", "dif"),
                           family = c("spearman", "pearson"),
                           cutoff = NULL, padjcut = NULL) {
  method <- match.arg(method)
  if (length(unique(y)) == 1) {
    stop("There are only one constant value in y, y must be binary or quantitative value.")
  }
  type <- ifelse(length(unique(y)) == 2, "Binary", "quantitative")
  if (length(unique(y)) == 2) {
    message("Deteching two levels in y, we will treat y as a binary varibale")
  }
  if (length(unique(y)) > 2) {
    message("Deteching more than two levels in y, we will treat y as a quantitative varibale")
  }

  if (method == "cor") {
    if (type != "quantitative") {
      stop("Correlation between x and y, y must be quantitative")
    }
    Gene <- rownames(x)
    x <- tibble::as_tibble(t(x))
    tmp <- x %>%
      map(cor.test, y, method = family)
    pvalue <- tmp %>% map_dbl("p.value")
    estimate <- tmp %>% map_dbl("estimate")
    P.adj <- p.adjust(as.numeric(pvalue), method = "fdr")
    cor_feature <- tibble(Gene = Gene, P.adj = P.adj, Estimate = estimate)
    cor_feature <- cor_feature %>%
      filter(P.adj < padjcut, abs(Estimate) > cutoff) %>%
      select("Gene")
    feature <- cor_feature$Gene %>% as.character()
  }
  if (method == "dif") {
    if (type == "quantitative") {
      message("For quantitative varibale, upper 25% and bottom 25% samples
              were treated as upregulated group and downregulated group.")
      up <- which(y > quantile(y, 0.75))
      down <- which(y < quantile(y, 0.25))
      pdata <- data.frame(
        samples = c(colnames(x)[up], colnames(x)[down]),
        group = c(rep("up", length(up)), rep("down", length(down)))
      )
      exprdata <- x[, c(up, down)]
      contrastfml <- c("up - down")
    }
    if (type == "Binary") {
      if (length(unique(y)) != 2) {
        stop("y must be binary feature, please check your data carefully")
      }
      pdata <- data.frame(samples = colnames(x), group = y)
      contrastfml <- paste(unique(y)[1], "-", unique(y)[2])
      exprdata <- x
    }
    dif <- limma.dif(exprdata = exprdata, pdata = pdata, contrastfml = contrastfml)
    dif <- data.frame(Probe = rownames(dif), dif)
    dif_feature <- dif %>%
      as_tibble() %>%
      filter(abs(logFC) > cutoff, adj.P.Val < padjcut) %>%
      select("Probe")
    feature <- dif_feature$Probe %>% as.character()
  }
  return(feature)
}

#' Differential Expression Analysis Using Limma
#'
#' Performs differential expression analysis using the limma package on a
#' given gene expression dataset.
#' Constructs a design matrix from phenotype data, fits a linear model,
#' applies contrasts, and computes
#' statistics for differential expression.
#'
#' @param exprdata A matrix with rownames as features like gene symbols or cgi, and colnames as samples.
#' @param pdata A two-column dataframe where the first column matches the colnames of exprdata and the second column contains the grouping variable.
#' @param contrastfml A character vector for contrasts to be tested (see ?makeContrasts for more details).
#'
#' @return Returns a dataframe from limma::topTable, which includes genes as rows and columns like genelist, logFC, AveExpr, etc.
#' @export
#' @examples
#' # Toy example with 100 genes and 6 samples
#' set.seed(123)
#' exprdata <- matrix(
#'   rnorm(100 * 6),
#'   nrow = 100,
#'   ncol = 6,
#'   dimnames = list(
#'     paste0("gene", 1:100),
#'     paste0("sample", 1:6)
#'   )
#' )
#'
#' # Phenotype data: 3 vs 3
#' pdata <- data.frame(
#'   sample = colnames(exprdata),
#'   group = rep(c("group1", "group2"), each = 3),
#'   stringsAsFactors = FALSE
#' )
#'
#' # Differential expression: group1 vs group2
#' res <- limma.dif(
#'   exprdata = exprdata,
#'   pdata = pdata,
#'   contrastfml = "group1 - group2"
#' )
#' head(res)
limma.dif <- function(exprdata, pdata, contrastfml) {
  group_list <- as.character(pdata[, 2])
  design <- model.matrix(~ 0 + factor(group_list))
  colnames(design) <- levels(as.factor(pdata[, 2]))
  rownames(design) <- colnames(exprdata)
  if (!all(colnames(exprdata) == pdata[, 1])) {
    stop(" expression data do not match pdata")
  }
  rlang::check_installed("limma")
  contrast.matrix <- limma::makeContrasts(contrasts = contrastfml, levels = design)
  fit <- limma::lmFit(exprdata, design)
  fit <- limma::contrasts.fit(fit, contrast.matrix)
  fit <- limma::eBayes(fit)
  dif <- limma::topTable(fit, adjust.method = "BH", coef = contrastfml, number = Inf)
  return(dif)
}