R/standardScreeningBinaryTrait.R

Defines functions standardScreeningBinaryTrait

# The function standardScreeningBinaryTrait computes widely used statistics for relating the columns of
# the input data frame (argument datExpr) to a binary sample trait (argument y). The statistics include
# Student t-test p-value and the corresponding local false discovery rate (known as q-value, Storey et al
# 2004), the fold change, the area under the ROC curve (also known as C-index), mean values etc. If the
# input option kruskalTest is set to TRUE, it also computes the kruskal Wallist test p-value and
# corresponding q-value. The kruskal Wallis test is a non-parametric, rank-based group comparison test.

standardScreeningBinaryTrait <- function(datExpr, y,
                                         corFnc = cor, corOptions = list(use = "p"),
                                         kruskalTest = FALSE, qValues = FALSE, var.equal = FALSE, na.action = "na.exclude",
                                         getAreaUnderROC = TRUE) {
  datExpr <- data.frame(datExpr, check.names = FALSE)
  levelsy <- levels(factor(y))
  if (length(levelsy) > 2) {
    stop("The sample trait y contains more than 2 levels. Please input a binary variable y")
  }
  if (length(levelsy) == 1) {
    stop("The sample trait y is constant. Please input a binary sample trait with some variation.")
  }
  yNumeric <- as.numeric(factor(y))
  if (length(yNumeric) != dim(datExpr)[[1]]) {
    stop("the length of the sample trait y does not equal the number of rows of datExpr")
  }
  pvalueStudent <- t.Student <- Z.Student <- rep(NA, dim(datExpr)[[2]])
  pvaluekruskal <- stat.Kruskal <- Z.Kruskal <- sign.Kruskal <- rep(NA, dim(datExpr)[[2]])
  nPresent <- rep(0, dim(datExpr)[[2]])
  AreaUnderROC <- rep(NA, dim(datExpr)[[2]])

  if (var.equal) {
    printFlush(paste(
      "Warning: T-test that assumes equal variances in each group is requested.\n",
      "This is not the default option for t.test. We recommend to use var.equal=FALSE."
    ))
  }

  corFnc <- match.fun(corFnc)
  corOptions$y <- yNumeric
  corOptions$x <- datExpr
  corPearson <- as.numeric(do.call(corFnc, corOptions))
  nGenes <- dim(datExpr)[[2]]
  nPresent1 <- as.numeric(t(as.matrix(!is.na(yNumeric) & yNumeric == 1)) %*% !is.na(datExpr))
  nPresent2 <- as.numeric(t(as.matrix(!is.na(yNumeric) & yNumeric == 2)) %*% !is.na(datExpr))
  nPresent <- nPresent1 + nPresent2

  for (i in 1:nGenes) {
    no.present1 <- nPresent1[i]
    no.present2 <- nPresent2[i]
    no.present <- nPresent[i]
    if (no.present1 < 2 | no.present2 < 2) {
      pvalueStudent[i] <- t.Student[i] <- NA
    } else {
      tst <- try(t.test(as.numeric(datExpr[, i]) ~ yNumeric, var.equal = var.equal, na.action = na.action),
        silent = TRUE
      )
      if (!inherits(tst, "try-error")) {
        pvalueStudent[i] <- tst$p.value
        t.Student[i] <- -tst$statistic
        # The - sign above is intentional to make the sign of t consistent with correlation
      } else {
        printFlush(paste(
          "standardScreeningBinaryTrait: An error ocurred in t.test for variable",
          i, ":\n", tst
        ))
        printFlush(paste("Will return missing value(s) for this variable.\n\n"))
      }
    }
    if (getAreaUnderROC) AreaUnderROC[i] <- rcorr.cens(datExpr[, i], yNumeric, outx = TRUE)[[1]]
    if (kruskalTest) {
      if (no.present < 5) {
        pvaluekruskal[i] <- stat.Kruskal[i] <- NA
      } else {
        kt <- try(kruskal.test(datExpr[, i] ~ factor(yNumeric), na.action = "na.exclude"), silent = TRUE)
        if (!inherits(kt, "try-error")) {
          pvaluekruskal[i] <- kt$p.value
          stat.Kruskal[i] <- kt$statistic
          # Find which side is higher
          r <- rank(datExpr[, i])
          means <- tapply(r, factor(yNumeric), mean, na.rm = TRUE)
          sign.Kruskal[i] <- 2 * ((means[1] < means[2]) - 0.5)
          # sign.Kruskal is 1 if the ranks in group 1 are smaller than in group 2
        } else {
          printFlush(paste(
            "standardScreeningBinaryTrait: An error ocurred in kruskal.test for variable",
            i, ":\n", kt
          ))
          printFlush(paste("Will return missing value(s) for this variable.\n\n"))
        }
      }
    }
  }
  q.Student <- rep(NA, length(pvalueStudent))
  rest1 <- !is.na(pvalueStudent)
  if (qValues) {
    x <- try(
      {
        q.Student[rest1] <- qvalue(pvalueStudent[rest1])$qvalues
      },
      silent = TRUE
    )
    if (inherits(x, "try-error")) {
      printFlush(paste(
        "Warning in standardScreeningBinaryTrait: function qvalue returned an error.\n",
        "calculated q-values will be invalid. qvalue error:\n\n", x, "\n"
      ))
    }
    if (kruskalTest) {
      q.kruskal <- rep(NA, length(pvaluekruskal))
      rest1 <- !is.na(pvaluekruskal)
      xx <- try(
        {
          q.kruskal[rest1] <- qvalue(pvaluekruskal[rest1])$qvalues
        },
        silent = TRUE
      )
      if (inherits(xx, "try-error")) {
        printFlush(paste(
          "Warning in standardScreeningBinaryTrait: function qvalue returned an error.\n",
          "calculated q-values will be invalid. qvalue error:\n\n", xx, "\n"
        ))
      }
    }
  }
  meanLevel1 <- as.numeric(apply(datExpr[!is.na(y) & y == levelsy[[1]], ], 2, mean, na.rm = TRUE))
  meanLevel2 <- as.numeric(apply(datExpr[!is.na(y) & y == levelsy[[2]], ], 2, mean, na.rm = TRUE))

  Z.Student <- qnorm(pvalueStudent / 2, lower.tail = FALSE) * sign(t.Student)
  if (kruskalTest) {
    Z.Kruskal <- qnorm(pvaluekruskal / 2, lower.tail = FALSE) * sign(stat.Kruskal)
  }


  stderr1 <- function(x) {
    no.present <- sum(!is.na(x))
    if (no.present < 2) {
      out <- NA
    } else {
      out <- sqrt(var(x, na.rm = TRUE) / no.present)
    }
    out
  } # end of function stderr1

  SE.Level1 <- as.numeric(apply(datExpr[y == levelsy[[1]] & !is.na(y), ], 2, stderr1))
  SE.Level2 <- as.numeric(apply(datExpr[y == levelsy[[2]] & !is.na(y), ], 2, stderr1))

  FoldChangeLevel1vsLevel2 <- ifelse(meanLevel1 / meanLevel2 > 1, meanLevel1 / meanLevel2, -meanLevel2 / meanLevel1)

  output <- data.frame(
    ID = dimnames(datExpr)[[2]], corPearson = corPearson,
    t.Student = t.Student,
    pvalueStudent = pvalueStudent,
    FoldChange = FoldChangeLevel1vsLevel2,
    meanFirstGroup = meanLevel1,
    meanSecondGroup = meanLevel2,
    SE.FirstGroup = SE.Level1,
    SE.SecondGroup = SE.Level2
  )

  if (getAreaUnderROC) {
    output$AreaUnderROC <- AreaUnderROC
  }

  if (kruskalTest) {
    output <- data.frame(output,
      stat.Kruskal = stat.Kruskal,
      stat.Kruskal.signed = sign.Kruskal * stat.Kruskal,
      pvaluekruskal = pvaluekruskal
    )
  }

  if (qValues && !inherits(x, "try-error")) output <- data.frame(output, q.Student)
  if (qValues & kruskalTest) {
    if (!inherits(xx, "try-error")) output <- data.frame(output, q.kruskal)
  }
  names(output)[3:5] <- paste(names(output)[3:5], levelsy[[1]], "vs", levelsy[[2]], sep = ".")
  output <- data.frame(output, nPresentSamples = nPresent)
  output
}
milescsmith/WGCNA documentation built on April 11, 2024, 1:26 a.m.