MolEndoMatch: Molecular Endophenotype Patient Matchmaking

Documented in data.fitDataToNormal

#' Fit non-normal data to a normal distribution.
#'
#' Here, we conduct a two stage fitting process to the normal distribution:
#' quantile normalization (applied twice, with a transpose in between),
#' followed by a non-paranormal transformation. Every value is then expressed
#' as a z-score relative to the mean and standard deviation of the control
#' samples. Variables whose control standard deviation rounds to 0 (at 4
#' decimal places) are removed, so the result can have fewer columns than
#' there were variables. You can choose to substitute by other fitting
#' methods, too.
#' @param data - A matrix of floating point numbers.
#' @param isControlSample - A vector of 1/0 flags. Entries equal to 1 select
#'   the rows of the quantile-normalized data that are treated as control
#'   samples. At least two entries must equal 1.
#' @return A data frame of z-scores with one row per row of the normalized
#'   data and one column per retained variable.
#' @export data.fitDataToNormal
#' @examples
#' dat <- matrix(rexp(81), nrow = 9)
#' data.zscore <- data.fitDataToNormal(dat, c(1,1,0,0,0,1,1,1,1))
data.fitDataToNormal <- function(data, isControlSample) {
  # Quantile normalize twice, transposing the result of the first pass.
  # NOTE(review): the transposed result is never transposed back, so unless
  # data.quantileNormalize changes orientation itself, the rows selected by
  # isControlSample below correspond to the columns of `data` - confirm
  # against data.quantileNormalize.
  dataQN <- t(data.quantileNormalize(as.data.frame(data)))
  dataQN <- data.quantileNormalize(as.data.frame(dataQN))

  controlRows <- which(isControlSample == 1)
  if (length(controlRows) < 2L) {
    # A standard deviation cannot be estimated from fewer than two samples.
    stop("isControlSample must flag at least two control samples (== 1).",
         call. = FALSE)
  }

  # Next, we apply a non-paranormal transformation, once to all samples and
  # once to the control samples on their own. drop = FALSE keeps the subset
  # two-dimensional regardless of how many rows are selected.
  dataQN.npn <- huge.npn(dataQN)
  dataQN.controls <- huge.npn(dataQN[controlRows, , drop = FALSE])

  # Per-variable mean and standard deviation of the control distribution.
  controlMean <- apply(dataQN.controls, 2, mean)
  controlSd <- apply(dataQN.controls, 2, sd)

  # Remove variables that show no variability in the control distribution
  # (standard deviation of 0 after rounding to 4 decimals). which() also
  # discards variables whose standard deviation is NA.
  keepTheseMiRNAs <- which(round(controlSd, 4) > 0)

  # drop = FALSE keeps a matrix even when a single variable is retained.
  dataQN.npn <- as.matrix(dataQN.npn)[, keepTheseMiRNAs, drop = FALSE]

  # Convert raw scores to z-scores against the control distribution:
  # (value - control mean) / control sd, column by column.
  centered <- sweep(dataQN.npn, 2, controlMean[keepTheseMiRNAs], "-")
  zscores <- sweep(centered, 2, controlSd[keepTheseMiRNAs], "/")

  # Row and column names of the transformed data carry over to the result.
  as.data.frame(zscores)
}