# R/data.fitDataToNormal.r

#' Fit non-normal data to a normal distribution.
#'
#' Here, we conduct a two stage fitting process to the normal distribution: quantile normalization applied
#' twice (once before and once after a transpose), followed by a non-paranormal transformation. Features
#' whose standard deviation across the control samples rounds to 0 at 4 decimal places are dropped, and
#' every remaining value is converted to a z-score against the control samples. You can choose to
#' substitute other fitting methods, too.
#' @param data - A matrix of floating point numbers.
#' @param isControlSample - A vector of 1s and 0s marking which samples are controls (1) and which are not (0).
#'     It is used to select rows of the matrix obtained after the transposed, quantile-normalized stage.
#'     NOTE(review): presumably data.quantileNormalize keeps the orientation of its input, in which case
#'     these entries line up with the *columns* of `data` - confirm against data.quantileNormalize.
#'     At least two entries must equal 1, since a standard deviation cannot be estimated otherwise.
#' @return A data frame of z-scores, with one row per row of the normalized matrix and one column per
#'     retained feature. Row and column names are those carried by the huge.npn output.
#' @export data.fitDataToNormal
#' @examples
#' data.zscore = data.fitDataToNormal(data_mat, c(1,1,0,0,0,1,1,1,1))
data.fitDataToNormal = function(data, isControlSample) {
  # Quantile normalize, transpose, then quantile normalize again, so that the normalization is applied
  # along both margins of the input.
  dataQN = t(data.quantileNormalize(as.data.frame(data)))
  dataQN = data.quantileNormalize(as.data.frame(dataQN))

  # Fail early with a clear message: with fewer than two controls no standard deviation can be estimated,
  # and the downstream steps would otherwise fail with an unrelated indexing error.
  controlRows = which(isControlSample == 1)
  if (length(controlRows) < 2) {
    stop("At least two control samples (isControlSample == 1) are required to compute z-scores.", call. = FALSE)
  }

  # Next, we apply a non-paranormal transformation (yes, it's called a *paranormal* transform, believe it or not!),
  # once to all samples and once to the control samples alone. drop = FALSE keeps the 2D shape of the subset.
  dataQN.npn = huge.npn(dataQN)
  dataQN.controls = huge.npn(dataQN[controlRows, , drop = FALSE])

  # Per-feature mean and standard deviation of the control distribution, computed once per feature.
  controlMean = apply(dataQN.controls, 2, mean)
  controlSd = apply(dataQN.controls, 2, sd)

  # Before we calculate z-scores, we remove features that show no variability in the control distribution,
  # i.e. features whose standard deviation is 0 after rounding to 4 decimal places.
  keepThesefeatures = which(round(controlSd, 4) > 0)
  controlMean = controlMean[keepThesefeatures]
  controlSd = controlSd[keepThesefeatures]
  # drop = FALSE keeps a matrix even when a single feature (or none) survives the filter.
  dataQN.npn = dataQN.npn[, keepThesefeatures, drop = FALSE]

  # Now, convert raw scores to z-scores: centre each feature on the control mean, then scale by the
  # (unrounded) control standard deviation.
  zscores = sweep(dataQN.npn, 2, controlMean, "-")
  zscores = sweep(zscores, 2, controlSd, "/")

  # Row and column names of the transformed matrix carry over to the returned data frame.
  data.zscore = as.data.frame(zscores)
  return(data.zscore)
}
# BRL-BCM/MolPhenoMatch documentation built on May 26, 2019, 6:38 a.m.