PRPS: Binary classification with PRPS, PRPS-ST and more

Documented in PStraining

#' PS training
#' @description  This is the wrap up function to select top features, estimate parameters, 
#'  and calculate PS (Prediction Strength) scores based on a given training data set. 
#' @details PS calculation is based on Golub 1999. In this wrap up function, we use four steps to calculate 
#'  PS scores and classification. The range of PS scores is [-1,1]. Before these four steps, there is an option
#'  for NA imputation, but standardization is required. The four steps are:
#' a) apply "standardize" to standardize input data matrix for each feature;
#' b) apply "getTrainingWeights" to select features and return weights for these features;
#' c) apply "getMeanOfGroupMeans" to get mean of group means for each selected feature;
#' d) use "apply" function to get PS scores for all samples with "getPS1sample", the formula is:
#'   \eqn{PS = (V_win − V_lose)/(V_win + V_lose)}
#' Here, where V_win and V_lose are the vote totals for the winning and losing features/traits for a given sample
#' 
#' The theoretical cutoff for PS is 0, in addition, we also classification based on Empirical Bayesian.
#' When we calculate a Empirical Bayes' probability, the 1st group in the input mean and sd vectors is treated
#'  as the test group. 
#' When we calculate the probabilities, we first calcualte probability that a sample belongs to either group, 
#'  and then use the 
#' following formula to get Empirical Bayes' probability:
#' \eqn{prob(x) = d_test(x)/(d_test(x) + d_ref(x))}
#' Here prob(x) is the Empirical Bayes' probability of a given sample, d_test(x) is the density value
#'  that a given sample belongs to the test group, d_ref(x) is the density value that a given sample belongs
#'   to the reference group.
#' Notice that the test and reference group is just the relative grouping, in fact, for this step, 
#' we often need to calculate Empirical Bayes' probabilities for a given sample from two different standing points.
#' 
#' This function also give classification for the training group and confusion matrix to compare PS classification
#'  with original group info for training data set.
#' If NAs are not imputed, they are ignored for feature selection, weight calculation, PS parameter estimation,
#'  and PS calculation.
#' @param trainDat training data set, a data matrix or a data frame, samples are in columns, and features/traits are in rows
#' @param selectedTraits  a selected trait list if available
#' @param groupInfo a known group classification, which order should be the same as in colnames of trainDat
#' @param refGroup the code for reference group, default is the 1st item in groupInfo
#' @param topN an integer to indicate how many top features to be selected
#' @param FDRcut  a FDR cutoff to select top features, which is only valid when topN is set as defaul NULL, 
#'  all features will be returned if both topN and FDRcut are set as default NULL
#' @param weightMethod  a string to indicate weight calculation method, there are five choices: 
#'  "limma" for for limma linear model based t value,"ttest" for t test based t value, 
#'  "MannWhitneyU" for Mann Whitney U based rank-biserial,"PearsonR" for Pearson correlation coefficient,
#'  "SpearmanR" for Spearman correlation coefficient, and the defualt value is "limma"
#' @param classProbCut a numeric variable within (0,1), which is a cutoff of Empirical Bayesian probability, 
#'  often used values are 0.8 and 0.9, default value is 0.9. Only one value is used for both groups, 
#'  the samples that are not included in either group will be assigned as UNCLASS
#' @param imputeNA a logic variable to indicate if NA imputation is needed, if it is TRUE, 
#'  NA imputation is processed before any other steps, the default is FALSE
#' @param byrow a logic variable to indicate direction for imputation, default is TRUE, 
#'  which will use the row data for imputation
#' @param imputeValue a character variable to indicate which value to be used to replace NA, default is "median", 
#'  the median value of the chose direction with "byrow" data to be used
#' @keywords PS training limma weight
#' @return A list with three items is returned: PS parameters for selected features, PS scores and classifications for training samples, and confusion matrix to compare classification based on PS scores and original classification.
#' \item{PS_pars}{a data frame with all parameters needed for PS calculation for each selected features}
#' \item{PS_train}{a data frame of PS score, true classification and its classification based on scores for all training samples}
#' \item{classCompare}{a confusion matrix list object that compare PS classification based on selected features and weights compared to input group classification for training data set}
#' \item{classTable}{a table to display comparison of PS classification based on selected features and weights compared to input group classification for training data set}
#' @references 
#' Golub TR, Slonim DK, Tamayo P, Huard C, Gaasenbeek M, Mesirov JP, et al. Molecular classification of cancer: 
#' class discovery and class prediction by gene expression monitoring. Science. 1999;286:531–7
#' @export
PStraining = function(trainDat, selectedTraits = NULL, groupInfo, refGroup = NULL, topN = NULL, FDRcut = NULL, 
                      weightMethod = c("ttest","limma","PearsonR", "SpearmanR", "MannWhitneyU"), classProbCut = 0.9,
                      imputeNA = FALSE, byrow = TRUE, imputeValue = c("median","mean")){

  groupInfo = as.character(groupInfo)
  if(is.null(refGroup)){
    refGroup = groupInfo[1]
  }
  
  weightMethod = weightMethod[1]
  
  imputeValue = imputeValue[1]
  
  ## impute NA if imputeNA is true
  if(imputeNA){
    trainDat = imputeNAs(dataIn = trainDat, byrow = byrow, imputeValue = imputeValue)
  }
  
  # for PS approach, should standardize data first
  trainDat = standardize(trainDat)
  
  # use reference group, we can select top features and calculate their weights
  weights = getTrainingWeights(trainDat = trainDat, selectedTraits = selectedTraits, groupInfo = groupInfo,
                               refGroup = refGroup, topN = topN, FDRcut = FDRcut, weightMethod = weightMethod)
  
  #  now, get mean of groups' means for each selected top features
  mean_2means = t(apply(trainDat[rownames(weights),], 1, getMeanOfGroupMeans, groupInfo = groupInfo, refGroup = refGroup))
  colnames(mean_2means) = c("meanOfGroupMeans","refGroupMean","testGroupMean")
  
  # in fact, the above two objects can be combined into one matrix, and put the two most important items into col1 and col2
  PS_pars = cbind( mean_2means[,1],weights[,1], mean_2means[,-1], weights[,-1])
  colnames(PS_pars)[1:2] = c(colnames(mean_2means)[1],colnames(weights)[1])
  
  # get PS scores for all samples
  PS_score = apply(trainDat[rownames(weights),], 2, getPS1sample, PSpars = PS_pars[,1:2])
  
  # for PS, 0 is a natural cutoff for two group classification
  testGroup = setdiff(unique(groupInfo), refGroup)
  PS_class0 = ifelse(PS_score >= 0, testGroup, refGroup)
  
  #### 20190905, use the true class to calculate PS group mean and sd, which are used for empirial Bayesian prob calculation
  refind = which(groupInfo == refGroup)
  refPS = PS_score[refind]
  testPS = PS_score[-refind]
  
  refPSmean = mean(refPS, na.rm = T)
  refPSsd = sd(refPS, na.rm = T)
  
  testPSmean = mean(testPS, na.rm = T)
  testPSsd = sd(testPS, na.rm = T)
 
  PS_prob_test = getProb(PS_score, groupMeans = c(testPSmean, refPSmean), groupSds = c(testPSsd, refPSsd))
  PS_prob_ref= getProb(PS_score, groupMeans = c(refPSmean, testPSmean), groupSds = c(refPSsd, testPSsd))
  
  PS_class = rep("UNCLASS",length(PS_score))
  PS_class[which(PS_prob_test >= classProbCut)] = testGroup
  PS_class[which(PS_prob_ref >= classProbCut)] = refGroup
  
  true_class = groupInfo
  PS_score = data.frame(PS_score)
  PS_train = cbind(PS_score, true_class, PS_class, PS_prob_test, PS_prob_ref,PS_class0, stringsAsFactors = F)
  
  groupInfo = factor(groupInfo, levels = c(refGroup, testGroup))
  ## in order to get comparison, change UNCLASS to NA, therefore only two groups are considered in PS_class
  PS_class2 = ifelse(PS_class == "UNCLASS", NA, PS_class)
  PS_class2 = factor(PS_class, levels = c(refGroup,  testGroup))
  ### PS_class2 is for confusion matirx only, we do not export it
  ## notice that confusion matrix does not work if the number of levels are not the same
  
  classCompare = caret::confusionMatrix(PS_class2, reference = groupInfo, positive = testGroup)
  
  meansds = c(testPSmean, refPSmean, testPSsd, refPSsd)
  names(meansds) = c("testPSmean","refPSmean","testPSsd","refPSsd")
  
  PS_pars =  list(weights,meansds, mean_2means)  ### notice that this is different from the PS_pars above
  names(PS_pars) = c("weights","meansds","traits")
  
  #### since UNCLASS is excluded from confusion matrix, add one more output for full comparison
  classTable = table(groupInfo, PS_class)
  
  outs = list(PS_pars, PS_train, classCompare, classTable)
  names(outs) = c("PS_pars","PS_train","classCompare", "classTable")
  return(outs)
 
}