R/votingLinearPredictor.R
In WGCNA: Weighted Correlation Network Analysis

Documented in removePrincipalComponents votingLinearPredictor

.devianceResidual = function(y)
{
  event = y[, ncol(y) ];
  fit = summary(coxph(y~1, na.action = na.exclude))
  CumHazard = predict(fit, type = "expected")
  martingale1 = event - CumHazard
  deviance0 = ifelse(event == 0, 2 * CumHazard, -2 * log(CumHazard) +
                     2 * CumHazard - 2)
  sign(martingale1) * sqrt(deviance0);
}


.dropThirdDim = function(x)
{
  d = dim(x);
  dim(x) = c(d[1], d[2]*d[3]);
  x;
}
 

#-----------------------------------------------------------------------------------------------------
#
# Voting linear predictor: given a set of vectors y and a set of vectors x, 
# predictor is given by the correlations of y with x[,i] 
# 
#-----------------------------------------------------------------------------------------------------

votingLinearPredictor = function(x, y, xtest = NULL, 
                    classify = FALSE, 
                    CVfold = 0,
                    randomSeed = 12345,
                    assocFnc = "cor", assocOptions = "use = 'p'",
                    featureWeightPowers = NULL, priorWeights = NULL,
                    weighByPrediction = 0,
                    nFeatures.hi = NULL,
                    nFeatures.lo = NULL,
                    dropUnusedDimensions = TRUE,
                    verbose = 2, indent = 0)
{

  # Special handling of a survival response

  if (is.Surv(y))
  {
    pr = votingLinearPredictor(x, .devianceResidual(y), xtest, classify = FALSE,
                             CVfold = CVfold, randomSeed = randomSeed,
                             assocFnc = assocFnc, assocOptions = assocOptions,
                             featureWeightPowers = featureWeightPowers, priorWeights = priorWeights,
                             nFeatures.hi = nFeatures.hi, nFeatures.lo = nFeatures.lo,
                             dropUnusedDimensions = dropUnusedDimensions,
                             verbose = verbose, indent = indent);
    # may possibly need completion here. 
    return(pr)
  }       

  # Standard code for a numeric response

  ySaved = y;

  if (classify)
  { 
    # Convert factors to numeric variables.
    if (is.null(dim(y)))
    {
      ySaved = list(y);
      if (classify)
      {
         originalYLevels = list( sort(unique(y)) );
         y = as.factor(y);
      }
      y = as.matrix(as.numeric(y));
    } else {
      ySaved = y;
      if (classify)
      {
         y = as.data.frame(lapply(as.data.frame(y), as.factor));
         originalYLevels = lapply(as.data.frame(apply(ySaved, 2, unique)), sort);
      } else
         y = as.data.frame(y);
      y = as.matrix(sapply(lapply(y, as.numeric), I));
    }
    numYLevels = lapply(as.data.frame(apply(y, 2, unique)), sort);
    minY = apply(y, 2, min);
    maxY = apply(y, 2, max);
  } else 
    y = as.matrix(y);

  doTest = !is.null(xtest)

  x = as.matrix(x);
  nSamples = nrow(y);

  nTraits = ncol(y);
  nVars = ncol(x);

  if (is.null(featureWeightPowers)) featureWeightPowers = 0;
  nPredWPowers = length(featureWeightPowers);
    
  if (is.null(rownames(x)))
  {
    sampleNames = spaste("Sample.", c(1:nSamples));
  } else
    sampleNames = rownames(x);
   
  if (doTest) 
  { 
    xtest = as.matrix(xtest);
    nTestSamples = nrow(xtest);
    if (is.null(rownames(xtest)))
    {
      testSampleNames = spaste("testSample.", c(1:nTestSamples));
    } else
      testSampleNames = rownames(xtest);
    if (ncol(x)!=ncol(xtest))
      stop("Number of learning and testing predictors (columns of x, xtest) must equal.");
  }

  if (is.null(colnames(y)))
  {
    traitNames = spaste("Response.", c(1:nTraits));
  } else
    traitNames = colnames(y);

  if (is.null(colnames(x)))
  {
    featureNames = spaste("Feature.", c(1:nVars));
  } else
    featureNames = colnames(x);

  powerNames = spaste("Power.", featureWeightPowers);
   

  spaces = indentSpaces(indent);
 
  # If cross-validation is requested, change the whole flow and use a recursive call.
  if (CVfold > 0)
  {
    if (CVfold > nSamples )
    {
      printFlush("CVfold is larger than nSamples. Will perform leave-one-out cross-validation.");
      CVfold = nSamples;
    }
    ratio = nSamples/CVfold;
    if (floor(ratio)!=ratio)
    {
      smaller =floor(ratio);
      nLarger = nSamples - CVfold * smaller
      binSizes = c(rep(smaller, CVfold-nLarger), rep(smaller +1, nLarger));
    } else
      binSizes = rep(ratio, CVfold);
    if (!is.null(randomSeed))
    {
      if (exists(".Random.seed"))
      {
        saved.seed = .Random.seed;
        seedSaved = TRUE;
      } else
        seedSaved = FALSE;
      set.seed(randomSeed);
    }

    sampleOrder = sample(1:nSamples);
      
    CVpredicted = array(NA, dim = c(nSamples, nTraits, nPredWPowers));
    CVbin = rep(0, nSamples);

    if (verbose > 0) 
    {
      cat(paste(spaces, "Running cross-validation: "));
      if (verbose==1) pind = initProgInd() else printFlush("");
    }

    ind = 1;
    for (cv in 1:CVfold)
    {
      if (verbose > 1) printFlush(paste("..cross validation bin", cv, "of", CVfold));
      end = ind + binSizes[cv] - 1;
      samples = sampleOrder[ind:end];
      CVbin[samples] = cv;
      xCVtrain = x[-samples, , drop = FALSE];
      xCVtest = x[samples, , drop = FALSE];
      yCVtrain = y[-samples, , drop = FALSE];
      yCVtest = y[samples,, drop = FALSE ];
      pr = votingLinearPredictor(xCVtrain, yCVtrain, xCVtest,
                        classify = FALSE,
                        CVfold = 0,
                        assocFnc = assocFnc, assocOptions = assocOptions, 
                        featureWeightPowers = featureWeightPowers,
                        priorWeights = priorWeights, 
                        nFeatures.hi = nFeatures.hi, nFeatures.lo = nFeatures.lo,
                        dropUnusedDimensions = dropUnusedDimensions,
                        verbose = verbose - 1, indent = indent + 1);
      CVpredicted[samples, , ] = pr$predictedTest;
      ind = end + 1;
      if (verbose==1) pind = updateProgInd(cv/CVfold, pind);
    }
    if (verbose==1) printFlush("");

    collectGarbage();
  }

  if (nrow(x)!=nrow(y))
    stop("Number of observations in x and y must equal.");

  xSD = apply(x, 2, sd, na.rm = TRUE);

  validFeatures = xSD > 0;

  xMean = apply(x, 2, mean, na.rm = TRUE);
  x = scale(x);
  if (doTest) 
  {
     xtest = (xtest - matrix(xMean, nTestSamples, nVars, byrow = TRUE) ) / 
               matrix(xSD, nTestSamples, nVars, byrow = TRUE);
     xtest[, !validFeatures] = 0
  }

  # This prevents NA's generated from zero xSD to contaminate the results
  x[, !validFeatures] = 0

  xSD[!validFeatures] = 1;

  obsMean = apply(y, 2, mean, na.rm = TRUE);
  obsSD = apply(y, 2, sd, na.rm = TRUE);

  if (sum(!is.finite(obsSD)) > 0)
    stop("Something is wrong with given trait: not all standard deviations are finite.");

  if (sum(obsSD==0) > 0)
    stop("Some of given traits have variance zero. Prediction on such traits will not work.");

  y = scale(y);

  if (is.null(priorWeights))
  {
    priorWeights = array(1, dim = c(nTraits, nPredWPowers, nVars));
  } else {
    dimPW = dim(priorWeights);
    if (length(dimPW)<=1) 
    {
      if (length(priorWeights!=nVars))
         stop ("priorWeights are a vector - must have length = number of variables (ncol(x)).");
      priorWeights = matrix(priorWeights, nrow = nSamples * nPredWPowers, ncol = nVars, 
                            byrow = TRUE);
      dim(priorWeights) = c(nTraits, nPredWPowers, nVars);
    } else if (length(dimPW)==2)
    {
      if ((dimPW[1]!=nPredWPowers) | (dimPW[2]!=nVars) )
        stop(paste("If priorWeights is two-dimensional, 1st dimension must equal",
                   "number of predictor weight powers, 2nd must equal number of variables"));
      # if (verbose>0) printFlush("..converting dimensions of priorWeights..");
      newWeights = array(0, dim = c(nTraits, nPredWPowers, nVars));
      for (trait in 1:nTraits)
        newWeights[trait, , ] = priorWeights[,];
      priorWeights = newWeights;
      collectGarbage();
    } else if ( (dimPW[1] != nTraits) | (dimPW[3] != nVars) | (dimPW[2] != nPredWPowers) )
        stop(paste("priorWeights have incorrect dimensions. Dimensions must be",
                   "(ncol(y), length(featureWeightPowers), ncol(x))."));
  }
      
  varImportance = array(0, dim = c(nTraits, nPredWPowers, nVars));
  predictMat = array(0, dim = c(nSamples, nTraits, nPredWPowers));
  if (doTest) predictTestMat = array(0, dim = c(nTestSamples, nTraits, nPredWPowers));

  corEval = parse(text = paste(assocFnc, "(x, y ", prepComma(assocOptions), ")"));
  r = eval(corEval);
  if (!is.null(nFeatures.hi))
  {
    if (is.null(nFeatures.lo)) nFeatures.lo = nFeatures.hi;
    # Zero out associations (and therefore weights) for features that do not make the cut
    rank = apply(r, 2, rank, na.last = TRUE);
    nFinite = colSums(!is.na(r));
    for (t in 1:nTraits)
      r[ rank[, t]>nFeatures.lo & rank[, t] <= nFinite[t]-nFeatures.hi, t] = 0;
  }
  r[is.na(r)] = 0; 

  validationWeights = rep(1, nVars);
  if (weighByPrediction > 0)
  {
    select = c(1:nVars)[rowSums(r!=0) > 0];
    nSelect = length(select);
    pr = .quickGeneVotingPredictor.CV(x[, select], xtest[, select], c(1:nSelect))
    dCV = sqrt(colMeans( (pr$CVpredicted - x[, select])^2, na.rm = TRUE));
    dTS = sqrt(colMeans( (pr$predictedTest - xtest[, select])^2, na.rm = TRUE));
    dTS[dTS==0] = min(dTS[dTS>0]);
    w = (dCV/dTS)^weighByPrediction;
    w[w>1] = 1;
    validationWeights[select] = w; 
  } 

  finiteX = (is.finite(x) + 1)-1
  x.fin = x;
  x.fin[!is.finite(x)] = 0;

  if (doTest)
  {
    finiteXTest =  (is.finite(xtest) + 1)-1;
    xtest.fin = xtest;
    xtest.fin[!is.finite(xtest)] = 0;
  }

  for (power in 1:nPredWPowers)
  {
    prWM = priorWeights[, power, ];
    dim(prWM) = c(nTraits, nVars);

    weights = abs(r)^featureWeightPowers[power] * t(prWM) * validationWeights;
    #weightSum = apply(weights, 2, sum);
    weightSum = finiteX %*% weights;
    RWeights = sign(r) * weights;

    predictMat[ , , power] = 
         x.fin %*% RWeights / weightSum;
    predMean = apply(.dropThirdDim(predictMat[ , , power, drop = FALSE]), 2, mean, na.rm = TRUE);
    predSD = apply(.dropThirdDim(predictMat[, , power, drop = FALSE]), 2, sd, na.rm = TRUE);
    predictMat[, , power] = scale(predictMat[, , power]) *
                      matrix(obsSD, nrow = nSamples, ncol = nTraits, byrow = TRUE) +
                      matrix(obsMean, nrow = nSamples, ncol = nTraits, byrow = TRUE);
    if (doTest) 
    {
      weightSum.test = finiteXTest %*% weights;
      predictTestMat[, , power] = 
         (xtest.fin %*% RWeights /  weightSum.test -
                  matrix(predMean, nrow = nTestSamples, ncol = nTraits, byrow = TRUE) ) /
                  matrix(predSD, nrow = nTestSamples, ncol = nTraits, byrow = TRUE) * 
                  matrix(obsSD, nrow = nTestSamples, ncol = nTraits, byrow = TRUE) +
                  matrix(obsMean, nrow = nTestSamples, ncol = nTraits, byrow = TRUE); 
    }
    varImportance[ , power, ] = RWeights;

  }

  dimnames(predictMat) = list(sampleNames, traitNames, powerNames);
  if (doTest) dimnames(predictTestMat) = list(testSampleNames, traitNames, powerNames);

  dimnames(r) = list(featureNames, traitNames);
  dimnames(varImportance) = list(traitNames, powerNames, featureNames);

  discretize = function(x, min, max)
  {
      fac = round(x);
      fac[fac < min] = min;
      fac[fac > max] = max;
      # dim(fac) = dim(x);
      fac;
  }

  trafo = function(x, drop)
  {
    if (classify)
    {
      for (power in 1:nPredWPowers) for (t in 1:nTraits)
      {
        disc = discretize(x[, t, power], minY[t], maxY[t])
        x[, t, power] = originalYLevels[[t]] [disc];
      }
    } 
    x = x[ , , , drop = drop];
    x;
  }

  out = list(predicted = trafo(predictMat, drop = dropUnusedDimensions), 
             weightBase = abs(r), 
             variableImportance = varImportance)
  if (doTest) out$predictedTest = trafo(predictTestMat, drop = dropUnusedDimensions)
  if (CVfold > 0)
  {
    dimnames(CVpredicted) = list(sampleNames, traitNames, powerNames);
    CVpredFac = trafo(CVpredicted, drop = dropUnusedDimensions)
    out$CVpredicted = CVpredFac;
  } 
  out;
}




#=======================================================================================================
#
# Quick linear predictor for genes. 
#
#=======================================================================================================

# Assume we have expression data (training and test). Run the prediction
# in a CV-like way. Keep the predictions for CV samples and for the test samples; at the end average the
# test predictions to get one final test prediction. 
 
# In each CV run: 
# scale the training data and keep scale
# scale the test data using the training scale
# get correlations of predicted vectors and predictors (note: number of predicted vectors should be
# relatively small, but all genes may be predictors - however, here we can also make some restrictions)
# Form the ensemble of predictors for each gene
# Predict the CV samples and test samples

.quickGeneVotingPredictor = function(x, xtest, predictedIndex, nPredictorGenes = 20, power = 3,
                         corFnc = "bicor", corOptions = "use = 'p'", verbose = 0)
{
  nSamples = nrow(x)
  nTestSamples = nrow(xtest);
  nGenes = ncol(x)
  nPredicted = length(predictedIndex);
  if (nPredictorGenes >=nGenes) nPredictorGenes = nGenes -1;

  geneMeans = as.numeric(colMeans(x, na.rm = TRUE));
  geneSD = as.numeric(sqrt(colMeans(x^2, na.rm = TRUE) - geneMeans^2));
  
  xScaled = (x-matrix(geneMeans, nSamples, nGenes, byrow = TRUE))/
                 matrix(geneSD, nSamples, nGenes, byrow = TRUE);

  xTestScaled = (xtest-matrix(geneMeans, nTestSamples, nGenes, byrow = TRUE))/
                 matrix(geneSD, nTestSamples, nGenes, byrow = TRUE);
  corExpr = parse(text = paste(corFnc, " (xScaled, xScaled[, predictedIndex] ", prepComma(corOptions), ")"));
  significance = eval(corExpr);

  predictedTest = matrix(NA, nTestSamples, nPredicted)
  
  if (verbose > 0) pind = initProgInd();
  for (g in 1:nPredicted)
  {
    gg = predictedIndex[g];
    sigOrder = order(-significance[, g]);
    useGenes = sigOrder[2:(nPredictorGenes+1)];
    w.base = significance[useGenes, g]
    w = w.base * abs(w.base)^(power -1);
    bsub = rowSums(xScaled[, useGenes, drop = FALSE] * matrix(w, nSamples, nPredictorGenes, byrow = TRUE));
    bsub.scale = sqrt(mean(bsub^2))
    predictedTest[, g] = rowSums(xTestScaled[, useGenes, drop = FALSE] * 
                               matrix(w, nTestSamples, nPredictorGenes, byrow = TRUE)) / bsub.scale *
                             geneSD[gg] + geneMeans[gg]
    if (verbose > 0) pind = updateProgInd(g/nPredicted, pind)
  }

  predictedTest;
}
      
.quickGeneVotingPredictor.CV = function(x, xtest = NULL, predictedIndex, nPredictorGenes = 20, 
                            power = 3, CVfold = 10,
                            corFnc = "bicor", corOptions = "use = 'p'")
{
  nSamples = nrow(x)
  nTestSamples = if (is.null(xtest)) 0 else nrow(xtest);
  nGenes = ncol(x)
  nPredicted = length(predictedIndex);
  
  ratio = nSamples/CVfold;
  if (floor(ratio)!=ratio)
  {
    smaller =floor(ratio);
    nLarger = nSamples - CVfold * smaller
    binSizes = c(rep(smaller, CVfold-nLarger), rep(smaller +1, nLarger));
  } else
    binSizes = rep(ratio, CVfold);

  sampleOrder = sample(1:nSamples);

  CVpredicted = matrix(NA, nSamples, nPredicted);
  if (!is.null(xtest)) predictedTest = matrix(0, nTestSamples, nPredicted) else predictedTest = NULL;

  cvStart = 1;
  for (cv in 1:CVfold)
  {
    end = cvStart + binSizes[cv] - 1;
    oob = sampleOrder[cvStart:end];
    CVx = x[-oob, , drop = FALSE];
    if (is.null(xtest)) CVxTest = x[oob, , drop = FALSE] else CVxTest = rbind( x[oob, , drop = FALSE], xtest);
    pred = .quickGeneVotingPredictor(CVx, CVxTest, predictedIndex, nPredictorGenes, power, corFnc,
                                    corOptions);
    CVpredicted[oob, ] = pred[c(1:binSizes[cv]), ];
    if (!is.null(xtest)) predictedTest = predictedTest + pred[-c(1:binSizes[cv]), ];
    cvStart = end + 1;
  }

  if (!is.null(xtest)) predictedTest = predictedTest/CVfold;

  list(CVpredicted = CVpredicted, predictedTest= predictedTest);
}

removePrincipalComponents = function(x, n)
{
  if (sum(is.na(x)) > 0) x = t(impute.knn(t(x))$data);
  svd = svd(x, nu = n, nv = 0);
  PCs = as.data.frame(svd$u);
  names(PCs) = spaste("PC", c(1:n));
  fit = lm(x~., data = PCs);
  res = residuals(fit)
  res;
}

#========================================================================================================
#
# Lin's network screening correlation functions
#
#========================================================================================================

.corWeighted = function(expr, y, ...) {
  modules = blockwiseModules(expr, ...)
  MEs = modules$MEs
  MEs = MEs[, colnames(MEs)!="MEgrey"]
  ns = networkScreening(y, MEs, expr, getQValues=F)
  ns$cor.Weighted
}

.corWeighted.new = function(expr, y, ...) {
  modules = blockwiseModules(expr, ...)
  MEs = modules$MEs
  MEs = MEs[, colnames(MEs)!="MEgrey"]
  scaledMEs = scale(MEs)
  ES = t(as.matrix(cor( y, MEs, use="p")))
  weightedAverageME = as.numeric(as.matrix(scaledMEs)%*%ES)/ncol(MEs)

  w=0.25
  y.new = w*scale(y) + (1-w)*weightedAverageME
  GS.new = as.numeric(cor(y.new, expr, use="p"))
  GS.new
}