R/empiricalBayesLM.R

Defines functions bicovWeightsFromFactors bicovWeights bicovWeightFactors .bicov .linearModelCoefficients empiricalBayesLM .initialFit.requiresFormula .initialFit.defaultOptions .initialFit.defaultWeightName .weightedVar .weightedScale .colWeightedMeans.x

Documented in bicovWeightFactors bicovWeights bicovWeightsFromFactors empiricalBayesLM

#===================================================================================================
#
# Multi-variate empirical Bayes with proper accounting for sigma
#
#===================================================================================================

.colWeightedMeans.x = function(data, weights, na.rm)
{
  nc = ncol(data);
  means = rep(NA, nc);

  for (c in 1:nc)
    means[c] = weighted.mean(data[, c], weights[, c], na.rm = na.rm);

  names(means) = colnames(data);
  means;
}

.weightedScale = function(data, weights, replaceZeroScale = FALSE)
{
  weightSums = colSums(weights);
  means = .colWeightedMeans.x(data, weights, na.rm = TRUE);
  V1 = colSums(weights);
  V2 = colSums(weights^2);
  centered = data - matrix(means, nrow(data), ncol(data), byrow = TRUE);
  scale = sqrt( colSums(centered^2* weights, na.rm = TRUE) / ( V1 - V2/V1));
  if (replaceZeroScale) scale[scale == 0] = 1;
  scaled = centered/ matrix(scale,  nrow(data), ncol(data), byrow = TRUE);

  attr(scaled, "scaled:center") = means;
  attr(scaled, "scaled:scale") = scale;
  scaled;
}

.weightedVar = function(x, weights)
{
  V1 = sum(weights);
  V2 = sum(weights^2);
  mean = sum(x * weights)/V1;
  centered = x-mean;
  sum(centered^2 * weights)/(V1-V2/V1);
}

# Defaults for certain fitting functions

.initialFit.defaultWeightName = function(fitFnc)
{
  wNames = c(rlm = "w", lmrob = "rweights");
  if (fitFnc %in% names(wNames)) return(wNames[ match(fitFnc, names(wNames))]);
  NULL;
}

.initialFit.defaultOptions = function(fitFnc)
{
  defOpt = list(
       rlm = list())
  if (fitFnc %in% names(defOpt)) return(defOpt[[ match(fitFnc, names(defOpt))]]);
  list();
}

.initialFit.requiresFormula = function(fitFnc)
{
  reqForm = c(rlm = FALSE, lmrob = TRUE);
  if (fitFnc %in% names(reqForm)) return(reqForm[ match(fitFnc, names(reqForm))]);
  FALSE;
}


empiricalBayesLM = function(
  data,
  removedCovariates,
  retainedCovariates = NULL,

  initialFitFunction = NULL,
  initialFitOptions = NULL,
  initialFitRequiresFormula = NULL,
  initialFit.returnWeightName = NULL,

  fitToSamples = NULL,

  weights = NULL,
  automaticWeights = c("none", "bicov"),
  aw.maxPOutliers = 0.1,
  weightType = c("apriori", "empirical"),
  stopOnSmallWeights = TRUE,

  minDesignDeviation = 1e-10,
  robustPriors = FALSE,
  tol = 1e-4, maxIterations = 1000,
  garbageCollectInterval = 50000,

  scaleMeanToSamples = fitToSamples,
  scaleMeanOfSamples = NULL,
  getOLSAdjustedData = TRUE,
  getResiduals = TRUE,
  getFittedValues = TRUE,
  getWeights = TRUE,
  getEBadjustedData = TRUE,

  verbose = 0, indent = 0)
{

  spaces = indentSpaces(indent);

  nSamples = nrow(data);
  designMat = NULL;
  #mean.x = NULL;
  #scale.x = NULL;

  automaticWeights = match.arg(automaticWeights);
  if (automaticWeights=="bicov")
  {
    weightType = "empirical";
    if (verbose > 0) printFlush(paste(spaces, "..calculating weights.."));
    weights = bicovWeights(data, maxPOutliers = aw.maxPOutliers);
  }

  weightType = match.arg(weightType);
  wtype = match(weightType, c("apriori", "empirical"));

  if (!is.null(retainedCovariates))
  {
     if (is.null(dim(retainedCovariates))) 
        retainedCovariates = data.frame(retainedCovariate = retainedCovariates)
     if (any(is.na(retainedCovariates))) stop("All elements of 'retainedCovariates' must be finite.");
     if (nrow(retainedCovariates)!=nSamples)
       stop("Numbers of rows in 'data' and 'retainedCovariates' differ.");
     retainedCovariates = as.data.frame(retainedCovariates);
     mm = model.matrix(~., data = retainedCovariates)[, -1, drop = FALSE];
     colSDs = colSds(mm);
     if (any(colSDs==0))
       stop("Some columns in 'retainedCovariates' have zero variance.");
     designMat = mm;
  }

  if (is.null(removedCovariates))
    stop("'removedCovariates' must be supplied.");
  if (is.null(dim(removedCovariates))) 
      removedCovariates = data.frame(removedCovariate = removedCovariates)
  if (any(is.na(removedCovariates))) stop("All elements of 'removedCovariates' must be finite.");
  if (nrow(removedCovariates)!=nSamples)
    stop("Numbers of rows in 'data' and 'removedCovariates' differ.");
  removedCovariates = as.data.frame(removedCovariates);
  
  mm = model.matrix(~., data = removedCovariates)[, -1, drop = FALSE];
  colSDs = colSds(mm);
  if (any(colSDs==0))
    stop("Some columns in 'removedCovariates' have zero variance.");
  designMat = cbind(designMat, mm)
  removedColumns = (ncol(designMat)-ncol(mm) + 1):ncol(designMat);

  y.original = as.matrix(data);
  N.original = ncol(y.original);
  if (any(!is.finite(y.original))) 
  {
    warning(immediate. = TRUE,
            "Found missing and/or non-finite data. These will be removed.");
  }
    
  if (is.null(weights))
    weights = matrix(1, nSamples, ncol(y.original));

  if (any(!is.finite(weights)))
    stop("Given 'weights' contain some infinite or missing entries. All weights must be present and finite.");

  if (any(weights<0))
    stop("Given 'weights' contain negative entries. All weights must be non-negative.");

  originalWeights = weights;

  dimnamesY = dimnames(y.original);

  if (verbose > 0) printFlush(paste(spaces, "..checking for non-varying responses.."));
  varY = colVars(y.original, na.rm = TRUE);
  varYMissing = is.na(varY);
  varYZero = varY==0;
  varYZero[is.na(varYZero)] = FALSE;
  keepY = !(varYZero | varYMissing);

  y = y.original[, keepY];
  weights = weights[, keepY];
  yFinite = is.finite(y);
  weights[!yFinite] = 0;
  nSamples.y = colSums(yFinite);

  if (weightType == "apriori")
  { 
    # Check weights
    if (any(weights[yFinite]<1))
    {
      if (stopOnSmallWeights)
      {
         stop("When weights are determined 'apriori', small weights are not allowed.\n",
              "Weights must be at least 1. Use weightType='empirical' if weights were determined from data.")
      } else
         warning(immediate. = TRUE,
                 "Small weights found. This can lead to unreliable fit with weight type 'apriori'.\n",
                 "Proceed with caution.");
    }
  } 

  N = ncol(y);
  nc = ncol(designMat);

  if (verbose > 0) printFlush(paste(spaces, "..standardizing responses.."));
  if (length(scaleMeanToSamples)==0) scaleMeanToSamples = c(1:nSamples);
  if (length(scaleMeanOfSamples)==0) scaleMeanOfSamples = c(1:nSamples);
  if (length(fitToSamples)==0) fitToSamples = c(1:nSamples);

  mean.y.target = .colWeightedMeans.x(y[scaleMeanToSamples, ], weights[scaleMeanToSamples, ], na.rm = TRUE);

  if (is.logical(fitToSamples)) fitToSamples = which(fitToSamples);

  if (length(fitToSamples) < 3) 
    stop("'fitToSamples' must specify at least 3 samples for the fit.");

  nRefSamples = length(fitToSamples);
  yFinite.refSamples = yFinite[fitToSamples, ];
  weights.refSamples = weights[fitToSamples, ];

  # Scale y to mean zero and variance 1. This is needed for the prior to make sense.

  y.refSamples = .weightedScale(y[fitToSamples, ], weights.refSamples, replaceZeroScale = FALSE);
     ### Do not replace zero scale; at this point I would rather have the regression fail for these samples.
  mean.y = attr(y.refSamples, "scaled:center");
  scale.y = attr(y.refSamples, "scaled:scale");
  y.refSamples[!yFinite.refSamples] = 0;

  ## Note to self: original code above used y = .weightedScale(y, weights.refSamples);
  ## We now need to scale y according to the mean and scale calculated above.

  mean.y.mat = matrix(mean.y, nrow = nrow(y), ncol = ncol(y), byrow = TRUE);
  scale.y.mat = matrix(scale.y, nrow = nrow(y), ncol = ncol(y), byrow = TRUE);
  y.scaled = (y-mean.y.mat)/scale.y.mat;

  designMat.refSamples = designMat[fitToSamples, ];

  if (any(is.na(designMat.refSamples))) 
    stop("The design matrix contains missing values. Please check that all variables\n",
         " entering the desing matrix have all values present.");

  if (verbose > 0) printFlush(paste(spaces, "..initial model fitting.."));
  # Prepare ordinary regression to get starting points for beta and sigma
  beta.OLS = matrix(NA, nc, N);
  betaValid = matrix(TRUE, nc, N);
  sigma.OLS = rep(NA, N);
  regressionValid = rep(TRUE, N);

  # Get the means of the design matrix with respect to all weight vectors.
  V1 = colSums(weights.refSamples);
  V2 = colSums(weights.refSamples^2);
  means.dm = t(designMat.refSamples) %*% weights.refSamples / matrix(V1, nrow = nc, ncol = N, byrow = TRUE);
  i = 0;
  on.exit(printFlush(spaste("Error occurred at i = ", i)));
  #on.exit(browser());
  pindStep = max(1, floor(N/100));
  lastInvalidFitError = "(no error, just a placeholder)";
  if (is.null(initialFitFunction))
  {
    # Ordinary weighted least squares, in two varieties.
    oldWeights = rep(-1, nRefSamples);
    if (verbose > 1) pind = initProgInd();
    for (i in 1:N)
    {
      w1 = weights.refSamples[, i];
      y1 = y.refSamples[, i];
      if (any(w1!=oldWeights))
      {
        centeredDM = designMat.refSamples - matrix(means.dm[, i], nRefSamples, nc, byrow = TRUE);
        #dmVar = colSds(centeredDM);
        dmVar.w = apply(centeredDM, 2, .weightedVar, weights = w1);
        #keepDM = dmVar > 0 & dmVar.w > 0;
        keepDM = dmVar.w > minDesignDeviation^2;
        xtxInv = try(
        {
          centeredDM.keep = centeredDM[, keepDM, drop = FALSE];
          xtx = t(centeredDM.keep) %*% (centeredDM.keep * w1);
          solve(xtx);
        }, silent = TRUE)
      }
  
      if (!inherits(xtxInv, "try-error"))
      {
        oldWeights = w1;
        # The following is really (xtxInv %*% xy1), where xy1 = t(centeredDM) %*% (y[,i]*weights[,i])
        beta.OLS[keepDM, i] = colSums(xtxInv * colSums(centeredDM.keep * y1 * w1));
           ## Caution here: ceause of the scaling based only on fitToSamples samples, y1 could be invalid.
        betaValid[!keepDM, i] = FALSE;
        betaValid[keepDM, i] = is.finite(beta.OLS[keepDM, i]);
        if (!any(betaValid[, i])) regressionValid[i] = FALSE;
        y.pred = centeredDM.keep %*% beta.OLS[keepDM, i, drop = FALSE];
        if (weightType=="apriori")
        {
          # Standard calculation of sigma^2 in weighted regression
          sigma.OLS[i] = sum((w1>0) * (y1 - y.pred)^2)/(sum(w1>0)-nc-1);
        } else {
          xtxw2 =  t(centeredDM.keep) %*% (centeredDM.keep *w1^2);
          sigma.OLS[i] = sum( w1* (y1-y.pred)^2) / (V1[i] - V2[i]/V1[i] - sum(xtxw2 * xtxInv));
        }
      } else {
        regressionValid[i] = FALSE;
        betaValid[, i] = FALSE;
        lastInvalidFitError = xtxInv;
      }
      if (i%%garbageCollectInterval ==0) gc();
      if (verbose > 1 && i%%pindStep==0) pind = updateProgInd(i/N, pind);
    }
    if (verbose > 1) {pind = updateProgInd(i/N, pind); printFlush()}
    rweights = weights.refSamples;
  } else {
    fitFnc = match.fun(initialFitFunction);
    if (is.null(initialFit.returnWeightName))
      initialFit.returnWeightName = .initialFit.defaultWeightName(initialFitFunction);

    if (is.null(initialFitOptions))
      initialFitOptions = .initialFit.defaultOptions(initialFitFunction);

    if (is.null(initialFitRequiresFormula))
      initialFitRequiresFormula = .initialFit.requiresFormula(initialFitFunction);
      
    rweights = weights.refSamples;
    if (verbose > 1) pind = initProgInd();
    for (i in 1:N)
    {
      y1 = y.refSamples[, i];
      w1 = weights.refSamples[, i]
      dmVar.w = apply(designMat.refSamples, 2, .weightedVar, weights = w1);
      keepDM = dmVar.w > 0;
      if (initialFitRequiresFormula)
      {
        modelData = data.frame(designMat.refSamples[, keepDM, drop = FALSE]);
        fit = try(do.call(fitFnc, c(list(formula = "y1~.", data = data.frame(y1 = y1, modelData), w = w1), 
                                            initialFitOptions)));
      } else {
        fit = try(do.call(fitFnc, c(list(x = cbind(intercept = rep(1, nRefSamples), 
                                                   designMat.refSamples[, keepDM, drop = FALSE]), 
                                         y = y1, w = w1),
                                    initialFitOptions)));
      }
      if (!inherits(fit, "try-error"))
      {
        beta.OLS[keepDM, i] = fit$coefficients[-1];
        betaValid[!keepDM, i] = FALSE;
        rw1 = getElement(fit, initialFit.returnWeightName);
        if (length(rw1)!=nRefSamples) 
          stop("Length of component '", initialFit.returnWeightName, 
               "' does not match the number of samples.\n",
               "Please check that the name of the component containing robustness weights\n",
               "is specified correctly.");
        rweights[, i] = rw1 * w1;
        if (!is.null(fit$scale))
        { 
           sigma.OLS[i] = fit$scale
        } else {
          # This is not the greatest estimate since it is non-robust and does not take the final weights into
          # account. The hope is that this will never be used.
          sigma.OLS[i] = sum((w1>0) * fit$residuals^2)/(sum(w1>0)-nc-1)
        }
      } else {
        regressionValid[i] = FALSE;
        betaValid[, i] = FALSE;
        lastInvalidFitError = fit;
      }
      if (i%%garbageCollectInterval ==0) gc();
      if (verbose > 1 && i%%pindStep==0) pind = updateProgInd(i/N, pind);
    }
    if (verbose > 1) {pind = updateProgInd(i/N, pind); printFlush()}
  }

  if (any(!regressionValid))
     warning(immediate. = TRUE,
             "empiricalBayesLM: initial regression failed in ", sum(!regressionValid), " variables.\n",
             "Last failed model returned the following error:\n\n",
          lastInvalidFitError,
          "\n\nPlease check that the model is correctly specified.");

  if (all(!regressionValid))
     stop("Initial regression model failed for all columns in 'data'.\n",
          "Last model returned the following error:\n\n",
          lastInvalidFitError,
          "\n\nPlease check that the model is correctly specified.");

  # beta.OLS has columns corresponding to variables in data, and rows corresponding to columns in x.

  # Debugging...
  if (FALSE)
  {
    fit = lm(y.refSamples~., data = data.frame(designMat.refSamples))
    fit2 = lm(y.refSamples~., data = as.data.frame(centeredDM));

    max(abs(fit2$coefficients[1,]))

    all.equal(c(fit$coefficients[-1, ]), c(fit2$coefficients[-1, ]))
    all.equal(c(fit$coefficients[-1, ]), c(beta.OLS))
    sigma.fit = apply(fit$residuals, 2, var)*(nRefSamples-1)/(nRefSamples-1-nc)
    all.equal(as.numeric(sigma.fit), as.numeric(sigma.OLS))

  }

  if (getEBadjustedData)
  {
    # Priors on beta : mean and variance
    if (verbose > 0) printFlush(spaste(spaces, "..calculating priors.."));
    if (robustPriors)
    {
      if (any(is.na(beta.OLS[, regressionValid])))
        stop("Some of OLS coefficients are missing. Please use non-robust priors.");
      prior.means = rowMedians(beta.OLS[, regressionValid, drop = FALSE], na.rm = TRUE);
      prior.covar = .bicov(t(beta.OLS[, regressionValid, drop = FALSE]));
    } else {
      prior.means = rowMeans(beta.OLS[, regressionValid, drop = FALSE], na.rm = TRUE);
      prior.covar = cov(t(beta.OLS[, regressionValid, drop = FALSE]), use = "complete.obs");
    }
    prior.inverse = solve(prior.covar);
  
    # Prior on sigma: mean and variance (median and MAD are bad estimators since the distribution is skewed)
    sigma.m = mean(sigma.OLS[regressionValid], na.rm = TRUE);
    sigma.v = var(sigma.OLS[regressionValid], na.rm = TRUE);
  
    # Turn the sigma mean and variance into the parameters of the inverse gamma distribution
    prior.a = sigma.m^2/sigma.v + 2;
    prior.b = sigma.m * (prior.a-1);
  
    # Calculate the EB estimates
    if (verbose > 0) printFlush(spaste(spaces, "..calculating final coefficients.."));
    beta.EB = beta.OLS;
    sigma.EB = sigma.OLS;
    
    # Get the means of the design matrix with respect to all rweight vectors.
    rV1 = colSums(rweights);
    rV2 = colSums(rweights^2);
    rmeans.dm = t(designMat.refSamples) %*% rweights / matrix(V1, nrow = nc, ncol = N, byrow = TRUE);
  
    if (verbose > 1) pind = initProgInd();
    for (i in which(regressionValid))
    {
      # Iterate to solve for EB regression coefficients (betas) and the residual variances (sigma)
      # It appears that this has to be done individually for each variable.
  
      difference = 1;
      iteration = 1;
  
      y1 = y.refSamples[, i];
      w1 = rweights[, i];
  
      centeredDM = designMat.refSamples - matrix(rmeans.dm[, i], nRefSamples, nc, byrow = TRUE);
      dmVar.w = apply(centeredDM, 2, .weightedVar, weights = w1);
      keepDM = dmVar.w > 0 & betaValid[, i];
  
      beta.old = as.matrix(beta.OLS[keepDM, i, drop = FALSE]);
      sigma.old = sigma.OLS[i];
  
      centeredDM.keep = centeredDM[, keepDM, drop = FALSE];
      xtx = t(centeredDM.keep) %*% (centeredDM.keep * w1);
      xtxInv = solve(xtx);
  
      if (all(keepDM))
      {
        prior.inverse.keep = prior.inverse;
      } else
        prior.inverse.keep = solve(prior.covar[keepDM, keepDM, drop = FALSE]);
  
      while (difference > tol && iteration <= maxIterations)
      {
        y.pred = centeredDM.keep %*% beta.old;
        if (wtype==1)
        {
          # Apriori weights.
          fin1 = yFinite.refSamples[, i];
          nSamples1 = sum(fin1);
          sigma.new = (sum(fin1 * (y1-y.pred)^2) + 2*prior.b)/ (nSamples1-nc + 2 * prior.a + 1);
        } else {
          # Empirical weights
          V1 = sum(w1);
          V2 = sum(w1^2);
          xtxw2 =  t(centeredDM.keep) %*% (centeredDM.keep *w1^2);
          sigma.new = (sum( w1* (y1-y.pred)^2) + 2*prior.b) / (V1 - V2/V1 - sum(xtxw2 * xtxInv) + 2*prior.a + 2);
        }
  
        A = (prior.inverse.keep + xtx/sigma.new)/2;
        A.inv = solve(A);
        #B = as.numeric(t(y1*w1) %*% centeredDM/sigma.new) + as.numeric(prior.inverse %*% prior.means);
        B = colSums(centeredDM.keep * y1 * w1/sigma.new) + colSums(prior.inverse.keep * prior.means[keepDM]);
  
        #beta.new = A.inv %*% as.matrix(B)/2
        beta.new = colSums(A.inv * B)/2  # ...a different and hopefully faster way of writing the above
  
        difference = max( abs(sigma.new-sigma.old)/(sigma.new + sigma.old),
                          abs(beta.new-beta.old)/(beta.new + beta.old));
        #if (!is.finite(difference))
        #{
        #   printFlush("Invalid 'difference' in empiricalBayesLM. Dropping into browser.");
        #   browser();
        #}
  
        beta.old = beta.new;
        sigma.old = sigma.new;
        iteration = iteration + 1;
        if (any(is.na(c(difference, iteration)))) browser("Have non-finite difference or iteration.");
      }
      if (iteration > maxIterations) warning(immediate. = TRUE, 
                                             "Exceeded maximum number of iterations for variable ", i, ".");
      beta.EB[keepDM, i] = beta.old;
      sigma.EB[i] = sigma.old;
      if (i%%garbageCollectInterval ==0) gc();
      if (verbose > 1 && i%%pindStep==0) pind = updateProgInd(i/N, pind);
    }
    if (verbose > 1) {pind = updateProgInd(i/N, pind); printFlush()}
  } ### if (getEBAdjustedData)  

  on.exit(NULL);


  # Put output together. Will return the coefficients for lm and EB-lm, and the residuals with added mean.

  if (verbose > 0) printFlush(paste(spaces, "..calculating adjusted data.."));

  fitAndCoeffs = function(beta, sigma)
  {
      #fitted.removed = fitted = matrix(NA, nSamples, N);
      fitted.removed = y;   ### this assignment and the one below just sets the dimensions and dimnames
      if (getFittedValues) fitted = y;  ### Actual values will be set below.
      beta.fin = beta;
      beta.fin[is.na(beta)] = 0;
      for (i in which(regressionValid))
      {
         centeredDM = designMat - matrix(means.dm[, i], nSamples, nc, byrow = TRUE);
         fitted.removed[, i] = centeredDM[, removedColumns, drop = FALSE] %*% 
                                   beta.fin[removedColumns, i, drop = FALSE];
         if (getFittedValues) fitted[, i] = centeredDM %*% beta.fin[, i, drop = FALSE]
         if (i%%garbageCollectInterval ==0) gc();
      }
      #browser()
      
      residuals = (y.scaled - fitted.removed) * scale.y.mat;
      # Residuals now have weighted column means of fitted-to samples equal zero.

      residualMeans.scaleMeansOfSamples = .colWeightedMeans.x(residuals[scaleMeanOfSamples, ], weights[scaleMeanOfSamples, ], na.rm = TRUE);

      meanShift =  matrix(mean.y.target, nSamples, N, byrow = TRUE) - 
                    matrix(residualMeans.scaleMeansOfSamples, nSamples, N, byrow = TRUE);

      if (getFittedValues) 
      {
        fitted.all = matrix(NA, nSamples, N.original);
        fitted.all[, keepY] = fitted * scale.y.mat + meanShift;
        dimnames(fitted.all) = dimnamesY;
      }
      
      residualsWithMean.all = matrix(NA, nSamples, N.original);
      if (getResiduals)
      {
        residuals.all = residualsWithMean.all;
        residuals.all[, keepY] = residuals;
        residuals.all[, varYZero] = 0;
        residuals.all[is.na(y.original)] = NA;
      }

      residualsWithMean.all[, keepY] = residuals + meanShift;
      residualsWithMean.all[, varYZero] = 0;
      residualsWithMean.all[is.na(y.original)] = NA;

      beta.all = beta.all.scaled = matrix(NA, nc+1, N.original);
      sigma.all = sigma.all.scaled = rep(NA, N.original);
      sigma.all.scaled[keepY] = sigma;
      sigma.all[keepY] = sigma * scale.y^2;

      beta.all[-1, keepY] = beta.fin * matrix(scale.y, nrow = nc, ncol = N, byrow = TRUE);
      beta.all.scaled[-1, keepY] = beta.fin;
      beta.all[varYZero] = beta.all.scaled[-1, varYZero] = 0;
      #alpha = mean.y - t(as.matrix(mean.x)) %*% beta * scale.y
      alpha = mean.y - colSums(beta * means.dm, na.rm = TRUE) * scale.y
      beta.all[1, keepY] = alpha;
      beta.all.scaled[1, keepY] = 0;

      dimnames(residualsWithMean.all) = dimnamesY;
      colnames(beta.all) = colnames(beta.all.scaled) = colnames(y.original);
      rownames(beta.all) = rownames(beta.all.scaled) = c("(Intercept)", colnames(designMat));

      list(residuals = if (getResiduals) residuals.all else NULL,
           residualsWithMean = residualsWithMean.all,
           beta = beta.all,
           beta.scaled = beta.all.scaled,
           sigmaSq = sigma.all,
           sigmaSq.scaled = sigma.all.scaled,
           fittedValues = if (getFittedValues) fitted.all else NULL);
  }

  fc.OLS = fitAndCoeffs(beta.OLS, sigma.OLS);
  if (getEBadjustedData) fc.EB = fitAndCoeffs(beta.EB, sigma.EB);

  betaValid.all = matrix(FALSE, nc+1, N.original);
  betaValid.all[-1, keepY] = betaValid;
  betaValid.all[1, keepY] = TRUE;
  dimnames(betaValid.all) = dimnames(fc.OLS$beta);

  finalWeights = originalWeights;
  finalWeights[fitToSamples, keepY] = rweights;

  list( adjustedData = if (getEBadjustedData) fc.EB$residualsWithMean else NULL,
       residuals = if (getResiduals & getEBadjustedData) fc.EB$residuals else NULL,
       coefficients = if (getEBadjustedData) fc.EB$beta else NULL,
       coefficients.scaled = if (getEBadjustedData) fc.EB$beta.scaled else NULL,
       sigmaSq = if (getEBadjustedData) fc.EB$sigmaSq else NULL,
       sigmaSq.scaled = if (getEBadjustedData) fc.EB$sigmaSq.scaled else NULL,
       fittedValues = if (getFittedValues && getEBadjustedData) fc.EB$fittedValues else NULL,

       # OLS results
       adjustedData.OLS = if (getOLSAdjustedData) fc.OLS$residualsWithMean else NULL,
       residuals.OLS = if (getResiduals) fc.OLS$residuals else NULL,
       coefficients.OLS = fc.OLS$beta,
       coefficients.OLS.scaled = fc.OLS$beta.scaled,
       sigmaSq.OLS = fc.OLS$sigmaSq,
       sigmaSq.OLS.scaled = fc.OLS$sigmaSq.scaled,
       fittedValues.OLS = if (getFittedValues) fc.OLS$fittedValues else NULL,

       # Weights used in the model
       initialWeights = if (getWeights) originalWeights else NULL,
       finalWeights = if (getWeights) finalWeights else NULL,
       
       # indices of valid fits
       dataColumnValid = keepY,
       dataColumnWithZeroVariance = varYZero,
       coefficientValid = betaValid.all);
}



#===================================================================================================
#
# Linear model coefficients
#
#===================================================================================================

.linearModelCoefficients = function(
  responses, 
  predictors,

  weights = NULL,
  automaticWeights = c("none", "bicov"),
  aw.maxPOutliers = 0.1,

  getWeights = FALSE,

  garbageCollectInterval = 50000,
  minDesignDeviation = 1e-10,

  verbose = 0, indent = 0)
{

  spaces = indentSpaces(indent);

  designMat = NULL;

  automaticWeights = match.arg(automaticWeights);
  if (automaticWeights=="bicov")
  {
    if (verbose > 0) printFlush(paste(spaces, "..calculating weights.."));
    weights = bicovWeights(responses, maxPOutliers = aw.maxPOutliers);
  }

  if (is.null(dim(predictors)))
     predictors = data.frame(retainedCovariate = predictors)

  keepSamples = rowSums(is.na(predictors))==0;

  responses = responses[keepSamples, , drop = FALSE];
  predictors = predictors[keepSamples, , drop = FALSE];
  nSamples = nrow(responses);

  if (nrow(predictors)!=nSamples)
    stop("Numbers of rows in 'responses' and 'predictors' differ.");
  predictors = as.data.frame(predictors);
  mm = model.matrix(~., data = predictors);
  colSDs = colSds(mm[, -1, drop = FALSE], na.rm = TRUE);
  if (any(colSDs==0))
    stop("Some columns in 'predictors' have zero variance.");
  designMat = mm;

  y.original = as.matrix(responses);
  N.original = ncol(y.original);
  if (any(!is.finite(y.original))) 
  {
    warning(immediate. = TRUE,
            "Found missing and non-finite data. These will be removed.");
  }
    
  if (is.null(weights))
    weights = matrix(1, nSamples, ncol(y.original));

  if (any(!is.finite(weights)))
    stop("Given 'weights' contain some infinite or missing entries. All weights must be present and finite.");

  if (any(weights<0))
    stop("Given 'weights' contain negative entries. All weights must be non-negative.");

  originalWeights = weights;

  dimnamesY = dimnames(y.original);

  if (verbose > 0) printFlush(paste(spaces, "..checking for non-varying responses.."));
  varY = colVars(y.original, na.rm = TRUE);
  varYMissing = is.na(varY);
  varYZero = varY==0;
  varYZero[is.na(varYZero)] = FALSE;
  keepY = !(varYZero | varYMissing);

  y = y.original[, keepY];
  weights = weights[, keepY];
  yFinite = is.finite(y);
  weights[!yFinite] = 0;
  nSamples.y = colSums(yFinite);
  y[!yFinite] = 0;

  N = ncol(y);
  nc = ncol(designMat);

  if (verbose > 0) printFlush(paste(spaces, "..model fitting.."));
  # Prepaer ordinary regression to get starting points for beta and sigma
  beta.OLS = matrix(NA, nc, N);
  betaValid = matrix(TRUE, nc, N);
  sigma.OLS = rep(NA, N);
  regressionValid = rep(TRUE, N);

  # Get the means of the design matrix with respect to all weight vectors.
  V1 = colSums(weights);
  V2 = colSums(weights^2);
  means.dm = t(designMat) %*% weights / matrix(V1, nrow = nc, ncol = N, byrow = TRUE);
  i = 0;
  on.exit(printFlush(spaste("Error occurred at i = ", i)));
  #on.exit(browser());
  pindStep = max(1, floor(N/100));

  # Ordinary weighted least squares
  oldWeights = rep(-1, nSamples);
  if (verbose > 1) pind = initProgInd();
  for (i in 1:N)
  {
    w1 = weights[, i];
    y1 = y[, i];
    if (any(w1!=oldWeights))
    {
      #centeredDM = designMat - matrix(means.dm[, i], nSamples, nc, byrow = TRUE);
      centeredDM = designMat; # Do not center the design mat.
      #dmVar = colSds(centeredDM);
      dmVar.w = apply(centeredDM, 2, .weightedVar, weights = w1);
      #keepDM = dmVar > 0 & dmVar.w > 0;
      keepDM = dmVar.w > minDesignDeviation^2;
      keepDM[1] = TRUE; # For the intercept
      centeredDM.keep = centeredDM[, keepDM, drop = FALSE];
      xtx = t(centeredDM.keep) %*% (centeredDM.keep * w1);
      xtxInv = try(solve(xtx), silent = TRUE);
      oldWeights = w1;
    }

    if (!inherits(xtxInv, "try-error"))
    {
      # The following is really (xtxInv %*% xy1), where xy1 = t(centeredDM) %*% (y[,i]*weights[,i])
      beta.OLS[keepDM, i] = colSums(xtxInv * colSums(centeredDM.keep * y1 * w1));
      betaValid[!keepDM, i] = FALSE;
      y.pred = centeredDM.keep %*% beta.OLS[keepDM, i, drop = FALSE];
      #if (weightType=="apriori")
      #{
        # Standard calculation of sigma^2 in weighted refression
        sigma.OLS[i] = sum((w1>0) * (y1 - y.pred)^2)/(sum(w1>0)-nc-1);
      #} else {
      #  xtxw2 =  t(centeredDM.keep) %*% (centeredDM.keep *w1^2);
      #  sigma.OLS[i] = sum( w1* (y1-y.pred)^2) / (V1[i] - V2[i]/V1[i] - sum(xtxw2 * xtxInv));
      #}
    } else {
      regressionValid[i] = FALSE;
      betaValid[, i] = FALSE;
    }
    if (i%%garbageCollectInterval ==0) gc();
    if (verbose > 1 && i%%pindStep==0) pind = updateProgInd(i/N, pind);
  }
  if (verbose > 1) {pind = updateProgInd(i/N, pind); printFlush()}

  on.exit(NULL)

  if (any(!regressionValid))
     warning(immediate. = TRUE,
             "linearModelCoefficients: initial regression failed in ", sum(!regressionValid), " variables.");

  if (all(!regressionValid))
     stop("Initial regression model failed for all columns in 'data'.\n",
          "Last model returned the following error:\n\n",
          xtx,
          "\n\nPlease check that the model is correctly specified.");

  # beta.OLS has columns corresponding to variables in responses, and rows corresponding to columns in x.

  # Extend results to all variables.

  beta.all = matrix(NA, nc, N.original);
  beta.all[, keepY] = beta.OLS;
  dimnames(beta.all) = list(colnames(designMat), dimnamesY[[2]]);

  sigma.all = rep(NA, N.original);
  sigma.all[keepY] = sigma.OLS;
  

  betaValid.all = matrix(FALSE, nc, N.original);
  betaValid.all[, keepY] = betaValid;
  dimnames(betaValid.all) = dimnames(beta.all);


  list(coefficients = beta.all,
       sigmaSq = sigma.all,
       # Weights used in the model
       weights = if (getWeights) weights else NULL,

       # indices of valid fits
       dataColumnValid = keepY,
       dataColumnWithZeroVariance = varYZero,
       coefficientValid = betaValid.all);
}

#===================================================================================================
#
# Robust functions
#
#===================================================================================================

.bicov = function(x)
{
  x = as.matrix(x);
  if (any(!is.finite(x))) 
    stop("All entries of 'x' must be finite.");
  nc = ncol(x);
  nr = nrow(x);
  mx = colMedians(x);
  mx.mat = matrix(mx, nr, nc, byrow = TRUE);
  mads = colMads(x, constant = 1);
  mad.mat = matrix(mads, nr, nc, byrow = TRUE);
  u = abs((x - mx.mat)/(9 * mad.mat));
  a = matrix(as.numeric(u<1), nr, nc);

  topMat = a * (x-mx.mat) * (1-u^2)^2;
  top = nr * t(topMat) %*% topMat;
  botVec = colSums(a * (1-u^2) * (1-5*u^2));
  bot = botVec %o% botVec;

  out = top/bot;
  dimnames(out) = list(colnames(x), colnames(x));
  out;
}


# Argument refWeight:
# w = (1-u^2)^2
# u^2 = 1-sqrt(w)
# referenceU = sqrt(1-sqrt(referenceW))

bicovWeightFactors = function(x, pearsonFallback = TRUE, maxPOutliers = 1,
                        outlierReferenceWeight = 0.5625,
                        defaultFactor = NA)
{
  referenceU = sqrt(1-sqrt(outlierReferenceWeight));
  dimX = dim(x);
  dimnamesX = dimnames(x);
  x = as.matrix(x);
  nc = ncol(x);
  nr = nrow(x);
  mx = colMedians(x, na.rm = TRUE);
  mx.mat = matrix(mx, nr, nc, byrow = TRUE);
  mads = colMads(x, constant = 1, na.rm = TRUE);
  madZero = replaceMissing(mads==0);
  if (any(madZero, na.rm = TRUE))
  {
     warning(immediate. = TRUE,
             "MAD is zero in some columns of 'x'.");
     if (pearsonFallback)
     {
       sds = colSds(x[, madZero, drop = FALSE], na.rm = TRUE)
       mads[madZero] = sds * qnorm(0.75);
     }
  }
  mad.mat = matrix(mads, nr, nc, byrow = TRUE);
  u = (x - mx.mat)/(9 * mad.mat);
  if (maxPOutliers < 0.5)
  {
    lq = colQuantileC(u, p = maxPOutliers);
    uq = colQuantileC(u, p = 1-maxPOutliers);
    lq[is.na(lq)] = 0;
    uq[is.na(uq)] = 0;

    lq[lq>-referenceU] = -referenceU;
    uq[uq < referenceU] = referenceU;
    lq = abs(lq);
    changeNeg = which(lq>referenceU);
    changePos = which(uq > referenceU);

    for (c in changeNeg)
    {
      neg1 = u[, c] < 0;
      neg1[is.na(neg1)] = FALSE;
      u[neg1, c] = u[neg1, c] * referenceU/lq[c];
    }

    for (c in changePos)
    {
      pos1 = u[, c] > 0;
      pos1[is.na(pos1)] = FALSE;
      u[pos1, c] = u[pos1, c] * referenceU/uq[c];
    }
  }
  if (!is.null(defaultFactor)) u[!is.finite(u)] = defaultFactor;
  u;
}
  

bicovWeights = function(x, pearsonFallback = TRUE, maxPOutliers = 1,
                        outlierReferenceWeight = 0.5625,
                        defaultWeight = 0)
{
  dimX = dim(x);
  dimnamesX = dimnames(x);
  x = as.matrix(x);
  nc = ncol(x);
  nr = nrow(x);

  u = bicovWeightFactors(x, pearsonFallback = pearsonFallback,
                         maxPOutliers = maxPOutliers,
                         outlierReferenceWeight = outlierReferenceWeight,
                         defaultFactor = NA);

  a = matrix(as.numeric(abs(u)<1), nr, nc);
  weights = a * (1-u^2)^2;
  weights[is.na(x)] = defaultWeight;
  weights[!is.finite(weights)] = defaultWeight;
  dim(weights) = dimX;
  if (!is.null(dimX)) dimnames(weights) = dimnamesX;
  weights;
}

bicovWeightsFromFactors = function(u, defaultWeight = 0)
{
  dimU = dim(u);
  u = as.matrix(u);
  a = matrix(as.numeric(abs(u)<1), nrow(u), ncol(u));
  weights = a * (1-u^2)^2;
  weights[is.na(u)] = defaultWeight;
  weights[!is.finite(weights)] = defaultWeight;
  dim(weights) = dimU;
  if (!is.null(dimU)) dimnames(weights) = dimnames(u);
  weights;
}

Try the WGCNA package in your browser

Any scripts or data that you put into this service are public.

WGCNA documentation built on Jan. 22, 2023, 1:34 a.m.