R/consensusRepresentatives.R
In WGCNA: Weighted Correlation Network Analysis

Documented in consensusRepresentatives selectFewestConsensusMissing

#============================================================================================================
#
# mdx version of collapseRows
#
#============================================================================================================


.cr.absMaxMean <- function(x, robust)
{
   if (robust)
   {
     colMedians(abs(x), na.rm = TRUE)
   } else
     colMeans(abs(x), na.rm = TRUE)
}

.cr.absMinMean <- function(x, robust)
{
   if (robust)
   {
     -colMedians(abs(x), na.rm = TRUE)
   } else
     -colMeans(abs(x), na.rm = TRUE)
}


.cr.MaxMean <- function(x, robust)
{
   if (robust)
   {
     colMedians(x, na.rm = TRUE)
   } else
     colMeans(x, na.rm = TRUE)
}

.cr.MinMean <- function(x, robust)
{
   if (robust)
   {
     -colMedians(x, na.rm = TRUE)
   } else
     -colMeans(x, na.rm = TRUE)
}

.cr.maxVariance <- function(x, robust)
{
   if (robust) 
   {
     colMads(x, na.rm = TRUE)
   } else 
     colSds(x, na.rm = TRUE)

}

.checkConsistencyOfGroupAndColID = function(mdx, colID, group)
{
  colID  = as.character(colID)
  group = as.character(group)
  if (length(colID)!=length(group))
     stop("'group' and 'colID' must have the same length.")

  if (any(duplicated(colID)))
     stop("'colID' contains duplicate entries.")

  rnDat = mtd.colnames(mdx);

  if ( sum(is.na(colID))>0 )
      warning(spaste("The argument colID contains missing data. It is recommend you choose non-missing,\n",
              "unique values for colID, e.g. character strings."))

   if ( sum(group=="",na.rm=TRUE)>0 ){
      warning(paste("group contains blanks. It is strongly recommended that you remove",
                    "these rows before calling the function.\n",
                    "   But for your convenience, the collapseRow function will remove these rows"));
      group[group==""]=NA
   }

   if ( sum(is.na(group))>0 ){
       warning(paste("The argument group contains missing data. It is strongly recommended\n",
              "   that you remove these rows before calling the function. Or redefine group\n",
           "   so that it has no missing data. But for convenience, we remove these data."))
   }   

  if ((is.null(rnDat))&(checkSets(mdx)$nGenes==length(colID)))
  {
    write("Warning: mdx does not have column names. Using 'colID' as column names.","")
    rnDat = colID;
    mdx = mtd.setColnames(mdx, colID);
  }
  if (is.null(rnDat))
     stop("'mdx' does not have row names and \n",
          "length of 'colID' is not the same as # variables in mdx.");

  keepProbes = rep(TRUE, checkSets(mdx)$nGenes);

  if (sum(is.element(rnDat,colID))!=length(colID)){
      write("Warning: row names of input data and probes not identical...","")
      write("... Attempting to proceed anyway. Check results carefully.","")
      keepProbes = is.element(colID, rnDat);
      colID = colID[keepProbes]
      mdx = mtd.subset(mdx, , colID);
      group = group[colID]
  }

  restCols = (group!="" & !is.na(group))
  if (any(!restCols))
  {
    keepProbes[keepProbes] = restCols;
    mdx = mtd.subset(mdx, , restCols);
    group = group[restCols]
    colID = colID[restCols]
    rnDat = rnDat[restCols]
  }

  list(mdx = mdx, group = group, colID = colID, keepProbes = keepProbes);
}



selectFewestConsensusMissing <- function(mdx, colID, group, 
                                 minProportionPresent = 1,
                                 consensusQuantile = 0,
                                 verbose = 0, ...)

{
## For each gene, select the gene with the fewest missing probes, and return the results.
#   If there is a tie, keep all probes involved in the tie.
#   The main part of this function is run only if omitGroups=TRUE
   
   otherArgs = list(...)
   nVars = checkSets(mdx)$nGenes;
   nSamples = checkSets(mdx)$nSamples;
   nSets = length(mdx);

   if ((!"checkConsistency" %in% names(otherArgs)) || otherArgs$checkConsistency)
   {
      cd = .checkConsistencyOfGroupAndColID(mdx, colID, group);
      mdx = cd$mdx;
      group = cd$group;
      colID = cd$colID;
      keep = cd$keepProbes;
   } else
      keep = rep(TRUE, nVars);

   # First, return datET if there is no missing data, otherwise run the function
   if (sum(mtd.apply(mdx, function(x) sum(is.na(x)), mdaSimplify = TRUE))==0) 
     return(rep(TRUE, nVars));
   
   # Set up the variables.
   names(group)     = colID
   probes              = mtd.colnames(mdx);
   genes               = group[probes]
   keepGenes           = rep(TRUE, nVars);
   tGenes              = table(genes)
   checkGenes          = sort(names(tGenes)[tGenes>1])
   presentData         = as.matrix(mtd.apply(mdx, function(x) colSums(is.finite(x)), mdaSimplify = TRUE));

   presentFrac = presentData/matrix(nSamples, nVars, nSets, byrow = TRUE);
   consensusPresentFrac = .consensusCalculation.base(data = presentFrac, useMean = FALSE,
                            setWeightMat = NULL,
                            consensusQuantile = consensusQuantile)$consensus;
   
   # Omit all probes with at least omitFrac genes missing
   #keep = consensusPresentFrac > omitFraction
   minProportionPresent = as.numeric(minProportionPresent);

   # Omit relevant genes and return results
   if (minProportionPresent > 0)
   {
      if (verbose) pind = initProgInd();
      for (gi in 1:length(checkGenes))
      {
         g = checkGenes[gi];
         gn            = which(genes==g)
         keepGenes[gn] = (consensusPresentFrac[gn] >= minProportionPresent * max(consensusPresentFrac[gn]))
         if (verbose) pind = updateProgInd(gi/length(checkGenes), pind);
      }
      if (verbose) printFlush("");
   }

   keep[keep] = keepGenes;
   return (keep);
}

# ----------------- Main Function ------------------- #

consensusRepresentatives = function(mdx, 
                            group, colID, 
                            consensusQuantile = 0,
                            method = "MaxMean", 
                            useGroupHubs = TRUE,
                            calibration = c("none", "full quantile"),
                            selectionStatisticFnc = NULL, 
                            connectivityPower=1, 
                            minProportionPresent=1,
                            getRepresentativeData = TRUE,
                            statisticFncArguments = list(),
                            adjacencyArguments = list(),
                            verbose = 2, indent = 0)

# Change in methodFunction: if the methodFunction picks a single representative, it should return it in
# attribute "selectedRepresentative".
# minProportionPresent now gives the fraction of the maximum of present values that will still be included.
# minProportionPresent=1 corresponds to minProportionPresent=TRUE in original collapseRows.

# In connectivity-based collapsing, use simple connectivity, do not normalize. This way the connectivities
# retain a larger spread which should prevent quantile normalization from making big changes and potentially
# suprious changes. 

{

   if (!is.null(dim(mdx)))
   {
     warning("consensusRepresentatives: wrapping matrix-like input into a mdx structure.");
     mdx = multiData(mdx);
   }
   spaces = indentSpaces(indent);
   nSamples= checkSets(mdx)$nSamples
   nSets = length(mdx);

   colnames.in = mtd.colnames(mdx);

   calibration = match.arg(calibration);
   
    ## Test to make sure the variables are the right length.
    #     if not, fix it if possible, or stop.

   cd = .checkConsistencyOfGroupAndColID(mdx, colID, group);
   colID = cd$colID;
   group = cd$group;
   mdx = cd$mdx;
   keepVars = cd$keepProbes

   rnDat = mtd.colnames(mdx);

      
## For each gene, select the gene with the fewest missing probes (if minProportionPresent==TRUE)
##  Also, remove all probes with more than 90% missing data
   
   if (verbose > 0)
     printFlush(spaste(spaces, "..selecting variables with lowest numbers of missing data.."));
   keep = selectFewestConsensusMissing(mdx, colID, group, minProportionPresent, 
                                 consensusQuantile = consensusQuantile, verbose = verbose -1)
   mdx = mtd.subset(mdx, , keep);
   keepVars[keepVars] = keep;

   group = group[keep];
   colID = colID[keep];

   rnDat = mtd.colnames(mdx);
   
##   If method="function", use the function "methodFunction" as a way of combining genes
#    Alternatively, use one of the built-in functions 
#    Note: methodFunction must be a function that takes a vector of numbers as input and
#     outputs a single number. This function will return(0) or crash otherwise.

   recMethods = c("function","MaxMean","maxVariance","MinMean","absMinMean","absMaxMean");
   imethod = pmatch(method, recMethods);
        
   if (is.na(imethod)) 
      stop("Error: entered method is not a legal option. Recognized options are\n",
           "       *maxVariance*, *MaxMean*, *MinMean*, *absMaxMean*, *absMinMean*\n",
           "       or *function* for a user-defined function.")

   if (imethod > 1) 
   {
     selectionStatisticFnc = spaste(".cr.", method);
     selStatFnc = get(selectionStatisticFnc, mode = "function")
   } else {
     selStatFnc = match.fun(selectionStatisticFnc);
     if((!is.function(selStatFnc))&(!is.null(selStatFnc)))
            stop("Error: 'selectionStatisticFnc must be a function... please read the help file.")
   }
      
## Format the variables for use by this function
   colID[is.na(colID)] = group[is.na(colID)]    # Use group if row is missing
   rnDat[is.na(rnDat)]   = group[is.na(rnDat)];
   mdx = mtd.setColnames(mdx, rnDat);

   remove       = (is.na(colID))|(is.na(group)) # Omit if both gene and probe are missing
   colID  = colID[!remove];
   group = group[!remove];
   names(group) = colID
   colID = sort(intersect(rnDat,colID))
   if (length(colID)<=1)
      stop("None of the variable names in 'mdx' are in 'colID'.")

   group = group[colID]
   mdx  = mtd.apply(mdx, as.matrix);
   keepVars[keepVars] =  mtd.colnames(mdx) %in% colID;
   mdx = mtd.subset(mdx, , colID);

   probes = mtd.colnames(mdx)
   genes  = group[probes]
   tGenes = table(genes)
   colnames.out = sort(names(tGenes));
    
   if (getRepresentativeData)
   {
     mdxOut = mtd.apply(mdx, function(x) 
     {
       out = matrix(0, nrow(x), length(tGenes));
       rownames(out) = rownames(x);
       colnames(out) = colnames.out
       out;
     });
     names(mdxOut) = names(mdx);
   }

   representatives = rep("", length(colnames.out))
   names(representatives) = colnames.out; 
   
##  If !is.null(connectivityPower), default to the connectivity method with power=method
#      Collapse genes with multiple probe sets together using the following algorthim:
#      1) If there is one ps/g = keep
#      2) If there are 2 ps/g = (use "method" or "methodFunction")
#      3) If there are 3+ ps/g = take the max connectivity
#   Otherwise, use "method" if there are 3+ ps/g as well. 
   if(!is.null(connectivityPower)){
     if(!is.numeric(connectivityPower))
        stop("Error: if entered, connectivityPower must be numeric.")
     if(connectivityPower<=0)
       stop("Warning: connectivityPower must be >= 0.");

     if(any(nSamples<=5)){
       write("Warning: 5 or fewer samples, this method of probe collapse is unreliable...","")
       write("...Running anyway, but we suggest trying another method (for example, *mean*).","")
     }
   }
   
   # Run selectionStatisticFnc on all data; if quantile normalization is requested, normalize the selection
   # statistics across data sets.

   selectionStatistics = mtd.apply(mdx, function(x) 
             do.call(selStatFnc, c(list(x), statisticFncArguments)),
              mdaSimplify = TRUE);

   #if (FALSE) xxx = selectionStatistics;

   if (is.null(dim(selectionStatistics)))
      stop("Calculation of selection statistics produced results of zero or unqual lengths.");

   if (calibration=="full quantile")
      selectionStatistics = normalize.quantiles(selectionStatistics);

   #if (FALSE)
   #{
   #   sizeGrWindow(14, 5);
   #   par(mfrow = c(1,3));
   #   for (set in 1:nSets)
   #     hist(xxx[, set], breaks = 200);
#
##      for (set in 1:nSets)
 #       verboseScatterplot(xxx[, set], selectionStatistics[, set], samples = 10000)
 #  }

   consensusSelStat = .consensusCalculation.base(selectionStatistics, useMean = FALSE, setWeightMat = NULL,
                             consensusQuantile = consensusQuantile)$consensus;

   # Actually run the summarization.

   ones = sort(names(tGenes)[tGenes==1])
   if(useGroupHubs){
      twos = sort(names(tGenes)[tGenes==2]) # use "method" and connectivity
      more = sort(names(tGenes)[tGenes>2])
   } else { 
      twos = sort(names(tGenes)[tGenes>1]) # only use "method"
      more = character(0)
   }
   ones2genes =  match(ones, genes);
   if (getRepresentativeData) for (set in 1:nSets)
       mdxOut[[set]]$data[,ones] = mdx[[set]]$data[, ones2genes];
   representatives[ones] = probes[ones2genes];
   count = 0;

   if (length(twos) > 0)
   {
     if (verbose > 0)
       printFlush(spaste(spaces, "..selecting representatives for 2-variable groups.."));
     if (verbose > 1) pind = initProgInd(paste(spaces, ".."));
     repres = rep(NA, length(twos));
     for (ig in 1:length(twos))
     {
        g = twos[ig];
        probeIndex = which(genes==g);
        repres[ig] = probeIndex[which.max(consensusSelStat[probeIndex])];
        if (verbose > 1) pind = updateProgInd(ig/length(twos), pind);
     }
     if (verbose > 1) printFlush("");
     if (getRepresentativeData) for (set in 1:nSets)
        mdxOut[[set]]$data[, twos] = mdx[[set]]$data[, repres];
     representatives[twos] = probes[repres];
   }
   if (length(more) > 0)
   {
     if (verbose > 0)
       printFlush(spaste(spaces, "..selecting representatives for 3-variable groups.."));
     if (verbose > 1) pind = initProgInd(paste(spaces, ".."));
     genes.more = genes[genes %in% more];
     nAll = length(genes.more);
     connectivities = matrix(NA, nAll, nSets);
     for (ig in 1:length(more))
     {
        g = more[ig];
        keepProbes1 = which(genes==g);
        keep.inMore = which(genes.more==g);
        mdxTmp = mtd.subset(mdx, , keepProbes1);
        adj = mtd.apply(mdxTmp, function(x) do.call(adjacency, 
                c(list(x, type = "signed", power = connectivityPower), adjacencyArguments)));
        connectivities[keep.inMore, ] = mtd.apply(adj, colSums, mdaSimplify = TRUE);
        count = count + 1;
        if (count %% 50000 == 0) collectGarbage();
        if (verbose > 1) pind = updateProgInd(ig/(2*length(more)), pind);
     }

     if (calibration=="full quantile")
       connectivities = normalize.quantiles(connectivities);

     consConn = .consensusCalculation.base(connectivities, useMean = FALSE, setWeightMat = NULL,
                                               consensusQuantile = consensusQuantile)$consensus;
     repres.inMore = rep(0, length(more));
     for (ig in 1:length(more))
     {
        probeIndex = which(genes.more==more[ig]);
        repres.inMore[ig] = probeIndex[which.max(consConn[probeIndex])];
        if (verbose > 1) pind = updateProgInd(ig/(2*length(more)) + 0.5, pind);
     }
     repres = which(genes %in% more)[repres.inMore];
     if (verbose > 1) printFlush("");
     if (getRepresentativeData) for (set in 1:nSets)
       mdxOut[[set]]$data[, more] = as.numeric(mdx[[set]]$data[, repres]); 
     representatives[more] = probes[repres];
   }
      
   # Retreive the information about which probes were saved, and include that information
   #   as part of the output.  

   out2 = cbind(colnames.out, representatives)
   colnames(out2) = c("group","selectedColID")

   reprIndicator = keepVars;
   reprIndicator[keepVars] [match(representatives, mtd.colnames(mdx))] = TRUE
   reprIndicator = colnames.in
   out = list(representatives = out2, 
              varSelected = reprIndicator,
              representativeData = if (getRepresentativeData) mdxOut else NULL)
   return(out)
      
}