gMRPP: Generalized Multi-Response Permutation Procedure

Documented in gmrpp

#' @title Generalized multi-response permutation procedure (GMRPP) for
#' categorical, quantitative, and censored event-time variables
#'
#' @description This function impletements the generalized
#' multiresponse permutation procedure (GMRPP) for
#' categorical, quantitative, and censored event-time variables
#'
#' @param data.mtx The genomic data matrix. Each row is a features. Each column is a subject.
#' @param grp.data # A data frame with columns "ID" (matches columns of data.mtx)
#' and "grp" (for group comparisons), "nvar" (to associate with numerical variable),
#' or "stime" and "evnt" (to associate with survival data).
#' @param vset.data Variable set data, row for assignment of variable (vID) to a variable
#' set (vset).
#' @param nperms The minimum and maximum number of permutations for adaptive permutation.
#' Default is c(10,1000).
#' @param rand.seed Random seed used for reproducible computing.  Default is NULL (no seed set).
#'
#' @return A data.frame with the following columns:
#' \item{vset}{The name of variable set (gene-set).}
#' \item{vIDs}{The list of variables in the variable set (gene-set).}
#' \item{dist.stat}{The distance statistic for the variable set.}
#' \item{p.vset}{The p-value.}
#' \item{nperms}{The number of permutations performed.}
#'
#' @details 
#' 
#' This function performs the GMRPP method described by Cao, Sahr, and Pounds (2019), which is
#' a generalization of the MRPP method described by Nettleton, Recknor, and Reecy (2008).  It characterizes
#' the genomic data of a gene set as the distances between each pair of subjects computed on
#' data for genes in the gene set.  This distance matrix is used to measure the association of
#' the gene set genomic data with a categorical variable by computing the sum of distances for
#' subjects belonging to different groups minus the sum of distances for subjects belonging to the same group.
#' A greater value of this statistic indicates a stronger association.  For numeric variables, the
#' average of this statisic is computed over all sets of groupings defined by all possible dichotomizations.  
#' For censored survival time variables, this statistic is computed over all dichotomizations defined
#' by risk sets (unique uncesored event times), with some adjustments for censoring.  The method 
#' detects certain complex forms of gene-set associations that are not easily identified with other methods.
#' The statistical
#' significance is determined by an adaptive permutation procedure (Pounds et al 2011) that stops evaluating
#' permuted data sets once it becomes clear the result is not statistically significant or a maximum
#' number of permutations have been performed.
#' 
#' @examples
#' data(AMLex)                      # AML TARGET project expression data for selected genes (website)
#' data(AMLclin)                    # AML TARGET project overall survival data  
#' data(vsets)                      # KEGG gene sets for AML and CML (website)
#' res<-gmrpp(AMLex,AMLclin,vsets)  # Evaluate association of gene sets with overall survival
#' res[,-2]                         # results excluding variable IDs for brevity 
#' 
#' @references
#' 
#' Cao X, Sahr N, Pounds S (2019)  Robust Detection of Complex Gene-Set Associations.  Manuscript.
#' 
#' Nettleton D, Recknor J, Reecy JM (2008)  Identification of differentially expressed gene categories in
#' microarray studies using nonparametric multivariate analysis.  Bioinformatics 24: 192-201.  PMID 18054553.
#' 
#' Pounds S, Cao X, Cheng C, et al (2011)  Integrated analysis of pharmacologic, clinical
#' and SNP microarray data using projection onto the most interesting statistical
#' evidence with adaptive permutation testing. International Journal of Data Mining
#' and Bioinformatics, 5, 143-57.  PMID 21516175.
#' 
#' @keywords
#' gene set; gene set enrichment; gene set association; multiresponse permutation procedure; adaptive permutation
#'
#' @import MASS stats
#' 
#' @export
#'

# Generalized multi-response permutation procedure
gmrpp=function(data.mtx, grp.data, vset.data, nperms=c(10,1000), rand.seed=NULL) {
  
  # Prepare data for analysis
  mrpp.data=prep.data(data.mtx,grp.data,vset.data)
  
  data.mtx=mrpp.data$data.mtx
  grp.data=mrpp.data$grp.data
  vset.data=mrpp.data$vset.data
  vset.index=mrpp.data$vset.index
  
  # Compute distance coefficient matrix to be used for calculations
  dcoef=dist.coef.mtx(grp.data)
  
  # Initialize values for permutation
  min.perms=min(nperms)  # minimum number of permutations for adaptive permutation
  max.perms=max(nperms)  # maximum number of permutations for adaptive permutation
  
  n=nrow(grp.data)          # number of subjects
  nset=nrow(vset.index)     # number of gene-sets
  p.vset=rep(0,nset)        # initialize p-value vector
  vset.list=rep("NA",nset)  # initialize vector of variable set lists 
  n.perms=rep(0,nset)       # initialize vector of number of permutations performed
  dist.stats=rep(0,nset)    # initialize vector of distance statistics
  
  set.seed(rand.seed)       # set seed for reprodicibility
  
  for (i in 1:nset) # Loop over variable sets (gene-sets)
  {
    
    # Get variables in this variable set
    vset.ind=(vset.index$start.index[i]:vset.index$end.index[i]) # rows with variables in this variable set
    vset.vIDs=vset.data$vID[vset.ind]                            # ids of variables in this variable set
    vset.list[i]=paste(sort(vset.vIDs),collapse=" | ")           # string listing all variables in this variable set
    
    # Extract data matrix for this variable set
    vset.mtx=matrix(data.mtx[vset.vIDs,],length(vset.ind))        
    
    
    # Compute distances for this variable set
    vset.mtx=t(vset.mtx)
    vset.dist=dist(vset.mtx)
    vset.dist=as.matrix(vset.dist)

    
    # Compute observed distance stats for this variable set
    dist.stats[i]=grp.dist(vset.dist,dcoef)
    
    # Perform adaptive permutation
    while((p.vset[i]<min.perms)&&(n.perms[i]<max.perms)) # Loop conditions
    {
      n.perms[i]=n.perms[i]+1                                    # increment number of permutations for this variable set
      perm.index=sample(n)                                       # generate this permutation
      vset.dist=vset.dist[perm.index,]                           # permute rows of the distance matrix
      vset.dist=vset.dist[,perm.index]                           # permute columns of the distance matrix
      perm.stats=grp.dist(vset.dist,dcoef)                       # compute the statistic with permuted distance matrix
      p.vset[i]=p.vset[i]+as.numeric(perm.stats>=dist.stats[i])  # update p-value
    }                                                    # End loop for adaptive permutation of this gene-set
  } # End loop over gene-sets
  
  # Compute, package, and return final result data.frame  
  p.vset=p.vset/n.perms # p-value is determined by dividing by number of permutations
  
  res=cbind.data.frame(vset=vset.index$value,  # name of variable set (gene-set)
                       vIDs=vset.list,         # list of variables in the variable set (gene-set)
                       dist.stat=dist.stats,   # distance statistic for the variable set
                       p.vset=p.vset,          # p-value
                       nperms=n.perms)         # number of permutations performed
  return(res)
}

# Compute phenotype distance coefficient matrix
dist.coef.mtx=function(ph.data) {
  ph.names=colnames(ph.data)  # get the names of the phenotype
  if (any(ph.names=="grp"))   # if a group variable, use that function
  {
    res=grp.coef.mtx(ph.data[,"grp"])
    return(res)
  }

  if (any(ph.names=="nvar")) # if a numeric variable, use that function
  {
    res=nvar.coef.mtx(ph.data[,"nvar"])
    return(res)
  }

  if (any(ph.names=="stime")) # if a survival variable, use that function
  {
    res=surv.coef.mtx(ph.data[,c("stime","evnt")])
    return(res)
  }
}

# Compute coefficient matrix for association with group
grp.coef.mtx=function(grp) {
  grp=as.character(grp)
  n=length(grp)         # obtain sample size
  res=matrix(1,n,n)     # initialize matrix
  ugrp=unique(grp)      # get unique group labels
  ngrp=length(ugrp)     # number of unique groups
  for (i in 1:ngrp)     # loop over groups
  {
    in.grp=is.element(grp,ugrp[i]) # find those in this group
    res[in.grp,in.grp]=-1          # set coefficient to negative 1 for all pairs in this group
  }                     # end loop over groups
  return(res)           # return the matrix
}

# Compute distance coefficient matrix for numerical variable
nvar.coef.mtx=function(nvar) {
  n=length(nvar)                           # sample size
  res=matrix(0,n,n)                        # initialize coefficient matrix
  uval=sort(unique(nvar))                  # unique numerical values
  thresh=(uval[-1]+uval[-length(uval)])/2  # all possible thresholds for the numerical value
  for(i in 1:length(thresh))  # loop over thresholds
  {
    grps=as.numeric(nvar<thresh[i])        # define group vector based on this threshold
    temp=grp.coef.mtx(grps)                # define coefficient matrix for group comparisons
    res=res+temp                           # add it to the result
  }
  # return the result
  return(res)
}

# Order and index data set by values of a specific variable
order.index.dset=function(dset, vr) {
  v=dset[,vr]     # extract the variable
  ord=order(v)    # vector of indices to order by that variable
  dset=dset[ord,] # order dset by that variable
  m=nrow(dset)    # get size of data set
  new.val=which(dset[-1,vr]!=dset[-m,vr]) # find which rows have a new value of variable
  ind1=c(1,new.val+1)                     # first row with a unique value of the variable
  ind2=c(new.val,m)                       # last row with a unique value of the variable

  # data.frame with indices of dset that have unique values of the variable vr
  dset.ind=cbind.data.frame(start.index=ind1,
                            end.index=ind2,
                            value=dset[ind1,vr])
  # create a list to return
  res=list(dset=dset,
           indx=dset.ind)
  # return that list
  return(res)
}

# Compute distance coefficient matrix for survival outcome
surv.coef.mtx=function(stime.evnt) {
  n=nrow(stime.evnt)                 # sample size
  res=matrix(0,n,n)                  # initialize matrix
  evnt.indx=which(stime.evnt[,2]>0)  # index of rows with an event

  time.pts=sort(unique(stime.evnt[evnt.indx,1])) # unique event times
  for (i in 1:length(time.pts)) # loop over unique event times
  {
    early.event=(stime.evnt[,1]<=time.pts[i])&
      (stime.evnt[,2]==1)           # subjects with event before this event time

    late.event1=(stime.evnt[,1]>time.pts[i])    # subjects with longer follow-up than this event time

    late.event2=(stime.evnt[,1]==time.pts[i])&  # subjects censored at this event time
      (stime.evnt[,2]==0)

    late.event=(late.event1|late.event2)        # subjects that definitely have event after this event time

    temp=matrix(0,n,n)                          # initialize distance coefficient matrix to all zeroes
    temp[early.event,early.event]=-1            # assign distance coefficient matrix to -1 for pairs of subjects with earlier events
    temp[late.event,late.event]=-1              # assign distance coefficient matrix to -1 for pairs of subjects with later events
    temp[early.event,late.event]=1              # assign distance coefficient matrix to +1 for pairs of subjects with early and late event
    temp[late.event,early.event]=1              # assign distance coefficient matrix to +1 for pairs of subjects with early and late event

    res=res+temp                # add result for this time point to cumulative coefficient matrix
  }                             # end loop over risk sets

  return(res) # return the matrix
}


####################################
# Prepare data for gmrpp
prep.data=function(data.mtx,grp.data,vset.data=NULL)
  
{
  # Merge data sets, match order of subjects in columns of data.mtx and rows of grp.data
  id.mtx=colnames(data.mtx)              # ids from data.mtx
  id.grp=grp.data$ID                     # ids from grp.data
  id.mtch=match.order.ids(id.mtx,id.grp) # function to match order of subjects
  data.mtx=data.mtx[,id.mtch$ind1]       # put columns of data.mtx in that order
  grp.data=grp.data[id.mtch$ind2,]       # put rows of grp.data in that order
  
  # check merging before continuing; stop if something didn't work
  id.mtx=colnames(data.mtx)  # ids from data.mtx
  id.grp=grp.data$ID         # ids from grp.data
  if (any(id.mtx!=id.grp))   # stop if something didn't work
    stop("Error in merging grp.data and data.mtx.")
  
  n=nrow(grp.data) # Now get the sample size
  
  #rss=sqrt(sum(dcoef^2))         # compute a scaling factor to make stats more comparable across gene-sets
  #dcoef=dcoef/rss                # scale the distance coefficient matrix
  
  # Order and index variable set data for faster computing
  vset.index=NULL
  if (!is.null(vset.data))
  {
    vset.data0=vset.data                        # keep original variable set data to double-check later
    ord.dset=order.index.dset(vset.data,"vset") # function to order gene-set assignments by gene-set
    vset.data=ord.dset$dset                     # ordered gene-set data
    vset.index=ord.dset$indx                    # indexes of ordered gene-set data
    nset=nrow(vset.index)                       # number of gene-sets
    
    # Double-check the re-indexed variable set data
    ord0=order(vset.data0$vset,vset.data0$vID)  # order original and re-indexed vset data in the same way
    ord1=order(vset.data$vset,vset.data$vID)
    if (any(vset.data0$vID[ord0]!=vset.data$vID[ord1])||
        any(vset.data0$vset[ord0]!=vset.data$vset[ord1]))
    {
      stop("Error in indexing vset.data.")
    }
    
    # Check that each indexed variable set has only one value for vset variable
    for (i in 1:nset)
    {
      indx=vset.index[i,1]:vset.index[i,2]
      uniq.vset=unique(vset.data$vset[indx])
      if (length(uniq.vset)!=1)
        stop("Error in index vset.data")
    } 
  }
  
  res=list(data.mtx=data.mtx,
           grp.data=grp.data,
           vset.data=vset.data,
           vset.index=vset.index)
  
  return(res)
  
}

############################################
# generate data.frame to match the order of two vectors of identifiers

match.order.ids=function(id1,id2) {
  temp1=cbind.data.frame(id=id1,ind1=1:length(id1)) # create data.frame with indices for id1
  temp2=cbind.data.frame(id=id2,ind2=1:length(id2)) # create data.frame with indices for id2
  res=merge(temp1,temp2,by=1)                       # merged data.frame

  # Warn user if some elements are not present in combined data.frame
  if (nrow(res)<length(id1))
    warning(paste0("Failed to merge ",length(id1)-nrow(res)," elements of id1."))

  if (nrow(res)<length(id2))
    warning(paste0("Failed to merge ",length(id2)-nrow(res)," elements of id2."))

  # return the result
  return(res)
}

#############################################
# function to compute distance statistic 
grp.dist=function(dist.mtx, # distance matrix for gene-set
                  dcoefs)   # coefficients computed from phenotype data
  
{
  res=sum(dist.mtx*dcoefs)-sum(dist.mtx)*sum(dcoefs)
  res=res/sqrt(sum(dist.mtx^2)*sum(dcoefs^2))
  return(res)
}