#' @title Generalized multi-response permutation procedure (GMRPP) for
#' categorical, quantitative, and censored event-time variables
#'
#' @description This function impletements the generalized
#' multiresponse permutation procedure (GMRPP) for
#' categorical, quantitative, and censored event-time variables
#'
#' @param data.mtx The genomic data matrix. Each row is a features. Each column is a subject.
#' @param grp.data # A data frame with columns "ID" (matches columns of data.mtx)
#' and "grp" (for group comparisons), "nvar" (to associate with numerical variable),
#' or "stime" and "evnt" (to associate with survival data).
#' @param vset.data Variable set data, row for assignment of variable (vID) to a variable
#' set (vset).
#' @param nperms The minimum and maximum number of permutations for adaptive permutation.
#' Default is c(10,1000).
#' @param rand.seed Random seed used for reproducible computing. Default is NULL (no seed set).
#'
#' @return A data.frame with the following columns:
#' \item{vset}{The name of variable set (gene-set).}
#' \item{vIDs}{The list of variables in the variable set (gene-set).}
#' \item{dist.stat}{The distance statistic for the variable set.}
#' \item{p.vset}{The p-value.}
#' \item{nperms}{The number of permutations performed.}
#'
#' @details
#'
#' This function performs the GMRPP method described by Cao, Sahr, and Pounds (2019), which is
#' a generalization of the MRPP method described by Nettleton, Recknor, and Reecy (2008). It characterizes
#' the genomic data of a gene set as the distances between each pair of subjects computed on
#' data for genes in the gene set. This distance matrix is used to measure the association of
#' the gene set genomic data with a categorical variable by computing the sum of distances for
#' subjects belonging to different groups minus the sum of distances for subjects belonging to the same group.
#' A greater value of this statistic indicates a stronger association. For numeric variables, the
#' average of this statisic is computed over all sets of groupings defined by all possible dichotomizations.
#' For censored survival time variables, this statistic is computed over all dichotomizations defined
#' by risk sets (unique uncesored event times), with some adjustments for censoring. The method
#' detects certain complex forms of gene-set associations that are not easily identified with other methods.
#' The statistical
#' significance is determined by an adaptive permutation procedure (Pounds et al 2011) that stops evaluating
#' permuted data sets once it becomes clear the result is not statistically significant or a maximum
#' number of permutations have been performed.
#'
#' @examples
#' data(AMLex) # AML TARGET project expression data for selected genes (website)
#' data(AMLclin) # AML TARGET project overall survival data
#' data(vsets) # KEGG gene sets for AML and CML (website)
#' res<-gmrpp(AMLex,AMLclin,vsets) # Evaluate association of gene sets with overall survival
#' res[,-2] # results excluding variable IDs for brevity
#'
#' @references
#'
#' Cao X, Sahr N, Pounds S (2019) Robust Detection of Complex Gene-Set Associations. Manuscript.
#'
#' Nettleton D, Recknor J, Reecy JM (2008) Identification of differentially expressed gene categories in
#' microarray studies using nonparametric multivariate analysis. Bioinformatics 24: 192-201. PMID 18054553.
#'
#' Pounds S, Cao X, Cheng C, et al (2011) Integrated analysis of pharmacologic, clinical
#' and SNP microarray data using projection onto the most interesting statistical
#' evidence with adaptive permutation testing. International Journal of Data Mining
#' and Bioinformatics, 5, 143-57. PMID 21516175.
#'
#' @keywords
#' gene set; gene set enrichment; gene set association; multiresponse permutation procedure; adaptive permutation
#'
#' @import MASS stats
#'
#' @export
#'
# Generalized multi-response permutation procedure
gmrpp=function(data.mtx, grp.data, vset.data, nperms=c(10,1000), rand.seed=NULL) {
# Prepare data for analysis
mrpp.data=prep.data(data.mtx,grp.data,vset.data)
data.mtx=mrpp.data$data.mtx
grp.data=mrpp.data$grp.data
vset.data=mrpp.data$vset.data
vset.index=mrpp.data$vset.index
# Compute distance coefficient matrix to be used for calculations
dcoef=dist.coef.mtx(grp.data)
# Initialize values for permutation
min.perms=min(nperms) # minimum number of permutations for adaptive permutation
max.perms=max(nperms) # maximum number of permutations for adaptive permutation
n=nrow(grp.data) # number of subjects
nset=nrow(vset.index) # number of gene-sets
p.vset=rep(0,nset) # initialize p-value vector
vset.list=rep("NA",nset) # initialize vector of variable set lists
n.perms=rep(0,nset) # initialize vector of number of permutations performed
dist.stats=rep(0,nset) # initialize vector of distance statistics
set.seed(rand.seed) # set seed for reprodicibility
for (i in 1:nset) # Loop over variable sets (gene-sets)
{
# Get variables in this variable set
vset.ind=(vset.index$start.index[i]:vset.index$end.index[i]) # rows with variables in this variable set
vset.vIDs=vset.data$vID[vset.ind] # ids of variables in this variable set
vset.list[i]=paste(sort(vset.vIDs),collapse=" | ") # string listing all variables in this variable set
# Extract data matrix for this variable set
vset.mtx=matrix(data.mtx[vset.vIDs,],length(vset.ind))
# Compute distances for this variable set
vset.mtx=t(vset.mtx)
vset.dist=dist(vset.mtx)
vset.dist=as.matrix(vset.dist)
# Compute observed distance stats for this variable set
dist.stats[i]=grp.dist(vset.dist,dcoef)
# Perform adaptive permutation
while((p.vset[i]<min.perms)&&(n.perms[i]<max.perms)) # Loop conditions
{
n.perms[i]=n.perms[i]+1 # increment number of permutations for this variable set
perm.index=sample(n) # generate this permutation
vset.dist=vset.dist[perm.index,] # permute rows of the distance matrix
vset.dist=vset.dist[,perm.index] # permute columns of the distance matrix
perm.stats=grp.dist(vset.dist,dcoef) # compute the statistic with permuted distance matrix
p.vset[i]=p.vset[i]+as.numeric(perm.stats>=dist.stats[i]) # update p-value
} # End loop for adaptive permutation of this gene-set
} # End loop over gene-sets
# Compute, package, and return final result data.frame
p.vset=p.vset/n.perms # p-value is determined by dividing by number of permutations
res=cbind.data.frame(vset=vset.index$value, # name of variable set (gene-set)
vIDs=vset.list, # list of variables in the variable set (gene-set)
dist.stat=dist.stats, # distance statistic for the variable set
p.vset=p.vset, # p-value
nperms=n.perms) # number of permutations performed
return(res)
}
# Compute phenotype distance coefficient matrix
dist.coef.mtx=function(ph.data) {
ph.names=colnames(ph.data) # get the names of the phenotype
if (any(ph.names=="grp")) # if a group variable, use that function
{
res=grp.coef.mtx(ph.data[,"grp"])
return(res)
}
if (any(ph.names=="nvar")) # if a numeric variable, use that function
{
res=nvar.coef.mtx(ph.data[,"nvar"])
return(res)
}
if (any(ph.names=="stime")) # if a survival variable, use that function
{
res=surv.coef.mtx(ph.data[,c("stime","evnt")])
return(res)
}
}
# Compute coefficient matrix for association with group
grp.coef.mtx=function(grp) {
grp=as.character(grp)
n=length(grp) # obtain sample size
res=matrix(1,n,n) # initialize matrix
ugrp=unique(grp) # get unique group labels
ngrp=length(ugrp) # number of unique groups
for (i in 1:ngrp) # loop over groups
{
in.grp=is.element(grp,ugrp[i]) # find those in this group
res[in.grp,in.grp]=-1 # set coefficient to negative 1 for all pairs in this group
} # end loop over groups
return(res) # return the matrix
}
# Compute distance coefficient matrix for numerical variable
nvar.coef.mtx=function(nvar) {
n=length(nvar) # sample size
res=matrix(0,n,n) # initialize coefficient matrix
uval=sort(unique(nvar)) # unique numerical values
thresh=(uval[-1]+uval[-length(uval)])/2 # all possible thresholds for the numerical value
for(i in 1:length(thresh)) # loop over thresholds
{
grps=as.numeric(nvar<thresh[i]) # define group vector based on this threshold
temp=grp.coef.mtx(grps) # define coefficient matrix for group comparisons
res=res+temp # add it to the result
}
# return the result
return(res)
}
# Order and index data set by values of a specific variable
order.index.dset=function(dset, vr) {
v=dset[,vr] # extract the variable
ord=order(v) # vector of indices to order by that variable
dset=dset[ord,] # order dset by that variable
m=nrow(dset) # get size of data set
new.val=which(dset[-1,vr]!=dset[-m,vr]) # find which rows have a new value of variable
ind1=c(1,new.val+1) # first row with a unique value of the variable
ind2=c(new.val,m) # last row with a unique value of the variable
# data.frame with indices of dset that have unique values of the variable vr
dset.ind=cbind.data.frame(start.index=ind1,
end.index=ind2,
value=dset[ind1,vr])
# create a list to return
res=list(dset=dset,
indx=dset.ind)
# return that list
return(res)
}
# Compute distance coefficient matrix for survival outcome
surv.coef.mtx=function(stime.evnt) {
n=nrow(stime.evnt) # sample size
res=matrix(0,n,n) # initialize matrix
evnt.indx=which(stime.evnt[,2]>0) # index of rows with an event
time.pts=sort(unique(stime.evnt[evnt.indx,1])) # unique event times
for (i in 1:length(time.pts)) # loop over unique event times
{
early.event=(stime.evnt[,1]<=time.pts[i])&
(stime.evnt[,2]==1) # subjects with event before this event time
late.event1=(stime.evnt[,1]>time.pts[i]) # subjects with longer follow-up than this event time
late.event2=(stime.evnt[,1]==time.pts[i])& # subjects censored at this event time
(stime.evnt[,2]==0)
late.event=(late.event1|late.event2) # subjects that definitely have event after this event time
temp=matrix(0,n,n) # initialize distance coefficient matrix to all zeroes
temp[early.event,early.event]=-1 # assign distance coefficient matrix to -1 for pairs of subjects with earlier events
temp[late.event,late.event]=-1 # assign distance coefficient matrix to -1 for pairs of subjects with later events
temp[early.event,late.event]=1 # assign distance coefficient matrix to +1 for pairs of subjects with early and late event
temp[late.event,early.event]=1 # assign distance coefficient matrix to +1 for pairs of subjects with early and late event
res=res+temp # add result for this time point to cumulative coefficient matrix
} # end loop over risk sets
return(res) # return the matrix
}
####################################
# Prepare data for gmrpp
prep.data=function(data.mtx,grp.data,vset.data=NULL)
{
# Merge data sets, match order of subjects in columns of data.mtx and rows of grp.data
id.mtx=colnames(data.mtx) # ids from data.mtx
id.grp=grp.data$ID # ids from grp.data
id.mtch=match.order.ids(id.mtx,id.grp) # function to match order of subjects
data.mtx=data.mtx[,id.mtch$ind1] # put columns of data.mtx in that order
grp.data=grp.data[id.mtch$ind2,] # put rows of grp.data in that order
# check merging before continuing; stop if something didn't work
id.mtx=colnames(data.mtx) # ids from data.mtx
id.grp=grp.data$ID # ids from grp.data
if (any(id.mtx!=id.grp)) # stop if something didn't work
stop("Error in merging grp.data and data.mtx.")
n=nrow(grp.data) # Now get the sample size
#rss=sqrt(sum(dcoef^2)) # compute a scaling factor to make stats more comparable across gene-sets
#dcoef=dcoef/rss # scale the distance coefficient matrix
# Order and index variable set data for faster computing
vset.index=NULL
if (!is.null(vset.data))
{
vset.data0=vset.data # keep original variable set data to double-check later
ord.dset=order.index.dset(vset.data,"vset") # function to order gene-set assignments by gene-set
vset.data=ord.dset$dset # ordered gene-set data
vset.index=ord.dset$indx # indexes of ordered gene-set data
nset=nrow(vset.index) # number of gene-sets
# Double-check the re-indexed variable set data
ord0=order(vset.data0$vset,vset.data0$vID) # order original and re-indexed vset data in the same way
ord1=order(vset.data$vset,vset.data$vID)
if (any(vset.data0$vID[ord0]!=vset.data$vID[ord1])||
any(vset.data0$vset[ord0]!=vset.data$vset[ord1]))
{
stop("Error in indexing vset.data.")
}
# Check that each indexed variable set has only one value for vset variable
for (i in 1:nset)
{
indx=vset.index[i,1]:vset.index[i,2]
uniq.vset=unique(vset.data$vset[indx])
if (length(uniq.vset)!=1)
stop("Error in index vset.data")
}
}
res=list(data.mtx=data.mtx,
grp.data=grp.data,
vset.data=vset.data,
vset.index=vset.index)
return(res)
}
############################################
# generate data.frame to match the order of two vectors of identifiers
match.order.ids=function(id1,id2) {
temp1=cbind.data.frame(id=id1,ind1=1:length(id1)) # create data.frame with indices for id1
temp2=cbind.data.frame(id=id2,ind2=1:length(id2)) # create data.frame with indices for id2
res=merge(temp1,temp2,by=1) # merged data.frame
# Warn user if some elements are not present in combined data.frame
if (nrow(res)<length(id1))
warning(paste0("Failed to merge ",length(id1)-nrow(res)," elements of id1."))
if (nrow(res)<length(id2))
warning(paste0("Failed to merge ",length(id2)-nrow(res)," elements of id2."))
# return the result
return(res)
}
#############################################
# function to compute distance statistic
grp.dist=function(dist.mtx, # distance matrix for gene-set
dcoefs) # coefficients computed from phenotype data
{
res=sum(dist.mtx*dcoefs)-sum(dist.mtx)*sum(dcoefs)
res=res/sqrt(sum(dist.mtx^2)*sum(dcoefs^2))
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.