knitr::opts_chunk$set(
    collapse = TRUE,comment = "#>",
    echo = TRUE,cache = TRUE,
    dev = "png")
ot_eval = TRUE

\def\T{\text{T}} \newcommand{\bf}[1]{\mathbf{#1}} \newcommand{\bigPar}[1]{\left(#1\right)} \newcommand{\bigCur}[1]{\left{#1\right}} \newcommand{\bcSqu}[2]{\left[#1 \middle| #2\right]} \newcommand{\cE}[2]{E\bcSqu{#1}{#2}} \newcommand{\nexp}[1]{\exp\bigCur{#1}} \newcommand{\ind}[1]{1\bigCur{#1}} \newcommand{\w}[1]{\widehat{#1}}

Overview

Assuming all software dependencies and ROKET [@little2023associating] installation are installed, we can begin.

# Load libraries
req_packs = c("devtools","smarter","ggplot2","reshape2",
  "survival","ggdendro","MiRKAT","ROKET")
for(pack in req_packs){
  library(package = pack,character.only = TRUE)
}

# List package's exported functions
ls("package:ROKET")

# Fix seed
set.seed(2)

Distance Motivation

For $i = 1,\ldots,N$ and $g = 1,\ldots,G$, let

We would like to calculate the distance between the $i$th and $j$th samples in terms of mutated genes $\bf{Z}_i$ and $\bf{Z}_j$, denoted by $d\bigPar{\bf{Z}_i,\bf{Z}_j}$.

Optimal Transport

Background papers

Several optimal transport (OT) references to get familiarized with the framework, objective functions, updating equations, entropic regularizations, types of OT are provided:

Basic Idea

For the $i$th and $j$th samples, let

Balanced OT

If the mass of the two vectors are equal or normalized such that $\sum_g Z_{ig} = \sum_g Z_{jg}$, we could use balanced optimal transport.

Unbalanced OT

If the mass of the two vectors are not equal, $\sum_g Z_{ig} \neq \sum_g Z_{jg}$, we could use unbalanced optimal transport with penalty parameters.

Simulated Example

The code below will simulate samples and mutated genes. We welcome the reader to manipulate the following input arguments:

Ideally, PP should be much less than GG to allow multiple genes to be allocated to each pathway.

# number of samples
NN = 30
NN_nms = sprintf("S%s",seq(NN))

# number of pathways
PP = 4
PP_nms = sprintf("P%s",seq(PP))

# number of genes
GG = 30
GG_nms = sprintf("G%s",seq(GG))

# bound for gene similarity of two genes on same or different pathway
bnd_same = 0.75

# Gene and pathway relationship
GP = smart_df(PATH = sample(seq(PP),GG,replace = TRUE),
  GENE = seq(GG))
table(GP$PATH)

# gene-gene similarity matrix
GS = matrix(NA,GG,GG)
dimnames(GS) = list(GG_nms,GG_nms)
diag(GS) = 1

tmp_mat = t(combn(seq(GG),2))

for(ii in seq(nrow(tmp_mat))){

  G1 = tmp_mat[ii,1]
  G2 = tmp_mat[ii,2]
  same = GP$PATH[GP$GENE == G1] == GP$PATH[GP$GENE == G2]

  if( same )
    GS[G1,G2] = runif(1,bnd_same,1)
  else
    GS[G1,G2] = runif(1,0,1 - bnd_same)

}
GS[lower.tri(GS)] = t(GS)[lower.tri(GS)]
# round(GS,3)

Gene-Gene Similarity

Let's take a look at the gene similarity matrix.

show_tile = function(MAT,LABEL,TYPE = NULL,
  LABx = NULL,LABy = NULL,DIGITS = 1){

  min_val = min(MAT)
  max_val = max(MAT)
  med_val = (min_val + max_val) / 2

  if( is.null(TYPE) ) stop("Specify TYPE")

  if( isSymmetric(MAT) )
    MAT[upper.tri(MAT,diag = !TRUE)] = NA

  if( TYPE == "GSIM" ){
    max_val = 1
    med_val = 0.5
  }
  # else if( TYPE == "DIST" ){
  #  max_val = max(c(1,max(MAT)))
  #}

  dat = melt(MAT,na.rm = TRUE)
  # class(dat); dim(dat); dat[1:5,]

  gg = ggplot(data = dat,aes(x = Var1,y = Var2,fill = value)) +
    geom_tile(color = "black") + ggtitle(LABEL) +
    labs(fill = "Value")

  if( is.null(LABx) ){
    gg = gg + xlab("")
  } else {
    gg = gg + xlab(LABx)
  }

  if( is.null(LABy) ){
    gg = gg + ylab("")
  } else {
    gg = gg + ylab(LABy)
  }

  if( max(dim(MAT)) <= 10 && DIGITS >= 0 )
    gg = gg + geom_text(mapping = aes(label = smart_digits(value,DIGITS)))

  if( TYPE %in% c("GSIM","DIST") ){
    gg = gg + scale_fill_gradient2(midpoint = med_val,low = "deepskyblue",
        mid = "white",high = "red",limit = c(min_val,max_val))
  } else if( TYPE == "MUT" ){
    gg = gg + scale_fill_gradient2(low = "black",
        high = "red",limit = c(min_val,max_val))
  }

  gg = gg + guides(fill = guide_colorbar(frame.colour = "black")) +
    scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
    scale_y_discrete(guide = guide_axis(n.dodge = 2))

  gg = gg + theme(legend.position = "right",
        # legend.key.width = unit(1.5,'cm'),
        legend.key.height = unit(1,'cm'),
        legend.text = element_text(size = 12),
        legend.title = element_text(size = 12,hjust = 0.5),
        text = element_text(size = 12),
        panel.background = element_blank(),
        panel.grid.major = element_line(colour = "grey50",
            size = 0.5,linetype = "dotted"),
        axis.text = element_text(face = "italic"),
        axis.text.x = element_text(angle = 0),
        plot.title = element_text(hjust = 0.5))

  return(gg)
}

hout = hclust(as.dist(1 - GS))
ord = hout$labels[hout$order]

show_tile(MAT = GS[ord,ord],
  LABEL = "Simulated Gene Similarity",
  TYPE = "GSIM",DIGITS = 2)

show_tile(MAT = 1 - GS[ord,ord],
  LABEL = "Simulated Gene Dissimilarity",
  TYPE = "GSIM",DIGITS = 2)

The function show_tile() is inherent to this vignette and not part of the ROKET package.

Simulate mutated gene statuses

# Mutated gene statuses
prob_mut = 0.2
prob_muts = c(1 - prob_mut,prob_mut)
while(TRUE){
  ZZ = matrix(sample(c(0,1),NN*GG,replace = TRUE,prob = prob_muts),NN,GG)

  # Ensure each sample has at least one mutated gene
  if( min(rowSums(ZZ)) > 0 ) break
}
dimnames(ZZ) = list(NN_nms,GG_nms)

show_tile(MAT = ZZ,
  LABEL = "Mutation Status: Gene by Sample",
  TYPE = "MUT",DIGITS = 0)

# Store all distances
DD = array(data = NA,dim = c(NN,NN,5))
dimnames(DD)[1:2] = list(NN_nms,NN_nms)
dimnames(DD)[[3]] = c("EUC","OT_Balanced",sprintf("OT_LAM%s",c(0.5,1.0,5.0)))

We can look at the distribution of gene mutation frequencies.

freq = colSums(ZZ); # freq
dat = smart_df(GENE = names(freq),FREQ = as.integer(freq))
dat$GENE = factor(dat$GENE,levels = names(sort(freq,decreasing = TRUE)))
# dat

ggplot(data = dat,mapping = aes(x = GENE,y = FREQ)) +
  geom_bar(stat = "identity") +
  xlab("Gene") + ylab("Mutation Frequency") +
  scale_x_discrete(guide = guide_axis(n.dodge = 2))

Euclidean distance

We can calculate the Euclidean distance, which does not incorporate relationships between pairs of genes.

DD[,,"EUC"] = as.matrix(dist(ZZ,diag = TRUE,upper = TRUE))

hout = hclust(as.dist(DD[,,"EUC"]))
ord = hout$labels[hout$order]
show_tile(MAT = DD[,,"EUC"][ord,ord],
  LABEL = "Euclidean Pairwise Distances",
  TYPE = "DIST",DIGITS = 2)

OT distance

Code between two samples

To demonstrate ROKET's functionality, the code below will run balanced OT (pre-normalizing input vectors) between two samples. Regardless of the values specified by LAMBDA1 and LAMBDA2 arguments, we need to set balance = TRUE. The OT cost matrix argument corresponds to 1 - GS, one minus the gene similarity matrix.

# Pick two samples
ii = 1
jj = 2
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
  COST = 1 - GS,EPS = 1e-3,LAMBDA1 = 1,
  LAMBDA2 = 1,balance = TRUE,verbose = FALSE)
# str(outOT)

# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,LABEL = "Balanced OT (showing only genes mutated in each sample)",
  TYPE = "DIST",LABx = sprintf("Sample %s",ii),
  LABy = sprintf("Sample %s",jj),
  DIGITS = 2)

# Pairwise distance
outOT$DIST

Let's try again but with unbalanced OT and $\lambda = 0.5$. We need to set balance = FALSE and specify LAMBDA1 = 0.5 and LAMBDA2 = 0.5.

ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
LAM = 0.5
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
  COST = 1 - GS,EPS = 1e-3,LAMBDA1 = LAM,
  LAMBDA2 = LAM,balance = FALSE,verbose = FALSE)
# str(outOT)

# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,
  LABEL = "Unbalanced OT (showing only genes mutated in each sample)",TYPE = "DIST",
  LABx = sprintf("Sample %s",ii),
  LABy = sprintf("Sample %s",jj),
  DIGITS = 2)

# Pairwise distance
outOT$DIST

Code between all sample pairs

ROKET can run optimal transport calculations across all $N$ choose 2 samples. Below is code to run balanced OT.

outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
  EPS = 1e-3,balance = TRUE,LAMBDA1 = 1,
  LAMBDA2 = 1,conv = 1e-5,max_iter = 3e3,
  ncores = 1,verbose = FALSE)

hout = hclust(as.dist(outOTs))
ord = hout$labels[hout$order]
show_tile(MAT = outOTs[ord,ord],
  LABEL = "Balanced OT Distances",
  TYPE = "DIST",DIGITS = 1)

We can run calculations for various $\lambda$ values. For shorthand, $\lambda$ = Inf corresponds to balanced OT.

LAMs = c(0,0.5,1.0,5.0)

for(LAM in LAMs){
  # LAM = LAMs[2]

  BAL = ifelse(LAM == 0,TRUE,FALSE)
  LAM2 = ifelse(BAL,1,LAM)

  outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
    EPS = 1e-3,balance = BAL,LAMBDA1 = LAM2,
    LAMBDA2 = LAM2,conv = 1e-5,max_iter = 3e3,
    ncores = 1,verbose = FALSE)

  hout = hclust(as.dist(outOTs))
  ord = hout$labels[hout$order]

  LABEL = ifelse(BAL,"OT Distances (Balanced)",
    sprintf("OT Distances (Lambda = %s)",LAM))
  LABEL2 = ifelse(BAL,"OT_Balanced",sprintf("OT_LAM%s",LAM))

  gg = show_tile(MAT = outOTs[ord,ord],
    LABEL = LABEL,TYPE = "DIST",DIGITS = 2)
  print(gg)

  DD[,,LABEL2] = outOTs
  rm(outOTs)
}

Dendrograms

We can see that Euclidean distance calculations on gene mutation statuses alone does not lead to strong evidence of sample clusters. However optimal transport-based distance calculations with integrated gene-gene similarities provide stronger evidence of sample clusters.

nms = dimnames(DD)[[3]]; nms

for(nm in nms){
  # nm = nms[2]

  hout = hclust(as.dist(DD[,,nm]))
  hdend = as.dendrogram(hout)
  dend_data = dendro_data(hdend,type = "rectangle")
  gg = ggplot(dend_data$segments) +
    geom_segment(aes(x = x,y = y,xend = xend,yend = yend)) +
    ggtitle(nm) + xlab("") + ylab("") +
    geom_text(data = dend_data$labels,
      mapping = aes(x,y,label = label),vjust = 0.5,hjust = 1) +
    theme_dendro() + coord_flip() +
    theme(plot.title = element_text(hjust = 0.5))
  print(gg)
  rm(hout,hdend,dend_data,gg)
}

Kernel Regression and Association

The next step is to transform distance matrices into centered kernel matrices to perform hypothesis testing on our constructed kernels.

Models

For a binary or continuous outcome, $Y_i$, fitted with a generalized linear model, we have

$$ g\bigPar{E\bcSqu{Y_i}{\bf{X}_i,\bf{Z}_i}} = \bf{X}_i^\T \bf{\beta} + f(\bf{Z}_i) = \bf{X}_i^\T \bf{\beta} + \bf{K}_i^\T \bf{\alpha} $$

and for Cox proportional hazards for survival outcomes, we have

$$ h(t;\bf{X}_i,\bf{Z}_i) = h_0(t) \nexp{\bf{X}_i^\T \bf{\beta} + f(\bf{Z}_i)} = h_0(t) \nexp{\bf{X}_i^\T \bf{\beta} + \bf{K}_i^\T \bf{\alpha}} $$

where

Distance to Kernel

The matrix $\bf{K}$ is constructed from the distance matrix (Euclidean or optimal transport), $\bf{D}$, by double centering the rows and columns of $\bf{D}$ with the formula

$$ \tilde{\bf{K}} = -\frac{1}{2} \bigPar{\bf{I}_N - \bf{J}_N \bf{J}_N^\T} \bf{D}^2 \bigPar{\bf{I}_N - \bf{J}_N \bf{J}_N^\T} $$

where

Since $\tilde{\bf{K}}$ is not guaranteed to be positive semi-definite, we perform spectral decomposition and replace negative eigenvalues with zero and re-calculate the kernel to arrive at $\bf{K}$.

For a single candidate distance matrix, transform the distance matrix to a kernel matrix and convert the object into a list, denoted by the object KK, code is provided below from the R package, MiRKAT [@zhao2015testing;@plantinga2017mirkat].

# For example, with Euclidean distance
KK = MiRKAT::D2K(D = DD[,,"EUC"])
KK = list(EUC = KK)

If there are multiple distance matrices, in our case, contained in an array DD, store them into a list of kernel matrices KK. Example code is below.

KK = list()

for(nm in dimnames(DD)[[3]]){
  KK[[nm]] = MiRKAT::D2K(D = DD[,,nm])
}

Hypothesis Testing

The user needs to pre-specify and fit the null model,

$$ H_0: \bf{\alpha} = 0, $$

of baseline covariates, $\bf{X}_i$, to one of the three outcome models to obtain continuous, logistic, or martingale residuals (RESI). Some example code is provided below.

# Continuous
out_LM = lm(Y ~ .,data = data.frame(X))
RESI = residuals(out_LM)

# Logistic
out_LOG = glm(Y ~ .,data = data.frame(X),family = "binomial")
RESI = residuals(out_LOG)

# Survival
out_CX = coxph(Surv(TIME,CENS) ~ .,data = data.frame(X))
RESI = residuals(out_CX)

With one or multiple candidate kernels, ROKET will take

to calculate individual kernel p-values as well as omnibus tests. An omnibus test assesses the significance of the minimum p-value kernel among kernels considered. The function kernTEST requires names(RESI) and dimension names per object as well as row/column names per kernel within aKK are named for sample order consistency. Sample code is provided below. Setting verbose = TRUE allows the user to track the permutation's progress, especially when requesting hundreds of thousands of permutations.

nPERMS = 1e5
nKK = length(KK)

# Array of kernels
aKK = array(data = NA,dim = dim(DD),
  dimnames = dimnames(DD))
for(nm in dimnames(DD)[[3]]){
  aKK[,,nm] = KK[[nm]]
}

# Create OMNI matrix
OMNI = matrix(0,nrow = 2,ncol = dim(aKK)[3],
  dimnames = list(c("EUC","OT"),dimnames(aKK)[[3]]))
OMNI["EUC","EUC"] = 1
OMNI["OT",grepl("^OT",colnames(OMNI))] = 1
OMNI

# Hypothesis Testing
ROKET::kernTEST(RESI = RESI,
  KK = aKK,
  OMNI = OMNI,
  nPERMS = nPERMS,
  ncores = 1)

The final output contains individual kernel p-values and the omnibus p-value.

Have fun with utilizing kernel regression and optimal transport frameworks with ROKET!

Session Info

sessionInfo()

References



pllittle/ROKET documentation built on March 5, 2025, 1:42 a.m.