Introduction"

knitr::opts_chunk$set(
    collapse = TRUE,comment = "#>",
    echo = TRUE,cache = TRUE,
    dev = "png")
ot_eval = TRUE

\def\T{\text{T}} \newcommand{\bf}[1]{\mathbf{#1}} \newcommand{\bigPar}[1]{\left(#1\right)} \newcommand{\bigCur}[1]{\left{#1\right}} \newcommand{\bcSqu}[2]{\left[#1 \middle| #2\right]} \newcommand{\cE}[2]{E\bcSqu{#1}{#2}} \newcommand{\nexp}[1]{\exp\bigCur{#1}} \newcommand{\ind}[1]{1\bigCur{#1}} \newcommand{\w}[1]{\widehat{#1}}

Overview

Assuming all software dependencies and ROKET [@little2023associating] installation are installed, we can begin.

# Load libraries
req_packs = c("devtools","smarter","ggplot2","reshape2",
  "survival","ggdendro","MiRKAT","ROKET")
for(pack in req_packs){
  library(package = pack,character.only = TRUE)
}

# List package's exported functions
ls("package:ROKET")

# Fix seed
set.seed(2)

Distance Motivation

For $i = 1,\ldots,N$ and $g = 1,\ldots,G$, let

We would like to calculate the distance between the $i$th and $j$th samples in terms of mutated genes $\bf{Z}_i$ and $\bf{Z}_j$, denoted by $d\bigPar{\bf{Z}_i,\bf{Z}_j}$.

Optimal Transport

Background papers

Several optimal transport (OT) references to get familiarized with the framework, objective functions, updating equations, entropic regularizations, types of OT are provided:

Basic Idea

For the $i$th and $j$th samples, let

Balanced OT

If the mass of the two vectors are equal or normalized such that $\sum_g Z_{ig} = \sum_g Z_{jg}$, we could use balanced optimal transport.

Unbalanced OT

If the mass of the two vectors are not equal, $\sum_g Z_{ig} \neq \sum_g Z_{jg}$, we could use unbalanced optimal transport with penalty parameters.

Simulated Example

The code below will simulate samples and mutated genes. We welcome the reader to manipulate the following input arguments:

Ideally, PP should be much less than GG to allow multiple genes to be allocated to each pathway.

# number of samples
NN = 30
NN_nms = sprintf("S%s",seq(NN))

# number of pathways
PP = 4
PP_nms = sprintf("P%s",seq(PP))

# number of genes
GG = 30
GG_nms = sprintf("G%s",seq(GG))

# bound for gene similarity of two genes on same or different pathway
bnd_same = 0.75

# Gene and pathway relationship
GP = smart_df(PATH = sample(seq(PP),GG,replace = TRUE),
  GENE = seq(GG))
table(GP$PATH)

# gene-gene similarity matrix
GS = matrix(NA,GG,GG)
dimnames(GS) = list(GG_nms,GG_nms)
diag(GS) = 1

tmp_mat = t(combn(seq(GG),2))

for(ii in seq(nrow(tmp_mat))){

  G1 = tmp_mat[ii,1]
  G2 = tmp_mat[ii,2]
  same = GP$PATH[GP$GENE == G1] == GP$PATH[GP$GENE == G2]

  if( same )
    GS[G1,G2] = runif(1,bnd_same,1)
  else
    GS[G1,G2] = runif(1,0,1 - bnd_same)

}
GS[lower.tri(GS)] = t(GS)[lower.tri(GS)]
# round(GS,3)

Gene-Gene Similarity

Let's take a look at the gene similarity matrix.

show_tile = function(MAT,LABEL,TYPE = NULL,
  LABx = NULL,LABy = NULL,DIGITS = 1){

  min_val = min(MAT)
  max_val = max(MAT)
  med_val = (min_val + max_val) / 2

  if( is.null(TYPE) ) stop("Specify TYPE")

  if( isSymmetric(MAT) )
    MAT[upper.tri(MAT,diag = !TRUE)] = NA

  if( TYPE == "GSIM" ){
    max_val = 1
    med_val = 0.5
  }
  # else if( TYPE == "DIST" ){
  #  max_val = max(c(1,max(MAT)))
  #}

  dat = melt(MAT,na.rm = TRUE)
  # class(dat); dim(dat); dat[1:5,]

  gg = ggplot(data = dat,aes(x = Var1,y = Var2,fill = value)) +
    geom_tile(color = "black") + ggtitle(LABEL) +
    labs(fill = "Value")

  if( is.null(LABx) ){
    gg = gg + xlab("")
  } else {
    gg = gg + xlab(LABx)
  }

  if( is.null(LABy) ){
    gg = gg + ylab("")
  } else {
    gg = gg + ylab(LABy)
  }

  if( max(dim(MAT)) <= 10 && DIGITS >= 0 )
    gg = gg + geom_text(mapping = aes(label = smart_digits(value,DIGITS)))

  if( TYPE %in% c("GSIM","DIST") ){
    gg = gg + scale_fill_gradient2(midpoint = med_val,low = "deepskyblue",
        mid = "white",high = "red",limit = c(min_val,max_val))
  } else if( TYPE == "MUT" ){
    gg = gg + scale_fill_gradient2(low = "black",
        high = "red",limit = c(min_val,max_val))
  }

  gg = gg + guides(fill = guide_colorbar(frame.colour = "black")) +
    scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
    scale_y_discrete(guide = guide_axis(n.dodge = 2))

  gg = gg + theme(legend.position = "right",
        # legend.key.width = unit(1.5,'cm'),
        legend.key.height = unit(1,'cm'),
        legend.text = element_text(size = 12),
        legend.title = element_text(size = 12,hjust = 0.5),
        text = element_text(size = 12),
        panel.background = element_blank(),
        panel.grid.major = element_line(colour = "grey50",
            size = 0.5,linetype = "dotted"),
        axis.text = element_text(face = "italic"),
        axis.text.x = element_text(angle = 0),
        plot.title = element_text(hjust = 0.5))

  return(gg)
}

hout = hclust(as.dist(1 - GS))
ord = hout$labels[hout$order]

show_tile(MAT = GS[ord,ord],
  LABEL = "Simulated Gene Similarity",
  TYPE = "GSIM",DIGITS = 2)

show_tile(MAT = 1 - GS[ord,ord],
  LABEL = "Simulated Gene Dissimilarity",
  TYPE = "GSIM",DIGITS = 2)

The function show_tile() is inherent to this vignette and not part of the ROKET package.

Simulate mutated gene statuses

# Mutated gene statuses
prob_mut = 0.2
prob_muts = c(1 - prob_mut,prob_mut)
while(TRUE){
  ZZ = matrix(sample(c(0,1),NN*GG,replace = TRUE,prob = prob_muts),NN,GG)

  # Ensure each sample has at least one mutated gene
  if( min(rowSums(ZZ)) > 0 ) break
}
dimnames(ZZ) = list(NN_nms,GG_nms)

show_tile(MAT = ZZ,
  LABEL = "Mutation Status: Gene by Sample",
  TYPE = "MUT",DIGITS = 0)

# Store all distances
DD = array(data = NA,dim = c(NN,NN,5))
dimnames(DD)[1:2] = list(NN_nms,NN_nms)
dimnames(DD)[[3]] = c("EUC","OT_Balanced",sprintf("OT_LAM%s",c(0.5,1.0,5.0)))

We can look at the distribution of gene mutation frequencies.

freq = colSums(ZZ); # freq
dat = smart_df(GENE = names(freq),FREQ = as.integer(freq))
dat$GENE = factor(dat$GENE,levels = names(sort(freq,decreasing = TRUE)))
# dat

ggplot(data = dat,mapping = aes(x = GENE,y = FREQ)) +
  geom_bar(stat = "identity") +
  xlab("Gene") + ylab("Mutation Frequency") +
  scale_x_discrete(guide = guide_axis(n.dodge = 2))

Euclidean distance

We can calculate the Euclidean distance, which does not incorporate relationships between pairs of genes.

DD[,,"EUC"] = as.matrix(dist(ZZ,diag = TRUE,upper = TRUE))

hout = hclust(as.dist(DD[,,"EUC"]))
ord = hout$labels[hout$order]
show_tile(MAT = DD[,,"EUC"][ord,ord],
  LABEL = "Euclidean Pairwise Distances",
  TYPE = "DIST",DIGITS = 2)

OT distance

Code between two samples

To demonstrate ROKET's functionality, the code below will run balanced OT (pre-normalizing input vectors) between two samples. Regardless of the values specified by LAMBDA1 and LAMBDA2 arguments, we need to set balance = TRUE. The OT cost matrix argument corresponds to 1 - GS, one minus the gene similarity matrix.

# Pick two samples
ii = 1
jj = 2
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
  COST = 1 - GS,EPS = 1e-3,LAMBDA1 = 1,
  LAMBDA2 = 1,balance = TRUE,verbose = FALSE)
# str(outOT)

# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,LABEL = "Balanced OT (showing only genes mutated in each sample)",
  TYPE = "DIST",LABx = sprintf("Sample %s",ii),
  LABy = sprintf("Sample %s",jj),
  DIGITS = 2)

# Pairwise distance
outOT$DIST

Let's try again but with unbalanced OT and $\lambda = 0.5$. We need to set balance = FALSE and specify LAMBDA1 = 0.5 and LAMBDA2 = 0.5.

ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
LAM = 0.5
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
  COST = 1 - GS,EPS = 1e-3,LAMBDA1 = LAM,
  LAMBDA2 = LAM,balance = FALSE,verbose = FALSE)
# str(outOT)

# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,
  LABEL = "Unbalanced OT (showing only genes mutated in each sample)",TYPE = "DIST",
  LABx = sprintf("Sample %s",ii),
  LABy = sprintf("Sample %s",jj),
  DIGITS = 2)

# Pairwise distance
outOT$DIST

Code between all sample pairs

ROKET can run optimal transport calculations across all $N$ choose 2 samples. Below is code to run balanced OT.

outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
  EPS = 1e-3,balance = TRUE,LAMBDA1 = 1,
  LAMBDA2 = 1,conv = 1e-5,max_iter = 3e3,
  ncores = 1,verbose = FALSE)

hout = hclust(as.dist(outOTs))
ord = hout$labels[hout$order]
show_tile(MAT = outOTs[ord,ord],
  LABEL = "Balanced OT Distances",
  TYPE = "DIST",DIGITS = 1)

We can run calculations for various $\lambda$ values. For shorthand, $\lambda$ = Inf corresponds to balanced OT.

LAMs = c(0,0.5,1.0,5.0)

for(LAM in LAMs){
  # LAM = LAMs[2]

  BAL = ifelse(LAM == 0,TRUE,FALSE)
  LAM2 = ifelse(BAL,1,LAM)

  outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
    EPS = 1e-3,balance = BAL,LAMBDA1 = LAM2,
    LAMBDA2 = LAM2,conv = 1e-5,max_iter = 3e3,
    ncores = 1,verbose = FALSE)

  hout = hclust(as.dist(outOTs))
  ord = hout$labels[hout$order]

  LABEL = ifelse(BAL,"OT Distances (Balanced)",
    sprintf("OT Distances (Lambda = %s)",LAM))
  LABEL2 = ifelse(BAL,"OT_Balanced",sprintf("OT_LAM%s",LAM))

  gg = show_tile(MAT = outOTs[ord,ord],
    LABEL = LABEL,TYPE = "DIST",DIGITS = 2)
  print(gg)

  DD[,,LABEL2] = outOTs
  rm(outOTs)
}

Dendrograms

We can see that Euclidean distance calculations on gene mutation statuses alone does not lead to strong evidence of sample clusters. However optimal transport-based distance calculations with integrated gene-gene similarities provide stronger evidence of sample clusters.

nms = dimnames(DD)[[3]]; nms

for(nm in nms){
  # nm = nms[2]

  hout = hclust(as.dist(DD[,,nm]))
  hdend = as.dendrogram(hout)
  dend_data = dendro_data(hdend,type = "rectangle")
  gg = ggplot(dend_data$segments) +
    geom_segment(aes(x = x,y = y,xend = xend,yend = yend)) +
    ggtitle(nm) + xlab("") + ylab("") +
    geom_text(data = dend_data$labels,
      mapping = aes(x,y,label = label),vjust = 0.5,hjust = 1) +
    theme_dendro() + coord_flip() +
    theme(plot.title = element_text(hjust = 0.5))
  print(gg)
  rm(hout,hdend,dend_data,gg)
}

Kernel Regression and Association

The next step is to transform distance matrices into centered kernel matrices to perform hypothesis testing on our constructed kernels.

Models

For a binary or continuous outcome, $Y_i$, fitted with a generalized linear model, we have

$$ g\bigPar{E\bcSqu{Y_i}{\bf{X}_i,\bf{Z}_i}} = \bf{X}_i^\T \bf{\beta} + f(\bf{Z}_i) = \bf{X}_i^\T \bf{\beta} + \bf{K}_i^\T \bf{\alpha} $$

and for Cox proportional hazards for survival outcomes, we have

$$ h(t;\bf{X}_i,\bf{Z}_i) = h_0(t) \nexp{\bf{X}_i^\T \bf{\beta} + f(\bf{Z}_i)} = h_0(t) \nexp{\bf{X}_i^\T \bf{\beta} + \bf{K}_i^\T \bf{\alpha}} $$

where

Distance to Kernel

The matrix $\bf{K}$ is constructed from the distance matrix (Euclidean or optimal transport), $\bf{D}$, by double centering the rows and columns of $\bf{D}$ with the formula

$$ \tilde{\bf{K}} = -\frac{1}{2} \bigPar{\bf{I}_N - \bf{J}_N \bf{J}_N^\T} \bf{D}^2 \bigPar{\bf{I}_N - \bf{J}_N \bf{J}_N^\T} $$

where

Since $\tilde{\bf{K}}$ is not guaranteed to be positive semi-definite, we perform spectral decomposition and replace negative eigenvalues with zero and re-calculate the kernel to arrive at $\bf{K}$.

For a single candidate distance matrix, transform the distance matrix to a kernel matrix and convert the object into a list, denoted by the object KK, code is provided below from the R package, MiRKAT [@zhao2015testing;@plantinga2017mirkat].

# For example, with Euclidean distance
KK = MiRKAT::D2K(D = DD[,,"EUC"])
KK = list(EUC = KK)

If there are multiple distance matrices, in our case, contained in an array DD, store them into a list of kernel matrices KK. Example code is below.

KK = list()

for(nm in dimnames(DD)[[3]]){
  KK[[nm]] = MiRKAT::D2K(D = DD[,,nm])
}

Hypothesis Testing

The user needs to pre-specify and fit the null model,

$$ H_0: \bf{\alpha} = 0, $$

of baseline covariates, $\bf{X}_i$, to one of the three outcome models to obtain continuous, logistic, or martingale residuals (RESI). Some example code is provided below.

# Continuous
out_LM = lm(Y ~ .,data = data.frame(X))
RESI = residuals(out_LM)

# Logistic
out_LOG = glm(Y ~ .,data = data.frame(X),family = "binomial")
RESI = residuals(out_LOG)

# Survival
out_CX = coxph(Surv(TIME,CENS) ~ .,data = data.frame(X))
RESI = residuals(out_CX)

With one or multiple candidate kernels, ROKET will take

to calculate individual kernel p-values as well as omnibus tests. An omnibus test assesses the significance of the minimum p-value kernel among kernels considered. The function kernTEST requires names(RESI) and dimension names per object as well as row/column names per kernel within aKK are named for sample order consistency. Sample code is provided below. Setting verbose = TRUE allows the user to track the permutation's progress, especially when requesting hundreds of thousands of permutations.

nPERMS = 1e5
nKK = length(KK)

# Array of kernels
aKK = array(data = NA,dim = dim(DD),
  dimnames = dimnames(DD))
for(nm in dimnames(DD)[[3]]){
  aKK[,,nm] = KK[[nm]]
}

# Create OMNI matrix
OMNI = matrix(0,nrow = 2,ncol = dim(aKK)[3],
  dimnames = list(c("EUC","OT"),dimnames(aKK)[[3]]))
OMNI["EUC","EUC"] = 1
OMNI["OT",grepl("^OT",colnames(OMNI))] = 1
OMNI

# Hypothesis Testing
ROKET::kernTEST(RESI = RESI,
  KK = aKK,
  OMNI = OMNI,
  nPERMS = nPERMS,
  ncores = 1)

The final output contains individual kernel p-values and the omnibus p-value.

Have fun with utilizing kernel regression and optimal transport frameworks with ROKET!

Session Info

sessionInfo()

References



Try the ROKET package in your browser

Any scripts or data that you put into this service are public.

ROKET documentation built on April 4, 2025, 2:28 a.m.