knitr::opts_chunk$set( collapse = TRUE,comment = "#>", echo = TRUE,cache = TRUE, dev = "png") ot_eval = TRUE
\def\T{\text{T}} \newcommand{\bf}[1]{\mathbf{#1}} \newcommand{\bigPar}[1]{\left(#1\right)} \newcommand{\bigCur}[1]{\left{#1\right}} \newcommand{\bcSqu}[2]{\left[#1 \middle| #2\right]} \newcommand{\cE}[2]{E\bcSqu{#1}{#2}} \newcommand{\nexp}[1]{\exp\bigCur{#1}} \newcommand{\ind}[1]{1\bigCur{#1}} \newcommand{\w}[1]{\widehat{#1}}
Assuming all software dependencies and ROKET [@little2023associating] installation are installed, we can begin.
# Load libraries req_packs = c("devtools","smarter","ggplot2","reshape2", "survival","ggdendro","MiRKAT","ROKET") for(pack in req_packs){ library(package = pack,character.only = TRUE) } # List package's exported functions ls("package:ROKET") # Fix seed set.seed(2)
For $i = 1,\ldots,N$ and $g = 1,\ldots,G$, let
We would like to calculate the distance between the $i$th and $j$th samples in terms of mutated genes $\bf{Z}_i$ and $\bf{Z}_j$, denoted by $d\bigPar{\bf{Z}_i,\bf{Z}_j}$.
Several optimal transport (OT) references to get familiarized with the framework, objective functions, updating equations, entropic regularizations, types of OT are provided:
For the $i$th and $j$th samples, let
If the mass of the two vectors are equal or normalized such that $\sum_g Z_{ig} = \sum_g Z_{jg}$, we could use balanced optimal transport.
If the mass of the two vectors are not equal, $\sum_g Z_{ig} \neq \sum_g Z_{jg}$, we could use unbalanced optimal transport with penalty parameters.
The code below will simulate samples and mutated genes. We welcome the reader to manipulate the following input arguments:
NN
for sample size,PP
for number of pathways,GG
for number of genes, and bnd_same
the lower bound gene similarity for genes sharing the same pathway and 1 - bnd_same
, an upper bound on gene similarity for genes not sharing the same pathwayIdeally, PP
should be much less than GG
to allow multiple genes to be allocated to each pathway.
# number of samples NN = 30 NN_nms = sprintf("S%s",seq(NN)) # number of pathways PP = 4 PP_nms = sprintf("P%s",seq(PP)) # number of genes GG = 30 GG_nms = sprintf("G%s",seq(GG)) # bound for gene similarity of two genes on same or different pathway bnd_same = 0.75 # Gene and pathway relationship GP = smart_df(PATH = sample(seq(PP),GG,replace = TRUE), GENE = seq(GG)) table(GP$PATH) # gene-gene similarity matrix GS = matrix(NA,GG,GG) dimnames(GS) = list(GG_nms,GG_nms) diag(GS) = 1 tmp_mat = t(combn(seq(GG),2)) for(ii in seq(nrow(tmp_mat))){ G1 = tmp_mat[ii,1] G2 = tmp_mat[ii,2] same = GP$PATH[GP$GENE == G1] == GP$PATH[GP$GENE == G2] if( same ) GS[G1,G2] = runif(1,bnd_same,1) else GS[G1,G2] = runif(1,0,1 - bnd_same) } GS[lower.tri(GS)] = t(GS)[lower.tri(GS)] # round(GS,3)
Let's take a look at the gene similarity matrix.
show_tile = function(MAT,LABEL,TYPE = NULL, LABx = NULL,LABy = NULL,DIGITS = 1){ min_val = min(MAT) max_val = max(MAT) med_val = (min_val + max_val) / 2 if( is.null(TYPE) ) stop("Specify TYPE") if( isSymmetric(MAT) ) MAT[upper.tri(MAT,diag = !TRUE)] = NA if( TYPE == "GSIM" ){ max_val = 1 med_val = 0.5 } # else if( TYPE == "DIST" ){ # max_val = max(c(1,max(MAT))) #} dat = melt(MAT,na.rm = TRUE) # class(dat); dim(dat); dat[1:5,] gg = ggplot(data = dat,aes(x = Var1,y = Var2,fill = value)) + geom_tile(color = "black") + ggtitle(LABEL) + labs(fill = "Value") if( is.null(LABx) ){ gg = gg + xlab("") } else { gg = gg + xlab(LABx) } if( is.null(LABy) ){ gg = gg + ylab("") } else { gg = gg + ylab(LABy) } if( max(dim(MAT)) <= 10 && DIGITS >= 0 ) gg = gg + geom_text(mapping = aes(label = smart_digits(value,DIGITS))) if( TYPE %in% c("GSIM","DIST") ){ gg = gg + scale_fill_gradient2(midpoint = med_val,low = "deepskyblue", mid = "white",high = "red",limit = c(min_val,max_val)) } else if( TYPE == "MUT" ){ gg = gg + scale_fill_gradient2(low = "black", high = "red",limit = c(min_val,max_val)) } gg = gg + guides(fill = guide_colorbar(frame.colour = "black")) + scale_x_discrete(guide = guide_axis(n.dodge = 2)) + scale_y_discrete(guide = guide_axis(n.dodge = 2)) gg = gg + theme(legend.position = "right", # legend.key.width = unit(1.5,'cm'), legend.key.height = unit(1,'cm'), legend.text = element_text(size = 12), legend.title = element_text(size = 12,hjust = 0.5), text = element_text(size = 12), panel.background = element_blank(), panel.grid.major = element_line(colour = "grey50", size = 0.5,linetype = "dotted"), axis.text = element_text(face = "italic"), axis.text.x = element_text(angle = 0), plot.title = element_text(hjust = 0.5)) return(gg) } hout = hclust(as.dist(1 - GS)) ord = hout$labels[hout$order] show_tile(MAT = GS[ord,ord], LABEL = "Simulated Gene Similarity", TYPE = "GSIM",DIGITS = 2) show_tile(MAT = 1 - GS[ord,ord], LABEL = "Simulated Gene Dissimilarity", TYPE = "GSIM",DIGITS = 2)
The function show_tile()
is inherent to this vignette and not part of the ROKET package.
# Mutated gene statuses prob_mut = 0.2 prob_muts = c(1 - prob_mut,prob_mut) while(TRUE){ ZZ = matrix(sample(c(0,1),NN*GG,replace = TRUE,prob = prob_muts),NN,GG) # Ensure each sample has at least one mutated gene if( min(rowSums(ZZ)) > 0 ) break } dimnames(ZZ) = list(NN_nms,GG_nms) show_tile(MAT = ZZ, LABEL = "Mutation Status: Gene by Sample", TYPE = "MUT",DIGITS = 0) # Store all distances DD = array(data = NA,dim = c(NN,NN,5)) dimnames(DD)[1:2] = list(NN_nms,NN_nms) dimnames(DD)[[3]] = c("EUC","OT_Balanced",sprintf("OT_LAM%s",c(0.5,1.0,5.0)))
We can look at the distribution of gene mutation frequencies.
freq = colSums(ZZ); # freq dat = smart_df(GENE = names(freq),FREQ = as.integer(freq)) dat$GENE = factor(dat$GENE,levels = names(sort(freq,decreasing = TRUE))) # dat ggplot(data = dat,mapping = aes(x = GENE,y = FREQ)) + geom_bar(stat = "identity") + xlab("Gene") + ylab("Mutation Frequency") + scale_x_discrete(guide = guide_axis(n.dodge = 2))
We can calculate the Euclidean distance, which does not incorporate relationships between pairs of genes.
DD[,,"EUC"] = as.matrix(dist(ZZ,diag = TRUE,upper = TRUE)) hout = hclust(as.dist(DD[,,"EUC"])) ord = hout$labels[hout$order] show_tile(MAT = DD[,,"EUC"][ord,ord], LABEL = "Euclidean Pairwise Distances", TYPE = "DIST",DIGITS = 2)
To demonstrate ROKET's functionality, the code below will run balanced OT (pre-normalizing input vectors) between two samples. Regardless of the values specified by LAMBDA1
and LAMBDA2
arguments, we need to set balance = TRUE
. The OT cost matrix argument corresponds to 1 - GS
, one minus the gene similarity matrix.
# Pick two samples ii = 1 jj = 2 ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0] outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,], COST = 1 - GS,EPS = 1e-3,LAMBDA1 = 1, LAMBDA2 = 1,balance = TRUE,verbose = FALSE) # str(outOT) # Optimal transport matrix tmpOT = outOT$OT tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0] show_tile(MAT = tmpOT,LABEL = "Balanced OT (showing only genes mutated in each sample)", TYPE = "DIST",LABx = sprintf("Sample %s",ii), LABy = sprintf("Sample %s",jj), DIGITS = 2) # Pairwise distance outOT$DIST
Let's try again but with unbalanced OT and $\lambda = 0.5$. We need to set balance = FALSE
and specify LAMBDA1 = 0.5
and LAMBDA2 = 0.5
.
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0] LAM = 0.5 outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,], COST = 1 - GS,EPS = 1e-3,LAMBDA1 = LAM, LAMBDA2 = LAM,balance = FALSE,verbose = FALSE) # str(outOT) # Optimal transport matrix tmpOT = outOT$OT tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0] show_tile(MAT = tmpOT, LABEL = "Unbalanced OT (showing only genes mutated in each sample)",TYPE = "DIST", LABx = sprintf("Sample %s",ii), LABy = sprintf("Sample %s",jj), DIGITS = 2) # Pairwise distance outOT$DIST
ROKET can run optimal transport calculations across all $N$ choose 2 samples. Below is code to run balanced OT.
outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS, EPS = 1e-3,balance = TRUE,LAMBDA1 = 1, LAMBDA2 = 1,conv = 1e-5,max_iter = 3e3, ncores = 1,verbose = FALSE) hout = hclust(as.dist(outOTs)) ord = hout$labels[hout$order] show_tile(MAT = outOTs[ord,ord], LABEL = "Balanced OT Distances", TYPE = "DIST",DIGITS = 1)
We can run calculations for various $\lambda$ values. For shorthand, $\lambda$ = Inf corresponds to balanced OT.
LAMs = c(0,0.5,1.0,5.0) for(LAM in LAMs){ # LAM = LAMs[2] BAL = ifelse(LAM == 0,TRUE,FALSE) LAM2 = ifelse(BAL,1,LAM) outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS, EPS = 1e-3,balance = BAL,LAMBDA1 = LAM2, LAMBDA2 = LAM2,conv = 1e-5,max_iter = 3e3, ncores = 1,verbose = FALSE) hout = hclust(as.dist(outOTs)) ord = hout$labels[hout$order] LABEL = ifelse(BAL,"OT Distances (Balanced)", sprintf("OT Distances (Lambda = %s)",LAM)) LABEL2 = ifelse(BAL,"OT_Balanced",sprintf("OT_LAM%s",LAM)) gg = show_tile(MAT = outOTs[ord,ord], LABEL = LABEL,TYPE = "DIST",DIGITS = 2) print(gg) DD[,,LABEL2] = outOTs rm(outOTs) }
We can see that Euclidean distance calculations on gene mutation statuses alone does not lead to strong evidence of sample clusters. However optimal transport-based distance calculations with integrated gene-gene similarities provide stronger evidence of sample clusters.
nms = dimnames(DD)[[3]]; nms for(nm in nms){ # nm = nms[2] hout = hclust(as.dist(DD[,,nm])) hdend = as.dendrogram(hout) dend_data = dendro_data(hdend,type = "rectangle") gg = ggplot(dend_data$segments) + geom_segment(aes(x = x,y = y,xend = xend,yend = yend)) + ggtitle(nm) + xlab("") + ylab("") + geom_text(data = dend_data$labels, mapping = aes(x,y,label = label),vjust = 0.5,hjust = 1) + theme_dendro() + coord_flip() + theme(plot.title = element_text(hjust = 0.5)) print(gg) rm(hout,hdend,dend_data,gg) }
The next step is to transform distance matrices into centered kernel matrices to perform hypothesis testing on our constructed kernels.
For a binary or continuous outcome, $Y_i$, fitted with a generalized linear model, we have
$$ g\bigPar{E\bcSqu{Y_i}{\bf{X}_i,\bf{Z}_i}} = \bf{X}_i^\T \bf{\beta} + f(\bf{Z}_i) = \bf{X}_i^\T \bf{\beta} + \bf{K}_i^\T \bf{\alpha} $$
and for Cox proportional hazards for survival outcomes, we have
$$ h(t;\bf{X}_i,\bf{Z}_i) = h_0(t) \nexp{\bf{X}_i^\T \bf{\beta} + f(\bf{Z}_i)} = h_0(t) \nexp{\bf{X}_i^\T \bf{\beta} + \bf{K}_i^\T \bf{\alpha}} $$
where
The matrix $\bf{K}$ is constructed from the distance matrix (Euclidean or optimal transport), $\bf{D}$, by double centering the rows and columns of $\bf{D}$ with the formula
$$ \tilde{\bf{K}} = -\frac{1}{2} \bigPar{\bf{I}_N - \bf{J}_N \bf{J}_N^\T} \bf{D}^2 \bigPar{\bf{I}_N - \bf{J}_N \bf{J}_N^\T} $$
where
Since $\tilde{\bf{K}}$ is not guaranteed to be positive semi-definite, we perform spectral decomposition and replace negative eigenvalues with zero and re-calculate the kernel to arrive at $\bf{K}$.
For a single candidate distance matrix, transform the distance matrix to a kernel matrix and convert the object into a list, denoted by the object KK
, code is provided below from the R package, MiRKAT [@zhao2015testing;@plantinga2017mirkat].
# For example, with Euclidean distance KK = MiRKAT::D2K(D = DD[,,"EUC"]) KK = list(EUC = KK)
If there are multiple distance matrices, in our case, contained in an array DD
, store them into a list of kernel matrices KK
. Example code is below.
KK = list() for(nm in dimnames(DD)[[3]]){ KK[[nm]] = MiRKAT::D2K(D = DD[,,nm]) }
The user needs to pre-specify and fit the null model,
$$ H_0: \bf{\alpha} = 0, $$
of baseline covariates, $\bf{X}_i$, to one of the three outcome models to obtain continuous, logistic, or martingale residuals (RESI
). Some example code is provided below.
# Continuous out_LM = lm(Y ~ .,data = data.frame(X)) RESI = residuals(out_LM) # Logistic out_LOG = glm(Y ~ .,data = data.frame(X),family = "binomial") RESI = residuals(out_LOG) # Survival out_CX = coxph(Surv(TIME,CENS) ~ .,data = data.frame(X)) RESI = residuals(out_CX)
With one or multiple candidate kernels, ROKET will take
aKK
) (defined below), RESI
), and nPERMS
)to calculate individual kernel p-values as well as omnibus tests. An omnibus test assesses the significance of the minimum p-value kernel among kernels considered. The function kernTEST
requires names(RESI)
and dimension names per object as well as row/column names per kernel within aKK
are named for sample order consistency. Sample code is provided below. Setting verbose = TRUE
allows the user to track the permutation's progress, especially when requesting hundreds of thousands of permutations.
nPERMS = 1e5 nKK = length(KK) # Array of kernels aKK = array(data = NA,dim = dim(DD), dimnames = dimnames(DD)) for(nm in dimnames(DD)[[3]]){ aKK[,,nm] = KK[[nm]] } # Create OMNI matrix OMNI = matrix(0,nrow = 2,ncol = dim(aKK)[3], dimnames = list(c("EUC","OT"),dimnames(aKK)[[3]])) OMNI["EUC","EUC"] = 1 OMNI["OT",grepl("^OT",colnames(OMNI))] = 1 OMNI # Hypothesis Testing ROKET::kernTEST(RESI = RESI, KK = aKK, OMNI = OMNI, nPERMS = nPERMS, ncores = 1)
The final output contains individual kernel p-values and the omnibus p-value.
Have fun with utilizing kernel regression and optimal transport frameworks with ROKET!
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.