Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,comment = "#>",
echo = TRUE,cache = TRUE,
dev = "png")
ot_eval = TRUE
## ----setup,warning = FALSE----------------------------------------------------
# Load libraries
req_packs = c("devtools","smarter","ggplot2","reshape2",
"survival","ggdendro","MiRKAT","ROKET")
for(pack in req_packs){
library(package = pack,character.only = TRUE)
}
# List package's exported functions
ls("package:ROKET")
# Fix seed
set.seed(2)
## ----inputs,eval = ot_eval----------------------------------------------------
# number of samples
NN = 30
NN_nms = sprintf("S%s",seq(NN))
# number of pathways
PP = 4
PP_nms = sprintf("P%s",seq(PP))
# number of genes
GG = 30
GG_nms = sprintf("G%s",seq(GG))
# bound for gene similarity of two genes on same or different pathway
bnd_same = 0.75
# Gene and pathway relationship
GP = smart_df(PATH = sample(seq(PP),GG,replace = TRUE),
GENE = seq(GG))
table(GP$PATH)
# gene-gene similarity matrix
GS = matrix(NA,GG,GG)
dimnames(GS) = list(GG_nms,GG_nms)
diag(GS) = 1
tmp_mat = t(combn(seq(GG),2))
for(ii in seq(nrow(tmp_mat))){
G1 = tmp_mat[ii,1]
G2 = tmp_mat[ii,2]
same = GP$PATH[GP$GENE == G1] == GP$PATH[GP$GENE == G2]
if( same )
GS[G1,G2] = runif(1,bnd_same,1)
else
GS[G1,G2] = runif(1,0,1 - bnd_same)
}
GS[lower.tri(GS)] = t(GS)[lower.tri(GS)]
# round(GS,3)
## ----gene_sim,fig.dim = c(8,5),echo = FALSE,results = "hide",eval = ot_eval----
show_tile = function(MAT,LABEL,TYPE = NULL,
LABx = NULL,LABy = NULL,DIGITS = 1){
min_val = min(MAT)
max_val = max(MAT)
med_val = (min_val + max_val) / 2
if( is.null(TYPE) ) stop("Specify TYPE")
if( isSymmetric(MAT) )
MAT[upper.tri(MAT,diag = !TRUE)] = NA
if( TYPE == "GSIM" ){
max_val = 1
med_val = 0.5
}
# else if( TYPE == "DIST" ){
# max_val = max(c(1,max(MAT)))
#}
dat = melt(MAT,na.rm = TRUE)
# class(dat); dim(dat); dat[1:5,]
gg = ggplot(data = dat,aes(x = Var1,y = Var2,fill = value)) +
geom_tile(color = "black") + ggtitle(LABEL) +
labs(fill = "Value")
if( is.null(LABx) ){
gg = gg + xlab("")
} else {
gg = gg + xlab(LABx)
}
if( is.null(LABy) ){
gg = gg + ylab("")
} else {
gg = gg + ylab(LABy)
}
if( max(dim(MAT)) <= 10 && DIGITS >= 0 )
gg = gg + geom_text(mapping = aes(label = smart_digits(value,DIGITS)))
if( TYPE %in% c("GSIM","DIST") ){
gg = gg + scale_fill_gradient2(midpoint = med_val,low = "deepskyblue",
mid = "white",high = "red",limit = c(min_val,max_val))
} else if( TYPE == "MUT" ){
gg = gg + scale_fill_gradient2(low = "black",
high = "red",limit = c(min_val,max_val))
}
gg = gg + guides(fill = guide_colorbar(frame.colour = "black")) +
scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
scale_y_discrete(guide = guide_axis(n.dodge = 2))
gg = gg + theme(legend.position = "right",
# legend.key.width = unit(1.5,'cm'),
legend.key.height = unit(1,'cm'),
legend.text = element_text(size = 12),
legend.title = element_text(size = 12,hjust = 0.5),
text = element_text(size = 12),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "grey50",
size = 0.5,linetype = "dotted"),
axis.text = element_text(face = "italic"),
axis.text.x = element_text(angle = 0),
plot.title = element_text(hjust = 0.5))
return(gg)
}
hout = hclust(as.dist(1 - GS))
ord = hout$labels[hout$order]
show_tile(MAT = GS[ord,ord],
LABEL = "Simulated Gene Similarity",
TYPE = "GSIM",DIGITS = 2)
show_tile(MAT = 1 - GS[ord,ord],
LABEL = "Simulated Gene Dissimilarity",
TYPE = "GSIM",DIGITS = 2)
## ----gene_muts,fig.dim = c(8,6),eval = ot_eval--------------------------------
# Mutated gene statuses
prob_mut = 0.2
prob_muts = c(1 - prob_mut,prob_mut)
while(TRUE){
ZZ = matrix(sample(c(0,1),NN*GG,replace = TRUE,prob = prob_muts),NN,GG)
# Ensure each sample has at least one mutated gene
if( min(rowSums(ZZ)) > 0 ) break
}
dimnames(ZZ) = list(NN_nms,GG_nms)
show_tile(MAT = ZZ,
LABEL = "Mutation Status: Gene by Sample",
TYPE = "MUT",DIGITS = 0)
# Store all distances
DD = array(data = NA,dim = c(NN,NN,5))
dimnames(DD)[1:2] = list(NN_nms,NN_nms)
dimnames(DD)[[3]] = c("EUC","OT_Balanced",sprintf("OT_LAM%s",c(0.5,1.0,5.0)))
## ----mutfreq,fig.dim = c(8,5),eval = ot_eval----------------------------------
freq = colSums(ZZ); # freq
dat = smart_df(GENE = names(freq),FREQ = as.integer(freq))
dat$GENE = factor(dat$GENE,levels = names(sort(freq,decreasing = TRUE)))
# dat
ggplot(data = dat,mapping = aes(x = GENE,y = FREQ)) +
geom_bar(stat = "identity") +
xlab("Gene") + ylab("Mutation Frequency") +
scale_x_discrete(guide = guide_axis(n.dodge = 2))
## ----euc,fig.dim = c(8,5),eval = ot_eval--------------------------------------
DD[,,"EUC"] = as.matrix(dist(ZZ,diag = TRUE,upper = TRUE))
hout = hclust(as.dist(DD[,,"EUC"]))
ord = hout$labels[hout$order]
show_tile(MAT = DD[,,"EUC"][ord,ord],
LABEL = "Euclidean Pairwise Distances",
TYPE = "DIST",DIGITS = 2)
## ----balOT,fig.dim = c(8,5),eval = ot_eval------------------------------------
# Pick two samples
ii = 1
jj = 2
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
COST = 1 - GS,EPS = 1e-3,LAMBDA1 = 1,
LAMBDA2 = 1,balance = TRUE,verbose = FALSE)
# str(outOT)
# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,LABEL = "Balanced OT (showing only genes mutated in each sample)",
TYPE = "DIST",LABx = sprintf("Sample %s",ii),
LABy = sprintf("Sample %s",jj),
DIGITS = 2)
# Pairwise distance
outOT$DIST
## ----unbal_OT,fig.dim = c(8,5),eval = ot_eval---------------------------------
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
LAM = 0.5
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
COST = 1 - GS,EPS = 1e-3,LAMBDA1 = LAM,
LAMBDA2 = LAM,balance = FALSE,verbose = FALSE)
# str(outOT)
# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,
LABEL = "Unbalanced OT (showing only genes mutated in each sample)",TYPE = "DIST",
LABx = sprintf("Sample %s",ii),
LABy = sprintf("Sample %s",jj),
DIGITS = 2)
# Pairwise distance
outOT$DIST
## ----all_samp_balOT,fig.dim = c(8,5),eval = ot_eval---------------------------
outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
EPS = 1e-3,balance = TRUE,LAMBDA1 = 1,
LAMBDA2 = 1,conv = 1e-5,max_iter = 3e3,
ncores = 1,verbose = FALSE)
hout = hclust(as.dist(outOTs))
ord = hout$labels[hout$order]
show_tile(MAT = outOTs[ord,ord],
LABEL = "Balanced OT Distances",
TYPE = "DIST",DIGITS = 1)
## ----full_OT_calc,fig.dim = c(8,5),eval = ot_eval-----------------------------
LAMs = c(0,0.5,1.0,5.0)
for(LAM in LAMs){
# LAM = LAMs[2]
BAL = ifelse(LAM == 0,TRUE,FALSE)
LAM2 = ifelse(BAL,1,LAM)
outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
EPS = 1e-3,balance = BAL,LAMBDA1 = LAM2,
LAMBDA2 = LAM2,conv = 1e-5,max_iter = 3e3,
ncores = 1,verbose = FALSE)
hout = hclust(as.dist(outOTs))
ord = hout$labels[hout$order]
LABEL = ifelse(BAL,"OT Distances (Balanced)",
sprintf("OT Distances (Lambda = %s)",LAM))
LABEL2 = ifelse(BAL,"OT_Balanced",sprintf("OT_LAM%s",LAM))
gg = show_tile(MAT = outOTs[ord,ord],
LABEL = LABEL,TYPE = "DIST",DIGITS = 2)
print(gg)
DD[,,LABEL2] = outOTs
rm(outOTs)
}
## ----dendro,fig.dim = c(8,5),eval = ot_eval-----------------------------------
nms = dimnames(DD)[[3]]; nms
for(nm in nms){
# nm = nms[2]
hout = hclust(as.dist(DD[,,nm]))
hdend = as.dendrogram(hout)
dend_data = dendro_data(hdend,type = "rectangle")
gg = ggplot(dend_data$segments) +
geom_segment(aes(x = x,y = y,xend = xend,yend = yend)) +
ggtitle(nm) + xlab("") + ylab("") +
geom_text(data = dend_data$labels,
mapping = aes(x,y,label = label),vjust = 0.5,hjust = 1) +
theme_dendro() + coord_flip() +
theme(plot.title = element_text(hjust = 0.5))
print(gg)
rm(hout,hdend,dend_data,gg)
}
## ----eval = FALSE-------------------------------------------------------------
# # For example, with Euclidean distance
# KK = MiRKAT::D2K(D = DD[,,"EUC"])
# KK = list(EUC = KK)
## ----eval = FALSE-------------------------------------------------------------
# KK = list()
#
# for(nm in dimnames(DD)[[3]]){
# KK[[nm]] = MiRKAT::D2K(D = DD[,,nm])
# }
## ----eval = FALSE-------------------------------------------------------------
# # Continuous
# out_LM = lm(Y ~ .,data = data.frame(X))
# RESI = residuals(out_LM)
#
# # Logistic
# out_LOG = glm(Y ~ .,data = data.frame(X),family = "binomial")
# RESI = residuals(out_LOG)
#
# # Survival
# out_CX = coxph(Surv(TIME,CENS) ~ .,data = data.frame(X))
# RESI = residuals(out_CX)
## ----eval = FALSE-------------------------------------------------------------
# nPERMS = 1e5
# nKK = length(KK)
#
# # Array of kernels
# aKK = array(data = NA,dim = dim(DD),
# dimnames = dimnames(DD))
# for(nm in dimnames(DD)[[3]]){
# aKK[,,nm] = KK[[nm]]
# }
#
# # Create OMNI matrix
# OMNI = matrix(0,nrow = 2,ncol = dim(aKK)[3],
# dimnames = list(c("EUC","OT"),dimnames(aKK)[[3]]))
# OMNI["EUC","EUC"] = 1
# OMNI["OT",grepl("^OT",colnames(OMNI))] = 1
# OMNI
#
# # Hypothesis Testing
# ROKET::kernTEST(RESI = RESI,
# KK = aKK,
# OMNI = OMNI,
# nPERMS = nPERMS,
# ncores = 1)
## -----------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.