# inst/examples/case-control-interaction-simulation.R

library(privateEC)
library(broom)
library(dplyr)

## npdr install
# library(devtools)
# install_github("insilico/npdr")
library(npdr)

set.seed(1618)
##### simulate case-control interaction effect data 
n.samples <- 300     # 300 total: 100 each for train/holdout/validation
n.variables <- 1000  # 1000 features
label <- "class" # tells simulator to do case/control and adds this colname
type <- "interactionErdos" # or mainEffect
#type <-"mainEffect"
bias <- 0.4          # moderate effect size
pct.signals <- 0.1   # pct functional features
verbose <- FALSE
case.control.3sets <- createSimulation(num.samples = n.samples,
                              num.variables = n.variables,
                              pct.signals = pct.signals,
                              label = label,
                              bias = bias,
                              pct.train = 1/3,
                              pct.holdout = 1/3,
                              pct.validation = 1/3,
                              sim.type = type,
                              save.file = NULL,
                              verbose = verbose)
# combine train and holdout into 200 samples x 1000 attributes
# ignore validation set
case.control.data <- rbind(case.control.3sets$train,case.control.3sets$holdout)
#validation.data <- data.sets$validation
n.samples.case.control <- dim(case.control.data)[1]
pheno.case.control <- as.factor(case.control.data[,"class"])
functional.case.control <- case.control.3sets$signal.names # functional attributes

##### Univariate logistic regression
# logistic regression on all predictors, fdr adjust, check functional hits
# reports standardized beta and p-value
# npdr utility function

univariate.cc.results <- uniReg(outcome="class", dataset=case.control.data, regression.type="binomial")
univariate.cc.results[1:10,]

# don't expect any adjusted p-values below .05 for interaction simulations (no main effects)
# univariate.cc.results[univariate.cc.results[,"p.adj"]<.05,]
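# A minimal sketch (not in the original analysis): detection stats for the
# univariate hits, assuming rownames hold attribute names and "p.adj" is the
# adjusted p-value column used above. For interaction simulations this set is
# typically empty, which is the point of the comparison.
univariate.cc.positives <- rownames(univariate.cc.results)[univariate.cc.results[, "p.adj"] < .05]
if (length(univariate.cc.positives) > 0) {
  cat(detectionStats(functional.case.control, univariate.cc.positives)$report)
}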

##### Run npdr
# time npdr with the fast/parallel options (fast regression, parallel neighbors and regression)
system.time(
  npdr.cc.results <- npdr("class", case.control.data, regression.type="binomial", attr.diff.type="numeric-abs",
                          nbd.method="multisurf", nbd.metric = "manhattan", msurf.sd.frac=.5, 
                          neighbor.sampling="none", fast.reg = T, dopar.nn = T, dopar.reg = T,
                          padj.method="bonferroni", verbose=T)
)
head(npdr.cc.results)
# time again with the slower distance computation and serial regression
system.time(
  npdr.cc.results <- npdr("class", case.control.data, regression.type="binomial", attr.diff.type="numeric-abs",
                          nbd.method="multisurf", nbd.metric = "manhattan", msurf.sd.frac=.5, 
                          neighbor.sampling="none", fast.dist = F, dopar.nn = T, dopar.reg = F,
                          padj.method="bonferroni", verbose=T)
)
head(npdr.cc.results)

# final npdr run (default options) used for the comparisons below
npdr.cc.results <- npdr("class", case.control.data, regression.type="binomial", attr.diff.type="numeric-abs",
                            nbd.method="multisurf", nbd.metric = "manhattan", msurf.sd.frac=.5, 
                            neighbor.sampling="none", dopar.nn = T,
                            padj.method="bonferroni", verbose=T)

# attributes with npdr adjusted p-value less than .05 
npdr.cc.results[npdr.cc.results$pval.adj<.05,] # pval.adj, first column
# attributes with npdr raw/nominal p-value less than .05
#rownames(npdr.cc.results)[npdr.cc.results$pval.attr<.05] # pval.attr, second column

# functional attribute detection stats
#npdr.cc.positives <- row.names(npdr.cc.results[npdr.cc.results$pval.adj<.05,]) # p.adj<.05
npdr.cc.positives <- npdr.cc.results %>% filter(pval.adj<.05) %>% pull(att)
npdr.cc.detect.stats <- detectionStats(functional.case.control, npdr.cc.positives)
cat(npdr.cc.detect.stats$report)

##### Run npdr unique neighbors

npdr.cc.unique.results <- npdr("class", case.control.data, regression.type="binomial", attr.diff.type="numeric-abs",
                        nbd.method="multisurf", nbd.metric = "manhattan", msurf.sd.frac=.5, 
                        neighbor.sampling="unique",
                        padj.method="bonferroni", verbose=T)
# attributes with npdr adjusted p-value less than .05 
npdr.cc.unique.results[npdr.cc.unique.results$pval.adj<.05,] # pval.adj, first column
# attributes with npdr raw/nominal p-value less than .05
#rownames(npdr.cc.results)[npdr.cc.results$pval.attr<.05] # pval.attr, second column

# functional attribute detection stats
#npdr.cc.positives <- row.names(npdr.cc.results[npdr.cc.results$pval.adj<.05,]) # p.adj<.05
npdr.cc.unique.positives <- npdr.cc.unique.results %>% filter(pval.adj<.05) %>% pull(att)
npdr.cc.unique.detect.stats <- detectionStats(functional.case.control, npdr.cc.unique.positives)
cat(npdr.cc.unique.detect.stats$report)

# nearly perfect correlation, but unique npdr shrinks all betas
cor(npdr.cc.results$beta.Z.att,npdr.cc.unique.results$beta.Z.att)
plot(npdr.cc.results$beta.Z.att,npdr.cc.unique.results$beta.Z.att)
plot(npdr.cc.results$beta.raw.att,npdr.cc.unique.results$beta.raw.att)
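# Sketch (an assumption, not in the original script): each results table is
# sorted by its own p-values, so align the two by attribute name before
# correlating the standardized betas.
merged.betas <- merge(npdr.cc.results[, c("att", "beta.Z.att")],
                      npdr.cc.unique.results[, c("att", "beta.Z.att")], by = "att")
cor(merged.betas$beta.Z.att.x, merged.betas$beta.Z.att.y)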

### Compare univariate and npdr
univ.log10.df <- data.frame(vars=rownames(univariate.cc.results),univ.log10=-log10(univariate.cc.results[,"pval"]))
npdr.cc.log10.df <- data.frame(vars=npdr.cc.results$att,npdr.cc.log10=-log10(npdr.cc.results$pval.att))

univ.cc.pcutoff <- max(-log10(univariate.cc.results[,"pval"]))
npdr.cc.pcutoff <- -log10(npdr.cc.results$pval.att[which(npdr.cc.results$pval.adj>.05)[1]-1])

library(ggplot2)
test.cc.df <- merge(univ.log10.df,npdr.cc.log10.df)
functional <- factor(c(rep("Func",length(functional.case.control)),rep("Non-Func",n.variables-length(functional.case.control))))
ggplot(test.cc.df, aes(x=univ.log10,y=npdr.cc.log10)) + geom_point(aes(colour = functional), size=4) +
  theme(text = element_text(size = 20)) +
  #geom_vline(xintercept=univ.cc.pcutoff, linetype="dashed") +
  geom_hline(yintercept=npdr.cc.pcutoff, linetype="dashed") +
  xlab("Logistic Regression -log10(P)") + ylab("NPDR -log10(P)") 

##### original (pseudo t-test) STIR
# impression is that npdr gives the same results as the original t-STIR
#install_github("insilico/stir")
tryCatch(
  {
  library(stir)
  # stir interface requires splitting phenotype and predictor matrix, 
  # and requires finding the neighborhood separately.
  predictors.cc.mat <- case.control.data[, - which(colnames(case.control.data) == "class")]
  case.control.data[, "class"] <- as.factor(case.control.data[, "class"]) 
  pheno.case.control <- case.control.data[, "class"]
  neighbor.idx.observed <- find.neighbors(predictors.cc.mat, pheno.case.control, k = 0, method = "multisurf")
  results.list <- stir(predictors.cc.mat, neighbor.idx.observed, k = 0, metric = "manhattan", method = "multisurf")
  t_sorted_multisurf <- results.list$STIR_T[, -3]  # remove cohen-d
  colnames(t_sorted_multisurf) <- paste(c("t.stat", "t.pval", "t.pval.adj"), "stir", sep=".")
  print(t_sorted_multisurf[t_sorted_multisurf[,3]<.05,])  # print() needed inside tryCatch
  
  # functional attribute detection stats
  tstat_stir.detect.stats <- detectionStats(functional.case.control, 
                                            row.names(t_sorted_multisurf[t_sorted_multisurf[,3]<.05,]))
  cat(tstat_stir.detect.stats$report)
  
  ### Compare STIR and npdr
  stir.log10.df <- data.frame(vars=rownames(t_sorted_multisurf),stir.log10=-log10(t_sorted_multisurf$t.pval.stir))
  npdr.log10.df <- data.frame(vars=npdr.cc.results$att,npdr.log10=-log10(npdr.cc.results$pval.att))
  
  stir.pcutoff <- -log10(t_sorted_multisurf$t.pval.stir[which(t_sorted_multisurf$t.pval.adj.stir>.05)[1]-1])
  npdr.pcutoff <- -log10(npdr.cc.results$pval.att[which(npdr.cc.results$pval.adj>.05)[1]-1])
  
  library(ggplot2)
  test.df <- merge(stir.log10.df,npdr.log10.df)
  functional <- factor(c(rep("Func",length(functional.case.control)),rep("Non-Func",n.variables-length(functional.case.control))))
  ggplot(test.df, aes(x=stir.log10,y=npdr.log10)) + geom_point(aes(colour = functional), size=4) +
    theme(text = element_text(size = 20)) +
    # geom_vline(xintercept=stir.pcutoff, linetype="dashed") +
    # geom_hline(yintercept=npdr.pcutoff, linetype="dashed") +
    xlab("STIR -log10(P)") + ylab("NPDR -log10(P)") 
  },
  error=function(cond) {
    message("Trying to use stir. Need to run the following.")
    message('devtools::install_github("insilico/stir")')
    message(cond)
  }
)  # end try stir

##### CORElearn ReliefF with surf fixed k
# fixed k with theoretical surf value
library(CORElearn)
core.learn.case.control <- CORElearn::attrEval("class", data = case.control.data,
                                      estimator = "ReliefFequalK",
                                      costMatrix = NULL,
                                      outputNumericSplits=FALSE,
                                      kNearestEqual = knnSURF(n.samples.case.control,.5))
core.learn.case.control.order <- order(core.learn.case.control, decreasing = T)
t(t(core.learn.case.control[core.learn.case.control.order[1:20]]))

# functional attribute detection stats
#core.learn.detect.stats <- detectionStats(functional.case.control, 
#                                          names(core.learn.case.control)[core.learn.case.control.order[1:20]])

arbitrary.cc.threshold <- .0072
core.learn.cc.detect <- detectionStats(functional.case.control, 
                            names(core.learn.case.control)[core.learn.case.control>arbitrary.cc.threshold])
cat(core.learn.cc.detect$report)
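# Sketch of a less arbitrary cutoff (an assumption, not part of the original):
# take the top pct.signals * n.variables features by ReliefF score, i.e. as
# many positives as simulated signals, and compute detection stats on those.
n.signals <- round(pct.signals * n.variables)
core.learn.topk.positives <- names(core.learn.case.control)[core.learn.case.control.order[1:n.signals]]
cat(detectionStats(functional.case.control, core.learn.topk.positives)$report)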

### Compare corelearn and npdr
corelearn.cc.df <- data.frame(vars=names(core.learn.case.control),rrelief=core.learn.case.control)
npdr.cc.beta.df <- data.frame(vars=npdr.cc.results$att,npdr.beta=npdr.cc.results$beta.Z.att)

corelearn.cc.cutoff <- arbitrary.cc.threshold
# standardized beta at the adjusted-p = .05 boundary, used as the plot cutoff
npdr.cc.pcutoff <- npdr.cc.results$beta.Z.att[which(npdr.cc.results$pval.adj>.05)[1]-1]


library(ggplot2)
test.cc.df <- merge(corelearn.cc.df,npdr.cc.beta.df)
functional <- factor(c(rep("Func",length(functional.case.control)),rep("Non-Func",n.variables-length(functional.case.control))))
ggplot(test.cc.df, aes(x=rrelief,y=npdr.beta)) + geom_point(aes(colour = functional), size=4) +
  theme(text = element_text(size = 20)) +
  geom_vline(xintercept=corelearn.cc.cutoff, linetype="dashed") +
  geom_hline(yintercept=npdr.cc.pcutoff, linetype="dashed") +
  xlab("Relief Score") + ylab("NPDR Coefficient") 

##### Consensus Nested Cross Validation with ReliefF with surf fixed k
# selects features and learns classification model.
tryCatch(
  {
    library(cncv)
    library(caret)
  cncv.case.control <- consensus_nestedCV(train.ds = case.control.data, 
                                  validation.ds =  case.control.3sets$validation, 
                                  label = "class",
                                  method.model = "classification",
                                  is.simulated = TRUE,
                                  ncv_folds = c(10, 10),
                                  param.tune = FALSE,
                                  learning_method = "rf", 
                                  importance.algorithm = "ReliefFequalK",
                                  relief.k.method = "k_half_sigma",     # surf k
                                  num_tree = 500,
                                  verbose = F)

  cat("\n Train Accuracy [",cncv.case.control$cv.acc,"]\n")
  cat("\n Validation Accuracy [",cncv.case.control$Validation,"]\n")
  cat("\n Selected Features \n [",cncv.case.control$Features,"]\n")
  cat("\n Elapsed Time [",cncv.case.control$Elapsed,"]\n")
  cat(detectionStats(functional.case.control, cncv.case.control$Features)$report)
  },
  error=function(cond) {
    message("Trying to use cncv. Need to run the following.")
    message('devtools::install_github("insilico/cncv")')
    message(cond)
  }
)  # end try cncv


##### Regular Nested Cross Validation with ReliefF with surf fixed k
# selects features and learns classification model.
## regular_nestedCV (insilico/cncv) currently errors with:
## Error in summary.connection(connection) : invalid connection
# rncv.case.control <- regular_nestedCV(train.ds = case.control.data, 
#                                         validation.ds =  case.control.3sets$validation, 
#                                         label = "class",
#                                         method.model = "classification",
#                                         is.simulated = TRUE,
#                                         ncv_folds = c(10, 10),
#                                         param.tune = FALSE,
#                                         learning_method = "rf", 
#                                         importance.algorithm = "ReliefFequalK",
#                                         relief.k.method = "k_half_sigma",     # surf k
#                                         num_tree = 500,
#                                         verbose = F)
# 
# cat("\n Train Accuracy [",rncv.case.control$cv.acc,"]\n")
# cat("\n Validation Accuracy [",rncv.case.control$Validation,"]\n")
# cat("\n Selected Features \n [",rncv.case.control$Features,"]\n")
# cat("\n Elapsed Time [",rncv.case.control$Elapsed,"]\n")
# cat(detectionStats(functional.case.control, rncv.case.control$Features)$report)

##### GLMnet (penalized regression) comparison. Don't expect good performance for interaction models. 

library(glmnet)
# cc short for case-control
predictors.cc.mat <- case.control.data[, - which(colnames(case.control.data) == "class")]
pheno.case.control <- case.control.data[, "class"]

glmnet.cc.model<-cv.glmnet(as.matrix(predictors.cc.mat), pheno.case.control, alpha=1, family="binomial", type.measure="class")
#glmnet.cc.coeffs<-as.matrix(predict(glmnet.cc.model,type="coefficients"))
#glmnet.cc.coeffs  # inspect coefficients
#model.cc.terms <- colnames(predictors.cc.mat)  # glmnet includes an intercept but we are going to ignore
# skip intercept if there, 0-based counting
#nonzero.glmnet.cc.coeffs <- model.cc.terms[glmnet.cc.coeffs@i[which(glmnet.cc.coeffs@i!=0)]] 
# finds some interactions maybe because of the co-expression in the simulation
#nonzero.glmnet.cc.coeffs
#nonzero.glmnet.cc.mask <- abs(glmnet.cc.coeffs[,1])>.05  
#as.matrix(glmnet.cc.coeffs[nonzero.glmnet.cc.mask],ncol=1)

# do not expect correct associations for interaction simulations. 
glmnet.cc.coeffs<-as.matrix(predict(glmnet.cc.model,type="coefficients"))
row.names(glmnet.cc.coeffs) <- c("intercept", colnames(predictors.cc.mat))  # add variable names to results
glmnet.cc.sorted<-as.matrix(glmnet.cc.coeffs[order(abs(glmnet.cc.coeffs),decreasing = T),],ncol=1) # sort
glmnet.cc.sorted[abs(glmnet.cc.sorted)>0,]
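# Sketch (an assumption, not in the original): detection stats for glmnet's
# nonzero, non-intercept coefficients. Expect poor recall because the
# simulated effects are pure interactions with no main effects.
glmnet.cc.positives <- setdiff(rownames(glmnet.cc.sorted)[abs(glmnet.cc.sorted) > 0], "intercept")
cat(detectionStats(functional.case.control, glmnet.cc.positives)$report)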

#####
## EXPERIMENTAL Needs penalty for ordinal regression (not part of glmnet)
##### Run npdrNET, penalized npdr
# bam: regression.type="glmnet" currently doesn't work
# npdrNET.cc.results <- npdr("class", case.control.data, regression.type="glmnet", attr.diff.type="numeric-abs",
#                                nbd.method="multisurf", nbd.metric = "manhattan", msurf.sd.frac=.5,
#                                glmnet.alpha=1, glmnet.lower=0, 
#                       # argument removed from function?
#                       #glmnet.family="binomial",
#                       verbose=T)
# npdrNET.cc.results.mat <- as.matrix(npdrNET.cc.results)
# # .05 regression coefficient threshold is arbitrary
# # not sure why glmnet did not force zeros
# # Negative coefficients mean irrelevant attributes for Relief scores.
# # However, glmnet does not include ordinal models. 
# nonzero.npdrNET.mask <- abs(npdrNET.cc.results.mat[,1])>0 
# cbind(shrunk.betas=npdrNET.cc.results.mat[nonzero.npdrNET.mask,1])  # rownames and 1 column of shrunk betas
# 
# dim(npdrNET.cc.results.mat)
# # Naively remove negative coefficients, but would be better to modify shrinkage model.
# #pos.npdrNET.mask <- npdrNET.cc.results.mat[,1]>0.05  
# #as.matrix(npdrNET.cc.results.mat[pos.npdrNET.mask,],ncol=1) 
# 
# # functional attribute detection stats
# npdrNET.cc.positives <- names(npdrNET.cc.results.mat[nonzero.npdrNET.mask,]) # p.adj<.05
# npdrNET.cc.detect.stats <- detectionStats(functional.case.control, npdrNET.cc.positives)
# cat(npdrNET.cc.detect.stats$report)

## Random Forest
library(randomForest)
ranfor.cc.fit <- randomForest(as.factor(class) ~ ., data = case.control.data) 
rf.cc.importance <- importance(ranfor.cc.fit)  
rf.cc.sorted<-sort(rf.cc.importance, decreasing=T, index.return=T)
#rf.cc.sorted$ix
#rownames(rf.cc.importance)[rf.cc.sorted$ix]
#cbind(rownames(rf.cc.importance)[rf.cc.sorted$ix],rf.cc.importance[rf.cc.sorted$ix])[1:25,]
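# Sketch (an assumption, not in the original): detection stats for the
# top-ranked random forest features, taking as many positives as simulated
# signals since Gini importance has no significance threshold.
n.signals <- round(pct.signals * n.variables)
rf.cc.positives <- rownames(rf.cc.importance)[rf.cc.sorted$ix[1:n.signals]]
cat(detectionStats(functional.case.control, rf.cc.positives)$report)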

### Compare random forest and npdr
rf.cc.df <- data.frame(vars=rownames(rf.cc.importance),MeanDecreaseGini=rf.cc.importance)
npdr.cc.beta.df <- data.frame(vars=npdr.cc.results$att,npdr.beta=npdr.cc.results$beta.Z.att)

rf.cc.cutoff <- 0
# standardized beta at the adjusted-p = .05 boundary, used as the plot cutoff
npdr.cc.pcutoff <- npdr.cc.results$beta.Z.att[which(npdr.cc.results$pval.adj>.05)[1]-1]


library(ggplot2)
test.cc.df <- merge(rf.cc.df,npdr.cc.beta.df)
functional <- factor(c(rep("Func",length(functional.case.control)),rep("Non-Func",n.variables-length(functional.case.control))))
ggplot(test.cc.df, aes(x=MeanDecreaseGini,y=npdr.beta)) + geom_point(aes(colour = functional), size=4) +
  theme(text = element_text(size = 20)) +
  geom_vline(xintercept=rf.cc.cutoff, linetype="dashed") +
  geom_hline(yintercept=npdr.cc.pcutoff, linetype="dashed") +
  xlab("Random Forest Score") + ylab("NPDR Standardized Coefficient") 


## Testing out penalized neighbor idea
# fit a separate diff regression for each sample's neighborhood, then average the z-scores

my.cc.attrs <- case.control.data[,colnames(case.control.data)!="class"]
my.cc.pheno <- as.numeric(as.character(case.control.data[,colnames(case.control.data)=="class"]))
neighbor.cc.pairs.idx <- nearestNeighbors(my.cc.attrs, 
                                       nbd.method="relieff", nbd.metric="manhattan", 
                                       sd.frac = .5, k=0)

Ridx_vec <- neighbor.cc.pairs.idx[,"Ri_idx"]
NNidx_vec <- neighbor.cc.pairs.idx[,"NN_idx"]

attr.idx <- 1
my.cc.attr <- my.cc.attrs[,attr.idx] 

num.samp <- nrow(my.cc.attrs)
knnSURF(num.samp,.5)
neighborhood.cc.betas <- rep(0,num.samp)
neighborhood.cc.pvals <- rep(0,num.samp)
for (Ridx in 1:num.samp){
  #Ridx <- 51
  Ri.attr.vals <- my.cc.attr[Ridx]
  NN.attr.vals <- my.cc.attr[NNidx_vec[Ridx_vec==Ridx]]
  attr.diff.vec <- npdrDiff(Ri.attr.vals, NN.attr.vals, diff.type="numeric-abs")

  Ri.pheno.vals <- my.cc.pheno[Ridx]
  NN.pheno.vals <- my.cc.pheno[NNidx_vec[Ridx_vec==Ridx]]
  pheno.diff.vec <- npdrDiff(Ri.pheno.vals, NN.pheno.vals, diff.type="match-mismatch")
  pheno.diff.vec <- factor(pheno.diff.vec, levels=c(0,1))
  mod <- glm(pheno.diff.vec ~ attr.diff.vec, family=binomial(link=logit))
  fit <- summary(mod)
  beta_a <- coef(fit)[2, 1]         # raw beta coefficient, slope (not standardized)
  beta_zscore_a <- coef(fit)[2, 3]  # standardized beta coefficient (col 3)
  ## use one-sided p-value to test H1: beta>0 for case-control npdr scores
  pval_beta_a <- pt(beta_zscore_a, mod$df.residual, lower.tail = FALSE)  # one-sided p-val
  neighborhood.cc.betas[Ridx] <- beta_zscore_a
  neighborhood.cc.pvals[Ridx] <- pval_beta_a
}
#cbind(neighborhood.cc.betas, neighborhood.cc.pvals, my.cc.pheno)
beta_zscore_ave <- mean(neighborhood.cc.betas)  # average neighborhood z-score
mean(neighborhood.cc.pvals)  # average neighborhood p-value
pt(beta_zscore_ave, knnSURF(num.samp,.5), lower.tail = FALSE)  # t approximation with surf-k df
pnorm(beta_zscore_ave, lower.tail = FALSE)  # normal approximation
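# Sketch comparison (an assumption, not in the original): npdr's reported
# standardized beta for the same attribute, as a rough sanity check against
# the neighborhood-averaged z-score computed above.
npdr.cc.results$beta.Z.att[npdr.cc.results$att == colnames(my.cc.attrs)[attr.idx]]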


#### look at nearest neighbors

predictors.cc.mat <- case.control.data[, - which(colnames(case.control.data) == "class")]
my.cc.nbrs <- nearestNeighbors(predictors.cc.mat, 
                             nbd.method="multisurf", 
                             nbd.metric = "manhattan", 
                             sd.frac = 0.5, k=0,
                             neighbor.sampling="none")
length(my.cc.nbrs[,1])
#my.cc.nbrs[my.cc.nbrs[,1]==23,2]
#my.cc.nbrs[my.cc.nbrs[,1]==31,2]
#my.cc.nbrs[my.cc.nbrs[,1]==102,2]

# knnVec <- function(neighbor.pairs.mat){
#   # number of neighbors for each sample
#   sample.ids <- unique(neighbor.pairs.mat[,1])
#   n.samp <- length(sample.ids)
#   knn.vec <- numeric(length=n.samp) # k for each sample's neighborhood
#   for (i in 1:n.samp){
#     knn.vec[i] <- length(neighbor.pairs.mat[neighbor.pairs.mat[,1]==i,2])
#   }
#   return(knn.vec)
# }
plot(knnVec(my.cc.nbrs))
mean(knnVec(my.cc.nbrs))

knnSURF(200,.5)  # theoretical surf k for 200 samples

my.cc.unique.nbrs <- uniqueNeighbors(my.cc.nbrs)
plot(knnVec(my.cc.unique.nbrs))
mean(knnVec(my.cc.unique.nbrs))
length(my.cc.unique.nbrs[,1])
#my.cc.unique.nbrs[my.cc.unique.nbrs[,1]==2,2]
#my.cc.unique.nbrs[my.cc.unique.nbrs[,1]==31,2]
#my.cc.unique.nbrs[my.cc.unique.nbrs[,1]==102,2]