In jpvert/kmr4toxicogenetics: Kernel multitask regression for toxicogenetics

This vignette is intended for reproducing the experiments of the following paper:

Elsa Bernard, Yunlong Jiao, Erwan Scornet, Veronique Stoven, Thomas Walter, and Jean-Philippe Vert. "Kernel multitask regression for toxicogenetics." Submitted. 2017. bioRxiv-171298.

knitr::opts_chunk$set(error = TRUE, message = FALSE, warning = FALSE, cache = TRUE)
library(kmr4toxicogenetics)

savepath <- "results_large/" # to save data locally for use later
if (!dir.exists(savepath))
  dir.create(savepath)

I. Read data and kernel matrices

We test kernel multitask regression (KMR), against several standard prediction algorithms, on the data of the Dream 8 toxicogenetics challenge sub-challenge 1, where participants were asked to predict the effect of various toxic chemicals on cell lines. For details of the challenge and the toxicogenetics data, see the challenge publication:

Eduati, F., et al. "Prediction of human population responses to toxic compounds by a collaborative competition." Nature biotechnology 33.9 (2015): 933-940. \href{https://doi.org/10.1038/nbt.3299}{doi:10.1038/nbt.3299}.

Data were downloaded and processed from the Dream 8 toxicogenetics challenge, publicly available at the data portal \url{https://doi.org/10.7303/syn1761567}. Specifically, we have the following:

Toxicity response values. Details are referred to ?tox.
Design matrix, as input to standard prediction algorithms. Details are referred to ?design.
The identifiers of the subset of cell lines for which RNA-seq data were available. Details are referred to ?id.rna.

# Load toxicity response - sizeof ncell x nchem
data("tox", package = "kmr4toxicogenetics")
cellnames <- rownames(tox)
ncelllines <- nrow(tox)
# Number of total cell lines
ncelllines
chemnames <- colnames(tox)
nchemicals <- ncol(tox)
# Number of total chemicals
nchemicals

# Load design matrix - sizeof ncell x p
data("design", package = "kmr4toxicogenetics")
# Cell line design matrix dimension
dim(design)
# Number of dim in each feature category
table(as.vector(
  sapply(colnames(design), function(a){
    if (!grepl("gram",a)) {
      return("covariate")
    } else {
      return(paste(strsplit(a,split="")[[1]][1:7],collapse=""))
    }
  })
))

# Load RNA-seq only cell line identifiers
data("id.rna", package = "kmr4toxicogenetics")
# Number of such cell lines
length(id.rna)

From the feature data or descriptors available for cell lines and chemical compounds, we devised several kernels for cell lines and chemicals, as detailed in ?kcell and ?kchem. Further, we have calculated average kernels, integrated kernels and empirical kernels (implementation details elaborated as follows). A summary of kernels under consideration is found in Table 1 of the referenced paper (Elsa et al., 2017).

# Load cell line kernels
data("kcell", package = "kmr4toxicogenetics")
kcellname <- names(kcell)

# Load chemical kernels
data("kchem", package = "kmr4toxicogenetics")
kchemname <- names(kchem)

# Add mean kernel when we have several RBF kernels of different bandwidth for both cell lines and chemicals
kcell <- c(kcell, 
           list("KrnaseqMean" = do.call(WGCNA::pmean, kcell[grep('rnaseq', kcellname)]), 
                "KsnpMean" = do.call(WGCNA::pmean, kcell[grep('snp', kcellname)])))
kcellname <- c(kcellname, 'KrnaseqMean', 'KsnpMean')
kchem <- c(kchem, 
           list("KcdkMean" = do.call(WGCNA::pmean, kchem[grep('cdk', kchemname)]) ,
                "KpredtargetMean" = do.call(WGCNA::pmean, kchem[grep('predtarget', kchemname)]) ,
                "KsirmsMean" = do.call(WGCNA::pmean, kchem[grep('sirms', kchemname)])))
kchemname <- c(kchemname, 'KcdkMean', 'KpredtargetMean', 'KsirmsMean')

# Add multitask kernel for chemicals (interpolation between Dirac and uniform)
ninter <- 11
kchem <- c(kchem, lapply(as.list(seq(ninter)), function(i){
  kk <- ((i-1) * matrix(1, nchemicals, nchemicals) + (ninter - i) * diag(nchemicals)) / (ninter - 1)
  colnames(kk) <- chemnames
  rownames(kk) <- chemnames
  return(kk)
}))
kchemname <- c(kchemname, sapply(seq(ninter), function(i) paste('Kmultitask', i, sep = '')))

# Add empirical correlation kernel for chemicals
kchem <- c(kchem, 
           list("Kemp" = cor(tox)))
kchemname <- c(kchemname, 'Kempirical')

# Add mean kernel of heterogeneous sources for both cell lines and chemicals
kcell <- c(kcell, 
           list("Kint" = do.call(WGCNA::pmean, kcell[match(c("Kcovariates", "KrnaseqMean", "KsnpMean"), kcellname)])))
kcellname <- c(kcellname, 'Kint')
kchem <- c(kchem, 
           list("Kint" = do.call(WGCNA::pmean, kchem[match(c("KcdkMean", "KpredtargetMean", "KsirmsMean", "Kmultitask1", "Kmultitask11", "Kempirical"), kchemname)])))
kchemname <- c(kchemname, 'Kint')

# Preview kernel names
# Cell line kernels
kcellname
# Chemical kernels
kchemname

II. Kernel Multitask Regression (KMR)

Prediction

# CV setup
nfolds <- 5
nrepeats <- 10

# Seed
seed <- 47

# Number of cores
mc.cores <- 1

# Main loop : Try each cell line kernel with each chemical kernel
sT = sF = matrix(NA, nrow = length(kcell), ncol = length(kchem), dimnames = list(kcellname, kchemname))
for (flagrna in c(TRUE, FALSE)) {
  # whether only RNA-seq cell lines vs all cell lines are used
  for (icell in seq(length(kcell))) {
    # for each cell line
    for (ichem in seq(length(kchem))) {
      # for each chemical
      objname <- paste("cvKMR", flagrna, kcellname[icell], kchemname[ichem], sep = "_")
      filename <- paste0(savepath, objname, ".RData")
      if (file.exists(filename)) {
        # CASE I - file exists from earlier runs
        # load the results
        cvres <- get(load(filename))
      } else {
        # CASE II - file not found
        message("running... keep rna cell ", flagrna, 
                " cell ", icell," out of ", length(kcell),
                " chem ", ichem," out of ", length(kchem),
                " ", objname)
        # specify kernel data and y data
        chemicalsKernel <- kchem[[ichem]]
        if(flagrna) {
          celllinesKernel <- kcell[[icell]][id.rna, id.rna]
          toxicity <- tox[id.rna, ]
        } else {
          celllinesKernel <- kcell[[icell]]
          toxicity <- tox
        }
        # run CV evaluation with KMR
        cvres <- evaluateCV(mypredictor = "predictorKMR", 
                            celllinesKernel = celllinesKernel, 
                            chemicalsKernel = chemicalsKernel, 
                            toxicity = toxicity, 
                            nfolds = nfolds, 
                            nrepeats = nrepeats, 
                            seed = seed, 
                            mc.cores = mc.cores)
        assign(objname, cvres)
        save(list = objname, file = filename)
      }
      rm(list = objname)
      # focus on CI scores and impute NA to 0.5 (random guess)
      cvres$matrix.ci[is.na(cvres$matrix.ci)] <- 0.5
      # average CI scores (over CV folds) per chemical per cell line
      if (flagrna)
        sT[icell,ichem] <- mean(cvres$matrix.ci)
      else
        sF[icell,ichem] <- mean(cvres$matrix.ci)
    }
  }
}

Results

# save default graphics options as to restore later
op <- par(no.readonly = TRUE)

## For all cell lines:
# CI heatmap
gplots::heatmap.2(sF, trace = "none", margin = c(8, 9), main = "CI")
# Mean CI over cell line kernels
par(mar = c(5.1,10.1,4.1,0.1))
barplot(sort(apply(sF, 1, mean)), horiz = TRUE, las = 1, xlim = c(min(sF) - 0.005, max(sF) + 0.005), xpd = FALSE, main = "Mean CI for cell line kernels")
par(op)
# Mean CI over chemical kernels
par(mar = c(5.1,8.1,4.1,0.1))
barplot(sort(apply(sF, 2, mean)), horiz = TRUE, las = 1, xlim = c(min(sF) - 0.005, max(sF) + 0.005), xpd = FALSE, main = "Mean CI for chemicals kernels", cex.names = 0.8)
par(op)
# Table of top scoring kernel pair
s <- reshape2::melt(sF, varnames = c("kcell", "kchem"))
s <- s[order(s$value, decreasing = TRUE), ]
s <- cbind("rank" = seq(nrow(s)), s)
rownames(s) <- s$rank
knitr::kable(head(s, 30), caption = "Top scoring kernel pair")

## For RNA-seq cell lines only:
# CI heatmap
gplots::heatmap.2(sT, trace = "none", margin = c(8, 9), main = "CI (RNA-seq only)")

Supplement: Regularization path

Take as example integrated kernel on cell lines and empirical kernel on chemicals, we train a KMR model on full data and plot the regularization path.

# set kernels and data
celllinesKernel <- kcell[["Kint"]]
cat("Cell line kernel dim = ", dim(celllinesKernel))
chemicalsKernel <- kchem[["Kemp"]]
cat("Chemical kernel dim = ", dim(chemicalsKernel))
toxicity <- tox
cat("Toxicity response dim = ", dim(toxicity))

# train a model on full dataset
modelKMR <- kmr::cv.kmr(x = celllinesKernel, 
                        y = toxicity, 
                        kx_type = "precomputed", 
                        kt_type = "precomputed", 
                        kt_option = list(kt = chemicalsKernel), 
                        lambda = exp(-15:25), 
                        nfolds = 5, 
                        nrepeats = 1)

# plot regularization path
par(cex.axis = 1.5, cex.lab = 1.5, font.axis = 2, font.lab = 2)
plot(modelKMR)
par(op)
rm(modelKMR)

IV. Comparison to state-of-the-art

State-of-the-art

We compare prediction performance (5-fold CV repeated 10 times) obtained with:

lasso
elastic net
random forest
KMR (with integrated kernel on cell lines and empirical kernel on chemicals for instance)

We also compare running time of these methods, by reporting the half of the running time of a 2-fold 1-repeat CV (2-fold as to get equal training-to-test sample size). Note that except RF now uses 100 trees (instead of 500 by default) to test running time, all methods are set with the same parameters as used in comparing prediction performance.

# set parameter for running time comparison
nfolds.run <- 2
nrepeats.run <- 1
seed.run <- seed
mc.cores.run <- 1

# 1) ElasticNet
message("ElasticNet ...")
# pred perf
filename <- paste0(savepath, "cvElasticNet.RData")
if (file.exists(filename)) {
  load(filename)
} else {
  message("running... ", filename)
  cvElasticNet <- evaluateCV(mypredictor = "predictorElasticNet", celllines = design, toxicity = tox, nfolds = nfolds, nrepeats = nrepeats, seed = seed, mc.cores = mc.cores)
  save(cvElasticNet, file = filename)
}
# running time
ptElasticNet <- system.time(evaluateCV(mypredictor = "predictorElasticNet", celllines = design, toxicity = tox, nfolds = nfolds.run, nrepeats = nrepeats.run, seed = seed.run, mc.cores = mc.cores.run))

# 2) Lasso
message("Lasso ...")
# pred perf
filename <- paste0(savepath, "cvLasso.RData")
if (file.exists(filename)) {
  load(filename)
} else {
  message("running... ", filename)
  cvLasso <- evaluateCV(mypredictor = "predictorLasso", celllines = design, toxicity = tox, nfolds = nfolds, nrepeats = nrepeats, seed = seed, mc.cores = mc.cores)
  save(cvLasso, file = filename)
}
# running time
ptLasso <- system.time(evaluateCV(mypredictor = "predictorLasso", celllines = design, toxicity = tox, nfolds = nfolds.run, nrepeats = nrepeats.run, seed = seed.run, mc.cores = mc.cores.run))

# 3) RF
message("RF ...")
# pred perf
filename <- paste0(savepath, "cvRF.RData")
if (file.exists(filename)) {
  load(filename)
} else {
  message("running... ", filename)
  cvRF <- evaluateCV(mypredictor = "predictorRF", celllines = design, toxicity = tox, nfolds = nfolds, nrepeats = nrepeats, seed = seed, mc.cores = mc.cores)
  save(cvRF, file = filename)
}
# running time
ptRF <- system.time(evaluateCV(mypredictor = "predictorRF", celllines = design, toxicity = tox, nfolds = nfolds.run, nrepeats = nrepeats.run, seed = seed.run, mc.cores = mc.cores.run, ntree = 100))

# 4) KMR (with integrated kernel on cell lines and empirical kernel on chemicals)
message("KMR ...")
# pred perf
cvKMR <- get(load(paste0(savepath, "cvKMR_FALSE_Kint_Kempirical.RData")))
# running time
ptKMR <- system.time(evaluateCV(mypredictor = "predictorKMR", celllinesKernel = kcell[["Kint"]], chemicalsKernel = kchem[["Kemp"]], toxicity = tox, nfolds = nfolds.run, nrepeats = nrepeats.run, seed = seed.run, mc.cores = mc.cores.run))

Performance comparison

methodlist <- c("ElasticNet", "Lasso", "RF", "KMR")
methodlist <- ordered(methodlist, levels = methodlist)

# Running time per method
times <- lapply(methodlist, function(u) get(paste0("pt",u))[1])
times <- do.call("rbind",times)
rownames(times) <- methodlist
colnames(times) <- "Time (sec)"
knitr::kable(ceiling(times/2), caption = "Running time per method")

# Averaged CI across CV experiments per method per chemical
scores <- list()
for (i in seq_along(methodlist)) {
  cvres <- get(paste0("cv",methodlist[i]))
  # focus on CI and impute NA to 0.5 (random guess)
  cvres$matrix.ci[is.na(cvres$matrix.ci)] <- 0.5
  scores[[i]] <- data.frame(
    "method" = methodlist[i],
    "chemical" = colnames(cvres$matrix.ci),
    "CI" = colMeans(cvres$matrix.ci),
    stringsAsFactors = FALSE
  )
}
scores <- do.call("rbind", scores)
rownames(scores) <- seq(nrow(scores))

# Mean CI over chemicals per method
tabscores <- tapply(scores$CI, scores$method, function(u){
  c("Mean" = mean(u), "SD" = sd(u))
})
tabscores <- do.call("rbind", tabscores)
# preview
knitr::kable(tabscores, digits = 4, caption = "Mean CI over chemicals per method")

# Mean CI over chemicals per method - Signif test by two-sided t.test
pmatrix <- matrix(NA, nrow = length(methodlist), ncol = length(methodlist),
                  dimnames = list(methodlist, methodlist))
for (i in 1:(length(methodlist) - 1)) {
  for (j in (i + 1):length(methodlist)) {
    pmatrix[i,j] <- t.test(x = scores$CI[scores$method == methodlist[i]], 
                           y = scores$CI[scores$method == methodlist[j]], 
                           alternative = 'two.sided', mu = 0, paired = TRUE)$p.value
  }
}
# correct p-value for multiple testing with Benjamini-Hochberg
pmatrix.adj <- p.adjust(pmatrix, "BH")
attributes(pmatrix.adj) <- attributes(pmatrix)
# preview
pmatrix.adj

# Mean CI over chemicals per method - Boxplot
par(cex.axis = 1.5, cex.lab = 1.5, font.axis = 2, font.lab = 2)
boxplot(CI~method, data = scores, ylab = "CI", col = c("#f4a582", "#0571b0", "#92c5de", "#ca0020"), outline = FALSE)
par(op)