SmCCNet_Vignette_SingleOmics.R
In SmCCNet: Sparse Multiple Canonical Correlation Network Analysis Tool

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## ---- echo = FALSE, results = "hide", warning = FALSE, eval = TRUE------------
suppressPackageStartupMessages({
    library(pbapply)
    library(Matrix)
    library(igraph)
})

## ---- eval = TRUE-------------------------------------------------------------
library(pbapply)
library(Matrix)
library(igraph)
library(SmCCNet)
library(parallel)

## ----flowPlot, out.width="100%", fig.cap = "Workflow for SmCCNet single-omics setting (binary and quantitative phenotype).", echo = FALSE----
knitr::include_graphics("figures/single-omics-smccnet.jpg")

## ----example data-------------------------------------------------------------
data(ExampleData)
head(X1[ , 1:6])
head(Y)

## ----p1p2, eval = TRUE--------------------------------------------------------
p1 <- ncol(X1)
N <- nrow(X1)
AbarLabel <- colnames(X1)

## ---- eval = FALSE------------------------------------------------------------
#  # preprocess data
#  processed_data <- dataPreprocess(X = as.data.frame(X1), covariates = NULL,
#                    is_cv = TRUE, cv_quantile = 0.2, center = TRUE, scale = TRUE)

## ----CVpara, eval = FALSE, warning = FALSE------------------------------------
#  K <- 3 # number of folds in K-fold CV.
#  s1 <- 0.7
#  subSamp <- 50 # number of subsamples (will be described in later section).
#  
#  # create sparsity penalty options.
#  pen1 <- seq(.05, .3, by = .05)
#  
#  # set a CV directory.
#  CVDir <- "Example3foldCV/"
#  pheno <- "Example3foldCV"
#  dir.create(CVDir)

## ----make K-fold, eval = FALSE------------------------------------------------
#  # set random seed
#  set.seed(12345)
#  
#  # save data and parameters into local directory
#  save(X1, Y, s1, subSamp, pen1,
#       file = paste0(CVDir, "Data.Rdata"))
#  
#  # split data into K folds
#  foldIdx <- split(1:N, sample(1:N, K))
#  for(i in 1:K){
#    iIdx <- foldIdx[[i]]
#    x1.train <- scale(X1[-iIdx, ])
#    yy.train <- Y[-iIdx, ]
#    x1.test <- scale(X1[iIdx, ])
#    yy.test <- Y[iIdx, ]
#  
#    if(is.na(min(min(x1.train), min(yy.train), min(x1.test), min(yy.test)))){
#      stop("Invalid scaled data.")
#    }
#  
#    subD <- paste0(CVDir, "CV_", i, "/")
#    dir.create(subD)
#  
#    # save data from each fold into local directory
#    save(x1.train, yy.train, x1.test, yy.test,
#         pen1, p1,
#         file = paste0(subD, "Data.Rdata"))
#  }

## ----run CV, eval = FALSE-----------------------------------------------------
#  # number of clusters in parSapply should be the same as number specified above
#  suppressWarnings(for (CVidx in 1:K)
#  {
#    # define the sub-directory for each fold
#    subD <- paste0(CVDir, "CV_", CVidx, "/")
#    # load fold data
#    load(paste0(subD, "Data.Rdata"))
#    dir.create(paste0(subD, "SmCCA/"))
#    # create empty vector to store cross-validation result
#    RhoTrain <- RhoTest <- DeltaCor <- rep(0, length(pen1))
#    # evaluate through all the possible penalty candidates
#    for(idx in 1:length(pen1)){
#      l1 <- pen1[idx]
#      print(paste0("Running SmCCA on CV_", CVidx, " pen=", l1))
#      # run single-omics SmCCNet
#      Ws <- getRobustWeightsSingle(x1.train, as.matrix(yy.train), l1, 1,
#                                      SubsamplingNum = 1)
#      # average
#      meanW <- rowMeans(Ws)
#      v <- meanW[1:p1]
#  
#      rho.train <-  cor(x1.train %*% v, yy.train)
#  
#  
#      rho.test <- cor(x1.test %*% v, yy.test)
#  
#  
#      RhoTrain[idx] <- round(rho.train, digits = 5)
#      RhoTest[idx] <- round(rho.test, digits = 5)
#      DeltaCor[idx] <- abs(rho.train - rho.test)
#  
#  
#  
#    }
#  
#    DeltaCor.all <- cbind(pen1, RhoTrain, RhoTest, DeltaCor)
#    colnames(DeltaCor.all) <- c("l1", "Training CC", "Test CC", "CC Pred. Error")
#    write.csv(DeltaCor.all,
#              file = paste0(subD, "SmCCA/SCCA_", subSamp,"_allDeltaCor.csv"))
#  
#  
#  })

## ----aggregate error, eval = FALSE--------------------------------------------
#  # combine prediction errors from all K folds and compute the total prediction
#  # error for each sparsity penalty pair.
#  aggregateCVSingle(CVDir, "SmCCA", NumSubsamp = subSamp, K = K)

## ----get abar, eval = FALSE---------------------------------------------------
#  # set up directory to store all the results
#  plotD <- paste0(CVDir, "Figures/")
#  saveD <- paste0(CVDir, "Results/")
#  dataF <- paste0(CVDir, "Data.Rdata")
#  dir.create(plotD)
#  dir.create(saveD)
#  dir.create(dataF)
#  
#  # type of CCA result, only "SmCCA" supported
#  Method = "SmCCA"
#  
#  
#  # after SmCCA CV, select the best penalty term,
#  # and use it for running SmCCA on the complete dataset
#  for(Method in "SmCCA"){
#    # select optimal penalty term from CV result
#    T12 <- read.csv(paste0(CVDir, "Results/", Method, "CVmeanDeltaCors.csv"))
#    # calculate evaluation metric **
#    pen <- T12[which.min(T12[ ,4]/abs(T12[ ,3])) ,2]
#  
#  
#    l1 <- pen;
#    system.time({
#      Ws <- getRobustWeightsSingle(X1 = X1, Trait = as.matrix(Y),
#                                         Lambda1 = l1,
#                                         s1, SubsamplingNum = subSamp)
#  
#  
#      Abar <- getAbar(Ws, FeatureLabel = AbarLabel[1:p1])
#      save(l1, X1, Y, s1, Ws, Abar,
#           file = paste0(saveD, Method, K, "foldSamp", subSamp, "_", pen,
#                         ".Rdata"))
#    })
#  
#  
#  }

## ----get modules, eval = FALSE------------------------------------------------
#  # perform clustering based on the adjacency matrix Abar
#  OmicsModule <- getOmicsModules(Abar, PlotTree = FALSE)
#  
#  # make sure there are no duplicated labels
#  AbarLabel <- make.unique(AbarLabel)
#  
#  # calculate feature correlation matrix
#  bigCor2 <- cor(X1)
#  
#  # data type
#  types <- rep('gene', nrow(bigCor2))

## ---- eval = FALSE------------------------------------------------------------
#  # filter out network modules with insufficient number of nodes
#  module_length <- unlist(lapply(OmicsModule, length))
#  network_modules <- OmicsModule[module_length > 10]
#  # extract pruned network modules
#  for(i in 1:length(network_modules))
#  {
#    cat(paste0('For network module: ', i, '\n'))
#    # define subnetwork
#    abar_sub <- Abar[network_modules[[i]],network_modules[[i]]]
#    cor_sub <- bigCor2[network_modules[[i]],network_modules[[i]]]
#    # prune network module
#    networkPruning(Abar = abar_sub,CorrMatrix = cor_sub,
#                            type = types[network_modules[[i]]],
#                   data = X1[,network_modules[[i]]],
#  			  Pheno = Y, ModuleIdx = i, min_mod_size = 10,
#                            max_mod_size = 100, method = 'PCA',
#                            saving_dir = getwd())
#    cat("\n")
#  }

## ---- eval = TRUE,echo = FALSE, warning = FALSE, message = FALSE--------------
load('../vignettes/cont_size_16_net_1.Rdata')
row.names(omics_correlation_data) <- NULL
colnames(omics_correlation_data) <- c('Molecular Feature', 'Correlation to Phenotype', 'P-value')
knitr::kable(omics_correlation_data, caption = 'Individual molecular features correlation table with respect to phenotype (correlation and p-value).')

## ----loadings1, echo = FALSE, warning = FALSE, message = FALSE, out.width="100%", fig.cap = "PC1 loading for each subnetwork feature."----
library(grid)
library(tidyverse)
library(shadowtext)
library(reshape2)
BLUE <- "#076fa2"
data <- data.frame(name = row.names(pc_loading), loading = abs(pc_loading[,1]))
plt <- ggplot(data) +
  geom_col(aes(loading, name), fill = BLUE, width = 0.6) 

plt

## ----corheatmap, echo = FALSE, warning = FALSE, message = FALSE, out.width="100%", fig.cap = "Correlation heatmap for subnetwork features."----
###### Correlation heatmap
melted_cormat <- melt(correlation_sub)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
   labs(title = "Correlation Heatmap of Network Features") + 
  theme(plot.title.position = "plot")

## ----adjheatmap, echo = FALSE, warning = FALSE, message = FALSE, out.width="100%", fig.cap = "Adjacency matrix heatmap for subnetwork features."----
###### Correlation heatmap
melted_cormat <- melt(as.matrix(M))
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
   labs(title = "Adjacency Matrix Heatmap of Network Features") + 
  theme(plot.title.position = "plot")


## ---- eval = FALSE------------------------------------------------------------
#  library(RCy3)
#  library(igraph)
#  # load subnetwork data (example, user need to provide the directory)
#  load('ResultDirectory/size_a_net_b.Rdata')
#  M <- as.matrix(M)
#  
#  # correlation matrix for the subnetwork
#  filter_index <- which(abs(correlation_sub) < 0.05)
#  M_ind <- ifelse(correlation_sub > 0, 1, -1)
#  M_adj <- M * M_ind
#  M_adj[filter_index] <- 0
#  diag(M_adj) <- 0
#  
#  # network visualization through cytoscape
#  graph <- igraph::graph_from_adjacency_matrix(M_adj, mode = 'undirected',
#           weighted = TRUE, diag = TRUE, add.colnames = NULL, add.rownames = NA)
#  
#  # define network node type and connectivity
#  V(graph)$type <- sub_type
#  V(graph)$type
#  V(graph)$connectivity <- rowSums(abs(M))
#  V(graph)$connectivity
#  
#  createNetworkFromIgraph(graph,"single_omics_network")

## ----netPlot, out.width="100%", fig.cap = "Trimmed module 1. The strength of the node connections is indicated by the thickness of edges. Red edges and blue edges are for negative and positive connections respectively.", echo = FALSE----
knitr::include_graphics("../vignettes/example_network_cont.png")

## ----example data binary------------------------------------------------------
data(ExampleData)
head(X1[ , 1:6])
head(Y)
# binarize quantitative outcome 
Y <- ifelse(Y > median(Y), 1, 0)

## ----p1p2 binary, eval = TRUE-------------------------------------------------
p1 <- ncol(X1)
N <- nrow(X1)
AbarLabel <- colnames(X1)

## ---- eval = FALSE------------------------------------------------------------
#  K <- 3 # number of folds in K-fold CV.
#  s1 <- 0.7 # feature subsampling percentage.
#  subSamp <- 50 # number of subsamples.
#  CCcoef <- NULL # unweighted version of SmCCNet.
#  metric <- 'auc' # evaluation metric to be used.
#  
#  # create sparsity penalty options.
#  pen1 <- seq(.1, .9, by = .1)
#  
#  # set a CV directory.
#  CVDir <- "Example3foldCV_Binary/"
#  pheno <- "Example3foldCV_Binary"
#  dir.create(CVDir)
#  
#  Y <- ifelse(Y > median(Y), 1, 0)

## ----make K-fold 2, eval = FALSE----------------------------------------------
#  set.seed(12345) # set random seed.
#  # save original data into local directory
#  save(X1, Y, s1, subSamp, pen1,
#       file = paste0(CVDir, "Data.Rdata"))
#  
#  
#  # define the number of components to be extracted
#  ncomp <- 3
#  # split data into k folds
#  foldIdx <- split(1:N, sample(1:N, K))
#  for(i in 1:K){
#    iIdx <- foldIdx[[i]]
#    x1.train <- scale(X1[-iIdx, ])
#    yy.train <- Y[-iIdx, ]
#    x1.test <- scale(X1[iIdx, ])
#    yy.test <- Y[iIdx, ]
#  
#    if(is.na(min(min(x1.train), min(yy.train), min(x1.test), min(yy.test)))){
#      stop("Invalid scaled data.")
#    }
#  
#    subD <- paste0(CVDir, "CV_", i, "/")
#    dir.create(subD)
#    save(x1.train, yy.train, x1.test, yy.test,
#        pen1, p1,
#         file = paste0(subD, "Data.Rdata"))
#  }
#  
#  ############################################## Running Cross-Validation
#  
#  # run SmCCA with K-fold cross-validation
#  suppressWarnings(for (CVidx in 1:K){
#    # define evaluation method
#    EvalMethod <- 'precision'
#    # define and create saving directory
#    subD <- paste0(CVDir, "CV_", CVidx, "/")
#    load(paste0(subD, "Data.Rdata"))
#    dir.create(paste0(subD, "SmCCA/"))
#  
#    # create empty vector to store cross-validation accuracy result
#    TrainScore <- TestScore <- rep(0, length(pen1))
#    for(idx in 1:length(pen1)){
#      # define value of penalty
#      l1 <- pen1[idx]
#  
#      # run SPLS-DA to extract canonical weight
#      Ws <- spls::splsda(x = x1.train, y = yy.train, K = ncomp, eta = l1,
#                         kappa=0.5,
#                         classifier= 'logistic', scale.x=FALSE)
#  
#      # create emtpy matrix to save canonical weights for each subsampling
#      weight <- matrix(0,nrow = ncol(x1.train), ncol = ncomp)
#      weight[Ws[["A"]],] <- Ws[["W"]]
#  
#      # train the latent factor model with logistic regression
#      train_data <- data.frame(x = (x1.train %*% weight)[,1:ncomp],
#                               y = as.factor(yy.train))
#      test_data <- data.frame(x = (x1.test %*% weight)[,1:ncomp])
#  
#      logisticFit <- stats::glm(y~., family = 'binomial',data = train_data)
#      # make prediction for train/test set
#      train_pred <- stats::predict(logisticFit, train_data, type = 'response')
#      test_pred <- stats::predict(logisticFit, test_data, type = 'response')
#      # specifying which method to use as evaluation metric
#      TrainScore[idx] <- classifierEval(obs = yy.train,
#                                        pred = train_pred,
#                                        EvalMethod = metric, print_score = FALSE)
#      TestScore[idx] <- classifierEval(obs = yy.test,
#                                        pred = test_pred,
#                                        EvalMethod = metric, print_score = FALSE)
#  
#    }
#  
#    # combine cross-validation results
#    DeltaAcc.all <- as.data.frame(cbind(pen1, TrainScore, TestScore))
#    DeltaAcc.all$Delta <- abs(DeltaAcc.all$TrainScore - DeltaAcc.all$TestScore)
#    colnames(DeltaAcc.all) <- c("l1", "Training Score", "Testing Score", "Delta")
#  
#    # save cross-validation results to local directory
#    write.csv(DeltaAcc.all,
#              file = paste0(subD, "SmCCA/SCCA_", subSamp,"_allDeltaCor.csv"))
#  
#  }
#  )

## ---- eval = FALSE------------------------------------------------------------
#  # save cross-validation result
#  cv_result <- aggregateCVSingle(CVDir, "SmCCA", NumSubsamp = subSamp, K = 3)
#  
#  # create directory to save overall result with the best penalty term
#  plotD <- paste0(CVDir, "Figures/")
#  saveD <- paste0(CVDir, "Results/")
#  dataF <- paste0(CVDir, "Data.Rdata")
#  dir.create(plotD)
#  dir.create(saveD)
#  dir.create(dataF)
#  
#  # specify method (only SmCCA works)
#  Method = "SmCCA"
#  
#  for(Method in "SmCCA"){
#    # read cross validation result in
#    T12 <- read.csv(paste0(CVDir, "Results/", Method, "CVmeanDeltaCors.csv"))
#    # determine the optimal penalty term
#    pen <- T12[which.max(T12[ ,3]) ,2]
#    l1 <- pen;
#    system.time({
#      # run SPLSDA with optimal penalty term
#      Ws <- getRobustWeightsSingleBinary(X1 = X1, Trait = as.matrix(Y),
#                                         Lambda1 = l1,
#                                         s1, SubsamplingNum = subSamp)
#  
#      # get adjacency matrix
#      Abar <- getAbar(Ws, FeatureLabel = AbarLabel[1:p1])
#      # save result into local directory
#      save(l1, X1, Y, s1, Ws, Abar,
#           file = paste0(saveD, Method, K, "foldSamp", subSamp, "_", pen,
#                         ".Rdata"))
#    })
#  
#  
#  }

## ----sessionInfo--------------------------------------------------------------
sessionInfo()
warnings()