inst/doc/example_brca_protein-protein-interactions_survival.R

params <-
list(seed = 20188102)

## ---- eval=FALSE--------------------------------------------------------------
#  if (!require("BiocManager"))
#    install.packages("BiocManager")
#  BiocManager::install("glmSparseNet")

## ----packages, message=FALSE, warning=FALSE-----------------------------------
library(dplyr)
library(Matrix)
library(ggplot2)
library(forcats)
library(parallel)
library(STRINGdb)
library(reshape2)
library(survival)
library(VennDiagram)
library(loose.rock)
library(futile.logger)
library(curatedTCGAData)
library(TCGAutils)

library(glmSparseNet)

#
# Some general options for futile.logger the debugging package
.Last.value <- flog.layout(layout.format('[~l] ~m'))
.Last.value <- loose.rock::show.message(FALSE)

# Setting ggplot2 default theme as minimal
theme_set(ggplot2::theme_minimal())

## ---- eval=FALSE--------------------------------------------------------------
#  # Not evaluated in vignette as it takes too long to download and process
#  all.interactions.700 <- stringDBhomoSapiens(score_threshold = 700)
#  string.network       <- buildStringNetwork(all.interactions.700,
#                                             use.names = 'external')

## ---- include=FALSE-----------------------------------------------------------
data('string.network.700.cache', package = 'glmSparseNet')
string.network <- string.network.700.cache

## -----------------------------------------------------------------------------
string.network.undirected <- string.network + Matrix::t(string.network)
string.network.undirected <- (string.network.undirected != 0) * 1

## ---- echo=FALSE, collapse=TRUE-----------------------------------------------
flog.info('Directed graph (score_threshold = %d)', 700)
flog.info('  *       total edges: %d', sum(string.network != 0))
flog.info('  *    unique protein: %d', nrow(string.network))
flog.info('  * edges per protein: %f', 
          sum(string.network != 0) / nrow(string.network) )
flog.info('')
flog.info('Undirected graph (score_threshold = %d)', 700)
flog.info('  *       total edges: %d', sum(string.network.undirected != 0) / 2)
flog.info('  *    unique protein: %d', nrow(string.network.undirected))
flog.info('  * edges per protein: %f', 
          sum(string.network.undirected != 0)/2/nrow(string.network.undirected))

## ---- echo=FALSE--------------------------------------------------------------
string.network.binary <- (string.network.undirected != 0) * 1
degree.network        <- (Matrix::rowSums(string.network.binary) + 
                            Matrix::colSums(string.network.binary)) / 2

flog.info('Summary of degree:', summary(degree.network), capture = TRUE)

## ---- warning=FALSE-----------------------------------------------------------
qplot(degree.network[degree.network <= quantile(degree.network, 
                                                probs = .99999)], 
      geom = 'histogram', fill = my.colors(2), bins = 100) + 
  theme(legend.position = 'none') + xlab('Degree (up until 99.999% quantile)')

## ---- include=FALSE-----------------------------------------------------------
# chunk not included as it produces to many unnecessary messages
brca <- curatedTCGAData(diseaseCode = "BRCA", assays = "RNASeq2GeneNorm", FALSE)

## ---- eval=FALSE--------------------------------------------------------------
#  brca <- curatedTCGAData(diseaseCode = "BRCA", assays = "RNASeq2GeneNorm", FALSE)

## ----data.show, warning=FALSE, error=FALSE------------------------------------
# keep only solid tumour (code: 01)
brca.primary.solid.tumor <- TCGAutils::splitAssays(brca, '01')
xdata.raw                <- t(assay(brca.primary.solid.tumor[[1]]))

# Get survival information
ydata.raw <- colData(brca.primary.solid.tumor) %>% as.data.frame %>% 
  # Convert days to integer
  mutate(Days.to.date.of.Death = as.integer(Days.to.date.of.Death),
         Days.to.Last.Contact  = as.integer(Days.to.Date.of.Last.Contact)) %>%
  # Find max time between all days (ignoring missings)
  rowwise %>%
  mutate(time = max(days_to_last_followup, Days.to.date.of.Death, 
                    Days.to.Last.Contact, days_to_death, na.rm = TRUE)) %>%
  # Keep only survival variables and codes
  select(patientID, status = vital_status, time) %>% 
  # Discard individuals with survival time less or equal to 0
  filter(!is.na(time) & time > 0) %>% as.data.frame

# Set index as the patientID
rownames(ydata.raw) <- ydata.raw$patientID

# keep only features that are in degree.network and have standard deviation > 0
valid.features <- colnames(xdata.raw)[colnames(xdata.raw) %in% 
                                      names(degree.network[degree.network > 0])]
xdata.raw      <- xdata.raw[TCGAbarcode(rownames(xdata.raw)) %in% 
                              rownames(ydata.raw), valid.features]
xdata.raw      <- scale(xdata.raw)

# Order ydata the same as assay
ydata.raw    <- ydata.raw[TCGAbarcode(rownames(xdata.raw)), ]

# Using only a subset of genes previously selected to keep this short example.
set.seed(params$seed)
small.subset <- c('AAK1', 'ADRB1', 'AK7', 'ALK', 'APOBEC3F', 'ARID1B', 'BAMBI', 
                  'BRAF', 'BTG1', 'CACNG8', 'CASP12', 'CD5',  'CDA', 'CEP72', 
                  'CPD', 'CSF2RB', 'CSN3', 'DCT', 'DLG3',  'DLL3', 'DPP4', 
                  'DSG1', 'EDA2R', 'ERP27', 'EXD1', 'GABBR2',  'GADD45A', 
                  'GBP1', 'HTR1F', 'IFNK', 'IRF2', 'IYD', 'KCNJ11',  'KRTAP5-6',
                  'MAFA', 'MAGEB4', 'MAP2K6', 'MCTS1', 'MMP15', 'MMP9',  
                  'NFKBIA', 'NLRC4', 'NT5C1A', 'OPN4', 'OR13C5', 'OR13C8', 
                  'OR2T6', 'OR4K2', 'OR52E6', 'OR5D14', 'OR5H1', 'OR6C4', 
                  'OR7A17', 'OR8J3',  'OSBPL1A', 'PAK6', 'PDE11A', 'PELO', 
                  'PGK1', 'PIK3CB', 'PMAIP1',  'POLR2B', 'POP1', 'PPFIA3', 
                  'PSME1', 'PSME2', 'PTEN', 'PTGES3',  'QARS', 'RABGAP1', 
                  'RBM3', 'RFC3', 'RGPD8', 'RPGRIP1L', 'SAV1',  'SDC1', 'SDC3',
                  'SEC16B', 'SFPQ', 'SFRP5', 'SIPA1L1', 'SLC2A14', 'SLC6A9',
                  'SPATA5L1', 'SPINT1', 'STAR', 'STXBP5', 'SUN3', 'TACC2',
                  'TACR1', 'TAGLN2', 'THPO', 'TNIP1', 'TP53', 'TRMT2B', 'TUBB1',
                  'VDAC1', 'VSIG8', 'WNT3A', 'WWOX', 'XRCC4', 'YME1L1', 
                  'ZBTB11',  'ZSCAN21') %>% 
  sample(size  = 50) %>% sort

# make sure we have 100 genes
small.subset <- c(small.subset, sample(colnames(xdata.raw), 51)) %>% 
  unique %>% 
  sort

xdata <- xdata.raw[, small.subset[small.subset %in% colnames(xdata.raw)]]
ydata <- ydata.raw %>% select(time, status) %>% filter(!is.na(time) | time < 0)

## -----------------------------------------------------------------------------
#
# Add degree 0 to genes not in STRING network

my.degree <- degree.network[small.subset]
my.string <- string.network.binary[small.subset, small.subset]

## ---- echo=FALSE, warning=FALSE-----------------------------------------------
qplot(my.degree, bins = 100, fill = my.colors(3)) + 
  theme(legend.position = 'none')

## -----------------------------------------------------------------------------
set.seed(params$seed)
foldid <- balanced.cv.folds(!!ydata$status)$output

## ---- include=FALSE-----------------------------------------------------------
# List that will store all selected genes
selected.genes <- list()

## ---- warning=FALSE, error=FALSE----------------------------------------------
cv.hub <- cv.glmHub(xdata, 
                    Surv(ydata$time, ydata$status), 
                    family  = 'cox',
                    foldid  = foldid,
                    network = my.string, 
                    network.options = networkOptions(min.degree = 0.2))

## ---- echo=FALSE--------------------------------------------------------------
glmSparseNet::separate2GroupsCox(as.vector(coef(cv.hub, s = 'lambda.min')[,1]), 
                                 xdata, ydata,
                                 plot.title = 'Full dataset', 
                                 legend.outside = FALSE)

selected.genes[['Hub']] <- coef(cv.hub, s = 'lambda.min')[,1] %>% 
  { .[. != 0] } %>%
  names %>% 
  geneNames %>% 
  { .[['external_gene_name']]} 

## ---- warning=FALSE, error=FALSE----------------------------------------------
cv.orphan <- cv.glmOrphan(xdata, 
                          Surv(ydata$time, ydata$status), 
                          family  = 'cox',
                          foldid  = foldid,
                          network = my.string, 
                          network.options = networkOptions(min.degree = 0.2))

## ---- echo=FALSE--------------------------------------------------------------
glmSparseNet::separate2GroupsCox(as.vector(coef(cv.orphan, 
                                                s = 'lambda.min')[,1]), 
                                 xdata, ydata,
                                 plot.title = 'Full dataset',
                                 legend.outside = FALSE)

selected.genes[['Orphan']] <- coef(cv.orphan, s = 'lambda.min')[,1] %>% 
  { .[. != 0] } %>%
  names %>% 
  geneNames %>% 
  { .[['external_gene_name']]} 

## ---- warning=FALSE, error=FALSE----------------------------------------------
cv.glm <- cv.glmnet(xdata, 
                    Surv(ydata$time, ydata$status), 
                    family = 'cox', 
                    foldid = foldid)

## ---- echo=FALSE--------------------------------------------------------------
glmSparseNet::separate2GroupsCox(as.vector(coef(cv.glm, s = 'lambda.min')[,1]), 
                                 xdata, ydata, 
                                 plot.title = 'Full dataset', 
                                 legend.outside = FALSE)

selected.genes[['GLMnet']] <- coef(cv.glm, s = 'lambda.min')[,1] %>% 
  { .[. != 0] } %>%
  names %>% 
  geneNames %>% 
  { .[['external_gene_name']]} 

## ---- echo=FALSE, warning=FALSE-----------------------------------------------
venn.plot <- venn.diagram(selected.genes , 
                          NULL, 
                          fill           = c(my.colors(5), my.colors(3), 
                                             my.colors(4)), 
                          alpha          = c(0.3,0.3, .3), 
                          cex            = 2, 
                          cat.fontface   = 4, 
                          category.names = names(selected.genes))
grid.draw(venn.plot)

## ---- echo=FALSE, warning=FALSE-----------------------------------------------
melt(selected.genes) %>% 
  mutate(Degree = my.degree[value], 
         value  = factor(value), 
         L1     = factor(L1)) %>%
  mutate(value = fct_reorder(value, Degree)) %>%
  as.data.frame %>% ggplot() +
  geom_point(aes(value, L1, size = Degree), shape = my.symbols(3), 
             color = my.colors(3)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0), 
        legend.position = 'top') +
  ylab('Model') + xlab('Gene') + 
  scale_size_continuous(trans = 'log10')

Try the glmSparseNet package in your browser

Any scripts or data that you put into this service are public.

glmSparseNet documentation built on April 14, 2021, 6 p.m.