if (!require("BiocManager")) install.packages("BiocManager") BiocManager::install("glmSparseNet")
library(dplyr) library(ggplot2) library(survival) library(loose.rock) library(futile.logger) library(curatedTCGAData) library(TCGAutils) # library(glmSparseNet) # # Some general options for futile.logger the debugging package .Last.value <- flog.layout(layout.format('[~l] ~m')) .Last.value <- loose.rock::show.message(FALSE) # Setting ggplot2 default theme as minimal theme_set(ggplot2::theme_minimal())
The data is loaded from an online curated dataset downloaded from TCGA using
curatedTCGAData
bioconductor package and processed.
To accelerate the process we use a very reduced dataset down to 107 variables only (genes), which is stored as a data object in this package. However, the procedure to obtain the data manually is described in the following chunk.
# chunk not included as it produces to many unnecessary messages brca <- curatedTCGAData(diseaseCode = "BRCA", assays = "RNASeq2GeneNorm", FALSE)
brca <- curatedTCGAData(diseaseCode = "BRCA", assays = "RNASeq2GeneNorm", FALSE)
brca <- TCGAutils::splitAssays(brca, c('01','11')) xdata.raw <- t(cbind(assay(brca[[1]]), assay(brca[[2]]))) # Get matches between survival and assay data class.v <- TCGAbiospec(rownames(xdata.raw))$sample_definition %>% factor names(class.v) <- rownames(xdata.raw) # keep features with standard deviation > 0 xdata.raw <- xdata.raw %>% { (apply(., 2, sd) != 0) } %>% { xdata.raw[, .] } %>% scale set.seed(params$seed) small.subset <- c('CD5', 'CSF2RB', 'HSF1', 'IRGC', 'LRRC37A6P', 'NEUROG2', 'NLRC4', 'PDE11A', 'PIK3CB', 'QARS', 'RPGRIP1L', 'SDC1', 'TMEM31', 'YME1L1', 'ZBTB11', sample(colnames(xdata.raw), 100)) xdata <- xdata.raw[, small.subset[small.subset %in% colnames(xdata.raw)]] ydata <- class.v
Fit model model penalizing by the hubs using the cross-validation function by
cv.glmHub
.
fitted <- cv.glmHub(xdata, ydata, family = 'binomial', network = 'correlation', nlambda = 1000, network.options = networkOptions(cutoff = .6, min.degree = .2))
Shows the results of 1000
different parameters used to find the optimal value
in 10-fold cross-validation. The two vertical dotted lines represent the best
model and a model with less variables selected (genes), but within a standard
error distance from the best.
plot(fitted)
Taking the best model described by lambda.min
coefs.v <- coef(fitted, s = 'lambda.min')[,1] %>% { .[. != 0]} coefs.v %>% { data.frame(ensembl.id = names(.), gene.name = geneNames(names(.))$external_gene_name, coefficient = ., stringsAsFactors = FALSE) } %>% arrange(gene.name) %>% knitr::kable()
geneNames(names(coefs.v)) %>% { hallmarks(.$external_gene_name)$heatmap }
resp <- predict(fitted, s = 'lambda.min', newx = xdata, type = 'class') flog.info('Misclassified (%d)', sum(ydata != resp)) flog.info(' * False primary solid tumour: %d', sum(resp != ydata & resp == 'Primary Solid Tumor')) flog.info(' * False normal : %d', sum(resp != ydata & resp == 'Solid Tissue Normal'))
Histogram of predicted response
response <- predict(fitted, s = 'lambda.min', newx = xdata, type = 'response') qplot(response, bins = 100)
ROC curve
roc_obj <- pROC::roc(ydata, as.vector(response)) data.frame(TPR = roc_obj$sensitivities, FPR = 1 - roc_obj$specificities) %>% ggplot() +geom_line(aes(FPR,TPR), color = 2, size = 1, alpha = 0.7)+ labs(title= sprintf("ROC curve (AUC = %f)", pROC::auc(roc_obj)), x = "False Positive Rate (1-Specificity)", y = "True Positive Rate (Sensitivity)")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.