knitr::opts_chunk$set(echo = TRUE)
library(ransac) library(ggplot2) library(parallel) library(ggbeeswarm) library(reshape2) library(verissimo) library(futile.logger) # # setup logger flog.layout(layout.simple) flog.appender(appender.tee('logger.txt')) flog.threshold(DEBUG)
generate.cache.name <- function(name) { cache.name <- sprintf('%s-%d-%d-%d-%d-%f-%f-%d-%s.RData', name, nrow(xdata), ncol(xdata), k, n, threshold, good.fit.perct, my.seed, data.name, family) cache.name <- file.path('cache', gsub('[ /]', '_', cache.name)) }
# if (!exists('xdata') || !exists('ydata')) { genes <- c("ENSG00000072778","ENSG00000235505") load('BRCA.data.norm.RData') xdata <- BRCA.data.norm[,genes] ydata <- data.Y colnames(ydata) <- 'logit_class' data.name <- 'BRCA (2 co-variates)' } flog.info(paste0('Description of \'%s\' dataset:\n', ' cases: %d\n', ' class == 1: %d\n', ' class == 0: %d\n', '---------------------\n', ' co-variates: %d'), data.name, nrow(xdata), sum(ydata == 1), sum(ydata == 0), ncol(xdata))
if (ncol(xdata) > 100) { xdata.melt <- melt(xdata[, sample(seq(ncol(xdata)), 50)]) flog.info('xdata has more than 50 co-variates (%d), showing a random subset of 50.', ncol(xdata)) } else { xdata.melt <- melt(xdata) } colnames(xdata.melt) <- c('id', 'feature', 'value') xdata.melt$feature <- factor(xdata.melt$feature) xdata.melt$feature.short <- xdata.melt$feature levels(xdata.melt$feature.short) <- gsub('ENSG0+','', levels(xdata.melt$feature)) # g <- ggplot(xdata.melt, aes(feature.short, value)) + theme_minimal() + theme(axis.text.x = element_text(angle = 90, face = 'italic', hjust = 0, vjust = .5)) + xlab('Feature') + geom_boxplot(notch = TRUE, varwidth = TRUE, colour = my.colors(1), outlier.colour = my.colors(5), outlier.shape = 1) # g + scale_y_continuous(trans = 'log1p') + ylab('Value (log1p scale)') g + ylab('Value (no scale)')
# seed to make this reproducible RNGkind("L'Ecuyer-CMRG") # Parameters if (exists('force.k')) { n <- force.n } else { n <- 30 } if (exists('force.k')) { k <- force.k } else { k <- 5000 } if (exists('force.seed')) { my.seed <- force.seed } else { my.seed <- 209 } if (exists('force.thre')) { threshold <- force.thre } else { threshold <- .35^2 } if (exists('force.good')) { good.fit.perct <- force.good } else { good.fit.perct <- .8 } if (exists('force.fami')) { family <- force.fami } else { family <- 'binomial.glm' } if (exists('force.core')) { mc.cores <- force.core } else { mc.cores <- 16 }
k
: number of iterationsn
: cases to be included in the model (this should be the minimum number of observations required to fit the model)threshold
: error threshold to keep case as inlier (error
< threshold
)good.fit.perct
: percentage of inlier cases that are required to consider a valid modelflog.info(paste0('Description of parameters:\n', ' k: %d\n', ' n: %d\n', ' threshold: %g\n', ' good.fit.perct : %g'), k, n , threshold, good.fit.perct)
set.seed(my.seed) # cache.name <- generate.cache.name('results.ransac') if (file.exists(cache.name)) { load(cache.name) flog.info('Loading cached ransac from %s', cache.name) } else { results.ransac <- ransac(xdata, ydata, n = n, threshold = threshold, good.fit.perct = good.fit.perct, k = k, family = family, mc.cores = mc.cores, nlambda = 100, alpha = 0) flog.info('Saving cache to disk (%s)', cache.name) save(results.ransac, xdata, ydata, family, data.name, file = cache.name) } plot(results.ransac, xdata, ydata, family = family, name = 'Original data', nlambda = 100)
set.seed(my.seed) # ydata.pert <- ydata ix.marta <- c(23,26,29,35,41,44,54,79,84,91,98,116,129,136,154,155,161,183,187,218) ydata.pert[ix.marta] <- (ydata.pert[ix.marta] - 1) * -1 len <- length(ydata) # #ydata.pert[sample(which(ydata == 1),10)] <- 0 #ydata.pert[sample(which(ydata == 0),10)] <- 1 # cache.name <- generate.cache.name('results.ransac.pert') if (file.exists(cache.name)) { flog.info('Loading cached ransac from %s', cache.name) load(cache.name) } else { results.ransac.pert <- ransac(xdata, ydata.pert, n = 20, threshold = threshold, good.fit.perct = good.fit.perct, k = k, family = family, mc.cores = mc.cores, nlambda = 100, alpha = 0) flog.info('Saving cache to disk (%s)', cache.name) save(results.ransac.pert, xdata, ydata, ydata.pert, family, file = cache.name) } plot(results.ransac.pert, xdata, ydata.pert, family = family, name = 'With perturbation') #
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.