knitr::opts_chunk$set(echo = TRUE)
library(ransac)
library(ggplot2)
library(parallel)
library(ggbeeswarm)
library(reshape2)
library(verissimo)
library(futile.logger)
#
# setup logger
flog.layout(layout.simple)
flog.appender(appender.tee('logger.txt'))
flog.threshold(DEBUG)

Functions

generate.cache.name <- function(name) {
  cache.name <- sprintf('%s-%d-%d-%d-%d-%f-%f-%d-%s.RData', 
                        name, nrow(xdata), ncol(xdata),
                        k, n, threshold, 
                        good.fit.perct, 
                        my.seed, 
                        data.name,
                        family)
  cache.name <- file.path('cache', gsub('[ /]', '_', cache.name))
}

Load BRCA data with 2 co-variates

#
if (!exists('xdata') || !exists('ydata')) {
  genes <- c("ENSG00000072778","ENSG00000235505")
  load('BRCA.data.norm.RData')
  xdata <- BRCA.data.norm[,genes]
  ydata <- data.Y
  colnames(ydata) <- 'logit_class'
  data.name <- 'BRCA (2 co-variates)'

}

flog.info(paste0('Description of \'%s\' dataset:\n', '        cases: %d\n', '   class == 1: %d\n',
              '   class == 0: %d\n', '---------------------\n', '  co-variates: %d'),
          data.name, nrow(xdata), sum(ydata == 1), sum(ydata == 0),  ncol(xdata))

Show SD of features

if (ncol(xdata) > 100) {
  xdata.melt <- melt(xdata[, sample(seq(ncol(xdata)), 50)])
  flog.info('xdata has more than 50 co-variates (%d), showing a random subset of 50.', ncol(xdata))
} else {
  xdata.melt <- melt(xdata)
}
colnames(xdata.melt) <- c('id', 'feature', 'value')

xdata.melt$feature       <- factor(xdata.melt$feature)
xdata.melt$feature.short <- xdata.melt$feature
levels(xdata.melt$feature.short) <- gsub('ENSG0+','', levels(xdata.melt$feature))
#
g <- ggplot(xdata.melt, aes(feature.short, value)) + theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, face = 'italic', hjust = 0, vjust = .5)) + xlab('Feature') + 
  geom_boxplot(notch = TRUE, varwidth = TRUE, colour = my.colors(1), outlier.colour = my.colors(5), outlier.shape = 1)
# g + scale_y_continuous(trans = 'log1p') + ylab('Value (log1p scale)')
g + ylab('Value (no scale)')

Running with k = 5000

# seed to make this reproducible
RNGkind("L'Ecuyer-CMRG")

# Parameters
if (exists('force.k'))    { n <- force.n }                 else { n <- 30 }
if (exists('force.k'))    { k <- force.k }                 else { k <- 5000 }
if (exists('force.seed')) { my.seed <- force.seed }        else { my.seed <- 209 }
if (exists('force.thre')) { threshold <- force.thre }      else { threshold <- .35^2 }
if (exists('force.good')) { good.fit.perct <- force.good } else { good.fit.perct <- .8 }
if (exists('force.fami')) { family <- force.fami }         else { family <- 'binomial.glm' }
if (exists('force.core')) { mc.cores <- force.core }       else { mc.cores <- 16 }

Description of parameters

flog.info(paste0('Description of parameters:\n', 
                 '                k: %d\n', '                n: %d\n',
                 '        threshold: %g\n', '  good.fit.perct : %g'), 
          k, n , threshold, good.fit.perct)

RANSAC with the original data

set.seed(my.seed)
#
cache.name <- generate.cache.name('results.ransac')
if (file.exists(cache.name)) {
  load(cache.name)
  flog.info('Loading cached ransac from %s', cache.name)
} else {
  results.ransac <- ransac(xdata, ydata, n = n, threshold = threshold, 
                           good.fit.perct = good.fit.perct, 
                           k = k, family = family, mc.cores = mc.cores,
                           nlambda = 100, alpha = 0)
  flog.info('Saving cache to disk (%s)', cache.name)
  save(results.ransac, xdata, ydata, family, data.name, file = cache.name)
}
plot(results.ransac, xdata, ydata, family = family, name = 'Original data', nlambda = 100)

RAMSAC with perturbation in first 10 and last 10

set.seed(my.seed)
#
ydata.pert <- ydata
ix.marta <- c(23,26,29,35,41,44,54,79,84,91,98,116,129,136,154,155,161,183,187,218)
ydata.pert[ix.marta] <- (ydata.pert[ix.marta] - 1) * -1
len <- length(ydata)
#
#ydata.pert[sample(which(ydata == 1),10)] <- 0
#ydata.pert[sample(which(ydata == 0),10)] <- 1
#
cache.name <- generate.cache.name('results.ransac.pert')
if (file.exists(cache.name)) {
  flog.info('Loading cached ransac from %s', cache.name)
  load(cache.name)
} else {
  results.ransac.pert <- ransac(xdata, ydata.pert, n = 20, threshold = threshold, 
                                good.fit.perct = good.fit.perct, k = k,
                                family = family, mc.cores = mc.cores,
                                nlambda = 100, alpha = 0)
  flog.info('Saving cache to disk (%s)', cache.name)
  save(results.ransac.pert, xdata, ydata, ydata.pert, family, file = cache.name)
}
plot(results.ransac.pert, xdata, ydata.pert, family = family, name = 'With perturbation')
#


sysbiomed/ransac documentation built on May 14, 2019, 12:55 a.m.