knitr::opts_chunk$set(echo = TRUE)
## Libraries required

# suppress package startup messages so the report output stays clean
suppressPackageStartupMessages({
  library(devtools)
  source(file.path('..', 'R-aux', 'dataset_brca_tnbc.R'))
  #library(ransac)
  devtools::load_all('../')
  library(brca.data)
  library(futile.logger)
  library(glmnet)
  library(verissimo)
  library(dplyr)
  library(ggbeeswarm)
  library(ggplot2)
  library(reshape2)
})
# Setup logging to write to file `logger.txt` and using DEBUG level

#flog.layout(layout.simple)
layout <- layout.format('[~l] ~m')
flog.layout(layout)
flog.appender(appender.tee('logger.txt'))
flog.threshold(DEBUG)

Parameters

#if (exists('my.params')) {
#  params <- my.params
#}
mc.cores            <- params$mc.cores
my.seed             <- params$my.seed
sample.min          <- params$sample.min
nreps               <- params$nreps
feature.selection   <- params$feature.selection
threshold.to.keep   <- params$threshold.to.keep
target.alpha.vector <- params$target.alpha.vector
nlambda             <- params$nlambda
lambda.min.ratio    <- params$lambda.min.ratio
data.origin         <- params$data.origin
train.perc          <- params$train.perc
k                   <- params$k
my.family           <- params$my.family
n                   <- params$n
threshold           <- params$threshold
good.fit.perct      <- params$good.fit.perct
alpha.baseline      <- params$alpha.baseline

# generated parameters
family.fun <- ransac.family(my.family)
{
  flog.info('Parameters:')
  flog.info('')
  flog.info('---- General --------------------')
  flog.info('                           Family: %s', my.family)
  flog.info('         initial seed for reprod.: %d', my.seed)
  flog.info('                       # of cores: %d', mc.cores)
  flog.info('----------------------------------')
  if (feature.selection) {
    flog.info('')
    flog.info('---- Consensus variables ----------')
    flog.info('               sample at each run: %g', sample.min)
    flog.info('                   number of runs: %d', nreps)
    flog.info('  min. of repetitions of variables')
    flog.info('             to keep as consensus: %g', threshold.to.keep)
    flog.info('          alphas to use in glmnet: %s', paste(target.alpha.vector, collapse = ', '))
    flog.info('             # of lambdas to test: %d', nlambda)
    flog.info('                 lambda.min.ratio: %g', lambda.min.ratio)
    flog.info('----------------------------------')
  }
  flog.info('')
  flog.info('---- RANSAC ----------------------')
  flog.info('                      Data origin: %s', data.origin)
  flog.info('                      Train set %%: %g', train.perc)
  flog.info('                     k iterations: %d', k)
  flog.info('          minimum number of cases: %d', n)
  flog.info('      threshold to keep (squared): %g', threshold)
  flog.info('   %% of inliers to consider model: %g', good.fit.perct)
  flog.info('       alpha to be used in RANSAC: %g', alpha.baseline)
}
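
For reference, the RANSAC parameters logged above (k, n, threshold and good.fit.perct) map onto the classic algorithm roughly as in the skeleton below. This is a generic, illustrative sketch with hypothetical fit_fun()/residual_fun() arguments, not the ransac::ransac implementation used later in this report.

# Generic RANSAC skeleton (illustration only; the analysis below uses ransac::ransac).
# fit_fun(x, y) and residual_fun(model, x, y) are hypothetical stand-ins for the
# family-specific fitting and error functions.
ransac.sketch <- function(xdata, ydata, k, n, threshold, good.fit.perct,
                          fit_fun, residual_fun) {
  best.model <- NULL
  best.error <- Inf
  for (it in seq_len(k)) {
    # 1. fit a candidate model on a random minimal subset of n observations
    maybe.ix    <- sample(nrow(xdata), n)
    maybe.model <- fit_fun(xdata[maybe.ix, , drop = FALSE], ydata[maybe.ix])
    # 2. collect every other observation whose residual is below the threshold
    rest.ix   <- setdiff(seq_len(nrow(xdata)), maybe.ix)
    res       <- residual_fun(maybe.model, xdata[rest.ix, , drop = FALSE], ydata[rest.ix])
    inlier.ix <- c(maybe.ix, rest.ix[res < threshold])
    # 3. if enough inliers were found, refit on all of them and keep the best model
    if (length(inlier.ix) >= good.fit.perct * nrow(xdata)) {
      better.model <- fit_fun(xdata[inlier.ix, , drop = FALSE], ydata[inlier.ix])
      better.error <- mean(residual_fun(better.model,
                                        xdata[inlier.ix, , drop = FALSE],
                                        ydata[inlier.ix]))
      if (better.error < best.error) {
        best.model <- better.model
        best.error <- better.error
      }
    }
  }
  best.model
}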

Get the data

This is controlled by the data.origin variable, which decides which type of data is used.

All data and steps are saved in a cache to speed up the analysis.
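
Every cached chunk below follows the same compute-or-load pattern. A minimal sketch of that pattern is shown here; with_cache is a hypothetical helper name, not part of the package, and the actual chunks simply inline file.exists(), load() and save().

# Minimal compute-or-load caching pattern used throughout this report
# (with_cache is a hypothetical helper shown only for illustration).
with_cache <- function(cache.file, compute) {
  if (file.exists(cache.file)) {
    flog.info('Loading cache: %s', cache.file)
    env <- new.env()
    load(cache.file, envir = env)
    as.list(env)
  } else {
    result <- compute()                       # a named list of objects to cache
    dir.create(dirname(cache.file), showWarnings = FALSE, recursive = TRUE)
    save(list = names(result), envir = list2env(result), file = cache.file)
    result
  }
}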

outlier.description <- function(ydata, include.perturb = FALSE) {
  outliers <- 1 * (abs(ydata$real_class - 1*(ydata$prob > 0.5)) == 1)
  if (include.perturb) {
    outliers <- outliers + 2 * (ydata$real_class != ydata$logit_class)
  }
  plyr::revalue(factor(outliers), c('1' = 'Natural', '0' = '', '2' = 'Perturbed', '3' = 'Natural and Perturbed'))
}
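# Toy illustration (hypothetical values) of the outlier codes produced above:
#   'Natural'               -> real_class disagrees with the thresholded prob
#   'Perturbed'             -> logit_class was flipped relative to real_class
#   'Natural and Perturbed' -> both at once
toy.ydata <- data.frame(real_class  = c(1, 0, 1, 0),
                        prob        = c(0.9, 0.8, 0.9, 0.7),
                        logit_class = c(1, 0, 0, 1))
outlier.description(toy.ydata, include.perturb = TRUE)
# expected: "", "Natural", "Perturbed", "Natural and Perturbed"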
# build cache file name
cache.file <- file.path('cache', sprintf('ransac.data_%s.RData', data.origin))

#
set.seed(my.seed)

#
if (file.exists(cache.file)) {
  flog.info('Loading cache with data: %s', cache.file)
  load(cache.file)
} else {
  flog.info('Loading from \'%s\'', data.origin)
  if (data.origin == 'tnbc'){
    my.data  <- dataset.brca.tnbc()
    new.coef <- array(0, ncol(my.data$xdata) + 1)
  } else if (data.origin == 'synth0a') {
    # New coefficient for model
    new.coef      <- c(0,3,2,1.5)
    my.data <- gen.synth(obs = 100, new.coef = new.coef, perturbation.perct = 0.1)
  } else if (data.origin == 'synth0b') {
    # New coefficient for model
    new.coef      <- c(0,3,2,1.5)
    my.data <- gen.synth(obs = 1000, new.coef = new.coef, perturbation.perct = 0.1)
  } else if (data.origin == 'synth1a') {
    # New coefficient for model
    new.coef      <- c(0,3,2,1.5,0,0,0,0,0)
    my.data <- gen.synth(obs = 100, new.coef = new.coef, perturbation.perct = 0.1)
  } else if (data.origin == 'synth1b') {
    # New coefficient for model
    new.coef      <- c(0,3,2,1.5,0,0,0,0,0)
    my.data <- gen.synth(obs = 1000, new.coef = new.coef, perturbation.perct = 0.1)
  } else if (data.origin == 'synth2a') {
    # New coefficient for model
    nvars         <- 10000
    sparse.vars   <- sample(1:nvars, nvars - 100)
    new.coef      <- c(10 * runif(nvars))
    new.coef[sparse.vars] <- 0
    my.data <- gen.synth(obs = 100, new.coef = new.coef, perturbation.perct = 0.1)
  } else if (data.origin == 'synth2b') {
    # New coefficient for model
    nvars         <- 10000
    new.coef      <- c(0,10 * runif(nvars))
    my.data <- gen.synth(obs = 1000, new.coef = new.coef, perturbation.perct = 0.1)
  } else {
    stop(sprintf('Dataset not found: %s', data.origin))
  }
  # get xdata and ydata
  xdata <- my.data$xdata
  ydata <- my.data$ydata

  # set variables
  dataset.name <- data.origin
  dataset.coef <- new.coef

  # save to cache file
  save(xdata, ydata, dataset.name, dataset.coef, file = cache.file)  
}

Define penalty (only used with glmnet models)

# Add more penalties here if needed, such as a degree penalty
penalty.list                  <- list()

default.penalty.factor        <- array(1, ncol(xdata))
names(default.penalty.factor) <- colnames(xdata)

penalty.list$glmnet <- default.penalty.factor
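
As an example of this mechanism, a second penalty vector could be added to penalty.list to down-weight the penalty on a chosen set of variables. The variable set below is a hypothetical placeholder and the final assignment is left commented out, so the pipeline is unchanged.

# Hypothetical extra penalty vector: variables in favoured.vars get half the
# penalty, making glmnet less likely to drop them.
favoured.vars <- head(colnames(xdata), 5)   # placeholder variable set
half.penalty  <- default.penalty.factor
half.penalty[favoured.vars] <- 0.5
# penalty.list$half.favoured <- half.penalty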

Find Consensus variables for data

To disable feature selection, set feature.selection = FALSE.

Otherwise, cv.glmnet is first used to pick a lambda for each alpha, glmnet is then refitted nreps times on balanced subsamples of the data (a fraction sample.min of the observations each time), and the variables whose coefficients are non-zero in at least a fraction threshold.to.keep of the runs are kept as the consensus set. A compact sketch of this loop is shown below.

The parameters are described in the Parameters section above.
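
The toy sketch uses simulated data and a fixed lambda for brevity; the actual chunks that follow use cv.glmnet's lambda.min, balanced subsampling, caching and the parameters above.

# Toy sketch of consensus variable selection on simulated data
# (illustration only; not the cached pipeline below).
set.seed(1)
x.toy <- matrix(rnorm(100 * 20), ncol = 20,
                dimnames = list(NULL, paste0('V', 1:20)))
y.toy <- rbinom(100, 1, plogis(2 * x.toy[, 1] - 2 * x.toy[, 2]))
n.runs   <- 20
selected <- lapply(seq_len(n.runs), function(i) {
  sub.ix <- sample(nrow(x.toy), round(0.8 * nrow(x.toy)))   # subsample the data
  fit    <- glmnet(x.toy[sub.ix, ], y.toy[sub.ix],
                   family = 'binomial', alpha = 1, lambda = 0.05)
  cf <- coef(fit)[-1, 1]
  names(cf)[cf != 0]                                        # non-zero variables in this run
})
counts    <- table(unlist(selected))
consensus <- names(counts)[counts >= 0.75 * n.runs]         # keep variables that repeat enough
consensus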

# build all possible (alpha, penalty) combinations as a flat list
pair.comb <- unlist(lapply(target.alpha.vector, function(target.alpha) {
  lapply(names(penalty.list), function(penalty.name) {
    list(alpha        = target.alpha,
         penalty      = penalty.list[[penalty.name]],
         penalty.name = penalty.name)
  })
}), recursive = FALSE)
#
# This finds the best lambdas for each iteration
#

if (feature.selection) {
  # Does not need parallelization, as cv.glmnet does it itself
  cv.result.list <- lapply(seq_along(pair.comb), function(ix) {
    target.alpha        <- pair.comb[[ix]]$alpha
    target.penalty      <- pair.comb[[ix]]$penalty
    target.penalty.name <- pair.comb[[ix]]$penalty.name
    #
    cache.file <- file.path('cache', sprintf('cv.glmnet-%s.%.3f.%s.%d.%d.%g.RData',
                                             data.origin,
                                             target.alpha, 
                                             pair.comb[[ix]]$penalty.name,
                                             my.seed,
                                             nlambda,
                                             lambda.min.ratio))
    if (file.exists(cache.file)) {
      flog.info('Loading cache from: %s', cache.file)
      load(cache.file)
    } else  {
      flog.info('Running CV.GLMNET to calculate appropriate lambda value')

      # Get predictable fold id
      set.seed(my.seed)
      #
      cv.folds <- balanced.cv.folds(which(ydata$logit_class == 0), which(ydata$logit_class == 1))
      foldid   <- array(0, nrow(ydata))
      foldid[ydata$logit_class == 0] <- cv.folds$output[[1]]
      foldid[ydata$logit_class == 1] <- cv.folds$output[[2]]
      #
      cv.result <- cv.glmnet(xdata, ydata$logit_class, 
                             family           = 'binomial', 
                             alpha            = target.alpha, 
                             penalty.factor   = target.penalty,
                             nlambda          = nlambda, 
                             lambda.min.ratio = lambda.min.ratio,
                             foldid           = foldid, 
                             #
                             mc.cores = mc.cores)
      #
      save(foldid, cv.result, nlambda, target.alpha,
           my.seed, target.penalty.name, target.penalty,
           file = cache.file)
    }
    flog.info('  Parameters:')
    flog.info('           alpha: %f', target.alpha)
    flog.info('    penalty.name: %s', target.penalty.name)
    flog.info('     Best Lambda: %g', cv.result$lambda.min)
    flog.info('')
    #
    return(list(glmnet = cv.result,
                alpha  = target.alpha, 
                penalty.factor = target.penalty,
                penalty.name   = target.penalty.name))
  })
}
#
# Runs nreps times to find the consensus variables
#

if (feature.selection) {
  #  
  flog.info('Running multiple GLMNET to determine consensus variables')

  multiple.runs <- lapply(seq_along(cv.result.list), function(ix) {
    cv.result     <- cv.result.list[[ix]]$glmnet
    target.alpha  <- cv.result.list[[ix]]$alpha
    target.lambda <- cv.result.list[[ix]]$glmnet$lambda.min
    #
    target.penalty.factor <- cv.result.list[[ix]]$penalty.factor
    target.penalty.name   <- cv.result.list[[ix]]$penalty.name
    # reproducibility
    set.seed(my.seed)
    #
    cache.file <- file.path('cache', sprintf('repeated.variables-%s.%.3f.%.3f.%d.%d.RData',
                                             data.origin, target.alpha, sample.min, nreps, my.seed))
    # 
    if (file.exists(cache.file)) {
      flog.info('')
      flog.info('  Loading cache from: %s', cache.file)
      load(cache.file)
    } else {
      glmnet.result <- mclapply(1:nreps, function(ix.2) {
        # get a sample of the data
        balanced.data <- balanced.train.and.test(which(ydata$logit_class == 1), 
                                                 which(ydata$logit_class == 0), 
                                                 train.perc = sample.min, join.all = T)
        flog.info('  Run %d / %d', ix.2, nreps)
        # calculate logistics regression on the train dataset
        fit <- suppressWarnings(glmnet(xdata[balanced.data$train,], 
                                       ydata$logit_class[balanced.data$train], 
                                       family = 'binomial', 
                                       alpha  = target.alpha,
                                       penalty.factor = target.penalty.factor,
                                       # do it just for our target lambda
                                       #  and 1. If it does not converge add
                                       #  lambda values in between.
                                       lambda = find.lambda(target.lambda)))
        # get the coefficients of target.lambda
        my.coef <- coef(fit, s = target.lambda)
        # get the names of the non-zero coefficients
        my.coef.names <- names(my.coef[-1,])[which(as.vector(my.coef[-1,]) != 0)]
        flog.info('  Number of non-zero variables: %d', length(my.coef.names))
        #
        return(my.coef.names)
      }, mc.cores = mc.cores)
      save(target.alpha, glmnet.result, target.lambda, target.penalty.factor, target.penalty.name, sample.min, nreps, file = cache.file)
    }
    flog.info('  Calculating for:')
    flog.info('         alpha: %f', target.alpha)
    flog.info('    sample.min: %f', sample.min)
    flog.info('         nreps: %d', nreps)
    return(list(glmnet = glmnet.result, penalty.name = target.penalty.name, alpha = target.alpha))
  })
}
#
# Counts the variables
#

if (feature.selection) {
  result.df <- data.frame()
  for( ix in seq_along(multiple.runs)) {
    res <- multiple.runs[[ix]]$glmnet
    # initialize the vector of counts
    row.ix            <- unique(unlist(res))
    count.vars        <- array(0, length(row.ix))
    names(count.vars) <- row.ix
    # add 1 for every non-zero coefficient
    for(ix.res in seq_along(res)) {
      count.vars[res[[ix.res]]] <- count.vars[res[[ix.res]]] + 1
    }
    result.df <- rbind(result.df, data.frame(id      = row.ix, 
                                             value   = count.vars, 
                                             penalty = array(multiple.runs[[ix]]$penalty.name, length(row.ix)),
                                             alpha   = array(multiple.runs[[ix]]$alpha, length(row.ix))))
  }
  result.df$type <- sprintf('%s - %2.2f', result.df$penalty, result.df$alpha)
}

Selected variables

if (!feature.selection) {
  new.vars <- seq(ncol(xdata))
  flog.info('No feature selection was performed as parameter \'feature.selection\' = FALSE')
} else {
  new.vars <- unique(subset(dplyr::arrange(result.df, desc(value)), value >= nreps * threshold.to.keep)$id)
  #
  ggplot(data = subset(result.df, value >= nreps * threshold.to.keep)) + 
  geom_freqpoly(aes(value, color = 1), binwidth = 25) +
  facet_wrap(~type) +
  scale_colour_continuous(guide = FALSE) + 
#  scale_y_continuous(trans = 'log10') + 
  theme_minimal() + 
  ylab('Count') + 
  xlab(sprintf('Number of times variable appears in models (%d runs)\n Shows only variables that appear in at least %g%% of the runs', nreps, threshold.to.keep * 100)) +
  ggtitle(sprintf('Total number of variables selected: %d (out of %d)', length(new.vars), ncol(xdata)))

}

Train / Test dataset generation

Select balanced training and test sets
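
The split relies on balanced.train.and.test() from the package helpers; in essence it stratifies the sampling by class, roughly as in this hedged base-R sketch (the exact behaviour of the package function may differ).

# Rough sketch of a class-stratified (balanced) train/test split;
# the report itself uses balanced.train.and.test() from the package helpers.
balanced.split.sketch <- function(class0.ix, class1.ix, train.perc) {
  tr0 <- sample(class0.ix, round(train.perc * length(class0.ix)))
  tr1 <- sample(class1.ix, round(train.perc * length(class1.ix)))
  list(train = sort(c(tr0, tr1)),
       test  = sort(c(setdiff(class0.ix, tr0), setdiff(class1.ix, tr1))))
}

Stratifying by class keeps the class proportions comparable between the train and test partitions.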

cache.file <- file.path('cache', sprintf('ransac.training.test.data-%s.%.3f.RData', 
                                         data.origin, train.perc))

if (file.exists(cache.file)) {
  flog.info('Loading cache file: %s', cache.file)
  load(cache.file)
} else {
  flog.info('Determining train and test datasets')
  set.seed(my.seed)

  # obtain balanced training and test partitions
  sets.ix <- balanced.train.and.test(which(ydata$logit_class == 1), 
                                     which(ydata$logit_class == 0), 
                                     train.perc = train.perc, join.all = T)
  train.ix <- sets.ix$train
  test.ix  <- sets.ix$test

  #  
  xdata.train <- xdata[train.ix, new.vars]
  ydata.train <- as.data.frame(ydata[train.ix,])
  colnames(ydata.train) <- colnames(ydata)
  #
  xdata.test  <- xdata[test.ix, new.vars]
  ydata.test <- as.data.frame(ydata[test.ix,])
  colnames(ydata.test) <- colnames(ydata)
  save(xdata.train, ydata.train, xdata.test, ydata.test, file = cache.file)
}
# Block of expressions so that all log messages are printed together
{
  flog.info('Description of Train partition:')
  flog.info('  Cases: %d (Class 0) + %d (Class 1) = %d',
            table(ydata.train$logit_class)[1],
            nrow(ydata.train) - table(ydata.train$logit_class)[1],
            nrow(ydata.train))
  flog.info('')
  flog.info('Description of Test partition:')
  flog.info('  Cases: %d (Class 0) + %d (Class 1) = %d',
            table(ydata.test$logit_class)[1],
            nrow(ydata.test) - table(ydata.test$logit_class)[1],
            nrow(ydata.test))
}

Baseline model

#
#
#
penalty.factor <- penalty.list$glmnet[new.vars]
#
force.skip.cache <- F
#
# Find best lambdas if that is necessary
#

{
  set.seed(my.seed)
  #
  cv.folds <- balanced.cv.folds(which(ydata.train$logit_class == 0), which(ydata.train$logit_class == 1))
  foldid   <- array(0, nrow(ydata.train))
  foldid[ydata.train$logit_class == 0] <- cv.folds$output[[1]]
  foldid[ydata.train$logit_class == 1] <- cv.folds$output[[2]]
  #   
  flog.info('Running GLMNET with cross-validation to find lambda parameter')
  baseline <- cv.glmnet(xdata.train, ydata.train$logit_class, family = 'binomial', 
                        alpha            = alpha.baseline,
                        nlambda          = 100, 
                        lambda.min.ratio = lambda.min.ratio,
                        #
                        nfolds         = 10,
                        foldid         = foldid,
                        penalty.factor = penalty.factor,
                        mc.cores       = 1)
  lambda.baseline <- baseline$lambda.min
  #
  flog.info('  Found lambda = %g to use in RANSAC if necessary', lambda.baseline)
}
#
# glm(logit_class~., data.frame(xdata, logit_class = ydata$logit_class), family = binomial(link = "logit"), control = glm.control(maxit = 1000))
#
baseline.model.pert <- family.fun$fit.model(xdata.train, ydata.train$logit_class,
                                            lambda.baseline, penalty.factor = penalty.factor,
                                            alpha          = alpha.baseline)

baseline.model.pert.glm <- ransac.binomial.glm()$fit.model(xdata.train, ydata.train$logit_class,
                                            lambda.baseline, penalty.factor = penalty.factor,
                                            alpha          = alpha.baseline)
#
baseline.model.real <- family.fun$fit.model(xdata.train, ydata.train$real_class,
                                            lambda.baseline, penalty.factor = penalty.factor,
                                            alpha          = alpha.baseline)

Description of outliers

miscl.pert <- ydata.train$real_class - round(family.fun$predict(baseline.model.pert, newx = xdata.train, lambda = lambda.baseline)) != 0
miscl.real <- ydata.train$real_class - round(family.fun$predict(baseline.model.real, newx = xdata.train, lambda = lambda.baseline)) != 0

#
{

  if (sum(dataset.coef) != 0) {
    flog.info('Coefficients:', 
              data.frame(Original = dataset.coef, 
                         baseline.real = as.vector(family.fun$coef(baseline.model.real, lambda = lambda.baseline)),
                         baseline.pert = as.vector(family.fun$coef(baseline.model.pert, lambda = lambda.baseline))), capture = T)
  }
  flog.info('')
  flog.info('Calculated baseline model with original data (%s)', family.fun$model.name)
  flog.info('  Misclassifications: %d (# obs: %d)', sum(miscl.real), nrow(xdata.train))
  if (sum(miscl.real) != 0) {
    flog.info('  Which ones? %s', paste(rownames(xdata.train)[miscl.real], collapse = ', '))
  } else { flog.info('  Perfect classification!')}
  flog.info('')
  flog.info('Calculated baseline model with perturbed data (%s)', family.fun$model.name)
  flog.info('  Misclassifications:  %d (# obs: %d)',sum(miscl.pert), nrow(xdata.train))
  if (sum(miscl.pert) != 0) {
    flog.info('  Which ones? %s', paste(rownames(xdata.train)[miscl.pert], collapse = ', '))
  } else { flog.info('  Perfect classification!')}
  flog.info('')
  flog.info('-----  Misclassification tables  ---------------')
  flog.info('\n')

  if (sum(miscl.real) != 0) {
    my.miscl <- miscl.real
    natural.outlier <- factor(ydata.train$real_class[which(my.miscl)] != 1*(ydata.train$prob[which(my.miscl)] > 0.5))
    natural.outlier <- plyr::revalue(natural.outlier, c("TRUE"="Natural", "FALSE"=""))
    prediction <- family.fun$predict(baseline.model.real, newx = xdata.train[which(my.miscl),], lambda = lambda.baseline)
    #
    print(data.frame(
      natural.outlier = natural.outlier,
      gene.class     = ydata.train$real_class[which(my.miscl)],
      gene.prob  = ydata.train$prob[which(my.miscl)],
      model.real      = prediction))
  }
  flog.info('\n')  
    if (sum(miscl.pert) != 0) {
      if (nrow(ydata) <= 100) {
        my.miscl <- miscl.pert | ydata.train$logit_class != ydata.train$real_class
      } else {
        my.miscl <- miscl.pert
      }
      #
      natural.outlier <- factor(ydata.train$real_class[which(my.miscl)] != 1*(ydata.train$prob[which(my.miscl)] > 0.5))
      natural.outlier <- plyr::revalue(natural.outlier, c("TRUE"="Natural", "FALSE"=""))
      #
      prediction <- family.fun$predict(baseline.model.pert, newx = xdata.train[which(my.miscl),], lambda = lambda.baseline)
      #
      outlier    <- factor(ydata.train$real_class[which(my.miscl)] != 1*(prediction > 0.5))
      outlier    <- plyr::revalue(outlier, c("TRUE"="Misclass.", "FALSE"=""))
      #
      perturbed <- factor((ydata.train$logit_class != ydata.train$real_class)[my.miscl])
      perturbed <- plyr::revalue(perturbed, c("TRUE"="X", "FALSE"=""))
      #
      print(data.frame(
        perturbed       = perturbed,
        natural.outlier = natural.outlier,
        gen.class = ydata.train$real_class[which(my.miscl)],
        gen.prob  = ydata.train$prob[which(my.miscl)],
        model.pert      = prediction,
        outlier         = outlier))
  }

}

Outlier index

ydata[ydata$logit_class != ydata$real_class,]

Natural Outliers

ydata[ydata$real_class != 1*(ydata$prob > 0.5),]

RANSAC

Run RANSAC

Original Dataset (without perturbation)

result.ransac.real <- ransac::ransac(xdata = xdata.train, ydata  = ydata.train$real_class, 
                                     k = k, n = n, family = my.family, 
                                     #
                                     threshold = threshold, good.fit.perct = good.fit.perct, 
                                     #
                                     mc.cores = mc.cores, lambda = lambda.baseline,
                                     alpha = alpha.baseline, penalty.factor = penalty.factor)
cache.file <- file.path('cache', sprintf('ransac-real-%s.%d.%d.%d.%.4f.%s.%d.%.4f.%.4f.RData', 
                                         data.origin, nrow(xdata.train), ncol(xdata.train), k, 
                                         lambda.baseline,
                                         my.family, n, threshold, good.fit.perct))

if (!force.skip.cache && file.exists(cache.file)) {
  flog.info('Loading cache with ransac results: %s', cache.file)
  load(cache.file)
} else  {
  flog.info('Running RANSAC')
  set.seed(my.seed)
  result.ransac.real <- ransac::ransac(xdata  = xdata.train, 
                                       ydata  = ydata.train$real_class, 
                                       k      = k, 
                                       n      = n, 
                                       family = my.family, 
                                       #
                                       threshold      = threshold, 
                                       good.fit.perct = good.fit.perct, 
                                       #
                                       mc.cores       = mc.cores,
                                       lambda         = lambda.baseline,
                                       alpha          = alpha.baseline,
                                       penalty.factor = penalty.factor)
  save(k, my.family, n, threshold, good.fit.perct, result.ransac.real, file = cache.file)
}

Dataset with perturbation

result.ransac.pert <- ransac::ransac(xdata = xdata.train, ydata  = ydata.train$logit_class, 
                                     k = k, n = n, family = my.family, 
                                     #
                                     threshold = threshold, good.fit.perct = good.fit.perct, 
                                     #
                                     mc.cores = mc.cores, lambda = lambda.baseline,
                                     alpha = alpha.baseline, penalty.factor = penalty.factor)
cache.file <- file.path('cache', sprintf('ransac-pert-%s.%d.%d.%d.%.4f.%s.%d.%.4f.%.4f.RData', 
                                         data.origin, nrow(xdata.train), ncol(xdata.train), k, 
                                         lambda.baseline,
                                         my.family, n, threshold, good.fit.perct))

if (!force.skip.cache && file.exists(cache.file)) {
  flog.info('Loading cache with ransac results: %s', cache.file)
  load(cache.file)
} else  {
  flog.info('Running RANSAC')
  set.seed(my.seed)
  result.ransac.pert <- ransac::ransac(xdata  = xdata.train, 
                                       ydata  = ydata.train$logit_class, 
                                       k      = k, 
                                       n      = n, 
                                       family = my.family, 
                                       #
                                       threshold      = threshold, 
                                       good.fit.perct = good.fit.perct, 
                                       #
                                       mc.cores       = mc.cores,
                                       lambda         = lambda.baseline,
                                       alpha          = alpha.baseline,
                                       penalty.factor = penalty.factor)
  save(k, my.family, n, threshold, good.fit.perct, result.ransac.pert, file = cache.file)
}

Plot Results

Test dataset

Distance to class

#
if (train.perc == 1) {
  flog.info('No test set being calculated as no test set is being used (train.perc = %g)', train.perc)
} else {
  info.test <- plot(result.ransac.pert, xdata = xdata.test, 
                    ydata          = ydata.test$real_class,
                    baseline       = baseline.model.real,
                    family         = my.family, 
                    name           = paste0(dataset.name, ' (distance to class)'),
                    lambda         = lambda.baseline,
                    alpha          = alpha.baseline,
                    only_consensus = F)
}

Training dataset

Distance to class

#
#
if (train.perc == 1) {
  flog.info('Training-set plot skipped: with train.perc = %g the training set is the full dataset (see the All data section below)', train.perc)
} else {
  info.train <- plot(result.ransac.pert, xdata = xdata.train, 
                     ydata          = ydata.train$real_class,
                     baseline       = baseline.model.real,
                     family         = my.family, 
                     name           = paste0(dataset.name, ' (distance to class)'),
                     lambda         = lambda.baseline,
                     alpha          = alpha.baseline,
                     only_consensus = F)
}

All data

Distance to class

#
info.all <- plot(result.ransac.real, xdata = xdata[, new.vars], 
                 ydata            = ydata$real_class,
                 family           = my.family, 
                 baseline         = baseline.model.real,
                 name             = paste0('Original ', dataset.name, ' (Distance to Real Class)'),
                 lambda           = lambda.baseline,
                 alpha            = alpha.baseline,
                 only_consensus   = F,
                 outliers         = outlier.description(ydata))

Dataset with perturbation

info.all.pert <- plot(result.ransac.pert, xdata = xdata[, new.vars], 
                      ydata          = ydata$real_class,
                      family         = my.family, 
                      baseline       = baseline.model.pert,
                      name           = paste0('Perturbed ', dataset.name, ' (Distance to Real Class)'),
                      lambda         = lambda.baseline,
                      alpha          = alpha.baseline,
                      only_consensus = F,
                      outliers       = outlier.description(ydata, include.perturb = T))

Save indicators

Non-perturbed

cache.file <- file.path('cache', 'overall.RData')
if (file.exists(cache.file)) {
  load(cache.file)
} else {
  overall.df      <- data.frame()
}

new.line <- data.frame(my.seed, data.origin, my.family, k, n, threshold, good.fit.perct)
colnames(new.line) <- c('seed', 'data', 'family', 'k', 'n', 'threshold', 'good.fit.perct')

for (ix in names(info.all$misclassifications)) {
  false.neg <- sum(info.all$misclassifications[[ix]]$false.neg)
  false.pos <- sum(info.all$misclassifications[[ix]]$false.pos)
  prev.names <- colnames(new.line)
  new.line <- cbind(new.line, false.neg + false.pos, false.neg, false.pos)
  colnames(new.line) <- c(prev.names, 
                       sprintf('%s outliers', info.all$description[[ix]]),
                       sprintf('%s false.neg', info.all$description[[ix]]),
                       sprintf('%s false.pos', info.all$description[[ix]]))
}

if (nrow(overall.df) > 0 && ncol(overall.df) != ncol(new.line)) {
  # column mismatch (e.g. no models were obtained in this run):
  # align new.line with overall.df, filling missing columns with NA
  flog.info('Column mismatch with overall.df (possibly no models were obtained); filling missing columns with NA')
  individual.line <- overall.df[1,]
  individual.line[,colnames(new.line)] <- new.line
  individual.line[,!colnames(overall.df) %in% colnames(new.line)] <- NA
  new.line <- individual.line
}
overall.df <- rbind(overall.df, new.line)
overall.df <- distinct(overall.df)
#
save(overall.df, file = cache.file)

Perturbed

cache.file <- file.path('cache', 'overall.pert.RData')
if (file.exists(cache.file)) {
  load(cache.file)
} else {
  overall.df      <- data.frame()
}

new.line <- data.frame(my.seed, data.origin, my.family, k, n, threshold, good.fit.perct)
colnames(new.line) <- c('seed', 'data', 'family', 'k', 'n', 'threshold', 'good.fit.perct')

for (ix in names(info.all.pert$misclassifications)) {
  false.neg <- sum(info.all.pert$misclassifications[[ix]]$false.neg)
  false.pos <- sum(info.all.pert$misclassifications[[ix]]$false.pos)
  prev.names <- colnames(new.line)
  new.line <- cbind(new.line, false.neg + false.pos, false.neg, false.pos)
  colnames(new.line) <- c(prev.names, 
                       sprintf('%s outliers', info.all.pert$description[[ix]]),
                       sprintf('%s false.neg', info.all.pert$description[[ix]]),
                       sprintf('%s false.pos', info.all.pert$description[[ix]]))
}

if (nrow(overall.df) > 0 && ncol(overall.df) != ncol(new.line)) {
  # column mismatch (e.g. no models were obtained in this run):
  # align new.line with overall.df, filling missing columns with NA
  flog.info('Column mismatch with overall.df (possibly no models were obtained); filling missing columns with NA')
  individual.line <- overall.df[1,]
  individual.line[,colnames(new.line)] <- new.line
  individual.line[,!colnames(overall.df) %in% colnames(new.line)] <- NA
  new.line <- individual.line
}
overall.df <- rbind(overall.df, new.line)
overall.df <- distinct(overall.df)
#
save(overall.df, file = cache.file)

Tests to support some decisions on the pipeline

Perfect model (saturated)

lambda.v <- find.lambda(lambda.baseline)
perfect.model <- glmnet(xdata[, new.vars], ydata$logit_class,
       alpha = alpha.baseline,
       family = 'binomial',
       lambda = lambda.v)   # use glmnet's default penalty.factor (all ones)
summary(as.vector(coef(perfect.model, s = lambda.baseline)))
summary(predict(perfect.model, newx = xdata[, new.vars], type = 'response', s = lambda.baseline) - ydata$logit_class)
#
perfect.coef     <- coef(perfect.model, s = lambda.baseline)
new.vars.perfect <- rownames(perfect.coef)[-1][as.vector(perfect.coef[-1, ] != 0)]

my.ridge <- glmnet(xdata[, new.vars.perfect], ydata$logit_class,
       alpha = 0,
       family = 'binomial',
       #lambda = lambda.v)
       nlambda = 100,
       lambda.min.ratio = 1e-10)
len <- length(my.ridge$lambda)   # one column per lambda actually fitted in the ridge path
ydata.rep <- as.data.frame(matrix(rep(ydata$logit_class, len), ncol = len, nrow = nrow(ydata)))
ridge.predict <- predict(my.ridge, newx = xdata[, new.vars.perfect], type = 'response') - ydata.rep
apply(ridge.predict, 2, function(e){ max(abs(e))})

AUC example

len <- 1500

set.seed(my.seed)
nvars     <- 100
xdata.auc <- sapply(seq(nvars), function(e, len) { rnorm(len)}, len)
coef.auc  <- seq(nvars) * runif(nvars)
#
z  <- 1 + xdata.auc %*% coef.auc           # linear combination with a bias
pr <- 1/(1+exp(-z))            # pass through an inv-logit function
y  <- factor(rbinom(len,1,pr))  # bernoulli response variable
y.pert <- y
perturbation.ix <- sample(seq_along(y), round(len * .1))
y.pert[perturbation.ix] <- (as.numeric(as.vector(y[perturbation.ix])) - 1)* -1
#
df <- data.frame(y.pert,xdata.auc)
my.model <- glm( y.pert ~ ., data=df,family="binomial")

my.roc <- AUC::roc(predict(my.model, type = 'response'), labels = y)
my.auc <- AUC::auc(my.roc)
flog.info('Y original auc: %g', my.auc)
plot(my.roc)
title(sprintf('Y Original -- AUC = %g', my.auc))

my.roc.2 <- AUC::roc(predict(my.model, type = 'response'), labels = y.pert)
my.auc.2 <- AUC::auc(my.roc.2)
plot(my.roc.2)
title(sprintf('Y with Perturbation -- AUC = %g', my.auc.2))

Three-dot ellipsis

three.dot <- function(...) {
  arguments <- list(...)
  flog.info('aca %s', arguments$aca)
}
three.dot(aca = 12)
set.seed(666)
library(futile.logger)

my.coef <- c(1,2,3)
obs <- 1000
xdata <- sapply(my.coef[-1], function(e){rnorm(obs)})

colnames(xdata) <- paste0('X', seq(ncol(xdata)))

z = my.coef[1] + xdata %*% my.coef[-1]
pr = 1/(1+exp(-z))         # pass through an inv-logit function
y = rbinom(obs,1,pr)      # bernoulli response variable
y2 <- 1*(pr > 0.5)

df = data.frame(y=y,xdata)
flog.info('')
flog.info('')
flog.info('GLM Y', summary(glm( y~ .,data=df,family="binomial", control = glm.control(maxit = 1000))), capture = T)

df2 = data.frame(y=y2,xdata)
flog.info('')
flog.info('')
flog.info('GLM Y2', summary(glm( y~ .,data=df2,family="binomial", control = glm.control(maxit = 1000))), capture = T)

library(glmnet)

lambda <- cv.glmnet(xdata, y, family = 'binomial', alpha = 1, mc.cores = 16)$lambda.min
my.glmnet <- glmnet(xdata, y, family = 'binomial', lambda = find.lambda(lambda), alpha = 1)

lambda.2 <- cv.glmnet(xdata, y2, family = 'binomial', alpha = 1, mc.cores = 16)$lambda.min
my.glmnet.2 <- glmnet(xdata, y2, family = 'binomial', lambda = find.lambda(lambda.2), alpha = 1)

flog.info('')
flog.info('')
flog.info('LASSO Y', coef(my.glmnet, s = lambda), capture = T)
flog.info('')
flog.info('')
flog.info('LASSO Y2', coef(my.glmnet.2, s = lambda.2), capture = T)

library(logistf)
flog.info('')
flog.info('')
flog.info('Logistics Firth Y', logistf(y ~ ., data = df), capture = T)
flog.info('')
flog.info('')
flog.info('Logistics Firth Y2', logistf(y ~ ., data = df2), capture = T)
glm.bad  <- glm( y~ .,data=df2,family="binomial", control = glm.control(maxit = 1000))
glm.good <- glm( y~ .,data=df,family="binomial", control = glm.control(maxit = 1000))

#head(cbind(Qr$qr, aa$qr$qr))
p <- ncol(xdata) + 1

#Qr <- glm.good$qr
#Qr <- glm.bad$qr

wald.test <- function(coef, qr, p) {
  p1 <- 1L:p
  coef.p <- coef
  covmat.unscaled <- chol2inv(qr[p1, p1, drop = FALSE])
  dimnames(covmat.unscaled) <- list(names(coef.p), names(coef.p))
  covmat <- 1 * covmat.unscaled
  var.cf <- diag(covmat)
  s.err <- sqrt(var.cf)
  tvalue <- coef.p/s.err
  dn <- c("Estimate", "Std. Error")
  pvalue <- 2 * pnorm(-abs(tvalue))
  coef.table <- cbind(coef.p, s.err, tvalue, pvalue)
  dimnames(coef.table) <- list(names(coef.p), c(dn, 
    "z value", "Pr(>|z|)"))
  print(coef.table)
}
print('')
flog.info('')
flog.info('')
flog.info('Good')
flog.info('')
wald.test(glm.good$coefficients, glm.good$qr$qr, p)
flog.info('')
flog.info('')
flog.info('Bad')
flog.info('')
wald.test(glm.bad$coefficients, glm.bad$qr$qr, p)

calc.qr <- function(xdata, coef) {
  qr(t(t(cbind(array(1, nrow(xdata)), xdata)) * coef))
}

flog.info('')
flog.info('')
flog.info('Good Own QR')
flog.info('')
Qr <- calc.qr(xdata, glm.good$coefficients)
wald.test(glm.good$coefficients, Qr$qr, p)
flog.info('')
flog.info('')
flog.info('Bad Own QR')
flog.info('')
Qr <- calc.qr(xdata, glm.bad$coefficients)
wald.test(glm.bad$coefficients, Qr$qr, p)
# covmat.unscaled
# cov(t(t(cbind(array(1, nrow(xdata)), xdata)) * glm.good$coefficients))
# chol2inv(Qr$qr)

