knitr::opts_chunk$set(echo = TRUE)
## Libraries required # output nothing suppressPackageStartupMessages({ library(devtools) source(file.path('..', 'R-aux', 'dataset_brca_tnbc.R')) #library(ransac) devtools::load_all('../') library(brca.data) library(futile.logger) library(glmnet) library(verissimo) library(dplyr) library(ggbeeswarm) library(ggplot2) library(reshape2) })
# Setup logging to write to file `logger.txt` and using DEBUG level #flog.layout(layout.simple) layout <- layout.format('[~l] ~m') flog.layout(layout) flog.appender(appender.tee('logger.txt')) flog.threshold(DEBUG)
#if (exists('my.params')) { # params <- my.params #} mc.cores <- params$mc.cores my.seed <- params$my.seed sample.min <- params$sample.min nreps <- params$nreps feature.selection <- params$feature.selection threshold.to.keep <- params$threshold.to.keep target.alpha.vector <- params$target.alpha.vector nlambda <- params$nlambda lambda.min.ratio <- params$lambda.min.ratio data.origin <- params$data.origin train.perc <- params$train.perc k <- params$k my.family <- params$my.family n <- params$n threshold <- params$threshold good.fit.perct <- params$good.fit.perct alpha.baseline <- params$alpha.baseline # generated parameters family.fun <- ransac.family(my.family)
{ flog.info('Parameters:') flog.info('') flog.info('---- General --------------------') flog.info(' Family: %s', my.family) flog.info(' initial seed for reprod.: %d', my.seed) flog.info(' # of cores: %d', mc.cores) flog.info('----------------------------------') if (feature.selection) { flog.info('') flog.info('---- Consesus variables ----------') flog.info(' sample at each run: %g', sample.min) flog.info(' number of runs: %d', nreps) flog.info(' min. of repetitions of variables') flog.info(' to keep as consensus: %g', threshold.to.keep) flog.info(' alphas to use in glmnet: %s', paste(target.alpha.vector, collapse = ', ')) flog.info(' # of lambdas to test: %d', nlambda) flog.info(' lambda.min.ratio: %g', lambda.min.ratio) flog.info('----------------------------------') } flog.info('') flog.info('---- RANSAC ----------------------') flog.info(' Data origin: %s', data.origin) flog.info(' Train set %%: %g', train.perc) flog.info(' k iterations: %d', k) flog.info(' minimum number of cases: %d', n) flog.info(' threshold to keep (squared): %g', threshold) flog.info(' %% of inliers to consider model: %g', good.fit.perct) flog.info(' alpha to be used in RANSAC: %g', alpha.baseline) }
This is controlled by data.origin
variable, that decides which type of data should be used.
All data and steps are saved in a cache to speed up the analysis.
outlier.description <-function(ydata, include.perturb = F) { outliers <- 1 * (abs(ydata$real_class - 1*(ydata$prob > 0.5)) == 1) if (include.perturb) { outliers <- outliers + 2 * (ydata$real_class != ydata$logit_class) } plyr::revalue(factor(outliers), c('1' = 'Natural', '0' = '', '2' = 'Perturbed', '3' = 'Natural and Perturbed')) }
# build cache file name cache.file <- file.path('cache', sprintf('ransac.data_%s.RData', data.origin)) # set.seed(my.seed) # if (file.exists(cache.file)) { flog.info('Loading cache with data: %s', cache.file) load(cache.file) } else { flog.info('Loading from \'%s\'', data.origin) if (data.origin == 'tnbc'){ my.data <- dataset.brca.tnbc() new.coef <- array(0, ncol(my.data$xdata + 1)) } else if (data.origin == 'synth0a') { # New coefficient for model new.coef <- c(0,3,2,1.5) my.data <- gen.synth(obs = 100, new.coef = new.coef, perturbation.perct = 0.1) } else if (data.origin == 'synth0b') { # New coefficient for model new.coef <- c(0,3,2,1.5) my.data <- gen.synth(obs = 1000, new.coef = new.coef, perturbation.perct = 0.1) } else if (data.origin == 'synth1a') { # New coefficient for model new.coef <- c(0,3,2,1.5,0,0,0,0,0) my.data <- gen.synth(obs = 100, new.coef = new.coef, perturbation.perct = 0.1) } else if (data.origin == 'synth1b') { # New coefficient for model new.coef <- c(0,3,2,1.5,0,0,0,0,0) my.data <- gen.synth(obs = 1000, new.coef = new.coef, perturbation.perct = 0.1) } else if (data.origin == 'synth2a') { # New coefficient for model nvars <- 10000 sparse.vars <- sample(1:nvars, nvars - 100) new.coef <- c(10 * runif(nvars)) new.coef[sparse.vars] <- my.data <- gen.synth(obs = 100, new.coef = new.coef, perturbation.perct = 0.1) } else if (data.origin == 'synth2b') { # New coefficient for model nvars <- 10000 new.coef <- c(0,10 * runif(nvars)) my.data <- gen.synth(obs = 1000, new.coef = new.coef, perturbation.perct = 0.1) } else { stop(sprintf('Dataset not found: %s', data.origin)) } # get xdata and ydata xdata <- my.data$xdata ydata <- my.data$ydata # set variables dataset.name <- data.origin dataset.coef <- new.coef # save to cache file save(xdata, ydata, dataset.name, dataset.coef, file = cache.file) }
glmnet
models)# Add more such as degree penalty penalty.list <- list() default.penalty.factor <- array(1, ncol(xdata)) names(default.penalty.factor) <- colnames(xdata) penalty.list$glmnet = default.penalty.factor
To disable feature selection feature.selection = FALSE
.
Otherwise it will:
nreps
times GLMNET with different target.alpha.vector
valuesthreshold.to.keep
(in percentage)The parameters are described in a section above
# build all possible combinations pair.comb <- sapply(seq_along(target.alpha.vector), function(ix) { lapply(names(penalty.list), function(ix.name) { list(alpha = target.alpha.vector[ix], penalty = penalty.list[[ix.name]], penalty.name = ix.name) }) })
# # This finds the best lambdas for each iteration # if (feature.selection) { # Does not need paralleziation, as cv.glmnet does it itself cv.result.list <- lapply(seq_along(pair.comb), function(ix) { target.alpha <- pair.comb[[ix]]$alpha target.penalty <- pair.comb[[ix]]$penalty target.penalty.name <- pair.comb[[ix]]$penalty.name # cache.file <- file.path('cache', sprintf('cv.glmnet-%s.%.3f.%s.%d.%d.%g.RData', data.origin, target.alpha, pair.comb[[ix]]$penalty.name, my.seed, nlambda, lambda.min.ratio)) if (file.exists(cache.file)) { flog.info('Loading cache from: %s', cache.file) load(cache.file) } else { flog.info('Running CV.GLMNET to calculate appropriate lambda value') # Get predictable fold id set.seed(my.seed) # cv.folds <- balanced.cv.folds(which(ydata$logit_class == 0), which(ydata$logit_class == 1)) foldid <- array(0, length(ydata)) foldid[ydata$logit_class == 0] <- cv.folds$output[[1]] foldid[ydata$logit_class == 1] <- cv.folds$output[[2]] # cv.result <- cv.glmnet(xdata, ydata$logit_class, family = 'binomial', alpha = target.alpha, penalty.factor = target.penalty, nlambda = nlambda, lambda.min.ratio = lambda.min.ratio, foldid = foldid, # mc.cores = mc.cores) # save(foldid, cv.result, nlambda, target.alpha, my.seed, target.penalty.name, target.penalty, file = cache.file) } flog.info(' Parameters:') flog.info(' alpha: %f', target.alpha) flog.info(' penalty.name: %s', target.penalty.name) flog.info(' Best Lambda: %g', cv.result$lambda.min) flog.info('') # return(list(glmnet = cv.result, alpha = target.alpha, penalty.factor = target.penalty, penalty.name = target.penalty.name)) }) }
# # Runs nreps times to find the consensus variables # if (feature.selection) { # flog.info('Running multiple GLMNET to determine consensus variables') multiple.runs <- lapply(seq_along(cv.result.list), function(ix) { cv.result <- cv.result.list[[ix]]$glmnet target.alpha <- cv.result.list[[ix]]$alpha target.lambda <- cv.result.list[[ix]]$glmnet$lambda.min # target.penalty.factor <- cv.result.list[[ix]]$penalty.factor target.penalty.name <- cv.result.list[[ix]]$penalty.name # reproducibility set.seed(my.seed) # cache.file <- file.path('cache', sprintf('repeated.variables-%s.%.3f.%.3f.%d.%d.RData', data.origin, target.alpha, sample.min, nreps, my.seed)) # if (file.exists(cache.file)) { flog.info('') flog.info(' Loading cache from: %s', cache.file) load(cache.file) } else { glmnet.result <- mclapply(1:nreps, function(ix.2) { # get a sample of the data balanced.data <- balanced.train.and.test(which(ydata$logit_class == 1), which(ydata$logit_class == 0), train.perc = sample.min, join.all = T) flog.info(' Run %d / %d', ix.2, nreps) # calculate logistics regression on the train dataset fit <- suppressWarnings(glmnet(xdata[balanced.data$train,], ydata$logit_class[balanced.data$train], family = 'binomial', alpha = target.alpha, penalty.factor = target.penalty.factor, # do it just for our target lambda # and 1. If it does not converge add # lambda values in between. lambda = find.lambda(target.lambda))) # get the coeficients of target.lambda my.coef <- coef(fit, s = target.lambda) # get the names of the non-zero coefficients my.coef.names <- names(my.coef[-1,])[which(as.vector(my.coef[-1,]) != 0)] flog.info(' Number of non-zero variables: %d', length(my.coef.names)) # return(my.coef.names) }, mc.cores = mc.cores) save(target.alpha, glmnet.result, target.lambda, target.penalty.factor, target.penalty.name, sample.min, nreps, file = cache.file) } flog.info(' Calculating for:') flog.info(' alpha: %f', target.alpha) flog.info(' sample.min: %f', sample.min) flog.info(' nreps: %d', nreps) return(list(glmnet = glmnet.result, penalty.name = target.penalty.name, alpha = target.alpha)) }) }
# # Counts the variables # if (feature.selection) { result.df <- data.frame() for( ix in seq_along(multiple.runs)) { res <- multiple.runs[[ix]]$glmnet # initialize the vector of counts row.ix <- unique(unlist(res)) count.vars <- array(0, length(row.ix)) names(count.vars) <- row.ix # add 1 for every non-zero coefficient for(ix.res in seq_along(res)) { count.vars[res[[ix.res]]] <- count.vars[res[[ix.res]]] + 1 } result.df <- rbind(result.df, data.frame(id = row.ix, value = count.vars, penalty = array(multiple.runs[[ix]]$penalty.name, length(row.ix)), alpha = array(multiple.runs[[ix]]$alpha, length(row.ix)))) } result.df$type <- sprintf('%s - %2.2f', result.df$penalty, result.df$alpha) }
if (!feature.selection) { new.vars <- seq(ncol(xdata)) flog.info('No feature selection was performed as parameter \'feature.selection\' = FALSE') } else { new.vars <- unique(subset(dplyr::arrange(result.df, desc(value)), value >= nreps * threshold.to.keep)$id) # ggplot(data = subset(result.df, value >= nreps * threshold.to.keep)) + geom_freqpoly(aes(value, color = 1), binwidth = 25) + facet_wrap(~type) + scale_colour_continuous(guide = FALSE) + # scale_y_continuous(trans = 'log10') + theme_minimal() + ylab('Count (log10 scale)') + xlab(sprintf('Number of times variable appears in models (%d runs)\n Shows only variables that appear more than %g%% of the runs', nreps, threshold.to.keep * 100)) + ggtitle(sprintf('Total number of variables selected: %d (out of %d)', length(new.vars), ncol(xdata))) }
cache.file <- file.path('cache', sprintf('ransac.training.test.data-%s.%.3f.RData', data.origin, train.perc)) if (file.exists(cache.file)) { flog.info('Loading cache file: %s', cache.file) load(cache.file) } else { flog.info('Determing train and test datasets') set.seed(my.seed) # obtain balanced training and test partitions sets.ix <- balanced.train.and.test(which(ydata$logit_class == 1), which(ydata$logit_class == 0), train.perc = train.perc, join.all = T) train.ix <- sets.ix$train test.ix <- sets.ix$test # xdata.train <- xdata[train.ix, new.vars] ydata.train <- as.data.frame(ydata[train.ix,]) colnames(ydata.train) <- colnames(ydata) # xdata.test <- xdata[test.ix, new.vars] ydata.test <- as.data.frame(ydata[test.ix,]) colnames(ydata.test) <- colnames(ydata) save(xdata.train, ydata.train, xdata.test, ydata.test, file = cache.file) } # Set of expression to be seen all at once { flog.info('Description of Train partition:') flog.info(' Cases: %d (Class 0) + %d (Class 1) = %d', table(ydata.train$logit_class)[1], nrow(ydata.train) - table(ydata.train$logit_class)[1], nrow(ydata.train)) flog.info('') flog.info('Description of Test partition:') flog.info(' Cases: %d (Class 0) + %d (Class 1) = %d', table(ydata.test$logit_class)[1], nrow(ydata.test) - table(ydata.test$logit_class)[1], nrow(ydata.test)) }
# # # penalty.factor <- penalty.list$glmnet[new.vars] # force.skip.cache <- F
# # Find best lambdas if that is necessary # { set.seed(my.seed) # cv.folds <- balanced.cv.folds(which(ydata.train$logit_class == 0), which(ydata.train$logit_class == 1)) foldid <- array(0, nrow(ydata.train)) foldid[ydata.train$logit_class == 0] <- cv.folds$output[[1]] foldid[ydata.train$logit_class == 1] <- cv.folds$output[[2]] # flog.info('Running GLMNET with cross-validation to find lambda parameter') baseline <- cv.glmnet(xdata.train, ydata.train$logit_class, family = 'binomial', alpha = alpha.baseline, nlambda = 100, lambda.min.ratio = lambda.min.ratio, # nfolds = 10, foldid = foldid, penalty.factor = penalty.factor, mc.cores = 1) lambda.baseline <- baseline$lambda.min # flog.info(' Found lambda = %g to use in RANSAC if necessary', lambda.baseline) }
# # glm(logit_class~., data.frame(xdata, logit_class = ydata$logit_class), family = binomial(link = "logit"), control = glm.control(maxit = 1000)) # baseline.model.pert <- family.fun$fit.model(xdata.train, ydata.train$logit_class, lambda.baseline, penalty.factor = penalty.factor, alpha = alpha.baseline) baseline.model.pert.glm <- ransac.binomial.glm()$fit.model(xdata.train, ydata.train$logit_class, lambda.baseline, penalty.factor = penalty.factor, alpha = alpha.baseline) # baseline.model.real <- family.fun$fit.model(xdata.train, ydata.train$real_class, lambda.baseline, penalty.factor = penalty.factor, alpha = alpha.baseline)
P(XB)
from data generation model differs by more than 0.5
in classmiscl.pert <- ydata$real_class - round(family.fun$predict(baseline.model.pert, newx = xdata.train, lambda = lambda.baseline)) != 0 miscl.real <- ydata$real_class - round(family.fun$predict(baseline.model.real, newx = xdata.train, lambda = lambda.baseline)) != 0 # { if (sum(dataset.coef) != 0) { flog.info('Coefficients:', data.frame(Original = dataset.coef, baseline.real = as.vector(family.fun$coef(baseline.model.real, lambda = lambda.baseline)), baseline.pert = as.vector(family.fun$coef(baseline.model.pert, lambda = lambda.baseline))), capture = T) } flog.info('') flog.info('Calculted baseline model with original data (%s)', family.fun$model.name) flog.info(' Misclassifications: %d (# obs: %d)', sum(miscl.real), nrow(xdata.train)) if (sum(miscl.real) != 0) { flog.info(' Which ones? %s', paste(rownames(xdata.train)[miscl.real], collapse = ', ')) } else { flog.info(' Perfect classification!')} flog.info('') flog.info('Calculted baseline model with perturbed data (%s)', family.fun$model.name) flog.info(' Misclassifications: %d (# obs: %d)',sum(miscl.pert), nrow(xdata.train)) if (sum(miscl.pert) != 0) { flog.info(' Which ones? %s', paste(rownames(xdata.train)[miscl.pert], collapse = ', ')) } else { flog.info(' Perfect classification!')} flog.info('') flog.info('----- Misclassification tables ---------------') flog.info('\n') if (sum(miscl.real) != 0) { my.miscl <- miscl.real natural.outlier <- factor(ydata.train$real_class[which(my.miscl)] != 1*(ydata.train$prob[which(my.miscl)] > 0.5)) natural.outlier <- plyr::revalue(natural.outlier, c("TRUE"="Natural", "FALSE"="")) prediction <- family.fun$predict(baseline.model.real, newx = xdata.train[which(my.miscl),], lambda = lambda.baseline) # print(data.frame( natural.outlier = natural.outlier, gene.class = ydata.train$real_class[which(my.miscl)], gene.prob = ydata.train$prob[which(my.miscl)], model.real = prediction)) } flog.info('\n') if (sum(miscl.pert) != 0) { if (nrow(ydata) <= 100) { my.miscl <- miscl.pert | ydata$logit_class != ydata$real_class } else { my.miscl <- miscl.pert } # natural.outlier <- factor(ydata.train$real_class[which(my.miscl)] != 1*(ydata.train$prob[which(my.miscl)] > 0.5)) natural.outlier <- plyr::revalue(natural.outlier, c("TRUE"="Natural", "FALSE"="")) # prediction <- family.fun$predict(baseline.model.pert, newx = xdata.train[which(my.miscl),], lambda = lambda.baseline) # outlier <- factor(ydata.train$real_class[which(my.miscl)] != 1*(prediction > 0.5)) outlier <- plyr::revalue(outlier, c("TRUE"="Misclass.", "FALSE"="")) # perturbed <- factor((ydata$logit_class != ydata$real_class)[my.miscl]) perturbed <- plyr::revalue(perturbed, c("TRUE"="X", "FALSE"="")) # print(data.frame( perturbed = perturbed, natural.outlier = natural.outlier, gen.class = ydata.train$real_class[which(my.miscl)], gen.prob = ydata.train$prob[which(my.miscl)], model.pert = prediction, outlier = outlier)) } }
ydata[ydata$logit_class != ydata$real_class,]
ydata[ydata$real_class != 1*(ydata$prob > 0.5),]
result.ransac.real <- ransac::ransac(xdata = xdata.train, ydata = ydata.train$real_class, k = k, n = n, family = my.family, # threshold = threshold,good.fit.perct = good.fit.perct, # mc.cores = mc.cores, lambda = lambda.baseline, alpha = alpha.baseline, penalty.factor = penalty.factor)
cache.file <- file.path('cache', sprintf('ransac-real-%s.%d.%d.%d.%.4f.%s.%d.%.4f.%.4f.RData', data.origin, nrow(xdata.train), ncol(xdata.train), k, lambda.baseline, my.family, n, threshold, good.fit.perct)) if (!force.skip.cache && file.exists(cache.file)) { flog.info('Loading cache with ransac results: %s', cache.file) load(cache.file) } else { flog.info('Running RANSAC') set.seed(my.seed) result.ransac.real <- ransac::ransac(xdata = xdata.train, ydata = ydata.train$real_class, k = k, n = n, family = my.family, # threshold = threshold, good.fit.perct = good.fit.perct, # mc.cores = mc.cores, lambda = lambda.baseline, alpha = alpha.baseline, penalty.factor = penalty.factor) save(k, my.family, n, threshold, good.fit.perct, result.ransac.real, file = cache.file) }
result.ransac.real <- ransac::ransac(xdata = xdata.train, ydata = ydata.train$logit_class, k = k, n = n, family = my.family, # threshold = threshold,good.fit.perct = good.fit.perct, # mc.cores = mc.cores, lambda = lambda.baseline, alpha = alpha.baseline, penalty.factor = penalty.factor)
cache.file <- file.path('cache', sprintf('ransac-pert-%s.%d.%d.%d.%.4f.%s.%d.%.4f.%.4f.RData', data.origin, nrow(xdata.train), ncol(xdata.train), k, lambda.baseline, my.family, n, threshold, good.fit.perct)) if (!force.skip.cache && file.exists(cache.file)) { flog.info('Loading cache with ransac results: %s', cache.file) load(cache.file) } else { flog.info('Running RANSAC') set.seed(my.seed) result.ransac.pert <- ransac::ransac(xdata = xdata.train, ydata = ydata.train$logit_class, k = k, n = n, family = my.family, # threshold = threshold, good.fit.perct = good.fit.perct, # mc.cores = mc.cores, lambda = lambda.baseline, alpha = alpha.baseline, penalty.factor = penalty.factor) save(k, my.family, n, threshold, good.fit.perct, result.ransac.pert, file = cache.file) }
Distance to class
# if (train.perc == 1) { flog.info('No test set being calculated as no test set is being used (train.perc = %g)', train.perc) } else { info.test <- plot(result.ransac.pert, xdata = xdata.test, ydata = ydata.test$real_class, baseline = baseline.model.real, family = my.family, name = paste0(dataset.name, ' (distance to class)'), lambda = lambda.baseline, alpha = alpha, only_consensus = F) }
Distance to class
# # if (train.perc == 1) { flog.info('No test set being calculated as no test set is being used (train.perc = %g)', train.perc) } else { info.train <- plot(result.ransac.pert, xdata = xdata.train, ydata = ydata.train$real_class, baseline = baseline.model.real, family = my.family, name = paste0(dataset.name, ' (distance to class)'), lambda = lambda.baseline, alpha = alpha.baseline, only_consensus = F) }
# info.all <- plot(result.ransac.real, xdata = xdata[, new.vars], ydata = ydata$real_class, family = my.family, baseline = baseline.model.real, name = paste0('Original ', dataset.name, ' (Distance to Real Class)'), lambda = lambda.baseline, alpha = alpha.baseline, only_consensus = F, outliers = outlier.description(ydata))
info.all.pert <- plot(result.ransac.pert, xdata = xdata[, new.vars], ydata = ydata$real_class, family = my.family, baseline = baseline.model.pert, name = paste0('Perturbed ', dataset.name, ' (Distance to Real Class)'), lambda = lambda.baseline, alpha = alpha.baseline, only_consensus = F, outliers = outlier.description(ydata, include.perturb = T))
cache.file <- file.path('cache', 'overall.RData') if (file.exists(cache.file)) { load(cache.file) } else { overall.df <- data.frame() } new.line <- data.frame(my.seed, data.origin, my.family, k, n, threshold, good.fit.perct) colnames(new.line) <- c('seed', 'data', 'family', 'k', 'n', 'threshold', 'good.fit.perct') for (ix in names(info.all$misclassifications)) { false.neg <- sum(info.all$misclassifications[[ix]]$false.neg) false.pos <- sum(info.all$misclassifications[[ix]]$false.pos) prev.names <- colnames(new.line) new.line <- cbind(new.line, false.neg + false.pos, false.neg, false.pos) colnames(new.line) <- c(prev.names, sprintf('%s outliers', info.all$description[[ix]]), sprintf('%s false.neg', info.all$description[[ix]]), sprintf('%s false.pos', info.all$description[[ix]])) } if (nrow(overall.df) > 0 && ncol(overall.df) != ncol(new.line)) { # skip flog.info('Can\'t add to overall.df, as no models were obtained') individual.line <- overall.df[1,] individual.line[,colnames(new.line)] <- new.line individual.line[,!colnames(overall.df) %in% colnames(new.line)] <- NA new.line <- individual.line } overall.df <- rbind(overall.df, new.line) overall.df <- distinct(overall.df) # save(overall.df, file = cache.file)
cache.file <- file.path('cache', 'overall.pert.RData') if (file.exists(cache.file)) { load(cache.file) } else { overall.df <- data.frame() } new.line <- data.frame(my.seed, data.origin, my.family, k, n, threshold, good.fit.perct) colnames(new.line) <- c('seed', 'data', 'family', 'k', 'n', 'threshold', 'good.fit.perct') for (ix in names(info.all$misclassifications)) { false.neg <- sum(info.all.pert$misclassifications[[ix]]$false.neg) false.pos <- sum(info.all.pert$misclassifications[[ix]]$false.pos) prev.names <- colnames(new.line) new.line <- cbind(new.line, false.neg + false.pos, false.neg, false.pos) colnames(new.line) <- c(prev.names, sprintf('%s outliers', info.all.pert$description[[ix]]), sprintf('%s false.neg', info.all.pert$description[[ix]]), sprintf('%s false.pos', info.all.pert$description[[ix]])) } if (nrow(overall.df) > 0 && ncol(overall.df) != ncol(new.line)) { # skip flog.info('Can\'t add to overall.df, as no models were obtained') individual.line <- overall.df[1,] individual.line[,colnames(new.line)] <- new.line individual.line[,!colnames(overall.df) %in% colnames(new.line)] <- NA new.line <- individual.line } overall.df <- rbind(overall.df, new.line) overall.df <- distinct(overall.df) # save(overall.df, file = cache.file)
lambda.v <- find.lambda(lambda.baseline) perfect.model <- glmnet(xdata[, new.vars], ydata$logit_class, alpha = alpha.baseline, family = 'binomial', lambda = lambda.v, penalty.factor = NULL) summary(as.vector(coef(perfect.model, s = lambda.baseline))) summary(predict(perfect.model, newx = xdata[, new.vars], type = 'response', s = lambda.baseline) - ydata$logit_class) # new.vars.perfect <- rownames(coef(perfect.model, s = lambda.baseline) != 0)[-1] my.ridge <- glmnet(xdata[, new.vars.perfect], ydata$logit_class, alpha = 0, family = 'binomial', #lambda = lambda.v) nlambda = 100, lambda.min.ratio = 1e-10) len <- 100 len <- length(lambda.v) ydata.rep <- as.data.frame(matrix(rep(ydata$logit_class, len), ncol = len, nrow = nrow(ydata))) ridge.predict <- predict(my.ridge, newx = xdata[, new.vars.perfect], type = 'response') - ydata.rep apply(ridge.predict, 2, function(e){ max(abs(e))})
len <- 1500 set.seed(my.seed) nvars <- 100 xdata.auc <- sapply(seq(nvars), function(e, len) { rnorm(len)}, len) coef.auc <- seq(nvars) * runif(nvars) # z <- 1 + xdata.auc %*% coef.auc # linear combination with a bias pr <- 1/(1+exp(-z)) # pass through an inv-logit function y <- factor(rbinom(len,1,pr)) # bernoulli response variable y.pert <- y perturbation.ix <- sample(seq_along(y), round(len * .1)) y.pert[perturbation.ix] <- (as.numeric(as.vector(y[perturbation.ix])) - 1)* -1 # df <- data.frame(y.pert,xdata.auc) my.model <- glm( y.pert ~ ., data=df,family="binomial") my.roc <- AUC::roc(predict(my.model, type = 'response'), labels = y) my.auc <- AUC::auc(my.roc) flog.info('Y original auc: %g', my.auc) plot(my.roc) title(sprintf('Y Original -- AUC = %g', my.auc)) my.roc.2 <- AUC::roc(predict(my.model, type = 'response'), labels = y.pert) my.auc.2 <- AUC::auc(my.roc.2) plot(my.roc.2) title(sprintf('Y with Perturbation -- AUC = %g', my.auc.2))
three.dot <- function(...) { arguments <- list(...) flog.info('aca %s', arguments$aca) } three.dot(aca = 12)
set.seed(666) library(futile.logger) my.coef <- c(1,2,3) obs <- 1000 xdata <- sapply(my.coef[-1], function(e){rnorm(obs)}) colnames(xdata) <- paste0('X', seq(ncol(xdata))) z = my.coef[1] + xdata %*% my.coef[-1] pr = 1/(1+exp(-z)) # pass through an inv-logit function y = rbinom(obs,1,pr) # bernoulli response variable y2 <- 1*(pr > 0.5) df = data.frame(y=y,xdata) flog.info('') flog.info('') flog.info('GLM Y', summary(glm( y~ .,data=df,family="binomial", control = glm.control(maxit = 1000))), capture = T) df2 = data.frame(y=y2,xdata) flog.info('') flog.info('') flog.info('GLM Y2', summary(glm( y~ .,data=df2,family="binomial", control = glm.control(maxit = 1000))), capture = T) library(glmnet) lambda <- cv.glmnet(xdata, y, family = 'binomial', alpha = 1, mc.cores = 16)$lambda.min my.glmnet <- glmnet(xdata, y, family = 'binomial', lambda = find.lambda(lambda), alpha = 1) lambda.2 <- cv.glmnet(xdata, y2, family = 'binomial', alpha = 1, mc.cores = 16)$lambda.min my.glmnet.2 <- glmnet(xdata, y2, family = 'binomial', lambda = find.lambda(lambda), alpha = 1) flog.info('') flog.info('') flog.info('RIDGE Y', coef(my.glmnet, s = lambda), capture = T) flog.info('') flog.info('') flog.info('RIDGE Y2', coef(my.glmnet.2, s = lambda), capture = T) library(logistf) flog.info('') flog.info('') flog.info('Logistics Firth Y', logistf(y ~ ., data = df), capture = T) flog.info('') flog.info('') flog.info('Logistics Firth Y2', logistf(y ~ ., data = df2), capture = T)
glm.bad <- glm( y~ .,data=df2,family="binomial", control = glm.control(maxit = 1000)) glm.good <- glm( y~ .,data=df,family="binomial", control = glm.control(maxit = 1000)) #head(cbind(Qr$qr, aa$qr$qr)) p <- ncol(xdata) + 1 #Qr <- glm.good$qr #Qr <- glm.bad$qr wald.test <- function(coef, qr, p) { p1 <- 1L:p coef.p <- coef covmat.unscaled <- chol2inv(qr) dimnames(covmat.unscaled) <- list(names(coef.p), names(coef.p)) covmat <- 1 * covmat.unscaled var.cf <- diag(covmat) s.err <- sqrt(var.cf) tvalue <- coef.p/s.err dn <- c("Estimate", "Std. Error") pvalue <- 2 * pnorm(-abs(tvalue)) coef.table <- cbind(coef.p, s.err, tvalue, pvalue) dimnames(coef.table) <- list(names(coef.p), c(dn, "z value", "Pr(>|z|)")) print(coef.table) } print('') flog.info('') flog.info('') flog.info('Good') flog.info('') wald.test(glm.good$coefficients, glm.good$qr$qr, p) flog.info('') flog.info('') flog.info('Bad') flog.info('') wald.test(glm.bad$coefficients, glm.bad$qr$qr, p) calc.qr <- function(xdata, coef) { qr(t(t(cbind(array(1, nrow(xdata)), xdata)) * coef)) } flog.info('') flog.info('') flog.info('Good Own QR') flog.info('') Qr <- calc.qr(xdata, glm.good$coefficients) wald.test(glm.good$coefficients, Qr$qr, p) flog.info('') flog.info('') flog.info('Bad Own QR') flog.info('') Qr <- calc.qr(xdata, glm.bad$coefficients) wald.test(glm.bad$coefficients, Qr$qr, p) # covmat.unscaled # cov(t(t(cbind(array(1, nrow(xdata)), xdata)) * glm.good$coefficients)) # chol2inv(Qr$qr)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.