analyses/simulate.R

library(MLlibrary)
library(ggplot2)
library(dplyr)
library(RColorBrewer)

#library(doMC)
#registerDoMC(cores=3)
K = 5



gaussian <- function(mean, sd) {
  function(response) {response + sd(response)*rnorm(length(response), mean=mean, sd=sd)}
}

gaussian_x <- function(mean, sd, random=F) {
  if (random) {
    function(response) {apply(response,2,gaussian(mean,sd=runif(1)*sd))}
  }
  else {
    function(response) {apply(response,2,gaussian(mean,sd))}
  }
}

add_noise <-function(d, noise_y=function(response) {response}, noise_x=function(response) {response}, seed=1) {
  set.seed(seed)
  d$y <- noise_y(d$y)
  d$X <- noise_x(d$X)
  d
}

linear <- function(nvar=10, nrow=100, seed=1) {
  set.seed(seed)
  variables <- as.character(seq_len(nvar))
  coefficients <- seq_along(variables)
  X <- matrix(rnorm(nvar * nrow), nrow=nrow, ncol=nvar)
  y <- (X %*% coefficients) + 10
  list(y=y, X=X)
}

exponential <- function(nvar=10, nrow=100) {
  d <- linear_(nvar, nrow)
  d$y <- exp(d$y / 20)
  d
}

cuts <- function(nvar=10, nrow=100, nest = F, seed=1) {
  set.seed(seed)
  X <- matrix(rnorm(nvar * nrow), nrow=nrow, ncol=nvar)
  cuts <- rnorm(nvar)
  cuts <- matrix(cuts, nrow=nrow, ncol=nvar, byrow = TRUE)
  X_transformed <- X<cuts
  if (nest) {
    for (n in 1:20) {
      temp <- sample(1:nvar,2,replace=F)
      X_transformed[,temp[1]] <- X_transformed[,temp[1]]*X_transformed[,temp[2]]
    }
  }
  variables <- as.character(seq_len(nvar))
  coefficients <- seq_along(variables)
  y <- as.vector((X_transformed %*% coefficients) + 10)
  list(y=y, X=X)
}

cuts_and_linear <- function(nvar=10, nrow=100, alpha=0.5) {
  X <- matrix(rnorm(nvar * nrow), nrow=nrow, ncol=nvar)
  cuts <- rnorm(nvar)
  cuts <- matrix(cuts, nrow=nrow, ncol=nvar, byrow = TRUE)
  X_transformed <- X<cuts
  variables <- as.character(seq_len(nvar))
  coefficients <- seq_along(variables)
  y <- (alpha*as.vector((X_transformed %*% coefficients) + (1-alpha)*(X %*% coefficients))) + 10
  list(y=y, X=X)
}





run_all <- function(dataset,NAME) {
  X <- dataset$X
  X_ix <- model.matrix(y ~ . + .:.,  data.frame(dataset))
  y <- dataset$y
  ksplit <- kfold_split(K, y, X, seed=1)
  ksplit_nmm <- kfold_split(K, y, data.frame(X), seed=1)
  ksplit_ix <- kfold_split(K, y, X_ix, seed=1)
  dataset <- data.frame(y=y,X)
  run_simulation_models(NAME, dataset, "y", ksplit, ksplit_nmm, ksplit_ix)
}

plot_all <- function(NAME) {
  all_models <- load_models(NAME)
  plot_swf_(all_models)
  plot_reach_vs_waste_(all_models)
  plot_reach_vs_waste_(all_models, THRESHOLD=.4)
}

run_dataset <- function(dataset, NAME)  { 
  run_all(dataset, NAME)
  joined <- load_models(NAME)
  reach_df <- calculate_reach_(joined, fold=TRUE)
  reach_df$name <- NAME
  reach_df
}


plot_scores_OLD <- function(df, fname) {
  df$type[df$method %in% c("least_squares","stepwise","stepwise_15","ridge","lasso", "lasso_15","ridge")] <- "regularization"
  df$type[df$method %in% c("pca_knn","pca_knn_all","forest","btree","rtree")] <- "non-linear"
  df$type[df$method %in% c("logistic_40","logistic_lasso_40","ctree_40","cforest_40","cbtree_40","ctree_40","quantile")] <- "different loss"
  df<-df[!is.na(df$type),]
  ggplot(df, aes(y=reach, x=method, color=name)) + 
      geom_point(position=position_dodge(width=0.5)) +
      scale_colour_brewer(palette="RdYlBu") + 
      theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
      labs(y = "reach") +
      facet_wrap( ~ type, ncol=3)
  ggsave(fname, width=12, height=8)
}

plot_scores <- function(df, fname) {
    add_prefix <- function(df, names, prefix) {
    names_logical <- levels(df$method) %in% names
    if (!any(names_logical)) {
      return(df)
    }
    levels(df$method)[names_logical] <- sapply(
      levels(df$method)[names_logical],
      function(response) {paste(prefix, response, sep=" -- ")})
    df
  }
    
  lms <- c("least_squares")
  reg_names <- c("stepwise","stepwise_15","ridge","lasso", "lasso_15","ridge")
  inter_names <- c("ridge_ix","lasso_ix", "least_squares_ix")
  nonlin_names <- c("pca_knn","pca_knn_all","forest","btree","rtree") 
  class_names <- c("logistic_40","logistic_lasso_40","ctree_40","cforest_40","cbtree_40","ctree_40")

  df <- add_prefix(df, lms, "0. ------")  
  df <- add_prefix(df, reg_names, "1. Linear regularized")
  df <- add_prefix(df, inter_names, "2. Linear interactions")
  df <- add_prefix(df, nonlin_names, "3. Non-linear regression")
  df <- add_prefix(df, class_names, "4. Classification")
  
  df$method <- as.factor(as.character(df$method))
  
  ggplot(df, aes(y=reach, x=method, color=name)) + 
    geom_point(position=position_dodge(width=0.5)) +
    scale_colour_brewer(palette="RdYlBu") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(y = "reach") +
    geom_vline(xintercept = 1.5) +
    geom_vline(xintercept = 6.5) +
    geom_vline(xintercept = 9.5) +
    geom_vline(xintercept = 14.5) +
    geom_vline(xintercept = 19.5) +
    geom_hline(yintercept = 0.16)
  ggsave(fname, width=12, height=8)
}


##Linear y noise

NAME <- "linear"
dataset <- add_noise(linear(nvar=20, nrow=1000))
temp_df <- run_dataset(dataset, NAME)
reach_df <- temp_df
for (j in c(0.1,0.2,0.5,1,2,5)) {
  NAME <- paste('linear', j, sep='_')
  noise_y <- gaussian(0,j)
  noise_x <- gaussian_x(0,0)
  dataset <- add_noise(linear(nvar=20, nrow=1000), noise_y=noise_y, noise_x=noise_x)
  temp_df <- run_dataset(dataset, NAME)
  reach_df <- rbind(reach_df,temp_df)    
} 
reach_df_avg <- group_by(reach_df, method, name) %>% summarize(reach=mean(reach))
plot_scores(reach_df_avg, 'results/linear_y_scores_avg.pdf')

##Linear x noise

NAME <- "linear"
dataset <- add_noise(linear(nvar=20, nrow=1000))
temp_df <- run_dataset(dataset, NAME)
reach_df <- temp_df
for (i in c(0.1,0.2,0.5,1,2,5)) {
  NAME <- paste('linear', i, sep='_')
  noise_y <- gaussian(0,0)
  noise_x <- gaussian_x(0,i,random=T)
  dataset <- add_noise(linear(nvar=20, nrow=1000), noise_y=noise_y, noise_x=noise_x)
  temp_df <- run_dataset(dataset, NAME)
  reach_df <- rbind(reach_df,temp_df)    
} 
reach_df_avg <- group_by(reach_df, method, name) %>% summarize(reach=mean(reach))
plot_scores(reach_df_avg, 'results/linear_x_scores_avg.pdf')


##Cuts y noise

NAME <- "cuts"
dataset <- add_noise(cuts(nvar=20, nrow=1000, nest=T))
temp_df <- run_dataset(dataset, NAME)
reach_df <- temp_df
for (j in c(0.1,0.2,0.5,1,2,5)) {
  NAME <- paste('cuts', j, sep='_')
  noise_y <- gaussian(0,j)
  noise_x <- gaussian_x(0,0)
  dataset <- add_noise(cuts(nvar=20, nrow=1000), noise_y=noise_y, noise_x=noise_x)
  temp_df <- run_dataset(dataset, NAME)
  reach_df <- rbind(reach_df,temp_df)    
} 
reach_df_avg <- group_by(reach_df, method, name) %>% summarize(reach=mean(reach))
plot_scores(reach_df_avg, 'results/cuts_y_scores_avg.pdf')

##Cuts x noise

NAME <- "cuts"
dataset <- add_noise(cuts(nvar=20, nrow=1000, nest=T))
temp_df <- run_dataset(dataset, NAME)
reach_df <- temp_df
for (i in c(0.1,0.2,0.5,1,2,5)) {
  NAME <- paste('cuts', i, sep='_')
  noise_y <- gaussian(0,0)
  noise_x <- gaussian_x(0,i,random=T)
  dataset <- add_noise(cuts(nvar=20, nrow=1000), noise_y=noise_y, noise_x=noise_x)
  temp_df <- run_dataset(dataset, NAME)
  reach_df <- rbind(reach_df,temp_df)    
} 
reach_df_avg <- group_by(reach_df, method, name) %>% summarize(reach=mean(reach))
plot_scores(reach_df_avg, 'results/cuts_x_scores_avg.pdf')
ml-e/ML-library documentation built on May 23, 2019, 2:03 a.m.