OnlineSuperLearner: Online SuperLearner package

Documented in OutputPlotGenerator.create_convergence_plot OutputPlotGenerator.create_density_plot OutputPlotGenerator.create_risk_plot OutputPlotGenerator.create_training_curve OutputPlotGenerator.export_key_value OutputPlotGenerator.get_colors OutputPlotGenerator.get_simple_colors OutputPlotGenerator.store_oos_osl_difference

#' OutputPlotGenerator.create_density_plot
#'
#' Function to plot the true and predicted densities in one plot.
#'
#' @importFrom stats density
#' @importFrom grDevices dev.off pdf
#' @importFrom graphics lines plot
#'
#' @param yValues vector a vector of the true outcome values.
#'
#' @param estimated_probabilities vector a vector of the estimated
#'  probabilities.
#'
#' @param estimated_y_values vector (default = NULL) vector of the estimated
#'  y_values (i.e., sampled values).
#'
#' @param output string the filename to use.
#'
#' @param dir string (default = 'tmp')the directory to plot to.
#' @export
OutputPlotGenerator.create_density_plot = function(yValues, estimated_probabilities, estimated_y_values = NULL, output, dir = 'tmp') {
  ## plot densitity first:
  vals <- unique(yValues)
  if(length(vals) == 2 ) {
    ## If the outcome is binary, we can see how well it managed to predict the whole distribution
    ## This error term should be small (~ 0.001)
    abs(mean(estimated_probabilities[yValues == vals[1] ]) - mean(yValues == vals[1]))
    abs(mean(estimated_probabilities[yValues == vals[2] ]) - mean(yValues == vals[2]))
  }
  true_density <- density(yValues)

  ## Normalize the density to 1
  ##true_density$y <- diff(true_density$x) * true_density$y

  ## Draw a line by default, instead of dots
  type = "l"

  if (is.null(estimated_y_values)) {
    estimated_y_values <- yValues 
    type = "p"
  }

  ## Normalize to sum to one, to make it an actual density

  ## Save the output in a dir so we can access it later
  full_file_name <- get_file_location(name = output,
                                      extension = 'pdf',
                                      dir = dir,
                                      add_date_to_dir=TRUE)

  ylim <- c(0,max(c(estimated_probabilities, true_density$y), na.rm=TRUE))

  pdf(full_file_name)

  plot(true_density, ylim=ylim)
  lines(estimated_y_values, estimated_probabilities, type = type, cex = .3, col = "red",
        ylim=c(0,max(estimated_probabilities)+.5))
  dev.off()
  return(full_file_name)
}

#' OutputPlotGenerator.create_convergence_plot
#'
#' Function to plot the convergence to the parameter of interest of both the
#' truth and the estimation.
#'
#' @import ggplot2
#' @import reshape2
#' @importFrom graphics lines plot
#' @importFrom grDevices dev.off pdf
#'
#' @param data the truth and apporoximations to plot.
#'
#' @param output string the filename of the output.
#'
#' @param dir string (default = 'tmp') the directory name to
#'  store the output in.
#' @export
OutputPlotGenerator.create_convergence_plot = function(data, output, dir = 'tmp', range = c(0,1)) {
  labels = names(data)
  colors <- OutputPlotGenerator.get_colors(length(labels))

  full_file_name <- get_file_location(name = output,
                                      extension = 'pdf',
                                      subdir = 'convergence',
                                      dir = dir)

  pdf(full_file_name)

  data <- lapply(data, function(dat) cumsum(dat)/seq(along=dat)) %>%
    as.data.frame

  data$epoch <- seq(nrow(data))
  data <- melt(data, id.vars='epoch')

  plotje <- ggplot(data,aes(epoch, value, color=variable))+
    geom_line() + 
    scale_color_manual(values = colors[seq_along(labels)])+
    theme(panel.background = element_rect(fill = 'transparent', colour = 'black', size=1))+
    scale_y_continuous(name="")+
    theme(axis.text.y = element_text(colour = "black") ) +
    theme(axis.text.x = element_text(colour = "black") ) +
    theme(axis.title.x = element_text(vjust = -0.5)) +
    theme(legend.title = element_blank())+
    theme(legend.position="bottom")+
    theme(legend.background = element_rect(colour="transparent"))+
    theme(legend.key = element_rect(fill = "transparent", colour = "transparent")) +
    theme(legend.key.size= unit(3,"lines"))


  if(!is.null(range)) {
    plotje <- plotje + ylim(range[1], range[2])
  }

  plot(plotje)
  dev.off()
}


#' OutputPlotGenerator.create_training_curve
#'
#' Function to plot the training curve given the number of iterations used
#'
#' @import ggplot2
#' @import reshape2
#' @importFrom grDevices dev.off pdf
#' @importFrom graphics plot
#'
#' @param historical_cvs the historical CV risks. List of lists of datatables.
#'  First list is for each iteration, second for each learner, and the datatable
#'  has columns for each relevant variable.
#'
#' @param relevantVariables list of relevantvariables to use in the output
#'
#' @param output string (default = 'historical_cvs') the filename to write the
#'  pdf to (without .pdf).
#'
#' @param dir string (default = 'tmp') the directory to write the file
#'  to.
#' @export
OutputPlotGenerator.create_training_curve = function(historical_cvs, relevantVariables, output = 'historical_cvs', dir = 'tmp') {
  # historical_cvs is a list of lists of datatables
  #  - each iteration
  #    - each learner
  #      - each RV
  result <- lapply(relevantVariables, function(rv){
    lapply (historical_cvs, function(epoch) {
      lapply(epoch, function(algorithm_outcome) algorithm_outcome[, rv$getY, with=FALSE]) %>% unlist
    })
  })
  names(result) <- lapply(relevantVariables, function(rv) rv$getY)

  plots <- lapply(seq_along(result), function(rv_id) {
    rv_result <- result[[rv_id]]
    dt <- as.data.table(rv_result)%>% t

    the_names <- names(rv_result[[1]])
    for (name_id in seq_along(the_names)) {
      name <- the_names[[name_id]]
      the_key <- paste('algorithm', name_id, sep='_') 
      OutputPlotGenerator.export_key_value(the_key, name)
    }
    
    colnames(dt) <- the_names
    dt = tryCatch({
      data.table(dt)
    }, error = function(e) {
      browser()
    })
    dt[, epoch := seq(1, length(rv_result))]

    test_data_long <- reshape2::melt(dt, id='epoch')
    test_data_long
    names(test_data_long)
    ggplot(data=test_data_long, aes(x=epoch, y=value, colour=variable)) +
        geom_line()+
        ggtitle(names(result)[rv_id])+
        theme(axis.text.y = element_text(colour = "black") ) +
        theme(axis.text.x = element_text(colour = "black") ) +
        theme(axis.title.x = element_text(vjust = -0.5)) +
        theme(legend.title = element_blank())+
        theme(legend.background = element_rect(colour="transparent"))+
        theme(legend.key = element_rect(fill = "transparent", colour = "transparent")) +
        theme(legend.key.size= unit(1,"lines"))

  })
  
  full_file_name <- get_file_location(name = output,
                                      extension = 'pdf',
                                      dir = dir)
  


  cat('create_training_curve plotting to:', full_file_name)
  pdf(full_file_name)
  for (i in seq_along(plots)) {
    plot(plots[[i]])
  }
  dev.off()



  #data <- lapply(data, function(dat) cumsum(dat)/seq(along=dat)) %>%
    #as.data.frame

  #data$epoch <- seq(nrow(data))
  #data <- melt(data, id.vars='epoch')

  #plotje <- ggplot(data,aes(epoch,value,color=variable))+
    #geom_line() + 
    #scale_color_manual(values = colors[seq_along(labels)])+
    #theme(panel.background = element_rect(fill = 'transparent', colour = 'black', size=1))+
    #scale_y_continuous(name="")+

  #plot(plotje)
}


#' OutputPlotGenerator.create_risk_plot
#'
#' Function to create plots similar to the ones in the OSL papers
#'
#' @importFrom stats binomial density terms
#' @importFrom grDevices dev.off pdf
#' @importFrom graphics lines plot
#' @importFrom utils head
#'
#' @param performance a list of list with performances: list 1 the estimators
#'  used, list 2 the relevant variables predicted.
#'
#' @param output string the filename to use for the plot (without .pdf).
#'
#' @param dir string (default = 'tmp') the directory to store the results
#'  in.
#'
#' @param make_summary boolean (default = FALSE) whether or not to sum all error terms to give a
#'  summed performance measure.
#'
#' @param label string (default = 'total.risk') the label to use when making a
#'  summary plot
#'
#' @export
OutputPlotGenerator.create_risk_plot = function(performance, output, dir = 'tmp', make_summary=FALSE, label='total.risk') {
  # Performance should be a list of data.tables:
  # List: the estimator used
  # Datatable: the relevant variable predicted
  if(make_summary) {
    ## Beware, the unlist is needed, it looks the same but without the unlist the contents are lists instead of numerics
    performance_dt <- lapply(performance,sum) %>% unlist %>% data.table
    colnames(performance_dt) <- label
  } else {
    performance_dt <- performance %>% rbindlist
  }
  ml_names <- names(performance)

  ## Append the names of the algorithms, so ggplot can plot them
  performance_dt[, 'names' :=  ml_names]

  ## Retrieve the names of the outcomes, -1 because the names column does not need a color
  outcomes <- head(colnames(performance_dt), -1)

  full_file_name <- get_file_location(name = output,
                                      extension = 'pdf',
                                      dir = dir)

  p <- ggplot(performance_dt)

  colors <- OutputPlotGenerator.get_simple_colors(length(outcomes))

  names(colors) <- outcomes
  for (name in outcomes) {
    p <- p + geom_point(size = 2, aes_string(x=name, y='names'), color=colors[[name]])+
      theme(legend.position="left")
  }

  p <- p +
    scale_linetype(guide = guide_legend(override.aes = list(alpha = 1)), labels = outcomes) + 
    theme(legend.position = c(.75, .25))+
    theme(panel.background = element_rect(fill = 'transparent', colour = 'black', size=1))+
    theme(axis.text.y = element_text(colour = "black") ) +
    theme(axis.title.y=element_blank()) +
    labs(y = '', x = paste(names(colors), colors, sep=': ', collapse = ', '))+
    theme(legend.position="left")+
    theme(legend.background = element_rect(colour="transparent"))+
    theme(legend.key = element_rect(fill = "transparent", colour = "transparent")) +
    theme(legend.key.size= unit(3,"lines"))


  pdf(full_file_name)
  cat('Create_risk_plot plotting to:', full_file_name)
  plot(p)
  dev.off()
}

#' OutputPlotGenerator.get_colors
#' 
#' Function to return a list of colors, depending on the number of variables /
#' colors needed, uses hex for colors
#'
#' @param number_of_variables integer the number of colors one requires.
#'
#' @return a list \code{number_of_variables} of colors
#' @export
OutputPlotGenerator.get_colors = function(number_of_variables) {
  if (number_of_variables > 12 || number_of_variables < 1) {
    return(NULL) 
  }
  list(
    tol1qualitative=c("#4477AA"),
    tol2qualitative=c("#4477AA", "#CC6677"),
    tol3qualitative=c("#4477AA", "#DDCC77", "#CC6677"),
    tol4qualitative=c("#4477AA", "#117733", "#DDCC77", "#CC6677"),
    tol5qualitative=c("#332288", "#88CCEE", "#117733", "#DDCC77", "#CC6677"),
    tol6qualitative=c("#332288", "#88CCEE", "#117733", "#DDCC77", "#CC6677","#AA4499"),
    tol7qualitative=c("#332288", "#88CCEE", "#44AA99", "#117733", "#DDCC77", "#CC6677","#AA4499"),
    tol8qualitative=c("#332288", "#88CCEE", "#44AA99", "#117733", "#999933", "#DDCC77", "#CC6677","#AA4499"),
    tol9qualitative=c("#332288", "#88CCEE", "#44AA99", "#117733", "#999933", "#DDCC77", "#CC6677", "#882255", "#AA4499"),
    tol10qualitative=c("#332288", "#88CCEE", "#44AA99", "#117733", "#999933", "#DDCC77", "#661100", "#CC6677", "#882255", "#AA4499"),
    tol11qualitative=c("#332288", "#6699CC", "#88CCEE", "#44AA99", "#117733", "#999933", "#DDCC77", "#661100", "#CC6677", "#882255", "#AA4499"),
    tol12qualitative=c("#332288", "#6699CC", "#88CCEE", "#44AA99", "#117733", "#999933", "#DDCC77", "#661100", "#CC6677", "#AA4466", "#882255", "#AA4499")
  )[[number_of_variables]]
}

#' OutputPlotGenerator.get_simple_colors
#'
#' Function to return a list of colors, depending on the number of variables / colors needed
#'
#' @importFrom RColorBrewer brewer.pal.info brewer.pal
#'
#' @param number_of_variables integer the number of colors one requires.
#'
#' @return a list of \code{number_of_variables} colors
#' @export
OutputPlotGenerator.get_simple_colors = function(number_of_variables) {
  if (number_of_variables < 1) {
    return(NULL)
  }
  number_of_variables <- abs(number_of_variables)
  qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
  col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
  sample(col_vector, number_of_variables)
}

#' OutputPlotGenerator.export_key_value 
#'
#' Function to export a key value pair to a file
#'
#' @param key the key to store the value under. That is, the data is stored as
#'  a key value pair. This is the key in that pair. 
#'
#' @param value the value to store. That is, the data is stored as
#'  a key value pair. This is the value in that pair.
#'
#' @param output string (default = 'variables.dat') the filename to store the
#'  data in.
#'
#' @param dir string (default = 'tmp') the dir to write the file to.
#' @export
OutputPlotGenerator.export_key_value = function(key, value, output='variables', dir = 'tmp') {
  if (is.numeric(value)) {
    value <- round(value, 3)
  }
  line <- paste(key,'=',value)

  full_file_name <- get_file_location(name = output,
                                      extension = 'dat',
                                      dir = dir)

  if (!file.exists(full_file_name)) {
    file.create(full_file_name)
  }

  write(line,file=full_file_name,append=TRUE)
}

#' OutputPlotGenerator.store_oos_osl_difference
#'
#' Function to store the differences between the truth and osl in a file.
#'
#' @param differences list a list of differences with a key for each option.
#'
#' @param output string the name of the file to write the output to.
#'
#' @param oos boolean was the \code{OneStepEstimator} used to generate the specified data?
#'
#' @param configuration integer (default = 0) which simulation configuration was used?
#' @export
OutputPlotGenerator.store_oos_osl_difference = function(differences, output, oos, configuration = 0) {
  name <- ifelse(oos, 'post-oos', 'pre-oos')
  for (output_name in names(differences)) {
    key <- paste('cfg', configuration, output_name, name, sep='-')
    OutputPlotGenerator.export_key_value(key, differences[[output_name]], output=output)
  }
}