R/weightingFunctions.R

#' Average posterior vote
#'
#' @description Create ensemble combination metric arrays with dimension \code{n X K X M}
#' with elements v_lim = obs i metric for class l from member m
#' Using average posterior rule\cr\cr
#'
#'\code{n} is the number of observations\cr
#'\code{K} is the number of classes\cr
#'\code{M} is the number of models\cr
#'
#' I DO NOT YET UNDERSTAND THIS ONE
#'
#' @param test_post_probs Posterior probabilities from model predictions on test data
#' @return \code{model_metric}
#' @examples
#'
#' @export
make_model_metric_array_avgpost <- function(test_post_probs){
  n = nrow(test_post_probs[[1]])                                # number of observations
  K = ncol(test_post_probs[[1]])                                # number of classes
  M = length(test_post_probs)                                   # number of models
  model_metric <- array(NA,c(n,K,M))
  for(m in 1:M){
    model_metric[,,m] <- test_post_probs[[m]]
  }
  dimnames(model_metric) <- list(dimnames(test_post_probs[[1]])[[1]],dimnames(test_post_probs[[1]])[[2]],1:M)
  return(model_metric)
}

#' Majority vote
#'
#' @description Create ensemble combination metric arrays with dimension n X K X M
#' with elements v_lim = obs i metric for class l from member m
#' Using majority rule
#'
#' I DO NOT YET UNDERSTAND THIS ONE
#'
#' @param test_post_probs Posterior probabilities from model predictions on test data
#' @return \code{model_metric}
#' @examples
#'
#' @export
make_model_metric_array_majvote <- function(test_preds){
  n = nrow(test_preds)
  K = length(levels(test_preds[,1]))
  M = ncol(test_preds)
  model_metric <- array(NA,c(n,K,M))
  for(m in 1:M){
    for(l in 1:K){
      for(i in 1:n){
        model_metric[i,l,m] <- as.numeric(test_preds[i,m]==levels(test_preds[,1])[l])
      }
    }
  }
  dimnames(model_metric) <- list(row.names(test_preds),levels(test_preds[,1]),1:M)
  return(model_metric)
}

#' Model metric array for combination rules
#'
#' @description Function for making model metric array for combination rules\cr
#' I DO NOT YET UNDERSTAND THIS ONE \cr
#' NEEDS TO BE GENERALIZED
#'
#' @param combination_rule "majority vote" or "average posterior"
#' @param model_storage_list A list holding models from RWeka
#' @param test_data A data frame holding data on which to test
#' @param true_classes An array holding the order of the true labels CANDIDATE FOR REPLACEMENT
#'
#' @return \code{model_metric}
#' @examples
#'
#' @export
make_model_metric_array <- function(combination_rule, model_storage_list, test_data, true_classes){

  model_metric = NULL
  if(combination_rule == "majority vote"){
    test_preds <- as.data.frame(matrix(0, ncol = length(model_storage_list), nrow = dim(test_data)[1]))
    for(i in 1:length(model_storage_list)){
      test_preds[,i] <- factor(predict(model_storage_list[[i]], type = "class", newdata = test_data), levels = true_classes)
    }
    model_metric <- make_model_metric_array_majvote(test_preds)
  }
  if(combination_rule == "average posterior"){
    test_preds <- list()
    for(i in 1:length(model_storage_list)){
      test_preds[[i]] <- predict(model_storage_list[[i]], type = "probability", newdata = test_data)
    }
    model_metric <- make_model_metric_array_avgpost(test_preds)
  }
  return(model_metric)
}

#' Model weights for "weight_type == "weighted"
#'
#' @description Calculate the model weights when "weight_type" == "weighted"
#'
#' @param train_data Training data with predicted class columns from each model \code{1,...,M}
#' @param n The number of instances in the test data
#'
#' @return matris of weights
#' @export
weighted <- function(train_data, M, n){
  model_accuracies <- array(sapply(paste("preds",1:M,sep=""), function(x){
    sum(train_data$true_class==train_data[,x])
  }), dim=c(M,1))
  model_weights <- array(NA,c(n,M))
  for(m in 1:M){
    model_weights[,m] <- model_accuracies[m]
  }
  return(model_weights)
}

#' Model weights for "weight_type == "bin weighted" VERSION 2
#'
#' @description Calculate the model weights when "weight_type" == "bin weighted"
#' # TODO this needs to be cleaned up
#' @param bin_features Training data with predicted class columns from each model \code{1,...,M}
#' @param bin_type Type of binning {"standard","quantile","iterative quantile"}
#' @param nbins vector containing number of bins in each dimension
#' @param train_data_preds data frame containing training data and CV prediction columns
#' @param test_data data frame containing test data. Must have same column names as training data
#' @param M number of models in bin weighted ensemble
#' @param K number of true classes
#'
#' @return matrix of bin weights
#'
#' @export
bin_weighted <- function(bin_features, bin_type, nbins, train_data_preds, test_data, M, K, rotate=FALSE){
  # rotation before binning optional
  if(rotate==TRUE){
    p <- ncol(test_data)-1 #!# basing the number of columns on the test data may be unstable in future iterations, find "p" another way
    xcols <- 1:p !# Danger in hard coded column positions, train_data_preds column order depends on original data
    train_pca <- prcomp(train_data_preds[,xcols])
    phi_matrix <- train_pca$rotation
    train_data_preds[,xcols] <- as.data.frame(as.matrix(train_data_preds[,xcols]) %*% phi_matrix)
    test_data[,xcols] <- as.data.frame(as.matrix(test_data[,xcols]) %*% phi_matrix)
    names(train_data_preds)[xcols] <- paste0("PC",1:p) #!#
    names(test_data)[xcols] <- paste0("PC",1:p) #!#
    bin_features <- paste0("PC",1:length(nbins))
  }
  ## Start with creating bin definitions based on "training data" then bin "test data" with that definition
  if(bin_type %in% c("standard","quantile")){
    bin_train <- bin_nd(data=train_data_preds, bin_features=bin_features, nbins=nbins, bin_type=bin_type, output="both")
    bin_test <- bin_nd_by_def(test_data, bin_nd_def=bin_train$bin_def)
  } else if(bin_type=="iterative quantile"){
    bin_train <- iterative_quant_bin(data=train_data_preds, bin_cols=bin_features, nbins=nbins, output="both", jit=rep(.001,length(nbins)))
    bin_test <- bin_by_iq_def(bin_def=bin_train$bin_def, new_data=test_data, output="data", strict=FALSE)
  } else {
    print("Please provide a supported bin_type")
    return(NULL)
  }
  ## Collect training accurcies of each bin using the cross validated predictions in train_data_preds
  # any region without existing data differs to overall model accuracies for weights
  B = nrow(bin_train$bin_def$bin_centers)
  model_accuracies <- sapply(paste("preds",1:M,sep=""), function(x){
    sum(train_data_preds$true_class==train_data_preds[,x])/nrow(train_data_preds)
  })
  bin_accuracy_array <- matrix(rep(model_accuracies,B),c(M,B), dimnames=list(1:M,1:B))
  for(m in 1:M){
    for(b in unique(bin_train$bin_data$bin_index)){
      inBin <- which(bin_train$bin_data$bin_index==b)
      bin_accuracy_array[m,as.numeric(as.character(b))] <- sum(train_data_preds$true_class[inBin]==train_data_preds[,paste("preds",m,sep="")][inBin])/length(inBin)
    }
  }

  ## set weights for test data observations based on the training accuracies of the bin they belong to
  n=nrow(test_data)
  model_weights <- array(NA,c(n,M))
  for(b in unique(bin_test$bin_indeces)){
    binSet <- bin_test$bin_indeces==b
    model_weights[binSet,] <- bin_accuracy_array[,b]
  }
  return(model_weights)
}

#----------------------------------------------------------------------------------------------------------

#' Establish bin dictator weights
#'
#' @description Take bin weights from bin_weights function and set all but best model weight in each bin to zero
#'
#' @param bin_model_weights nXM matrix of weights from the bin_weighted function
#'
#' @return matrix of bin-dictator weights
#' @export
bin_dictator_weighted <- function(bin_model_weights){
  for(i in 1:nrow(bin_model_weights)){
    bin_model_weights[i,bin_model_weights[i,] < max(bin_model_weights[i,])] <- 0
  }
  return(bin_model_weights)
}



#' Model weights for "weight_type == "knn"
#'
#' @description Calculate the model weights using k-nearest neighbor accuracy weights when "weight_type" == "knn"
#'
#' @param train_data Training data with predicted class columns from each model \code{1,...,M}
#' @param n The number of instances in the test data
#' @param scale TRUE/FALSE for rescaling data (default: scale=TRUE)
#' @param neighbors number of neighbors used in accuracy estimates for each model
#'
#' @return matris of weights nrow=number of test points, rcol=M=number of models
#' @export
knn_weighted <- function(train_data, test_data, M, scale=TRUE,knn_size=10){
  feature_names <- names(test_data)[names(test_data)!="true_class"]
  if(scale==TRUE){
    # standardize training data
    train_x <- scale(train_data[,feature_names])
    # standardize test data based training center/scales
    test_x <- scale(test_data[,feature_names],center=attributes(train_x)$'scaled:center',scale=attributes(train_x)$'scaled:scale')
  } else {
    train_x <- train_data[,feature_names]
    test_x <- test_data[,feature_names]
  }
  # find knn of each test point
  knn_idx <- FNN::get.knnx(data = train_x, query = test_x, k = knn_size, algorithm = "brute")
  # find accuracy of predictions from each model on k nearest neighborhood
  model_weights <- array(NA,c(nrow(test_data),M))
  for(i in 1:nrow(test_data)){
      model_weights[i,] <- sapply(paste("preds",1:M,sep=""), function(x) sum(train_data$true_class[knn_idx$nn.index[i,]]==train_data[knn_idx$nn.index[i,],x])/knn_size )
  }
  return(model_weights)
}
kmaurer/binsemble documentation built on May 7, 2019, 9:50 p.m.