R/prepare_data.R


#' Creates datasets and dataloaders
#'
#' Creates two torch datasets from the given train_x, test_x matrices and
#' train_y, test_y vectors, and converts them into torch dataloaders with the
#' provided batch size. It is used for both the classifier and the adversarial
#' model. During initialization, rows with NAs are omitted.
#'
#' @param train_x numeric, scaled matrix of predictors used for training
#' @param test_x numeric, scaled matrix of predictors used for testing
#' @param train_y numeric vector of target values used for training
#' @param test_y numeric vector of target values used for testing
#' @param batch_size integer indicating a batch size used in dataloader.
#' Default: 50
#' @param dev character, device used for calculations ("cpu" or "gpu")
#'
#' @return list of two data sets and two dataloaders for train and test
#' respectively
#' @export
#'
#' @examples
#' train_x <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3)
#' train_y <- c(1, 2, 3)
#' test_x  <- matrix(c(1, 2, 3, 4), nrow = 2)
#' test_y  <- c(1, 2)
#' dev     <- "cpu"
#' dataset_loader(train_x, train_y, test_x, test_y, batch_size = 1, dev)


dataset_loader <- function(train_x,
                           train_y,
                           test_x,
                           test_y,
                           batch_size = 50,
                           dev) {


  if (!is.numeric(train_x))
    stop("train_x must be numeric")
  if (!is.numeric(test_x))
    stop("test_x must be numeric")
  if (!is.numeric(train_y) || !is.vector(train_y))
    stop("train_y must be numeric vector of target")
  if (!is.numeric(test_y) || !is.vector(test_y))
    stop("test_y must be numeric vector of target")
  if (batch_size != as.integer(batch_size))
    stop("batch_size must be an integer")
  if (!dev %in% c("gpu", "cpu"))
    stop("dev must be gpu or cpu")

  # assigning NA to self avoids an "undefined global variable" NOTE from
  # R CMD check for the self object used inside the dataset definition
  self <- NA

  new_dataset <- torch::dataset(

    name = "new_dataset",

    initialize = function(df, y2) {
      # drop rows containing NAs from the predictor matrix
      df <- stats::na.omit(df)
      x_cont <- df
      # create tensors for x and y and move them to the requested device
      self$x_cont <- torch::torch_tensor(x_cont)$to(device = dev)
      self$y <-
        torch::torch_tensor(y2, dtype = torch::torch_long())$to(device = dev)
    },
    .getitem = function(i) {
      # return the i-th observation as a named list of tensors
      list(x_cont = self$x_cont[i, ], y = self$y[i])
    },
    .length = function() {
      # number of observations, taken from the target tensor
      self$y$size()[[1]]
    }
  )

  # create datasets and dataloaders
  train_ds <- new_dataset(train_x, train_y)
  test_ds  <- new_dataset(test_x, test_y)
  train_dl <- torch::dataloader(train_ds, batch_size = batch_size,
                                shuffle = FALSE)
  test_dl  <- torch::dataloader(test_ds, batch_size = batch_size,
                                shuffle = FALSE)

  return(list("train_ds" = train_ds,"test_ds"=test_ds,
              "train_dl"=train_dl,"test_dl"=test_dl))
}
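
# A minimal usage sketch (an assumption, not part of the package): pull one
# batch from the returned train dataloader with torch's iterator helpers,
# reusing the toy matrices from the roxygen example above. Wrapped in
# `if (FALSE)` so sourcing this file does not execute it.
if (FALSE) {
  ds <- dataset_loader(
    train_x = matrix(c(1, 2, 3, 4, 5, 6), nrow = 3),
    train_y = c(1, 2, 3),
    test_x  = matrix(c(1, 2, 3, 4), nrow = 2),
    test_y  = c(1, 2),
    batch_size = 1,
    dev = "cpu"
  )
  iter  <- torch::dataloader_make_iter(ds$train_dl)
  batch <- torch::dataloader_next(iter)
  batch$x_cont  # 1 x 2 float tensor of predictors on the chosen device
  batch$y       # length-1 long tensor holding the target
}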

#' Prepares data for adversarial model
#'
#' Prepares the classifier's output for the adversarial model by splitting the
#' original predictions into train and test sets.
#'
#' @param preds numeric vector of predictions of target value made by
#' classifier (preferably the probabilistic ones).
#' @param sensitive integer vector of the sensitive attribute which the
#' adversarial model has to predict.
#' @param partition numeric from the [0, 1] range setting the proportion of the
#' train set (the test proportion equals 1-partition). Default: 0.7.
#'
#' @return list of four elements: the train_x and test_x one-column numeric
#' matrices of predictions and the train_y and test_y vectors of sensitive
#' values, for train and test respectively.
#' @export
#'
#' @examples
#'
#' preds <- c(0.312, 0.343, 0.932, 0.754, 0.436,
#'            0.185, 0.527, 0.492, 0.743, 0.011)
#' sensitive <- c(1, 1, 2, 2, 1, 1, 2, 2, 2, 1)
#'
#' prepare_to_adv(preds, sensitive, partition = 0.6)
#'
prepare_to_adv <- function(preds, sensitive, partition = 0.7) {
  if (!is.numeric(preds) || !is.vector(preds))
    stop("preds must be numeric vector of probabilities")
  if (!is.numeric(sensitive) || !is.vector(sensitive))
    stop("sensitive must be numeric vector of mapped sensitive classes")
  if (!is.numeric(partition) || partition > 1 || partition < 0)
    stop("partition must be numeric and in [0, 1]")

  # fixed seed so the prediction split is reproducible across calls
  set.seed(123)
  train_indices <- sample(1:length(preds), length(preds) * partition)
  train_x       <- as.numeric(preds[train_indices])
  train_x       <- matrix(train_x, ncol = 1)
  train_y       <- sensitive[train_indices]
  test_x        <- as.numeric(preds[setdiff(1:length(preds), train_indices)])
  test_x        <- matrix(test_x, ncol = 1)
  test_y        <- sensitive[setdiff(1:length(sensitive), train_indices)]
  return(list("train_x"=train_x,"train_y"=train_y,
              "test_x"=test_x,"test_y"=test_y))
}
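
# A minimal sketch of the intended hand-off (assumed from the argument names,
# not stated in the docs): the list returned by prepare_to_adv() matches the
# inputs of dataset_loader(), so the adversarial's dataloaders can be built
# directly from it. Kept inside `if (FALSE)` so it is never executed here.
if (FALSE) {
  preds     <- c(0.312, 0.343, 0.932, 0.754, 0.436,
                 0.185, 0.527, 0.492, 0.743, 0.011)
  sensitive <- c(1, 1, 2, 2, 1, 1, 2, 2, 2, 1)
  adv_split <- prepare_to_adv(preds, sensitive, partition = 0.7)
  adv_data  <- dataset_loader(adv_split$train_x, adv_split$train_y,
                              adv_split$test_x, adv_split$test_y,
                              batch_size = 5, dev = "cpu")
  adv_data$train_dl  # dataloader over (prediction, sensitive) pairs
}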


#' Preprocesses data for training
#'
#' Prepares the provided dataset for the training process. It makes the data
#' suitable for the training functions, splits it into train, test and
#' validation sets, and provides the other data objects that are necessary for
#' training.
#'
#' WARNING! So far the code in the other functions is not fully prepared for a
#' validation dataset and is designed to use the test set for both testing and
#' validation. Experienced users can, however, use the validation set in place
#' of the test set if they are sure it makes sense there.
#'
#' @param data data.frame, matrix or list representing the whole table of data
#' (categorical variables must be factors).
#' @param target_name character, column name of the target variable. Selected
#' column must be interpretable as categorical.
#' @param sensitive_name character, column name of the sensitive variable.
#' Selected column must be interpretable as categorical.
#' @param privileged character, name of the privileged group within the
#' sensitive variable
#' @param discriminated character, name of the discriminated group within the
#' sensitive variable
#' @param drop_also character vector, column names of other columns to drop
#' (like other sensitive variables).
#' @param sample double from [0,1] setting the size of the sample drawn from
#' the original data set. Default: 1
#' @param train_size double from [0,1] setting the size of the train set. Note
#' that train_size+test_size+validation_size must equal 1. Default: 0.7
#' @param test_size double from [0,1] setting the size of the test set. Note
#' that train_size+test_size+validation_size must equal 1. Default: 0.3
#' @param validation_size double from [0,1] setting the size of the validation
#' set. Note that train_size+test_size+validation_size must equal 1. Default: 0
#' @param seed integer setting the seed for sampling, for reproducibility.
#' Default: NULL
#'
#' @return list of prepared data
#' (
#' train_x - numeric, scaled matrix of predictors for classifier training
#' train_y - integer vector of targets for classifier training
#' sensitive_train - integer vector of sensitive values for adversarial training
#' test_x - numeric, scaled matrix of predictors for classifier testing
#' test_y - integer vector of targets for classifier testing
#' sensitive_test - integer vector of sensitive values for adversarial testing
#' valid_x - numeric, scaled matrix of predictors for classifier validation
#' valid_y - integer vector of targets for classifier validation
#' sensitive_valid - integer vector of sensitive values for adversarial validation
#' data_scaled_test - numeric, scaled data set for testing
#' data_scaled_valid - numeric, scaled data set for validation
#' data_test - whole test dataset, unchanged
#' protected_test - vector of protected (sensitive) values for the test-set explainer
#' data_valid - whole validation dataset, unchanged
#' protected_valid - vector of protected (sensitive) values for the validation-set explainer
#' )
#' @export
#'
#' @examples
#' adult <- fairmodels::adult
#'
#' processed <-
#'   preprocess(
#'     adult,
#'     "salary",
#'     "sex",
#'     "Male",
#'     "Female",
#'     c("race"),
#'     sample = 0.05,
#'     train_size = 0.65,
#'     test_size = 0.35,
#'     validation_size = 0,
#'     seed = 7
#'   )
#'
preprocess <- function(data,
                       target_name,
                       sensitive_name,
                       privileged,
                       discriminated,
                       drop_also = NULL,
                       sample = 1,
                       train_size = 0.7,
                       test_size = 0.3,
                       validation_size = 0,
                       seed = NULL) {


  if (!is.list(data) && !is.matrix(data) && !is.data.frame(data))
    stop("data must be some sort of data holder (list, matrix, data.frame)")
  if (!is.character(privileged))
    stop("privileged must be a character string")
  if (!is.character(discriminated))
    stop("discriminated must be a character string")
  if (train_size < 0 || test_size < 0 || validation_size < 0)
    stop("sizes must be non-negative")
  if (abs(train_size + test_size + validation_size - 1) > 1e-8)
    stop("train_size+test_size+validation_size must equal 1")
  if (!is.character(target_name) || !is.character(sensitive_name))
    stop("target_name and sensitive_name must be characters")
  if (!is.null(drop_also) && !is.character(drop_also))
    stop("drop_also must be a character vector")
  if (sample > 1 || sample < 0)
    stop("sample must be between 0 and 1")
  if (!is.null(seed) && seed != as.integer(seed))
    stop("seed must be an integer")

  col    <- data[[sensitive_name]]
  # balance the dataset so both sensitive groups have the same number of rows,
  # otherwise the adversarial model tends to collapse to predicting one class
  M      <- min(table(col))
  df_new <- data[col == privileged, ][1:M, ]
  df_new <- rbind(df_new, data[col == discriminated, ][1:M, ])
  data   <- df_new
  data   <- stats::na.omit(data)

  set.seed(seed)

  sample_indices <- sample(1:nrow(data), nrow(data) * sample)

  data           <- data[sample_indices, ]
  data           <- stats::na.omit(data)

  sensitive <- as.integer(data[[sensitive_name]])

  target    <- as.integer(data[[target_name]])

  # drop columns we don't want in the learning set
  # (the target, the sensitive attribute and any extra columns in drop_also)
  drop_cols  <- c(target_name, sensitive_name, drop_also)
  data_coded <- data[, -which(names(data) %in% drop_cols)]
  data_coded <- data.frame(data_coded)
  # encode non-numeric (factor) columns as integer codes
  for (i in 1:ncol(data_coded)) {
    if (!is.numeric(data_coded[, i])) {
      data_coded[, i] <- as.integer(data_coded[, i])
    }
  }
  # scale the predictor matrix (center and scale each column)
  data_matrix <- matrix(unlist(data_coded), ncol = ncol(data_coded))
  data_scaled <- scale(data_matrix, center = TRUE, scale = TRUE)
  # prepare indices for the train, test and validation splits
  set.seed(seed)
  train_indices <- sample(1:nrow(data_coded), train_size * nrow(data_coded))
  rest_indices  <- setdiff(1:nrow(data_coded), train_indices)
  set.seed(seed)
  test_indices       <- sample(rest_indices, test_size /
                            (1 - train_size) * length(rest_indices))
  validation_indices <- setdiff(rest_indices, test_indices)

  data_scaled_test   <- data_scaled[test_indices, ]
  data_scaled_valid  <- data_scaled[validation_indices, ]

  train_x            <- data_scaled[train_indices, ]
  train_y            <- target[train_indices]
  sensitive_train    <- sensitive[train_indices]

  test_x             <- data_scaled[test_indices, ]
  test_y             <- target[test_indices]
  sensitive_test     <- sensitive[test_indices]

  valid_x            <- data_scaled[validation_indices,]
  valid_y            <- target[validation_indices]
  sensitive_valid    <- sensitive[validation_indices]


  data_test          <- data[test_indices, ]
  protected_test     <- data_test[[sensitive_name]]

  data_valid         <- data[validation_indices, ]
  protected_valid    <- data_valid[[sensitive_name]]

  prepared_data <- list(
    "train_x" = train_x,
    "train_y" = train_y,
    "sensitive_train" = sensitive_train,
    "test_x" = test_x,
    "test_y" = test_y,
    "sensitive_test" = sensitive_test,
    "valid_x" = valid_x,
    "valid_y" = valid_y,
    "sensitive_valid" = sensitive_valid,
    "data_scaled_test" = data_scaled_test,
    "data_scaled_valid" = data_scaled_valid,
    "data_test" = data_test,
    "protected_test" = protected_test,
    "data_valid" = data_valid,
    "protected_valid" = protected_valid
  )

  return(prepared_data)
}
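
# An end-to-end sketch (an assumption based on the shapes returned above, not
# an official example): the matrices and vectors produced by preprocess() plug
# straight into dataset_loader() to build the classifier's dataloaders. Kept
# inside `if (FALSE)` so it is never executed when this file is sourced.
if (FALSE) {
  adult     <- fairmodels::adult
  processed <- preprocess(
    adult, "salary", "sex", "Male", "Female", c("race"),
    sample = 0.05, train_size = 0.65, test_size = 0.35,
    validation_size = 0, seed = 7
  )
  clf_data <- dataset_loader(processed$train_x, processed$train_y,
                             processed$test_x, processed$test_y,
                             batch_size = 50, dev = "cpu")
  clf_data$train_dl  # batches of scaled predictors and integer targets
}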