R/utils.R

Defines functions regr_folds class_folds func_shuffle func_categorical_preds FUNCTION_weights func_tbl func_tbl_dist normalized entropy

Documented in class_folds entropy func_categorical_preds func_shuffle func_tbl func_tbl_dist FUNCTION_weights normalized regr_folds

#==============================================================================================================================================
#' this function compute entropy
#'
#' @keywords internal

entropy = function(x){
  diff = x/sum(x)
  -sum(sapply(diff, function(x){x*log(x)}))
}

#' this function normalizes the data
#'
#' @keywords internal

normalized = function(x) {
  if(max(x) == min(x)){
    out = x
  }else{
    out = (x - min(x))/(max(x) - min(x))
  }
  out
}


#' this function returns the probabilities in case of classification
#'
#' @keywords internal

func_tbl_dist = function(DF, Levels) {

  mat = matrix(rep(0, dim(DF)[1] * length(Levels)), ncol = length(Levels), nrow = dim(DF)[1])

  for (i in 1:dim(DF)[1]) {

    tmp_tbl = prop.table(table(DF[i, ]))

    mat[i, as.numeric(names(tmp_tbl))] = tmp_tbl
  }

  mat
}


#' this function returns a table of probabilities for each label
#'
#' @keywords internal

func_tbl = function(DF, W, labels) {

  tmp_W = matrix(rep(0, dim(DF)[1] * length(labels)), ncol = length(labels), nrow = dim(DF)[1])

  for (i in 1:length(labels)) {

    tmp_W[, i] <- rowSums(W * (DF == labels[i]))
  }

  tmp_W
}


#' this function is used as a kernel-function-identifier [ takes the distances and a weights-kernel (in form of a function) and returns weights ]
#'
#' @keywords internal

FUNCTION_weights = function(W_dist_matrix, weights_function, eps = 1.0e-6) {

  W_dist_matrix = t(apply(W_dist_matrix, 1, normalized))

  W_dist_matrix = W_dist_matrix - eps

  W = do.call(weights_function, list(W_dist_matrix))

  W <- W/rowSums(W)

  W
}


#' Arithmetic operations on lists
#' 
#' @keywords internal


switch.ops = function (LST, MODE = 'ADD') {
  
  if (!inherits(LST, "list"))  stop("LST must be a list")
  
  if (!all(unlist(lapply(LST, function(x) inherits(x, c('data.frame', 'matrix')))))) stop('the sublist objects must be either matrices or data frames')
  
  r = all(unlist(lapply(LST, nrow)) == unlist(lapply(LST, nrow))[1])
  
  c = all(unlist(lapply(LST, ncol)) == unlist(lapply(LST, ncol))[1])
  
  if (!all(c(r, c))) stop("the dimensions of the included data.frames or matrices differ")
  
  if (MODE == 'ADD') {
    
    init_df = data.frame(matrix(rep(0, dim(LST[[1]])[1] * dim(LST[[1]])[2]), nrow = dim(LST[[1]])[1], ncol = dim(LST[[1]])[2]))}
  
  else if (MODE == 'MULT') {
    
    init_df = data.frame(matrix(rep(1, dim(LST[[1]])[1] * dim(LST[[1]])[2]), nrow = dim(LST[[1]])[1], ncol = dim(LST[[1]])[2]))
  }
  
  else {
    
    stop('invalid MODE type')
  }
  
  for (i in 1:length(LST)) {
    
    if (MODE == 'ADD') {
      
      init_df = init_df + LST[[i]]}
    
    if (MODE == 'MULT') {
      
      init_df = init_df * LST[[i]]
    }
  }

  colnames(init_df) = colnames(LST[[1]])
  
  return(as.matrix(init_df))
}


#' OPTION to convert categorical features TO either numeric [ if levels more than 32] OR to dummy variables [ if levels less than 32 ]
#'
#' @keywords internal
#' @importFrom stats model.matrix

func_categorical_preds = function(prepr_categ) {

  less32 = sapply(prepr_categ, function(x) is.factor(x) && length(unique(x)) < 32)
  greater32 = sapply(prepr_categ, function(x) is.factor(x) && length(unique(x)) >= 32)

  if (sum(less32) == 1) {

    rem_predictors = names(which(less32))
    out_L = model.matrix(~. - 1, data = data.frame(prepr_categ[, rem_predictors]))
    colnames(out_L) = paste0(rem_predictors, 1:dim(out_L)[2])
  }

  if (sum(less32) > 1) {

    rem_predictors = names(which(less32))
    out_L = model.matrix(~. - 1, data = prepr_categ[, rem_predictors])
    colnames(out_L) = make.names(colnames(out_L))
  }

  if (sum(greater32) > 0) {

    fact_predictors = names(which(greater32))

    for (nams in fact_predictors) {

      prepr_categ[, nams] = as.numeric(prepr_categ[, nams])
    }
  }

  if (sum(less32) > 0) {

    return(cbind(prepr_categ[, -which(colnames(prepr_categ) %in% rem_predictors)], out_L))
  }

  else {

    return(prepr_categ)
  }
}


#' shuffle data
#'
#' this function shuffles the items of a vector
#' @keywords internal

func_shuffle = function(vec, times = 10) {
  
  for (i in 1:times) {
    
    out = sample(vec, length(vec))
  }
  out
}


#' stratified folds (in classification)                      [ detailed information about class_folds in the FeatureSelection package ]
#'
#' this function creates stratified folds in binary and multiclass classification
#' @keywords internal
#' @importFrom utils combn


class_folds = function(folds, RESP) {
  
  if (!is.factor(RESP)) {
    
    stop(simpleError("RESP must be a factor"))
  }
  
  clas = lapply(unique(RESP), function(x) which(RESP == x))
  
  len = lapply(clas, function(x) length(x))
  
  samp_vec = rep(1/folds, folds)
  
  prop = lapply(len, function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x])))
  
  repl = unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y]))))
  
  spl = suppressWarnings(split(1:length(RESP), repl))
  
  sort_names = paste0('fold_', 1:folds)
  
  spl = spl[sort_names]
  
  spl = lapply(spl, function(x) func_shuffle(x))           # the indices of the unique levels will be shuffled
  
  ind = t(combn(1:folds, 2))
  
  ind1 = apply(ind, 1, function(x) length(intersect(spl[x[1]], spl[x[2]])))
  
  if (sum(ind1) > 0) {
    
    stop(simpleError("there is an intersection between the resulted indexes of the folds"))
    
  }
  
  if (length(unlist(spl)) != length(RESP)) {
    
    stop(simpleError("the number of items in the folds are not equal with the response items"))
  }
  
  spl
}


#' create folds (in regression)                                           [ detailed information about class_folds in the FeatureSelection package ]
#'
#' this function creates both stratified and non-stratified folds in regression
#' @keywords internal


regr_folds = function(folds, RESP) {
  
  if (is.factor(RESP)) {
    
    stop(simpleError("this function is meant for regression for classification use the 'class_folds' function"))
  }
  
  samp_vec = rep(1/folds, folds)
  
  sort_names = paste0('fold_', 1:folds)
  
  prop = lapply(length(RESP), function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x])))
  
  repl = func_shuffle(unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y])))))
  
  spl = suppressWarnings(split(1:length(RESP), repl))
  
  spl = spl[sort_names]
  
  if (length(unlist(spl)) != length(RESP)) {
    
    stop(simpleError("the length of the splits are not equal with the length of the response"))
  }
  
  spl
}



#================================================================================================================================================================

Try the fmf package in your browser

Any scripts or data that you put into this service are public.

fmf documentation built on Sept. 3, 2020, 9:07 a.m.