R/helper_functions.R

Defines functions train_test_validate strat_sample_boot strat_sample standardize sparsity sort_factor shuffle remove_duplicates na_ref mode is_sparseMatrix is_binary initialize_parallel formula_rhs formula_lhs assert as_numeric

Documented in as_numeric assert formula_lhs formula_rhs initialize_parallel is_binary is_sparseMatrix mode na_ref remove_duplicates shuffle sort_factor sparsity standardize strat_sample_boot train_test_validate

#' Convert vectors (including factors) to numeric.
#'
#' @param var A vector. Can be a factor.
#' @return Returns \code{var} as a numeric vector.
#' @examples
#' as_numeric(c("1", "2", "3"))
#' as_numeric(factor(c("1", "2", "3")))
#' @return Numeric vector.
#' @export
as_numeric <- function(var) {
  var <- as.numeric(as.character(var))
  return(var)
}

#' Create assertions with custom error messages.
#'
#' @param condition A condition statement that returns a logical value.
#' @param message A string. A custom error message to return. This is optional.
#' @examples
#' var <- 2
#' assert(is.numeric(var), "This variable is not a numeric!")
#' @export
assert <- function(condition,
                   message = NULL) {
  if(!condition) {
    if(is.null(message)) {
      stop(FALSE, call. = FALSE)
    } else {
      stop(message, call. = FALSE)
    }
  }
}

#' Pull the LHS of a formula.
#'
#' @param form A formula as text or formula.
#' @export
formula_lhs <- function(form) {
  assert(purrr::is_formula(form),
         "Object doesn't appear to be a valid formula.")
  lhs <- trimws(form[2])
  return(lhs)
}

#' Pull the RHS of a formula including '~'.
#'
#' @param form A formula as text or formula.
#' @export
formula_rhs <- function(form) {
  assert(purrr::is_formula(form),
         "Object doesn't appear to be a valid formula.")
  rhs <- paste("~", form[3])
  return(rhs)
}

#' Initialize parallel processing
#' 
#' @export
initialize_parallel <- function(){
  future::plan(future::multiprocess)
}

#' Check if vector-like object is binary.
#'
#' @param var A vector-like object. This can also be a factor.
#' @return Logical value indicating if input is binary or not.
#' @examples
#' var <- c(1, 1, 0, 0, 1)
#' is_binary(var)
#' @export
is_binary <- function(var) {
  ux <- unique(var)
  if (length(ux) != 2) {
    return(FALSE)
  } else {
    if (!all(var %in% c(0, 1))) {
      message(
        "The input has only two unique values, but they are not in {0, 1}"
      )
      return(FALSE)
    } else {
      return(TRUE)
    }
  }
}

#' Check if object is a Sparse Matrix
#' 
#' @param dat Data object.
#' @return Logical indicating if input is sparse Matrix.
#' @export
is_sparseMatrix <- function(dat) {
  return(methods::is(dat, "sparseMatrix"))
}

#' Find the modal value of a vector.
#'
#' @param x A vector-like object.
#' @param na.rm A logical. Whether to remove \code{NA}s when computing.
#' @param all.modes A logical. Indicate whether to return all modes.
#' @return Single mode or vector of all modal values.
#' @examples
#' var <- c(1, 1, 2, 2, 3)
#' mode(var)
#' mode(var, all.modes = TRUE)
#' @export
mode <- function(x,
                 na.rm = TRUE,
                 all.modes = FALSE) {
  if (any(!is.na(x))) {
    if(na.rm == TRUE) {
      ux <- unique(x)[!is.na(unique(x))]
    } else {
      ux <- unique(x)
    }
  } else {
    ux <- NA
  }
  modes <- which(tabulate(match(x, ux)) == max(tabulate(match(x, ux))))
  if (all.modes == FALSE) {
    return(ux[modes[[1]]])
  } else {
    return(ux[modes])
  }
}

#' Convert vector-like object to a factor with NA as reference level.
#'
#' @param var Vector-like object.
#' @return Factor with NA as reference level if appropriate.
#' @examples
#' var <- c(4, 5, NA, 6, 7)
#' na_ref(var)
#' @export
na_ref <- function(var) {
  var <- factor(var,
                exclude = NULL,
                levels = c(NA,
                           unique(var)[!is.na(unique(var))]))
  var <- droplevels(var)
  return(var)
}

#' Remove duplicate coumns from matrices or dataframes
#' 
#' @param dat Matrix, dataframe, or sparse matrix.
#' @param return.sparse Logical. Return object as a sparse matrix? Default is F.
#' @return Input object minus duplicate columns.
#' @export
remove_duplicates <- function(dat, 
                              return.sparse = FALSE){
  is_df <- is.data.frame(dat)
  is_sM <- is_sparseMatrix(dat)
  if(!is_sM) dat <- as.matrix(dat)
  dup.cols <- as.vector(duplicated.matrix(t(dat)))
  dat <- dat[, !dup.cols]
  if(return.sparse == FALSE & !is_sM) {
    if(is_df) {
      return(tibble::as_tibble(dat))
    } else {
      return(dat)
    }
  } else {
    return(Matrix::Matrix(dat, sparse = TRUE))
  }
}

#' Shuffles standard data objects.
#' 
#' @param dat Matrix, dataframe, or vector.
#' @return Input object randomly re-ordered.
#' @export
shuffle <- function(dat) {
  is_df  <- is.data.frame(dat)
  is_mat <- is.matrix(dat)
  if(is_df | is_mat) {
    return(dat[sample(1:nrow(dat)), ])
  } else {
    return(dat[sample(1:length(dat))])
  }
}

#' Set sorted factor levels. Specify specific base level if desired.
#' 
#' @param var Input Vector.
#' @param base.level String indicating base level. Argument is optional.
#' @return Factor with ascending sorted levels and user-specified base level.
#' @export
sort_factor <- function(var,
                        base.level = NULL) {
  if(!is.null(base.level)) {
    var <- factor(var,
                  levels = c(
                    base.level,
                    sort(unique(var))[which(sort(unique(var)) != base.level)]
                  ))
    return(var)
  } else {
    var <- factor(var,
                  levels = sort(unique(var)))
    return(var)
  }
}

#' Checks the sparsity of each column of a dataframe, matrix, or sparse matrix.
#'
#' @param input Dataframe, matrix, or sparse matrix.
#' @param count.na.zero Logical. Should NAs be counted as zeros?
#' @return Dataframe with the respective column names and sparsity.
#' @examples
#' df <- data.frame(a = c(0, 0, 1), b = c(1, 1, 0), c = c(0, 0, 0))
#' sparsity(df)
#' @export
sparsity <- function(input,
                     count.na.zero = FALSE) {
  sparsity <- dplyr::rename(
    tibble::enframe(
      apply(input, 2, function(i) {
        Matrix::nnzero(i, na.counted = count.na.zero)/nrow(input)
      })
    ),
    "VARIABLE"    = "name",
    "PERC_SPARSE" = "value"
  )
  return(sparsity)
}

#' Standardizes vector, dataframe, or matrix.
#'
#' @param var Vector, dataframe, or matrix. Will not standardize factors,
#'   characters, or binary (0,1) vectors or columns.
#' @return Standardized input object.
#' @export
standardize <- function(var) {
  # Function that scales any non-factor/character vector
  stdz <- function(x){
    if(!is.factor(x) & !is.character(x)) {
      x <- as.vector(scale(x))
    }
    return(x)
  }
  # Checks for data.frame and matrix classes
  if(is.data.frame(var)) {
    var <- dplyr::bind_cols(lapply(var, stdz))
  } else if (is.matrix(var)) {
    var <- apply(var, 2, stdz)
  } else {
    var <- stdz(var)
  }
  return(var)
}

strat_sample <- function(df, strat) {
  # Append a column of ids
  df <- dplyr::select(dplyr::mutate(df, id = 1:nrow(df)), `strat`, "id")
  # Get stratified bootstrap sample
  ids <- sort(
    dplyr::pull(
      dplyr::ungroup(
        dplyr::sample_n(
          dplyr::mutate(
            dplyr::group_by(df, get(strat)),
            "N" = dplyr::n()
          ),
          size = get("N"),
          replace = TRUE
        )
      ),
      "id"
    )
  )
  return(ids)
}

#' Returns N stratified bootstrap samples of a dataframe.
#'
#' @param df Dataframe to sample from.
#' @param strat String specifying strata column.
#' @param nboot Numeric value specifying number of bootstrap samples.
#' @param parallel Logical value. Helpful when \code{nboot} is large.
#' @return List of N vectors containing bootstrap indices.
#' @export
strat_sample_boot <- function(df,
                              strat,
                              nboot,
                              parallel = TRUE) {
  # Create a named vector for each bootstrap sample
  boot_list <- paste0("Sample", 1:nboot)
  # Create samples in parallel or not
  if(parallel == FALSE) {
    # Create n samples using strat_sample function
    samples <- lapply(boot_list, function(i) {
      return(strat_sample(df = df, strat = strat))
    })
  } else {
    # Initialize parallel processing
    future::plan(future::multiprocess)
    # Create n samples using strat_sample function
    samples <- future.apply::future_lapply(boot_list, function(i) {
      return(strat_sample(df = df, strat = strat))
    })
  }
  # Return list of n bootstrap samples with names
  names(samples) <- boot_list
  return(samples)
}

#' Create a train, test, and validation split of a dataset.
#' 
#' @param y Outcome vector.
#' @param train.p Fraction of observations in the training set.
#' @param test.p Fraction of observations in the test set
#' @return A list with train, test, and validation indices.
#' @export
train_test_validate <- function(y,
                                train.p,
                                test.p) {
  rand_idx <- shuffle(1:length(y))
  train_idx <- floor(train.p*length(y))
  test_idx <- floor(test.p*length(y)) + train_idx
  train <- rand_idx[1:train_idx]
  test <- rand_idx[(train_idx + 1):test_idx]
  validate <- rand_idx[(test_idx + 1):length(y)]
  assert(sum(c(train, test, validate) == rand_idx) == length(y))
  return(list(train = sort(train),
              test = sort(test),
              validate = sort(validate)))
}
dmolitor/umbrella documentation built on Nov. 10, 2020, 1:25 a.m.