R/utils.R

Defines functions condense_rows dsample sample_choice

Documented in condense_rows dsample sample_choice

#' @title Modified version of base::sample for programming in a Monte Carlo simulation
#'
#' @description
#' This version of the sample function removes the feature where x is an integer of length 1
#' sampling via sample takes place from 1:x. This can bring unexpected behaviour
#' as mentioned in the documentation of `base::sample()`.
#'
#' @param x a vector that contains value(s) that represents a choiceset.
#' @inheritParams base::sample
#' @export
#'
#' @examples
#'
#' sample_choice(7, 10, replace = TRUE) # equipvalent to rep(7, 10)
#' sample_choice(7, 1)
#' sample_choice(7) # which is equipvalent to the above
sample_choice <- function(x, size = 1, replace = FALSE, prob = NULL) {
  if (length(x) != 1) {
    sample(x = x, size = size, replace = replace, prob = prob)
  } else {
    if (replace || size == 1) {
      rep(x, size)
    } else {
      stop("cannot take a sample larger than the population when 'replace = FALSE'")
    }
  }
}

#' Discrete choice sampling
#'
#' @description
#'
#' A modified sampling function based on `base::sample`. It always assume that
#' that `x` argument contains discrete values. For example, if a number 8 is given,
#' it will assume that 8 is the only choice it has and not assume that the choices
#' are number 1 to 8, like `base::sample` does. This feature makes it safe when the
#' choices are discrete numbers and use in a programmatic way.
#'
#' @param x a vector that contains value(s) that represents a choiceset.
#' @inheritParams base::sample
#'
#' @return returns a vector of the same type as `x` with length of `size`.
#' @export
#'
#' @examples
#'
#' dsample(7, 10, replace = TRUE) # equivalent to rep(7, 10)
#' dsample(7, 1)
#' dsample(7) # which is equivalent to the above
dsample <- function(x, size = 1, replace = FALSE, prob = NULL) {
  if (length(x) != 1) {
    sample(x = x, size = size, replace = replace, prob = prob)
  } else {
    if (replace || size == 1) {
      rep(x, size)
    } else {
      stop("cannot take a sample larger than the population when 'replace = FALSE'")
    }
  }
}

#' Condense rows
#'
#' @description
#' Condense two vectors - where the first vector is an id vector and the second
#' vector is an target (which must be unique) -  into a data.table with two
#' columns (id and target).
#'
#' @param id (`integer()`) a integer vector contains ids.
#' @param target (`integer()`) a integer vector contains values with the same length as `id`.
#' @param names (Default as `NULL`)
#'
#' @return a data.table with two columns, `id` and `target`.
condense_rows <- function(id, target, names = NULL) {
  checkmate::assert(
    checkmate::check_integerish(id, any.missing = FALSE, null.ok = FALSE),
    checkmate::check_integerish(target, any.missing = FALSE, null.ok = FALSE),
    checkmate::check_true(length(id) == length(target)),
    combine = "and"
  )

  if (is.numeric(id)) {
    id <- as.integer(id)
  }

  if (is.numeric(target)) {
    target <- as.integer(target)
  }

  dt <- data.table(id = id, target = target)
  dt <- dt[, .(target = list(target)), by = id]

  if (!is.null(names)) {
    stopifnot(length(names) == ncol(dt))
    names(dt) <- names
  }

  return(dt)
}


#' @title element_wise_expand_lists
#'
#' @description expand two lists into a dataframe. l1 and l2 can be a nested list.
#'
#' @param l1 first list
#' @param l2 second list
#'
#' @return data.frame with two columns, Var1 and Var2.
#' @export
#'
#' @examples
#' l1 <- list(1, 2, 3, 4)
#' l2 <- list(1, 2, 3, 4)
#' element_wise_expand_lists(l1, l2)
#'
#' l1 <- list(1, 2, 3, 4)
#' l2 <- list(1, 2, 3, c(1, 3))
#' element_wise_expand_lists(l1, l2)
element_wise_expand_lists <- function(l1, l2) {
  stopifnot(length(l1) == length(l2))
  as.data.table(do.call(rbind, Map(expand.grid, l1, l2)))
}


#' Look up and replace values in columns, including list columns.
#'
#' Replace all values in a data.frame using a lookup table. Maximum of one list
#' column can be include in `cols_to_lookup`.
#'
#' @param data a data.table/data.frame to be replaced.
#' @param lookup_table a data.table/data.frame contains two columns which must be
#' named as `.key` and `value`. The reason for requiring a dot infront of `key`
#' is that `key` is a reserved column name in `data.table` meaning no columns
#' can be named `key`. Also the `.key` column should have the same data type as
#' `cols_to_lookup` columns.
#' @param cols column names to be lookup and replace
#' @param id_col a column with unique value to be used in unnesting `data`. This
#' is not required if `cols_to_lookup` doesn't contain any list columns.
#'
#' @return data.table with new values
#'
#' @export
lookup_and_replace <- function(data, lookup_table, cols, id_col = NULL) {
  # checks
  checkmate::assert(
    checkmate::check_data_frame(data),
    checkmate::check_names(names(data), must.include = cols),
    checkmate::check_data_frame(lookup_table, max.cols = 2),
    checkmate::check_names(names(lookup_table), subset.of = c(".key", "key", "value")),
    combine = "and"
  )

  # transform data
  if (!is.data.table(data)) {
    data <- as.data.table(data)
  }

  if (!is.data.table(lookup_table)) {
    lookup_table <- as.data.table(lookup_table)
  }

  if ("key" %in% names(lookup_table)) {
    data.table::setnames(lookup_table, "key", ".key")
  }

  all_list_cols <- names(data)[sapply(data, class) == "list"]
  data <- copy(data)

  data_emptied <- copy(data)[0, ]

  # replace values
  list_cols <- all_list_cols[all_list_cols %in% cols]
  non_list_cols <- cols[!cols %in% list_cols]

  if (length(list_cols) != 0 & is.null(id_col)) {
    stop("`id_col` must be provided if `list_col` is not emptied.")
  }
  if (length(list_cols) == 1) {
    data <-
      .lookup_and_replace_list_cols(data, lookup_table, id_col, list_cols)
  }

  data <-
    .lookup_and_replace_non_list_cols(data, lookup_table, non_list_cols)

  stopifnot(identical(data[0, ], data_emptied[0, ]))
  return(data)
}


#' Look up and replace values in columns.
#'
#' @description
#'
#' Replace keys with their values.
#'
#' @param x a data.frame
#' @param cols a character vector of column names in `x` to be replaced.
#' @param mapping a data.frame contains two columns which are `.key` and `.value`
#'
#' @return a data.frame
#' @export
lookup_and_replace2 <- function(x, cols, mapping) {
  checkmate::assert_data_frame(x)
  checkmate::assert_character(cols)
  checkmate::assert_names(names(x), must.include = cols)
  checkmate::assert_data_frame(mapping, min.rows = 1, min.cols = 2)
  checkmate::assert_names(names(mapping), must.include = c(".key", ".value"))

  if (!is.data.table(x)) {
    x <- as.data.table(x)
  }

  if (!is.data.table(mapping)) {
    mapping <- as.data.table(mapping)
  }

  # preserve for validation
  x_col_order <- names(x)
  x_str <- x[0, ]

  for (col in cols) {

    # check that the data type in key and col are matched
    if (typeof(x[["col"]]) != typeof(mapping[["value"]])) {
      stop(paste0("Type of '", col, "' in `x` and '.key' in `mapping` are not matched."))
    }

    # merge new values
    x <-
      merge(
        x = x,
        y = mapping,
        by.x = col,
        by.y = ".key",
        all.x = TRUE,
        sort = FALSE,
        allow.cartesian = FALSE
      )

    # check
    if (sum(is.na(x[[col]])) != sum(is.na(x[[".value"]]))) {
      stop(paste0(
        "Some entries in the '", col, "' column couldn't ",
        "find a mapping value in `mapping`."
      ))
    }

    # final cleansing
    x %>%
      .[, c(col) := NULL] %>%
      data.table::setnames(., old = ".value", new = col)
  }

  # preserve the original column order
  data.table::setcolorder(x, x_col_order)

  # don't check keys
  data.table::setkey(x_str, NULL)
  data.table::setkey(x, NULL)

  # final checking of column types and data dimiensions
  checkmate::assert_data_table(x)
  checkmate::assert_names(names(x), identical.to = names(x_str))

  return(x)
}

#' .lookup_and_replace_list_cols
#'
#' @description
#'
#' This unnest function keeps
#'
#' @param data data.table
#' @param lookup_table lookup_table
#' @param id_col NA
#' @param list_cols NA
#'
#' @return data
#'
#' @noRd
.lookup_and_replace_list_cols <- function(data, lookup_table, id_col, list_cols) {
  .data <- data
  if (length(list_cols) > 1) {
    stop(
      paste(
        ".lookup_and_replace_list_cols can only work with one",
        "list column right now."
      )
    )
  }
  if (length(names(.data)[sapply(.data, class) == "list"]) == 0) {
    stop("There is no list columns in `data`.")
  }
  if (length(.data[, get(id_col)]) != length(unique(.data[, get(id_col)]))) {
    stop("id_col must be unique!")
  }
  if (!is.data.table(.data)) {
    lookup_table <- as.data.table(.data)
  }
  if (!is.data.table(lookup_table)) {
    lookup_table <- as.data.table(lookup_table)
  }

  .data <-
    unnest_datatable(.data, id_col) %>%
    .[lookup_table,
      on = paste0(list_cols, "==.key"),
      eval((list_cols)) := value
    ] %>%
    # nest the list column back
    .[,
      `:=`(list_col_tmp = .(get(list_cols))),
      by = eval((id_col))
    ] %>%
    .[, eval((list_cols)) := list_col_tmp] %>%
    .[, list_col_tmp := NULL] %>%
    .[, .SD[1], by = eval((id_col))] %>%
    .[, eval((list_cols)) := purrr::map(
      .x = get(list_cols),
      .f = ~ purrr::keep(.x, ~ !is.na(.x))
    )]

  return(.data)
}

.lookup_and_replace_non_list_cols <- function(data, lookup_table, non_list_cols) {
  if (!is.data.table(lookup_table)) {
    lookup_table <- as.data.table(lookup_table)
  }

  # replace values in normal columns
  for (col in non_list_cols) {
    # update on join
    data[lookup_table, on = paste0(col, "==.key"), (col) := value]
  }
  return(data)
}

#' Group a column into a list column and sort by the group-by column
#'
#' @param x :: [data.table::data.table()]
#' @param groupby_col :: `character(1)`
#'   the column to use in 'by' in the data.table `x`.
#' @param group_col :: `character(1)`
#'   the column to be grouped into a list column.
#' @param sort_order :: `vector()`
#'   A vector of ordered values to sort the `groupby_col`. If a value in the
#'   vector is missing from `groupby_col` in `x` its `group_col` will be `NA`.
#'
#' @return [data.table::data.table()] with 2 columns: `sort_col`, `group_col`.
#'
#' @note
#'
#' The returned [data.table::data.table()] should have the same number of rows
#' as the length of `sort_col`.
dt_group_and_sort <- function(x, groupby_col, group_col, sort_order) {
  checkmate::assert_data_table(x)
  stopifnot(groupby_col %in% names(x))
  stopifnot(group_col %in% names(x))
  checkmate::expect_integerish(sort_order, unique = T, lower = 0, min.len = 1)

  # group and sort
  x_new <-
    x %>%
    .[, .(group_col = list(get(group_col))), by = groupby_col] %>%
    merge(
      x = data.table(sort_col = sort_order), y = .,
      by.x = "sort_col", by.y = groupby_col, all.x = T, sort = FALSE
    )

  #' In short, the if-statement makes sure that all emptied `group_col` will be `NA` not `NULL`
  #'
  #' For cases where some `sort_order` are not in `groupby_col` of `x`
  #' those elements in `sort_order` will have their values in `groupby_col` as `NULL`
  if (!any(is.na(x[[group_col]])) | any(!sort_order %in% x[[groupby_col]])) {
    x_new[sapply(group_col, is.null), group_col := NA_integer_]
  }

  stopifnot(nrow(x_new) == length(sort_order))
  x_new
}


.slug <- function(x, ext) {
  x_base <- path_ext_remove(x)
  x_ext <- path_ext(x)
  ext <- if (identical(tolower(x_ext), tolower(ext))) {
    x_ext
  } else {
    ext
  }
  path_ext_set(x_base, ext)
}

.check_file_name <- function(x) {
  if (!checkmate::test_character(x, pattern = "^[a-zA-Z0-9._-]+$", len = 1, any.missing = FALSE, null.ok = FALSE)) {
    stop(glue::glue("'{x}' is not a valid file name. It should contain only \\
                    ASCII letters, numbers, '-', and '_'."))
  }
  return(TRUE)
}

check_names <- function(x, names) {
  if (!checkmate::test_named(x, type = "unique")) {
    stop("Names of 'x' must be unique!")
  }
  nms <- names(x)
  return(names %in% nms)
}

skip_on_not_master <- function(skip_on_not_git = TRUE) {
  if (skip_on_not_git && is.null(get_current_git_branch())) {
    skip("Skip on branch not master")
    return(invisible())
  }

  if (get_current_git_branch() == "master") {
    return(invisible())
  }

  skip("Skip on branch not master")
}

get_current_git_branch <- function() {
  branches <- system("git branch", intern = T)
  current_branch <-
    grep("\\*", branches, value = T) %>%
    gsub("\\*|\\ ", "", .)
  if (length(current_branch) == 0) {
    return(NULL)
  }
  return(current_branch)
}

#' Unnest data.table by list columns
#'
#' !! Note that this is only working when all list columns in the given
#' data.frame are to be unnested. See https://github.com/dymium-org/dymiumCore/issues/79.
#'
#' @param dt :: (`data.frame()`)\cr
#'  a data.frame object.
#' @param cols :: (`character()`)\cr
#'  a character vector denoting columns to be unnested.
#'
#' @return a data.table
#' @export
unnest_dt <- function(dt, cols) {
  if (!is.data.frame(dt)) {
    stop("`dt` must be a data.frame or a data.table.")
  }

  if (sum(sapply(dt, is.list)) != length(cols)) {
    stop("This unnest function only works if all list columns are to be unnested.")
  }

  if (!is.data.table(dt)) {
    dt <- as.data.table(dt)
  }

  clnms <- setdiff(colnames(dt), cols)

  dt <- eval(
    rlang::expr(dt[, lapply(.SD, unlist), by = c(clnms), .SDcols = cols])
  )

  colnames(dt) <- c(as.character(clnms), as.character(cols))

  dt
}

#' unnest data.table by reference column(s).
#'
#' for internal use only. This will be replaced with a more native approach once
#' https://github.com/Rdatatable/data.table/pull/4156 is officially included in
#' data.table.
#'
#' @param dt [data.table::data.table()]\cr
#'  a data.table object.
#' @param by_col (`character(1)`)\cr
#'  A reference column.
unnest_datatable <- function(dt, by_col) {
  stopifnot(is.data.table(dt))
  stopifnot(all(by_col %in% names(dt)))

  # unnest the list column
  # https://stackoverflow.com/questions/34692260/how-to-ungroup-list-columns-in-data-table
  dt[, lapply(.SD, unlist), by = c(by_col)]
}


#' Return the indices of n maximum or minimum values
#'
#' @param x :: (`numeric()`)\cr
#'  an numeric vector
#' @param n :: (`integer(1)`)\cr
#'  number of values to return.
#'
#' @return an `integer()` vector.
#' @export
#'
#' @examples
#'
#' which_max_n(1:4, 2)
#' which_min_n(1:4, 2)
which_max_n <- function(x, n = 1) {
  which(x >= -sort(-x, partial = n)[n])
}


#' @rdname which_max_n
#' @export
which_min_n <- function(x, n = 1) {
  which(x < -sort(-x, partial = n)[n])
}
dymium-org/dymiumCore documentation built on July 18, 2021, 5:10 p.m.