R/utils.R

Defines functions setuppernames setlowernames paste_comma quote_comma break_word proc_time sec_to_hms expand_date emonth bmonth add_mon add_year num2date vseq change_interval change_point nolast minmax_scaler expand.grid.df join set_prop_ set_prop set_cumsum_ set_cumsum set_rowsum_ set_rowsum replace_empty_with_na replace_na_with_empty replace_na_with_zero setcolafter_ setcolafter equal rm_ws trim_ws rm_punct_ rm_punct rm_cols_ rm_cols icol diff_cols factor_cols regmatch_cols match_cols has_missing has_cols has_rows match_class assert_class assert_type load_rds get_files get_options add_folder

Documented in join proc_time

# utils -------------------------------------------------------------------

utils::globalVariables(c(
  "ACCIDENT", "AMI", "ANGINA",
  "BLADDERCANCER", "BORDERLINETUMOR", "BREASTCANCER",
  "CARDITIS", "CANCER", "CEREBROVASCULARDISEASE", "CI10", "CI333", "CIRRHOSIS", "CIS",
  "DECLINE", "DIABETES",
  "GENERALCANCER",
  "HIV", "HYPERTENSION",
  "IHD",
  "PROSTATECANCER", "PSEUDOCANCER",
  "SKINCANCER", "SMALLCANCER", "STROKE",
  "THYROIDCANCER",
  "UTERICANCER",
  "VALVE"
))

add_folder <- function(folder = c("dev", "data", "info", "inst", "output", "R", "raw", "report", "rules")) {
  width <- max(nchar(folder))
  for (i in seq_along(folder)) {
    if (!file.exists(folder[i])) {
      dir.create(folder[i])
      cat(sprintf("%s folder is created\n",
                  str_pad(folder[i], width = width, side = "right")))
    } else {
      cat(sprintf("%s folder is already exists\n",
                  str_pad(folder[i], width = width, side = "right")))
    }
  }
}

get_options <- function(param) {
  opt_names <- names(options())
  sub_names <- opt_names[grepl(param, opt_names)]
  options()[sub_names]
}

get_files <- function(pattern = "", folder = getwd())
  sort(dir(folder)[grepl(pattern, dir(folder))])

load_rds <- function(file, path, refhook = NULL) {
  if (!missing(path))
    file <- paste0(path, "/", file)
  alloc.col(readRDS(file, refhook))
}

assert_type <- function(obj, type) {
  obj_name <- deparse(substitute(obj))
  if (typeof(obj) != type) {
    stop(obj_name, " is not an object of type: '",
      paste0(type, collapse = ", "), "'",
      call. = FALSE)
  }
}

assert_class <- function(obj, class) {
  obj_name <- deparse(substitute(obj))
  if (!inherits(obj, class)) {
    stop(obj_name, " is not an object of class: '",
      paste0(class, collapse = ", "), "'",
      call. = FALSE)
  }
}

match_class <- function(df, dtype) {
  if (missing(dtype))
    stop("no 'dtype' argument", call. = FALSE)
  dtype <- dtypes[match(tolower(dtype),
    c("integer", "numeric", "character", "list", "date"),
    nomatch = 0L)]
  if (length(dtype) == 0)
    stop("invalid 'dtype' argument")
  names(df)[which(tolower(sapply(df, class)) %in% dtype)]
}

has_rows <- function(df) {
  df_name <- deparse(substitute(df))
  if (!nrow(df)) {
    stop("'", df_name, "' doesn't have row(s): ",
      call. = FALSE)
  }
}

has_cols <- function(df, cols) {
  df_name <- deparse(substitute(df))
  df_cols <- colnames(df)
  diff_cols <- setdiff(cols, df_cols)
  if (length(diff_cols) > 0) {
    stop("'", df_name, "' doesn't have column(s): ",
      paste0(diff_cols, collapse = ", "), ".",
      call. = FALSE)
  }
}

has_missing <- function(x) {
  column_name <- deparse(substitute(x))
  if (any(is.na(x))) {
    stop("'", column_name, "' has missing value(s): ",
      call. = FALSE)
  }
}

match_cols <- function(df, cols) {
  colnames(df)[match(cols, colnames(df), 0L)]
}

regmatch_cols <- function(df, pattern) {
  colnames(df)[grepl(pattern, names(df), perl = TRUE)]
}

factor_cols <- function(df, cols) {
  cols <- match_cols(df, vapply(substitute(cols), deparse, "character"))
  df[, (cols) := lapply(.SD, factor), .SDcols = cols]
}

diff_cols <- function(df, cols) {
  setdiff(colnames(df), cols)
}

icol <- function(df, cols) {
  has_cols(df, cols)
  unlist(lapply(unique(cols), function(x) which(colnames(df) == x)))
}

rm_cols <- function(df, cols) {
  cols <- match_cols(df, vapply(substitute(cols), deparse, "character"))
  df[, `:=`((cols), NULL)]
}

rm_cols_ <- function(df, cols) {
  cols <- match_cols(df, cols)
  df[, `:=`((cols), NULL)]
}

rm_punct <- function(df, cols, pattern = "(?!\\*)[[:punct:]]") {
  cols <- match_cols(df, vapply(substitute(cols), deparse, "character"))
  df[, `:=`((cols), lapply(.SD, function(x) gsub(pattern, "", x, perl = TRUE))), .SDcols = cols]
}

rm_punct_ <- function(df, cols, pattern = "(?!\\*)[[:punct:]]") {
  cols <- match_cols(df, cols)
  df[, `:=`((cols), lapply(.SD, function(x) gsub(pattern, "", x, perl = TRUE))), .SDcols = cols]
}

trim_ws <- function(x, ws = "[ \t\r\n]", perl = FALSE) {
  re <- sprintf("^%s+|%s+$", ws, ws)
  gsub(re, "", x, perl = perl)
}

rm_ws <- function(df, ws = "[ \t\r\n]", perl = FALSE) {
  col <- names(sapply(df, class)[sapply(df, class) == "character"])
  if (length(col) > 0)
    df[, (col) := lapply(.SD, trim_ws, ws = ws, perl = perl), .SDcols = col]
}

equal <- function(x, y) {
  if (any(colnames(x) != colnames(y)))
    stop("different column names")
  sapply(colnames(x), function(col) all(x[[col]]==y[[col]]))
}

setcolafter <- function(df, cols, after = NA) {
  cols  <- match_cols(df, vapply(substitute(cols) , deparse, "character"))
  after <- match_cols(df, vapply(substitute(after), deparse, "character"))
  all_cols <- colnames(df)
  cols_pos <- sapply(cols, function(x) which(all_cols == x), USE.NAMES = FALSE)
  rest_pos <- which(!all_cols %in% cols)
  if (missing(after)) {
    neworder <- c(cols_pos, rest_pos)
  } else {
    after_pos <- which(all_cols == after)
    head_order <- rest_pos[rest_pos <= after_pos]
    tail_order <- rest_pos[rest_pos  > after_pos]
    new_order <- c(head_order, cols_pos, tail_order)
  }
  new_cols <- all_cols[new_order]
  setcolorder(df, new_cols)
}

setcolafter_ <- function(df, cols, after = NA) {
  all_cols <- colnames(df)
  cols_pos <- sapply(cols, function(x) which(all_cols == x), USE.NAMES = FALSE)
  rest_pos <- which(!all_cols %in% cols)
  if (missing(after)) {
    neworder <- c(cols_pos, rest_pos)
  } else {
    after_pos <- which(all_cols == after)
    head_order <- rest_pos[rest_pos <= after_pos]
    tail_order <- rest_pos[rest_pos  > after_pos]
    new_order <- c(head_order, cols_pos, tail_order)
  }
  new_cols <- all_cols[new_order]
  setcolorder(df, new_cols)
}

replace_na_with_zero <- function(df) {
  class <- sapply(df, class)
  cols <- names(class)[which(class %in% c("numeric", "integer"))]
  df[, (cols) := lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = cols]
}
replace_na_with_empty <- function(df) {
  class <- sapply(df, class)
  cols <- names(class)[which(class == "character")]
  df[, (cols) := lapply(.SD, function(x) ifelse(is.na(x), "", x)), .SDcols = cols]
}
replace_empty_with_na <- function(df) {
  class <- sapply(df, class)
  cols <- names(class)[which(class == "character")]
  df[, (cols) := lapply(.SD, function(x) ifelse(x == "", NA, x)), .SDcols = cols]
}

set_rowsum  <- function(df, cols, name = "sum") {
  cols <- match_cols(df, vapply(substitute(cols), deparse, "character"))
  set(df, j = name, value = apply(df[, ..cols], 1, sum))
}
set_rowsum_ <- function(df, cols, name = "sum") {
  set(df, j = name, value = apply(df[, ..cols], 1, sum))
}
set_cumsum  <- function(df, group_var, value_var, prefix = "c") {
  group_var <- match_cols(df, vapply(substitute(group_var), deparse, "character"))
  value_var <- match_cols(df, vapply(substitute(value_var), deparse, "character"))
  cols <- sprintf("%s%s", prefix, value_var)
  df[, (cols) := lapply(.SD, cumsum), group_var, .SDcols = value_var]
}
set_cumsum_ <- function(df, group_var, value_var, prefix = "c") {
  cols <- sprintf("%s%s", prefix, value_var)
  df[, (cols) := lapply(.SD, cumsum), group_var, .SDcols = value_var]
}
set_prop  <- function(df, cols, sum_col) {
  cols <- match_cols(df, vapply(substitute(cols), deparse, "character"))
  sum_col <- match_cols(df, vapply(substitute(sum_col), deparse, "character"))
  name <- sprintf("%s_prop", cols)
  df[, (name) := lapply(.SD, function(x) x / get(sum_col)), .SDcols = cols]
}
set_prop_ <- function(df, cols, sum_col) {
  name <- sprintf("%s_prop", cols)
  df[, (name) := lapply(.SD, function(x) x / get(sum_col)), .SDcols = cols]
}


#' Merge data frames
#'
#' This function merges several data frames at once
#' @param ... data frames, or objects to be coerced to one.
#' @param by specifications of the columns used for merging.
#' @param all logical; all = L is shorthand for all.x = L and all.y = L, where L is either TRUE or FALSE.
#' @param all.x logical; if TRUE, then extra rows will be added to the output, one for each row in x that has no matching row in y. These rows will have NAs in those columns that are usually filled with values from y. The default is FALSE, so that only rows with data from both x and y are included in the output.
#' @param all.y logical; analogous to all.x.
#' @param sort  logical. Should the result be sorted on the by columns?
join <- function(..., by, all = FALSE, all.x = all, all.y = all, sort = TRUE) {
  Reduce(function(...) merge(..., by = by, all = all, all.x = all.x, all.y = all.y, sort = sort), list(...))
}

expand.grid.df <- function(...) Reduce(function(...) merge(..., by = NULL), list(...))

minmax_scaler <- function(x) if (length(x) > 1) (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) else 0

# vector ------------------------------------------------------------------

nolast <- function(x) x[-length(x)]

change_point <- function(x) which(x[-1L] != x[-length(x)]) + 1

change_interval <- function(x) diff(c(1, change_point(x), length(x)+1))

vseq <- function(from, to, by = 1L) {
  if (length(from) != length(to))
    stop("Two vectors have a different length.")
  lapply(seq_along(from), function(x) seq(from[x], to[x], by))
}

# date --------------------------------------------------------------------

num2date <- function(x) as.Date(x, origin = "1970-01-01")
add_year <- function(date, year) {
  date <- as.POSIXlt(date)
  date$year <- date$year + year
  as.Date(date)
}
add_mon <- function(date, mon) {
  date <- as.POSIXlt(date)
  date$mon <- date$mon + mon
  as.Date(date)
}
bmonth <- function(x) as.Date(format(as.Date(x), format = "%Y-%m-01"))
emonth <- function(x) add_mon(x, 1L) - 1L
expand_date <- function(from, to) {
  if (inherits(from, "character"))
    from <- as.Date(from)
  if (inherits(to, "character"))
    to <- as.Date(to)
  num2date(.Call(vuw_expand_date, from, to)) # not a unique value
}

# time --------------------------------------------------------------------

sec_to_hms <- function(sec) {
  h <- sec %/% (60^2)
  r <- sec %%  (60^2)
  m <- r   %/% (60)
  s <- r   %%  (60)
  d <- s - trunc(s)
  d <- substr(sub("0.", "", as.character(d)), 1, 5)
  cat(sprintf("%02d:%02d:%02d.%s\n", h, m, trunc(s), d))
}
#' Simple running time of {vuw} package
#'
#' @param expr Valid \R expression to be timed
#' @examples
#' proc_time(df <- iris)
#' @export
proc_time <- function(expr) {
  stime <- as.numeric(Sys.time())
  eval(expr)
  etime <- as.numeric(Sys.time())
  sec_to_hms(etime - stime)
}

# word --------------------------------------------------------------------

break_word <- function(x, len = 15, collapse = "\n") {
  n <- ceiling(nchar(x) / len)
  v <- vector(mode = "character", length = length(n))
  for (j in 1:length(v)) {
    w <- vector(mode = "character", length = n[j])
    for (i in 1:length(w)) {
      w[i] <- substr(x[j], len*(i-1)+1, len*i)
    }
    v[j] <- paste0(w, collapse = collapse)
  }
  v
}

# comma utils -------------------------------------------------------------

# quote_comma(x, y, z)
# paste_comma(c("x", "y", "z"))

quote_comma <- function(..., newline = FALSE) {
  if (newline) {
    cat(paste0("\"", paste(vapply(substitute(list(...)), deparse, "character")[-1L], collapse = '",\n"'), "\""))
  } else {
    cat(paste0("\"", paste(vapply(substitute(list(...)), deparse, "character")[-1L], collapse = '", "'), "\""))
  }
}

paste_comma <- function(x, newline = FALSE) {
  if (newline) {
    cat(paste0("\"", paste(x, collapse = '",\n"'), "\""))
  } else {
    cat(paste0("\"", paste(x, collapse = '", "'), "\""))
  }
}

# setnames ----------------------------------------------------------------

setlowernames <- function(x) setnames(x, colnames(x), tolower(colnames(x)))
setuppernames <- function(x) setnames(x, colnames(x), toupper(colnames(x)))
seokhoonj/vuw documentation built on Jan. 30, 2024, 11:36 a.m.