R/init.R

Defines functions size_objects unique_n summary_x df.dims dfhead row_sums row_means as.Num Init

Documented in as.Num df.dims dfhead Init row_means row_sums size_objects summary_x unique_n

#' Wrapper for calling other init functions
#'
#' @param nrows passed to df.dims()
#' @param ncols passed to df.dims()
#' @param threads passed to data.table::setDTthreads()
#' @param scheduler passed to clustermq_setup()
#' @return NULL
#' @export
Init = function(nrows = 4, ncols = 20, threads = 4,
                scheduler = c("multicore", "sge")){
  require(data.table)
  data.table::setDTthreads(threads)
  require(clustermq)
  clustermq_setup(scheduler=scheduler)
  df.dims(nrows = 4, ncols = 20)
}

#' convert to numeric while avoiding factor conversion issues
#'
#' @param x an iterable
#' @return a numeric object
#' @export
as.Num = function(x){
  as.numeric(as.character(x))
}

#' rowMeans that works inside a dplyr::mutate() call
#' @export
row_means = function(..., na.rm=TRUE){
  rowMeans(cbind(...), na.rm=na.rm)
}

#' rowSums that works inside a dplyr::mutate() call
#' @export
row_sums = function(..., na.rm=TRUE){
  rowSums(cbind(...), na.rm=na.rm)
}

#' A simple dataframe summary
#'
#' @param df dataframe object
#' @param n Number of lines to print
#' @return a dataframe object
#' @export
dfhead = function(df, n=3){
  print(dim(df))
  head(df, n=n)
}

#' Changing number of rows/columns shown when printing a data frame
#'
#' This is most useful for working with IRkernl in Jupyter notebooks
#'
#' @param nrows number of rows to print
#' @param ncols number of columns to print
#' @return NULL
#' @export
df.dims = function(nrows=4, ncols=20){
  options(repr.matrix.max.rows=nrows, repr.matrix.max.cols=ncols)
}

#' Summary for numeric vectors that includes sd and stderr
#'
#' sd = standard deviation
#' stderr = standard error of the mean (sd(x) / sqrt(length(x)))
#'
#' @param x a numeric vector
#' @param label row name label for the output. If NULL, then the label will be the input object label.
#' @param sel_col If "x" is data.table or data.frame, which column to assess?
#' @param rnd number of digits to round sd and stderr to
#' @return a matrix
#' @export
#' @examples
#' summary_x(iris$Sepal.Length)
summary_x = function(x, label=NULL, sel_col=NULL, rnd=3){
  if(is.null(label)){
    label = as.character(deparse(substitute(x)))
    label = gsub('.+\\$', '', label)
  }
  if(any(c('tidytable', 'data.table') %in% class(x))){
    tryCatch({
      sel_col = ggplot2::enexpr(sel_col)
    })
    if(is.null(sel_col)){
      stop('sel_col cannot be NULL for data.table objects')
    }
    x = tidytable::pull.(x, !!sel_col)
  } else if(any(c('data.frame') %in% class(x))){
    tryCatch({
      sel_col = dplyr::enquo(sel_col)
    })
    if(is.null(sel_col)){
      stop('sel_col cannot be NULL for data.frame objects')
    }
    x = dplyr::pull(x, !!sel_col)
  }
  sd_x = sd(x)
  x = as.matrix(summary(x))
  y = matrix(round(sd_x, rnd), dimnames=list('sd', 'V1'))
  z = matrix(round(sd_x / sqrt(length(x)), rnd), dimnames=list('sd_err_of_mean', 'V1'))
  x = t(rbind(rbind(x,y), z))
  row.names(x) = label
  return(x)
}

#' Pretty print number of unique elements in a vector
#'
#' The result will be cat'ed to the screen.
#' tidytable compatable. Maje
#'
#' @param x a vector or data.table. If data.table, sel_col must not be NULL
#' @param label what to call the items in the vector (eg., "samples")
#' @param sel_col If x is data.table or data.frame, which column to assess?
#' @param ret Return the unique values?
#' @returns NULL
#' @export
#' @importFrom dplyr enquo pull
#' @importFrom ggplot2 enexpr
unique_n = function(x, label='items', sel_col=NULL, ret=FALSE){
  if(any(c('tidytable', 'data.table') %in% class(x))){
    tryCatch({
      sel_col = ggplot2::enexpr(sel_col)
    })
    if(is.null(sel_col)){
      stop('sel_col cannot be NULL for data.table objects')
    }
    x = tidytable::distinct.(x, !!sel_col)
    x = tidytable::pull.(x, !!sel_col)
  } else if(any(c('data.frame') %in% class(x))){
    tryCatch({
      sel_col = dplyr::enquo(sel_col)
    })
    if(is.null(sel_col)){
      stop('sel_col cannot be NULL for data.frame objects')
    }
    x = dplyr::pull(x, !!sel_col)
  }
  cat(sprintf('No. of unique %s:', label),
      length(base::unique(x)), '\n')
  if(ret==TRUE){
    return(base::unique(x))
  }
}

#' Determine counts of setdiff, intersect, & union of 2 vectors (or data.tables)
#'
#' The output is printed text of intersect, each-way setdiff, and union.
#' Data.table compatible! Just make sure to provide sel_col_x and/or sel_col_y
#'
#' @param x vector1 or data.table. If data.table, sel_col_x must not be NULL
#' @param y vector2 or data.table. If data.table, sel_col_y must not be NULL
#' @param sel_col_x If x = data.table, which column to assess?
#' @param sel_col_y If y = data.table, which column to assess?
#' @param to_return (depreciated) "counts" = print overlap counts; "diff_x-or-y" = return setdiff; "diff_fuzzy" = return closest matches for those that differ (ordered best to worst)
#' @param diff Alternative to "to_return". "x" or "y" = return setdiff; "int" = intersect, "union" = union
#' @return NULL
#' @export
#' @importFrom dplyr enquo pull
#' @importFrom ggplot2 enexpr
overlap = function (x, y, sel_col_x = NULL, sel_col_y = NULL,
                    to_return = c("counts", "diff_x", "diff_y", "diff_fuzzy"),
                    diff = c(NA, "x", "y", "int", "union", "fuzzy")){
  if (any(c("tidytable", "data.table") %in% class(x))) {
    require(tidytable)
    tryCatch({
      sel_col_x = ggplot2::enexpr(sel_col_x)
    })
    if (is.null(sel_col_x)) {
      stop("sel_col_x cannot be NULL for data.table objects")
    }
    x = tidytable::distinct.(x, !!sel_col_x)
    x = tidytable::pull.(x, !!sel_col_x)
  }
  if (any(c("tidytable", "data.table") %in% class(y))) {
    require(tidytable)
    tryCatch({
      sel_col_y = ggplot2::enexpr(sel_col_y)
    })
    if (is.null(sel_col_y)) {
      stop("sel_col_y cannot be NULL for data.table objects")
    }
    y = tidytable::distinct.(y, !!sel_col_y)
    y = tidytable::pull.(y, !!sel_col_y)
  }
  if (any(c("data.frame") %in% class(x))) {
    tryCatch({
      sel_col_x = dplyr::enquo(sel_col_x)
    })
    if (is.null(sel_col_x)) {
      stop("sel_col_x cannot be NULL for data.frame objects")
    }
    x = dplyr::pull(x, !!sel_col_x)
  }
  if (any(c("data.frame") %in% class(y))) {
    tryCatch({
      sel_col_y = dplyr::enquo(sel_col_y)
    })
    if (is.null(sel_col_y)) {
      stop("sel_col_y cannot be NULL for data.frame objects")
    }
    y = dplyr::pull(y, !!sel_col_y)
  }
  if (to_return[1] == "counts" & is.na(diff[1])) {
    cat("intersect(x,y):", length(base::intersect(x, y)),
        "\n")
    cat("setdiff(x,y):", length(base::setdiff(x, y)), "\n")
    cat("setdiff(y,x):", length(base::setdiff(y, x)), "\n")
    cat("union(x,y):", length(base::union(x, y)), "\n")
  }
  else if (to_return[1] == "diff_x" | (!is.na(diff[1]) & diff[1] == "x")) {
    return(setdiff(x, y))
  }
  else if (to_return[1] == "diff_y" | (!is.na(diff[1]) & diff[1] == "y")) {
    return(setdiff(y, x))
  }
  else if (to_return[1] == "diff_fuzzy" | (!is.na(diff[1])& diff[1] == 'fuzzy')) {
    x = base::setdiff(x, y)
    y = base::setdiff(y, x)
    d = cbind(expand.grid(x, y), as.vector(adist(x, y)))
    colnames(d) = c("x", "y", "dist")
    d = d[d$dist != 0, ]
    return(d[order(d$dist), ])
  }
  else if (!is.na(diff[1]) & diff[1] == "int") {
    return(intersect(x, y))
  }
  else if (!is.na(diff[1]) & diff[1] == "union") {
    return(union(x, y))
  }
  else {
    stop("\"to_return\" value not recognized")
  }
}

#' Returns the sizes of R objects
#'
#' @param Robj Vector with the names of R objects as characters
#' @return A list with the name of R objects as names and the formatted size of the objects
#' @export
size_objects <- function(Robj){
    sapply(Robj,function(x){format(object.size(get(x)), units = 'auto')})
}
leylabmpi/LeyLabRMisc documentation built on Nov. 3, 2022, 3:45 p.m.