R/z_deprecated.R

Defines functions getPackages cb_apply has_name make_names CBapply SPapply round_df switchv geojoin ORGetter tableSummary tableTest

Documented in cb_apply CBapply geojoin getPackages ORGetter round_df SPapply switchv tableSummary tableTest

#' getPackages
#'
#' This function takes a package and returns a list of its dependencies.  Good for downloading source files of packages to install on a R server where internet access is blocked.
#' @param packs a quoted package name or list of package names
#' @keywords CBapply
#' @export
#' @examples
#' \dontrun{
#' # use this to get specifically named packages and their dependencies:
#' packages <- getPackages('pbapply')
#' # use this to get all packages installed on local machine and their dependencies:
#' # packages <- getPackages(row.names(installed.packages()))
#' # then download the packages:
#' download.packages(packages, destdir='.',type='source')
#' }

getPackages <- function(packs){
    .Deprecated("just... ...no")
    packages <- unlist(
        tools::package_dependencies(packs, utils::available.packages(),
                                    which=c("Depends", "Imports", "LinkingTo"), recursive=TRUE)
    )
    packages <- union(packs, packages)
    packages
}

#' Function designed to handle anything that lapply can but can specify parallel
#' processing, progress bars, output format and more.
#'
#' Ideally, a function that returns a data.frame should be supplied. This gives
#' the user the advantage of specifying the names of the columns in the
#' resulting data.frame.  If the function does not return a data.frame, then
#' column names will be automatically generated.
#'
#' Use \code{.id} to control the designation of which input generate which
#' output. Set to \code{NULL} to suppress naming. By default, output lists will
#' be named and output data.frame will have an added column named \code{id}. The
#' name of this inserted column can be changed by specifying a character string.
#' Alternatively, a vector of character strings can be used to manually identify
#' the output (called \code{id} if in a data.frame). Names will be autogenerated
#' even if the input object has incomplete names or no names at all. Note that
#' this also works with functions that return a data.frame with more than one
#' row.
#'
#' Parallel processing is carried out by \code{pbapply::mclapply}. Use the
#' \code{parallel} option to switch parallel processing on or off. Only specify
#' the number of cores when really needed as the function will detect the
#' maximum number of available cores.  This makes it easy to rerun the script
#' with a higher number of available cores without having to change the code.
#'
#' A progress bar can be shown in the terminal using an interactive R session or
#' in an .Rout file, if using R CMD BATCH and submitting R scripts for
#' non-interactive completion. Although R Studio supports the progress bar for
#' single process workers, it has a problem showing the progress bar if using
#' parallel processing (see the discussion at
#' http://stackoverflow.com/questions/27314011/mcfork-in-rstudio). In this
#' specific case (R Studio + parallel processing), text updates will be printed
#' to the file `.process`. Use a shell and `tail -f .progress` to see the
#' updates.
#'
#' @param X List of objects to apply over
#' @param FUN. Function to apply; allows for compact anonymous functions (see
#'   ?purrr::as_function) for details
#' @param output Output type. Defaults to 'data.frame', but can also be set to
#'   'list' to suppress rbinding of the list.
#' @param parallel logical; use parallel processing?
#' @param num.cores The number of cores used for parallel processing.  Can be
#'   specified as an integer, or it will guess the number of cores available
#'   with detectCores(). If parallel is FALSE, the input here will be set to 1.
#' @param cache (defaults to FALSE) cache the results locally in a folder called "cache" using the memoise package
#' @param error.na (defaults to TRUE) use purrr::possibly to replace errors with NA instead of interrupting the process
#' @param fill (defaults to TRUE) use plyr::rbind.fill to fill in missing
#'   columns when rbinding together results
#' @param .id controls add identification of the output object based on the
#'   input object; see details
#' @param pb logical; use progress bar?
#' @param ... Additional arguments to the function
#' @export
#' @examples
#' \dontrun{
#' X <- as.data.frame(matrix(runif(100),ncol=10))
#'
#' fun. <- function(x) {
#'    Sys.sleep(0.5)
#'    mean(x)
#' }
#'
#' cb_apply(X,fun.,cache=TRUE)
#'
#' fun. <- function(x) {
#'   Sys.sleep(0.5)
#'   data.frame('mean'=mean(x),'median'=median(x))
#' }
#'
#' cb_apply(X,fun.)
#'
#' # when setting names of input object, function will attempt to assign them to
#' # the output in a new column
#' names(X) <- LETTERS[1:10]
#' cb_apply(X,fun.,output='list')
#' cb_apply(X,fun.)
#' # name the id columns something else
#' cb_apply(X,fun.,.id='group')
#' # specify a new identifier manually
#' cb_apply(X,fun.,.id=LETTERS[11:20])
#' # set .id to NULL to supress the addition of the id columns
#' cb_apply(X,fun.,.id=NULL)
#' # naming still works even if the function returns a data.frame with two rows
#' fun. <- function(x) {
#'   Sys.sleep(0.5)
#'   data.frame('stat'=c(mean(x),median(x)))
#' }
#' cb_apply(X,fun.)
#' }

cb_apply <- function(X,FUN.,fill=TRUE,.id='id',output='data.frame',
                     pb=TRUE,parallel=FALSE,cache=FALSE,error.na=TRUE,num.cores=NULL,...){

  .Deprecated('mappp','CB','CBapply is deprecated but is maintained here for backwards compatability.\nConsider using CB::mappp instead.')

  stopifnot(output %in% c('data.frame','list'),
            num.cores > 0 | is.null(num.cores))

    FUN <- purrr::as_mapper(FUN.)

  if (cache) {
    fc <- memoise::cache_filesystem('cache')
    FUN <- memoise::memoise(FUN,cache=fc)
  }

  if (error.na) FUN <- purrr::possibly(FUN,otherwise=NA,quiet=FALSE)

  n <- length(X)
  if (!is.vector(X) || is.object(X)) X <- as.list(X)

  # set number of cores
  if (parallel) {
    num.cores <- ifelse(is.null(num.cores),
                        parallel::detectCores(TRUE),
                        num.cores)
    if (is.na(num.cores)) num.cores <- 1
  }
  if(!parallel) num.cores <- 1

   # non parallel progress bar
  if (pb & (num.cores==1)) {
    tmp <- vector('list', n)
    pbb <- progress::progress_bar$new(total=100,
                                      format='...  :what (:percent)   [ ETA: :eta | Elapsed: :elapsed ]',
                                     clear=FALSE,force=TRUE,show_after=0)
    pbb$tick(0)
    for (i in 1:n) {
      pbb$tick(len=100/n,tokens = list(what = paste0('processing ',i,' of ',n)))
      tmp[[i]] <- FUN(X[[i]],...)
    }
  }

  # parallel with progress bar
  if (pb & (num.cores > 1)) {
    if (Sys.getenv("RSTUDIO") == "1") {
      message("progress bar doesn't work in RStudio!\n... follow the file \".progress\" instead")
      wrapFUN <- function(i,...) {
        out <- FUN(X[[i]],...)
        out.percentage <- round(i/n*100,digits=0)
        cat(paste0('   ... processing ',i,' of ',n,' (',out.percentage,'%)','\n'),
            file='.progress',append=FALSE)
        return(out)
      }
      tmp <- parallel::mclapply(1:n,wrapFUN,...,mc.cores=num.cores)
    } else {
      tmp <- mclapply_pb(X,FUN,...,mc.cores=num.cores)
    }
  }

  # no progress bar
  if ((!pb) & (num.cores > 1)) tmp <- parallel::mclapply(X,FUN,mc.cores=num.cores,...)
  if ((!pb) & (num.cores == 1)) tmp <- lapply(X,FUN,...)

  # test for and make names if .id is not NULL
  if (!is.null(.id)) {

    hn <- has_name(X)

    # if has_name returns all TRUE, use names
    if (all(hn)) nms <- names(X)

    # if has_name returns all FALSE, create names
    if (all(!hn)) nms <- make_names(1:n)

    # if has_name returns some TRUE, create names where needed
    if ( (sum(hn)) > 0 & (sum(hn) < n) ) nms <- make_names(X)

    # if .id provided as a vector, use that instead
    if (length(.id) > 1){
      stopifnot(length(.id)==n) # only use if vector of names is the right length
      nms <- .id
      .id <- 'id'
    }
  }

  # name the list only if .id is not NULL
  if (output=='list' & !is.null(.id)) names(tmp) <- nms


  # fill function and insert id column in data.frame
  if (output=='data.frame') {
    tmp.df.list <- lapply(tmp,as.data.frame)
    fillFUN <- ifelse(fill,plyr::rbind.fill,rbind)
    tmp <- do.call(fillFUN,tmp.df.list)
    if (!is.null(.id)) tmp[ ,.id] <- rep(nms,sapply(tmp.df.list,nrow))
  }

  return(tmp)
}


has_name <- function(x) {
  # stolen from https://github.com/hadley/modelr/blob/master/R/utils.R#L25
  nms <- names(x)
  if (is.null(nms)) {
    rep(FALSE, length(x))
  } else {
    !(is.na(nms) | nms == "")
  }
}


make_names <- function(x) {
  which.named <- has_name(x)
  new.names <- make.names(1:sum(!which.named))
  out.names <- vector('character',length(x))
  out.names[which.named] <- names(x)[which.named]
  out.names[!which.named] <- new.names
  return(out.names)
}

#' This function is a wrapper for sapply with simplify=FALSE and USE.NAMES=TRUE. It then rbinds via do.call to return data.frame. In order for the names to work properly, a function that returns a data.frame must be used (see example).
#' @param X List of objects to apply over
#' @param FUN Function to apply
#' @param output Output type. Defaults to 'data.frame', but can also be set to 'list' to suppress rbinding of the list.
#' @param num.cores Defaults to 1 and the base 'sapply' is used. If set to greater than one, then it is the number of cores used in parallel::mclapply().
#' @param fill (defaults to FALSE) use plry::rbind.fill to fill in missing columns
#' @param ... Additional arguments to the function
#' @keywords CBapply
#' @export
#' @examples X <- as.data.frame(matrix(runif(100),ncol=10))
#' names(X) <- LETTERS[1:10]
#' # CBapply(X,mean) # <- will return error
#' # function must return a data.frame with named columns for column names to work
#' CBapply(X,function(x) data.frame('mean'=mean(x)))

CBapply <- function(X,FUN,output='data.frame',fill=FALSE,num.cores=1,...) {
  .Deprecated('cb_apply','CB','CBapply is deprecated but is maintained here for backwards compatability.\nConsider using CB::cb_apply instead.','CBapply')
  if (! output %in% c('data.frame','list')) stop('output must be specified as "data.frame" or "list"')
  if (num.cores == 1) tmp <- sapply(X,FUN,simplify=FALSE,USE.NAMES=TRUE,...)
  if (! num.cores == 1) tmp <- parallel::mclapply(X,FUN,mc.cores=num.cores,...)
  if (output=='data.frame') {
    if (fill) {
      tmp <- lapply(tmp,as.data.frame)
      rtn <- do.call(plyr::rbind.fill,tmp)
      try({
        row.names(rtn) <- names(tmp)
        },silent=TRUE)
    }
    if (!fill) {
      rtn <- do.call(rbind,tmp)
      try({
        row.names(rtn) <- names(tmp)
      },silent=TRUE)
    }
  }
  if (output=='list') rtn <- tmp
  return(rtn)
}

#' SPapply
#'
#' *apply function for spatial point objects
#' @param sp.object a SpatialPoints or SpatialPointsDataFrame object
#' @param FUN. function to be applied; must take sp.object as first argument and must return a data.frame
#' @param ... additional arguments passed to function
#' @param id.row.names if \code{TRUE}, set row.names of output data.frame from \code{data$id} of \code{sp.object}
#' @param progress.bar logical, show progress bar?
#'
#' @return data.frame of all results from function applied to sp.object
#' @export

SPapply <- function(sp.object,FUN.,...,progress.bar=TRUE,id.row.names=FALSE) {
    .Deprecated(msg = "why aren't you using sf yet?")
  stopifnot(class(sp.object) %in% c('SpatialPoints','SpatialPointsDataFrame','SpatialPolygonsDataFrame','SpatialPolygons'))
  N. <- length(sp.object)
  FUN <- match.fun(FUN.)
  if (progress.bar) pb <- pbapply::startpb(0,N.)
  out <- vector('list',N.)
  for (j in 1:N.) {
    out[[j]] <- FUN(sp.object[j, ],...)
    if(progress.bar) pbapply::setpb(pb,j)
  }
  if (progress.bar) close(pb)
  out <- do.call(rbind,out)
  if(id.row.names) row.names(out) <- as.character(sp.object@data$id)
  return(out)
}

#' round all numeric columns in a data.frame
#'
#' @param x data.frame
#' @param digits number of digits after the decimal place
#'
#' @return rounded data.frame
#' @export
round_df <- function(x, digits=2) {
    .Deprecated("just... ...no")
    numeric_columns <- sapply(x, class) == 'numeric'
    x[numeric_columns] <-  round(x[numeric_columns], digits)
    x
}
# switchv
#' Vectorized version of switch (stolen from @kbroman)
#'
#' Vectorized version of \code{\link[base]{switch}}: just loops over
#' input and calls \code{\link[base]{switch}}.
#'
#' @param EXPR An expression evaluating to a vector of numbers of strings
#' @param ... List of alternatives
#'
#' @return Vector of returned values.
#'
#' @examples
#' \dontrun{
#' switchv(c("horse", "fish", "cat", "bug"),
#'         horse="fast",
#'         cat="cute",
#'         "what?")
#' }
#'
#' @export
switchv <- function(EXPR, ...) {
    .Deprecated("use 'case_when()` instead")
    result <- EXPR

    for(i in seq(along=result))
        result[i] <- switch(EXPR[i], ...)

    result
}

#' left join a data.frame to a spatial data frame
#'
#' modified from tigris::geo_join to not check data.frame names or coerce strings to factors
#'
#' @param spatial_data a spatial data frame
#' @param data_frame a data frame
#' @param by_sp column id for merge
#' @param by_df column id for merge
#'
#' @return spatial data frame
#' @export
geojoin <- function(spatial_data,data_frame,by_sp,by_df) {
    .Deprecated(msg = "why aren't you using sf yet?")
    spatial_data@data <- data.frame(spatial_data@data,
                                    data_frame[match(spatial_data@data[[by_sp]],
                                                     data_frame[[by_df]]), ],
                                    check.names=FALSE,stringsAsFactors=FALSE)
                                        # remove duplicated column IDs if joined on same name variable
    if (by_sp == by_df) spatial_data@data[, !duplicated(names(spatial_data@data), fromLast = TRUE)]
    return(spatial_data)
}

#' Retreive Odds Ratio Table from Logistic GLM Objects
#'
#' This function returns a data.frame of the odds ratios and their 95\% confidence intervals.
#' @param logistic.glm A logistic GLM R object.  If not an object of 'glm' and 'lm', it will stop with an error.
#' @param sig.star Will return an extra column with a star if the confidence interval does not contain 1. Defaults to TRUE.
#' @param show.intercept Will show the intercept and its confidence interval only if set to TRUE. Defaults to FALSE.
#' @param digits Number of digits to round table
#' @export
#' @keywords CB OddsRatio
#' @examples
#' \dontrun{ x1 <- rnorm(100)
#' x2 <- rnorm(100)
#' y <- rbinom(100,1,prob=0.3)
#' logistic.model <- glm(y ~ x1 + x2,family='binomial')
#' ORGetter(logistic.model)
#' }

ORGetter <- function(logistic.glm,digits=2,sig.star=TRUE,show.intercept=FALSE){
  .Deprecated(msg='tableSummary is deprecated but is maintained here for backwards compatability.\nConsider using tidyverse instead.')
  stopifnot(class(logistic.glm)==c('glm','lm'))
  OR <- stats::coef(logistic.glm)
  CI <- suppressMessages(stats::confint(logistic.glm))
  tmp <- exp(data.frame(OR,CI))
  names(tmp) <- c('odds ratio','lower.CI','upper.CI')
  tmp <- round(tmp,digits=digits)
  if (!show.intercept) tmp <- tmp[-1, ]
  tmp$sig <- ifelse((tmp$lower.CI > 1) & (tmp$upper.CI > 1)
                    | (tmp$lower.CI < 1) & (tmp$upper.CI < 1)
                    ,' * ','   ')
  if (!sig.star) tmp$sig <- NULL
  return(tmp)
}

#' Summary Table
#'
#' This function summarizes numerical and dichotomous variables only.  The summary number is either the mean of a numeric variable for the number and percentage of values that are the second of the two factors in a dichotomous variable.  Missing values are removed before the summary statistic is calculated and the numer of missing observations is also presented in the table.
#' @param x Vector of data which to summarize.  Should be used for numerical and dichotomous variables only.
#' @param digits.mean The mean is rounded and displayed using this many digits.
#' @param digits.percentage The percentage is rounded and displayed using this many digits.
#' @keywords CB summary table
#' @export
#' @examples
#' \dontrun{
#' X <- data.frame('some.continuous'=runif(300),'some.factor'=factor(rbinom(300,1,0.3)))
#' tableSummary(X$some.continuous)
# 'tableSummary(X$some.factor)
#' # use CBapply to create a table
#' CBapply(X,tableSummary)
#' # specify the digits differently to change the display of the table
#' CBapply(X,tableSummary,digits.mean=3,digits.percentage=2)
#' }


tableSummary <- function(x,digits.mean=2,digits.percentage=0){
  .Deprecated(msg='tableSummary is deprecated but is maintained here for backwards compatability.\nConsider using tidyverse instead.')
  stopifnot(class(x) %in% c("factor","numeric","integer",'ordered'))
  tmp <- stats::na.omit(x)
  number.missing <- length(x) - length(tmp)
  if (class(x) %in% c('numeric','integer')){
    out <- paste0(round(mean(tmp),digits=digits.mean))
    out.round <- round(mean(tmp),digits=digits.mean)
    out <- paste0(sprintf(paste0('%.',digits.mean,'f'),out.round))
  }
  if(class(x) %in% c('factor','ordered')){
    total <- length(tmp)
    n <- summary(tmp)[[2]]
    perc.round <- round(n/total*100,digits=digits.percentage)
    out <- paste0(n,' (',sprintf(paste0('%.',digits.percentage,'f'),perc.round),'%)')
  }
  return(data.frame('summary'=out,'number NA'=number.missing,check.names=FALSE))
}

#' Table Test
#'
#' This function summarizes and tests the differences of numerical and dichotomous variables only across some factor.  The summary number is either the mean of a numeric variable for the number and percentage of values that are the second of the two factors in a dichotomous variable.  Missing values are removed before the summary statistic, but the number missing is not reported. Furthermore, a p-value is reported testing the differences of the means or counts across the groups factor.  The p-value is derived from an ANOVA for continuous variables or from a chi-squared test via monte-carlo simulation using 100,000 bootstrap replicates.
#' @param x Vector of data which to summarize.  Should be used for numerical and dichotomous variables only.
#' @param group The factor for which to test the x variable across.
#' @param digits.mean The mean is rounded and displayed using this many digits.
#' @param digits.percentage The percentage is rounded and displayed using this many digits.
#' @keywords CB test table
#' @export
#' @examples X <- data.frame('some.continuous'=runif(300),'some.factor'=factor(rbinom(300,1,0.3)))
#' X$some.other.factor <- factor(rbinom(300,1,0.5))
#' tableTest(x=X$some.continuous,group=X$some.other.factor)
#' tableTest(x=X$some.factor,group=X$some.other.factor)
#' CBapply(X[ ,c('some.continuous','some.factor')],tableTest,group=X$some.other.factor)


tableTest <- function(x,group,digits.mean=2,digits.percentage=0){
  .Deprecated(msg='tableSummary is deprecated but is maintained here for backwards compatability.\nConsider using tidyverse instead.')
  stopifnot(class(x) %in% c("factor","numeric","integer",'ordered'))
  stopifnot(class(group) %in% c('factor','ordered'))
  tmp <- data.frame(x,group)
  tmp <- stats::na.omit(tmp)
  if (nrow(tmp) < length(x)) warning('NA values have been omitted')
  if (class(x) %in% c('numeric','integer')){
    p <- round(stats::anova(stats::lm(x ~ group,data=tmp))["Pr(>F)"][[1]][1],digits=3)
    p.text <- ifelse(p<0.001,"< 0.001",paste(p))
    means <- round(tapply(tmp$x,tmp$group,mean,na.rm=TRUE),digits=digits.mean)
    out <- paste0(sprintf(paste0('%.',digits.mean,'f'),means))
  }
  if(class(x) %in% c('factor','ordered')){
    if (!nlevels(tmp$x) == 2) warning('The outcome factor does not have 2 levels, the output of the table is wrong!')
    tbl <- table(tmp$x,tmp$group)
    n <- tbl[2, ]
    p <- round(stats::chisq.test(tbl,simulate.p.value=TRUE,B=100000)$p.value,digits=3)
    p.text <- ifelse(p<0.001,"< 0.001",paste(p))
    perc <- prop.table(tbl,margin=2)[2, ] * 100
    perc.round <- paste0(sprintf(paste0('%.',digits.percentage,'f'),perc))
    out <- paste0(n,' (',perc.round,'%)')
  }
  rtn <- data.frame(t(out),p.text)
  names(rtn) <- c(levels(group),'p-value')
  return(rtn)
}
cole-brokamp/CB documentation built on May 13, 2019, 8:49 p.m.