R/resampling_functions.R

Defines functions resample_df stratify_df get_key

Documented in resample_df

# find unique values for key
# @param df data frame
# @param key_cols vector of column names. Defaults to all columns
# @details 
# used in stratify_df, however also made available to find
# non-duplicated values across multiple columns 
# when !duplicated(col) is not sufficient
get_key <- function(df, 
                    key_cols = names(df)) {
  # add check to see if all key_cols available
  unique_df <- df[, key_cols, drop=F] %>%
    data.table::as.data.table() %>%
    unique(by=key_cols)
  return(dplyr::tbl_df(unique_df))
}

# stratify based on some columns
# @param df dataframe
# @param strat_cols columns to stratify on
# @param n number of samples
# @param replace whether to resample with replacement
stratify_df <- function(df, 
                        strat_cols,
                        n,
                        replace = TRUE
                        ) {
  frac <- n/nrow(df)
  sample <- df %>% dplyr::group_by_(.dots=strat_cols) %>% 
    dplyr::sample_frac(frac, replace=replace)
  nsample <- nrow(sample) 
  nleft <- n- nsample
  if(nleft != 0) {
    ifelse(nleft > 0, {
      extras <- dplyr::sample_n(df, size = nleft)
      sample <- dplyr::bind_rows(sample, extras)
    }, {
      removals <- sample.int(nsample, abs(nleft))
           sample <- sample[-removals,]
    }
    )

  }
  return(sample)
}

#' resampling
#' @param df data frame
#' @param key_cols key columns to resample on
#' @param strat_cols columns to maintain proportion for stratification
#' @param n number of unique sampled keys, defaults to match dataset
#' @param key_col_name name of outputted key column. Default to "KEY"
#' @param replace whether to stratify with replacement
#' @details 
#' This function is valuable when generating a large simulated population
#' where you goal is to create resampled sub-populations in addition to being able to
#' maintain certain stratifications of factors like covariate distributions
#' 
#' A new keyed column will be created (defaults to name 'KEY') that contains the uniquely
#' created new samples. This allows one to easily compare against the key'd columns. Eg,
#' if you would like to see how many times a particular individual was resampled you can 
#' check the original ID column against the number of key's associated with that ID number.
#' @examples
#' library(PKPDmisc)
#' library(dplyr, quiet = TRUE)
#' 
#' # simple example resampling by ID maintaining Gender distribution, with 10 individuals
#' resample_df(sd_oral_richpk, key_cols = "ID", strat_cols = "Gender", n = 10)
#' 
#' # for a more complex example lets resample "simulated" data with multiple replicates
#' subset_data <- sd_oral_richpk %>%
#'    filter(ID < 20)
#'    
#' # make 'simulated' data with 5 replicates and combine to single dataframe
#' rep_dat <- lapply(1:5, function(x) {
#' subset_data %>% 
#'   mutate(REP = x)
#'   }) %>% bind_rows()
#' 
#' # now when we resample we also want to maintain the ID+REP relationship as resampling
#' # just the ID would give all rows associated for an ID with all reps, rather than 
#' # a single "unit" of ID/REP
#' resample_df(rep_dat, key_cols = c("ID", "REP"))
#' 
#' # check to see that stratification is maintained
#' rep_dat %>% group_by(Gender) %>% tally
#' resample_df(rep_dat, key_cols=c("ID", "REP"), strat_cols="Gender") %>%
#'   group_by(Gender) %>% tally
#'   
#' rep_dat %>% group_by(Gender, Race) %>% tally
#' 
#' resample_df(rep_dat, key_cols=c("ID", "REP"), strat_cols=c("Gender", "Race")) %>%
#'   group_by(Gender, Race) %>% tally
#' @export
resample_df <- function(df, 
                        key_cols,
                        strat_cols = NULL, 
                        n = NULL,
                        key_col_name = "KEY",
                        replace = TRUE) {
  # checks
  if (is.numeric(strat_cols)) {
    message("It looks you are trying to give a numeric value for strat_cols, 
 perhaps you were trying to specify the number to sample instead? 
 If no strat_cols are specified you must explicitly specify 'n = ...' 
 For example resample_df(Theoph, 'Subject', n = 20 )")
    message("-----------------------------------")
    stop("To set the number of samples please explicitly specify 'n = <num>'.")
  }
  
  names <- c(key_col_name,names(df))
  key <- get_key(df, key_cols)
  if(is.null(n)) n <- nrow(key)
  
  if(is.null(strat_cols)) {
    sample <- dplyr::sample_n(key, size = n, replace=replace)
    sample[[key_col_name]] <- 1:n
  } else {
    strat_key <- get_key(df, c(key_cols, strat_cols))
    # because getting unique key based on stratification columns as well 
    # (to easily carry columns) if there are multiple stratification values
    # per unique key may introduce a bug
    # eg if stratifying by disease status if an individual has at some time
    # both positive and negative statuses, both will be picked up as two separate
    # instances so will duplicate that individual. For now issue warning but let
    # it proceed
    if(nrow(strat_key) != nrow(key)) {
      warning("Non-unique keys introduced from stratification,
check that all keys only have one stratification variable associated
                                             ")}
    sample <- stratify_df(strat_key, strat_cols, n, replace = replace)
    #drop strat cols so won't possibly mangle later left join
    sample <- dplyr::ungroup(sample)
    sample <- sample[, key_cols, drop=F] 
    sample[[key_col_name]] <- 1:nrow(sample)
  }
  resampled_df <- dplyr::left_join(sample, df, by = key_cols)
  
  
  #reorder columns to match original df with key column appended
  return(resampled_df[,names, drop=F])
}
#
#
#library(PKPDdatasets)
#dat <- sd_oral_richpk
#sid_dat <- filter(dat, !duplicated(ID))
#sid_dat %>% group_by(Gender) %>% summarize(n = n())
#stratify_df(sid_dat, strat_cols=c("ID", "Gender"), n= 50)%>% ungroup() %>%
#  group_by(Gender) %>% summarize(n = n())
#stratify_df(sid_dat, strat_cols=c("ID", "Gender"), n= 100)%>% ungroup() %>%
#  group_by(Gender) %>% summarize(n = n())
#sid_dat %>% group_by(Gender, Race) %>% summarize(n = n())
#stratify_df(sid_dat, strat_cols=c("ID", "Gender", "Race"), 50)%>% 
#  group_by(Gender, Race) %>% summarize(n = n())
#
#
#rep_dat <- rbind_all(lapply(1:5, function(x) dat %>%
#                              filter(ID < 20) %>% 
#                              mutate(REP = x)))
#resample_df(rep_dat, key_cols = c("ID", "REP"))
#rep_dat %>% group_by(Gender) %>% summarize(n = n())
#rep_dat %>% group_by(Gender, Race) %>% summarize(n = n())
#resample_df(rep_dat, key_cols=c("ID", "REP"), strat_cols=c("Gender", "Race")) %>%
#  group_by(Gender, Race) %>% summarize(n = n())
#unique(resample_df(rep_dat, key_cols=c("ID", "REP"))[["KEY"]])
#stratify_df(rep_dat, strat_cols=c("ID", "REP", "Gender", "Race"), 50, return_all=F)%>% 
#  group_by(Gender, Race) %>% summarize(n = n())
#
#resample_df(rep_dat, 
#            key_cols=c("ID", "REP"), 
#            strat_cols=c("Gender", "Race"),
#            n =50) %>% group_by(Gender, Race) %>% filter(!duplicated(KEY)) %>%
#  summarize(n = n())
#
#rep_dat %>% mutate(totn = n()) %>% 
#  group_by(Gender, Race) %>% summarize(n =n()/mean(totn))
#

Try the PKPDmisc package in your browser

Any scripts or data that you put into this service are public.

PKPDmisc documentation built on April 14, 2020, 5:49 p.m.