R/rowwise.R

Defines functions dt_rowwise_fn rowwise_summary

# PRIVATE -----------------------------------------------------------------


#' Summarise a group of columns row-wise
#'
#' Helper function
#'
#' @param dt A data table.
#' @param function_name A function name as a character.
#' @param selected_cols Character vector of column names.
#' @param new_colname Name of new summary column.
#' @noRd
rowwise_summary <- function(dt,
                            function_name,
                            selected_cols,
                            new_colname) {

  dt[, c(new_colname) := purrr::map(
    function_name,
    dt_rowwise_fn,
    .SD), .SDcols = selected_cols]

  return(dt)
}

#' Helper function for rowwise_summary()
#'
#' @param fn a character vector of functions
#' @param cols a character vector of column names to summarise with functions in
#'   fn
#' @param ... allows additional arguments such as na.rm to passed on to
#'   functions listed in fn
#' @noRd
dt_rowwise_fn <- function(fn, cols, ...) {
  # Row-wise summary helper function:
  # applies a set of summary functions ('fn' =
  # character vector of function names) across selected columns ('cols' =
  # character vector of column names in a datatable) using apply(). '...' allows
  # additional arguments to be passed on to functions (e.g. 'na.rm = ')
  apply(cols, MARGIN = 1, fn, ...)
}

# HASHED BELOW IS OLD FUNCTION FOR PERFORMING ROWWISE SUMMARIES WITH UKB DATA

#' #' Summarise a UK Biobank phenotype dataset row-wise
#' #'
#' #' Allows the user to summarise row-wise across a set of column "groups". e.g.
#' #' calculate standard deviation values for all systolic and diastolic blood
#' #' pressure readings per eid. Can also accept custom functions (e.g. a function
#' #' to count the number of non-NA values). Custom functions should include a
#' #' '...' parameter. \strong{Note: \code{ukb_main} must be a data table.}
#' #'
#' #' @param ukb_main a UK Biobank phenotype dataset. Must be a data table
#' #' @param functions a character vector of function names (e.g. \code{c("sd")})
#' #' @param data_dict a data dictionary for \code{ukb_main}. Can be a "filtered"
#' #'   version including only the columns to be summarised.
#' #' @param grouping_col the name of a column in \code{data_dict} that indicates
#' #'   column groups to summarise
#' #' @param prefix character. An optional prefix to add to the names of all newly
#' #'   created columns. Default is \code{NULL}.
#' #' @param ... arguments to passed on to summary functions listed in
#' #'   \code{functions}
#' #'
#' #' @noRd
#' #' @examples
#' #' \dontrun{
#' #' # make a custom function that counts the number of non-NA values.
#' #' # Note: include '...', otherwise an error is thrown by including na.rm = TRUE below
#' #'
#' #' # n_not_na <- function(x, ...) {
#' #' # sum(!is.na(x))
#' #' # }
#' #'
#' #' # summarise all numerical columns in UKB dataset by calculating mean/sd/n_not_na,
#' #' # including argument na.rm = TRUE
#' #'
#' #' # ukb_main_summarised <- summarise_rowwise(
#' #' # # UKB phenotype dataset as a datatable
#' #' # ukb_main = ukb_main,
#' #'
#' #' # functions = c("mean", "sd", "n_not_na"),
#' #'
#' #' # # data dictionary generated by make_data_dict(), filtered
#' #' # # for only numerical data
#' #' # data_dict = data_dict %>%
#' #' #    dplyr::filter(ValueType %in% c("Continuous", "Integer")),
#' #'
#' #' # # to summarise by FieldID
#' #' #      grouping_col = "Field_FieldID"
#' #'
#' #' # # additional args passed on to `functions`
#' #' #      na.rm = TRUE
#' #' #     )
#' #' }
#' summarise_rowwise <- function(ukb_main,
#'                               functions,
#'                               data_dict,
#'                               grouping_col = "Field_FieldID",
#'                               prefix = NULL,
#'                               ...) {
#'
#'   start_time <- proc.time()
#'
#'   # validate args
#'   assertthat::is.string(grouping_col)
#'
#'   assertthat::assert_that(
#'     all(data_dict$descriptive_colnames %in% names(ukb_main)),
#'     msg = "Error! `data_dict$descriptive_colnames` includes values not present in `names(ukb_main)`. Please filter `data_dict` for only the columns in `ukb_main` to be summarised"
#'   )
#'
#'   # ***STEP 1***
#'   # Create list of column groups with columns to summarise and new
#'   # colnames as nested dataframe
#'
#'   # get groups of columns to summarise from data dictionary
#'   col_groups_to_summarise <- data_dict %>%
#'     dplyr::group_by(.data[[grouping_col]]) %>%
#'     tidyr::nest()
#'
#'   # error if no col_groups_to_summarise
#'   assertthat::assert_that(
#'     !nrow(col_groups_to_summarise) == 0,
#'     msg = "Error! No columns identified to summarise from the argument values provided"
#'   )
#'
#'   # rename grouping_col to facilitate join in code chunk below
#'   col_groups_to_summarise <- rename_cols(df = col_groups_to_summarise,
#'                                          old_colnames = c(grouping_col, "data"),
#'                                          new_colnames = c("group", "cols_to_summarise"))
#'
#'   # new summary colnames: paste function names with grouping_col (e.g.
#'   # mean_blood_pressure...)
#'   new_colnames <- base::expand.grid(functions,
#'                                     col_groups_to_summarise$group,
#'                                     stringsAsFactors = FALSE)
#'   # expand.grid makes all possible combinations of functions/col_group_names,
#'   # now paste these together to create new colnames
#'   new_colnames$new_colnames <- paste(new_colnames$Var1,
#'                                      new_colnames$Var2,
#'                                      sep = "_")
#'
#'   # add optional prefix to new column names
#'   if (!is.null(prefix)) {
#'     new_colnames$new_colnames <- paste0(prefix,
#'                                         new_colnames$new_colnames)
#'   }
#'
#'   new_colnames <- new_colnames %>%
#'     # remove 'functions' column ('Var1')
#'     dplyr::select(-.data[["Var1"]]) %>%
#'     # group_by grouping_col and nest (and rename "data" as newcolnames for clarity)
#'     dplyr::group_by(.data[["Var2"]]) %>%
#'     tidyr::nest() %>%
#'     dplyr::rename("new_colnames" = .data[["data"]])
#'
#'
#'   # join with col_groups_to_summarise (prev step)
#'   col_groups_summary_dict <- col_groups_to_summarise %>%
#'     dplyr::left_join(new_colnames, by = c("group" = "Var2"))
#'
#'
#'   # ***STEP 2***
#'   # Summarise: Using dictionary of summary jobs
#'   # ('col_groups_summary_dict'), loop through each group of columns to be
#'   # summarised, applying the full set of summary functions
#'
#'   # Set up progress bar TODO - not shwoing for some reason...
#'   # pb <- progress::progress_bar$new(format = "[:bar] :current/:total (:percent)",
#'   #                                  total = length(col_groups_summary_dict$group),
#'   #                                  force = TRUE)
#'   # pb$tick(0) TODO
#'
#'   # loop
#'   for (col_group in seq_along(col_groups_summary_dict$group)) {
#'
#'     # progress bar - indicates how many col_groups have ben summarised
#'     # pb$tick(1) TODO
#'
#'     # time taken message
#'     if (col_group != 1) {
#'       time_taken <- proc.time() - start_time
#'
#'       message("\tTime taken: ",
#'               (time_taken[3] %/% 60),
#'               " minutes, ",
#'               (round(time_taken[3] %% 60)),
#'               " seconds")
#'     }
#'
#'     # number of col_group's processed message
#'     message("Processing ",
#'             col_group,
#'             " of ",
#'             length(col_groups_summary_dict$group))
#'
#'     # mutate summary cols for col_group
#'     ukb_main[
#'       # i
#'       ,
#'
#'       # j
#'       col_groups_summary_dict$new_colnames[[col_group]]$new_colnames := purrr::map(
#'         functions,
#'         dt_rowwise_fn,
#'         .SD,
#'         ...),
#'
#'       # by
#'       .SDcols = col_groups_summary_dict$cols_to_summarise[[col_group]]$descriptive_colnames]
#'   }
#'
#'   # time taken message
#'   time_taken <- proc.time() - start_time
#'
#'   message(
#'     "Complete! Time taken: ",
#'     (time_taken[3] %/% 60),
#'     " minutes, ",
#'     (round(time_taken[3] %% 60)),
#'     " seconds"
#'   )
#'
#'   # message - number of new columns created and their names
#'   new_colnames_vector <- tidyr::unnest(new_colnames,
#'                                        c(.data[["new_colnames"]]))$new_colnames
#'   message(paste0(
#'     "Appended, ", length(new_colnames_vector), " new columns: ", stringr::str_c(new_colnames_vector,
#'                                                                                 sep = "",
#'                                                                                 collapse = ", ")
#'   ))
#'
#'   return(ukb_main)
#' }
rmgpanw/rawutil documentation built on May 20, 2022, 1:29 a.m.