# PRIVATE -----------------------------------------------------------------
#' Summarise a group of columns row-wise
#'
#' Helper function
#'
#' @param dt A data table.
#' @param function_name A function name as a character.
#' @param selected_cols Character vector of column names.
#' @param new_colname Name of new summary column.
#' @noRd
rowwise_summary <- function(dt,
function_name,
selected_cols,
new_colname) {
dt[, c(new_colname) := purrr::map(
function_name,
dt_rowwise_fn,
.SD), .SDcols = selected_cols]
return(dt)
}
#' Helper function for rowwise_summary()
#'
#' @param fn a character vector of functions
#' @param cols a character vector of column names to summarise with functions in
#' fn
#' @param ... allows additional arguments such as na.rm to passed on to
#' functions listed in fn
#' @noRd
dt_rowwise_fn <- function(fn, cols, ...) {
# Row-wise summary helper function:
# applies a set of summary functions ('fn' =
# character vector of function names) across selected columns ('cols' =
# character vector of column names in a datatable) using apply(). '...' allows
# additional arguments to be passed on to functions (e.g. 'na.rm = ')
apply(cols, MARGIN = 1, fn, ...)
}
# HASHED BELOW IS OLD FUNCTION FOR PERFORMING ROWWISE SUMMARIES WITH UKB DATA
#' #' Summarise a UK Biobank phenotype dataset row-wise
#' #'
#' #' Allows the user to summarise row-wise across a set of column "groups". e.g.
#' #' calculate standard deviation values for all systolic and diastolic blood
#' #' pressure readings per eid. Can also accept custom functions (e.g. a function
#' #' to count the number of non-NA values). Custom functions should include a
#' #' '...' parameter. \strong{Note: \code{ukb_main} must be a data table.}
#' #'
#' #' @param ukb_main a UK Biobank phenotype dataset. Must be a data table
#' #' @param functions a character vector of function names (e.g. \code{c("sd")})
#' #' @param data_dict a data dictionary for \code{ukb_main}. Can be a "filtered"
#' #' version including only the columns to be summarised.
#' #' @param grouping_col the name of a column in \code{data_dict} that indicates
#' #' column groups to summarise
#' #' @param prefix character. An optional prefix to add to the names of all newly
#' #' created columns. Default is \code{NULL}.
#' #' @param ... arguments to passed on to summary functions listed in
#' #' \code{functions}
#' #'
#' #' @noRd
#' #' @examples
#' #' \dontrun{
#' #' # make a custom function that counts the number of non-NA values.
#' #' # Note: include '...', otherwise an error is thrown by including na.rm = TRUE below
#' #'
#' #' # n_not_na <- function(x, ...) {
#' #' # sum(!is.na(x))
#' #' # }
#' #'
#' #' # summarise all numerical columns in UKB dataset by calculating mean/sd/n_not_na,
#' #' # including argument na.rm = TRUE
#' #'
#' #' # ukb_main_summarised <- summarise_rowwise(
#' #' # # UKB phenotype dataset as a datatable
#' #' # ukb_main = ukb_main,
#' #'
#' #' # functions = c("mean", "sd", "n_not_na"),
#' #'
#' #' # # data dictionary generated by make_data_dict(), filtered
#' #' # # for only numerical data
#' #' # data_dict = data_dict %>%
#' #' # dplyr::filter(ValueType %in% c("Continuous", "Integer")),
#' #'
#' #' # # to summarise by FieldID
#' #' # grouping_col = "Field_FieldID"
#' #'
#' #' # # additional args passed on to `functions`
#' #' # na.rm = TRUE
#' #' # )
#' #' }
#' summarise_rowwise <- function(ukb_main,
#' functions,
#' data_dict,
#' grouping_col = "Field_FieldID",
#' prefix = NULL,
#' ...) {
#'
#' start_time <- proc.time()
#'
#' # validate args
#' assertthat::is.string(grouping_col)
#'
#' assertthat::assert_that(
#' all(data_dict$descriptive_colnames %in% names(ukb_main)),
#' msg = "Error! `data_dict$descriptive_colnames` includes values not present in `names(ukb_main)`. Please filter `data_dict` for only the columns in `ukb_main` to be summarised"
#' )
#'
#' # ***STEP 1***
#' # Create list of column groups with columns to summarise and new
#' # colnames as nested dataframe
#'
#' # get groups of columns to summarise from data dictionary
#' col_groups_to_summarise <- data_dict %>%
#' dplyr::group_by(.data[[grouping_col]]) %>%
#' tidyr::nest()
#'
#' # error if no col_groups_to_summarise
#' assertthat::assert_that(
#' !nrow(col_groups_to_summarise) == 0,
#' msg = "Error! No columns identified to summarise from the argument values provided"
#' )
#'
#' # rename grouping_col to facilitate join in code chunk below
#' col_groups_to_summarise <- rename_cols(df = col_groups_to_summarise,
#' old_colnames = c(grouping_col, "data"),
#' new_colnames = c("group", "cols_to_summarise"))
#'
#' # new summary colnames: paste function names with grouping_col (e.g.
#' # mean_blood_pressure...)
#' new_colnames <- base::expand.grid(functions,
#' col_groups_to_summarise$group,
#' stringsAsFactors = FALSE)
#' # expand.grid makes all possible combinations of functions/col_group_names,
#' # now paste these together to create new colnames
#' new_colnames$new_colnames <- paste(new_colnames$Var1,
#' new_colnames$Var2,
#' sep = "_")
#'
#' # add optional prefix to new column names
#' if (!is.null(prefix)) {
#' new_colnames$new_colnames <- paste0(prefix,
#' new_colnames$new_colnames)
#' }
#'
#' new_colnames <- new_colnames %>%
#' # remove 'functions' column ('Var1')
#' dplyr::select(-.data[["Var1"]]) %>%
#' # group_by grouping_col and nest (and rename "data" as newcolnames for clarity)
#' dplyr::group_by(.data[["Var2"]]) %>%
#' tidyr::nest() %>%
#' dplyr::rename("new_colnames" = .data[["data"]])
#'
#'
#' # join with col_groups_to_summarise (prev step)
#' col_groups_summary_dict <- col_groups_to_summarise %>%
#' dplyr::left_join(new_colnames, by = c("group" = "Var2"))
#'
#'
#' # ***STEP 2***
#' # Summarise: Using dictionary of summary jobs
#' # ('col_groups_summary_dict'), loop through each group of columns to be
#' # summarised, applying the full set of summary functions
#'
#' # Set up progress bar TODO - not shwoing for some reason...
#' # pb <- progress::progress_bar$new(format = "[:bar] :current/:total (:percent)",
#' # total = length(col_groups_summary_dict$group),
#' # force = TRUE)
#' # pb$tick(0) TODO
#'
#' # loop
#' for (col_group in seq_along(col_groups_summary_dict$group)) {
#'
#' # progress bar - indicates how many col_groups have ben summarised
#' # pb$tick(1) TODO
#'
#' # time taken message
#' if (col_group != 1) {
#' time_taken <- proc.time() - start_time
#'
#' message("\tTime taken: ",
#' (time_taken[3] %/% 60),
#' " minutes, ",
#' (round(time_taken[3] %% 60)),
#' " seconds")
#' }
#'
#' # number of col_group's processed message
#' message("Processing ",
#' col_group,
#' " of ",
#' length(col_groups_summary_dict$group))
#'
#' # mutate summary cols for col_group
#' ukb_main[
#' # i
#' ,
#'
#' # j
#' col_groups_summary_dict$new_colnames[[col_group]]$new_colnames := purrr::map(
#' functions,
#' dt_rowwise_fn,
#' .SD,
#' ...),
#'
#' # by
#' .SDcols = col_groups_summary_dict$cols_to_summarise[[col_group]]$descriptive_colnames]
#' }
#'
#' # time taken message
#' time_taken <- proc.time() - start_time
#'
#' message(
#' "Complete! Time taken: ",
#' (time_taken[3] %/% 60),
#' " minutes, ",
#' (round(time_taken[3] %% 60)),
#' " seconds"
#' )
#'
#' # message - number of new columns created and their names
#' new_colnames_vector <- tidyr::unnest(new_colnames,
#' c(.data[["new_colnames"]]))$new_colnames
#' message(paste0(
#' "Appended, ", length(new_colnames_vector), " new columns: ", stringr::str_c(new_colnames_vector,
#' sep = "",
#' collapse = ", ")
#' ))
#'
#' return(ukb_main)
#' }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.