# NOTES -------------------------------------------------------------------
# Dervied variable function requirements:
# - Function name should start with 'derive_'
# - Accept a df as first argument, and return a df with appended columns
# - Have a `.drop` argument - if `TRUE`, remove required input FID cols from output
# - Have an argument `.details_only`. If `TRUE`, return a list containing items:
## - `required_fields`
## - `new_columns`, a named list of new columns with details for each:
### - `label`
### - `value_labels`
# EXPORTED FUNCTIONS ------------------------------------------------------
#' Derive an estimated participant date of birth
#'
#' Estimates dates of birth from year and month of birth (Field IDs
#' \href{https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=34}{34} and
#' \href{https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=52}{52} respectively) as
#' the first date of the month.
#'
#' @param ukb_main A UK Biobank main dataset.
#' @param .drop If \code{TRUE}, remove the required input columns from the
#' result
#' @param .details_only If \code{TRUE}, return a list containing details of
#' required input variables (Field IDs) and derived variables (new column
#' name, label and values/value labels).
#' @inheritParams read_ukb
#'
#' @return A data frame with a column called \code{dob} (type
#' character).
#' @export
#' @examples
#' ukb_main <- data.frame(eid = 1, f34_0_0 = 1990, f52_0_0 = 1)
#' # dummy UKB data dictionary
#' dummy_ukb_data_dict <- get_ukb_dummy("dummy_Data_Dictionary_Showcase.tsv")
#'
#' # keep input year/month of birth columns
#' derive_dob(ukb_main,
#' ukb_data_dict = dummy_ukb_data_dict
#' )
#'
#' # remove input year/month of birth columns
#' derive_dob(ukb_main,
#' ukb_data_dict = dummy_ukb_data_dict,
#' .drop = TRUE
#' )
derive_dob <- function(ukb_main,
ukb_data_dict = get_ukb_data_dict(),
.drop = FALSE,
.details_only = FALSE) {
list_of_details <- list(
required_field_ids = c(yob = "34", mob = "52"),
new_columns = list(dob = list(
label = "Date of birth (estimated)",
value_labels = NA,
FieldID = NA,
instance = NA,
array = NA,
ValueType = "Date"
))
)
# if required field IDs requested
if (.details_only) {
return(list_of_details)
}
data_dict <- make_data_dict(ukb_main,
ukb_data_dict = ukb_data_dict
)
# select only the 2 required Field IDs
yob_col <- get_colnames_for_fieldids(
field_ids = list_of_details$required_field_ids["yob"],
data_dict = data_dict,
scalar_output = TRUE,
error_if_missing = TRUE,
colname_col = "colheaders_raw"
)
mob_col <- get_colnames_for_fieldids(
field_ids = list_of_details$required_field_ids["mob"],
data_dict = data_dict,
scalar_output = TRUE,
error_if_missing = TRUE,
colname_col = "colheaders_raw"
)
# check that 'mob' column is either a factor or numeric
assertthat::assert_that(
is.factor(ukb_main[[mob_col]]) | is.numeric(ukb_main[[mob_col]]),
msg = "Error! The month of birth column (Field ID 52) must either be type numeric or type factor"
)
# estimate dob
new_dob_col <- names(list_of_details$new_columns)
ukb_main[[new_dob_col]] <- paste(ukb_main[[yob_col]],
as.integer(ukb_main[[mob_col]]), # need to extract if a factor integer value e.g. 'January' == 1
01, # first day of month
sep = "-"
)
suppressWarnings(
ukb_main[[new_dob_col]] <- as.character(lubridate::ymd(ukb_main$dob))
)
# label
attributes(ukb_main[[new_dob_col]])$label <- list_of_details$new_columns$dob$label
# drop input cols if requested
if (.drop) {
ukb_main <- dplyr::select(ukb_main, -tidyselect::all_of(c(yob_col, mob_col)))
}
return(ukb_main)
}
#' Derive simplified ethnic background
#'
#' Simplifies ethnic background in a UK Biobank main dataset to the main
#' categories for
#' \href{https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=21000}{Field ID 21000}.
#'
#' Categories "Do not know" and "Prefer not to answer" are converted to
#' \code{NA}. A new column called \code{ethnic_background_simplified} of type
#' factor is appended to the input data frame. By default, "White" ethnicity is
#' set to the baseline level as this is the largest category. Levels can be
#' explicitly specified using the \code{ethnicity_levels} argument.
#'
#' @param ethnicity_levels The factor level order for the appended
#' \code{ethnic_background_simplified} column. By default, the baseline level
#' is set to "White" ethnicity.
#' @inheritParams derive_dob
#'
#' @return A data frame with a column called \code{ethnic_background_simplified}
#' (type factor).
#' @export
#'
#' @examples
#' library(magrittr)
#' # dummy UKB data and data dictionary
#' dummy_ukb_data_dict <- get_ukb_dummy("dummy_Data_Dictionary_Showcase.tsv")
#' dummy_ukb_codings <- get_ukb_dummy("dummy_Codings.tsv")
#'
#' dummy_ukb_main <- read_ukb(
#' path = get_ukb_dummy("dummy_ukb_main.tsv", path_only = TRUE),
#' ukb_data_dict = dummy_ukb_data_dict,
#' ukb_codings = dummy_ukb_codings
#' )
#'
#' # derive ethnic background
#' derive_ethnic_background_simplified(
#' ukb_main = dummy_ukb_main,
#' ukb_data_dict = dummy_ukb_data_dict
#' ) %>%
#' dplyr::select(tidyselect::contains("ethnic"))
derive_ethnic_background_simplified <- function(ukb_main,
ukb_data_dict = get_ukb_data_dict(),
ethnicity_levels = c(
"White",
"Mixed",
"Asian or Asian British",
"Black or Black British",
"Chinese",
"Other ethnic group"
),
.drop = FALSE,
.details_only = FALSE) {
list_of_details <- list(
required_field_ids = c(ethnic_background = "21000"),
new_columns = list(ethnic_background_simplified = list(
label = "Ethnic background (simplified)",
value_labels = NA,
FieldID = "21000",
instance = NA,
array = NA,
ValueType = "Categorical single"
))
)
# if required field IDs only requested
if (.details_only) {
return(list_of_details)
}
# validate args
assertthat::assert_that(!names(list_of_details$new_columns) %in% names(ukb_main),
msg = paste0(
"Error! `ukb_main` already has a column named ",
names(list_of_details$new_columns)
)
)
# get required cols
data_dict <- make_data_dict(ukb_main,
ukb_data_dict = ukb_data_dict
)
ethnic_background_cols <- get_colnames_for_fieldids(
field_ids = list_of_details$required_field_ids["ethnic_background"],
data_dict = data_dict,
scalar_output = FALSE,
error_if_missing = TRUE,
colname_col = "colheaders_raw"
)
# sort - sets earliest record to 'baseline'
ethnic_background_cols <- sort(ethnic_background_cols)
# loop through input ethnicity cols to simplify
params <- data.frame(old_ethnic_background_col = ethnic_background_cols)
params$new_ethnic_background_col <- paste0(params$old_ethnic_background_col, "_simplified")
for (i in 1:nrow(params)) {
old_ethnic_background_col <- params[i, ][["old_ethnic_background_col"]]
new_ethnic_background_col <- params[i, ][["new_ethnic_background_col"]]
ukb_main <- derive_ethnic_background_simplified_single(
ukb_main = ukb_main,
old_ethnic_background_col = old_ethnic_background_col,
new_ethnic_background_col = new_ethnic_background_col,
ethnicity_levels = ethnicity_levels,
.details_only = FALSE
)
}
# create single summary col - take the first non-missing value
ukb_main <- summarise_first_non_na(
df = ukb_main,
columns = params$new_ethnic_background_col,
new_col = names(list_of_details$new_columns)
)
# drop individual simplified ethnicity columns
ukb_main <- ukb_main %>%
dplyr::select(-tidyselect::all_of(params$new_ethnic_background_col))
# reorder factor
ukb_main[[names(list_of_details$new_columns)]] <-
factor(ukb_main[[names(list_of_details$new_columns)]],
levels = ethnicity_levels
)
# label
attributes(ukb_main[[names(list_of_details$new_columns)]])$label <- list_of_details$new_columns$ethnic_background_simplified$label
# drop original cols
if (.drop) {
ukb_main <- ukb_main %>%
dplyr::select(-tidyselect::all_of(params$old_ethnic_background_col))
}
# return result
return(ukb_main)
}
# PRIVATE FUNCTIONS -------------------------------------------------------
derive_ethnic_background_simplified_single <- function(ukb_main,
old_ethnic_background_col,
new_ethnic_background_col,
ethnicity_levels = c(
"White",
"Mixed",
"Asian or Asian British",
"Black or Black British",
"Chinese",
"Other ethnic group"
),
.details_only = FALSE) {
all_ethnicity_categories <- list(
White = c(
"White",
"British",
"Irish",
"Any other white background"
),
Mixed = c(
"Mixed",
"White and Black Caribbean",
"White and Black African",
"White and Asian",
"Any other mixed background"
),
`Asian or Asian British` = c(
"Asian or Asian British",
"Indian",
"Pakistani",
"Bangladeshi",
"Any other Asian background"
),
`Black or Black British` = c(
"Caribbean",
"Black or Black British",
"African",
"Any other Black background"
),
Chinese = c("Chinese"),
`Other ethnic group` = c("Other ethnic group"),
Do_not_know_Prefer_not_to_answer = c(
"Do not know",
"Prefer not to answer"
)
)
if (.details_only) {
return(all_ethnicity_categories)
}
all_ethnicity_categories_vector <- purrr::reduce(all_ethnicity_categories, c)
# validate args
assertthat::assert_that(is.factor(ukb_main[[old_ethnic_background_col]]) |
is.character(ukb_main[[old_ethnic_background_col]]),
msg = paste0(
"Error! ",
old_ethnic_background_col,
" must be a factor"
)
)
ethnic_background_col_unique_values <- unique(as.character(ukb_main[[old_ethnic_background_col]]))
unrecognised_ethnicity_values <- subset(
ethnic_background_col_unique_values,
(!ethnic_background_col_unique_values %in% all_ethnicity_categories_vector) &
(!is.na(ethnic_background_col_unique_values))
)
assertthat::assert_that(rlang::is_empty(unrecognised_ethnicity_values),
msg = paste0(
"Error! Column ",
old_ethnic_background_col,
" contains values that are not listed in UKB data coding 1001: ",
stringr::str_c(unrecognised_ethnicity_values,
sep = "",
collapse = ", "
)
)
)
assertthat::assert_that(length(unique(ethnicity_levels)) == length(ethnicity_levels),
msg = "Error! `ethnicity_levels` contains duplicate values"
)
assertthat::assert_that(
all(ethnicity_levels %in% c(
"White",
"Mixed",
"Asian or Asian British",
"Black or Black British",
"Chinese",
"Other ethnic group"
)),
msg = "Error! `ethnicity_levels` contains unrecognised values"
)
# simplify ethnicity
ukb_main[[new_ethnic_background_col]] <- dplyr::case_when(
ukb_main[[old_ethnic_background_col]] %in% all_ethnicity_categories$White ~ "White",
ukb_main[[old_ethnic_background_col]] %in% all_ethnicity_categories$Mixed ~ "Mixed",
ukb_main[[old_ethnic_background_col]] %in% all_ethnicity_categories$`Asian or Asian British` ~ "Asian or Asian British",
ukb_main[[old_ethnic_background_col]] %in% all_ethnicity_categories$`Black or Black British` ~ "Black or Black British",
ukb_main[[old_ethnic_background_col]] %in% all_ethnicity_categories$Chinese ~ "Chinese",
ukb_main[[old_ethnic_background_col]] %in% all_ethnicity_categories$`Other ethnic group` ~ "Other ethnic group",
(ukb_main[[old_ethnic_background_col]] %in% all_ethnicity_categories$Do_not_know_Prefer_not_to_answer) |
is.na(ukb_main[[old_ethnic_background_col]]) ~ as.character(NA),
TRUE ~ "error"
)
return(ukb_main)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.