# Helper function for processing user input data
#
# Columns in the UK Biobank data extracted on the Research Analysis Platform
# following the naming scheme p<field_id>_i<instance> where <field_id>
# corresponds to the biomarker, e.g. 23474 for 3-Hydroxybutyrate (given the variable
# name bOHbutyrate). <instance> corresponds to the timepoint of biomarker quantification,
# in the NMR data, either 0 for baseline assessment, or 1 for first repeat. The
# column names may also optionally have an addition _a<array_index> component
# reserved for multiple measures at the same timepoint, or in the case of the NMR
# biomarker data, the presence of multiple QC flags for a given measurement.
#
# This function extracts from raw UK Biobank data the fields corresponding to
# either (1) biomarker concentrations, (2) biomarker QC Flags, or (3) sample QC
# Flags (as per the calling function, indicated by the 'type' argument). Columns
# corresponding to each <instance> are aggregated instead to multiple rows per
# participant (i.e. so there is only 1 column per biomarker / QC Flag). In
# instances where there are multiple QC Flags for a biomarker measurement (at
# a single time point for a single person), these are collated into a single
# comma separated entry. Fields are renamed with the short biomarker variable
# name typically provided by Nightingale Health, listed in the Biomarker column
# in the nmr_info data sheet included with this package
#
# This function also handles column naming schemes for datasets predating the
# UKB Research Analysis Platform, e.g. those extracted by ukbconv which follow
# the naming scheme <field_id>-<instance>.<array_index>, by ukbconv_r which
# follow the naming scheme f<field_id>.<instance>.<array_index>, and the ukbtools
# R package.
#
process_data <- function(x, type) {
# Silence CRAN NOTES about undefined global variables (columns in data.tables)
Biomarker <- UKB.Field.ID <- QC.Flag.Field.ID <- visit_index <- repeat_index <-
Name <- Shipment.Plate <- Well.Position.Within.Plate <- tmp_cname <- NULL
# Determine format of data
data_format <- detect_format(x, type)
# Make sure its a data.table
x <- getDT(x)
# If already in correct format, no need to do anything extra, return.
if (data_format == "processed") {
return(x)
}
# Data extracted by ukbconv with the txt option and loaded in with fread
if (names(x)[1] == "eid" && all(is.na(x[[1]]))) {
warning("Fixing malformed column headers for loaded dataset generated by ukbconv txt")
setnames(x, names(x), c("V1", names(x)[-ncol(x)]))
}
# Give field ids consistent names based on data format
if (data_format == "raw") {
setnames(x, gsub("-|\\.", "_", names(x)))
} else if (data_format == "ukbtools") {
setnames(x, gsub(".*_f", "", names(x)))
} else if (data_format == "ukbconv_r") {
setnames(x, gsub("\\.", "_", gsub("f\\.", "", names(x))))
} else if (data_format == "ukb_rap") {
missing_array_index <- names(x)[which(names(x) %like% "i[0-9]$")]
setnames(x, missing_array_index, paste0(missing_array_index, "_a0"))
setnames(x, gsub("_a", "_", names(x)))
setnames(x, gsub("_i", "_", names(x)))
setnames(x, gsub("^p", "", names(x)))
}
# Extract field IDs present
field_ids <- data.table(UKB.Field.ID = names(x))
field_ids[, UKB.Field.ID := suppressWarnings(as.integer(gsub("_.*", "", UKB.Field.ID)))]
field_ids <- unique(field_ids[!is.na(UKB.Field.ID)])
# Filter to those we want to extract
if (type == "biomarkers") {
field_ids <- field_ids[UKB.Field.ID %in% na.omit(ukbnmr::nmr_info$UKB.Field.ID)]
} else if (type == "biomarker_qc_flags") {
# Not all biomarkers have QC flags, but we need to know where the column is
# missing due to this, or due to the user not having access.
biomarker_fields <- na.omit(ukbnmr::nmr_info$UKB.Field.ID)
biomarkers_present <- field_ids[UKB.Field.ID %in% biomarker_fields]
biomarkers_present[ukbnmr::nmr_info, on = list(UKB.Field.ID), QC.Flag.Field.ID := QC.Flag.Field.ID]
qc_flag_fields <- na.omit(ukbnmr::nmr_info$QC.Flag.Field.ID)
qc_flags_present <- field_ids[UKB.Field.ID %in% qc_flag_fields]
empty_flags_with_biomarkers <- biomarkers_present[!(QC.Flag.Field.ID %in% qc_flags_present$UKB.Field.ID)]
field_ids <- rbind(qc_flags_present, empty_flags_with_biomarkers[,list(UKB.Field.ID=na.omit(QC.Flag.Field.ID))])
} else if (type == "sample_qc_flags") {
field_ids <- field_ids[UKB.Field.ID %in% na.omit(ukbnmr::sample_qc_info$UKB.Field.ID)]
} else {
stop("internal error: 'type' must be one of \"biomarkers\", \"biomarker_qc_flags\", or \"sample_qc_flags\"")
}
# Map to biomarker variable names
if (type == "biomarkers") {
field_ids[ukbnmr::nmr_info, on = list(UKB.Field.ID), Biomarker := Biomarker]
} else if (type == "biomarker_qc_flags") {
field_ids[ukbnmr::nmr_info, on = list(UKB.Field.ID=QC.Flag.Field.ID), Biomarker := Biomarker]
} else if (type == "sample_qc_flags") {
field_ids[ukbnmr::sample_qc_info, on = list(UKB.Field.ID), Biomarker := Name] # Not biomarkers but harmonises with rest of code
}
# Split out instance (visit) and array index (repeat measure) fields so they
# are rows instead of columns
visit_repeats <- setdiff(unique(gsub("^[0-9]+_", "", names(x))), "eid")
x <- rbindlist(fill=TRUE, use.names=TRUE, lapply(visit_repeats, function(vr) {
# Find columns matching this visit repeat pair (e.g. ending in _0_0)
this_cols <- names(x)[grepl(pattern=paste0(vr, "$"), names(x))]
# Filter to these columns
this_x <- x[, .SD, .SDcols=c("eid", this_cols)]
# Drop repeat visit pair label from column name
setnames(this_x, this_cols, gsub(paste0("_", vr, "$"), "", this_cols))
# Get fields present in the visit repeat pair
this_fields <- intersect(names(this_x), field_ids$UKB.Field.ID)
# Return empty for this visit repeat pair if no fields of interest present
if (length(this_fields) == 0) {
return(NULL)
}
# Add columns for visit and repeat index
this_x[, visit_index := as.integer(gsub("_.*", "", vr))]
this_x[, repeat_index := as.integer(gsub(".*_", "", vr))]
# Filter to field IDs of interest
this_x <- this_x[, .SD, .SDcols=c("eid", "visit_index", "repeat_index", this_fields)]
# For biomarkers with no QC flags, add in missing columns
miss_fields <- setdiff(field_ids$UKB.Field.ID, names(this_x))
if (length(miss_fields) > 0) {
for (fid in miss_fields) {
this_x[, as.character(fid) := NA_integer_]
}
this_fields <- intersect(names(this_x), field_ids$UKB.Field.ID)
}
# Rename Field IDs to biomarker variable names
this_names <- field_ids[UKB.Field.ID %in% this_fields]
setnames(this_x, as.character(this_names$UKB.Field.ID), this_names$Biomarker)
# Shipment.Plate may be integer64, convert to char
if ("Shipment.Plate" %in% names(this_x) & inherits(this_x$Shipment.Plate, "integer64")) {
this_x[, Shipment.Plate := as.character(Shipment.Plate)]
} else if ("Shipment.Plate" %in% names(this_x)) {
# If already a character string, may have "" instead of NA_character_ for
# samples without NMR data
this_x[Shipment.Plate == "", Shipment.Plate := NA_character_]
}
# Some columns may have "" instead of NA_character_ for samples without NMR data, fix
for (this_col in names(this_x)[which(sapply(this_x, is.character))]) {
setnames(this_x, this_col, "tmp_cname")
this_x[tmp_cname == "", tmp_cname := NA_character_]
setnames(this_x, "tmp_cname", this_col)
}
# Well.Position.Within.Plate may have lowercase row letters in the July 2023
# UKB data release, fix:
if ("Well.Position.Within.Plate" %in% names(this_x)) {
this_x[, Well.Position.Within.Plate := toupper(Well.Position.Within.Plate)]
}
# Drop instance and array index combinations with all missing data
# eid, visit_index, and array_index always non-missing
this_x <- this_x[apply(this_x, 1, function(row) { sum(!is.na(row)) > 3L })]
# Return to rbindlist - which will row-bind all data.tables in the list
# (one data.table per visit repeat pair)
return(this_x)
}))
# For biomarker QC flags, map integers to flag values:
# https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=2310
if (type == "biomarker_qc_flags") {
x <- get_biomarker_qc_flag_values(x)
} else if (type == "sample_qc_flags") {
x <- get_sample_qc_flag_values(x)
}
# Remove repeat_index column no longer needed
x[, repeat_index := NULL]
# Set column keys for fast joining by user
setkeyv(x, c("eid", "visit_index"))
# Finished processing
return(x)
}
# Map integer representation of biomarker QC flags to string
# https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=2310
get_biomarker_qc_flag_values <- function(x) {
# Silence CRAN NOTES about undefined global variables (columns in data.tables)
value <- visit_index <- repeat_index <- integer_rep <- flag <- variable <-
eid <- NULL
x <- suppressWarnings(melt(x, id.vars=c("eid", "visit_index", "repeat_index")))
# Sometimes the integer representations have already been mapped to their
# character representations, e.g. when loading data using the Rscript made by
# ukbconv r, in which case we can skip the mapping step.
if (is.integer(x$value)) { # check if values are integer (or character representations of integer values)
if (!inherits(x$value, "integer")) {
x[, value := as.integer(value)]
}
x[biomarker_flag_map, on = list(value=integer_rep), flag := flag]
x[, value := NULL]
setnames(x, "flag", "value")
}
# Where there are multiple flags per visit_index, concatenate
x <- x[, list(repeat_index=0,
value=ifelse(all(is.na(value)), NA_character_, paste(sort(na.omit(value)), collapse="; "))),
by=list(eid, visit_index, variable)]
# Cast back to wide format
x <- dcast(x, eid + visit_index + repeat_index ~ variable, value.var="value")
return(x)
}
# Map integer representation of biomarker QC flags to string
# https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=2310
get_sample_qc_flag_values <- function(x) {
# Silence CRAN NOTES about undefined global variables (columns in data.tables)
Spectrometer <- Shipment.Plate <- High.Lactate <- High.Pyruvate <-
Low.Glucose <- Low.Protein <- Measurement.Quality.Flagged <- integer_rep <-
flag <- Resolved.Plate.Swaps <- Processing.Batch <- NULL
if (exists("High.Lactate", where=x))
x[, High.Lactate := ifelse(is.na(High.Lactate), NA_character_, "Yes")]
if (exists("High.Pyruvate", where=x))
x[, High.Pyruvate := ifelse(is.na(High.Pyruvate), NA_character_, "Yes")]
if (exists("Low.Glucose", where=x))
x[, Low.Glucose := ifelse(is.na(Low.Glucose), NA_character_, "Yes")]
if (exists("Low.Protein", where=x))
x[, Low.Protein := ifelse(is.na(Low.Protein), NA_character_, "Yes")]
if (exists("Resolved.Plate.Swaps", where=x))
x[, Resolved.Plate.Swaps := ifelse(is.na(Resolved.Plate.Swaps), NA_character_, "Yes")]
if ("Spectrometer" %in% names(x)) {
if (is.integer(x$Spectrometer)) {
x[, Spectrometer := ifelse(is.na(Spectrometer), NA_character_, paste("Spectrometer", Spectrometer))]
} else if (is.factor(x$Spectrometer)) {
x[, Spectrometer := as.character(Spectrometer)]
}
}
if ("Measurement.Quality.Flagged" %in% names(x)) {
if (is.integer(x$Measurement.Quality.Flagged)) {
x[, Measurement.Quality.Flagged := as.character(Measurement.Quality.Flagged)]
x[measure_quality_map, on = list(Measurement.Quality.Flagged=integer_rep), Measurement.Quality.Flagged := flag]
} else if (is.factor(x$Spectrometer)) {
x[, Measurement.Quality.Flagged := as.character(Measurement.Quality.Flagged)]
}
}
if ("Shipment.Plate" %in% names(x)) {
if (bit64::is.integer64(x$Shipment.Plate)) {
x[, Shipment.Plate := as.character(Shipment.Plate)]
}
if (!is.character(x$Shipment.Plate) | !grepl("^Plate", stats::na.omit(x$Shipment.Plate)[1])) {
x[, Shipment.Plate := ifelse(is.na(Shipment.Plate), NA_character_, paste("Plate", Shipment.Plate))]
}
# Sometimes the leading 0 gets dropped (e.g. if loaded as integer64), add these back in to match UK Biobank release
x[!(Shipment.Plate %like% "^Plate 0"), Shipment.Plate := gsub("^Plate ", "Plate 0", Shipment.Plate)]
}
if ("Processing.Batch" %in% names(x)) {
if (!is.character(x$Processing.Batch) | !grepl("^Plate", stats::na.omit(x$Processing.Batch)[1])) {
x[, Processing.Batch := ifelse(is.na(Processing.Batch), NA_character_, paste("Batch", Processing.Batch))]
}
}
return(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.