R/process_data.R

Defines functions get_sample_qc_flag_values get_biomarker_qc_flag_values process_data

# Helper function for processing user input data
#
# Columns in the UK Biobank data extracted on the Research Analysis Platform
# following the naming scheme p<field_id>_i<instance> where <field_id>
# corresponds to the biomarker, e.g. 23474 for 3-Hydroxybutyrate (given the variable
# name bOHbutyrate). <instance> corresponds to the timepoint of biomarker quantification,
# in the NMR data, either 0 for baseline assessment, or 1 for first repeat. The
# column names may also optionally have an addition _a<array_index> component
# reserved for multiple measures at the same timepoint, or in the case of the NMR
# biomarker data, the presence of multiple QC flags for a given measurement.
#
# This function extracts from raw UK Biobank data the fields corresponding to
# either (1) biomarker concentrations, (2) biomarker QC Flags, or (3) sample QC
# Flags (as per the calling function, indicated by the 'type' argument). Columns
# corresponding to each <instance> are aggregated instead to multiple rows per
# participant (i.e. so there is only 1 column per biomarker / QC Flag). In
# instances where there are multiple QC Flags for a biomarker measurement (at
# a single time point for a single person), these are collated into a single
# comma separated entry. Fields are renamed with the short biomarker variable
# name typically provided by Nightingale Health, listed in the Biomarker column
# in the nmr_info data sheet included with this package
#
# This function also handles column naming schemes for datasets predating the
# UKB Research Analysis Platform, e.g. those extracted by ukbconv which follow
# the naming scheme <field_id>-<instance>.<array_index>, by ukbconv_r which
# follow the naming scheme f<field_id>.<instance>.<array_index>, and the ukbtools
# R package.
#
process_data <- function(x, type) {
  # Silence CRAN NOTES about undefined global variables (columns in data.tables)
  Biomarker <- UKB.Field.ID <- QC.Flag.Field.ID <- visit_index <- repeat_index <-
    Name <- Shipment.Plate <- Well.Position.Within.Plate <- tmp_cname <- NULL

  # Determine format of data
  data_format <- detect_format(x, type)

  # Make sure its a data.table
  x <- getDT(x)

  # If already in correct format, no need to do anything extra, return.
  if (data_format == "processed") {
    return(x)
  }

  # Data extracted by ukbconv with the txt option and loaded in with fread
  if (names(x)[1] == "eid" && all(is.na(x[[1]]))) {
    warning("Fixing malformed column headers for loaded dataset generated by ukbconv txt")
    setnames(x, names(x), c("V1", names(x)[-ncol(x)]))
  }

  # Give field ids consistent names based on data format
  if (data_format == "raw") {
    setnames(x, gsub("-|\\.", "_", names(x)))
  } else if (data_format == "ukbtools") {
    setnames(x, gsub(".*_f", "", names(x)))
  } else if (data_format == "ukbconv_r") {
    setnames(x, gsub("\\.", "_", gsub("f\\.", "", names(x))))
  } else if (data_format == "ukb_rap") {
    missing_array_index <- names(x)[which(names(x) %like% "i[0-9]$")]
    setnames(x, missing_array_index, paste0(missing_array_index, "_a0"))
    setnames(x, gsub("_a", "_", names(x)))
    setnames(x, gsub("_i", "_", names(x)))
    setnames(x, gsub("^p", "", names(x)))
  }

  # Extract field IDs present
  field_ids <- data.table(UKB.Field.ID = names(x))
  field_ids[, UKB.Field.ID := suppressWarnings(as.integer(gsub("_.*", "", UKB.Field.ID)))]
  field_ids <- unique(field_ids[!is.na(UKB.Field.ID)])

  # Filter to those we want to extract
  if (type == "biomarkers") {
    field_ids <- field_ids[UKB.Field.ID %in% na.omit(ukbnmr::nmr_info$UKB.Field.ID)]
  } else if (type == "biomarker_qc_flags") {
    # Not all biomarkers have QC flags, but we need to know where the column is
    # missing due to this, or due to the user not having access.
    biomarker_fields <- na.omit(ukbnmr::nmr_info$UKB.Field.ID)
    biomarkers_present <- field_ids[UKB.Field.ID %in% biomarker_fields]
    biomarkers_present[ukbnmr::nmr_info, on = list(UKB.Field.ID), QC.Flag.Field.ID := QC.Flag.Field.ID]
    qc_flag_fields <- na.omit(ukbnmr::nmr_info$QC.Flag.Field.ID)
    qc_flags_present <- field_ids[UKB.Field.ID %in% qc_flag_fields]
    empty_flags_with_biomarkers <- biomarkers_present[!(QC.Flag.Field.ID %in% qc_flags_present$UKB.Field.ID)]
    field_ids <- rbind(qc_flags_present, empty_flags_with_biomarkers[,list(UKB.Field.ID=na.omit(QC.Flag.Field.ID))])
  } else if (type == "sample_qc_flags") {
    field_ids <- field_ids[UKB.Field.ID %in% na.omit(ukbnmr::sample_qc_info$UKB.Field.ID)]
  } else {
    stop("internal error: 'type' must be one of \"biomarkers\", \"biomarker_qc_flags\", or \"sample_qc_flags\"")
  }

  # Map to biomarker variable names
  if (type == "biomarkers") {
    field_ids[ukbnmr::nmr_info, on = list(UKB.Field.ID), Biomarker := Biomarker]
  } else if (type == "biomarker_qc_flags") {
    field_ids[ukbnmr::nmr_info, on = list(UKB.Field.ID=QC.Flag.Field.ID), Biomarker := Biomarker]
  } else if (type == "sample_qc_flags") {
    field_ids[ukbnmr::sample_qc_info, on = list(UKB.Field.ID), Biomarker := Name] # Not biomarkers but harmonises with rest of code
  }

  # Split out instance (visit) and array index (repeat measure) fields so they
  # are rows instead of columns
  visit_repeats <- setdiff(unique(gsub("^[0-9]+_", "", names(x))), "eid")
  x <- rbindlist(fill=TRUE, use.names=TRUE, lapply(visit_repeats, function(vr) {
    # Find columns matching this visit repeat pair (e.g. ending in _0_0)
    this_cols <- names(x)[grepl(pattern=paste0(vr, "$"), names(x))]

    # Filter to these columns
    this_x <- x[, .SD, .SDcols=c("eid", this_cols)]

    # Drop repeat visit pair label from column name
    setnames(this_x, this_cols, gsub(paste0("_", vr, "$"), "", this_cols))

    # Get fields present in the visit repeat pair
    this_fields <- intersect(names(this_x), field_ids$UKB.Field.ID)

    # Return empty for this visit repeat pair if no fields of interest present
    if (length(this_fields) == 0) {
      return(NULL)
    }

    # Add columns for visit and repeat index
    this_x[, visit_index := as.integer(gsub("_.*", "", vr))]
    this_x[, repeat_index := as.integer(gsub(".*_", "", vr))]

    # Filter to field IDs of interest
    this_x <- this_x[, .SD, .SDcols=c("eid", "visit_index", "repeat_index", this_fields)]

    # For biomarkers with no QC flags, add in missing columns
    miss_fields <- setdiff(field_ids$UKB.Field.ID, names(this_x))
    if (length(miss_fields) > 0) {
      for (fid in miss_fields) {
        this_x[, as.character(fid) := NA_integer_]
      }
      this_fields <- intersect(names(this_x), field_ids$UKB.Field.ID)
    }

    # Rename Field IDs to biomarker variable names
    this_names <- field_ids[UKB.Field.ID %in% this_fields]
    setnames(this_x, as.character(this_names$UKB.Field.ID), this_names$Biomarker)

    # Shipment.Plate may be integer64, convert to char
    if ("Shipment.Plate" %in% names(this_x) & inherits(this_x$Shipment.Plate, "integer64")) {
      this_x[, Shipment.Plate := as.character(Shipment.Plate)]
    } else if ("Shipment.Plate" %in% names(this_x)) {
      # If already a character string, may have "" instead of NA_character_ for
      # samples without NMR data
      this_x[Shipment.Plate == "", Shipment.Plate := NA_character_]
    }

    # Some columns may have "" instead of NA_character_ for samples without NMR data, fix
    for (this_col in names(this_x)[which(sapply(this_x, is.character))]) {
      setnames(this_x, this_col, "tmp_cname")
      this_x[tmp_cname == "", tmp_cname := NA_character_]
      setnames(this_x, "tmp_cname", this_col)
    }

    # Well.Position.Within.Plate may have lowercase row letters in the July 2023
    # UKB data release, fix:
    if ("Well.Position.Within.Plate" %in% names(this_x)) {
      this_x[, Well.Position.Within.Plate := toupper(Well.Position.Within.Plate)]
    }

    # Drop instance and array index combinations with all missing data
    # eid, visit_index, and array_index always non-missing
    this_x <- this_x[apply(this_x, 1, function(row) { sum(!is.na(row)) > 3L })]

    # Return to rbindlist - which will row-bind all data.tables in the list
    # (one data.table per visit repeat pair)
    return(this_x)
  }))


  # For biomarker QC flags, map integers to flag values:
  # https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=2310
  if (type == "biomarker_qc_flags") {
    x <- get_biomarker_qc_flag_values(x)
  } else if (type == "sample_qc_flags") {
    x <- get_sample_qc_flag_values(x)
  }

  # Remove repeat_index column no longer needed
  x[, repeat_index := NULL]

  # Set column keys for fast joining by user
  setkeyv(x, c("eid", "visit_index"))

  # Finished processing
  return(x)
}

# Map integer representation of biomarker QC flags to string
# https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=2310
get_biomarker_qc_flag_values <- function(x) {
  # Silence CRAN NOTES about undefined global variables (columns in data.tables)
  value <- visit_index <- repeat_index <- integer_rep <- flag <- variable <-
    eid <- NULL

  x <- suppressWarnings(melt(x, id.vars=c("eid", "visit_index", "repeat_index")))

  # Sometimes the integer representations have already been mapped to their
  # character representations, e.g. when loading data using the Rscript made by
  # ukbconv r, in which case we can skip the mapping step.
  if (is.integer(x$value)) { # check if values are integer (or character representations of integer values)
    if (!inherits(x$value, "integer")) {
      x[, value := as.integer(value)]
    }

    x[biomarker_flag_map, on = list(value=integer_rep), flag := flag]
    x[, value := NULL]
    setnames(x, "flag", "value")
  }

  # Where there are multiple flags per visit_index, concatenate
  x <- x[, list(repeat_index=0,
    value=ifelse(all(is.na(value)), NA_character_, paste(sort(na.omit(value)), collapse="; "))),
    by=list(eid, visit_index, variable)]

  # Cast back to wide format
  x <- dcast(x, eid + visit_index + repeat_index ~ variable, value.var="value")

  return(x)
}

# Map integer representation of biomarker QC flags to string
# https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=2310
get_sample_qc_flag_values <- function(x) {
  # Silence CRAN NOTES about undefined global variables (columns in data.tables)
  Spectrometer <- Shipment.Plate <- High.Lactate <- High.Pyruvate <-
    Low.Glucose <- Low.Protein <- Measurement.Quality.Flagged <- integer_rep <-
    flag <- Resolved.Plate.Swaps <- Processing.Batch <- NULL

  if (exists("High.Lactate", where=x))
    x[, High.Lactate := ifelse(is.na(High.Lactate), NA_character_, "Yes")]
  if (exists("High.Pyruvate", where=x))
    x[, High.Pyruvate := ifelse(is.na(High.Pyruvate), NA_character_, "Yes")]
  if (exists("Low.Glucose", where=x))
    x[, Low.Glucose := ifelse(is.na(Low.Glucose), NA_character_, "Yes")]
  if (exists("Low.Protein", where=x))
    x[, Low.Protein := ifelse(is.na(Low.Protein), NA_character_, "Yes")]
  if (exists("Resolved.Plate.Swaps", where=x))
    x[, Resolved.Plate.Swaps := ifelse(is.na(Resolved.Plate.Swaps), NA_character_, "Yes")]

  if ("Spectrometer" %in% names(x)) {
    if (is.integer(x$Spectrometer)) {
      x[, Spectrometer := ifelse(is.na(Spectrometer), NA_character_, paste("Spectrometer", Spectrometer))]
    } else if (is.factor(x$Spectrometer)) {
      x[, Spectrometer := as.character(Spectrometer)]
    }
  }

  if ("Measurement.Quality.Flagged" %in% names(x)) {
    if (is.integer(x$Measurement.Quality.Flagged)) {
      x[, Measurement.Quality.Flagged := as.character(Measurement.Quality.Flagged)]
      x[measure_quality_map, on = list(Measurement.Quality.Flagged=integer_rep), Measurement.Quality.Flagged := flag]
    } else if (is.factor(x$Spectrometer)) {
      x[, Measurement.Quality.Flagged := as.character(Measurement.Quality.Flagged)]
    }
  }

  if ("Shipment.Plate" %in% names(x)) {
    if (bit64::is.integer64(x$Shipment.Plate)) {
      x[, Shipment.Plate := as.character(Shipment.Plate)]
    }
    if (!is.character(x$Shipment.Plate) | !grepl("^Plate", stats::na.omit(x$Shipment.Plate)[1])) {
      x[, Shipment.Plate := ifelse(is.na(Shipment.Plate), NA_character_, paste("Plate", Shipment.Plate))]
    }
    # Sometimes the leading 0 gets dropped (e.g. if loaded as integer64), add these back in to match UK Biobank release
    x[!(Shipment.Plate %like% "^Plate 0"), Shipment.Plate := gsub("^Plate ", "Plate 0", Shipment.Plate)]
  }

  if ("Processing.Batch" %in% names(x)) {
    if (!is.character(x$Processing.Batch) | !grepl("^Plate", stats::na.omit(x$Processing.Batch)[1])) {
      x[, Processing.Batch := ifelse(is.na(Processing.Batch), NA_character_, paste("Batch", Processing.Batch))]
    }
  }

  return(x)
}
sritchie73/ukbnmr documentation built on Nov. 24, 2024, 8:51 p.m.