R/prepare_ukb_data.R

Defines functions harmonize_ukb_data read_definition_table isNullNaNan

Documented in harmonize_ukb_data isNullNaNan read_definition_table

#' Check if string variable is valid
#'
#' Any of these will cause file.exists() to throw an error
#' @param x
#' @return  logical TRUE / FALSE
#' @export
isNullNaNan = function(x) {
  # "short-circuit" operator || : If condition1 is true, condition 2 and 3 will NOT be checked.
  # which is important, because is.na/is.nan throws an error when input is NULL
  any(is.null(x))  || any(is.na(x))  || any(is.nan(x))
}

#' Process definition table
#'
#' Given a definition table, check, clean, format as well as to expand the codes.
#' @param f.definition path to definition table (.tsv)
#' @param f.data.settings data frame containing data settings
#' @param dir.code.map path to directory of the code mappings
#' @return  expanded definition table
#' @keywords definitions
#' @export
#' @examples
#' dfDefinitions_processed_expanded<-read_defnition_table(fdefinitions,fdata_setting,dir.code.map=paste0(repo_dir,"inst/extdata/"))
read_definition_table <-function(f.definition,f.data.setting,dir.code.map){
  dfData.settings <-data.table::fread(f.data.setting)
  dfDefinitions <- data.table::fread(f.definition, colClasses = 'character', data.table = FALSE)
  dfDefinitions_processed <- ProcessDfDefinitions(dfDefinitions)

  # add a slash if the path does not contain a slash
  if (stringr::str_detect(dir.code.map,"/$",negate=TRUE)){
    if( .Platform$OS.type == "windows" ){
      dir.code.map<-paste0(dir.code.map,"\\")
    }else{
      dir.code.map<-paste0(dir.code.map,"/")
    }
  }
  # Check the codes in the definition table against the code maps
  # The code maps are either downloaded from UK biobank showcase (codingxx.tsv) [search corresponding Data-Coding in showcase] for which has been checked to include all codes present in data # or to be created from the current data at hand (.code)
  missing_codes<-check_dfDefinitions_codes(dfDefinitions_processed,dfData.settings,code_map_dir=dir.code.map,F)
  # WHAT IT MEANS: Codes that are not present will be removed in the expand_dfDefinitions...() as the they will be removed
  # in theory these extra codes will not cause errors of the pipeline unless the whole line is empty (no codes from any source)
  dfDefinitions_processed_expanded <-expand_dfDefinitions_processed2(dfDefinitions_processed,dfData.settings,code_map_dir=dir.code.map )
  return(dfDefinitions_processed_expanded)
}



#' Prepare harmonized long format data from various data sources
#'
#' This function reads ukb files including main dataset (ukbxxxxx.tab) and record based (.txt) files from data portal. It returns a list of 3 elements:
#' lst.data: event tables by episodes organized by source as well as classification system
#' dfukb: the main dataset with columns required for generating lst.data
#' vct.identifiers: a vector of participant identifiers from dfukb
#' @param f.ukbtab Path to the main dataset (.tab) file
#' @param f.html Path to html file containing the metadata of the main dataset which can be generated using ukb utility
#' @param dfDefinitions A processed and expanded definition table (data.table object), which can be generated by `read_defnition_table()`
#' @param f.hesin Path to HESIN (master file), RECORD LEVEL DATA
#' @param f.hesin_diag Path to HESIN_DIAG file containing diagnosis codes, RECORD LEVEL DATA
#' @param f.hesin_oper Path to HESIN_OPER file containing Operations and procedural codes, RECORD LEVEL DATA
#' @param f.death_portal Path to file with DEATH table, RECORD LEVEL DATA
#' @param f.death_cause_portal Path to file with DEATH_CAUSE table, RECORD LEVEL DATA
#' @param f.gp_clinical Path to GP clinical event records, RECORD LEVEL DATA
#' @param f.gp_scripts Path to GP prescription event records, RECORD LEVEL DATA
#' @param f.withdrawal_list Path to participant withdrawal list (.csv)
#' @param allow_missing_fields Logical flag specifying whether missing data field(s) is allowed (ignored) by the function. If FALSE, function will halt if any field is missing from the main dataset
#' @param death_from_portal Logical flag specifying whether death records will be read from data portal files and from the main dataset. The main dataset will be taken if the files from data portal are not present (readable).
#' @param add_extra_hesin_columns  if True, adds extra columns "ins_index","source"
#' @return   main dataset as dataframe with only selected data fields
#' @export
#' @examples
#' lst.harmonized.data<-harmonize_ukb_data(f.ukbtab = fukbtab,f.html = fhtml,f.gp_clinical = fgp_clinical,f.gp_scripts = fgp_scripts,f.hesin = fhesin,f.hesin_diag = fhesin_diag,f.hesin_oper =fhesin_oper,f.death_portal = fdeath_portal,f.death_cause_portal = fdeath_cause_portal )
#' summary(lst.harmonized.data)
harmonize_ukb_data <- function(f.ukbtab=NULL,f.html=NULL,dfDefinitions=NULL,f.hesin=NULL,f.hesin_diag=NULL,f.hesin_oper=NULL,f.death_portal=NULL,f.death_cause_portal=NULL,f.gp_clinical=NULL,f.gp_scripts=NULL,f.withdrawal_list=NULL,allow_missing_fields=TRUE,death_from_portal=TRUE,add_extra_hesin_columns=F,...){
  message("Start data harmonization")

  if (!(isNullNaNan(f.html)&isNullNaNan(f.ukbtab))){
    message("Read metadata file ")
    # # ukb's .tab meta data which can be generated using ukbconv
    dfhtml <- read_ukb_metadata(f.html)
  }else{
    message("Required metadata file (.html) or (.tab) is missing, please generate it with ukbconv. Exit!")
    return()
  }
  # use case 1 : no definition table , output default dataset
  if (is.null(dfDefinitions)){
    message("No definition file supplied, extract default fields from the main dataset")
    tictoc::tic("time elapsed processing .tab file")
    # extract default columns from the .tab file
    dfukb <- read_ukb_tabdata(f.ukbtab,dfhtml)
    tictoc::toc()
  }else{
    # use case 2 : with definition table,  extract only relevant fields needed
    # we then use this meta-data to check if all fields we need are inside the .tab file
    # the function outputs a list of fields that are required
    message("Verify if all required fields from the definitions are present in the main dataset (.tab)")
    dfDefinitions_ukb_fields <- get_allvarnames(dfDefinitions,dfhtml,allow_miss = allow_missing_fields)
    # get_allvarnames() returns null if a field is missing
    if (is.null(dfDefinitions_ukb_fields) ){
        message("Please ensure all required fields are present in the main dataset, abort!")
        return()
      }

      message("Read .tab file and retrieve required fields.")
      tictoc::tic()
      # Next is to extract those columns required from the .tab file
      dfukb <- read_ukb_tabdata(f.ukbtab,dfhtml,fields_to_keep = dfDefinitions_ukb_fields$all_ukb_fields)
      tictoc::toc()

  }
  # vector of eid needed for subsequent function
  vct.identifiers <- as.numeric(dfukb$identifier)
  gc()


  ################################################################################
  # converting data
  ################################################################################
  lst.data <- list()

  ##############################
  ## from main dataset .tab
  #############################
  #  extra check for column name in main dataset
  dfukb_colnames<-colnames(dfukb)
  f53 <- dfukb_colnames[grepl(paste0("[^0-9]","53","[^0-9]"),dfukb_colnames)]
  if (length(f53)==0){
    message("Unable to extract visit dates fields (field 53) \nPlease check if they are present OR column names are named f.53.y.z (non-numeric character before & after the field [^0-9]53[^0-9])\nConversion with r flag will give the right column headers`ukbconv ukb12345.enc_ukb r`")
    return()
  }

  ################################################################################
  ### touchscreen (event==2: only the first occurence is an event)
  ################################################################################
  # if no definition table is supplied, only default fields below are processed
  # check if TS fields contains input, skipping those with NA
  if (any(nchar(dfDefinitions$TS)>0,na.rm=TRUE)){
    message("Convert touchscreen data")
    lst.data$ts <- convert_touchscreen_to_episodedata(dfukb,ts_conditions = dfDefinitions$TS)
  }

  ################################################################################
  ### self reported data  (event==2: only the first occurence is an event)
  ################################################################################

  message("Convert self-reported data")
  message(" ->Self-reported cancer codes (field 20001)")
  # cancer
  lst.data$tte.sr.20001 <- convert_nurseinterview_to_episodedata(dfukb,field_sr_diagnosis = "20001",field_sr_date = "20006",qc_threshold_year = 10)
  message(" ->Self-reported non-cancer codes(field 20002)")
  # non cancer
  lst.data$tte.sr.20002 <- convert_nurseinterview_to_episodedata(dfukb,field_sr_diagnosis = "20002",field_sr_date = "20008",qc_threshold_year = 10)
  message(" ->Self-reported operation codes(field 20004)")
  # operation  ,  here we set event=1 to allow multiple operations happening at different time
  lst.data$tte.sr.20004 <- convert_nurseinterview_to_episodedata(dfukb,field_sr_diagnosis = "20004",field_sr_date = "20010",qc_threshold_year = 10,event_code=1)
  message(" ->Self-reported medication codes(field 20003)")
  # medication (event==0, since no age of diagnosis)
  lst.data$sr.20003 <- convert_nurseinterview_to_episodedata(dfukb,field_sr_diagnosis = "20003",field_sr_date = "",qc_threshold_year = 10)

  ################################################################################
  ### cancer registry
  ################################################################################
  message("Convert cancer registry data(field 40006/40013)")
  # qc_threshold set to zeros, because cancer recurrence is fairly common (?) <-set event=1 to not combine the multiple occurences
  lst.data$tte.cancer.icd10 <- convert_cancerregister_to_episodedata(dfukb,field_diagnosis = "40006",field_date = "40005",field_date_type="date",qc_threshold_year = 0,codetype = "character",event_code=1)
  lst.data$tte.cancer.icd9 <- convert_cancerregister_to_episodedata(dfukb,field_diagnosis = "40013",field_date = "40005",field_date_type="date",qc_threshold_year = 0,codetype ="character",event_code=1)

  ###################################################
  ## from data portal
  ###################################################

  ################################################################################
  ### death record
  ################################################################################
  # take only the death record from portal for now as they are updated more frequently.
  # death registry from tab file (event==1: every occurence is a real event)
  # death registry from data portal , same data as the main dataset but more up to date, refer document DeathLinkage

    if (!isNullNaNan(f.death_portal) & !isNullNaNan(f.death_cause_portal) & death_from_portal){
      message("Convert death registry data(data portal)")
      lst.data_dth <- read_death_data(f.death_portal,f.death_cause_portal)
      lst.data$tte.death.icd10.primary <-lst.data_dth$primary
      lst.data$tte.death.icd10.secondary<-lst.data_dth$secondary
      rm(lst.data_dth)
    }else{
      message("Convert death registry data(field 40000/40001/40002)")
      lst.data$tte.death.icd10.primary <- convert_nurseinterview_to_episodedata(dfukb,field_sr_diagnosis = "40001",field_sr_date = "40000",field_sr_date_type="date",qc_threshold_year = 10,event_code=1) # death
      lst.data$tte.death.icd10.secondary <- convert_nurseinterview_to_episodedata(dfukb,field_sr_diagnosis = "40002",field_sr_date = "40000",field_sr_date_type="date",qc_threshold_year = 10,event_code=1) # death
    }


  ################################################################################
  ###  hesin  (event==1)
  ################################################################################
  if (!isNullNaNan(f.hesin)& !isNullNaNan(f.hesin_diag)& !isNullNaNan(f.hesin_oper)){

      tictoc::tic("Convert Hospital Inpatient data(data portal)")
      lst.data <- append(lst.data,read_hesin_data(f.hesin ,f.hesin_diag ,f.hesin_oper,add_extra_hesin_columns)) #tte.hes.primary + tte.hes.secondary
      tictoc::toc()
  }

  ################################################################################
  ### primary care, gp  (event==1)
  ################################################################################
  if (!(isNullNaNan(f.gp_clinical))){
    tictoc::tic("Convert GP diagnosis data(data portal")
    lst.data <- append(lst.data,read_gp_clinical_data(fgp=f.gp_clinical))
    tictoc::toc()
    gc()
  }
  if (!(isNullNaNan(f.gp_scripts))){
    tictoc::tic("Convert GP prescription data(data portal")
    # this one does medication
    lst.data <- append(lst.data,read_gp_script_data(fgp=f.gp_scripts))
    gc()
    tictoc::toc()
  }

  # make sure eeverything is in the right format:
  lst.data <- lapply(lst.data,function(x) {setkey(x,code) })
  # lst.data <- lapply(lst.data,function(x) {x[, ('f.eid') := lapply(.SD, as.character), .SDcols = 'f.eid'] })
  # lst.data <- lapply(lst.data,function(x) {x[, ('f.eid') := lapply(.SD, as.numeric), .SDcols = 'f.eid'] })
  lst.data <- lapply(lst.data,function(x) {x[, ('identifier') := lapply(.SD, as.numeric), .SDcols = 'identifier'] })

  lst.data <- lapply(lst.data,function(x) {x[,'eventdate'] <-  round(x$eventdate);return(x) })

  if (! is.null(f.withdrawal_list)){
    message(paste0("Remove records corresponding to all withdrawn partipants specified in ",f.withdrawal_list))
    df_withdrawal<-fread(f.withdrawal_list)
    message(glue::glue("*** {nrow(df_withdrawal)} participants on the withdrawal list ***"))
    lst.data <- lapply(lst.data, function(x) {
      x<-x[!identifier %in% df_withdrawal$V1]
    })
    dfukb<-dfukb[! identifier %in% df_withdrawal$V1]
    vct.identifiers<-vct.identifiers[!vct.identifiers %in% df_withdrawal$V1]
  }



  return(list(lst.data=lst.data,dfukb=dfukb,vct.identifiers=vct.identifiers))

}
niekverw/ukbpheno documentation built on Oct. 30, 2023, 9:17 p.m.