R/diagnoses_table.R

Defines functions diagnoses_table

Documented in diagnoses_table

#' Build a per-participant table of requested diagnosis indicators
#'
#' A tidyUkBioBank function. Returns a table with an `eid` column, one column
#' per requested ICD / self-reported / cause-of-death code, a presence column
#' per requested source, and a combined column covering every requested
#' source. Indicator columns contain 1 when the `eid` has the code(s) of
#' interest and 0 otherwise.
#'
#' @param dataframe the original phenotype data frame containing all
#'   individuals in the UK Biobank (~500,000 rows x ~18,000 columns as of
#'   09/07/2021). Must contain an `eid` column.
#' @param ... optional named arguments selecting which diagnoses to look up:
#'   \describe{
#'     \item{icd_codes}{ICD codes of interest. Codes in which a letter is
#'       followed by a digit are treated as ICD-10; all other codes are
#'       treated as ICD-9. The legacy name `icd_code_list` is also accepted
#'       and is used when `icd_codes` is not supplied.}
#'     \item{self_reported}{self-reported codes of interest, e.g.
#'       `self_reported = sr_list`. Codes containing the text "cancer" have
#'       that text (plus the first underscore and the first space) removed
#'       and are looked up with the cancer flag set.}
#'     \item{cause_of_death}{cause-of-death codes of interest, e.g.
#'       `cause_of_death = cod_list`. The resulting per-code columns are
#'       prefixed with `COD_`.}
#'   }
#' @return A data frame with one row per `eid` containing the per-code
#'   columns produced by the lookup helpers, `Total_Sums_Icd10` /
#'   `Presence_of_Icd10_dx` and `Total_Sums_Icd9` / `Presence_of_Icd9_dx`
#'   (only when codes of that ICD version were requested),
#'   `Presence_of_Self_Reported_DX` and `Presence_of_Cause_of_Death_DX` (only
#'   when requested), and `Presence_of_Any_Requested_DX`.
#' @keywords diagnoses table
#' @export
#' @examples
#' \dontrun{
#' dx <- diagnoses_table(
#'   phenotype_df,
#'   icd_codes = icd_list,
#'   self_reported = sr_list,
#'   cause_of_death = cod_list
#' )
#' }
diagnoses_table <- function(dataframe, ...) {
  arguments <- list(...)
  sr_codes <- arguments$self_reported
  cod_codes <- arguments$cause_of_death
  # `icd_codes` is the documented name; `icd_code_list` is the name this
  # function historically read, so it is kept as a fallback.
  icd_list <- arguments[["icd_codes"]]
  if (is.null(icd_list)) {
    icd_list <- arguments$icd_code_list
  }

  # ---- ICD codes ----------------------------------------------------------
  if (length(icd_list) == 0) {
    print("ICD code information not requested")
  }
  # grep() on NULL / empty input yields character(0), so both lists are
  # simply empty when no ICD codes were requested.
  icd10_pattern <- "([A-Za-z].*[0-9])|[0-9].*[A-Za-z].*[0-9]"
  icd10_list <- grep(icd10_pattern, icd_list, value = TRUE)
  icd9_list <- grep(icd10_pattern, icd_list, value = TRUE, invert = TRUE)
  has_icd10 <- length(icd10_list) > 0
  has_icd9 <- length(icd9_list) > 0

  if (has_icd10) {
    dx_df_icd10 <- map2(icd10_list, icd10_list, dx_hx, dataframe) %>%
      reduce(left_join) %>%
      add_dx_summary("Total_Sums_Icd10", "Presence_of_Icd10_dx")
  } else {
    # Placeholder so the combined row sum below always finds the column.
    dx_df_icd10 <- data.frame(eid = dataframe$eid, Total_Sums_Icd10 = 0)
  }

  if (has_icd9) {
    dx_df_icd9 <- map2(icd9_list, icd9_list, dx_hx, dataframe) %>%
      reduce(left_join) %>%
      add_dx_summary("Total_Sums_Icd9", "Presence_of_Icd9_dx")
  } else {
    dx_df_icd9 <- data.frame(eid = dataframe$eid, Total_Sums_Icd9 = 0)
  }

  # ---- Self-reported diagnoses --------------------------------------------
  has_sr <- length(sr_codes) > 0
  if (has_sr) {
    # One flag per code: a single scalar test would fail (or only look at
    # the first code) as soon as more than one code is supplied.
    is_cancer <- str_detect(sr_codes, "cancer")
    is_cancer[is.na(is_cancer)] <- FALSE
    if (any(is_cancer)) {
      sr_codes[is_cancer] <- sr_codes[is_cancer] %>%
        str_remove("cancer") %>%
        str_remove("_") %>%
        str_remove(" ")
    }
    dx_df_sr <- map2(
      sr_codes, is_cancer,
      function(code, cancer_flag) dx_sr(code, code, dataframe, cancer_flag)
    ) %>%
      reduce(left_join) %>%
      add_dx_summary("Total_Sums_Self_Reported", "Presence_of_Self_Reported_DX")
  } else {
    print("Self Reported stats not requested")
    dx_df_sr <- data.frame(
      eid = dataframe$eid,
      Presence_of_Self_Reported_DX = 0,
      Total_Sums_Self_Reported = 0
    )
  }

  # ---- Cause of death -----------------------------------------------------
  has_cod <- length(cod_codes) > 0
  if (has_cod) {
    dx_df_cod <- map2(cod_codes, cod_codes, dx_cod, dataframe) %>%
      reduce(left_join)
    # Prefix every column except the leading `eid` so cause-of-death columns
    # cannot be confused with ICD columns for the same code.
    colnames(dx_df_cod) <- c(
      "eid",
      paste("COD", colnames(dx_df_cod)[-1], sep = "_")
    )
    dx_df_cod <- add_dx_summary(
      dx_df_cod,
      "Total_Sums_Cause_of_Death",
      "Presence_of_Cause_of_Death_DX"
    )
  } else {
    dx_df_cod <- data.frame(
      eid = dataframe$eid,
      Presence_of_Cause_of_Death_DX = 0,
      Total_Sums_Cause_of_Death = 0
    )
  }

  # ---- Combine all sources ------------------------------------------------
  history_df <- list(dx_df_icd10, dx_df_icd9, dx_df_sr, dx_df_cod) %>%
    reduce(left_join) %>%
    mutate(
      Sum_of_All_Diagnoses = rowSums(select(
        .,
        Total_Sums_Icd10,
        Total_Sums_Icd9,
        Total_Sums_Self_Reported,
        Total_Sums_Cause_of_Death
      ))
    ) %>%
    mutate(
      Presence_of_Any_Requested_DX = case_when(
        Sum_of_All_Diagnoses > 0 ~ 1,
        Sum_of_All_Diagnoses < 1 ~ 0
      )
    ) %>%
    select(
      -Sum_of_All_Diagnoses,
      -Total_Sums_Cause_of_Death,
      -Total_Sums_Self_Reported
    )

  # Drop the all-zero placeholder columns of sources that were not requested.
  placeholder_cols <- c(
    if (!has_icd9) "Total_Sums_Icd9",
    if (!has_icd10) "Total_Sums_Icd10",
    if (!has_sr) "Presence_of_Self_Reported_DX",
    if (!has_cod) "Presence_of_Cause_of_Death_DX"
  )
  history_df[setdiff(colnames(history_df), placeholder_cols)]
}

# Internal helper: appends a row-total column (sum of every column except
# `eid`) named `total_col`, and a 0/1 column named `presence_col` that is 1
# when that total is positive.
add_dx_summary <- function(df, total_col, presence_col) {
  totals <- rowSums(select(df, -eid))
  df[[total_col]] <- totals
  df[[presence_col]] <- case_when(totals > 0 ~ 1, totals < 1 ~ 0)
  df
}
Lab-Jaiswal/tidyUkBioBank documentation built on Sept. 10, 2023, 5:13 p.m.