testfun/R/testf-function.R

testf <- function(data_l) {
  
  drop_cols <- list()
  for(i in 1:length(data_l)) drop_cols[[i]] <- grep( 
    "location_id|provider_id|care_site_id|person_source_value|gender_source_value|gender_source_concept_id|race_source_value|race_source_concept_id|
    ethnicity_source_value|ethnicity_source_concept_id|cause_concept_id|cause_source_value|cause_source_concept_id|visit_start_time|visit_end_time|
    visit_source_value|visit_source_concept_id|modifier_concept_id|quantity|procedure_source_value|procedure_source_concept_id|qualifier_source_value|
    drug_exposure_end_date|stop_reason|refills|days_supply|sig|route_concept_id|effective_drug_dose|dose_unit_concept_ id|lot_number|drug_source_value|
    drug_source_concept_id|route_source_value|dose_unit_source_value|condition_end_date|condition_source_value|condition_source_concept_id|
    operator_concept_id|value_as_concept_id|measurement_source_value|measurement_source_concept_id|unit_source_value|value_source_value|
    observation_time|value_as_number|qualifier_concept_id|observation_source_value|observation_source_concept_id", 
    names(data_l[[i]]))
  
  for(i in 1:length(data_l)) data_l[[i]][, drop_cols[[i]] := NULL]
  
  #remove all na columns
  test <- lapply(data_l, function (x) x[,which(unlist(lapply(x, function(y)!all(is.na(y))))),with=F])
  rm(data_l)
  
  ##DRUG EXPOSURE CLEAN UP
  #calculate days of drug usage - create new column
  #test[[3]]$drug_exposure_days <- as.integer(as.Date(test[[3]]$verbatim_end_date, format = "%Y-%m-%d") - as.Date(test[[3]]$drug_exposure_start_datetime, format = "%Y-%m-%d")) 
  test[["drug_exposure"]][["drug_exposure_days"]] <- as.integer(as.Date(test[["drug_exposure"]][["verbatim_end_date"]], format = "%Y-%m-%d") - as.Date(test[["drug_exposure"]][["drug_exposure_start_datetime"]], format = "%Y-%m-%d")) 
  
  #keep only those with a valid number of days (>=0)
  #test[[3]] <- test[[3]][drug_exposure_days >= 0]
  test[["drug_exposure"]] <- test[["drug_exposure"]][drug_exposure_days >= 0]
  
  #drop redundant columns
  test[["drug_exposure"]][ , (c("drug_exposure_start_date","drug_exposure_start_datetime","drug_type_concept_id",
                                "drug_source_concept_id", "drug_exposure_id")) := NULL ]
  
  #drop NAs based on drug_concept_id
  test[["drug_exposure"]]<- test[["drug_exposure"]][drug_concept_id != "NA"]
  
  
  ##MEASUREMENT CLEAN UP
  test[["measurement"]][, (c("unit_concept_id", "measurement_id", "measurement_datetime", 
                             "measurement_type_concept_id")) := NULL]
  
  test[["measurement"]] <- test[["measurement"]][measurement_concept_id != "NA"]
  
  ##CONDITION OCCURENCE CLEAN UP
  test[["condition_occurrence"]][, (c("condition_start_datetime", "condition_status_concept_id", 
                                      "condition_type_concept_id", "condition_occurrence_id")) := NULL]
  
  test[["condition_occurrence"]] <- test[["condition_occurrence"]][condition_concept_id != "NA"]
  
  
  ##PROCEDURE CLEAN UP
  test[["procedure_occurrence"]][, (c("procedure_datetime", "procedure_type_concept_id", 
                                      "procedure_occurrence_id")) := NULL]
  
  test[["procedure_occurrence"]] <- test[["procedure_occurrence"]][procedure_concept_id != "NA"]
  
  ##PERSON CLEAN UP
  test[["person"]][, (c("month_of_birth", "day_of_birth", "birth_datetime", 
                        "ethnicity_source_value")) := NULL]
  
  ##VISIT OCCURENCE CLEAN UP
  #calculate days of drug usage - create new column
  test[["visit_occurrence"]][["visit_days"]] <- as.integer(as.Date(test[["visit_occurrence"]][["visit_end_date"]], format = "%Y-%m-%d") - as.Date(test[["visit_occurrence"]][["visit_start_date"]], format = "%Y-%m-%d")) 
  #keep only those with a valid number of days (>=0)
  test[["visit_occurrence"]] <- test[["visit_occurrence"]][visit_days >= 0]
  
  #drop redundant columns
  test[["visit_occurrence"]][, (c("visit_start_datetime", "visit_type_concept_id", 
                                  "admitting_source_concept_id", "discharge_to_concept_id", 
                                  "visit_source_value","visit_end_datetime","visit_start_date")) := NULL]
  
  test[["visit_occurrence"]] <- test[["visit_occurrence"]][visit_concept_id != "NA"]
  
  
  ##OBSERVATION OCCURENCE CLEAN UP
  test[["observation"]][, (c("observation_type_concept_id", "observation_id", 
                             "observation_datetime", "observation_source_concept_i")) := NULL]
  
  test[["observation"]] <- test[["observation"]][observation_concept_id != "NA"]
  
  ##OBSERVATION PERIOD CLEAN UP
  test[["observation_period"]][["observation_days"]] <- as.integer(as.Date(test[["observation_period"]][["observation_period_end_date"]], format = "%Y-%m-%d") - as.Date(test[["observation_period"]][["observation_period_start_date"]], format = "%Y-%m-%d")) 
  test[["observation_period"]] <- test[["observation_period"]][observation_days >= 0]
  
  test[["observation_period"]][, (c("period_type_concept_id", "observation_period_id",
                                    "observation_period_end_date")) := NULL]
  
  
  ##DEATH CLEAN UP
  test[["death"]][, (c("death_datetime", "death_type_concept_id")) := NULL]
  
  d_date <- test$death
  
  test <- lapply(test, function(x) left_join(x, d_date, by = "person_id"))
  
  
  test[["drug_exposure"]] <- test[["drug_exposure"]] %>%
    mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$verbatim_end_date, format = "%Y-%m-%d"))) %>%
    filter(var >= 0 | is.na(var)) %>%
    select(-c(var, death_date))
  
  test[["condition_occurrence"]] <- test[["condition_occurrence"]] %>%
    mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$condition_start_date, format = "%Y-%m-%d"))) %>%
    filter(var >= 0 | is.na(var)) %>%
    select(-c(var, death_date))
  
  test[["measurement"]] <- test[["measurement"]] %>%
    mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$measurement_date, format = "%Y-%m-%d"))) %>%
    filter(var >= 0 | is.na(var)) %>%
    select(-c(var, death_date))
  
  test[["observation"]] <- test[["observation"]] %>%
    mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$observation_date, format = "%Y-%m-%d"))) %>%
    filter(var >= 0 | is.na(var)) %>%
    select(-c(var, death_date))
  
  test[["observation_period"]] <- test[["observation_period"]] %>%
    mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$observation_period_start_date, format = "%Y-%m-%d"))) %>%
    filter(var >= 0 | is.na(var)) %>%
    select(-c(var, death_date))
  
  test[["person"]] <- test[["person"]] %>%
    select(-c(death_date))
  
  test[["procedure_occurrence"]] <- test[["procedure_occurrence"]] %>%
    mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$procedure_date, format = "%Y-%m-%d"))) %>%
    filter(var >= 0 | is.na(var)) %>%
    select(-c(var, death_date))
  
  test[["visit_occurrence"]] <- test[["visit_occurrence"]] %>%
    mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$visit_end_date, format = "%Y-%m-%d"))) %>%
    filter(var >= 0 | is.na(var)) %>%
    select(-c(var, death_date))
  
  
  
  
  
  
  #Join the dfs
  #by = "person_id"
  group1 <- c("person", "death", "observation_period", "visit_occurrence")
  
  visit_days_sum <- test[group1] %>% 
    purrr::reduce(left_join) %>%
    select(person_id, visit_days) %>%
    group_by(person_id) %>%
    summarise(visit_days = n()) %>%
    select(visit_days) %>%
    unlist %>% unname()
  
  
  
  df1 <- test[group1] %>% 
    purrr::reduce(left_join) %>%
    mutate(death_yesno = case_when(is.na(.data$death_date.x) ~ 0, TRUE ~ 1),
           death_date = as.Date(.data$death_date.x, format = "%Y-%m-%d"),
           visit_end_date = as.Date(.data$visit_end_date, format = "%Y-%m-%d"),
           observation_period_start_date = as.Date(.data$observation_period_start_date, format = "%Y-%m-%d")) %>%
    mutate(last_visit_to_death = as.integer(death_date - visit_end_date)) %>%
    group_by(person_id) %>%
    dplyr::slice(which.max(last_visit_to_death), preserve = TRUE) %>%
    ungroup %>% 
    #select(-c(visit_end_date, observation_period_start_date, death_date, visit_days, visit_concept_id, visit_occurrence_id)) %>%
    distinct(person_id, .keep_all = TRUE) %>%
    mutate(visit_days_sum = visit_days_sum) %>%
    as.data.table()
  
  df1 <-  df1 %>% mutate(death_yesno = ifelse(last_visit_to_death > 180 | is.na(last_visit_to_death), 0, 1)) %>% as.data.table()
  
  procedure_sum <- test[["procedure_occurrence"]] %>%
    group_by(person_id) %>%
    summarise(sum_procedures = n()) %>%
    data.table
  
  most_freq_procedure <- test[["procedure_occurrence"]] %>%
    group_by(person_id) %>%
    dplyr::slice(which.max(procedure_concept_id)) %>%
    select(person_id, procedure_concept_id) %>%
    rename(most_freq_procedure = procedure_concept_id) %>%
    data.table
  
  most_recent_procedure <- test[["procedure_occurrence"]] %>%
    mutate(procedure_date = as.Date(.data$procedure_date, format = "%Y-%m-%d")) %>%
    group_by(person_id) %>%
    dplyr::slice(which.max(procedure_date)) %>%
    select(person_id, procedure_concept_id) %>%
    rename(most_recent_procedure = procedure_concept_id) %>%
    data.table
  
  
  df1 <- left_join(df1, 
                   left_join(procedure_sum, most_freq_procedure, most_recent_procedure, by = 'person_id')) %>%
    as.data.table()
  
  
  drug_exposure_days <- left_join(df1, test[["drug_exposure"]]) %>%
    select(person_id, drug_exposure_days) %>%
    group_by(person_id) %>%
    summarise(drug_exposure_days = sum(drug_exposure_days)) %>%
    mutate(drug_exposure_days = case_when(is.na(drug_exposure_days) ~ 0,
                                          TRUE ~ as.numeric(drug_exposure_days))) %>%
    as.data.table()
  
  
  drugs_sum <- test[["drug_exposure"]] %>%
    group_by(person_id) %>%
    summarise(sum_drugs_used = n()) %>%
    data.table
  
  most_freq_drug <- test[["drug_exposure"]] %>%
    group_by(person_id) %>%
    dplyr::slice(which.max(drug_concept_id)) %>%
    select(person_id, drug_concept_id) %>%
    rename(most_freq_drug = drug_concept_id) %>%
    data.table
  
  df1 <- left_join(df1, left_join(drugs_sum, most_freq_drug, drug_exposure_days, by = "person_id")) %>% 
    as.data.table()
  
  
  conditions_sum <- test[["condition_occurrence"]] %>%
    group_by(person_id) %>%
    summarise(sum_conditions = n()) %>%
    data.table
  
  most_freq_condition <- test[["condition_occurrence"]] %>%
    group_by(person_id) %>%
    dplyr::slice(which.max(condition_concept_id)) %>%
    select(person_id, condition_concept_id) %>%
    rename(most_freq_condition = condition_concept_id) %>%
    data.table
  
  most_recent_condition <- test[["condition_occurrence"]] %>%
    mutate(condition_start_date = as.Date(.data$condition_start_date, format = "%Y-%m-%d")) %>%
    group_by(person_id) %>%
    dplyr::slice(which.max(condition_start_date)) %>%
    select(person_id, condition_concept_id) %>%
    rename(most_recent_condition = condition_concept_id) %>%
    data.table
  
  
  df1 <- left_join(df1, left_join(conditions_sum, most_freq_condition, most_recent_condition, by = "person_id")) %>% 
    as.data.table()
  
  
  measurements_sum <- test[["measurement"]] %>%
    group_by(person_id) %>%
    summarise(sum_measurements = n()) %>%
    data.table
  
  most_freq_measurement <- test[["measurement"]] %>%
    group_by(person_id) %>%
    dplyr::slice(which.max(measurement_concept_id)) %>%
    select(person_id, measurement_concept_id) %>%
    rename(most_freq_measurement = measurement_concept_id) %>%
    data.table
  
  most_recent_measurement <- test[["measurement"]] %>%
    mutate(measurement_date = as.Date(.data$measurement_date, format = "%Y-%m-%d")) %>%
    group_by(person_id) %>%
    dplyr::slice(which.max(measurement_date)) %>%
    select(person_id, measurement_concept_id) %>%
    rename(most_recent_measurement = measurement_concept_id) %>%
    data.table
  
  df1 <- left_join(df1, left_join(measurements_sum, most_freq_measurement, most_recent_measurement, by = "person_id")) %>% 
    as.data.table()
  
  
  observations_sum <- test[["observation"]] %>%
    group_by(person_id) %>%
    summarise(sum_observations = n()) %>%
    data.table
  
  most_freq_observation <- test[["observation"]] %>%
    group_by(person_id) %>%
    dplyr::slice(which.max(observation_concept_id)) %>%
    select(person_id, observation_concept_id) %>%
    rename(most_freq_observation = observation_concept_id) %>%
    data.table
  
  most_recent_observation <- test[["observation"]] %>%
    mutate(observation_date = as.Date(.data$observation_date, format = "%Y-%m-%d")) %>%
    group_by(person_id) %>%
    dplyr::slice(which.max(observation_date)) %>%
    select(person_id, observation_concept_id) %>%
    rename(most_recent_observation = observation_concept_id) %>%
    data.table
  
  df1 <- left_join(df1, left_join(observations_sum, most_freq_observation, most_recent_observation, by = "person_id")) %>% 
    as.data.table()
  
  
  df1 <- df1 %>%
    select(-c(visit_end_date, observation_period_start_date, death_date, visit_days, visit_concept_id, visit_occurrence_id, death_date.x, death_date.y)) %>%
    as.data.table()
  
  #rm(list=setdiff(ls(), c("test", "df1")))
  
  #  df1$death_yesno %>% table
  
  ###########
  
  #apply(df1[,-c(grep('sum', names(df1)), "year_of_birth", "death_yesno" )], 2)
  
  
  df1 <- df1 %>% 
    mutate_if(!grepl('sum|birth|death_yesno', colnames(df1)), as.character) %>% 
    mutate_if(grepl('sum', colnames(df1)), ~replace(., is.na(.), 0)) %>%
    as.data.table()
  
  df1[, (c("observation_days")) := lapply(.SD, as.integer), .SDcols = c("observation_days")]
  
  return(df1)
  
  
}
ptogias/testfun documentation built on Nov. 13, 2019, 12:35 a.m.