testf <- function(x) {
drop_cols <- list()
for(i in 1:length(data_l)) drop_cols[[i]] <- grep(
"location_id|provider_id|care_site_id|person_source_value|gender_source_value|gender_source_concept_id|race_source_value|race_source_concept_id|
ethnicity_source_value|ethnicity_source_concept_id|cause_concept_id|cause_source_value|cause_source_concept_id|visit_start_time|visit_end_time|
visit_source_value|visit_source_concept_id|modifier_concept_id|quantity|procedure_source_value|procedure_source_concept_id|qualifier_source_value|
drug_exposure_end_date|stop_reason|refills|days_supply|sig|route_concept_id|effective_drug_dose|dose_unit_concept_ id|lot_number|drug_source_value|
drug_source_concept_id|route_source_value|dose_unit_source_value|condition_end_date|condition_source_value|condition_source_concept_id|
operator_concept_id|value_as_concept_id|measurement_source_value|measurement_source_concept_id|unit_source_value|value_source_value|
observation_time|value_as_number|qualifier_concept_id|observation_source_value|observation_source_concept_id",
names(data_l[[i]]))
for(i in 1:length(data_l)) data_l[[i]][, drop_cols[[i]] := NULL]
#remove all na columns
test <- lapply(data_l, function (x) x[,which(unlist(lapply(x, function(y)!all(is.na(y))))),with=F])
##DRUG EXPOSURE CLEAN UP
#calculate days of drug usage - create new column
#test[[3]]$drug_exposure_days <- as.integer(as.Date(test[[3]]$verbatim_end_date, format = "%Y-%m-%d") - as.Date(test[[3]]$drug_exposure_start_datetime, format = "%Y-%m-%d"))
test[["drug_exposure"]][["drug_exposure_days"]] <- as.integer(as.Date(test[["drug_exposure"]][["verbatim_end_date"]], format = "%Y-%m-%d") - as.Date(test[["drug_exposure"]][["drug_exposure_start_datetime"]], format = "%Y-%m-%d"))
#keep only those with a valid number of days (>=0)
#test[[3]] <- test[[3]][drug_exposure_days >= 0]
test[["drug_exposure"]] <- test[["drug_exposure"]][drug_exposure_days >= 0]
#drop redundant columns
test[["drug_exposure"]][ , (c("drug_exposure_start_date","drug_exposure_start_datetime","drug_type_concept_id",
"drug_source_concept_id", "drug_exposure_id")) := NULL ]
#drop NAs based on drug_concept_id
test[["drug_exposure"]]<- test[["drug_exposure"]][drug_concept_id != "NA"]
##MEASUREMENT CLEAN UP
test[["measurement"]][, (c("unit_concept_id", "measurement_id", "measurement_datetime",
"measurement_type_concept_id")) := NULL]
test[["measurement"]] <- test[["measurement"]][measurement_concept_id != "NA"]
##CONDITION OCCURENCE CLEAN UP
test[["condition_occurrence"]][, (c("condition_start_datetime", "condition_status_concept_id",
"condition_type_concept_id", "condition_occurrence_id")) := NULL]
test[["condition_occurrence"]] <- test[["condition_occurrence"]][condition_concept_id != "NA"]
##PROCEDURE CLEAN UP
test[["procedure_occurrence"]][, (c("procedure_datetime", "procedure_type_concept_id",
"procedure_occurrence_id")) := NULL]
test[["procedure_occurrence"]] <- test[["procedure_occurrence"]][procedure_concept_id != "NA"]
##PERSON CLEAN UP
test[["person"]][, (c("month_of_birth", "day_of_birth", "birth_datetime",
"ethnicity_source_value")) := NULL]
##VISIT OCCURENCE CLEAN UP
#calculate days of drug usage - create new column
test[["visit_occurrence"]][["visit_days"]] <- as.integer(as.Date(test[["visit_occurrence"]][["visit_end_date"]], format = "%Y-%m-%d") - as.Date(test[["visit_occurrence"]][["visit_start_date"]], format = "%Y-%m-%d"))
#keep only those with a valid number of days (>=0)
test[["visit_occurrence"]] <- test[["visit_occurrence"]][visit_days >= 0]
#drop redundant columns
test[["visit_occurrence"]][, (c("visit_start_datetime", "visit_type_concept_id",
"admitting_source_concept_id", "discharge_to_concept_id",
"visit_source_value","visit_end_datetime","visit_start_date")) := NULL]
test[["visit_occurrence"]] <- test[["visit_occurrence"]][visit_concept_id != "NA"]
##OBSERVATION OCCURENCE CLEAN UP
test[["observation"]][, (c("observation_type_concept_id", "observation_id",
"observation_datetime", "observation_source_concept_i")) := NULL]
test[["observation"]] <- test[["observation"]][observation_concept_id != "NA"]
##OBSERVATION PERIOD CLEAN UP
test[["observation_period"]][["observation_days"]] <- as.integer(as.Date(test[["observation_period"]][["observation_period_end_date"]], format = "%Y-%m-%d") - as.Date(test[["observation_period"]][["observation_period_start_date"]], format = "%Y-%m-%d"))
test[["observation_period"]] <- test[["observation_period"]][observation_days >= 0]
test[["observation_period"]][, (c("period_type_concept_id", "observation_period_id",
"observation_period_end_date")) := NULL]
##DEATH CLEAN UP
test[["death"]][, (c("death_datetime", "death_type_concept_id")) := NULL]
d_date <- test$death
test <- lapply(test, function(x) left_join(x, d_date, by = "person_id"))
test[["drug_exposure"]] <- test[["drug_exposure"]] %>%
mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$verbatim_end_date, format = "%Y-%m-%d"))) %>%
filter(var >= 0 | is.na(var)) %>%
select(-c(var, death_date))
test[["condition_occurrence"]] <- test[["condition_occurrence"]] %>%
mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$condition_start_date, format = "%Y-%m-%d"))) %>%
filter(var >= 0 | is.na(var)) %>%
select(-c(var, death_date))
test[["measurement"]] <- test[["measurement"]] %>%
mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$measurement_date, format = "%Y-%m-%d"))) %>%
filter(var >= 0 | is.na(var)) %>%
select(-c(var, death_date))
test[["observation"]] <- test[["observation"]] %>%
mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$observation_date, format = "%Y-%m-%d"))) %>%
filter(var >= 0 | is.na(var)) %>%
select(-c(var, death_date))
test[["observation_period"]] <- test[["observation_period"]] %>%
mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$observation_period_start_date, format = "%Y-%m-%d"))) %>%
filter(var >= 0 | is.na(var)) %>%
select(-c(var, death_date))
test[["person"]] <- test[["person"]] %>%
select(-c(death_date))
test[["procedure_occurrence"]] <- test[["procedure_occurrence"]] %>%
mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$procedure_date, format = "%Y-%m-%d"))) %>%
filter(var >= 0 | is.na(var)) %>%
select(-c(var, death_date))
test[["visit_occurrence"]] <- test[["visit_occurrence"]] %>%
mutate(var = as.integer(as.Date(.data$death_date, format = "%Y-%m-%d") - as.Date(.data$visit_end_date, format = "%Y-%m-%d"))) %>%
filter(var >= 0 | is.na(var)) %>%
select(-c(var, death_date))
#Join the dfs
#by = "person_id"
group1 <- c("person", "death", "observation_period", "visit_occurrence")
visit_days_sum <- test[group1] %>%
purrr::reduce(left_join) %>%
select(person_id, visit_days) %>%
group_by(person_id) %>%
summarise(visit_days = n()) %>%
select(visit_days) %>%
unlist %>% unname()
df1 <- test[group1] %>%
purrr::reduce(left_join) %>%
mutate(death_yesno = case_when(is.na(.data$death_date.x) ~ 0, TRUE ~ 1),
death_date = as.Date(.data$death_date.x, format = "%Y-%m-%d"),
visit_end_date = as.Date(.data$visit_end_date, format = "%Y-%m-%d"),
observation_period_start_date = as.Date(.data$observation_period_start_date, format = "%Y-%m-%d")) %>%
mutate(last_visit_to_death = as.integer(death_date - visit_end_date)) %>%
group_by(person_id) %>%
dplyr::slice(which.max(last_visit_to_death), preserve = TRUE) %>%
ungroup %>%
#select(-c(visit_end_date, observation_period_start_date, death_date, visit_days, visit_concept_id, visit_occurrence_id)) %>%
distinct(person_id, .keep_all = TRUE) %>%
mutate(visit_days_sum = visit_days_sum) %>%
as.data.table()
df1 <- df1 %>% mutate(death_yesno = ifelse(last_visit_to_death > 180 | is.na(last_visit_to_death), 0, 1)) %>% as.data.table()
procedure_sum <- test[["procedure_occurrence"]] %>%
group_by(person_id) %>%
summarise(sum_procedures = n()) %>%
data.table
most_freq_procedure <- test[["procedure_occurrence"]] %>%
group_by(person_id) %>%
dplyr::slice(which.max(procedure_concept_id)) %>%
select(person_id, procedure_concept_id) %>%
rename(most_freq_procedure = procedure_concept_id) %>%
data.table
most_recent_procedure <- test[["procedure_occurrence"]] %>%
mutate(procedure_date = as.Date(.data$procedure_date, format = "%Y-%m-%d")) %>%
group_by(person_id) %>%
dplyr::slice(which.max(procedure_date)) %>%
select(person_id, procedure_concept_id) %>%
rename(most_recent_procedure = procedure_concept_id) %>%
data.table
df1 <- left_join(df1,
left_join(procedure_sum, most_freq_procedure, most_recent_procedure, by = 'person_id')) %>%
as.data.table()
drug_exposure_days <- left_join(df1, test[["drug_exposure"]]) %>%
select(person_id, drug_exposure_days) %>%
group_by(person_id) %>%
summarise(drug_exposure_days = sum(drug_exposure_days)) %>%
mutate(drug_exposure_days = case_when(is.na(drug_exposure_days) ~ 0,
TRUE ~ as.numeric(drug_exposure_days))) %>%
as.data.table()
drugs_sum <- test[["drug_exposure"]] %>%
group_by(person_id) %>%
summarise(sum_drugs_used = n()) %>%
data.table
most_freq_drug <- test[["drug_exposure"]] %>%
group_by(person_id) %>%
dplyr::slice(which.max(drug_concept_id)) %>%
select(person_id, drug_concept_id) %>%
rename(most_freq_drug = drug_concept_id) %>%
data.table
df1 <- left_join(df1, left_join(drugs_sum, most_freq_drug, drug_exposure_days, by = "person_id")) %>%
as.data.table()
conditions_sum <- test[["condition_occurrence"]] %>%
group_by(person_id) %>%
summarise(sum_conditions = n()) %>%
data.table
most_freq_condition <- test[["condition_occurrence"]] %>%
group_by(person_id) %>%
dplyr::slice(which.max(condition_concept_id)) %>%
select(person_id, condition_concept_id) %>%
rename(most_freq_condition = condition_concept_id) %>%
data.table
most_recent_condition <- test[["condition_occurrence"]] %>%
mutate(condition_start_date = as.Date(.data$condition_start_date, format = "%Y-%m-%d")) %>%
group_by(person_id) %>%
dplyr::slice(which.max(condition_start_date)) %>%
select(person_id, condition_concept_id) %>%
rename(most_recent_condition = condition_concept_id) %>%
data.table
df1 <- left_join(df1, left_join(conditions_sum, most_freq_condition, most_recent_condition, by = "person_id")) %>%
as.data.table()
measurements_sum <- test[["measurement"]] %>%
group_by(person_id) %>%
summarise(sum_measurements = n()) %>%
data.table
most_freq_measurement <- test[["measurement"]] %>%
group_by(person_id) %>%
dplyr::slice(which.max(measurement_concept_id)) %>%
select(person_id, measurement_concept_id) %>%
rename(most_freq_measurement = measurement_concept_id) %>%
data.table
most_recent_measurement <- test[["measurement"]] %>%
mutate(measurement_date = as.Date(.data$measurement_date, format = "%Y-%m-%d")) %>%
group_by(person_id) %>%
dplyr::slice(which.max(measurement_date)) %>%
select(person_id, measurement_concept_id) %>%
rename(most_recent_measurement = measurement_concept_id) %>%
data.table
df1 <- left_join(df1, left_join(measurements_sum, most_freq_measurement, most_recent_measurement, by = "person_id")) %>%
as.data.table()
observations_sum <- test[["observation"]] %>%
group_by(person_id) %>%
summarise(sum_observations = n()) %>%
data.table
most_freq_observation <- test[["observation"]] %>%
group_by(person_id) %>%
dplyr::slice(which.max(observation_concept_id)) %>%
select(person_id, observation_concept_id) %>%
rename(most_freq_observation = observation_concept_id) %>%
data.table
most_recent_observation <- test[["observation"]] %>%
mutate(observation_date = as.Date(.data$observation_date, format = "%Y-%m-%d")) %>%
group_by(person_id) %>%
dplyr::slice(which.max(observation_date)) %>%
select(person_id, observation_concept_id) %>%
rename(most_recent_observation = observation_concept_id) %>%
data.table
df1 <- left_join(df1, left_join(observations_sum, most_freq_observation, most_recent_observation, by = "person_id")) %>%
as.data.table()
df1 <- df1 %>%
select(-c(visit_end_date, observation_period_start_date, death_date, visit_days, visit_concept_id, visit_occurrence_id, death_date.x, death_date.y)) %>%
as.data.table()
#rm(list=setdiff(ls(), c("test", "df1")))
# df1$death_yesno %>% table
###########
#apply(df1[,-c(grep('sum', names(df1)), "year_of_birth", "death_yesno" )], 2)
df1 <- df1 %>%
mutate_if(!grepl('sum|birth|death_yesno', colnames(df1)), as.character) %>%
mutate_if(grepl('sum', colnames(df1)), ~replace(., is.na(.), 0)) %>%
as.data.table()
df1[, (c("observation_days")) := lapply(.SD, as.integer), .SDcols = c("observation_days")]
return(df1)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.