devtools::load_all() library(tidyverse)
raw_data_dir <- "H:/raw/" if(!dir.exists(raw_data_dir)) { stop("Raw data dir not found") } mapping_file <- paste0(raw_data_dir,"mapping.xlsx") raw_data_map <- readxl::read_excel(mapping_file, sheet = "HospitalMapping") %>% filter(!is.na(File))
Notes to resolve: - Missing "Covid medication before hospitalization" in "Old" version.
hospital_data <- list() for(i in 1:nrow(raw_data_map)) { file_relative <- raw_data_map$File[i] hospital_id <- raw_data_map$HospitalID[i] lang <- raw_data_map$Lang[i] file_version <- raw_data_map$Version[i] file <- paste0(raw_data_dir, file_relative) cat("Processing ", i, "-", hospital_id, "\n") hospital_data[[hospital_id]] <- read_raw_data(file, hospital_id, lang, file_version) }
complete_data <- join_hospital_data(hospital_data) # TODO check:last day vs. breathing data # MAx. day vs. read columns differing_units <- complete_data$marker_data %>% group_by(marker) %>% filter(length(unique(unit)) > 1) %>% group_by(marker, unit) %>% summarise(count = n(), hospitals = paste0(unique(hospital_id), collapse = ","), .groups = "drop") if(nrow(differing_units) > 0) { print(differing_units) print(complete_data$marker_data %>% group_by(marker) %>% filter(length(unique(unit)) > 1, !is.na(value)) %>% ggplot(aes(x = value)) + geom_histogram() + facet_wrap(marker ~ unit, scales = "free") ) stop("Differing units") }
Check values for all markers with converted units (Note the 0 for Amoxiclav has been checked and is in the original data as well)
for(conversion in unit_conversions) { for(marker_to_check in conversion$markers) { cat(" ==== ", marker_to_check, "=====\n") complete_data$marker_data %>% filter(marker == marker_to_check, !is.na(value)) %>% pull(value) %>% unique() %>% print() print( complete_data$marker_data %>% filter(marker == marker_to_check, !is.na(value)) %>% ggplot(aes(x = value)) + geom_histogram() + facet_wrap(marker ~ unit, scales = "free") ) } }
unknown_breathing
max_days <- complete_data$breathing_data %>% group_by(hospital_id, patient_id) %>% summarise(max_day = max(day)) max_max_day = max(max_days$max_day) crossing(max_days, day = 0:max_max_day) %>% filter(day <= max_day, day >= 0) %>% anti_join(complete_data$breathing_data, by = c("hospital_id", "patient_id", "day")) complete_data$breathing_data %>% filter(is.na(breathing))
Markers outside breathing
patients_to_merge <- readxl::read_excel(mapping_file, sheet = "PatientsToMerge", col_types = c("text", "text", "text", "text", "text", "numeric", "text")) hospitals_to_merge <- readxl::read_excel(mapping_file, sheet = "HospitalsToMerge")
Check for candidates to merge
for(i in 1:(length(hospital_data) - 1)) { if(names(hospital_data)[i] %in% c("cTdij", "iVMlA","bGzOO", "tMdnA", "nDINR", "vQNDS")) { next # Checked manually during date entry } for(j in (i + 1):length(hospital_data)) { #cat(i,",", j,"\n") if(names(hospital_data)[j] %in% c("cTdij", "iVMlA","bGzOO", "tMdnA", "nDINR","vQNDS")) { next # Checked manually during date entry } candidate_overlap <- check_patient_overlap(hospital_data[[i]], hospital_data[[j]], patients_to_merge, hospitals_to_merge) %>% select(hospital_id1, patient_id1, hospital_id2, patient_id2, symptom_onset1, symptom_onset2, outcome1, outcome2, BMI1, BMI2, last_record1, last_record2, transferred_from1, transferred_to2, transferred_from2, transferred_to1) if(nrow(candidate_overlap) > 0) { # Workaround, issue https://github.com/rstudio/rstudio/issues/4439 for(n in names(candidate_overlap)) { names(candidate_overlap[[n]]) <- NULL } print(candidate_overlap) } } } # mutate(d = as.numeric(symptom_onset2 - symptom_onset1))
complete_data_merged_1 <- merge_patients(complete_data, patients_to_merge)
complete_data_merged <- merge_hospitals(complete_data_merged_1, hospitals_to_merge) complete_data_merged$patient_data %>% filter(hospital_id == "Oriik")
complete_data <- complete_data_merged
no_data_pts <- complete_data$breathing_data %>% group_by(hospital_id, patient_id) %>% summarise(count = n(), .groups = "drop") %>% filter(count < 1) if(nrow(no_data_pts) > 0) { print(no_data_pts) stop("Some patients have no data") }
Patient counts
complete_data$patient_data %>% group_by(hospital_id) %>% summarise(n())
Data collection periods
data_collection_period <- complete_data$patient_data %>% group_by(hospital_id) %>% summarise(first_admission = min(admission), last_admission = max(admission), .groups = "drop") %>% arrange(first_admission) %>% mutate(hospital_id_anon = 1:n()) %>% select(-hospital_id) write_csv(data_collection_period, here::here("manuscript", "data_collection_period.csv"))
complete_data$patient_data <- complete_data$patient_data %>% mutate(first_wave = admission < lubridate::ymd("2020-09-01"))
anonymized_data <- anonymize_for_analysis(complete_data)
for(n in names(anonymized_data) ) { write_csv(anonymized_data[[n]], path = here::here("private_data", paste0(n,".csv"))) } write_lines(digest::digest(anonymized_data), path = here::here("private_data", "data_revision.txt"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.