In cas-bioinf/covid19retrospective: What the Package Does (One Line, Title Case)

devtools::load_all()
library(tidyverse)

raw_data_dir <- "H:/raw/"
if(!dir.exists(raw_data_dir)) {
  stop("Raw data dir not found")
}

mapping_file <- paste0(raw_data_dir,"mapping.xlsx")

raw_data_map <- readxl::read_excel(mapping_file, sheet = "HospitalMapping") %>% filter(!is.na(File))

Notes to resolve: - Missing "Covid medication before hospitalization" in "Old" version.

hospital_data <- list()
for(i in 1:nrow(raw_data_map)) {
  file_relative <- raw_data_map$File[i]
  hospital_id <- raw_data_map$HospitalID[i]
  lang <- raw_data_map$Lang[i]
  file_version <- raw_data_map$Version[i]

  file <- paste0(raw_data_dir, file_relative)

  cat("Processing ", i, "-", hospital_id, "\n")

  hospital_data[[hospital_id]] <- read_raw_data(file, hospital_id, lang, file_version)
}

complete_data <- join_hospital_data(hospital_data)
# TODO check:last day vs. breathing data
# MAx. day vs. read columns
differing_units <- complete_data$marker_data %>% group_by(marker) %>% filter(length(unique(unit)) > 1) %>% 
  group_by(marker, unit) %>% summarise(count = n(), hospitals = paste0(unique(hospital_id), collapse = ","), .groups = "drop")

if(nrow(differing_units) > 0) {
  print(differing_units)
  print(complete_data$marker_data %>% group_by(marker) %>% filter(length(unique(unit)) > 1, !is.na(value)) %>% 
  ggplot(aes(x = value)) + geom_histogram() + facet_wrap(marker ~ unit, scales = "free")
  )

  stop("Differing units")
}

Checking unit conversions

Check values for all markers with converted units (Note the 0 for Amoxiclav has been checked and is in the original data as well)

for(conversion in unit_conversions) {
  for(marker_to_check in conversion$markers) {
    cat(" ==== ", marker_to_check, "=====\n")
    complete_data$marker_data %>% filter(marker == marker_to_check, !is.na(value)) %>% 
      pull(value) %>% unique() %>% print()
    print(
      complete_data$marker_data %>% filter(marker == marker_to_check, !is.na(value)) %>% 
        ggplot(aes(x = value)) + geom_histogram() + facet_wrap(marker ~ unit, scales = "free")
    )
  }
}

Some more checks

unknown_breathing

max_days <- complete_data$breathing_data %>% group_by(hospital_id, patient_id) %>%
  summarise(max_day = max(day))

max_max_day = max(max_days$max_day)

crossing(max_days, day = 0:max_max_day) %>% filter(day <= max_day, day >= 0) %>%
  anti_join(complete_data$breathing_data, by = c("hospital_id", "patient_id", "day"))

complete_data$breathing_data %>% filter(is.na(breathing))

Markers outside breathing

Merging patients/wards

patients_to_merge <- readxl::read_excel(mapping_file, sheet = "PatientsToMerge", 
                                        col_types = c("text", "text", "text", "text", "text", "numeric", "text"))
hospitals_to_merge <- readxl::read_excel(mapping_file, sheet = "HospitalsToMerge")

Check for candidates to merge

for(i in 1:(length(hospital_data) - 1)) {
  if(names(hospital_data)[i] %in% c("cTdij", "iVMlA","bGzOO", "tMdnA", "nDINR", "vQNDS")) {
    next # Checked manually during date entry
  }
  for(j in (i + 1):length(hospital_data)) {
    #cat(i,",", j,"\n")
    if(names(hospital_data)[j] %in% c("cTdij", "iVMlA","bGzOO", "tMdnA", "nDINR","vQNDS")) {
      next # Checked manually during date entry
    }

    candidate_overlap <-  check_patient_overlap(hospital_data[[i]], hospital_data[[j]], 
                                                patients_to_merge, hospitals_to_merge) %>% 
      select(hospital_id1, patient_id1, hospital_id2, patient_id2, 
             symptom_onset1, symptom_onset2, outcome1, outcome2, 
             BMI1, BMI2, last_record1, last_record2,
             transferred_from1, transferred_to2, transferred_from2, transferred_to1) 
    if(nrow(candidate_overlap) > 0) {
      # Workaround, issue https://github.com/rstudio/rstudio/issues/4439
      for(n in names(candidate_overlap)) {
        names(candidate_overlap[[n]]) <- NULL
      }
      print(candidate_overlap)
    }
  }
}
#  mutate(d = as.numeric(symptom_onset2 - symptom_onset1))

complete_data_merged_1 <- merge_patients(complete_data, patients_to_merge)

complete_data_merged <- merge_hospitals(complete_data_merged_1, hospitals_to_merge)

complete_data_merged$patient_data %>% filter(hospital_id == "Oriik")

complete_data <- complete_data_merged

Final processing steps

no_data_pts <- 
  complete_data$breathing_data %>% group_by(hospital_id, patient_id) %>%
  summarise(count = n(), .groups = "drop") %>%
  filter(count < 1)

if(nrow(no_data_pts) > 0) {
  print(no_data_pts)
  stop("Some patients have no data")
}

Patient counts

complete_data$patient_data %>% group_by(hospital_id) %>%
  summarise(n())

Data collection periods

data_collection_period <- complete_data$patient_data %>% group_by(hospital_id) %>%
  summarise(first_admission = min(admission), last_admission = max(admission), .groups = "drop") %>%
  arrange(first_admission) %>%
  mutate(hospital_id_anon = 1:n()) %>% 
  select(-hospital_id)

write_csv(data_collection_period, here::here("manuscript", "data_collection_period.csv"))

complete_data$patient_data <- complete_data$patient_data  %>% mutate(first_wave = admission < lubridate::ymd("2020-09-01"))

anonymized_data <- anonymize_for_analysis(complete_data)

for(n in names(anonymized_data) ) {
  write_csv(anonymized_data[[n]], path = here::here("private_data", paste0(n,".csv")))
}
write_lines(digest::digest(anonymized_data), path = here::here("private_data", "data_revision.txt"))