data-raw/generate_data_for_package.R

# generate datasets to bundle with blantyreSepsis package for
# publication


library(plyr)
library(reshape2)
library(tidyverse)
library(here)
library(usethis)

thesis_folder <- "~/Documents/PhD/Thesis/bookdown/final_cleaning_scripts/"


# load data -----------------------------------------------------------------

# This main cleaning script is available at
# https://github.com/joelewis101/thesis/tree/
# post_phd_analysis/final_cleaning_scripts
# basically sources a bunch of subsidiary scripts to load
# all the different datasets

source(paste0(thesis_folder,
              "load_PhD_data.R")
)


# serology data - loads a function ------------------------------------------
source(here("data-raw/clean_serology_data.R"))

# load data
df.sera <-
  load_and_clean_serology_csvs(
    all_sera_path =
      "/Users/joelewis/Documents/PhD/serology/all_serology.csv",
    acute_sera_path =
      "/Users/joelewis/Documents/PhD/serology/acute_serology.csv" )

# tidy up, restrict to arm 1 -------------------------------------------------

enroll %>%
  filter(arm == 1) %>%
  left_join(bloods, by = "pid") %>%
  mutate(
    datefirstunwell = case_when(
      unwelfreq == "Days" ~ datefirstunwell *1,
      unwelfreq == "Weeks" ~ datefirstunwell * 7,
      unwelfreq == "Months" ~ datefirstunwell * 30,
    ),
    art.time = as.numeric((data_date - hivartstart) / (365.25/12)),
    hivart = recode(hivart,"5A" = "5A",
                    .default = "Other")
  ) -> e1

# get the aetiology data in the shape we want --------------------------

df.sera <- gimme_array_card_results_one_hot_coding(df.sera)

# wide
# this uses the following to decide re dengue and chik
# if both IgM are pos and only one IgG - pick the one with IgG
# if boith IgG are pos, pick the one with the highest IgG
# ricketsiosses are defined by IgG ≥ 1:512


df.sera %>%
  mutate(
    dengue =
      case_when(
        day_0_den_IgM  == "pos" & day_0_chik_IgM == "neg" ~ 1,
        day_0_den_IgM  == "pos" &
          day_0_chik_IgM == "pos" &
          day_0_den_IgG  == "pos" &
          day_0_chik_IgG == "neg" ~ 1,
        day_0_den_IgM  == "pos" &
          day_0_chik_IgM == "pos" &
          (
            (day_0_den_IgG  == "pos" & day_0_chik_IgG == "pos") |
              day_0_den_IgG  == "neg" &
              day_0_chik_IgG == "neg"
          ) & day_0_den_IgM_OD > day_0_chik_IgM_OD ~ 1,
        day_28_den_IgM  == "pos" &
          day_28_chik_IgM == "neg" ~ 1,
        day_28_den_IgM  == "pos" &
          day_28_chik_IgM == "pos" &
          day_28_den_IgG  == "pos" &
          day_28_chik_IgG == "neg" ~ 1,
        day_28_den_IgM  == "pos" &
          day_28_chik_IgM == "pos" &
          (
            (day_28_den_IgG  == "pos" & day_28_chik_IgG == "pos") |
              day_28_den_IgG  == "neg" &
              day_28_chik_IgG == "neg"
          ) &
          day_28_den_IgM_OD > day_28_chik_IgM_OD ~ 1,
        is.na(day_28_den_IgM) &
          is.na(day_0_den_IgM) ~ NA_real_,
        TRUE ~ 0
      ),
    chik =
      case_when(
        day_0_chik_IgM  == "pos" & day_0_den_IgM == "neg" ~ 1,
        day_0_chik_IgM  == "pos" &
          day_0_den_IgM == "pos" &
          day_0_chik_IgG  == "pos" &
          day_0_den_IgG == "neg" ~ 1,
        day_0_chik_IgM  == "pos" &
          day_0_den_IgM == "pos" &
          (
            (day_0_chik_IgG  == "pos" & day_0_den_IgG == "pos") |
              day_0_chik_IgG  == "neg" &
              day_0_den_IgG == "neg"
          ) & day_0_chik_IgM_OD > day_0_den_IgM_OD ~ 1,
        day_28_chik_IgM  == "pos" &
          day_28_den_IgM == "neg" ~ 1,
        day_28_chik_IgM  == "pos" &
          day_28_den_IgM == "pos" &
          day_28_chik_IgG  == "pos" &
          day_28_den_IgG == "neg" ~ 1,
        day_28_chik_IgM  == "pos" &
          day_28_den_IgM == "pos" &
          (
            (day_28_chik_IgG  == "pos" & day_28_den_IgG == "pos") |
              day_28_chik_IgG  == "neg" &
              day_28_den_IgG == "neg"
          ) &
          day_28_chik_IgM_OD > day_28_den_IgM_OD ~ 1,
        is.na(day_28_chik_IgM) &
          is.na(day_0_chik_IgM) ~ NA_real_,
        TRUE ~ 0
      ),
    bact_target_pcr = case_when(
      array_card_klebsiella.pneumoniae ~ 1,
      array_card_enterobacter.cloacae ~ 1,
      array_card_enterobacteria ~ 1,
      array_card_pan.strep ~ 1,
      array_card_strep.pneumo ~ 1,
      array_card_pseudomonas.aeruginosa ~ 1,
      array_card_salmonella.hilA ~ 1,
      !is.na(array_card_klebsiella.pneumoniae) ~ 0,
      TRUE ~ NA_real_
    ),
    lepto = case_when(
      day_0_lepto_IgM == "pos" ~ 1,
      day_28_lepto_IgM == "pos" ~ 1,
      is.na(day_0_lepto_IgM) &
        is.na(day_28_lepto_IgM) ~ NA_real_,
      TRUE ~ 0
    ),
    SF = case_when(
      day_28_SF_IgG_dilution %in% c("1:512", "1:1024") ~ 1,
      is.na(day_28_SF_IgG_dilution)  ~ NA_real_,
      TRUE ~ 0
    ),
    ET = case_when(
      day_28_ET_IgG_dilution %in% c("1:512", "1:1024") ~ 1,
      is.na(day_28_ET_IgG_dilution)  ~ NA_real_,
      TRUE ~ 0
    ),
    borrelia = as.numeric(array_card_pan.borrelia),
    rift_valley_fever = as.numeric(array_card_RVF)
  ) %>%
  select(
    pid.corrected,
    dengue,
    chik,
    bact_target_pcr,
    lepto,
    SF,
    ET,
    borrelia,
    rift_valley_fever
  ) -> aetiol.sera

# join to other aetiol data wide - make one big df ----------------------------


left_join((
  aetiol %>%
    pivot_wider(
      id_cols = c(pid, hivstatus),
      names_from = type,
      values_from = pathogen
    )),
  aetiol.sera,
  by = c("pid" = "pid.corrected")) %>%
  mutate(
    tb = case_when(
      `mtb bsi` ==  1 ~ 1,
      Xpert == 1 ~ 1,
      uLAM == 1 ~ 1,
      is.na(`mtb bsi`) & is.na(Xpert) & is.na(uLAM) ~ NA_real_,
      TRUE ~ 0
    ),
    bsi = case_when(
      bc == 1 ~ 1,
      bact_target_pcr == 1 ~ 1,
      is.na(bc) & is.na(bact_target_pcr) ~ NA_real_,
      TRUE ~ 0
    )
  ) -> aetiol.all


left_join(aetiol.all,
          select(bc.full, pid,crne) %>% unique(),
          by = "pid") %>%
  left_join( select(csf.full, pid, CrAg_LFA) %>% unique(),by = "pid") %>%
  mutate(bsi.bacterial = case_when(bsi == 1 & crne == 0 ~ 1,
                                   bsi == 1 & crne == 1 ~ 0,
                                   bsi == 0 ~ 0,
                                   TRUE ~ NA_real_),
         bsi.fungal = case_when(bsi == 1 & crne == 0 ~ 0,
                                bsi == 1 & crne == 1 ~ 1,
                                bsi == 0 ~ 0,
                                TRUE ~ NA_real_),
         csf.fungal = case_when(csf == 1 ~ 1,
                                csf == 0 ~ 0,
                                TRUE ~ NA_real_),
         csf.bacterial = case_when(csf == 0 ~ 0,
                                   csf == 1 ~ 0,
                                   TRUE ~ NA_real_),
         inv.bacterial = case_when(bsi.bacterial == 1 ~ 1,
                                   csf.bacterial == 1 ~ 1,
                                   bsi.bacterial == 0 ~ 0,
                                   csf.bacterial == 0 ~ 0),
         inv.fungal = case_when(bsi.fungal== 1 ~ 1,
                                csf.fungal == 1 ~ 1,
                                bsi.fungal == 0 ~ 0,
                                csf.fungal == 0 ~ 0),
         arbovirus = case_when(chik == 1 ~ 1,
                               dengue == 1 ~ 1,
                               chik == 0 ~ 0,
                               dengue == 0 ~ 0,
                               TRUE ~ NA_real_),
         ricks = case_when(ET == 1 ~ 1,
                           SF == 1 ~ 1,
                           ET == 0 ~ 0,
                           SF == 0 ~ 0,
                           TRUE ~ NA_real_),
         CrAg_LFA = case_when(CrAg_LFA == "POSITIVE" ~ 1,
                              CrAg_LFA == "NEGATIVE" ~ 0,
                              TRUE ~ NA_real_)
  ) -> aetiol.all

# and make the df for modelling




left_join(
  left_join(
    e1 %>%
      mutate(gcs = t0gcse + t0gcsv + t0gcsm) %>%
      select(
        pid,
        ustand,
        calc_age,
        ptsex,
        hivstatus,
        CD4_Absolute,
        Haemoglobin,
        screentemp,
        t0sbp,
        t0dbp,
        t0hr,
        t0rr,
        t0spo2,
        gcs,
        ustand,
        lactate_value,
        WCC,
        Platelets,
        CO2,
        Urea,
        Creatinine,
        NA.
      ),
    aetiol.all %>%
      select(
        pid,
        malaria,
        dengue,
        chik,
        arbovirus,
        bc,
        bact_target_pcr,
        inv.bacterial,
        inv.fungal,
        tb
      ),
    by = "pid"
  ),
  visits %>%
    select(pid, d28_death),
  by = "pid"
) -> df.mod

# If you wanna rerun mort models and impute missing data, run this script ----
# Takes a while! --

#source("mortmodels_final.R")

# make v1 df for KMs

v1 <- filter(visits, arm == 1)
v1$died <- as.numeric(v1$died)
v1 <- left_join(v1, select(e1, pid, hivstatus))

e1 %>%
  mutate(
    gcs = t0gcse + t0gcsv + t0gcsm,
    hivstatus = if_else(hivstatus == "Unknown", NA_character_, hivstatus)
  ) %>%
  select(pid,
    calc_age,ptsex,
    hivstatus,CD4_Absolute, hivonart,  art.time, hivcpt,
    tbstatus, tbongoing,
    screentemp,t0hr,t0rr,t0sbp, t0dbp, t0spo2, gcs, ustand,
    datefirstunwell,
    Haemoglobin,Platelets, WCC, NA.,Potassium, CO2, Urea, Creatinine, lactate_value
  ) %>%
  left_join(select(v1, pid, d28_death, d90_death, d180_death, t, died)) %>%
  rename_with( ~tolower(gsub("\\.", "_", .x))) %>%
  rename("sodium" = "na_",
         "days_unwell" = "datefirstunwell",
         "ever_tb" = "tbstatus",
         "lactate" = "lactate_value"
         ) -> dat

BTparticipants <- dat

use_data(BTparticipants, overwrite = TRUE)

### treatments -------------------------------------------------------

df.abs %>%
  select(pid,time_to_abx, first_ab) %>%
  mutate(
    first_ab = case_when(
      first_ab == "cipro" ~ "Ciprofloxacin",
      first_ab == "AUGMENTIN" ~ "Co-amoxiclav",
      TRUE ~ first_ab
    ),
    first_ab = str_to_title(first_ab)
  ) %>%
  rename("which_ab" = "first_ab",
         "timeto_ab" = "time_to_abx") %>%
  left_join(
    select(df.fung, pid,time_to_fung_rx, first_ab) %>%
      mutate(
        first_ab = case_when(
          first_ab == "fluco" ~ "Fluconazole",
          TRUE ~ first_ab
        ),
        first_ab = str_to_title(first_ab)
      ) %>%
      rename("which_antifungal" = "first_ab",
             "timeto_antifungal" = "time_to_fung_rx"),
    by = "pid"
  ) %>%
  left_join(
    select(df.mal, pid, time_to_mal_rx, first_ab) %>%
      mutate(
        first_ab = case_when(
          first_ab == "LA" ~ "Lumefantrine-artemether",
          TRUE ~ first_ab
        ),
        first_ab = str_to_title(first_ab)
        ) %>%
      rename("which_antimalarial" = "first_ab",
             "timeto_antimalarial" = "time_to_mal_rx"),
    by = "pid"
  ) %>%
  left_join(
    df.tb %>%
      mutate(first_ab = case_when(
        any_tb_rx == "yes" ~ "RHZE",
        TRUE ~ NA_character_
      )) %>%
      filter(!is.na(first_ab)) %>%
      select(pid, first_ab, time_to_tb_rx) %>%
      rename("which_antitb" = "first_ab",
             "timeto_antitb" = "time_to_tb_rx"),
    by = "pid"
  ) %>%
  left_join(
    select(fluid, pid, `6`) %>%
      rename("iv_fluid_6hr" = `6`),
    by = "pid"
  ) %>%
  mutate(across(contains("timeto"), as.numeric)) %>%
  mutate(which_ab = if_else(which_ab == "None", NA_character_, which_ab)) ->
  BTtreatment

use_data(BTtreatment, overwrite = TRUE)

# general aetiology -------------------------------------------------------

aetiol.all %>%
  select(-c(csf, crne, bc, bsi)) ->
  BTaetiology

use_data(BTaetiology, overwrite = TRUE)

# bloodculture --------------------------------------------------------------

bc.full %>%
  select(-c(anycontam, n_contam, n_orgs, contam_only)) %>%
  mutate(pathogen = as.numeric(pathogen),
         bcult = as.numeric(bcult)) %>%
  rename("bcult_anygrowth" = "bcult") ->
  BTbc

use_data(BTbc, overwrite = TRUE)

# sera ---------------------------------------------------------------------

df.sera %>%
  select(!contains("array_card") & !contains("day ") & !contains("Array ")) %>%
  rename("pid" = "pid.corrected") ->
  BTsera

use_data(BTsera, overwrite = TRUE)

# array card

df.sera %>%
  select(contains("pid") | contains("array_")) %>%
  rename("pid" = "pid.corrected") ->
  BTarraycard

use_data(BTarraycard, overwrite = TRUE)

#
joelewis101/blantyreSepsis documentation built on Aug. 30, 2021, 11:16 p.m.