# File: R/template_dataexploration.R

# Doc header --------------------------------------------------------------

# author: "Jan van den Brand, PhD"
# email: jan.vandenbrand@kuleuven.be
# project: NSN19OK003
# funding: Dutch Kidney Foundation
# Topic: Data exploration

# 0: Preliminaries  ------------------------------------------------------

# Packages attached by this script. dplyr (mutate/across) and survival
# (Surv/survfit) are called further down, so they are listed explicitly
# instead of relying on a sourced file to attach them. They are appended last
# so the attach order of the other packages stays the same.
reqlib <- c(
  "foreign", "lattice", "MASS", "gridExtra", "tidyr", "haven", "mice",
  "reshape2", "visdat", "tableone", "lubridate", "dplyr", "survival"
)
# invisible(): the list returned by lapply() is of no interest; do not print it.
invisible(lapply(reqlib, library, character.only = TRUE))
# NOTE(review): machine-specific absolute path. Every relative path used below
# ("R/", "data/", "plots/") resolves against this directory; consider running
# the script from the project root instead of hard-coding the location.
setwd("C:/Users/jajgv/Documents/repos/highdimjm")
# Project helper functions (presumably the make_*() plotting helpers used
# below -- TODO confirm against R/mmm_functions.R).
source("R/mmm_functions.R")
# set seed for reproducibility (the multiple imputation below is stochastic)
set.seed(20201013)

# 1: edit --------------------------------------------------------

# Data preparation script; d_bas and d_long used below are not created in this
# file and presumably come from here -- TODO confirm.
source("R/edit.R")

# get_label(), get_class() and get_summary() are not defined in this file;
# presumably provided by this script -- TODO confirm.
source("R/create_codebook.R")

## Codebook for baseline data
# Iterate over the columns of d_bas directly: lapply() on a data frame visits
# each column and keeps the column names, so no 1:ncol() subsetting is needed.
varlabels <- lapply(d_bas, get_label)
classlabel <- lapply(d_bas, get_class)
varsum <- lapply(d_bas, get_summary)

# One row per baseline variable. paste() turns every per-variable summary
# into a single character string.
codebook <- data.frame(
  varname = names(d_bas),
  varlabel = unlist(varlabels),
  class = unlist(classlabel),
  summary = paste(varsum)
)
# Tab-separated export of the codebook.
write.table(
  codebook,
  file = "data/codebook_baseline.txt",
  sep = "\t",
  row.names = FALSE
)

# 2: Visualization ----
# Every result produced between pdf() and dev.off() is print()ed explicitly:
# top-level auto-printing is skipped when this script is run via source(), and
# plot objects that are never printed are never drawn on the device.

# Bar charts for categorical variables
factors <- names(classlabel)[classlabel == "factor"]
pdf(file = "plots/visualization_barcharts.pdf")
print(lapply(factors, make_bar_plot, data = d_bas))
dev.off()

# create histograms for continuous variables
markers <- names(classlabel)[classlabel == "numeric"]
pdf(file = "plots/visualization_histograms.pdf")
print(lapply(markers, make_histogram, data = d_bas))
dev.off()

# Spaghetti plots
# `markers` is deliberately overwritten with the longitudinal variables to plot.
markers <- c("gfr", "nf_protu", "nf_procr")
pdf(file = "plots/visualization_spaghetti.pdf")
print(
  lapply(
    markers, make_spaghetti_plots,
    data = d_long, ftime = "time", id = "transnr", breaks = 12
  )
)
dev.off()

# Missing data analysis -----------------
# vis_miss() returns a ggplot object and plot()/stripplot() on a mids object
# return lattice objects; they are only drawn when printed. Explicit print()
# keeps the PDFs from ending up empty when this script is run via source().
pdf(file = "plots/template_baseline_missingdata.pdf")
print(vis_miss(d_bas, cluster = TRUE))
dev.off()

# Impute baseline missing data. Use a single imputation for testing and MI for model implementation
# Dry run (maxit = 0): only sets up the default methods and predictor matrix.
init_mice <- mice(
  d_bas,
  maxit = 0,
  predictorMatrix = quickpred(d_bas, mincor = 0.2),
  printFlag = FALSE
)
predmat <- init_mice$predictorMatrix
# Exclude predictors: zeroing a column stops that variable from being used to
# impute any other variable.
predmat[, c("transnr", "eadnr", "txdate", "date")] <- 0
imp_method <- init_mice$method
# 20 imputed data sets, 10 iterations each.
imp_bas <- mice(
  d_bas,
  method = imp_method,
  predictorMatrix = predmat,
  maxit = 10,
  m = 20
)

# diagnostics
pdf(file = "plots/template_baseline_imputation_dx.pdf")
print(plot(imp_bas))
print(stripplot(imp_bas))
dev.off()

# Missingness pattern of the follow-up data
pdf(file = "plots/template_missingfudata.pdf")
print(vis_miss(d_long, warn_large_data = FALSE))
dev.off()

# 3: Inspect baseline data -----------------------

# Variables reported in the baseline table, in display order.
tablevars <- c(
  # recipient
  "rec_age", "rec_sex_m1", "rec_race", "rec_bmi_d0", "primary_kd",
  # donor
  "donor_age", "donor_sex_m1", "donor_type",
  # transplantation
  "txyear", "abdr_antigen_mismatches", "abdrdq_antigen_mismatches",
  "pretx_hla_abs", "overall_pretx_dsa", "induction",
  "cit", "anastomosis_time_minutes", "current_dsa",
  # remaining single-letter / abbreviated score variables
  "t", "i", "g", "ah", "v", "cg", "ci", "ct", "cv", "mm", "ptcitis",
  "trombi", "gs", "c4d_ptc", "c4d_gc", "c3d_ptc", "c3d_gc",
  # outcome
  "stime", "event"
)

# Subset of the table variables that is treated as categorical.
factorvars <- c(
  "rec_sex_m1", "rec_race", "primary_kd", "donor_type", "donor_sex_m1",
  "abdr_antigen_mismatches", "pretx_hla_abs", "overall_pretx_dsa",
  "induction",
  "t", "i", "g", "ah", "v", "cg", "ci", "ct", "cv", "mm", "ptcitis",
  "trombi", "c4d_ptc", "c4d_gc", "c3d_ptc", "c3d_gc"
)

# Convert every categorical variable to a factor in one go.
d_bas <- mutate(d_bas, across(all_of(factorvars), as.factor))

# Continuous variables that are reported as non-normal in the table.
skewvars <- c(
  "rec_age", "donor_age", "rec_bmi_d0", "anastomosis_time_minutes", "gs"
)

# Quick visual check: categorical variables by event, then the skewed ones.
lapply(factorvars, make_bar_plot, data = d_bas, by = "event")
lapply(skewvars, make_histogram, data = d_bas)

# Baseline characteristics stratified by event.
baseline_table <- CreateTableOne(
  vars = tablevars,
  strata = "event",
  data = d_bas,
  factorVars = factorvars
)
# printToggle = FALSE: format the table without printing it and keep the
# formatted result instead of the TableOne object.
baseline_table <- print(
  baseline_table,
  nonnormal = skewvars,
  formatOptions = list(big.mark = ","),
  quote = FALSE,
  noSpaces = TRUE,
  printToggle = FALSE
)
baseline_table

# 4: Inspect Follow-up data -----
# Same factor conversion as for the baseline data, applied to the long data.
d_long <- d_long %>% mutate(
  across(all_of(factorvars), as.factor)
)
# Explicit print(): top-level auto-printing is skipped when this script is run
# via source(), and plot objects that are never printed are never drawn on
# the PDF device.
pdf("plots/template_followup.pdf")
print(lapply(factorvars, make_bar_plot, data = d_long, by = "event"))
dev.off()

# Overall (unstratified) survival fit on the baseline data.
# stime / 365.25: presumably converts follow-up from days to years -- TODO confirm.
# as.integer(event) > 1: TRUE for integer codes above 1; presumably `event` is
# a factor whose second level marks the event -- TODO confirm (for a numeric
# 0/1 coding this would never be TRUE).
survfit(Surv(d_bas$stime / 365.25, as.integer(d_bas$event) > 1) ~ 1)
# JanvandenBrand/highdimjm documentation built on Dec. 18, 2021, 12:32 a.m.