###################################################
# s04_eda_cvariates.Rmd
# Description: Exploratory data analysis of covariates for population PK analysis
# Dependencies: s01_dataset_preparation.R / s01.RData
###################################################

# Settings to knit in top directory:
# Everything after this chunk works with paths relative top level
library(rprojroot)
knitr::opts_knit$set(root.dir=find_root(has_file("OpenProject.Rproj"))) 
knitr::opts_chunk$set(echo=F)

# Note: R markdown opens a new R session, your global environment is not available.

This script uses the data.frame "data", loaded from s01.RData. That is, all rows with C=="C" has been excluded for exploratory data analysis.

# -----------------------------------------------
# Prepare environment
# -----------------------------------------------
source(file = file.path("./Scripts","Setup","setup01_rEnvironment.R"))
load(file = file.path("./Scripts","s01.RData"))

Are the plots and tables also being written to file?

params$print_results

Baseline covariates

Numeric summaries

# 1. This needs to be thought through again: how to handle this in variable file instead? 
# 2. table one does not include missings, is that needed? 
covMapping <- 
  data.frame(original = c(columns[["base_cont_cov"]], columns[["base_cat_cov"]]),
             new = c("Age (yrs)", "Creatinine clearance (mL/min)",
                     "Body weight (kg)", "Sex", "Race"),
             stringsAsFactors = F)

# replace with new names
columns[["base_cont_cov"]] <- c("Age (yrs)", "Creatinine clearance (mL/min)","Body weight (kg)")
columns[["base_cat_cov"]] <- c( "Sex", "Race")

# rename to more presentable column names
baseline_data <- baseline_data %>% rename_cols_mapping(., covMapping)

# -----------------------------------------------
# Overall summary
# -----------------------------------------------

# Create table 1 object
covTabTot <- CreateTableOne(data = baseline_data, 
                         vars = c(columns[["base_cont_cov"]], columns[["base_cat_cov"]]), 
                         factorVars = columns[["base_cat_cov"]]) 

# Covert to data frame
covTabTot <- print(covTabTot, contDigits = 1, quote = FALSE, noSpaces = TRUE, 
                printToggle = FALSE, showAllLevels = TRUE, test=F)
covTabTot <- cbind(Variable=rownames(covTabTot), covTabTot) # not to lose Variable in next step

covTabTot <- as_data_frame(covTabTot)

# -----------------------------------------------
# Stratified (by study or dose or whatever seems suitable)
# -----------------------------------------------
# Create table 1 object
covTabStrat <- CreateTableOne(data = baseline_data, 
                              vars = c(columns[["base_cont_cov"]], columns[["base_cat_cov"]]), 
                              factorVars = c("Sex", "Race"), 
                              strata = "STUDYID") 

# Covert to data frame
covTabStrat  <- print(covTabStrat , contDigits = 1, quote = FALSE, noSpaces = TRUE, 
                printToggle = FALSE, showAllLevels = TRUE, test=F)
covTabStrat  <- cbind(Variable=rownames(covTabStrat ), covTabStrat) # not to lose Variable in next step

covTabStrat  <- as_data_frame(covTabStrat) %>% 
  rename(`Study 1` = `1`,
         `Study 2` = `2`)

# -----------------------------------------------
# Join to one tab
# -----------------------------------------------
covTab <- full_join(covTabTot, covTabStrat) %>% 
  rename(Level = level) %>% 
  select(Variable, Level, `Study 1`,`Study 2`, Overall)

kable(covTab)

# continuous_covariates <- 
#   baseline_data %>% 
#   summarize_at(.vars = columns[["base_cont_cov"]], 
#                .funs = c("mean", "sd", "min", "max"), na.rm=T) %>% 
#   # restructure
#   gather(key=Variable) %>% 
#   mutate(Characteristic = str_split(Variable, "_", simplify = T)[,1], 
#          measure = str_split(Variable, "_", simplify = T)[,2]) %>% 
#   select(-Variable) %>% 
#   spread(key=measure, value=value) 
# --------- Save to file
if(params$print_results){
  write.csv(covTab, row.names = F, 
            file = file.path(directories[["res_eda_dir"]], 
                             paste0("covariate_tab_", delivery_date,".csv")))
}

Plots of distributions and correlations

Continuous covariates

The diagonal graphs show histograms of each covariate. The lower off-diagonal graphs are scatter plots of observations (black open circles) and LOESS smooth (black line) and its 95% confidence interval (grey shaded area). The uppoer off-diagonal graphs show the Pearson’s correlation coefficient.

# You may need to set up different lists if you have many covariates
# or lumping of groups based on too few individuals etc.
cont_vs_cont <-
  ggpairs(baseline_data, columns = columns[["base_cont_cov"]], 
          upper = list(continuous = wrap("cor", color = "black")),
          diag= list(continuous = "barDiag"), 
          lower = list(continuous = "smooth"))

print(cont_vs_cont)

if(params$print_results){
  ggsave(file=file.path(directories[["res_eda_dir"]], 
                    paste0("continuous_covariates_", delivery_date,".pdf")), 
         plot=cont_vs_cont, 
         height=8, width=8, units = 'in', 
         device=cairo_pdf)
}

Categorical covariates

The diagonal graphs show bar charts of each covariate while the the off-diagonal graphs bar charts split by covariate. NA refers to not available, i.e., missing.

cat_vs_cat <-
  ggpairs(baseline_data, columns = columns[["base_cat_cov"]], 
          upper = list(discrete = "blank")) + 
  theme(axis.text.x = element_text(angle=40, hjust=1),
        panel.grid.minor = element_blank()) 

print(cat_vs_cat)

if(params$print_results){
  ggsave(file = file.path(directories[["res_eda_dir"]],
                          paste0("categorical_covariates_", delivery_date, ".pdf")), 
         plot = cat_vs_cat, 
         height = 8, width = 8, units = 'in', 
         device = cairo_pdf)
}

Categorical versus continuous

The black line within the box shows the median and the box's upper and lower edges show the inter quartile range (IQR). Whiskers extend to the highest value that is within 1.5*IQR. Data beyond the end of the whiskers are outliers and plotted as points. NA refers to not available, i.e., missing.

cont_vs_cat <-
  ggduo(baseline_data, columns[["base_cat_cov"]], columns[["base_cont_cov"]],
        types = list(comboVertical = wrap("box_no_facet", outlier.shape = 1)))
print(cont_vs_cat)

if(params$print_results){
  ggsave(file = file.path(directories[["res_eda_dir"]],
                          paste0("continous_catagorical_covariates_", delivery_date,".pdf")), 
         plot = cont_vs_cat, 
         height = 10, width = 8, units = 'in', 
         device = cairo_pdf)
}

Distributions stratified by study

cont_by_study <-
  ggduo(baseline_data, "STUDYID", columns[["base_cont_cov"]],
        types = list(comboVertical = wrap("box_no_facet", outlier.shape = 1)))
print(cont_by_study)
cat_by_study <-
  ggduo(baseline_data, "STUDYID", columns[["base_cat_cov"]],
        types = list(discrete = 'facetbar'))
print(cat_by_study)
if(params$print_results){

  ggsave(file = file.path(directories[["res_eda_dir"]],
                          paste0("cont_cov_by_study_", delivery_date,".pdf")), 
         plot = cont_by_study, 
         height = 2.5, width = 5, units = 'in', 
         device = cairo_pdf)

  ggsave(file = file.path(directories[["res_eda_dir"]],
                          paste0("cat_cov_by_study_", delivery_date,".pdf")), 
         plot = cont_by_study, 
         height = 2.5, width = 5, units = 'in', 
         device = cairo_pdf)
}

Distributions by dose group/regimen

The black line within the box shows the median and the box's upper and lower edges show the inter quartile range (IQR). Whiskers extend to the highest value that is within 1.5*IQR. Data beyond the end of the whiskers are outliers and plotted as points.

cont_by_dose <-
  ggduo(baseline_data, "DOSE", columns[["base_cont_cov"]],
        types = list(comboVertical = wrap("box_no_facet", outlier.shape = 1)))
print(cont_by_dose)

The diagonal graphs show bar charts of each covariate. The off-diagonal graphs show the correlation between covariate categories: the black point is a visual reference point, and the numbers are percentage of subjects of a variable split by the groups of the other variable. NA refers to not available, i.e., missing. See also example text for categorical covariate correlation above.

cat_by_dose <-
  ggduo(baseline_data, "DOSE", columns[["base_cat_cov"]],
        types = list(discrete = 'facetbar'))
print(cat_by_dose)
# write to file
if(params$print_results){

  ggsave(file = file.path(directories[["res_eda_dir"]],
                          paste0("cont_cov_by_dose_", delivery_date,".pdf")), 
         plot = cont_by_dose, 
         height = 6, width = 4.5, units = 'in', 
         device = cairo_pdf)

  ggsave(file = file.path(directories[["res_eda_dir"]],
                          paste0("cat_cov_by_dose_", delivery_date,".pdf")), 
         plot = cont_by_dose, 
         height = 6, width = 4.5, units = 'in', 
         device = cairo_pdf)
}

Time-varying covariates versus time

Stratified by study

Continuous

The lines connect data from one individual. Ticks indicate all individual records of the covariate in the dataset. The blue line with shaded area is a loess smooth and its 95% confidence interval indicating any overall trends in changes of covariate values over time after first dose.

# If -99/other flag used for missing variables is included in the dataset 
# I recommend replacing them with NA for this plot.

# Shortest and longest TAFD to set the same x-axes in plot
tafd_range <- conc_data %>% 
  summarize(min = min(TAFD), 
            max = max(TAFD))

# 1. Continuous cov. vs time by study
p1 <- list()
for(i in columns[["cont_cov"]]){
  # Create list with each covariate vs time
  p1[[i]] <-
    ggplot(conc_data, aes_string(x="TAFD", y=i)) +
    geom_point(aes_string(group="NMSEQSID"), size = 1.5, shape=124) +
    geom_line(aes_string(group="NMSEQSID")) +
    geom_smooth(aes_string(x="TAFD", y=i),
                inherit.aes = F, method="loess") +
    facet_wrap(~OSTUDYID, labeller = "label_both", nrow=2) +
    guides(group="none") +
    coord_cartesian(xlim = c(tafd_range$min, tafd_range$max)) +
    labs(y=i, x = labs_TAFD)
}
walk(p1, grob_draw)

if(params$print_results){
  pdf(file = file.path(directories[["res_eda_dir"]], 
                       paste0("time_varying_cont_cov_TAFD_", delivery_date,".pdf")),
      height=8.5, width=11)
  walk(p1, grob_draw)
  dev.off()
}

Categorical

The lines are a step function connecting data from one individual. Ticks indicate all individual records of the covariate in the dataset.

p1 <- list()

for(i in columns[["cat_cov"]]){
  # Create list with each covariate vs time
  p1[[i]] <-
    ggplot(conc_data, aes_string(x="TAFD", y=i, group="NMSEQSID")) +
    geom_point(size = 1.5, shape=124) +
    geom_step() +
    facet_wrap(~OSTUDYID, labeller = "label_both", nrow=2) +
    guides(group="none") +
    coord_cartesian(xlim = c(tafd_range$min, tafd_range$max)) + 
    labs(y=i, x = labs_TAFD)
}
walk(p1, grob_draw)

if(params$print_results){
  pdf(file=file.path(directories[["res_eda_dir"]],
                 paste0("time_varying_cat_cov_TAFD_", delivery_date,".pdf")),
      height=8.5, width=11)
  walk(p1, grob_draw)
  dev.off()
}

Stratified by subject

Continuous

Ticks indicate all individual records of the covariate in the dataset.

time_var_cont_cov_plots <-  list()

for(i in columns[["cont_cov"]]){

  cov_range <- data.frame(
    min = min(conc_data[,i], na.rm=T), 
    max = max(conc_data[,i], na.rm=T))

  # list of individual plots for each covariate
  p1 <- vector("list", length(conc_data_id_splits))

  for(j in 1:length(conc_data_id_splits)){
    p1[[j]] <-
      ggplot(conc_data_id_splits[[j]], aes_string(x="TAFD", y=i)) +
      geom_point(size = 1.5, shape=124) +
      geom_line() +
      facet_wrap(~NMSEQSID+OSTUDYID, labeller = "label_both", nrow=3, ncol=4) +
      coord_cartesian(xlim = c(tafd_range$min, tafd_range$max),
                      ylim =c(cov_range$min, cov_range$max)) +
      labs(y=i, x = labs_TAFD)
  }
  # join lists for all covariates
  time_var_cont_cov_plots <- c(time_var_cont_cov_plots, p1)
}

# takes some time to print if many individuals and many covariates...
walk(time_var_cont_cov_plots, grob_draw)

if(params$print_results){
  pdf(file = file.path(directories[["res_eda_dir"]],
                       paste0("individual_time_varying_cont_cov_TAFD_", delivery_date,".pdf")),
      height=8.5, width=11)
  walk(time_var_cont_cov_plots, grob_draw)
  dev.off()
}

Categorical

The lines are a step function connecting data from one individual. Ticks indicate all individual records of the covariate in the dataset.

time_var_cat_cov_plots <- list()

for(i in columns[["cat_cov"]]){

  # list of individual plots for each covariate
  p1 <- vector("list", length(conc_data_id_splits))

  for(j in 1:length(conc_data_id_splits)){
    p1[[j]] <-
      ggplot(conc_data_id_splits[[j]], aes_string(x="TAFD", y=i)) +
      geom_point(shape=124, size = 1.5) +
      geom_line() +
      facet_wrap(~NMSEQSID+OSTUDYID, labeller = "label_both", nrow=3, ncol=4) +
      coord_cartesian(xlim = c(tafd_range$min, tafd_range$max)) +
      labs(y=i, x = labs_TAFD)
    }
  # join lists for all covariates
  time_var_cat_cov_plots <- c(time_var_cat_cov_plots, p1)
}

# takes some time to print if many individuals and many covariates...
walk(time_var_cat_cov_plots, grob_draw)

if(params$print_results){
  pdf(file = file.path(directories[["res_eda_dir"]],
                       paste0("individual_time_varying_cat_cov_TAFD_", delivery_date,".pdf")),
      height=8.5, width=11)
  walk(time_var_cat_cov_plots, grob_draw)
  dev.off()
}


AstraZeneca/pmworkbench documentation built on Nov. 18, 2019, 12:51 a.m.