################################################### # s04_eda_cvariates.Rmd # Description: Exploratory data analysis of covariates for population PK analysis # Dependencies: s01_dataset_preparation.R / s01.RData ################################################### # Settings to knit in top directory: # Everything after this chunk works with paths relative top level library(rprojroot) knitr::opts_knit$set(root.dir=find_root(has_file("OpenProject.Rproj"))) knitr::opts_chunk$set(echo=F) # Note: R markdown opens a new R session, your global environment is not available.
This script uses the data.frame "data", loaded from s01.RData. That is, all rows with C=="C" has been excluded for exploratory data analysis.
# ----------------------------------------------- # Prepare environment # ----------------------------------------------- source(file = file.path("./Scripts","Setup","setup01_rEnvironment.R")) load(file = file.path("./Scripts","s01.RData"))
Are the plots and tables also being written to file?
params$print_results
# 1. This needs to be thought through again: how to handle this in variable file instead? # 2. table one does not include missings, is that needed? covMapping <- data.frame(original = c(columns[["base_cont_cov"]], columns[["base_cat_cov"]]), new = c("Age (yrs)", "Creatinine clearance (mL/min)", "Body weight (kg)", "Sex", "Race"), stringsAsFactors = F) # replace with new names columns[["base_cont_cov"]] <- c("Age (yrs)", "Creatinine clearance (mL/min)","Body weight (kg)") columns[["base_cat_cov"]] <- c( "Sex", "Race") # rename to more presentable column names baseline_data <- baseline_data %>% rename_cols_mapping(., covMapping) # ----------------------------------------------- # Overall summary # ----------------------------------------------- # Create table 1 object covTabTot <- CreateTableOne(data = baseline_data, vars = c(columns[["base_cont_cov"]], columns[["base_cat_cov"]]), factorVars = columns[["base_cat_cov"]]) # Covert to data frame covTabTot <- print(covTabTot, contDigits = 1, quote = FALSE, noSpaces = TRUE, printToggle = FALSE, showAllLevels = TRUE, test=F) covTabTot <- cbind(Variable=rownames(covTabTot), covTabTot) # not to lose Variable in next step covTabTot <- as_data_frame(covTabTot) # ----------------------------------------------- # Stratified (by study or dose or whatever seems suitable) # ----------------------------------------------- # Create table 1 object covTabStrat <- CreateTableOne(data = baseline_data, vars = c(columns[["base_cont_cov"]], columns[["base_cat_cov"]]), factorVars = c("Sex", "Race"), strata = "STUDYID") # Covert to data frame covTabStrat <- print(covTabStrat , contDigits = 1, quote = FALSE, noSpaces = TRUE, printToggle = FALSE, showAllLevels = TRUE, test=F) covTabStrat <- cbind(Variable=rownames(covTabStrat ), covTabStrat) # not to lose Variable in next step covTabStrat <- as_data_frame(covTabStrat) %>% rename(`Study 1` = `1`, `Study 2` = `2`) # ----------------------------------------------- # Join to one tab # ----------------------------------------------- covTab <- full_join(covTabTot, covTabStrat) %>% rename(Level = level) %>% select(Variable, Level, `Study 1`,`Study 2`, Overall) kable(covTab) # continuous_covariates <- # baseline_data %>% # summarize_at(.vars = columns[["base_cont_cov"]], # .funs = c("mean", "sd", "min", "max"), na.rm=T) %>% # # restructure # gather(key=Variable) %>% # mutate(Characteristic = str_split(Variable, "_", simplify = T)[,1], # measure = str_split(Variable, "_", simplify = T)[,2]) %>% # select(-Variable) %>% # spread(key=measure, value=value)
# --------- Save to file if(params$print_results){ write.csv(covTab, row.names = F, file = file.path(directories[["res_eda_dir"]], paste0("covariate_tab_", delivery_date,".csv"))) }
The diagonal graphs show histograms of each covariate. The lower off-diagonal graphs are scatter plots of observations (black open circles) and LOESS smooth (black line) and its 95% confidence interval (grey shaded area). The uppoer off-diagonal graphs show the Pearson’s correlation coefficient.
# You may need to set up different lists if you have many covariates # or lumping of groups based on too few individuals etc. cont_vs_cont <- ggpairs(baseline_data, columns = columns[["base_cont_cov"]], upper = list(continuous = wrap("cor", color = "black")), diag= list(continuous = "barDiag"), lower = list(continuous = "smooth")) print(cont_vs_cont) if(params$print_results){ ggsave(file=file.path(directories[["res_eda_dir"]], paste0("continuous_covariates_", delivery_date,".pdf")), plot=cont_vs_cont, height=8, width=8, units = 'in', device=cairo_pdf) }
The diagonal graphs show bar charts of each covariate while the the off-diagonal graphs bar charts split by covariate. NA refers to not available, i.e., missing.
cat_vs_cat <- ggpairs(baseline_data, columns = columns[["base_cat_cov"]], upper = list(discrete = "blank")) + theme(axis.text.x = element_text(angle=40, hjust=1), panel.grid.minor = element_blank()) print(cat_vs_cat) if(params$print_results){ ggsave(file = file.path(directories[["res_eda_dir"]], paste0("categorical_covariates_", delivery_date, ".pdf")), plot = cat_vs_cat, height = 8, width = 8, units = 'in', device = cairo_pdf) }
The black line within the box shows the median and the box's upper and lower edges show the inter quartile range (IQR). Whiskers extend to the highest value that is within 1.5*IQR. Data beyond the end of the whiskers are outliers and plotted as points. NA refers to not available, i.e., missing.
cont_vs_cat <- ggduo(baseline_data, columns[["base_cat_cov"]], columns[["base_cont_cov"]], types = list(comboVertical = wrap("box_no_facet", outlier.shape = 1))) print(cont_vs_cat) if(params$print_results){ ggsave(file = file.path(directories[["res_eda_dir"]], paste0("continous_catagorical_covariates_", delivery_date,".pdf")), plot = cont_vs_cat, height = 10, width = 8, units = 'in', device = cairo_pdf) }
cont_by_study <- ggduo(baseline_data, "STUDYID", columns[["base_cont_cov"]], types = list(comboVertical = wrap("box_no_facet", outlier.shape = 1))) print(cont_by_study)
cat_by_study <- ggduo(baseline_data, "STUDYID", columns[["base_cat_cov"]], types = list(discrete = 'facetbar')) print(cat_by_study)
if(params$print_results){ ggsave(file = file.path(directories[["res_eda_dir"]], paste0("cont_cov_by_study_", delivery_date,".pdf")), plot = cont_by_study, height = 2.5, width = 5, units = 'in', device = cairo_pdf) ggsave(file = file.path(directories[["res_eda_dir"]], paste0("cat_cov_by_study_", delivery_date,".pdf")), plot = cont_by_study, height = 2.5, width = 5, units = 'in', device = cairo_pdf) }
The black line within the box shows the median and the box's upper and lower edges show the inter quartile range (IQR). Whiskers extend to the highest value that is within 1.5*IQR. Data beyond the end of the whiskers are outliers and plotted as points.
cont_by_dose <- ggduo(baseline_data, "DOSE", columns[["base_cont_cov"]], types = list(comboVertical = wrap("box_no_facet", outlier.shape = 1))) print(cont_by_dose)
The diagonal graphs show bar charts of each covariate. The off-diagonal graphs show the correlation between covariate categories: the black point is a visual reference point, and the numbers are percentage of subjects of a variable split by the groups of the other variable. NA refers to not available, i.e., missing. See also example text for categorical covariate correlation above.
cat_by_dose <- ggduo(baseline_data, "DOSE", columns[["base_cat_cov"]], types = list(discrete = 'facetbar')) print(cat_by_dose)
# write to file if(params$print_results){ ggsave(file = file.path(directories[["res_eda_dir"]], paste0("cont_cov_by_dose_", delivery_date,".pdf")), plot = cont_by_dose, height = 6, width = 4.5, units = 'in', device = cairo_pdf) ggsave(file = file.path(directories[["res_eda_dir"]], paste0("cat_cov_by_dose_", delivery_date,".pdf")), plot = cont_by_dose, height = 6, width = 4.5, units = 'in', device = cairo_pdf) }
The lines connect data from one individual. Ticks indicate all individual records of the covariate in the dataset. The blue line with shaded area is a loess smooth and its 95% confidence interval indicating any overall trends in changes of covariate values over time after first dose.
# If -99/other flag used for missing variables is included in the dataset # I recommend replacing them with NA for this plot. # Shortest and longest TAFD to set the same x-axes in plot tafd_range <- conc_data %>% summarize(min = min(TAFD), max = max(TAFD)) # 1. Continuous cov. vs time by study p1 <- list() for(i in columns[["cont_cov"]]){ # Create list with each covariate vs time p1[[i]] <- ggplot(conc_data, aes_string(x="TAFD", y=i)) + geom_point(aes_string(group="NMSEQSID"), size = 1.5, shape=124) + geom_line(aes_string(group="NMSEQSID")) + geom_smooth(aes_string(x="TAFD", y=i), inherit.aes = F, method="loess") + facet_wrap(~OSTUDYID, labeller = "label_both", nrow=2) + guides(group="none") + coord_cartesian(xlim = c(tafd_range$min, tafd_range$max)) + labs(y=i, x = labs_TAFD) } walk(p1, grob_draw) if(params$print_results){ pdf(file = file.path(directories[["res_eda_dir"]], paste0("time_varying_cont_cov_TAFD_", delivery_date,".pdf")), height=8.5, width=11) walk(p1, grob_draw) dev.off() }
The lines are a step function connecting data from one individual. Ticks indicate all individual records of the covariate in the dataset.
p1 <- list() for(i in columns[["cat_cov"]]){ # Create list with each covariate vs time p1[[i]] <- ggplot(conc_data, aes_string(x="TAFD", y=i, group="NMSEQSID")) + geom_point(size = 1.5, shape=124) + geom_step() + facet_wrap(~OSTUDYID, labeller = "label_both", nrow=2) + guides(group="none") + coord_cartesian(xlim = c(tafd_range$min, tafd_range$max)) + labs(y=i, x = labs_TAFD) } walk(p1, grob_draw) if(params$print_results){ pdf(file=file.path(directories[["res_eda_dir"]], paste0("time_varying_cat_cov_TAFD_", delivery_date,".pdf")), height=8.5, width=11) walk(p1, grob_draw) dev.off() }
Ticks indicate all individual records of the covariate in the dataset.
time_var_cont_cov_plots <- list() for(i in columns[["cont_cov"]]){ cov_range <- data.frame( min = min(conc_data[,i], na.rm=T), max = max(conc_data[,i], na.rm=T)) # list of individual plots for each covariate p1 <- vector("list", length(conc_data_id_splits)) for(j in 1:length(conc_data_id_splits)){ p1[[j]] <- ggplot(conc_data_id_splits[[j]], aes_string(x="TAFD", y=i)) + geom_point(size = 1.5, shape=124) + geom_line() + facet_wrap(~NMSEQSID+OSTUDYID, labeller = "label_both", nrow=3, ncol=4) + coord_cartesian(xlim = c(tafd_range$min, tafd_range$max), ylim =c(cov_range$min, cov_range$max)) + labs(y=i, x = labs_TAFD) } # join lists for all covariates time_var_cont_cov_plots <- c(time_var_cont_cov_plots, p1) } # takes some time to print if many individuals and many covariates... walk(time_var_cont_cov_plots, grob_draw) if(params$print_results){ pdf(file = file.path(directories[["res_eda_dir"]], paste0("individual_time_varying_cont_cov_TAFD_", delivery_date,".pdf")), height=8.5, width=11) walk(time_var_cont_cov_plots, grob_draw) dev.off() }
The lines are a step function connecting data from one individual. Ticks indicate all individual records of the covariate in the dataset.
time_var_cat_cov_plots <- list() for(i in columns[["cat_cov"]]){ # list of individual plots for each covariate p1 <- vector("list", length(conc_data_id_splits)) for(j in 1:length(conc_data_id_splits)){ p1[[j]] <- ggplot(conc_data_id_splits[[j]], aes_string(x="TAFD", y=i)) + geom_point(shape=124, size = 1.5) + geom_line() + facet_wrap(~NMSEQSID+OSTUDYID, labeller = "label_both", nrow=3, ncol=4) + coord_cartesian(xlim = c(tafd_range$min, tafd_range$max)) + labs(y=i, x = labs_TAFD) } # join lists for all covariates time_var_cat_cov_plots <- c(time_var_cat_cov_plots, p1) } # takes some time to print if many individuals and many covariates... walk(time_var_cat_cov_plots, grob_draw) if(params$print_results){ pdf(file = file.path(directories[["res_eda_dir"]], paste0("individual_time_varying_cat_cov_TAFD_", delivery_date,".pdf")), height=8.5, width=11) walk(time_var_cat_cov_plots, grob_draw) dev.off() }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.