# s03_summary_excluded_data.R
# Description: Summary of data excluded with C="C"
# Dependencies: s01_dataset_preparation.R / s01.RData

# Settings to knit in top directory:
# Everything after this chunk works with paths relative to the top-level directory

# Note: R markdown opens a new R session, your global environment is not available.

Load the dataset as prepared by "s01_dataset_preparation.R"

# -----------------------------------------------
# Set up the session
# -----------------------------------------------
# Run the project's environment setup script (path is relative to the
# top-level directory).
source(file.path("./Scripts", "Setup", "setup01_rEnvironment.R"))
# Restore the objects saved in s01.RData. The code below relies on 'rawdata'
# and 'data' -- presumably both come from this file; confirm against s01.
load(file.path("./Scripts", "s01.RData"))

The outputs from this script are:

  1. Summary table of number of excluded data and reasons (split by >/< LLOQ)

  2. Table listing each excluded record

Both are intended to be used in the report (appendices)

Summary of excluded subjects, concentrations and doses

Number of subjects excluded and total number in final data:

# 'rawdata' holds every row; 'data' is the same dataset after the rows
# flagged with C == "C" have been removed.

# Number of unique subjects before exclusion
summarise(rawdata, nSubTot = n_distinct(NMSEQSID))

# Number of unique subjects remaining after exclusion
summarise(data, nSubIncl = n_distinct(NMSEQSID))

Total number of excluded rows:

# Every row flagged for exclusion (C == "C")
excluded <- filter(rawdata, C == "C")

# Split the excluded rows by event type: EVID == 1 is treated as doses,
# EVID == 0 as concentrations.
excl_doses <- filter(excluded, EVID == 1)
excl_conc <- filter(excluded, EVID == 0)

Out of which `r nrow(excl_doses)` rows are doses and `r nrow(excl_conc)` are concentrations.

Is anything other than EVID=1 or EVID=0 excluded?

# TRUE when the dose and concentration subsets do not add up to all excluded
# rows, i.e. some excluded rows have an EVID other than 0 or 1 (or missing).
test <- (nrow(excl_doses) + nrow(excl_conc)) != nrow(excluded)


How much of the excluded concentration data is missing or BLQ?

# Excluded concentration rows with MDV == 1
excl_conc_missing <- filter(excl_conc, MDV == 1)

# Number of such rows ...
nrow(excl_conc_missing)
# ... and their share (in percent) of all excluded concentration rows
100 * (nrow(excl_conc_missing) / nrow(excl_conc))

Summarize excluded data

Review reasons for exclusion

Excluded doses


Excluded concentrations


Primary reason for exclusion split by BLQ/non-BLQ:

# In this dataset there are not multiple comments in one row/cell.
# If that is the case, you need to be careful not to count it twice /
# overwrite the reason / decide which is the primary cause for exclusion.

# Add a REASON column derived from the free-text COMMENT column.
# The two assignments are applied in order: the second keeps the REASON set
# by the first unless its own pattern matches, so a row matching both patterns
# ends up with "Randomly selected for illustration". Rows matching neither
# pattern (or with a missing COMMENT) get NA.
excl_conc <- excl_conc %>%
  mutate(
    REASON = ifelse(str_detect(COMMENT, "Pre first dose sample"),
                    "Pre first dose sample", NA),
    REASON = ifelse(str_detect(COMMENT, "Randomly selected for illustration"),
                    "Randomly selected for illustration", REASON)
  )

# All concentration records (EVID == 0) in the raw data; the row count is the
# denominator for the percentages below.
total_conc <- filter(rawdata, EVID == 0)

# Count the excluded concentrations per reason and BLQ flag, then add the
# percentage of all concentration records and readable BLQ labels.
summaryExclConc <-
  excl_conc %>%
  group_by(REASON, BLQ) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(
    `(%)` = round(100 * (n / nrow(total_conc)), digits = 1),
    BLQ = factor(BLQ, levels = c(0, 1), labels = c("Non-BLQ", "BLQ"))
  )

# Write summary table to file
# The csv is placed in the directory registered as "res_eda_dir" in
# 'directories'; row names are not written.
write.csv(summaryExclConc,
          file = file.path(directories[["res_eda_dir"]], "omittedConcDataSummary.csv"),
          row.names = FALSE)

# Full listing to file
# NOTE(review): every column of the excluded records is written. If the report
# only needs a subset of columns, add a select() step here -- confirm which
# columns the appendix listing should contain.
listingExcluded <-
  excl_conc  # in this case using excl_conc as no doses were excluded
  # excluded

# The csv is placed in the directory registered as "res_eda_dir" in
# 'directories'; row names are not written.
write.csv(listingExcluded,
          file = file.path(directories[["res_eda_dir"]], "omittedDataListing.csv"),
          row.names = FALSE)

How much of the non-BLQ excluded data are from sparse samples:

# Sparse samples are identified here as rows without an OCC assignment
# (OCC is NA). Keep the non-BLQ ones, count them per exclusion reason and
# express each count as a percentage of all concentration records.
summarySparse <-
  excl_conc %>%
  filter(is.na(OCC), BLQ == 0) %>%
  group_by(REASON) %>%
  summarise(n = n()) %>%
  mutate(`(%)` = round(100 * (n / nrow(total_conc)), digits = 1))

Review of comments not leading to exclusion

# 'data' is used here because the rows with C == "C" are already removed
# from it, so any remaining comment did not lead to exclusion.

# Keep rows with a non-empty comment, then count each comment per EVID and
# report the count as a percentage of all rows in 'data'.
data %>%
  filter(!COMMENT %in% c(" ", ""), !is.na(COMMENT)) %>%
  group_by(EVID, COMMENT) %>%
  summarise(
    n = n(),
    percentOfTotal = round(100 * (n / nrow(data)), digits = 2)
  )

