###################################################
# s02_dataset_review.Rmd
# Description: Review of rawdata, identification of a priori outliers etc
# Dependencies: s01_dataset_preparation.R / s01.RData
###################################################

# Settings to knit in top directory:
# Everything after this chunk works with paths relative top level
library(rprojroot)
knitr::opts_knit$set(root.dir=find_root(has_file("OpenProject.Rproj"))) 
knitr::opts_chunk$set(echo=F)

# Note: R markdown opens a new R session, your global environment is not available.

Load the dataset as prepared by "s01_dataset_preparation.R":

# -----------------------------------------------
# Prepare environment
# -----------------------------------------------
source(file = file.path("./Scripts","Setup","setup01_rEnvironment.R"))
load(file = file.path("./Scripts","s01.RData"))

Structure and dimentions

The first 10 rows of the data:

head(rawdata, 10)

The dimentions of the data:

dim(rawdata)

Included columns:

names(rawdata)

Are there any duplicated rows?

check_message(logical = any(duplicated(rawdata)))

Study related columns and project setup files

Are all columns in the dataset defined in "setup03_variables.R"?

check_columns(names(rawdata), columns[['all']])

Are any columns defined in "setup03_variables.R" and not included in the dataset?

check_columns(columns[['all']], names(rawdata))

Which studies are included and are they defined in "setup03_variables.R":

  1. Character version
unique(rawdata$OSTUDYID)

check_columns(unique(rawdata$OSTUDYID), as.integer(ostudies))
  1. Character version
unique(rawdata$STUDYID)

check_columns(unique(rawdata$STUDYID), as.integer(studies))

Which cohorts are included and are they defined in "setup03_variables.R":

unique(rawdata$COHORT)

check_columns(unique(rawdata$COHORT), cohorts)

Which parts are included and are they defined in "setup03_variables.R":

unique(rawdata$PART)

check_columns(unique(rawdata$PART), parts)

Which dose groups are included and are they defined in "setup03_variables.R":

unique(rawdata$DOSE)  

check_columns(unique(rawdata$DOSE), as.integer(doses))

Cross-check of studies, cohorts, parts and dosing regimens

# Note: code needs to be modified if several dose/freq levels per cohort
rawdata %>% 
  group_by(STUDYID, COHORT) %>% 
  summarize(ostudyid = collapse_unique(OSTUDYID),
            studyid  = collapse_unique(STUDYID), 
            dose = collapse_unique(DOSE),
            freq = collapse_unique(FREQ), 
            regimen = collapse_unique(paste(DOSE, FREQ)))

# Uncomment and include if you want to cross-check parts
# rawdata %>% 
#   group_by(PART) %>% 
#   summarize(ostudyid = unique(OSTUDYID),
#             cohort   = unique(COHORT),
#             studyid  = unique(STUDYID),
#             dose     = unique(DOSE), 
#             freq = unique(FREQ), 
#             regimen  = unique(paste(DOSE, FREQ)))

Is the data sorted correctly?

Is NMSEQSID defined in an increasing order?

ids      <- unique(rawdata$NMSEQSID)
id_shift <- c(ids[-1], NA)
id_diff  <- na.omit(ids-id_shift)

check_message(logical = all(id_diff < 0))

Is TAFD increasing for each subject?

tmp <- 
  rawdata %>%
  group_by(NMSEQSID) %>%
  mutate(TAFDSHIFT = c(TAFD[-1], NA)) %>%
  mutate(TAFDDIFF = (TAFD-TAFDSHIFT)) %>%
  filter(!is.na(TAFDDIFF)) %>% 
  summarize(TAFDINCR = all(TAFDDIFF < 0)) %>% # summary for each patient
  summarize(output = unique(TAFDINCR))        # summary for entire dataset

check_message(logical = tmp$output)

Is TAPD increasing for each dose event?

## Uncomment first part if dataset is using ADDL and II and doesnt include dosing records

# # Explicitly add dose times to the dataset (expand addl and ii)
# doses <- rawdata %>% 
#   filter(!is.na(AMT)) %>% 
#   select(NMSEQSID, TAFD, AMT, ADDL, II)  
# 
# for(i in 1:nrow(doses)){
#   dosetime <- seq(from = doses$TAFD[i], 
#                   by   = doses$II[i], 
#                   length.out = (doses$ADDL[i]+1))
#   
#   dat <- data.frame(NMSEQSID = rep(doses$NMSEQSID[i], length(dosetime)), 
#                     TAFD = dosetime, 
#                     AMT = rep(doses$AMT[i], length(dosetime)))
#   if(i == 1){
#     dat_out <- dat
#   } else {
#     dat_out <- bind_rows(dat_out, dat)
#   }
# } 
# 
# # Merge 
# tmp <- rawdata %>% 
#   full_join(dat_out) %>% 
#   arrange(NMSEQSID, TAFD)
# 
# rm(dat, dat_out, dosetime, doses)

tmp <- rawdata

# Add a flag identifier for each dosing event (can be checked irrespectively of subject id)
# 1. Check if there are any rows before the first dose. 
na_index <- which(!is.na(tmp$AMT))[1]-1
# 2. Create flag
tmp <- tmp %>% 
  mutate(AMTFLAG = ifelse(!is.na(AMT), c(1:n()), NA), 
         AMTFLAG = ifelse(na_index==0, 
                          na.locf(AMTFLAG), 
                          # if rows before first dose, fill those with NA and do locf for the rest
                          c(rep(NA, na_index), na.locf(AMTFLAG)))
  )

# Summarize similar to above
tmp <- tmp %>%
  group_by(AMTFLAG) %>%
  mutate(TAPDSHIFT = c(TAPD[-1], NA)) %>%
  mutate(TAPDDIFF = (TAPD-TAPDSHIFT)) %>%
  filter(!is.na(TAPDDIFF)) %>% 
  summarize(TAPDINCR = all(TAPDDIFF < 0)) %>% 
  summarize(output = unique(TAPDINCR))

check_message(logical = tmp$output)

Are the dates and times increasing for each subject?

# # This code needs to be double checked so that it works. (no date and time included in this example set)
# rawdata <- 
#   rawdata %>% 
#   mutate(DATETIME = paste(DATE, TIME, sep=" ")) %>% 
#   mutate(DATETIME = as.Date(DATETIME, format = "%Y-%m-%d %H:%M")) # may need to change format
# 
# rawdata %>% 
#   group_by(NMSEQSID) %>% 
#   mutate(DATETIMEShift = c(DATETIME[-1], NA)) %>% 
#   mutate(DATETIMEDiff = (DATETIME-DATETIMEShift)) %>% 
#   summarize(DateTimeIncr = all(DATETIMEDiff[!is.na(DATETIMEDiff)] < 0)) %>% 
#   summarize(unique(DateTimeIncr))

Subjects

The total number of subjects included is:

nTotal <- length(unique(rawdata$NMSEQSID))
nTotal

Cross-check of numeric and original subject ID columns:

nTotal == length(unique(rawdata$OSID))

rawdata %>% 
  group_by(NMSEQSID) %>% 
  summarize(N = length(unique(OSID))) %>% 
  summarize(max = max(N), 
            duplicates = unique(duplicated(NMSEQSID)))

The total number of subjects in each study:

rawdata %>% 
  group_by(OSTUDYID) %>% 
  summarize(N=length(unique(NMSEQSID)))

Number of PK occasions and maximum follow up time per subject:

rawdata %>% 
  group_by(STUDYID, NMSEQSID) %>% 
  summarize(nOcc=ifelse(all(is.na(OCC)), 
                        0, max(unique(OCC[is.na(AMT)]), na.rm=T)),
            maxTAFD=max(TAFD, na.rm=T)) %>% 
  summarize(min_occ_per_id = min(nOcc), 
            max_occ_per_id = max(nOcc), 
            min_follow_up_per_id = min(maxTAFD), 
            max_follow_up_per_id = max(maxTAFD))

Regimens

Does any subject have any (unexpected) change in dosing freqency?

# Don't forget to exclude cohorts where you expect a change (if any)
tmp <- rawdata %>% 
  group_by(STUDYID, NMSEQSID) %>% 
  distinct(FREQ, .keep_all = T) %>% 
  ungroup() %>% 
  summarize(duplicates = unique(duplicated(NMSEQSID))) 

# Use if using ADDL and II in dataset
# tmp <- rawdata %>% 
#   group_by(STUDYID, NMSEQSID) %>% 
#   filter(II != 0) %>% 
#   summarize(II = collapse_unique(II)) %>% 
#   summarize(duplicates = unique(duplicated(NMSEQSID))) 

check_message(logical = any(tmp$duplicates))

Does any subject have any (unexpected) change in dose?

# Don't forget to exclude cohorts where you expect a change (if any)
tmp <- rawdata %>% 
  group_by(STUDYID, NMSEQSID) %>% 
  filter(!is.na(AMT)) %>% 
  summarize(AMT = collapse_unique(AMT)) %>% 
  summarize(duplicates=unique(duplicated(NMSEQSID))) 

check_message(logical = any(tmp$duplicates))

Other column summaries (not including covariates)

Numeric columns:

rawdata %>% 
  select(columns[['numeric']]) %>% 
  summary()

Numeric columns by study:

for(i in studies){
  print(paste("study", i,":"))
  print(
    rawdata %>% 
      filter(STUDYID==i) %>% select(columns[['numeric']]) %>% 
      summary()
    )
}

Unique values of categorical columns:

for(i in columns[['factors']]){
  print(rawdata %>% select(i) %>% unique())
}

Unique values of categorical columns by study:

for(i in columns[['factors']]){
print(rawdata %>% group_by(STUDYID) %>%  
        select(c("STUDYID", i)) %>% unique())
}

Cross-check of AMT, EVID, DV, MDV, BQL and CMT:

Is EVID==0 when AMT is missing?

test <- rawdata %>% 
  filter(is.na(AMT)) %>% 
  summarise(evid = collapse_unique(EVID))

check_message(logical = test$evid==1)

Is EVID==1 when AMT is not missing?

test <- rawdata %>% 
  filter(!is.na(AMT)) %>% 
  summarise(evid = collapse_unique(EVID))

check_message(logical = test$evid==1)

Is EVID=0 for observations?

test <- rawdata %>% 
  filter(!is.na(DV)) %>% 
  summarize(evid = collapse_unique(EVID))

check_message(logical = test$evid==0)

Is BLQ=0 for non-missing observations?

test <- rawdata %>% 
  filter(!is.na(DV)) %>% 
  summarize(blq = collapse_unique(BLQ))
check_message(logical = test$blq==0)

Are any quantified concentrations reported $<$LLOQ?

test <- rawdata %>% 
  filter(!is.na(DV)) %>% 
  summarize(blq_dv = collapse_unique(DV < LLOQ)) 

check_message(logical = test$blq_dv=="FALSE")

Are all TAPD=0 dosing events (evid=1)?

test <- rawdata %>% 
  filter(!is.na(TAPD) & TAPD==0) %>% 
  summarize(evid = collapse_unique(EVID))

check_message(logical = test$evid==1)

Are the compartments set correctly?

rawdata %>% 
  group_by(EVID) %>% 
  summarize(cmt = collapse_unique(CMT)) 

Does rows with missing DV and the correct EVID, MDV and BLQ?

rawdata %>% 
  filter(is.na(DV) & is.na(AMT)) %>% 
  summarize(dv = collapse_unique(DV), 
            evid = collapse_unique(EVID), 
            mdv = collapse_unique(MDV), 
            blq = collapse_unique(BLQ))

Does DV and LNDV align?

  1. are the same rows missing?
unique(is.na(rawdata$DV) == is.na(rawdata$LNDV)) 
  1. Is exp(LNDV) the same as DV? If not, what is the max difference (on DV scale)?
tmp <- rawdata %>% 
  filter(!is.na(DV)) %>% 
  select(DV, LNDV) %>% 
  mutate_if(is.numeric, signif, digits=3) %>% 
  mutate(is_equal  = DV==LNDV, 
         deviation = DV-exp(LNDV))

logic <- tmp %>% 
  summarize(test = all(is_equal))

check_message(logical = !logic$test)

if(!logic$test){
  # print maximum absolute deviation
  print(max(abs(tmp$deviation)))
} 

Cross-check of C and COMMENTS

Is there any C="C" which does not have an exclusion comment?

test <- rawdata %>% 
  filter(C=="C") %>% 
  summarise(comment = any(COMMENT %in% c("", " ") | is.na(COMMENT)))

check_message(logical = test$comment)

Check of concentration-time data

Comparison of nomial time and time after dose: Note: The code in the 4 following chunks are examples from a previous project and may contain bugs.

# data subset with observations only
rawdataConc <- rawdata %>% 
  filter(EVID==0) %>% 
  filter(!(MDV==1 & is.na(BLQ)))

# # This comparison may have to be split by study if data includes many subjects and different study designs
# 
# rawdataConc %>% 
#   mutate(DIFF = NOMTIME - TAPD)
# summary(rawdataConc$DIFF)

Are there any missings for NOMTIME or TAPD?

# tmp <- rawdataConc %>% 
#   filter(is.na(DIFF))
# tmp
# unique(tmp$COMMENT)
# unique(tmp$C)

Check of negative differences (i.e. assessment later than planned)

# rawdataConc %>% 
#   filter(!is.na(DIFF) & DIFF <= -0.17) %>% #>= 10 min difference
#   select(c("OSID","DATE","TIME",'TAFD','NOMTIME','TAPD',
#            "DIFF",'DV','BLQ','FREQ','COMMENT','MDV','OCC','COHORT')) %>% 
#   group_by(FREQ, NOMTIME, COHORT) %>% 
#   summarize(min = min(DIFF),
#             max = max(DIFF),
#             median=median(DIFF),
#             n = length(DIFF))

Check of positive differences (i.e. assessment earlier than planned)

# rawdataConc %>% 
#   filter(!is.na(DIFF) & DIFF >= 0.17) %>% #>= 10 min difference
#   select(c("OSID","DATE","TIME",'TAFD','NOMTIME','TAPD',
#            "DIFF",'DV','BLQ','FREQ','COMMENT','MDV','OCC','COHORT')) %>% 
#   group_by(FREQ, NOMTIME, COHORT) %>% 
#   summarize(min = min(DIFF),
#             max = max(DIFF),
#             median=median(DIFF),
#             n = length(DIFF))

Covariate summary and cross-check

Baseline data

baseline_rawdata <- rawdata %>% 
  distinct(NMSEQSID, .keep_all = T)

Summary of continous covariates:

summary(baseline_rawdata[, columns[['base_cont_cov']]])

Unique values of categorical covariates:

apply(baseline_rawdata[, columns[['base_cat_cov']]], 2, unique)

Is there only one value for each subject? (should return TRUE for all subjects)

# Each individual should only have one value
per_id <- function(x){
  return(length(unique(x))==1) 
}
summary(
  rawdata %>% 
  group_by(NMSEQSID) %>% 
  summarize_at(c(columns[['base_cont_cov']],columns[['base_cat_cov']]), 
               funs(values=per_id))
  )

Cross-check of covariate categories to ensure that the coding seems correct:

Does males have higher body weight?

baseline_rawdata %>% 
  filter(BWT !=-99 | !is.na(BWT) ) %>% 
  group_by(SEXM) %>% 
  summarise(min = min(BWT),
            median = median(BWT),
            max = max(BWT))

Is the RENAL impairment correct compared to CRCL and/or EGFR?

baseline_rawdata %>% 
  group_by(BRENAL) %>% 
  filter(BEGFR !=-99) %>% 
  summarize(min = min(BEGFR, na.rm = T),
            median = median(BEGFR, na.rm = T),
            max = max(BEGFR, na.rm = T))

# baseline_rawdata %>% 
#   group_by(BRENAL) %>% 
#   filter(BCRCL !=-99) %>% 
#   summarize(min = min(BCRCL, na.rm = T),
#             median = median(BCRCL, na.rm = T),
#             max = max(BCRCL, na.rm = T))

Compare the EGFR output to:

| Category | Definition | | ---------- | ------------------------------------------------------| | Normal | eGFR (mL/min/1.73 m2) ≥90; or CRCL (mL/min) ≥90 | | Mild | eGFR (mL/min/1.73 m2) 60-89; or CRCL (mL/min) 60-89 | | Moderate | eGFR (mL/min/1.73 m2) 30-59; or CRCL (mL/min) 30-59 | | Severe | eGFR (mL/min/1.73 m2) 15-29; or CRCL (mL/min) 15-29 | | End Stage | eGFR (mL/min/1.73 m2) <15 /on dialysis; or CRCL (mL/min) <15 /on dialysis |

Is the HEPATIC impairment correct compared to BI/AST (if NCI criteria)?

baseline_rawdata %>%
  group_by(BHEPATIC) %>%
  filter(BBI !=-99) %>%
  summarize(minBI = min(BBI, na.rm = T),
            medianBI = median(BBI, na.rm = T),
            maxBI = max(BBI, na.rm = T),
            minAST    = min(BAST, na.rm = T),
            medianAST = median(BAST, na.rm = T),
            maxAST    = max(BAST, na.rm = T))

Compare the output to e.g. NCI criteria (or modify to what was used to define categories):

| Category | Definition | | ---------- | -------------------------------------------------------| | Normal | Bilirubin ≤ ULN, and AST ≤ ULN | | Mild | Bilirubin ≤ ULN, and AST > ULN or Bilirubin >1.0*ULN to ≤1.5*ULN, and AST of any value | | Moderate | Bilirubin > 1.5ULN to ≤ 3ULN, and AST of any value | | Severe | Bilirubin >3.0*ULN, and AST of any value |

(The upper limit of normal (ULN) for bilirubin and AST are defined within the SDTM of each study.)

Spot check of derived variables:

# Selection of random ids to re-calculate for
ids <- sample(unique(baseline_rawdata$OSID), 10)
temp <- baseline_rawdata[baseline_rawdata$OSID %in% ids,]

Baseline Creatinine clearance

# Re-calculate and compare
CRCL <- temp %>% 
  mutate(calcCRCL = (140-AGE) * BWT / (BSCR*0.8136), # males
         calcCRCL = ifelse(SEXM==0, 0.85*calcCRCL, calcCRCL), # females
         calcCRCL = round(calcCRCL, digits = 2),
         orignialCRCL = round(BCRCL, digits = 2)) %>% 
  select(calcCRCL, orignialCRCL)
CRCL

Baseline EGFR

# Re-calculate and compare

BEGFR <- vector("numeric", length(ids))
for(i in 1:length(ids)){

  # not black male
  BEGFR[i] <- 175 * temp$BSCR[i]^(-1.154) * temp$AGE[i]^(-0.203)

  # if female
  if(temp$SEXM[i]==0){
    BEGFR[i] <- BEGFR[i] * 0.742
  }
  # if black
  if(temp$RACE[i]==2){
    BEGFR[i] <- BEGFR[i] * 1.212
  }
}
round(BEGFR, digits = 2)
round(temp$BEGFR, digits = 2)

Baseline body surface area

round(0.007184 * (temp$BWT^0.425) * (temp$BHT^0.725), digits=2)
round(temp$BBSA, digits = 2)

Baseline BMI

round(temp$BWT / ((temp$BHT/100)^2), digits=2)
round(temp$BBMI, digits = 2)


AstraZeneca/pmworkbench documentation built on Nov. 18, 2019, 12:51 a.m.