################################################### # s02_dataset_review.Rmd # Description: Review of rawdata, identification of a priori outliers etc # Dependencies: s01_dataset_preparation.R / s01.RData ################################################### # Settings to knit in top directory: # Everything after this chunk works with paths relative top level library(rprojroot) knitr::opts_knit$set(root.dir=find_root(has_file("OpenProject.Rproj"))) knitr::opts_chunk$set(echo=F) # Note: R markdown opens a new R session, your global environment is not available.
Load the dataset as prepared by "s01_dataset_preparation.R":
# ----------------------------------------------- # Prepare environment # ----------------------------------------------- source(file = file.path("./Scripts","Setup","setup01_rEnvironment.R")) load(file = file.path("./Scripts","s01.RData"))
The first 10 rows of the data:
head(rawdata, 10)
The dimentions of the data:
dim(rawdata)
Included columns:
names(rawdata)
Are there any duplicated rows?
check_message(logical = any(duplicated(rawdata)))
Are all columns in the dataset defined in "setup03_variables.R"?
check_columns(names(rawdata), columns[['all']])
Are any columns defined in "setup03_variables.R" and not included in the dataset?
check_columns(columns[['all']], names(rawdata))
Which studies are included and are they defined in "setup03_variables.R":
unique(rawdata$OSTUDYID) check_columns(unique(rawdata$OSTUDYID), as.integer(ostudies))
unique(rawdata$STUDYID) check_columns(unique(rawdata$STUDYID), as.integer(studies))
Which cohorts are included and are they defined in "setup03_variables.R":
unique(rawdata$COHORT) check_columns(unique(rawdata$COHORT), cohorts)
Which parts are included and are they defined in "setup03_variables.R":
unique(rawdata$PART) check_columns(unique(rawdata$PART), parts)
Which dose groups are included and are they defined in "setup03_variables.R":
unique(rawdata$DOSE) check_columns(unique(rawdata$DOSE), as.integer(doses))
# Note: code needs to be modified if several dose/freq levels per cohort rawdata %>% group_by(STUDYID, COHORT) %>% summarize(ostudyid = collapse_unique(OSTUDYID), studyid = collapse_unique(STUDYID), dose = collapse_unique(DOSE), freq = collapse_unique(FREQ), regimen = collapse_unique(paste(DOSE, FREQ))) # Uncomment and include if you want to cross-check parts # rawdata %>% # group_by(PART) %>% # summarize(ostudyid = unique(OSTUDYID), # cohort = unique(COHORT), # studyid = unique(STUDYID), # dose = unique(DOSE), # freq = unique(FREQ), # regimen = unique(paste(DOSE, FREQ)))
Is NMSEQSID defined in an increasing order?
ids <- unique(rawdata$NMSEQSID) id_shift <- c(ids[-1], NA) id_diff <- na.omit(ids-id_shift) check_message(logical = all(id_diff < 0))
Is TAFD increasing for each subject?
tmp <- rawdata %>% group_by(NMSEQSID) %>% mutate(TAFDSHIFT = c(TAFD[-1], NA)) %>% mutate(TAFDDIFF = (TAFD-TAFDSHIFT)) %>% filter(!is.na(TAFDDIFF)) %>% summarize(TAFDINCR = all(TAFDDIFF < 0)) %>% # summary for each patient summarize(output = unique(TAFDINCR)) # summary for entire dataset check_message(logical = tmp$output)
Is TAPD increasing for each dose event?
## Uncomment first part if dataset is using ADDL and II and doesnt include dosing records # # Explicitly add dose times to the dataset (expand addl and ii) # doses <- rawdata %>% # filter(!is.na(AMT)) %>% # select(NMSEQSID, TAFD, AMT, ADDL, II) # # for(i in 1:nrow(doses)){ # dosetime <- seq(from = doses$TAFD[i], # by = doses$II[i], # length.out = (doses$ADDL[i]+1)) # # dat <- data.frame(NMSEQSID = rep(doses$NMSEQSID[i], length(dosetime)), # TAFD = dosetime, # AMT = rep(doses$AMT[i], length(dosetime))) # if(i == 1){ # dat_out <- dat # } else { # dat_out <- bind_rows(dat_out, dat) # } # } # # # Merge # tmp <- rawdata %>% # full_join(dat_out) %>% # arrange(NMSEQSID, TAFD) # # rm(dat, dat_out, dosetime, doses) tmp <- rawdata # Add a flag identifier for each dosing event (can be checked irrespectively of subject id) # 1. Check if there are any rows before the first dose. na_index <- which(!is.na(tmp$AMT))[1]-1 # 2. Create flag tmp <- tmp %>% mutate(AMTFLAG = ifelse(!is.na(AMT), c(1:n()), NA), AMTFLAG = ifelse(na_index==0, na.locf(AMTFLAG), # if rows before first dose, fill those with NA and do locf for the rest c(rep(NA, na_index), na.locf(AMTFLAG))) ) # Summarize similar to above tmp <- tmp %>% group_by(AMTFLAG) %>% mutate(TAPDSHIFT = c(TAPD[-1], NA)) %>% mutate(TAPDDIFF = (TAPD-TAPDSHIFT)) %>% filter(!is.na(TAPDDIFF)) %>% summarize(TAPDINCR = all(TAPDDIFF < 0)) %>% summarize(output = unique(TAPDINCR)) check_message(logical = tmp$output)
Are the dates and times increasing for each subject?
# # This code needs to be double checked so that it works. (no date and time included in this example set) # rawdata <- # rawdata %>% # mutate(DATETIME = paste(DATE, TIME, sep=" ")) %>% # mutate(DATETIME = as.Date(DATETIME, format = "%Y-%m-%d %H:%M")) # may need to change format # # rawdata %>% # group_by(NMSEQSID) %>% # mutate(DATETIMEShift = c(DATETIME[-1], NA)) %>% # mutate(DATETIMEDiff = (DATETIME-DATETIMEShift)) %>% # summarize(DateTimeIncr = all(DATETIMEDiff[!is.na(DATETIMEDiff)] < 0)) %>% # summarize(unique(DateTimeIncr))
The total number of subjects included is:
nTotal <- length(unique(rawdata$NMSEQSID)) nTotal
Cross-check of numeric and original subject ID columns:
nTotal == length(unique(rawdata$OSID)) rawdata %>% group_by(NMSEQSID) %>% summarize(N = length(unique(OSID))) %>% summarize(max = max(N), duplicates = unique(duplicated(NMSEQSID)))
The total number of subjects in each study:
rawdata %>% group_by(OSTUDYID) %>% summarize(N=length(unique(NMSEQSID)))
Number of PK occasions and maximum follow up time per subject:
rawdata %>% group_by(STUDYID, NMSEQSID) %>% summarize(nOcc=ifelse(all(is.na(OCC)), 0, max(unique(OCC[is.na(AMT)]), na.rm=T)), maxTAFD=max(TAFD, na.rm=T)) %>% summarize(min_occ_per_id = min(nOcc), max_occ_per_id = max(nOcc), min_follow_up_per_id = min(maxTAFD), max_follow_up_per_id = max(maxTAFD))
Does any subject have any (unexpected) change in dosing freqency?
# Don't forget to exclude cohorts where you expect a change (if any) tmp <- rawdata %>% group_by(STUDYID, NMSEQSID) %>% distinct(FREQ, .keep_all = T) %>% ungroup() %>% summarize(duplicates = unique(duplicated(NMSEQSID))) # Use if using ADDL and II in dataset # tmp <- rawdata %>% # group_by(STUDYID, NMSEQSID) %>% # filter(II != 0) %>% # summarize(II = collapse_unique(II)) %>% # summarize(duplicates = unique(duplicated(NMSEQSID))) check_message(logical = any(tmp$duplicates))
Does any subject have any (unexpected) change in dose?
# Don't forget to exclude cohorts where you expect a change (if any) tmp <- rawdata %>% group_by(STUDYID, NMSEQSID) %>% filter(!is.na(AMT)) %>% summarize(AMT = collapse_unique(AMT)) %>% summarize(duplicates=unique(duplicated(NMSEQSID))) check_message(logical = any(tmp$duplicates))
Numeric columns:
rawdata %>% select(columns[['numeric']]) %>% summary()
Numeric columns by study:
for(i in studies){ print(paste("study", i,":")) print( rawdata %>% filter(STUDYID==i) %>% select(columns[['numeric']]) %>% summary() ) }
Unique values of categorical columns:
for(i in columns[['factors']]){ print(rawdata %>% select(i) %>% unique()) }
Unique values of categorical columns by study:
for(i in columns[['factors']]){ print(rawdata %>% group_by(STUDYID) %>% select(c("STUDYID", i)) %>% unique()) }
Is EVID==0 when AMT is missing?
test <- rawdata %>% filter(is.na(AMT)) %>% summarise(evid = collapse_unique(EVID)) check_message(logical = test$evid==1)
Is EVID==1 when AMT is not missing?
test <- rawdata %>% filter(!is.na(AMT)) %>% summarise(evid = collapse_unique(EVID)) check_message(logical = test$evid==1)
Is EVID=0 for observations?
test <- rawdata %>% filter(!is.na(DV)) %>% summarize(evid = collapse_unique(EVID)) check_message(logical = test$evid==0)
Is BLQ=0 for non-missing observations?
test <- rawdata %>% filter(!is.na(DV)) %>% summarize(blq = collapse_unique(BLQ)) check_message(logical = test$blq==0)
Are any quantified concentrations reported $<$LLOQ?
test <- rawdata %>% filter(!is.na(DV)) %>% summarize(blq_dv = collapse_unique(DV < LLOQ)) check_message(logical = test$blq_dv=="FALSE")
Are all TAPD=0 dosing events (evid=1)?
test <- rawdata %>% filter(!is.na(TAPD) & TAPD==0) %>% summarize(evid = collapse_unique(EVID)) check_message(logical = test$evid==1)
Are the compartments set correctly?
rawdata %>% group_by(EVID) %>% summarize(cmt = collapse_unique(CMT))
Does rows with missing DV and the correct EVID, MDV and BLQ?
rawdata %>% filter(is.na(DV) & is.na(AMT)) %>% summarize(dv = collapse_unique(DV), evid = collapse_unique(EVID), mdv = collapse_unique(MDV), blq = collapse_unique(BLQ))
Does DV and LNDV align?
unique(is.na(rawdata$DV) == is.na(rawdata$LNDV))
tmp <- rawdata %>% filter(!is.na(DV)) %>% select(DV, LNDV) %>% mutate_if(is.numeric, signif, digits=3) %>% mutate(is_equal = DV==LNDV, deviation = DV-exp(LNDV)) logic <- tmp %>% summarize(test = all(is_equal)) check_message(logical = !logic$test) if(!logic$test){ # print maximum absolute deviation print(max(abs(tmp$deviation))) }
Is there any C="C" which does not have an exclusion comment?
test <- rawdata %>% filter(C=="C") %>% summarise(comment = any(COMMENT %in% c("", " ") | is.na(COMMENT))) check_message(logical = test$comment)
Comparison of nomial time and time after dose: Note: The code in the 4 following chunks are examples from a previous project and may contain bugs.
# data subset with observations only rawdataConc <- rawdata %>% filter(EVID==0) %>% filter(!(MDV==1 & is.na(BLQ))) # # This comparison may have to be split by study if data includes many subjects and different study designs # # rawdataConc %>% # mutate(DIFF = NOMTIME - TAPD) # summary(rawdataConc$DIFF)
Are there any missings for NOMTIME or TAPD?
# tmp <- rawdataConc %>% # filter(is.na(DIFF)) # tmp # unique(tmp$COMMENT) # unique(tmp$C)
Check of negative differences (i.e. assessment later than planned)
# rawdataConc %>% # filter(!is.na(DIFF) & DIFF <= -0.17) %>% #>= 10 min difference # select(c("OSID","DATE","TIME",'TAFD','NOMTIME','TAPD', # "DIFF",'DV','BLQ','FREQ','COMMENT','MDV','OCC','COHORT')) %>% # group_by(FREQ, NOMTIME, COHORT) %>% # summarize(min = min(DIFF), # max = max(DIFF), # median=median(DIFF), # n = length(DIFF))
Check of positive differences (i.e. assessment earlier than planned)
# rawdataConc %>% # filter(!is.na(DIFF) & DIFF >= 0.17) %>% #>= 10 min difference # select(c("OSID","DATE","TIME",'TAFD','NOMTIME','TAPD', # "DIFF",'DV','BLQ','FREQ','COMMENT','MDV','OCC','COHORT')) %>% # group_by(FREQ, NOMTIME, COHORT) %>% # summarize(min = min(DIFF), # max = max(DIFF), # median=median(DIFF), # n = length(DIFF))
baseline_rawdata <- rawdata %>% distinct(NMSEQSID, .keep_all = T)
Summary of continous covariates:
summary(baseline_rawdata[, columns[['base_cont_cov']]])
Unique values of categorical covariates:
apply(baseline_rawdata[, columns[['base_cat_cov']]], 2, unique)
Is there only one value for each subject? (should return TRUE for all subjects)
# Each individual should only have one value per_id <- function(x){ return(length(unique(x))==1) } summary( rawdata %>% group_by(NMSEQSID) %>% summarize_at(c(columns[['base_cont_cov']],columns[['base_cat_cov']]), funs(values=per_id)) )
Does males have higher body weight?
baseline_rawdata %>% filter(BWT !=-99 | !is.na(BWT) ) %>% group_by(SEXM) %>% summarise(min = min(BWT), median = median(BWT), max = max(BWT))
Is the RENAL impairment correct compared to CRCL and/or EGFR?
baseline_rawdata %>% group_by(BRENAL) %>% filter(BEGFR !=-99) %>% summarize(min = min(BEGFR, na.rm = T), median = median(BEGFR, na.rm = T), max = max(BEGFR, na.rm = T)) # baseline_rawdata %>% # group_by(BRENAL) %>% # filter(BCRCL !=-99) %>% # summarize(min = min(BCRCL, na.rm = T), # median = median(BCRCL, na.rm = T), # max = max(BCRCL, na.rm = T))
Compare the EGFR output to:
| Category | Definition | | ---------- | ------------------------------------------------------| | Normal | eGFR (mL/min/1.73 m2) ≥90; or CRCL (mL/min) ≥90 | | Mild | eGFR (mL/min/1.73 m2) 60-89; or CRCL (mL/min) 60-89 | | Moderate | eGFR (mL/min/1.73 m2) 30-59; or CRCL (mL/min) 30-59 | | Severe | eGFR (mL/min/1.73 m2) 15-29; or CRCL (mL/min) 15-29 | | End Stage | eGFR (mL/min/1.73 m2) <15 /on dialysis; or CRCL (mL/min) <15 /on dialysis |
Is the HEPATIC impairment correct compared to BI/AST (if NCI criteria)?
baseline_rawdata %>% group_by(BHEPATIC) %>% filter(BBI !=-99) %>% summarize(minBI = min(BBI, na.rm = T), medianBI = median(BBI, na.rm = T), maxBI = max(BBI, na.rm = T), minAST = min(BAST, na.rm = T), medianAST = median(BAST, na.rm = T), maxAST = max(BAST, na.rm = T))
Compare the output to e.g. NCI criteria (or modify to what was used to define categories):
| Category | Definition | | ---------- | -------------------------------------------------------| | Normal | Bilirubin ≤ ULN, and AST ≤ ULN | | Mild | Bilirubin ≤ ULN, and AST > ULN or Bilirubin >1.0*ULN to ≤1.5*ULN, and AST of any value | | Moderate | Bilirubin > 1.5ULN to ≤ 3ULN, and AST of any value | | Severe | Bilirubin >3.0*ULN, and AST of any value |
(The upper limit of normal (ULN) for bilirubin and AST are defined within the SDTM of each study.)
Spot check of derived variables:
# Selection of random ids to re-calculate for ids <- sample(unique(baseline_rawdata$OSID), 10) temp <- baseline_rawdata[baseline_rawdata$OSID %in% ids,]
Baseline Creatinine clearance
# Re-calculate and compare CRCL <- temp %>% mutate(calcCRCL = (140-AGE) * BWT / (BSCR*0.8136), # males calcCRCL = ifelse(SEXM==0, 0.85*calcCRCL, calcCRCL), # females calcCRCL = round(calcCRCL, digits = 2), orignialCRCL = round(BCRCL, digits = 2)) %>% select(calcCRCL, orignialCRCL) CRCL
Baseline EGFR
# Re-calculate and compare BEGFR <- vector("numeric", length(ids)) for(i in 1:length(ids)){ # not black male BEGFR[i] <- 175 * temp$BSCR[i]^(-1.154) * temp$AGE[i]^(-0.203) # if female if(temp$SEXM[i]==0){ BEGFR[i] <- BEGFR[i] * 0.742 } # if black if(temp$RACE[i]==2){ BEGFR[i] <- BEGFR[i] * 1.212 } } round(BEGFR, digits = 2) round(temp$BEGFR, digits = 2)
Baseline body surface area
round(0.007184 * (temp$BWT^0.425) * (temp$BHT^0.725), digits=2) round(temp$BBSA, digits = 2)
Baseline BMI
round(temp$BWT / ((temp$BHT/100)^2), digits=2) round(temp$BBMI, digits = 2)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.