In AstraZeneca/pmworkbench: Repository of templates for pharmacometric projects

###################################################
# s07_nm_datasets.Rmd
# Description: Dataset preparation for NONMEM
# Dependencies: s01_dataset_preparation.R / s01.RData 
###################################################

# Knit from the project top directory: the root is the folder that contains
# "OpenProject.Rproj", so every path after this chunk is relative to it.
library(rprojroot)
knitr::opts_knit$set(
  root.dir = find_root(has_file("OpenProject.Rproj"))
)
# Do not echo chunk source code in the rendered report
knitr::opts_chunk$set(echo = FALSE)

# Note: R markdown opens a new R session, your global environment is not available.

This script uses the data.frame "rawdata". That is, all rows with C=="C" are included (as required by regulatory agencies). These rows are excluded in the NONMEM scripts instead.

# -----------------------------------------------
# Prepare environment
# -----------------------------------------------
# Run the project setup script, then restore the workspace saved by s01
# (presumably the source of the "rawdata" data.frame used below - verify).
source(file.path("./Scripts", "Setup", "setup01_rEnvironment.R"))
load(file.path("./Scripts", "s01.RData"))

Are the datasets being written to file?

# Show the value of the print_csv report parameter in the rendered output
params[["print_csv"]]

Selection of columns and re-naming for NONMEM

# List numeric columns to include for nonmem dataset.
# - TAFD (not TIME) is selected because it is used for the negative-time check
#   and renamed to TIME below.
# - The baseline covariates handled in the imputation step (BRENAL, BHT, BBMI,
#   BEGFR, BSCR) have to be selected here, otherwise they do not exist in
#   nm_data when they are imputed.
nm_columns <- c("C", "NMSEQSID", "TAFD", "TAPD", "AMT", "DV", "MDV", "EVID",
                "ADDL", "II", "CMT", "BLQ", "OCC", "STUDYID", "DOSE", "DAY",
                "AGE", "BCRCL", "BEGFR", "BSCR", "BRENAL",
                "BWT", "BHT", "BBMI", "SEXM", "RACE")

# use rawdata: contains all "C" and numeric versions of columns
# all_of() makes it explicit that nm_columns is an external character vector
# and fails if any listed column is missing from rawdata.
nm_data <- rawdata %>%
  select(all_of(nm_columns))

# Comment out rows with negative time (pre-first dose samples).
# Rows with a missing TAFD keep their current C value instead of getting NA.
nm_data <- nm_data %>%
  mutate(C = ifelse(!is.na(TAFD) & TAFD < 0, "C", C))


# rename NMSEQSID to ID and TAFD to TIME
# (done in a single step: renaming NMSEQSID twice fails because the column
# no longer exists after the first rename)
nm_data <- nm_data %>%
  rename(ID = NMSEQSID,
         TIME = TAFD)

# use ln transformed data?
# nm_data <- nm_data %>% 
#   rename(NORMDV = DV, 
#          DV = LNDV) 


## Add a RATE column with values -2 at dosing records 
# for estimation of zero order input (for models with zero order input)
# Dosing records are identified as rows with a non-missing AMT.
nm_data <- nm_data %>%
  mutate(RATE = ifelse(!is.na(AMT), -2, NA))

Imputation of missing covariates

## Categorical covariates: model as separate groups and potentially merge with other
# Missing BRENAL is coded as its own category (-99)
nm_data <- nm_data %>%
  mutate(BRENAL = ifelse(is.na(BRENAL), -99, BRENAL))



## Continous covariates: impute based on median (stratified by sex if appropriate)
# One row per ID (first occurrence) so that every subject contributes once
# to the medians
impute_baseline <- nm_data %>%
  filter(!duplicated(ID))

# mutate() evaluates its arguments in order, so later expressions see the
# columns as already modified by the earlier ones.
nm_data <- nm_data %>%
  mutate(
    # Height
    BHT = ifelse(is.na(BHT) & SEXM == 0,
                 median(impute_baseline$BHT[impute_baseline$SEXM == 0], na.rm = TRUE),
                 BHT),
    BHT = ifelse(is.na(BHT) & SEXM == 1,
                 median(impute_baseline$BHT[impute_baseline$SEXM == 1], na.rm = TRUE),
                 BHT),
    # Weight
    BWT = ifelse(is.na(BWT) & SEXM == 0,
                 median(impute_baseline$BWT[impute_baseline$SEXM == 0], na.rm = TRUE),
                 BWT),
    BWT = ifelse(is.na(BWT) & SEXM == 1,
                 median(impute_baseline$BWT[impute_baseline$SEXM == 1], na.rm = TRUE),
                 BWT),
    # Calculate BMI based on imputed (and reported) values.
    # BWT and BHT refer to the columns imputed just above, row by row.
    # (Indexing nm_data$BWT / nm_data$BHT here would read the values from
    # before the imputation and, being a shorter subset vector, would be
    # recycled by ifelse() onto the wrong rows.)
    # The /100 presumably converts height from cm to m - verify units.
    BBMI = ifelse(is.na(BBMI),
                  BWT / (BHT / 100)^2,
                  BBMI),
    # # Calculate BSA based on imputed (and reported) values (Dubois&Dubois)
    # BBSA = ifelse(is.na(BBSA),
    #               0.007184 * (BHT^0.725) * (BWT^0.425),
    #               BBSA),
    # # CrCL
    # BCRCL = ifelse(is.na(BCRCL) & SEXM == 0,
    #                median(impute_baseline$BCRCL[impute_baseline$SEXM == 0], na.rm = TRUE),
    #                BCRCL),
    # BCRCL = ifelse(is.na(BCRCL) & SEXM == 1,
    #                median(impute_baseline$BCRCL[impute_baseline$SEXM == 1], na.rm = TRUE),
    #                BCRCL),
    # eGFR
    BEGFR = ifelse(is.na(BEGFR) & SEXM == 0,
                   median(impute_baseline$BEGFR[impute_baseline$SEXM == 0], na.rm = TRUE),
                   BEGFR),
    BEGFR = ifelse(is.na(BEGFR) & SEXM == 1,
                   median(impute_baseline$BEGFR[impute_baseline$SEXM == 1], na.rm = TRUE),
                   BEGFR),
    # SCR (not stratified by sex)
    BSCR = ifelse(is.na(BSCR),
                  median(impute_baseline$BSCR, na.rm = TRUE),
                  BSCR)
  )

Output and naming of dataset

The dataset names are saved in variables that are used to update the nonmem control-files later on.

No. 1: Original with imputed covariates

# File name "<dv_name>_nm_<delivery_date>.csv", placed in the derived data directory
nm_data_filename <- paste0(dv_name, "_nm_", delivery_date, ".csv")
out_file <- file.path(directories[["derived_data_dir"]], nm_data_filename)

# Only write the csv when requested through the report parameter;
# missing values are written as "."
if (params$print_csv) {
  write.csv(
    nm_data,
    file = out_file,
    row.names = FALSE,
    quote = FALSE,
    na = "."
  )
}

No. 2: added duplicated dose records for estimation of sequential or parallel absorption

nm_data_par <- nm_data

# Copy of every dose record (EVID == 1), redirected to CMT 2; the copies keep
# their RATE value and are used for the zero-order input into central
doses <- mutate(filter(nm_data_par, EVID == 1), CMT = 2)

# Blank RATE on the original dose records (the ones for which we use Ka),
# append the duplicated doses and sort by ID and time
nm_data_par <- nm_data_par %>%
  mutate(RATE = ifelse(EVID == 1, NA, RATE)) %>%
  rbind(doses) %>%
  arrange(ID, TIME)

# File name "<dv_name>_nm_<delivery_date>_par.csv" in the derived data directory
nm_data_par_filename <- paste0(dv_name, "_nm_", delivery_date, "_par", ".csv")
out_file <- file.path(directories[["derived_data_dir"]], nm_data_par_filename)

# Write dataset only when requested; missing values are written as "."
if (params$print_csv) {
  write.csv(
    nm_data_par,
    file = out_file,
    row.names = FALSE,
    quote = FALSE,
    na = "."
  )
}

No. 3: added dummy rows for multiple dose transit compartment model

## Add dummy records shortly before each dose in order
## to avoid negative TAD for transit compartment model
nm_data_transit <- nm_data

# Use dose records and modify to merge right before next dose
transit_dummy <- nm_data_transit %>%
  # not needed for first dose
  filter(!is.na(AMT) & TIME != 0) %>%
  # change evid to 2 and move the time to 10 min before the dose
  # (10/60 assumes TIME is in hours - verify).
  # The rows are copies of dose records, so every dose-related item
  # (AMT, RATE, ADDL, II) and TAPD is blanked: they must not carry dosing
  # information on a non-dose (EVID = 2) record.
  # only baseline values for covariates so does not need to change these
  mutate(EVID = 2,
         AMT = NA,
         RATE = NA,
         ADDL = NA,
         II = NA,
         TIME = TIME - (10 / 60),
         TAPD = NA,
         CMT = 2)

# Bind with the original data and sort by ID and time
nm_data_transit <- rbind(nm_data_transit, transit_dummy) %>%
  arrange(ID, TIME)
rownames(nm_data_transit) <- NULL

# Dataset name and path to location
nm_data_transit_filename <- paste0(dv_name, "_nm_", delivery_date, "_transit", ".csv")
out_file <- file.path(directories[["derived_data_dir"]], nm_data_transit_filename)

# Write dataset only when requested; missing values are written as "."
if (params$print_csv) {
  write.csv(nm_data_transit, file = out_file,
            row.names = FALSE, quote = FALSE, na = ".")
}

List of prepared datasets

# Overview table: one row per prepared dataset file with a description of the
# modifications that were made to it.
nm_datasets <- data.frame(
  filename = c(
    nm_data_filename,
    nm_data_par_filename,
    nm_data_transit_filename
  ),
  Description = c(
    "Dataset prepared for NONMEM use: (i) only numerical columns selected, (ii) missing covariates imputed, (iii) NMSEQSID renamed to ID and TAFD renamed to TIME, (iv) RATE column added to estimate duration of zero-order absorption",
    "Duplicated dose record added for estimation of sequential or parallel zero- and first order absorption",
    "Dummy variable added at 10 min before each dose in order to avoid negative time after dose results for transit compartment model"
  )
)
# Render the overview in the report
kable(nm_datasets)

# Write the overview next to the datasets when csv output is requested
# (quoted, since the descriptions contain commas)
if (params$print_csv) {
  write.csv(
    nm_datasets,
    file = file.path(directories[["derived_data_dir"]], "prepared_datasets.csv"),
    row.names = FALSE,
    quote = TRUE
  )
}

# Save environment to be used in development scripts.
# The rmarkdown params object is removed first, or it will cause issues in
# the next script.
rm(list = "params")

# According to the original author this call reports an error, but all the
# output is still generated as it should be.
# NOTE(review): the file header names this script s07 while the image is
# saved as s06.RData - confirm which name downstream scripts load.
save.image(file = file.path(directories[["scripts_dir"]], "s06.RData"))