R/match_obs_to_sim.R

Defines functions match_obs_to_sim

Documented in match_obs_to_sim

#' Match observed concentration-time data to the correct simulated
#' concentration-time data
#'
#' \code{match_obs_to_sim} will match observed concentration-time data to the
#' correct simulated concentration-time data and figure out what the dose number
#' should be at each time as well as figure out what the correct compound names
#' should be since that's not included in observed-data Excel files for the
#' Simulator. It will then add the observed data to your existing
#' concentration-time data.frame.
#'
#' @param ct_dataframe a data.frame of concentration-time data generated by
#'   running either \code{\link{extractConcTime}} or
#'   \code{\link{extractConcTime_mult}}. Not quoted.
#' @param obs_dataframe a data.frame of observed concentration-time data
#'   generated by running either \code{\link{extractConcObsTime}} or
#'   \code{\link{extractObsConcTime_mult}}. Not quoted.
#' @param obs_to_sim_assignment optionally specify which observed files should
#'   be compared to which simulator files. If left as NA, this will assume that
#'   all obsereved data should match all simulated data. To specify, use a
#'   data.frame like this:
#'   \code{obs_to_sim_assignment = data.frame(ObsFile = c("obs data 1.xlsx",
#'   "obs data 2.xlsx"), File = c("mdz-5mg-qd.xlsx", "mdz-5mg-qd-cancer.xlsx"))}
#'   or use a named character vector: \code{obs_to_sim_assignment =
#'   c("obs data 1.xlsx" = "mdz-5mg-qd.xlsx", "obs data 2.xlsx" =
#'   "mdz-5mg-qd-cancer.xlsx")}
#' @param existing_exp_details the output from running
#'   \code{\link{extractExpDetails}} or \code{\link{extractExpDetails_mult}} on
#'   the same simulations that were used for making \code{ct_dataframe}. If you
#'   don't supply this, we'll run  \code{\link{extractExpDetails_mult}} behind
#'   the scenes, which will increase the time this takes to run. If you would
#'   like to have this calculate dose numbers on data where you wouldn't have
#'   simulation experimental details (example: you've only got observed data),
#'   then supply a single-row data.frame with the following columns: \code{File}
#'   (character, just set this to "all" or some other placeholder text),
#'   \code{DoseInt_sub} (numeric), \code{StartHr_sub} (numeric; probably 0),
#'   \code{NumDoses_sub} (numeric), and \code{Regimen_sub} (character;
#'   presumably "Multiple Dose" here). If you want the dose number for other
#'   compound IDs, then replace "_sub" with, e.g., "_inhib". Please run
#'   \code{view(ExpDetailDefinitions)} to see examples of acceptable suffixes.
#'
#' @return returns a data.frame where the information from obs_dataframe has
#'   dose numbers, compound names, inhibitor names, etc. to match ct_dataframe,
#'   and obs_dataframe has been added to ct_dataframe.
#' @export
#'
#' @examples
#' # None yet
#' 

match_obs_to_sim <- function(ct_dataframe, 
                             obs_dataframe, 
                             obs_to_sim_assignment = NA, 
                             existing_exp_details = NA){
   
   # NB: Does NOT currently check that dosing data match between sim and obs
   # files. Could add that in the future as a safety check.
   
   # error catching -------------------------------------------------------
   # Check whether tidyverse is loaded
   if("package:tidyverse" %in% search() == FALSE){
      stop("The SimcypConsultancy R package requires the package tidyverse to be loaded, and it doesn't appear to be loaded yet. Please run\nlibrary(tidyverse)\n    ...and then try again.", 
           call. = FALSE)
   }
   
   if(nrow(ct_dataframe) == 0){
      warning("Please check your input. The data.frame you supplied for ct_dataframe doesn't have any rows.", 
              call. = FALSE)
      return(ct_dataframe)
   }
   
   if(nrow(obs_dataframe) == 0){
      warning("Please check your input. The data.frame you supplied for obs_dataframe doesn't have any rows.", 
              call. = FALSE)
      return(ct_dataframe)
   }
   
   # Adjusting for slight differences in column names. 
   names(obs_dataframe)[tolower(names(obs_dataframe)) == "file"] <- "File"
   names(obs_dataframe)[tolower(names(obs_dataframe)) == "obsfile"] <- "ObsFile"
   
   if("simfile" %in% tolower(names(obs_dataframe)) &
      "file" %in% tolower(names(obs_dataframe)) == FALSE){
      names(obs_dataframe)[tolower(names(obs_dataframe)) == "simfile"] <- "File"
   }
   
   if(all(c("File", "CompoundID", "Compound", "Conc", "Conc_units", 
            "Time", "Time_units") %in% names(ct_dataframe)) == FALSE){
      stop("It doesn't appear that you have supplied output from extractConcTime or extractConcTime_mult as input for ct_dataframe. This function only works when ct_dataframe is output from those functions.", 
           call. = FALSE)
   }
   
   if(all(c("ObsFile", "CompoundID", "Conc", "Conc_units", 
            "Time", "Time_units") %in% names(obs_dataframe)) == FALSE){
      stop("It doesn't appear that you have supplied output from extractObsConcTime or extractObsConcTime_mult as input for ct_dataframe. This function only works when obs_dataframe is output from those functions.", 
           call. = FALSE)
   }
   
   if("logical" %in% class(obs_to_sim_assignment)){
      if(all(is.na(obs_to_sim_assignment)) == FALSE){
         stop("You have supplied a value for obs_to_sim_assigment that isn't a character vector, a data.frame, or just the value NA, which are the only acceptable options for obs_to_sim_assigment. Please check your input and try again.", 
              call. = FALSE)
      }
   } else if(any(c("data.frame", "character") %in% 
                 class(obs_to_sim_assignment)) == FALSE){
      stop("You have supplied a value for obs_to_sim_assigment that isn't a character vector, a data.frame, or just the value NA, which are the only acceptable options for obs_to_sim_assigment. Please check your input and try again.", 
           call. = FALSE)
   }
   
   # Main body of function -------------------------------------------------
   
   if(is.logical(existing_exp_details)){
      existing_exp_details <- 
         extractExpDetails_mult(sim_data_files = unique(ct_dataframe$File))
   }
   
   existing_exp_details <- harmonize_details(existing_exp_details)
   
   # Summary of approach: Use expand_grid on obs_dataframe to include all
   # possible appropriate File values and then bind obs_dataframe to
   # ct_dataframe.
   
   # Making obs_to_sim_assignment a data.frame regardless of what it was
   # originally. We'll then split it into a list of data.frames and loop
   # through to assign.
   if("character" %in% class(obs_to_sim_assignment)){
      if(all(obs_to_sim_assignment == "use existing_exp_details")){
         ObsAssign <- existing_exp_details$MainDetails %>% 
            select(File, ObsOverlayFile) %>% 
            rename(ObsFile = ObsOverlayFile)
      } else {
         if(is.null(names(obs_to_sim_assignment))){
            stop(wrapn("You appear to have supplied a character vector for the argument 'obs_to_sim_assignment', but you haven't supplied names for that vector, so we don't know how to match your simulated and observed files. Please check the help file for the argument 'obs_to_sim_assignment' and try again."), 
                 call. = FALSE)
         }
         ObsAssign <- data.frame(ObsFile = names(obs_to_sim_assignment), 
                                 File = obs_to_sim_assignment)
      }
   } else if("logical" %in% class(obs_to_sim_assignment)){
      # This is when they have left obs_to_sim_assignment as NA and want all the
      # obs data to match all the sim.
      ObsAssign <- expand_grid(File = unique(ct_dataframe$File), 
                               ObsFile = unique(obs_dataframe$ObsFile))
   } else {
      # This is when they have supplied a data.frame.
      ObsAssign <- obs_to_sim_assignment
   } 
   
   # Adding ".xml" to any files that don't have it already since people often
   # omit the extension or use the wrong one. Checking that the file exists.
   ObsAssign <- ObsAssign %>% 
      mutate(Extension = str_extract(basename(ObsFile), pattern = "\\.(?<=\\.)[^\\.]+$"), 
             ObsFile_xml = case_when(is.na(Extension) ~ paste0(ObsFile, ".xml"), 
                                     Extension == ".xml" ~ ObsFile, 
                                     Extension == ".xlsx" ~ sub("xlsx$", "xml", ObsFile)), 
             ObsFile_xlsx = case_when(is.na(Extension) ~ paste0(ObsFile, ".xlsx"), 
                                      Extension == ".xlsx" ~ ObsFile, 
                                      Extension == ".xml" ~ sub("xml$", "xlsx", ObsFile)), 
             ObsFile_xml_exists = file.exists(ObsFile_xml), 
             ObsFile_xlsx_exists = file.exists(ObsFile_xlsx), 
             
             # Preferentially using the XML file since people don't always save
             # the xlsx version of the file
             ObsFileToUse = case_when(ObsFile_xml_exists == TRUE ~ ObsFile_xml, 
                                      ObsFile_xml_exists == FALSE & 
                                         ObsFile_xlsx_exists == TRUE ~ ObsFile_xlsx, 
                                      .default = ObsFile_xml))
   
   # Matching the file extension in obs_dataframe or this will have trouble
   obs_dataframe <- obs_dataframe %>% 
      left_join(ObsAssign %>% select(ObsFile, ObsFileToUse), 
                by = "ObsFile") %>% 
      select(-ObsFile) %>% 
      rename(ObsFile = ObsFileToUse)
   
   ObsAssign <- ObsAssign %>% 
      select(-ObsFile) %>% 
      rename(ObsFile = ObsFileToUse)
   
   # Making sure we have all the info we need.
   if(all(ObsAssign$File[complete.cases(ObsAssign$File)] %in%
          existing_exp_details$MainDetails$File) == FALSE){
      suppressMessages(
         existing_exp_details <- 
            extractExpDetails_mult(sim_data_files = ObsAssign$File, 
                                   existing_exp_details = existing_exp_details))
   }
   
   # This will cause problems if any of the sets of data have 0 rows. Checking
   # for that. 
   MissingFile <- setdiff(unique(ObsAssign$File), unique(ct_dataframe$File))
   if(length(MissingFile) > 0){
      warning(paste0("The file(s) ", 
                     str_comma(paste0("`", MissingFile, "`")), 
                     " is/are included in `obs_to_sim_assignment` but not present in the simulated concentration-time data. It/They cannot be matched to any observed data.\n"), 
              call. = FALSE)
      ObsAssign <- ObsAssign %>% filter(File %in% unique(ct_dataframe$File))
   }
   
   MissingObsFile <- setdiff(unique(ObsAssign$ObsFile), 
                             unique(obs_dataframe$ObsFile))
   if(length(MissingObsFile) > 0){
      warning(paste0("The observed data file(s) ", 
                     str_comma(paste0("`", MissingObsFile, "`")), 
                     " is/are included in `obs_to_sim_assignment` but not present in the observed concentration-time data. It/They cannot be matched to any simulated data.\n"), 
              call. = FALSE)
      ObsAssign <- ObsAssign %>% filter(ObsFile %in% unique(obs_dataframe$ObsFile))
   }
   
   # NB: Splitting by ObsFile b/c you could have more than one obs file per
   # sim file, e.g., when there are observed data for more than 3 compound
   # IDs, the maximum allowed in the PE data entry template. You could also
   # have more than one File per ObsFile, though, so need to split by that as
   # well.
   obs_dataframe <- split(obs_dataframe, f = obs_dataframe$ObsFile) # NOTE: SPLIT BY OBSFILE
   ct_dataframe <- split(ct_dataframe, f = ct_dataframe$File) # NOTE: SPLIT BY FILE
   ObsAssign <- split(ObsAssign, f = ObsAssign$ObsFile)
   
   for(j in names(ObsAssign)){
      # j = ObsFile
      
      ObsAssign[[j]] <- split(ObsAssign[[j]], 
                              f = ObsAssign[[j]]$File)
      
      ObsData_j <- list()
      
      for(k in names(ObsAssign[[j]])){
         # k = File
         
         # Checking for whether there was a custom-dosing regimen b/c that
         # messes up EVERYTHING. If there was, Dose_x needs to be NA. 
         CustomDosingCheck <- existing_exp_details$MainDetails %>% 
            filter(File == k) %>% 
            select(any_of(c("Dose_sub", "Dose_inhib", "Dose_inhib2"))) %>% 
            pivot_longer(cols = everything(), 
                         names_to = "CompoundID", values_to = "Value") %>% 
            filter(Value == "custom dosing")
         
         # Setting observed data dose to NA if it's custom dosing.
         for(cmpd in CustomDosingCheck$CompoundID){
            obs_dataframe[[j]][, cmpd] <- NA
         }
         
         # NB: This does not include matching by dose, which may change over the
         # course of the study or simulation. This is assuming that the observed
         # file in question definitely should match the simulated data in
         # question, so that should NOT be a problem. 
         ObsData_j[[k]] <- obs_dataframe[[j]] %>% 
            select(-Compound) %>% 
            mutate(File = k) %>% 
            left_join(ct_dataframe[[k]] %>% 
                         select(any_of(c("File", "CompoundID", "Compound"))) %>% 
                         unique(), 
                      relationship = "many-to-many", 
                      by = c("CompoundID", "File"))
         
         # Observed files often only include the 1st dose, even if it was
         # a multiple-dose simulation, so we can't trust the obs file to
         # include all the dose numbers at all the right times and instead
         # should get the dose number and dosing interval from the sim
         # file existing_exp_details.
         Deets <- existing_exp_details
         Deets$MainDetails <- Deets$MainDetails %>% filter(File == k)
         
         ObsData_j[[k]] <- calc_dosenumber(ct_dataframe = ObsData_j[[k]], 
                                           existing_exp_details = Deets)
         
         # Matching units. Need to do this one compound at a time. 
         ObsData_j[[k]] <- split(ObsData_j[[k]], f = ObsData_j[[k]]$CompoundID)
         ct_dataframe[[k]] <- split(ct_dataframe[[k]], f = ct_dataframe[[k]]$CompoundID)
         
         for(cmpd in intersect(names(ObsData_j[[k]]), 
                               names(ct_dataframe[[k]]))){
            ObsData_j[[k]][[cmpd]] <- 
               convert_units(DF_to_convert = ObsData_j[[k]][[cmpd]], 
                             DF_with_good_units = ct_dataframe[[k]][[cmpd]], 
                             MW = as.numeric(
                                Deets$MainDetails[
                                   paste0("MW", AllCompounds$Suffix[AllCompounds$CompoundID == cmpd])]))
            
         }
         
         ObsData_j[[k]] <- bind_rows(ObsData_j[[k]])
         ct_dataframe[[k]] <- bind_rows(ct_dataframe[[k]])
         
         # Adding inhibitor name as needed
         MyPerpetrator <- determine_myperpetrator(Deets,
                                                  prettify_compound_names = FALSE)
         
         ObsData_j[[k]]$Inhibitor[ObsData_j[[k]]$Inhibitor != "none"] <- 
            MyPerpetrator
         
         rm(Deets)
      }
      
      obs_dataframe[[j]] <- bind_rows(ObsData_j)
   }
   
   ct_dataframe <- bind_rows(ct_dataframe) %>% 
      bind_rows(bind_rows(obs_dataframe) %>% unique()) %>% 
      filter(complete.cases(CompoundID)) %>% unique()
   
   return(ct_dataframe)
   
}
shirewoman2/Consultancy documentation built on Feb. 18, 2025, 10 p.m.