R/make_simulation_directory.R

Defines functions make_simulation_directory

Documented in make_simulation_directory

#' Make a directory of simulations indicating which simulation files are present
#' in a given folder and which XML observed overlay files were associated with
#' those simulation files
#'
#' @description The function \code{make_simulation_directory} will create a
#'   data.frame of simulations in a given project folder, the associated XML
#'   files when applicable (you'll have to provide that information with the
#'   argument \code{existing_exp_details} or else ask this function to read all
#'   the workspace files to find them), and, optionally, save that data.frame to
#'   an Excel file. It will also check whether the file names comply with USFDA
#'   and Consultancy Team standards and check whether the same simulation is
#'   saved in more than one place.
#'
#'   This is wicked fast.*
#'
#'   *Caveat: Unless you search all the workspaces for XML observed overlay data files. Then it
#'   takes about 2 seconds per simulation.
#'
#' @param project_folder location of the project folder to search. Please note
#'   that R requires forward slashes ("/") in file paths rather than the back
#'   slashes Microsoft uses. If left as NA (default), we'll assume your current
#'   working directory is the top level of your project folders.
#' @param sim_data_files which files to include in the directory of simulations.
#'   This only applies when you don't supply anything for the argument
#'   \code{existing_exp_details}. Options are:
#'   \describe{
#'
#'   \item{"recursive" (default)}{all .xlsx, .db, or .wksz files in the current folder and any
#'   subfolders}
#'
#'   \item{NA}{all .xlsx, .db, or .wksz files in the current folder}
#'
#'   \item{a single text string such as "dev"}{include any files in the current
#'   folder or any folders below it that have that specific text}
#'
#'   \item{a character vector of specific file names, e.g., \code{sim_data_files =
#'   c("abc.xlsx", "def.xlsx")}}{only include the files listed. We'll figure
#'   out which folder they're in.}}
#'
#' @param existing_exp_details optionally supply the output from running
#'   \code{\link{extractExpDetails_mult}} to get only the simulation files
#'   included there in your simulation directory. If you supply something here,
#'   whatever you supply for \code{sim_data_files} will be ignored. This will
#'   also be used to figure out which XML files go with which simulations.
#' @param search_workspaces_for_obsfile TRUE or FALSE (default) for whether to
#'   search through any workspace files and check for possible XML overlay
#'   files. This runs considerably slower when set to TRUE and will take about
#'   an additional 2 seconds per simulation. If you have supplied something for
#'   'existing_exp_details', you don't need this and we'll ignore it if you set
#'   this to TRUE.
#' @param use_basename_for_obsfile TRUE or FALSE (default) for whether to remove
#'   the folder name from the observed overlay XML file name. By default, we'll
#'   list the XML file where it was located when the workspace was saved.
#'   However, if you set this to TRUE, we'll only list the file name without the
#'   path. Why would you want this? Say you've moved all your XML observed
#'   overlay files from their original locations when you ran the simulations
#'   into a new folder that you'll be sharing with the client at the end of the
#'   project. This way, you can show which simulation had which XML observed
#'   overlay file without the confusion of including the original path.
#' @param save_table optionally specify an Excel file name for saving your
#'   simulation directory. If you don't include the file extension ".xlsx",
#'   we'll add it.
#' @param overwrite Should we overwrite if your Excel file already exists and
#'   already has a tab named "Simulation directory"? Options are "yes" to always
#'   overwrite, "no" to never overwrite, or "ask" (default), which means that we
#'   will ask you whether to overwrite and give you a chance to supply a
#'   different file name if the one you supplied already exists.
#'
#' @return a data.frame of simulation files and their respective file paths
#' @export
#'
#' @examples
#' # none yet

make_simulation_directory <- function(project_folder = NA, 
                                      sim_data_files = "recursive", 
                                      existing_exp_details = NA, 
                                      search_workspaces_for_obsfile = FALSE, 
                                      use_basename_for_obsfile = FALSE, 
                                      save_table = NA, 
                                      overwrite = "ask"){
   
   # Error catching ----------------------------------------------------------
   
   # Check whether tidyverse is loaded
   if("package:tidyverse" %in% search() == FALSE){
      stop("The SimcypConsultancy R package requires the package tidyverse to be loaded, and it doesn't appear to be loaded yet. Please run\nlibrary(tidyverse)\n    ...and then try again.", 
           call. = FALSE)
   }
   
   if(length(project_folder) > 1){
      stop("You can only search one project folder at a time. Please check your input for `project_folder`.", 
           call. = FALSE)
   }
   
   if("logical" %in% class(existing_exp_details) == FALSE & 
      search_workspaces_for_obsfile == TRUE){
      warning(wrapn("You requested that we search through workspaces to look for XML observed data overlay file names, but you also supplied something for 'existing_exp_details', which should already have that infomation. We will save you some time and *not* search through your workspaces for the XML observed data overlay file name."), 
              call. = F)
      search_workspaces_for_obsfile <- FALSE
   }
   
   # subfuns -----------------------------------------------------------------
   # subfun for V23+ data extraction 
   
   # For some reason, you have to unzip the workspaces 1st if they're V23 or
   # later. Not sure what changed.
   unzip1st_fun <- function(workspace){
      R.utils::gunzip(workspace, destname = "TEMP.wks", remove = FALSE)
      workspace_xml <- XML::xmlTreeParse("TEMP.wks", useInternal = TRUE)
      file.remove("TEMP.wks")
      return(workspace_xml)
   }
   
   get_obs_file <- function(workspace){
      
      if(file.exists(workspace) == FALSE){
         return(NA)
      }
      
      workspace_xml <- tryCatch(XML::xmlTreeParse(workspace, useInternal = TRUE), 
                                error = unzip1st_fun(workspace))
      
      RootNode <- XML::xmlRoot(workspace_xml)
      
      UseObs <- as.logical(XML::xmlValue(RootNode[["GraphsData"]][["UseObservedData"]]))
      
      if(UseObs){
         Out <- XML::xmlValue(RootNode[["GraphsData"]][["ObservedDataPath"]])
      } else {
         Out <- NA
      }
      
      return(Out)
   }
   
   
   # Main body of function ---------------------------------------------------
   
   if(is.na(project_folder)){
      project_folder <- paste0(getwd(), "/")
   }
   
   # Making sure project folder ends with / so that we can paste the file to
   # that correctly.
   project_folder <- ifelse(str_detect(project_folder, "/$"), 
                            project_folder, 
                            paste0(project_folder, "/"))
   
   # Harmonizing input. This will be of class "NULL" if input for argument was
   # NA.
   existing_exp_details_orig <- existing_exp_details
   existing_exp_details <- harmonize_details(existing_exp_details)
   
   if(all(is.na(existing_exp_details_orig)) == FALSE){
      if("NULL" %in% class(existing_exp_details)){
         warning(wrapn("You supplied something for the argument 'existing_exp_details', but it's not in a format we expected. We'll look at all the files in the project folder and any subfolders and return those instead."), 
                 call. = FALSE)
         
         Directory <- tibble(
            PathFile = paste0(project_folder, 
                              list.files(path = project_folder, 
                                         pattern = "\\.xlsx|\\.wksz|\\.db", 
                                         recursive = TRUE))) 
         
      } else {
         
         NoFileExt <- sub("\\.xlsx$|\\.db$|\\.wksz$", "", 
                          basename(existing_exp_details$MainDetails$File))
         AllPossFiles <- paste0(rep(NoFileExt, each = 3), c(".xlsx", ".db", ".wksz"))
         
         Directory <- tibble(File = AllPossFiles) %>% 
            # adding path 
            left_join(tibble(
               PathFile = paste0(project_folder, 
                                 list.files(path = project_folder, 
                                            recursive = TRUE)), 
               File = basename(PathFile)), 
               by = "File")
      }
   } else if(length(sim_data_files) == 1){
      if(is.na(sim_data_files)){
         Directory <- tibble(
            PathFile = paste0(project_folder, 
                              list.files(path = project_folder, 
                                         pattern = "\\.xlsx|\\.wksz|\\.db", 
                                         recursive = TRUE)))
         
      } else if(tolower(sim_data_files) == "recursive"){
         Directory <- tibble(
            PathFile = paste0(project_folder, 
                              list.files(path = project_folder, 
                                         pattern = "\\.xlsx|\\.wksz|\\.db", 
                                         recursive = TRUE)))
         
      } else {
         # This is when they have supplied a character string to match
         Directory <- tibble(
            PathFile = paste0(project_folder, 
                              list.files(path = project_folder, 
                                         pattern = sim_data_files, 
                                         recursive = TRUE))) 
      }
      
   } else {
      # If length(sim_data_files) > 1, then they have supplied a character
      # vector of files. If they didn't include ".xlsx" at the end, add that.
      Directory <- 
         tibble(File = sim_data_files) %>% 
         mutate(File = case_when(str_detect(File, "\\.xlsx|\\.db|\\.wksz") == FALSE ~ 
                                    paste0(File, ".xlsx"), 
                                 .default = File), 
                File = basename(File)) %>% 
         # adding path 
         left_join(tibble(
            PathFile = paste0(project_folder, 
                              list.files(path = project_folder, 
                                         recursive = TRUE)), 
            File = basename(PathFile)), 
            by = "File")
   } 
   
   if("File" %in% names(Directory) == FALSE){Directory$File <- Directory$PathFile}
   
   # Removing temporary files and making sure that all File values are basename. 
   Directory <- Directory %>% 
      mutate(File = case_when(complete.cases(PathFile) ~ basename(PathFile), 
                              .default = File)) %>% 
      filter(is.na(PathFile) | 
                (complete.cases(PathFile) & !str_detect(PathFile, "^~"))) %>% 
      filter(str_detect(File, "xlsx$|wksz$|db$")) %>% 
      mutate(Folder = dirname(PathFile), 
             Folder = ifelse(Folder == ".", getwd(), Folder), 
             Folder = if_else(file.exists(PathFile), Folder, "FILE NOT FOUND"),  
             Folder = case_when(Folder == "FILE NOT FOUND" & nchar(PathFile) > 260 ~ 
                                   "FILE PATH TOO LONG",
                                .default = Folder), 
             Filename = sub("\\.xlsx|\\.db|\\.wksz", "", File),
             Filetype = str_extract(File, "xlsx$|wksz$|db$"))
   
   # Removing from consideration any files that were not included in
   # existing_exp_details, if that was supplied, or if sim_data_files was
   # "recursive" or NA if the folder is now "FILE NOT FOUND" and if the Filetype
   # is NOT xlsx. Removing these b/c we're the ones who added them: They are
   # just variations on the simulation file names that we're checking to see if
   # they exist, e.g., workspaces or database files.
   if((any(c("logical", "NULL") %in% class(existing_exp_details)) == FALSE) | 
      (length(sim_data_files) == 1 && 
       (is.na(sim_data_files) || sim_data_files == "recursive" | 
        !str_detect(sim_data_files, "[^\\.]|\\.xlsx")))){
      Directory <- Directory %>% 
         filter(!(Folder == "FILE NOT FOUND" & Filetype != "xlsx"))
   }
   
   if(length(Directory$Filename[Directory$Folder != "FILE NOT FOUND"]) == 0){
      # This will happen if they have supplied something for
      # existing_exp_details but those files aren't in the project folder.
      stop(wrapn(paste0("We can't find any of the files in 'existing_exp_details'. Are you in the main folder for this project or have you set the argument 'project_folder' to that folder? We need to know the project folder to figure out where your files are. Please set the project folder and try again.")), 
           call. = FALSE)
   }
   
   if(nrow(Directory) == 0){
      stop(wrapn("There are no files that match the pattern of text you supplied for 'sim_data_files', so we cannot make your simulation diratory."), 
           call. = FALSE)
   }
   
   # We don't need the simulation directory file name to show up in the
   # simulation directory b/c that's just way too meta.
   if(complete.cases(save_table)){
      Directory <- Directory %>% 
         filter(Filename != save_table)
   }
   
   Directory <- Directory %>% 
      group_by(Filename, Folder) %>% 
      summarize(Filetype = str_c(Filetype, collapse = ", ")) %>% 
      ungroup() %>% 
      mutate(`Table/Figure` = "", 
             Comments = "")
   
   # Checking for file name issues
   suppressWarnings(
      Directory <- Directory %>% 
         mutate(FileNameCheck = check_file_name(Filename))
   )
   
   BadFileNames <- Directory %>% 
      filter(!FileNameCheck == "File name meets naming standards.")
   
   if(nrow(BadFileNames)> 0){
      BadFileNames <- BadFileNames %>% 
         mutate(Bad = paste0(Filename, ": ", FileNameCheck)) %>% 
         pull(Bad)
      
      warning(paste0("The following file names do not meet file-naming standards for the Simcyp Consultancy Team:\n", 
                     str_c(paste0("     ", BadFileNames), collapse = "\n"), "\n"),
              call. = FALSE)
   }
   
   Directory$DuplicateFileCheck <- duplicated(Directory$Filename, fromLast = T) |
      duplicated(Directory$Filename, fromLast = F)
   Directory <- Directory %>% 
      mutate(DuplicateFileCheck = case_when(DuplicateFileCheck == TRUE ~ "simulation file in multiple locations", 
                                            DuplicateFileCheck == FALSE ~ ""))
   
   # Making relative paths
   Directory$Folder <- R.utils::getRelativePath(Directory$Folder, 
                                                relativeTo = project_folder)
   
   # Formatting per FDA/Consultancy Team requirements
   Directory <- Directory %>% 
      mutate(Folder = case_when(Folder == "." ~ "", 
                                .default = Folder)) %>% 
      select(Filename, Filetype, Folder, `Table/Figure`, Comments, FileNameCheck, DuplicateFileCheck) %>% 
      unique()
   
   if("NULL" %in% class(existing_exp_details) == FALSE){
      
      if("ObsOverlayFile" %in% names(existing_exp_details$MainDetails)){
         if(any(complete.cases(existing_exp_details$MainDetails$ObsOverlayFile))){
            Directory <- Directory %>% 
               left_join(existing_exp_details$MainDetails %>% 
                            select(File, ObsOverlayFile) %>% 
                            mutate(Filename = sub("\\..*$", "", basename(File))) %>% 
                            select(-File), by = "Filename") %>% 
               mutate(ObsOverlayFile = R.utils::getRelativePath(ObsOverlayFile, 
                                                                relativeTo = project_folder), 
                      ObsOverlayFile = case_when(
                         str_detect(ObsOverlayFile, "Working Directory/XMLs") ~ 
                            sub(".*/XMLs", "XMLs", ObsOverlayFile),
                         .default = ObsOverlayFile), 
                      XMLFileNameCheck = check_file_name(ObsOverlayFile), 
                      XMLFileNameCheck = case_when(
                         XMLFileNameCheck == "File name meets naming standards." ~ "", 
                         is.na(XMLFileNameCheck) ~ "", 
                         .default = sub("File name", "XML file name", XMLFileNameCheck))) %>% 
               rename("XML file used" = ObsOverlayFile)
            
            ObsOverlayKnown <- TRUE
         } else {
            ObsOverlayKnown <- FALSE
            Directory$XMLFileNameCheck <- ""
         }
         
      } else {
         # This is when they have existing_exp_details but they must not have
         # had the workspaces available when they ran extractExpDetails_mult
         # b/c there is no info on any observed overlay files. Warning in
         # that case.
         warning(wrapn("You have supplied something for the argument 'existing_exp_details', but it does not include any information about the observed overlay files that were used with these simulations. To get that information, when you run 'extractExpDetails_mult', your Excel simulation results files must be in the same folder as the workspaces because we need both to figure out which XML file went with which simulation."), 
                 call. = FALSE)
         
         ObsOverlayKnown <- FALSE
         Directory$XMLFileNameCheck <- ""
      }
   } else {
      ObsOverlayKnown <- FALSE
      Directory$XMLFileNameCheck <- ""
   }
   
   if(search_workspaces_for_obsfile){
      
      Directory$ObsOverlayFile <- as.character(NA)
      
      for(ff in 1:nrow(Directory)){
         
         if(Directory$Folder[ff] %in% c("check file name - no folder found", 
                                        "FILE PATH TOO LONG", 
                                        "FILE NOT FOUND")){
            next
         }
         
         Directory$ObsOverlayFile[ff] <- 
            get_obs_file(workspace = 
                            paste0(
                               case_when(Directory$Folder[ff] == "" ~ "", 
                                         .default = paste0(Directory$Folder[ff], "/")),
                               Directory$Filename[ff], ".wksz"))
      }
      
      Directory$ObsOverlayFile <- gsub("\\\\", "/", Directory$ObsOverlayFile)
      Directory$ObsOverlayFile <- sub("C:/Users.*Certara/Simcyp PBPKConsult BMG BMG43A - Model(l)?ing Working Directory/", 
                                      "", Directory$ObsOverlayFile)
      
      Directory <- Directory %>% 
         mutate(ObsOverlayFile = R.utils::getRelativePath(ObsOverlayFile, 
                                                          relativeTo = project_folder), 
                ObsOverlayFile = case_when(
                   str_detect(ObsOverlayFile, "Working Directory/XMLs") ~ 
                      sub(".*/XMLs", "XMLs", ObsOverlayFile),
                   .default = ObsOverlayFile), 
                XMLFileNameCheck = check_file_name(ObsOverlayFile), 
                XMLFileNameCheck = case_when(
                   XMLFileNameCheck == "File name meets naming standards." ~ "", 
                   is.na(XMLFileNameCheck) ~ "", 
                   .default = sub("File name", "XML file name", XMLFileNameCheck))) %>% 
         rename("XML file used" = ObsOverlayFile)
      
      ObsOverlayKnown <- TRUE
   }
   
   if(ObsOverlayKnown == FALSE){
      # Checking for XML files in both the project directory and in any folder
      # that includes "XML" below the folder ending in "Working Directory", e.g., 
      # for the project folder whose full path would be 
      
      # "C:/Users/myname/Certara/Simcyp PBPKConsult ABC1A - Modelling Working Directory/Modelling"
      
      # look in the folder here: 
      # "C:/Users/myname/Certara/Simcyp PBPKConsult ABC1A - Modelling Working Directory/XMLs"
      
      ProjectFullPath <- R.utils::getAbsolutePath(pathname = project_folder)
      PossibleXMLPath <- str_extract(ProjectFullPath, "^.*Working Directory/Model(l)?ing/")
      PossibleXMLPath <- sub("/Modelling|/Modeling", "", PossibleXMLPath)
      PossibleXMLPath <- list.dirs(path = PossibleXMLPath, recursive = FALSE)
      PossibleXMLPath <- PossibleXMLPath[str_detect(tolower(PossibleXMLPath), "xml")]
      
      XMLs <- list()
      for(path in c(project_folder, PossibleXMLPath)){
         XMLs[[path]] <- tibble(PathFile = list.files(path = path, 
                                                      pattern = "\\.xml$",
                                                      full.names = TRUE, 
                                                      recursive = TRUE))
      }
      
      XMLs <- bind_rows(XMLs) 
      
      if(nrow(XMLs) > 0){
         
         XMLs <- XMLs %>% 
            mutate(Filetype = "xml", 
                   PathFile = R.utils::getRelativePath(PathFile, relativeTo = project_folder), 
                   Filename = basename(PathFile), 
                   Filename = sub("\\.xml$", "", Filename), 
                   Folder = dirname(PathFile), 
                   Folder = ifelse(Folder == ".", "", Folder), 
                   # hacking this by adding the longest of the possible extensions
                   # b/c regulators need the character length to include the
                   # extension.
                   FileNameCheck = check_file_name(paste0(Filename, ".wksz"))) %>% 
            select(Filename, Folder, Filetype)
         
         Directory <- bind_rows(Directory, 
                                tibble(Filename = c(rep(NA, 5), 
                                                    "Possible XML files for this project that we found either in the the project folder or in the 'XMLs' folder associated with this project:")), 
                                XMLs)
         
      }
   }
   
   # Simplifying FileNameCheck column 
   Directory <- Directory %>% 
      mutate(FileNameCheck = case_when(
         FileNameCheck == "File name meets naming standards." ~ "", 
         .default = FileNameCheck))
   
   # Combining main and XML file name checks into 1 column
   Directory <- Directory %>% 
      mutate(FileNameCheck = case_when(
         FileNameCheck == "" & XMLFileNameCheck == "" ~ "", 
         FileNameCheck != "" & XMLFileNameCheck == "" ~ FileNameCheck, 
         FileNameCheck == "" & XMLFileNameCheck != "" ~ XMLFileNameCheck, 
         FileNameCheck != "" & XMLFileNameCheck != "" ~ paste(FileNameCheck, XMLFileNameCheck))) 
   
   if(all(Directory$FileNameCheck == "", na.rm = T)){
      Directory <- Directory %>% select(-FileNameCheck)
   }
   
   if(any(Directory$DuplicateFileCheck == 
          "simulation file in multiple locations", na.rm = T) == FALSE){
      Directory <- Directory %>% select(-DuplicateFileCheck)
   }
   
   # Setting column order 
   Directory <- Directory %>% 
      select(any_of(c("Filename", "Filetype", "Folder", 
                      "XML file used", "Table/Figure",
                      "Comments", "FileNameCheck", "DuplicateFileCheck"))) %>% 
      rename("File type" = Filetype, 
             "File name" = Filename)
   
   if(use_basename_for_obsfile == TRUE & "XML file used" %in% names(Directory)){
      
      Directory$`XML file used` <- basename(Directory$`XML file used`)
      
   }
   
   
   # Saving -----------------------------------------------------------------
   
   if(complete.cases(save_table)){
      
      Highlighting <- list()
      
      if("FileNameCheck" %in% names(Directory)){
         Highlighting[["FileNameCheck"]] <- 
            list("rows" = which(Directory$FileNameCheck != ""), 
                 "columns" = which(names(Directory) %in% c("File name", "FileNameCheck")))
      } 
      
      if(any(Directory$Folder %in% c("FILE NOT FOUND", 
                                     "FILE PATH TOO LONG"), na.rm = T)){
         Highlighting[["Folder"]] <- 
            list("rows" = which(Directory$Folder %in% c("FILE NOT FOUND", 
                                                        "FILE PATH TOO LONG")), 
                 "columns" = which(names(Directory) %in% c("File name", "Folder")))
      } 
      
      if("DuplicateFileCheck" %in% names(Directory)){
         Highlighting[["DuplicateFileCheck"]] <- 
            list("rows" = which(Directory$DuplicateFileCheck != ""), 
                 "columns" = which(names(Directory) %in% c("File name", "DuplicateFileCheck")))
         
         warning(paste0(wrapn("The following simulation files were found in multiple locations: "), 
                        str_c(Directory$`File name`[which(Directory$DuplicateFileCheck != "")], 
                              collapse = "\n")), 
                 call. = FALSE)
      } 
      
      Highlighting <- Highlighting[which(lapply(Highlighting, length) > 0)]
      
      BoldRow <- which(Directory$`File name` == "Possible XML files for this project that we found either in the the project folder or in the 'XMLs' folder associated with this project:")[1] # there shall be only one. 
      if(any(is.na((BoldRow)))){
         Bold <- NA
      } else {
         Bold <- list(list("rows" = BoldRow, 
                           "columns" = 1))
      }
      
      BoldRow <- ifelse(length(BoldRow) == 0, NA, BoldRow)
      
      ColWidths <- guess_col_widths(DF = Directory, wrap = FALSE)
      ColWidths[ColWidths > 85] <- 85
      
      save_table_to_Excel(table = Directory, 
                          save_table = save_table, 
                          overwrite = overwrite, 
                          output_tab_name = "Simulation directory", 
                          center_top_row = FALSE, 
                          highlight_cells = list("yellow" = Highlighting), 
                          bold_cells = Bold, 
                          column_widths = ColWidths, 
                          wrap_text = FALSE)
   }
   
   return(Directory)
   
}
shirewoman2/Consultancy documentation built on Feb. 18, 2025, 10 p.m.