RapidACi: Automated Rapid A-Ci curve correction for Li-Cor system

Documented in build_list

#' build_list function
#'
#' @description  Generates a list of the files present in a specified directory. By
#'   default, the function uses the MATCH timestamp generated by the Li-Cor system to pair
#'   an A-Ci measurement file to its corresponding 'empty chamber' file. If the Li-Cor MATCH 
#'   function was not used, or was forgotten, during measurements the option the closest
#'   empty chamber timestamp will be used for the correction.
#'
#' @param path_to_licor_files Directory path where all files are stored
#' @param sampleID_format Regex pattern that uniquely identifies the sample ID in
#'   filenames. For example (default), "[:upper:]{3}_[:digit:]{3}" will extract sample ID
#'   of the format ABC_123 from a filename like
#'   "2019-03-20_456_Logdata_ABC_123_Fast_KF.xlsx"
#' @param pattern_empty Regex pattern that must only match filenames for empty chamber
#'   files
#' @param pattern_rapidACi Regex pattern that must only match filenames for rapid A-Ci
#'   measurement files
#' @param pattern_standardACi Regex pattern that must only match filenames for standard
#'   A-Ci measurement files
#' @param pattern_dark Regex pattern that identifies measurements in dark chamber files
#' @param leafArea_df A dataframe containing at least a "sample_ID" column and a
#'   "leafArea_mm2" column (default = NULL). It's recommanded if the sample doesn't cover
#'   the whole area of the chamber in order to apply corrections.
#'   
#' @return The function returns a dataframe that includes the path to the Li-Cor
#'   files to use, the type of measurements, the starting time of the measure, the
#'   timestamps, and how the timestamp was acquired. It also includes leaf area if
#'   supplied.
#'
#' @export


build_list <- function(path_to_licor_files = "data",
                       sampleID_format     = "[:upper:]{3}_[:digit:]{3}",
                       pattern_empty       = "(mpty).*\\.xls",     
                       pattern_rapidACi    = "(fast).*\\.xls",
                       pattern_standardACi = "(slow).*\\.xls",
                       pattern_dark        = "(dark).*\\.xls",
                       leafArea_df         =  NULL) {

  x <- str_remove(path_to_licor_files, "/$")
 
  mpty <- file.path(x, list.files(x, pattern = pattern_empty, ignore.case = TRUE))
  fast <- file.path(x, list.files(x, pattern = pattern_rapidACi, ignore.case = TRUE))
  slow <- file.path(x, list.files(x, pattern = pattern_standardACi, ignore.case = TRUE))
  dark <- file.path(x, list.files(x, pattern = pattern_dark, ignore.case = TRUE))

  df <- 
    tibble(
      path = c(mpty, fast),
      sample_ID = ifelse(is.na(str_extract(path, sampleID_format)), "none",
                         str_extract(path, sampleID_format)),
      get_system_nfo(path)[2:3],
      matchvar = case_when(grepl("1.3", osv) ~ "Match_time",
                           grepl("1.4", osv) ~ "MchEvent_time",
                           TRUE ~ NA_character_),
      chamber = c(rep("EMPTY", length(mpty)), 
                  rep("FAST",  length(fast)))) %>% 
    group_by(osv) %>% 
    group_modify(~ mutate(.x, startpos = list(get_fromExcel(.x$path[1], return = "startpos", 
                                  variables = c("GasEx_TIME", .x$matchvar[1], "MchEvent_co2_t", 
                                                "MchEvent_co2_adj", "MchStatus_cf_co2_a"))))) %>%
    mutate(START_time = ifelse(grepl("6400", sys), NA, 
                               extr_values(path, unlist(lapply(startpos, "[[", 2)), 
                                                    unlist(lapply(startpos, "[[", 1)))),
           timestamp  = ifelse(grepl("6400", sys), NA, 
                               extr_values(path, unlist(lapply(startpos, "[[", 3)), 
                                                    unlist(lapply(startpos, "[[", 1)))),
           MchEvent_co2_t = ifelse(grepl("1.3", osv), NA, 
                                     extr_values(path, unlist(lapply(startpos, "[[", 4)), 
                                                 unlist(lapply(startpos, "[[", 1)))),
           MchEvent_co2_adj = ifelse(grepl("1.3", osv), NA, 
                               extr_values(path, unlist(lapply(startpos, "[[", 5)), 
                                                    unlist(lapply(startpos, "[[", 1)))),
           MchStatus_cf_co2_a = ifelse(grepl("1.3", osv), NA, 
                                     extr_values(path, unlist(lapply(startpos, "[[", 6)), 
                                                 unlist(lapply(startpos, "[[", 1))))) %>%
    select(-startpos)
  
  mpty <- dplyr::filter(df, chamber == "EMPTY")
  fast <- dplyr::filter(df, chamber == "FAST")
  
  # case when no timestamp logged in empty files
  for(i in 1:nrow(mpty)) {
    if(mpty$timestamp[i] == 0 & grepl("1.3", mpty$osv[i])) {
      mpty$timestamp[i] <- mpty$START_time[i]
      warning(paste(mpty$path[i], "doesn't have a logged timestamp. The time at the start of 
                    measurement will be used for matching measurement files."))
   # } else if(empty$timestamp[i] == 0 & grepl("1.4", empty$osv[i]) & empty$MchEvent_co2_t[i] != 0) {
    } else if(grepl("1.4", mpty$osv[i]) & mpty$MchEvent_co2_t[i] != 0) {
      mpty$timestamp[i] <- mpty$MchEvent_co2_t[i]
      message(paste0(mpty$path[i], ": logged match timestamp has been replaced by CO2 match\n"))
    #  empty$timestamp[i] <- get_fromExcel(empty$path[i], 
    #                                      variables = c("MchEvent_co2_adj", "MchStatus_cf_co2_a"), 
    #                                      return = "dataframe")[1,1]
      if(mpty$timestamp[i] == 0) {
        mpty$timestamp[i] <- mpty$START_time[i]
        warning(paste(mpty$path[i], "doesn't have either a match log timestamp or a CO2 match. The 
        time at the start of measurement will be used for matching measurement files."))
      }
    }
  }
  
  for(i in 1:nrow(fast)) {
    if(grepl("1.4", fast$osv[i]) & fast$MchEvent_co2_t[i] != 0) {
      fast$timestamp[i] <- fast$MchEvent_co2_t[i]
      message(paste0(fast$path[i], ": logged match timestamp has been replaced by CO2 match\n"))
    }
  }
  # case when timestamp in fast file but without corresponding empty
  fast_uniqts <- setdiff(unique(fast[, "timestamp"]), unique(mpty[, "timestamp"])) %>%
                 filter(timestamp >= 1)
  
  # if(dim(fast_uniqts)[1] > 0) {
  #     warning("Some measurement files still do not have a matching empty file. The script will 
  #             continue using the closest empty file produced")
  # } 
  for(ts in fast_uniqts$timestamp) fast[fast$timestamp == ts,]$timestamp <- 0

  # case when no timestamp in fast files
  for(i in 1:nrow(fast)) {
    if(fast$timestamp[i] == 0 | is.na(fast$timestamp[i])) {
       fast$timestamp[i] <- empty$timestamp[which.min(abs(empty$timestamp - fast$START_time[i]))]
       warning(paste(fast$path[i], "has no matching empty file. The script will continue using the 
                     closest empty file produced"))
    }
  }
  
  # all together
  df <- bind_rows(mpty, fast, dplyr::filter(df, chamber %in% c("DARK", "SLOW"))) %>%
        mutate(START_time = lubridate::as_datetime(START_time)) %>%
        arrange(timestamp)
  
  # If dataframe for leaf area is provided...
  if (is.null(leafArea_df)) { 
    df$leafArea_mm2 <- NA 
  } else {
    df <- left_join(df, dplyr::select(leafArea_df, sample_ID, "leafArea_mm2")) 
  }
  
  # If dark files are found, include Rd in list_files; multiple dark files ok, but not duplicated 
  # sample_ID names
  if(length(dark) > 0) {
    merged_dark <- vector("list", length = length(dark))
    LA <- dplyr::filter(ungroup(df), chamber != "EMPTY", sample_ID != "none") %>%
          select(sample_ID, leafArea_mm2)
    for(i in 1:length(dark)) {
      nfo <- get_system_nfo(dark[i])
      merged_dark[[i]] <- left_join(df, correct_dark(dark[i], LA, nfo$sys, nfo$osv), by = "sample_ID")
    }

    df <- do.call(bind_rows, merged_dark) %>% unique()
  } else {
    message("No DARK files detected: No Rd value included")
    df$Rd <- NA
  }
  
  return(df)
}