R/get_file.R

Defines functions str2regex parse_mixed_file get_file_from_usms get_file_one get_file_ get_file

#' Read STICS observation or simulation files (.obs or mod_s)
#'
#' @description Read STICS observation or simulation files from a JavaSTICS
#' workspace and store data into a list per usm.
#' Used by `get_obs()` and `get_sim()`. Operate first computation and
#' then call `get_file_()`.
#'
#' @param workspace      Path of a JavaSTICS workspace, or a vector of.
#' @param usm_name       Vector of usms to read (optional, used to filter usms)
#' @param var_list   vector of output variables names to filter
#' (optional, see `get_var_info()` to get the names of the variables)
#' @param dates_list list of dates to filter (optional, should be a POSIX date)
#' @param usms_filepath  Path of the usms file (optional)
#' @param javastics_path JavaSTICS installation path (optional, needed if
#' the plant files are not in the `workspace` but rather in the JavaSTICS
#'  default workspace). Only used to get the plants names.
#' @param verbose        Logical value (optional), TRUE to display information
#' on error, FALSE otherwise (default)
#' @param type          The type of file to read, either "obs" or "sim".
#'
#' @details The `.obs` files names should match USMs names, e.g.
#' for a usm called "banana", the `.obs` file should be named `banana.obs`.
#' For intercrops, the name should be suffixed by "p" for the principal
#' and "a" for the associated plant.
#'
#' @return A named list of `data.frame`s with observations or simulation data.
#' The list elements are named after
#' the usms names.
#'
#' @keywords internal
#'
#' @noRd
#'
get_file <- function(
    workspace,
    usm_name = NULL,
    var_list = NULL,
    dates_list = NULL,
    usms_filepath = NULL,
    javastics_path = NULL,
    verbose = TRUE,
    type = c("sim", "obs")) {
  type <- match.arg(type, c("sim", "obs"), several.ok = FALSE)

  usms_path <- NULL

  # Try absolute path here
  # (if it is, read the file only once, and pass its content)
  if (!is.null(usms_filepath)) {
    usms_path <- normalizePath(usms_filepath, mustWork = FALSE)
    # For the moment searching the usms.xml file in the workspace dir
    # is inactivated, before doing tests on performances.
    # TODO: add a else condition with same command using
    # workspace and "usms.xml"
  }

  # Not keeping usms_filepath if does not exist
  if (!is.null(usms_path) && !file.exists(usms_path)) {
    warning(usms_path, ": file does not exist !")
    usms_path <- NULL
  }

  # Extracting data for a vector of workspace
  res <- unlist(
    lapply(workspace, function(x) {
      get_file_(
        workspace = x,
        usm_name = usm_name,
        usms_filepath = usms_path,
        var_list = var_list,
        dates_list = dates_list,
        javastics_path = javastics_path,
        verbose = verbose,
        type = type
      )
    }),
    recursive = FALSE
  )

  return(res)
}

#' Read STICS observation or simulation files (.obs or mod_s)
#'
#' @description Read STICS observation or simulation files from a
#' JavaSTICS workspace and store data into a list per usm.
#' Used by `get_obs()` and `get_sim()`.
#'
#' @param workspace      Path of a JavaSTICS workspace
#' @param usm_name       Vector of usms to read (optional, used to filter usms)
#' @param usms_filepath  Path of the usms file (optional)
#' @param var_list   vector of output variables names to filter
#' (optional, see `get_var_info()` to get the names of the variables)
#' @param dates_list list of dates to filter (optional, POSIX dates)
#' @param javastics_path JavaSTICS installation path (optional, needed if
#' the plant files are not in the `workspace` but rather in the JavaSTICS
#' default workspace). Only used to get the plants names.
#' @param verbose        Logical value (optional), TRUE to display information
#' on error, FALSE otherwise (default)
#' @param type          The type of file to read, either "obs" or "sim".
#'
#' @details The `.obs` files names should match USMs names, e.g. for a
#' usm called "banana", the `.obs` file should be named `banana.obs`.
#' For intercrops, the name should be suffixed by "p" for the principal
#' and "a" for the associated plant.
#'
#' @return A named list of `data.frame`s with observations or simulation data.
#' The list elements are named after the usms names.
#'
#' @importFrom rlang .data
#'
#' @keywords internal
#'
#' @noRd
#'
get_file_ <- function(
    workspace,
    usm_name = NULL,
    usms_filepath = NULL,
    var_list = NULL,
    dates_list = NULL,
    javastics_path = NULL,
    verbose = TRUE,
    type = c("sim", "obs")) {
  # TODO: add checking dates_list format, or apply the used format in sim
  # data.frame

  type <- match.arg(type, c("sim", "obs"), several.ok = FALSE)
  if (type == "sim") {
    file_pattern <- "^mod_s"
    file_ext <- "sti"
    full_type <- "simulation"
  } else {
    file_pattern <- "\\.obs$"
    file_ext <- "obs"
    full_type <- "observation"
  }

  # Getting files list from workspace vector
  workspace_files <- list.files(
    pattern = file_pattern,
    path = workspace,
    recursive = FALSE
  )

  # Checking if usm_name correspond to existing simulation
  # or observation files, a warning with missing outputs/obs usm
  # names
  if (length(workspace_files) > 0) {
    if (!is.null(usm_name)) {
      idx <- lapply(
        str2regex(usm_name),
        function(y) {
          # using optional "p" or "a" in pattern for associated crops
          # p for principal crop, a for associated crop
          patt <- paste0(y, "[a|p]?\\.", file_ext)
          grep(pattern = patt, x = workspace_files)
        }
      )
      usm_idx <- unlist(lapply(idx, function(x) length(x) > 0))
      files_idx <- unlist(idx)
      workspace_files <- workspace_files[files_idx]
    }
  }

  # Trying to find sub-directories named with usms names
  if (!is.null(usm_name)) {
    workspace_sub <- file.path(workspace, usm_name)
  } else {
    # If no usm_name is given, we are looking for all sub-dirs
    # in the workspace
    workspace_sub <- list.dirs(
      path = workspace,
      full.names = TRUE,
      recursive = FALSE
    )
  }

  # Getting the files list from sub-directories
  if (exists("workspace_sub") && length(workspace_sub) > 0) {
    workspace_files_sub <- unlist(
      lapply(workspace_sub, {
        function(x) {
          list.files(
            path = x,
            pattern = file_pattern,
            recursive = FALSE,
            full.names = TRUE
          )
        }
      })
    )
  }

  # Testing if duplicates files found either in workspace or in sub-dirs
  if (exists("workspace_files_sub") && (length(workspace_files_sub) > 0)) {
    # checking common files
    common_idx <- basename(workspace_files_sub) %in% workspace_files
    if (any(common_idx)) {
      warning(
        "Files exist in both ",
        workspace,
        " and ",
        workspace_sub[common_idx],
        ": \n",
        paste(basename(workspace_files_sub)[common_idx], collapse = ", ")
      )
    }
  } else {
    workspace_files_sub <- vector(mode = "character", 0)
  }

  # Exiting without finding any file
  if (!length(workspace_files) > 0) {
    # No sim/obs file found
    if (!length(workspace_files_sub) > 0) {
      warning(
        "Not any ",
        full_type,
        " file detected, neither in workspace ",
        workspace,
        ", nor in sub-directory(ies)."
      )
      return()
    }
    workspace_files <- workspace_files_sub
    workspace <- dirname(workspace_files_sub)
  }

  # No usms file path is given
  if (!is.null(usms_filepath)) {
    # In the get_file_from_usms the usms are filtered against
    # usm_name
    file_name <- get_file_from_usms(
      workspace = workspace,
      usms_path = usms_filepath,
      type = type,
      usm_name = usm_name
    )

    # Filtering existing files in file_name list
    exist_files <- unlist(
      lapply(file_name, function(x) all(x %in% basename(workspace_files)))
    )

    file_name <- file_name[exist_files]

    # Exiting: not any existing files
    if (!length(file_name)) {
      return()
    }

    # Getting plant names, if javastics_path or workspace path contains
    # a plant directory, otherwise setting plant name to plant file name
    # as a default.
    usms <- names(file_name)
    plant_names <-
      get_plant_name(workspace, usms_filepath, usms, javastics_path, verbose)
  }

  # The user did not provide any usms file path, so using the names of
  # the .sti files as information.
  if (is.null(usms_filepath)) {
    # Getting sim/obs files list from directory
    file_name <-
      parse_mixed_file(
        file_names = as.list(basename(workspace_files)),
        type = type
      )
    usms <- names(file_name)

    # Selecting using usm_name
    if (!is.null(usm_name)) {
      usms <- intersect(usms, usm_name)
      # Not any matching names
      if (!length(usms)) {
        return()
      }
      file_name <- file_name[usms]
    }
    # Calculating plant ids
    plant_names <- lapply(file_name, function(x) {
      if (length(x) > 1) {
        c("plant_1", "plant_2")
      } else {
        c("plant_1")
      }
    })
  }

  # to be sure that file_name and workspace are in the same order ...
  # this may not be the case if usms_filepath is not given and if some
  # USMs are named ****a or ****p, but are not intercrop USMs
  if (length(workspace) > 1) {
    idx <- sapply(
      str2regex(basename(workspace)),
      function(y) {
        grep(
          pattern = paste0("^", y, "$"),
          x = names(file_name)
        )
      }
    )

    to_remove <- which(sapply(idx, function(x) (length(x) == 0)))

    if (length(to_remove) > 0) {
      workspace <- workspace[-to_remove]
    }
    file_name <- file_name[unlist(idx)]
  }

  # Getting sim/obs data list
  df_list <- mapply(
    function(dirpath, filename, p_name) {
      get_file_one(
        dirpath,
        filename,
        p_name,
        verbose,
        dates_list,
        var_list
      )
    },
    dirpath = workspace,
    filename = file_name,
    p_name = plant_names,
    SIMPLIFY = FALSE,
    USE.NAMES = FALSE
  )

  names(df_list) <- names(file_name)

  return(df_list)
}


#' Get file for one workspace / file_name / plant_name
#'
#' Get a simulation or observation file for one situation at a time,
#' for sole or intercrop
#'
#' @param dirpath Path of a JavaSTICS workspace
#' @param filename File name(s)
#' @param p_name Plant name(s)
#' @param verbose Logical value (optional), TRUE to display information
#' on error, FALSE otherwise (default)
#' @param dates_list list of dates to filter (optional, should be a POSIX date)
#' @param var_list vector of output variables names to filter
#' (optional, see `get_var_info()` to get the names of the variables)
#'
#' @return the obs or simulation output
#' @keywords internal
#'
#' @noRd
#'
get_file_one <- function(
    dirpath,
    filename,
    p_name,
    verbose,
    dates_list,
    var_list) {
  out <-
    get_file_int(dirpath, filename, p_name, verbose = verbose) %>%
    dplyr::select_if(function(x) {
      any(!is.na(x))
    })

  # Filtering
  # Filtering Date on dates_list (format Posixct)
  if (!is.null(dates_list) && "Date" %in% names(out)) {
    out <-
      out %>%
      dplyr::filter(out$Date %in% dates_list)
  }

  # Selecting variables columns
  if (!is.null(var_list)) {
    # Managing output columns according to out content
    out_cols <- var_to_col_names(var_list)
    time_idx <- names(out) %in% c("ian", "mo", "jo", "jul")
    time_elts <- names(out)[time_idx]
    out_cols <- c(time_elts, out_cols)
    if ("cum_jul" %in% names(out)) out_cols <- c("cum_jul", out_cols)
    if ("Date" %in% names(out)) out_cols <- c("Date", out_cols)
    if ("Plant" %in% names(out)) out_cols <- c(out_cols, "Plant")
    out <-
      out %>%
      dplyr::select(dplyr::one_of(out_cols))
  }

  if (length(p_name) > 1) {
    out$Dominance <- "Principal"
    out$Dominance[out$Plant == p_name[2]] <- "Associated"
  }
  out
}

get_file_from_usms <- function(
    workspace,
    usms_path,
    type = c("sim", "obs"),
    usm_name = NULL,
    verbose = TRUE) {
  # Getting usms names from the usms.xml file
  usms <- get_usms_list(file = file.path(usms_path))

  # Filtering USMs if required:
  if (!is.null(usm_name)) {
    usm_exist <- usm_name %in% usms

    # Some provided usms are not available:
    if (!all(usm_exist)) {
      if (verbose) {
        cli::cli_alert_danger(
          paste0(
            "The usm{?s} {.val {usm_name[!usm_exist]}}",
            " d{?oes/o} not exist in the workspace!"
          )
        )
        cli::cli_alert_info("Usm{?s} found in the workspace: {.val {usms}}")
      }
      stop(usm_name, ": do(es) not match usms")
    }
    usms <- usm_name
  }

  # Intercropping
  mixed <- get_plants_nb(usms_path)[usms] > 1
  # Extracting expected observation file names:
  file_name <- vector(mode = "list", length = length(usms))
  names(file_name) <- usms

  if (type == "sim") {
    file_name[!mixed] <- paste0("mod_s", usms[!mixed], ".sti")
    file_name[mixed] <- lapply(usms[mixed], function(x) {
      paste0("mod_s", c("p", "a"), x, ".sti")
    })
  } else {
    file_name[!mixed] <- paste0(usms[!mixed], ".obs")
    file_name[mixed] <- lapply(usms[mixed], function(x) {
      paste0(x, c("p", "a"), ".obs")
    })
  }

  # Filtering with all files exist
  # Using now possibly multiple workspaces
  files_exist <- mapply(
    function(dirpath, filename) {
      all(file.exists(file.path(dirpath, filename)))
    },
    dirpath = workspace,
    filename = file_name
  )

  file_name <- file_name[files_exist]

  return(file_name)
}


#' Get mixed file names
#'
#' Get mixed observation or simulation files by name
#'
#' @param file_names A list of files
#' @param type      The type of file to read, either "obs" or "sim".
#' @note The function use the obs/sim files names to retrieve the usm name.
#' So each obs file should be named with the usm name, followed by a or p
#' at the end in the case of associated crops.
#'
#' @return A list of observation or simulation files associated to
#' their usm name. The mixed crops are always returned as
#' c("Principal","Associated").
#'
#' @keywords internal
#'
#' @noRd
#'
#' @examples
#' \dontrun{
#' parse_mixed_file(
#'   list(
#'     "banana.obs", "IC_banana_sorghuma.obs",
#'     "IC_banana_sorghump.obs"
#'   ),
#'   type = "obs"
#' )
#'
#' # Simulations with usm names starting with "a", with or
#' # without intercropping:
#' file_names <- list(
#'   "mod_sauzevilleSC_Wheat_Wheat_2005-2006_N0.sti",
#'   "mod_saIC_Wheat_Wheat_2005-2006_N0.sti",
#'   "mod_spIC_Wheat_Wheat_2005-2006_N0.sti"
#' )
#'
#' parse_mixed_file(file_names, type = "sim")
#' }
#'
parse_mixed_file <- function(file_names, type = c("sim", "obs")) {
  type <- match.arg(type, c("sim", "obs"), several.ok = FALSE)

  if (type == "sim") {
    usm_pattern <- "^(mod_s)|(\\.sti)$"
    mixed_pattern <- "^(mod_s(a|p))|(\\.sti)$"
    associated_pattern <- "^mod_sa"
  } else {
    usm_pattern <- "\\.obs$"
    mixed_pattern <- "((a|p)\\.obs)$"
    associated_pattern <- "a\\.obs$"
  }

  usm_names <- gsub(pattern = usm_pattern, replacement = "", x = file_names)
  names(file_names) <- usm_names

  is_potential_mixed <- grepl(mixed_pattern, file_names)

  usm_name_potential_mixed <-
    gsub(pattern = mixed_pattern, replacement = "", x = file_names)

  potential_mixed <- usm_name_potential_mixed[is_potential_mixed]

  file_names2 <- file_names

  mixed_and_not_duplicated <-
    seq_along(file_names)[is_potential_mixed][!duplicated(potential_mixed)]

  for (i in mixed_and_not_duplicated) {
    mixed <- which(
      usm_name_potential_mixed[i] == usm_name_potential_mixed &
        is_potential_mixed
    )

    if (length(mixed) > 1) {
      mixed_names <- unlist(file_names[mixed])

      associated_index <- grep(associated_pattern, mixed_names)

      file_names2[[i]] <-
        c(mixed_names[-associated_index], mixed_names[associated_index])

      names(file_names2)[i] <- usm_name_potential_mixed[i]
    } else {
      # Here we thougth it was mixed, but it really is not because
      # we did not found another associated file with the same name
      # modulo "a" or "p"
      names(file_names2)[i] <-
        gsub(pattern = usm_pattern, replacement = "", x = file_names2[i])
    }
  }
  file_names2[c(which(!is_potential_mixed), mixed_and_not_duplicated)]
}


#' Transform a string into a regex string
#'
#' @param in_str The string to transform
#'
#' @return A string with special characters (. + *) replaced
#' with escaped ones
#'
#' @keywords internal
#'
#' @noRd
#'
#' @examples
#'
#' str2regex("myfile.ext")
#' str2regex("myfile+.ext")
#' str2regex("mydir*")
#'
str2regex <- function(in_str) {
  regex_chars <- c("\\.", "\\+", "\\*")
  replace_chars <- paste0("\\", regex_chars)
  out_str <- in_str
  for (i in seq_along(regex_chars)) {
    out_str <- gsub(
      x = out_str,
      pattern = regex_chars[i],
      replacement = replace_chars[i]
    )
  }
  out_str
}
SticsRPacks/SticsRFiles documentation built on July 4, 2025, 4:19 p.m.