R/fdata.R
In eFCM: Exponential Factor Copula Model

Documented in fdata

#' Transform datasets for factor copula modeling
#'
#' Prepares and organizes datasets for use with the exponential Factor Copula Model (eFCM). The function converts raw station-level observations and their spatial coordinates into an "fdata" object, which contains the data, grid structure, and neighborhood information required for model fitting with `fcm()`.
#'
#' @param data A matrix or data.frame. Each column corresponds to a station, with rows containing observations (on the original scale).
#' @param coord A two-column matrix or data frame of station coordinates (longitude and latitude), one row per station.
#' @param grid Optional two-column matrix or data frame of grid locations (longitude, latitude) at which the model will be fitted. If \code{NULL} (default), a regular grid is generated based on \code{cellsize}.
#' @param neigh Optional list of neighborhood station indices for each grid point. If \code{NULL}, neighborhoods
#'        are constructed using [neighborhood_HT()].
#' @param theta0 Optional matrix or data.frame with two columns: initial lambda and delta. Must match number of stations.
#' @param cellsize Numeric vector of length 1 or 2, specifying longitude and latitude resolution.
#' @param parallel Logical; if \code{TRUE}, run neighbourhood selection in parallel using \pkg{pbmcapply}.
#'   On Windows, \code{pbmclapply} will fall back to serial execution (progress still shown).
#' @param ncpus Integer; number of worker processes when \code{parallel = TRUE} on Unix-alikes.
#' @param mc.set.seed Logical; seed the RNG streams in workers (default \code{TRUE}).
#'   Effective on Unix-alikes; on Windows (serial fallback) it has no effect.
#' @param ... Additional arguments passed to [neighborhood_HT()].
#'
#' @return An object of class `"fdata"`, which is a list with elements:
#' \item{data}{Original input data}
#' \item{coord}{Coordinates of stations}
#' \item{grid}{Grid points with assigned IDs}
#' \item{neigh}{List of neighbor station indices per grid point}
#' \item{theta0}{Initial values matrix}
#' \item{N}{Number of stations}
#'
#' @importFrom progress progress_bar
#' @examples
#' \donttest{
#' # Load precipitation data for counterfactual scenarios
#' data("counterfactual")
#' data("LonLat")
#' coord = LonLat  # station coordinates (longitude-latitude)
#' cf_data <- fdata(counterfactual, coord, cellsize = c(1, 1))
#' }
#' @seealso [fcm()], [neighborhood_HT()]
#'
#' @srrstats {G2.0a} *Explicit documentation of expected input lengths*
#' @srrstats {G2.7} *Accepts standard tabular formats*
#' @srrstats {G2.8} *Performs input coercion and validation*
#' @srrstats {G2.9} *Warns about information loss in conversions*
#' @srrstats {G2.10} *Robust to column extraction across tabular types*
#' @srrstats {G5.2a} *All stop/warning messages are unique and testable*
#' @srrstats {G5.8a} *Handles zero-length input edge cases*
#' @srrstats {G5.8b} *Handles unsupported input types*
#'
#' @export

fdata <- function(data, coord, grid = NULL, neigh = NULL, theta0 = NULL, cellsize = c(0.5, 0.5), parallel = TRUE, ncpus = 4, mc.set.seed = TRUE, ...) {
  # Input check
  assert_tabular(data, "data")
  assert_tabular(coord, "coord")
  assert_no_missing(coord, "coord")

  if (ncol(data) != nrow(coord)) {
    stop("Number of stations (columns in `data`) does not match rows in `coord`.", call. = FALSE)
  }

  if (!is.null(theta0)) {
    assert_tabular(theta0, "theta0")
    if (nrow(theta0) != ncol(data)) {
      stop("Number of rows in `theta0` must match number of stations.", call. = FALSE)
    }
    if (ncol(theta0) != 2) {
      stop("`theta0` must have exactly two columns: lambda and delta.", call. = FALSE)
    }
  }

  # Construct grid
  if (is.null(grid)) {
    if (length(cellsize) == 1) cellsize <- rep(cellsize, 2)
    lon <- seq(min(coord[, 1]), max(coord[, 1]), by = cellsize[1])
    lat <- seq(min(coord[, 2]), max(coord[, 2]), by = cellsize[2])
    grid <- expand.grid(lon = lon, lat = lat)
  }
  if (ncol(grid) != 2) {
    stop("`grid` must have exactly two columns (longitude, latitude).", call. = FALSE)
  }
  names(grid) <- c("lon", "lat")


  # neighborhood selection
  if (is.null(neigh)) {
    if (nrow(grid) == 1) {
      temp <- tryCatch(
        neighborhood_HT(data, coord, unlist(grid[1, ]), ...),
        error = function(e) e
      )
      neigh <- list(if (!inherits(temp, "error")) temp else NULL)
    } else {
      requireNamespace("pbmcapply", quietly = TRUE)
      idx_seq <- seq_len(nrow(grid))

      mc_cores <- if (isTRUE(parallel) && .Platform$OS.type != "windows") {
        max(1L, as.integer(ncpus))
      } else {
        1L
      }

      neigh <- pbmcapply::pbmclapply(
        idx_seq,
        function(i) {
          temp <- tryCatch(
            neighborhood_HT(data, coord, unlist(grid[i, ]), ...),
            error = function(e) e
          )
          if (!inherits(temp, "error")) temp else NULL
        },
        mc.cores       = mc_cores,
        mc.set.seed    = mc.set.seed,
        mc.preschedule = FALSE
      )
    }
    message("Neighborhood selection completed.")
  }

  # Remove grid point without neighbors
  emp <- tidy(neigh)
  if (length(emp) > 0) {
    neigh <- neigh[-emp]
    grid  <- grid[-emp, ]
  }

  if (nrow(grid) == 0) {
    stop("All grid points were removed due to failed neighborhood selection.", call. = FALSE)
  }

  # output
  grid <- cbind(ID = seq_len(nrow(grid)), grid)

  out <- list(
    data    = data,
    coord   = coord,
    grid    = grid,
    neigh   = neigh,
    theta0  = theta0,
    N       = ncol(data)
  )
  class(out) <- c("fdata", class(out))
  return(out)
}