harpIO: IO functions and data wrangling for harp

Documented in read_grid read_grid_interpolate

#' Read gridded data in various formats
#'
#' \code{read_grid} is a generic function to read gridded data from a file. A
#' transformation can optionally be applied to those data.
#'
#' @param file_name A character string with the (full) file name.
#' @param parameter The parameter to be read.
#' @param is_forecast Logical. If TRUE (the default), data are filtered based on
#'   the forecast initialization time and the lead time. If FALSE, the data are
#'   filtered on the valid time.
#' @param dttm A vector of date time strings of the form YYYYMMDD, YYYYMMDDhh,
#'   YYYYMMDDhhmm, or YYYYMMDDhhmmss. For forecast data these would be the
#'   forecast initialization times. \code{\link[harpCore]{seq_dttm}} can be used
#'   to generate a vector of equally spaced date-time strings.
#' @param file_format The file format. Possible values include grib, netcdf, FA,
#'   hdf5... Whatever the value is, it is supposed to correspond to a function
#'   "read_XXX" that can deal with the format. If not specified, the format can
#'   often be guessed correctly from file extension or the first few bytes.
#' @param file_format_opts Options for the file format as generated by one of
#'   the options functions, e.g. \code{\link{netcdf_opts}}.
#' @param vertical_coordinate The vertical coordinate for upper air data. May be
#'   "pressure" for pressure levels, "model" for model levels or "height" for
#'   height levels.
#' @param lead_time The lead times to read from a forecast file. If set to NULL,
#'   all lead times will be read from the file.
#' @param members For ensemble data, the ensemble members to read from the file.
#'   If set to NULL all members will be read.
#' @param transformation The transformation to apply to the gridded data. Can be
#'   "none", "interpolate", "regrid" or "xsection".
#' @param transformation_opts Options for the transformation as generated by
#'   \code{\link{interpolate_opts}}, \code{\link{regrid_opts}} or
#'   \code{\link{xsection_opts}}.
#' @param param_defs A list of parameter definitions that includes the file
#'   format to be read. By default the built in list \code{\link{harp_params}}
#'   is used. Modifications and additions to this list can be made using
#'   \code{\link{modify_param_def}} and \code{\link{add_param_def}}
#'   respectively.
#' @param show_progress Show progress when reading large amounts of data.
#' @param data_frame Logical. For compatibility with current version of
#'   harpSpatial, make sure only a geofield is returned rather than a data frame
#'   when set to FALSE.
#' @param readable_times When \code{data_frame = TRUE}, whether to convert times
#'   in unix format to a data-time format. The default is TRUE.
#' @param spread_members When \code{data_frame = TRUE}, whether to spread the
#'   members into a column for each member.
#' @param ... All arguments passed to the specified reader function.
#'
#' @return A geofield or (possibly) a plain  matrix.
#' @export
#'
#' @examples
#' if (requireNamespace("Rgrib2", quietly = TRUE) & requireNamespace("harpData", quietly = TRUE)) {
#'   read_grid(
#'     system.file(
#'       "grib/AROME_Arctic/2018/07/10/00/fc2018071000+000grib_fp",
#'       package = "harpData"
#'     ),
#'     parameter = "T2m"
#'   )
#'   read_grid(
#'     system.file(
#'       "grib/AROME_Arctic/2018/07/10/00/fc2018071000+000grib_fp",
#'        package = "harpData"
#'     ),
#'     parameter = "S10m"
#'   )
#'   read_grid(
#'     system.file(
#'       "grib/AROME_Arctic/2018/07/10/00/fc2018071000+000grib_fp",
#'        package = "harpData"
#'     ),
#'     parameter = "tcc"
#'   )
#' }
read_grid <- function(
  file_name,
  parameter,
  is_forecast         = TRUE,
  dttm                = NULL,
  file_format         = NULL,
  file_format_opts    = list(),
  vertical_coordinate = c("pressure", "model", "height", NA),
  lead_time           = NULL,
  members             = NULL,
  transformation      = c("none", "interpolate", "regrid", "xsection", "subgrid"),
  transformation_opts = list(),
  param_defs          = get("harp_params"),
  show_progress       = TRUE,
  data_frame          = FALSE,
  readable_times      = TRUE,
  spread_members      = FALSE,
  ...
) {

  # Set progress bar to false for batch running
  if (!interactive()) show_progress <- FALSE

  if (missing(parameter)) parameter <- NULL

  if (!is.null(dttm)) {
    dttm <- str_datetime_to_unixtime(dttm)
  }

  if (any(is.na(vertical_coordinate))) vertical_coordinate <- as.character(vertical_coordinate)
  vertical_coordinate <- match.arg(vertical_coordinate)
  transformation      <- match.arg(transformation)
  transformation_opts <- check_opts(transformation, transformation_opts)

  file_format <- check_for_na(file_format)

  if (is.null(file_format)) {
    file_format <- guess_format(file_name)
  }

  if (length(file_format) > 1) {
    stop(
      "More than one 'file_format' passed: '", paste(file_format, collpase = "','"), "'.",
      call. = FALSE
    )
  }

  if (is.na(file_format)) {
    stop("Please provide explicit file format for '", file_name, "'.", call. = FALSE)
  }

  # Make sure default interpolate_opts are set for vfld files if none are passed
  if (file_format == "vfld" && !is.element("correct_t2m", names(transformation_opts))) {
    transformation_opts <- interpolate_opts(stations = harpCore::station_list)
  }

  read_func <- try(get(paste0("read_", file_format)), silent = TRUE)
  if (inherits(read_func, "try-error")) {
    stop(
      "Cannot find function 'read_", file_format, "' to read ", file_format, " files.",
      call. = FALSE
    )
  }

  members   <- unique(check_for_na(members))
  lead_time <- unique(check_for_na(lead_time))
  if (is.list(parameter) && length(parameter) == 1) {
    parameter <- parameter[[1]]
  }
  if (!inherits(parameter, "harp_parameter")) {
    parameter <- unique(check_for_na(parameter))
  }

  message("Reading ", file_name)

  gridded_data <- read_func(
    file_name           = file_name,
    parameter           = parameter,
    is_forecast         = is_forecast,
    date_times          = dttm,
    lead_time           = lead_time,
    members             = members,
    vertical_coordinate = vertical_coordinate,
    transformation      = transformation,
    transformation_opts = transformation_opts,
    format_opts         = file_format_opts,
    param_defs          = param_defs,
    show_progress       = show_progress
  )

  if (is.element("transformation_opts", names(attributes(gridded_data)))) {
    transformation_opts <- attr(gridded_data, "transformation_opts")
  }

  if (!is_forecast) {
    gridded_data <- dplyr::select(
      gridded_data,
      -.data[["lead_time"]],
      -.data[["fcdate"]]
    )
  }

  data_col <- switch(
    transformation,
    "none"     = "gridded_data",
    "regrid"   = "regridded_data",
    "xsection" = "xsection_data",
    "subgrid"  = "subgrid_data",
    NA
  )

  if (
    transformation == "none" &&
      is.element("station_data", colnames(gridded_data))
  ) {
    data_col <- NA_character_
  }

  if (readable_times) {

    if (!is.null(gridded_data[["fcdate"]])) {
      gridded_data[["fcdate"]] <- unix2datetime(gridded_data[["fcdate"]])
    }
    if (!is.null(gridded_data[["validdate"]])) {
      gridded_data[["validdate"]] <- unix2datetime(gridded_data[["validdate"]])
    }

  }

  if (spread_members && is.element("members", colnames(gridded_data))) {
    gridded_data[["members"]] <- paste0(
      "fcst_mbr",
      formatC(gridded_data[["members"]], width = 3, flag = "0")
    )
    gridded_data <- tidyr::spread(gridded_data, key = "members", value = data_col)
  }

  list_cols <- which(sapply(gridded_data, typeof) == "list")
  for (df_col in list_cols) {
    if (all(sapply(gridded_data[[df_col]], meteogrid::is.geofield))) {
      gridded_data[[df_col]] <- harpCore::geolist(gridded_data[[df_col]])
    }
  }

  if (any(sapply(gridded_data, function(x) inherits(x, "geolist")))) {
    class(gridded_data) <- c("harp_spatial_fcst", class(gridded_data))
  }

  if (transformation == "xsection") {

    all_attrs <- unique(lapply(gridded_data[[data_col]], attributes))

    if (length(all_attrs) == 1) {

      gridded_data <- tidyr::unnest(gridded_data, .data[[data_col]]) %>%
        dplyr::group_by_at(
          dplyr::vars(-.data[["level"]], -.data[["distance"]], -.data[["value"]])
        ) %>%
        tidyr::nest(
          !!rlang::sym(data_col) := c(
            .data[["level"]], .data[["distance"]], .data[["value"]]
          )
        ) %>%
        dplyr::ungroup()

      if (transformation_opts[["levels_ascending"]]) {
        levels_direction <- get("c")
      } else {
        levels_direction <- get("rev")
      }

      gridded_data[[data_col]] <- purrr::map(
        gridded_data[[data_col]],
        ~dplyr::mutate(
          dplyr::arrange(.x, .data[["level"]]),
          level_number = as.numeric(factor(levels_direction(.data[["level"]])))
        )
      )


      gridded_data[[data_col]] <- lapply(
        gridded_data[[data_col]],
        function(x) structure(x, class = c("harp_xs", class(x)))
      )

      class(gridded_data[[data_col]]) <- c("harp_xs_list", class(gridded_data[[data_col]]))
      class(gridded_data) <- c("harp_xs_df", class(gridded_data))

    }

    if (length(all_attrs) > 1) {
      stop("Different x sections found in the same data", call. = FALSE)
    }

  }

  if (!data_frame) {

    if (is.na(data_col)) {
      #message("For transformation = '", transformation, "' data must be returned as a data frame.")
      return(gridded_data)
    }

    if (nrow(gridded_data) == 1) {
      gridded_data <- gridded_data[[data_col]][[1]]
    } else {
      gridded_data <- gridded_data[[data_col]]
      if (transformation == "xsection") {
        class(gridded_data) <- union("xsection_list", class(gridded_data))
      } else {
        class(gridded_data) <- union("geolist", class(gridded_data))
      }
    }

  } else {

    attr(gridded_data, "transformation_opts") <- transformation_opts
    gridded_data <- dplyr::rename_with(
      gridded_data,
      ~gsub("date", "_dttm", .x),
      dplyr::matches("fcdate|validdate")
    )
    gridded_data <- dplyr::rename_with(
      gridded_data,
      ~gsub("fc", "fcst", .x),
      dplyr::matches("^fc_dttm$")
    )
    gridded_data <- dplyr::rename_with(
      gridded_data,
      ~gsub("leadtime", "lead_time", .x),
      dplyr::matches("^leadtime$")
    )
  }

  gridded_data

}

#' @rdname read_grid
#' @export
read_grid_interpolate <- function(file_name, parameter, file_format = NULL, ...) {
  if (is.null(file_format)) file_format <- guess_format(file_name)
  if (is.na(file_format)) stop("Please provide explicit file format for ", file_name, call. = FALSE)
  reader <- get(paste0("read_", file_format, "_interpolate"))
  reader(file_name, parameter = parameter, ...)
}

# Try to guess the binary format of a gridded data file.
# @param file_name The file name
# @return A character string with the format, or NA.
guess_format <- function(file_name) {
  if (!file.exists(file_name)) stop("File not found.", call. = FALSE)

  # first we try the file extension:
  ## TODO: tar? csv, txt...?
  ## for now, we assume tar -> fatar (it's the only one we know, anyway)
  ext <- tools::file_ext(file_name)
  ff <- switch(tolower(ext),
    "grib" = ,
    "grb"  = "grib",
    "nc"   = ,
    "nc4"  = ,
    "ncf"  = "netcdf",
    "h5"   = ,
    "hdf"  = ,
    "hdf5" = "hdf5",
    "tar"  = "fatar",
    NA_character_)
  ## try reading the first few bytes of the file
  if (is.na(ff)) {
    df <- file(file_name, open = "rb")
    ttt <- readBin(df, "raw", n = 4 * 8)
    close(df)
    if (rawToChar(ttt[1:4]) == "GRIB") return("grib")
    if (rawToChar(ttt[1:4]) == "BUFR") return("bufr")
    if (rawToChar(ttt[2:4]) == "HDF") return("hdf5")
    ## FA files have a header of 22 8-byte integers
    ## 2 and 4 should always have the same value
    header <- readBin(ttt, "integer", size = 8, n = 22, endian = "big")
    if (as.numeric(header[2]) == 16 && as.numeric(header[4]) == 22) return("fa")
    first_line <- try(scan(file_name, nlines = 1, quiet = TRUE))
    if (!inherits(first_line, "try-error")) {
      if (is.numeric(first_line) && length(first_line) > 1 & length(first_line) < 4) {
        if (nchar(first_line[1]) < 8) {
          return("vfld")
        }
        if (length(first_line) == 2) {
          try_date <- try(
            str_datetime_to_unixtime(
              paste0(
                first_line[1],
                formatC(as.integer(first_line[2]), width = 6, flag = "0")
              )
            ),
            silent = TRUE
          )
          if (!inherits(try_date, "try-error")) {
            return("obsoul")
          }
        }
      }
    }
  }
  return(ff)
}

check_for_na <- function(x) {
  x <- stats::na.omit(x)
  if (length(x) < 1) {
    x <- NULL
  }
  as.vector(x)
}

check_opts <- function(trans, trans_opts) {
  if (length(trans_opts) < 1) {
    if (trans == "none") {
      trans_opts[["keep_raw_data"]] <- TRUE
      return(trans_opts)
    }
    if (trans == "interpolate") {
      message(
        "transformation_opts not passed for transformation = 'interpolate'.\n",
        "Setting to default interpolate_opts()."
      )
      return(interpolate_opts())
    }
    stop("transformation_opts must be passed for transformation = '", trans, "'.", call. = FALSE)
  }
  trans_opts
}