# R/class_Dataset.R

#' @include internal.R
NULL

#' Dataset class
#'
#' Definition for the Dataset class.
#'
#' @seealso [new_dataset()].
Dataset <- R6::R6Class(
  "Dataset",
  public = list(

    #' @field id `character` identifier.
    id = NA_character_,

    #' @field spatial_path `character` file path.
    spatial_path = NA_character_,

    #' @field attribute_path `character` file path.
    attribute_path = NA_character_,

    #' @field boundary_path `character` file path.
    boundary_path = NA_character_,

    #' @field spatial_data `NULL`, [sf::st_sf()], or [terra::rast()] object.
    spatial_data = NULL,

    #' @field attribute_data `NULL`, or [tibble::tibble()] object.
    attribute_data = NULL,

    #' @field boundary_data `NULL`, [Matrix::sparseMatrix()] object, or `NA`.
    boundary_data = NULL,

    #' @description
    #' Create a Dataset object.
    #' @param id `character` value.
    #' @param spatial_path `character` file path.
    #' @param attribute_path `character` file path.
    #' @param boundary_path `character` file path.
    #' @param spatial_data [sf::st_sf()], or [terra::rast()] object.
    #' @param attribute_data [tibble::tibble()] object.
    #' @param boundary_data [Matrix::sparseMatrix()] object or `NA`.
    #' @return A new Dataset object.
    initialize = function(id, spatial_path, attribute_path, boundary_path,
                          spatial_data, attribute_data, boundary_data) {
      
      ## assert that arguments are valid
      assertthat::assert_that(
        ## id
        assertthat::is.string(id),
        assertthat::noNA(id),
        ## spatial_path
        assertthat::is.string(spatial_path),
        assertthat::noNA(spatial_path),
        ## attribute_path
        assertthat::is.string(attribute_path),
        assertthat::noNA(attribute_path),
        ## boundary_path
        assertthat::noNA(boundary_path),
        assertthat::is.string(boundary_path),
        ## spatial_data
        inherits(spatial_data, c("NULL", "sf", "SpatRaster")),
        ## attribute_data
        inherits(attribute_data, c("NULL", "data.frame")),
        ## boundary_data
        inherits(boundary_data, c("NULL", "dsCMatrix", "logical"))
      )
      ## validate paths
      if (!identical(spatial_path, "memory")) {
        assertthat::assert_that(assertthat::is.readable(spatial_path))
      }
      if (!identical(attribute_path, "memory")) {
        assertthat::assert_that(assertthat::is.readable(attribute_path))
      }
      if (!identical(boundary_path, "memory")) {
        assertthat::assert_that(assertthat::is.readable(boundary_path))
      }
      ## set fields
      self$id <- id
      self$spatial_path <- spatial_path
      self$attribute_path <- attribute_path
      self$boundary_path <- boundary_path
      self$spatial_data <- spatial_data
      self$attribute_data <- attribute_data
      self$boundary_data <- boundary_data

      ### validate data
      if (inherits(self$spatial_data, "sf")) {
        #### CRS
        assertthat::assert_that(
          !is.na(methods::as(sf::st_crs(self$spatial_data), "CRS")),
          msg = "vector data must have a defined CRS"
        )
      } else if (inherits(self$spatial_data, "SpatRaster")) {
        #### CRS
        assertthat::assert_that(
          !is.na(methods::as(sf::st_crs(self$spatial_data), "CRS")),
          msg = "raster data must have a defined CRS"
        )
      }

      ## validate attribute data
      if (inherits(attribute_data, "data.frame")) {
        assertthat::assert_that(
          assertthat::has_name(self$attribute_data, "_index"),
          last(names(self$attribute_data)) == "_index"
        )
      }
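      ## A minimal illustrative sketch (not executed): the attribute table is
      ## expected to have one row per planning unit, with the "_index" column
      ## last; column names other than "_index" below are hypothetical:
      ##   tibble::tibble(
      ##     spp_1 = c(0.2, 0.5, 0.9),
      ##     cost = c(100, 200, 300),
      ##     `_index` = c(1L, 2L, 3L)
      ##   )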
    },

    #' @description
    #' Print the object.
    #' @param ... not used.
    print = function(...) {
      message("Dataset")
      message("  paths:")
      message("    spatial: ", self$spatial_path)
      message("    attribute: ", self$attribute_path)
      message("    boundary: ", self$boundary_path)
      invisible(self)
    },

    #' @description
    #' Generate a `character` summarizing the representation of the object.
    #' @return `character` value.
    repr = function() {
      if (identical(self$spatial_path, "memory")) {
        out <- "memory"
      } else {
        out <- paste0(".../", basename(self$spatial_path))
      }
      out
    },

    #' @description
    #' Import the data into memory.
    import = function() {
      # if data are not already in memory, then import them from the file paths
      ## spatial data
      if (is.null(self$spatial_data)) {
        ### import data
        suppressWarnings({
          self$spatial_data <- read_spatial_data(self$spatial_path)
        })
        ### validate data
        if (inherits(self$spatial_data, "sf")) {
          #### CRS
          assertthat::assert_that(
            !is.na(methods::as(sf::st_crs(self$spatial_data), "CRS")),
            msg = "vector data must have a defined CRS"
          )
        } else if (inherits(self$spatial_data, "SpatRaster")) {
          #### CRS
          assertthat::assert_that(
            !is.na(methods::as(sf::st_crs(self$spatial_data), "CRS")),
            msg = "raster data must have a defined CRS"
          )
        }
      }
      ## attribute data
      if (is.null(self$attribute_data)) {
        ### import data
        self$attribute_data <-
          tibble::as_tibble(data.table::fread(
            self$attribute_path,
            data.table = FALSE
          ))
        ## validate attribute data
        assertthat::assert_that(
          assertthat::has_name(self$attribute_data, "_index"),
          last(names(self$attribute_data)) == "_index"
        )
      }
      ## boundary data
      if (is.null(self$boundary_data)) {
        ### import data
        bd <- tibble::as_tibble(data.table::fread(
          self$boundary_path,
          data.table = FALSE
        ))
        ### convert to matrix format
        if (
          endsWith(self$boundary_path, ".csv") ||
          endsWith(self$boundary_path, ".csv.gz")
        ) {
          #### data stored in .csv or .csv.gz format
          self$boundary_data <- Matrix::sparseMatrix(
            i = bd[[1]], j = bd[[2]], x = bd[[3]],
            index1 = FALSE, repr = "C", symmetric = TRUE,
            dims = rep(nrow(self$attribute_data), 2)
          )
        } else if (
          endsWith(self$boundary_path, ".dat") ||
          endsWith(self$boundary_path, ".dat.gz")
        ) {
          #### data stored in .dat or .dat.gz format
          self$boundary_data <- Matrix::sparseMatrix(
            i = match(bd[[1]], self$attribute_data[["_index"]]),
            j = match(bd[[2]], self$attribute_data[["_index"]]),
            x = bd[[3]],
            index1 = TRUE, repr = "C", symmetric = TRUE,
            dims = rep(nrow(self$attribute_data), 2)
          )
        } else {
          stop("boundary_path is not a recognized file extension.")
        }
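        ### A minimal sketch of the two supported layouts (not executed; the
        ### numeric values are hypothetical, and the headers follow the
        ### write() method below):
        ### * .csv/.csv.gz files store zero-based matrix indices, e.g.
        ###     i,j,x
        ###     0,0,400
        ###     0,1,100
        ### * .dat/.dat.gz files store planning unit identifiers that match
        ###   the "_index" column, e.g.
        ###     id1,id2,boundary
        ###     1,1,400
        ###     1,2,100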
      }
      invisible(self)
    },

    #' @description
    #' Write the data to disk.
    #' @param spatial_path `character` file path.
    #' @param attribute_path `character` file path.
    #' @param boundary_path `character` file path.
    write = function(spatial_path, attribute_path, boundary_path) {
      # assert that arguments are valid
      assertthat::assert_that(
        assertthat::is.string(spatial_path),
        assertthat::noNA(spatial_path),
        assertthat::is.string(attribute_path),
        assertthat::noNA(attribute_path),
        assertthat::is.string(boundary_path),
        assertthat::noNA(boundary_path)
      )
      self$import()
      # spatial data
      if (inherits(self$spatial_data, "sf")) {
        suppressWarnings({
          sf::write_sf(self$spatial_data, spatial_path)
        })
      } else {
        suppressWarnings({
          terra::writeRaster(
            self$spatial_data, spatial_path,
            overwrite = TRUE, NAflag = -9999
          )
        })
      }
      # attribute data
      data.table::fwrite(
        self$attribute_data, attribute_path,
        sep = ",", row.names = FALSE
      )
      # boundary data
      bd <- methods::as(self$boundary_data, "TsparseMatrix")
      ### if data in .csv or .csv.gz format
      if (
        endsWith(boundary_path, ".csv") ||
        endsWith(boundary_path, ".csv.gz")
      ) {
        data.table::fwrite(
          tibble::tibble(i = bd@i, j = bd@j, x = bd@x),
          boundary_path,
          sep = ",", row.names = FALSE
        )
      } else if (
        endsWith(boundary_path, ".dat") ||
        endsWith(boundary_path, ".dat.gz")
      ) {
        data.table::fwrite(
          tibble::tibble(
            id1 = self$attribute_data[["_index"]][bd@i + 1],
            id2 = self$attribute_data[["_index"]][bd@j + 1],
            boundary = bd@x
          ),
          boundary_path,
          sep = ",", row.names = FALSE
        )
      } else {
        stop("boundary_path is not a recognized file extension.")
      }
      # return result
      invisible(self)
    },

    #' @description
    #' Remove the imported data from memory.
    #' @details
    #' Note that this method has no effect on data that do not have a
    #' file path on disk (i.e., data stored only in memory are retained).
    clean = function() {
      ## spatial data
      if (!identical(self$spatial_path, "memory")) {
        self$spatial_data <- NULL
      }
      ## attribute data
      if (!identical(self$attribute_path, "memory")) {
        self$attribute_data <- NULL
      }
      ## boundary data
      if (!identical(self$boundary_path, "memory")) {
        self$boundary_data <- NULL
      }
      invisible(self)
    },

    #' @description
    #' Get the spatial data.
    #' @return [sf::st_sf()] or [terra::rast()] object.
    get_spatial_data = function() {
      self$import()
      self$spatial_data
    },

    #' @description
    #' Get the attribute data.
    #' @return [tibble::tibble()] object.
    get_attribute_data = function() {
      self$import()
      self$attribute_data
    },

    #' @description
    #' Get the boundary data.
    #' @return [Matrix::sparseMatrix()] object.
    get_boundary_data = function() {
      self$import()
      self$boundary_data
    },

    #' @description
    #' Get the coordinate reference system.
    #' @return [sf::st_crs()] object.
    get_crs = function() {
      self$import()
      sf::st_crs(self$spatial_data)
    },

    #' @description
    #' Get the bounding box.
    #' @param native `logical` indicating if the bounding box should be
    #'   reported in the native coordinate reference system (`TRUE`) or
    #'   re-projected to longitude/latitude (`FALSE`)? Defaults to `TRUE`.
    #' @param expand `logical` indicating if the bounding box should be
    #'   expanded by 10%? Defaults to `FALSE`.
    #' @return `list` object with `"xmin"`, `"xmax"`, `"ymin"`, and `"ymax"`
    #'   elements.
    get_bbox = function(native = TRUE, expand = FALSE) {
      # assert arguments are valid
      assertthat::assert_that(
        assertthat::is.flag(native),
        assertthat::noNA(native),
        assertthat::is.flag(expand),
        assertthat::noNA(expand)
      )
      # import data if needed
      self$import()
      # generate extent object
      if (native) {
        # if native then extract extent
        ext <- terra::ext(self$get_spatial_data())
      } else {
        # if not native, then reproject data and extract extent
        ext <- terra::ext(self$get_spatial_data())
        ## convert to WGS1984
        ext <- terra::project(
          x = ext, from = terra::crs(self$get_spatial_data()), to = "EPSG:4326"
        ) 
      }
      # expand bounding box if needed
      if (expand) {
        out <- list()
        out$xmin <- unname(ext[1] - (0.1 * (ext[2] - ext[1])))
        out$xmax <- unname(ext[2] + (0.1 * (ext[2] - ext[1])))
        out$ymin <- unname(ext[3] - (0.1 * (ext[4] - ext[3])))
        out$ymax <- unname(ext[4] + (0.1 * (ext[4] - ext[3])))
      } else {
        out <- list(
          xmin = unname(ext[1]),
          xmax = unname(ext[2]),
          ymin = unname(ext[3]),
          ymax = unname(ext[4])
        )
      }
      # if using lon/lat CRS, then ensure valid extent
      if (!native) {
        out$xmin <- max(out$xmin, -180)
        out$xmax <- min(out$xmax, 180)
        out$ymin <- max(out$ymin, -90)
        out$ymax <- min(out$ymax, 90)
      }
      # return result
      out
    },
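    # A worked sketch of the 10% expansion (not executed): for a native extent
    # with xmin = 0, xmax = 10, ymin = 0, ymax = 20, calling
    # d$get_bbox(expand = TRUE) on a hypothetical dataset `d` pads each side by
    # 10% of the axis range, giving
    # list(xmin = -1, xmax = 11, ymin = -2, ymax = 22).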

    #' @description
    #' Get planning unit indices.
    #' @return `integer` vector of indices.
    get_planning_unit_indices = function() {
      self$import()
      self$attribute_data[["_index"]]
    },

    #' @description
    #' Get attribute names.
    #' @return `character` vector of field/layer names.
    get_names = function() {
      self$import()
      names(self$attribute_data)[-ncol(self$attribute_data)]
    },

    #' @description
    #' Get area values.
    #' @return `numeric` vector of values.
    get_planning_unit_areas = function() {
      self$import()
      idx <- self$attribute_data[["_index"]]
      if (inherits(self$spatial_data, "SpatRaster")) {
        out <-
          rep(prod(terra::res(self$spatial_data)), length(idx))
      } else {
        out <- as.numeric(sf::st_area(self$spatial_data[idx, ]))
      }
      out
    },

    #' @description
    #' Get data from the dataset at an index.
    #' @param index `character` or `integer` indicating the field/layer with
    #'   the data.
    #' @return [sf::st_sf()] or [terra::rast()] object.
    get_index = function(index) {
      assertthat::assert_that(
        is.character(index) || is.numeric(index),
        assertthat::noNA(index),
        all(self$has_index(index))
      )
      self$import()
      idx <- self$attribute_data[["_index"]]
      if (inherits(self$spatial_data, "SpatRaster")) {
        blank <- terra::setValues(self$spatial_data, NA_real_)
        out <- lapply(index, function(x) {
          r <- blank
          r[idx] <- self$attribute_data[[x]]
          r
        })
        if (length(index) == 1) {
          out <- out[[1]]
        } else {
          # combine list elements into a single multi-layer SpatRaster
          out <- terra::rast(out)
        }
      } else {
        out <- tibble::as_tibble(self$attribute_data[, index, drop = FALSE])
        out$geometry <- sf::st_geometry(self$spatial_data)[idx]
        out <- sf::st_as_sf(out, sf_column_name = "geometry")
        attr(out, "agr") <- NULL
      }
      if (is.character(index)) {
        names(out)[seq_along(index)] <- index
      } else {
        names(out)[seq_along(index)] <- paste0("V", index)
      }
      out
    },

    #' @description
    #' Check if the dataset has an index.
    #' @param index `character` or `integer` indicating the field/layer with
    #'   the data.
    #' @return `logical` indicating if data is present or not.
    has_index = function(index) {
      assertthat::assert_that(
        is.character(index) || is.numeric(index),
        assertthat::noNA(index)
      )
      self$import()
      if (is.numeric(index)) {
        out <-
          index %in% seq_along(names(self$attribute_data)[-1])
      } else {
        out <-
          index %in% (names(self$attribute_data)[-ncol(self$attribute_data)])
      }
      out
    },

    #' @description
    #' Maximum index.
    #' @return `integer` largest index.
    max_index = function() {
      self$import()
      length(names(self$attribute_data)) - 1
    },

    #' @description
    #' Add data at an index.
    #' @param index `character` or `integer` indicating the field/layer with
    #'   the data.
    #' @param values `numeric` vector.
    add_index = function(index, values) {
      # import data if needed
      self$import()
      # assert arguments are valid
      assertthat::assert_that(
        assertthat::is.string(index) || assertthat::is.count(index),
        assertthat::noNA(index),
        length(values) == nrow(self$attribute_data)
      )
      # if index is an integer, then generate new column name
      # because each column must have a name
      if (is.numeric(index)) {
        index <- uuid::UUIDgenerate()
      }
      # insert new column with values
      self$attribute_data[[index]] <- values
      # re-order columns
      self$attribute_data <-
        self$attribute_data[
          ,
          c(setdiff(names(self$attribute_data), "_index"), "_index")
        ]
      # return self
      invisible(self)
    },
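    # A minimal usage sketch (not executed; the dataset object `d` and the
    # column name "new_col" are hypothetical):
    #   d$add_index("new_col", stats::runif(length(d$get_planning_unit_indices())))
    #   d$get_index("new_col")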
    
    #' @description
    #' Update an old-format boundary matrix to the new format.
    #' @return Invisible `self` with the updated boundary data.
    update_bm = function() {
      self$import()
      bm_colsums  <- Matrix::colSums(self$boundary_data)
      Matrix::diag(self$boundary_data) <- bm_colsums
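      # A worked sketch (not executed): for an old-format 2 x 2 boundary
      # matrix with shared lengths off the diagonal, e.g.
      # matrix(c(3, 1, 1, 2), nrow = 2), the column sums c(4, 3) replace the
      # diagonal, so each diagonal entry then holds the unit's total
      # perimeter (assuming that interpretation of the old format).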
      # return self
      invisible(self)
    }    
  )
)

#' New dataset
#'
#' Create a new [Dataset] object.
#'
#' @param spatial_path `character` file path for spatial data.
#'
#' @param attribute_path `character` file path for attribute data.
#'
#' @param boundary_path `character` file path for boundary data.
#'
#' @param spatial_data `NULL`, [sf::st_sf()], or [terra::rast()] object.
#'   Defaults to `NULL` such that data are automatically imported
#'   using the argument to `spatial_path`.
#'
#' @param attribute_data `NULL`, or [tibble::tibble()] object.
#'   Defaults to `NULL` such that data are automatically imported
#'   using the argument to `attribute_path`.
#'
#' @param boundary_data `NULL`, or [Matrix::sparseMatrix()] object.
#'   Defaults to `NULL` such that data are automatically imported
#'   using the argument to `boundary_path`.
#'
#' @param id `character` unique identifier.
#'   Defaults to a random identifier ([uuid::UUIDgenerate()]).
#'
#' @return A [Dataset] object.
#'
#' @examples
#' # find data file paths
#' f1 <- system.file(
#'   "extdata", "projects", "sim_raster", "sim_raster_spatial.tif",
#'   package = "wheretowork"
#' )
#' f2 <- system.file(
#'   "extdata",  "projects", "sim_raster", "sim_raster_attribute.csv.gz",
#'   package = "wheretowork"
#' )
#' f3 <- system.file(
#'   "extdata",  "projects", "sim_raster", "sim_raster_boundary.csv.gz",
#'   package = "wheretowork"
#' )
#'
#' # create new dataset
#' d <- new_dataset(f1, f2, f3)
#'
#' # print object
#' print(d)
#' @export
new_dataset <- function(spatial_path, attribute_path, boundary_path,
                        spatial_data = NULL, attribute_data = NULL,
                        boundary_data = NULL,
                        id = uuid::UUIDgenerate()) {
  # verify that data are supplied when specifying that data
  # are stored in memory
  if (identical(spatial_path, "memory")) {
    assertthat::assert_that(!is.null(spatial_data))
  }
  if (identical(attribute_path, "memory")) {
    assertthat::assert_that(!is.null(attribute_data))
  }
  if (identical(boundary_path, "memory")) {
    assertthat::assert_that(!is.null(boundary_data))
  }
  # create new dataset
  Dataset$new(
    id = id,
    spatial_path = spatial_path,
    attribute_path = attribute_path,
    boundary_path = boundary_path,
    spatial_data = spatial_data,
    attribute_data = attribute_data,
    boundary_data = boundary_data
  )
}
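
# A minimal sketch (not executed) of constructing a dataset held entirely in
# memory: when a path is the "memory" sentinel, the corresponding data object
# must be supplied (the object names below are hypothetical).
# d <- new_dataset(
#   spatial_path = "memory", attribute_path = "memory",
#   boundary_path = "memory",
#   spatial_data = my_sf, attribute_data = my_attr, boundary_data = my_bm
# )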

#' New dataset from automatic calculations
#'
#' Create a new [Dataset] object by automatically calculating
#' all metadata from the underlying data.
#' This function is useful when pre-calculated metadata are not available.
#' Note that this function will take longer than other constructor functions
#' because it performs geospatial operations.
#'
#' @param x [sf::st_sf()] object, or a [terra::rast()] object
#'   (potentially with multiple layers).
#'
#' @inheritParams new_dataset
#'
#' @inherit new_dataset return
#'
#' @examples
#' # find example data
#' f <- system.file(
#'   "extdata", "projects", "sim_raster", "sim_raster_spatial.tif",
#'   package = "wheretowork"
#' )
#'
#' # import data
#' r <- suppressWarnings(terra::rast(f))
#' r <- c(r, r * 2, r * 3, r * 4)
#' names(r) <- c("r1", "r2", "r3", "r4")
#'
#' # create new dataset
#' d <- new_dataset_from_auto(r)
#'
#' # print object
#' print(d)
#' @export
new_dataset_from_auto <- function(x, id = uuid::UUIDgenerate()) {
  
  # assert arguments are valid
  assertthat::assert_that(
    inherits(x, c("sf", "SpatRaster"))
  )
  
  # assert that the SpatRaster layer names or sf attribute names are unique
  assertthat::assert_that(
    anyDuplicated(names(x)) == 0, msg = "names must be unique"
  )

  # prepare geometry data
  if (inherits(x, "sf")) {
    x[["_index"]] <- seq_len(nrow(x))
    spatial_data <- x[, "_index"]
  } else {
    spatial_data <- x[[1]]
  }

  # prepare attribute data
  if (inherits(x, "sf")) {
    attribute_data <- sf::st_drop_geometry(x)
  } else {
    attribute_data <- terra::as.data.frame(x, na.rm = FALSE)
    # count the number of missing (NA) layer values in each raster cell
    pu_idx <- rowSums(is.na(as.matrix(attribute_data)))
    attribute_data <- tibble::as_tibble(attribute_data)
    attribute_data <- dplyr::select_if(attribute_data, is.numeric)
    attribute_data[["_index"]] <- seq_len(nrow(attribute_data))
    # retain only cells with no missing values (i.e., the planning units)
    attribute_data <- attribute_data[pu_idx < 0.5, , drop = FALSE]
  }

  # fix geometry if needed
  if (inherits(spatial_data, "sf")) {
    spatial_data <- repair_spatial_data(spatial_data)
  }
  
  # build boundary data
  bm <- try({
    # re-project sf if CRS is not projected. only used for generating boundary
    if (inherits(spatial_data, "sf") && (sf::st_is_longlat(spatial_data))) {
      bm_spatial_data  <- sf::st_transform(spatial_data, 3857)
    } else {
      bm_spatial_data <- spatial_data
    }
    # prepare boundary data
    bm <- prioritizr::boundary_matrix(bm_spatial_data)
    if (inherits(x, "SpatRaster")) {
      bm <- bm[attribute_data[["_index"]], attribute_data[["_index"]]]
    }
    bm # return bm
  }, silent = TRUE)
  
  # if the boundary matrix could not be generated (e.g., a std::bad_alloc
  # memory error when importing large shapefiles), then store NA instead
  if (inherits(bm, "try-error")) {
    bm <- NA
  }
  
  # create new dataset
  Dataset$new(
    spatial_path = "memory",
    attribute_path = "memory",
    boundary_path = "memory",
    spatial_data = spatial_data,
    attribute_data = attribute_data,
    boundary_data = bm,
    id = id
  )
}