#' extractSimData
#'
#' @description This function extracts and aggregates EverForecast simulation data (contained in a list of netCDFs) for spatial locations of interest. The output is a list of four permutations of the same data. This function can take a very long time, depending on the number and size of the regions of interest (e.g., ~2 hours for ~50 Combined Operations Plan indicator regions).
#'
#' @param simulationData a list of netCDF simulation data in SpatRaster format (e.g., "datList" object)
#' @param targetLocations locations to extract data from; class SpatVector (sp and sf objects are converted internally)
#' @param targetLocationNames optional vector of names for the target locations (e.g., pts$gage)
#' @param func function to apply to the data when multiple raster cells fall within a location of interest. The function name can be a character string or the function object (i.e., "mean" and mean produce identical output)
#'
#' @return a list. This function extracts simulation data for spatial locations of interest (polygons, points)
#' and returns a list with four objects (permutations of the same data):
#' (1) a list with a dataframe for each simulation: rows = days, columns = target locations,
#' (2) a list with a dataframe for each target location: rows = simulations, columns = days (useful in cluster analysis),
#' (3) a wide dataframe, and
#' (4) a long dataframe
#'
#' @importFrom stats reshape
#' @importFrom terra plot
#' @importFrom terra vect
#' @importFrom terra rast
#' @importFrom terra crs
#' @importFrom terra project
#' @importFrom terra extract
#'
#' @examples
#' \dontrun{
#' ### step 1: identify region(s) of interest
#' locs <- IRMap[[2]]
#' loc <- locs[locs$INDICATOR %in% 118:119, ]
#' ### step 2: homogenize projections (or let the function do this internally)
#' # loc <- spTransform(loc, crs(edenDEM))
#'
#' ### step 3: load EverForeCast data from netCDFs
#' ### available here:
#' ### https://s3.amazonaws.com/jem.models.headless/Ever4Cast_2020_12/simulations_interpolated.zip
#' # simDat <- loadSims()
#'
#' ### step 4: extract simulation data at each ROI
#' # EFDat <- extractSimData(simulationData = simDat,
#' # targetLocations = loc,
#' # targetLocationNames = loc$NAME)
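#' ### step 5 (illustrative sketch): inspect the returned list; the element names
#' ### used here (traceDataLong, listOfLocations) match the names of the list
#' ### returned by extractSimData
#' # str(EFDat$traceDataLong)
#' # head(EFDat$listOfLocations[[1]])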
#' }
#' @export
#'
extractSimData <- function(simulationData, # = datList,# *a list* of ncdf simulation data
targetLocations, # = pts,# locations to extract data from (IRMap[[2]])
targetLocationNames = NULL, # name of target locations (e.g., pts$gage)
func = mean # function to apply to data (if multiple raster cells are in location of interest)
) {
### This function extracts simulation data for spatial locations of interest (polygons, points)
### It returns a list with four objects (permutations of the same data):
### (1) a list with a dataframe for each simulation data time series for each target location,
### (2) a list with a dataframe for each IR: rows = simulations, columns = days. Useful in cluster analysis
### (3) a wide dataframe, and
### (4) a long dataframe
### convert data to terra
if(any(class(targetLocations) %in% c("SpatialPolygonsDataFrame", "SpatialPointsDataFrame", 'sf'))) {
targetLocations <- terra::vect(targetLocations)
}
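### edge case (single target location): duplicate the feature so that downstream
### extraction keeps one column per location; the duplicate is removed before returning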
remove_duplicate_data <- FALSE
if (nrow(targetLocations) == 1) {
targetLocations <- rbind(targetLocations, targetLocations)
targetLocationNames <- c(targetLocationNames, targetLocationNames)
remove_duplicate_data <- TRUE
}
### convert EDEN data to terra, if necessary
if(any(class(simulationData[[1]]) %in% c("RasterBrick", "RasterStack", "Raster", "raster"))) {
### if test is a raster:
simulationData <- lapply(X = simulationData, FUN = terra::rast)
} else if (!any(class(simulationData[[1]]) %in% c("SpatRaster"))) {
stop("simulationData input must be in terra (SpatRaster) or raster format. Pro-tip: use `returnType = 'raster'` argument in fireHydro::getEDEN()")
}
### check/transform CRS
if (!identical(terra::crs(targetLocations, proj = TRUE), terra::crs(simulationData[[1]], proj = TRUE))) {
targetLocations <- terra::project(targetLocations, terra::crs(simulationData[[1]], proj = TRUE))
cat("Coordinate reference systems did not match. Conversion has been performed internally but may be incorrect (and result in NAs). \n")
}
terra::plot(simulationData[[1]][[1]], main = "Target locations used", axes = FALSE, legend = FALSE)
terra::plot(targetLocations, add = TRUE)
cat("Extracting simulation data for each target location. This may take a while... \n")
extracted.int <- lapply(simulationData, function(x) { # takes a LONG time
x.df <- data.frame(t(terra::extract(x, y = targetLocations, fun = func, na.rm = TRUE)))
x.df <- x.df[!grepl(x = row.names(x.df), pattern = 'ID'), ] # noticed an ID row is appearing as data
x.df
})
### get start date from simulation data
sim_dates <- gsub(x = rownames(extracted.int[[1]]), pattern = "X|\\.", replacement = "" )
# extracted.int <- lapply(simulationData, function(x) { # takes a LONG time
# x.df <- t(raster::extract(x, y = targetLocations, fun = func, na.rm = TRUE))
# x.df
# })
# extracted.int <- lapply(extracted.int, as.data.frame)
sim_means_locs <- mapply(cbind, extracted.int, "simulation" = 1:length(simulationData), SIMPLIFY = FALSE) # add an ID column to each element in the list
traces_locs <- do.call(rbind, sim_means_locs)
### first version:
# traces_locs$date <- as.Date(beginDate, format = "%Y%m%d") + 1:raster::nlayers(simulationData[[1]])
# dates.int <- as.Date(strptime(as.character(beginDate), format = "%Y%m%d"))
### 2nd version:
# traces_locs$date <- format(seq.Date(from = dates.int, to = dates.int + raster::nlayers(simulationData[[1]]), by = "day"), "%Y%m%d")
traces_locs$date <- as.Date(strptime(as.character(sim_dates), format = "%Y%m%d"))
traces_locs <- traces_locs[!is.na(traces_locs$date), ] # using terra introduced some artifacts
if (length(targetLocations) == 1) { # otherwise, name will appear as "t.raster..extract.x..y...targetLocations..fun...func..na.rm...TRUE.."
names(traces_locs)[1] <- "X1"
}
if (nrow(traces_locs) > nrow(expand.grid(unique(traces_locs$simulation), unique(traces_locs$date)))) {
### duplicate dates were observed in the 20220501 data (November 6 2022 in the first simulation)
### the code below keeps the first occurrence of each simulation-date combination and discards the rest
### issue resolved by changing the date column from POSIXlt to Date class
no_dups <- !duplicated(paste0(traces_locs$simulation, "-", traces_locs$date))
traces_locs <- traces_locs[no_dups, ]
cat(sum(!no_dups), " duplicated simulation-date combinations removed\n")
}
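### reshape from semi-wide form (one column per location) to long form: one row per location-simulation-date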
traces_locs_long <- stats::reshape(traces_locs,
direction = "long",
varying = list(names(traces_locs)[1:(ncol(traces_locs) - 2)]),
v.names = "value",
timevar = "roi", # change to "roi" (requires updating package data (polyDat))
idvar = c("simulation", "date"))
newDF <- data.frame(roi = 1:length(targetLocationNames), name = targetLocationNames) # change "time" to "roi"
traces_locs_long$name <- newDF$name[match(traces_locs_long$roi, newDF$roi)] # change "time" to "roi"
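### build a feature dataframe for each target location: rows = simulations, columns = days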
featureDat <- list()
for (i in 1:length(targetLocations)) {
featureDat[[i]] <- data.frame(t(sapply(extracted.int, '[[', i))) # transpose, so that each day is a column
featureDat[[i]] <- featureDat[[i]][, -1] # remove day 1; all values are the same (all simulations have the same starting point)
}
if (remove_duplicate_data) {
### if there's really only one region of interest, remove duplicated data
extracted.int <- lapply(X = extracted.int, FUN = function(x) {x <- data.frame(X1 = x[, -c(ncol(x))])}) # one column per location, so want to remove ncol(x) for each dataframe
featureDat <- featureDat[[1]] # only one ROI, so only first list is needed
traces_locs <- traces_locs[, !grepl(x = names(traces_locs), pattern = 'X2')]
no_dups <- !duplicated(paste0(traces_locs_long$simulation, "-", traces_locs_long$date, "-", traces_locs_long$name))
traces_locs_long <- traces_locs_long[no_dups, ]
}
### return all useful objects in a list
list(listOfSims = extracted.int, # a list with one dataframe per simulation, columns = locations, rows = days
listOfLocations = featureDat, # a list with a dataframe for each IR: rows = 100 simulations, columns = 182 days (Day 1 is removed)
traceDataWide = traces_locs, # a single dataframe, semi-wide form, with one row per simulation-date
traceDataLong = traces_locs_long) # a single dataframe, long form, with one row per location-simulation-date
}