validation/sampling_for_model_comparison.R

# Validation based on random Sampling of 20% validation set  ----
fx_fieldsView(noaa_points_sf %>% st_drop_geometry(),
              output_path = rds_output_path,
              "00",
              "noaa_points_sf")

fx_fieldsView(noaa_stations_climate_dt %>% st_drop_geometry(),
              output_path = rds_output_path,
              "02",
              "noaa_stations_climate_dt")

fx_fieldsView(EHE_ECE_compiled,
              output_path = rds_output_path,
              "03",
              "EHE_ECE_compiled")

length(unique(noaa_stations_climate_dt$shape_id))
length(unique(noaa_stations_climate_dt$DATE))

noaa_stations_n <- length(unique(noaa_stations_climate_dt$shape_id))
noaa_stations_list <- as.list(unique(noaa_stations_climate_dt$shape_id))

noaa_points_basic_sf <- noaa_points_sf[c("geoid", "shape_id",
                                                    "Station_Name")]

fx_fieldsView(noaa_points_basic_sf %>% st_drop_geometry(),
              output_path = rds_output_path,
              "01",
              "noaa_points_basic_sf")

fx_saveSHP(noaa_points_basic_sf, folder = rds_output_path,
           shpname = "noaa_points_basic_sf",
           subfolder = "noaa_stations_geo")

# All voronoi for comparison
all_stations_voronoi_dtsf <- fx_VoronoiPolygons_fromSF(noaa_points_basic_sf)

ca_voronoi_all_stations_dtsf <- st_intersection(st_cast(all_stations_voronoi_dtsf),
                                                st_union(CA_counties_sf))

fx_saveSHP(ca_voronoi_all_stations_dtsf, folder = rds_output_path,
           shpname = "ca_all_stations_voronoi",
           prefix = "",
           subfolder = "ca_all_stations_voronoi")
gc()
## Sampling ----
library(doParallel)
cores <- detectCores() - 1
cl <- makeCluster(cores)
registerDoParallel(cl)
print(cl)

foreach(counter=1:iteration_n, .verbose = TRUE) %dopar% {
  source(here::here("R", "libraries.R"), local=T)
  source(here::here("R", "helper_functions.R"), local=T)
  source(here::here("validation", "runtime_conf.R"), local=T)

  theme_set(new)

#for(counter in 1:iteration_n){
  dir.create(file.path(output_path,
                       paste0("sample_", counter)))
  sample_output_path <- file.path(output_path,
                        paste0("sample_", counter))

  dir.create(file.path(sample_output_path,
                       "Viz_Nearest_Station"))
  vis_output_path <- file.path(sample_output_path,
                               "Viz_Nearest_Station")
### sample size definition ----
smaple_size <- noaa_stations_n * sampling_ratio

sampled_noaa_stations_80 <- sample(noaa_stations_list, smaple_size)
glimpse(noaa_points_sf)
### extract dataframes of sampled/excluded dataframes accordingly ----
test_stations_points_dtsf <- noaa_points_basic_sf %>% filter(.,
                                                             shape_id %in% sampled_noaa_stations_80)

validation_stations_points_dtsf <- noaa_points_basic_sf %>% filter(.,
                                                                   !shape_id %in% sampled_noaa_stations_80)

validation_stations_points_sp <- as_Spatial(validation_stations_points_dtsf,
                                cast = TRUE)

validation_stations_points_df <- data.frame(validation_stations_points_sp)

test_stations_climate_dtsf <- noaa_stations_climate_dt %>% filter(.,
                              shape_id %in% sampled_noaa_stations_80)

validation_stations_climate_dtsf <- noaa_stations_climate_dt %>% filter(.,
                                    !shape_id %in% sampled_noaa_stations_80)

length(unique(test_stations_climate_dtsf$Station_Name))
length(unique(test_stations_climate_dtsf$DATE))

#glimpse(noaa_stations_climate_dt)
#plot(test_stations_points_dtsf[1])
#plot(validation_stations_points_dtsf[1])

# Create Voronoi polygons ----
voronoi_test_stations_points_dtsf <- fx_VoronoiPolygons_fromSF(test_stations_points_dtsf)
#glimpse(voronoi_test_stations_points_dtsf)
#plot(voronoi_test_stations_points_dtsf[1])

ca_test_set_voronoi_dtsf <- st_intersection(st_cast(voronoi_test_stations_points_dtsf),
                                                 st_union(CA_counties_sf))

#glimpse(ca_test_set_voronoi_dtsf)
#plot(ca_test_set_voronoi_dtsf[1])

# Join the nearest stations records
validation_set_interim <- st_join(validation_stations_climate_dtsf,
                                  ca_test_set_voronoi_dtsf["shape_id"],
                             join = st_within, suffix = c("","_Nearest_Station"), left = TRUE)

validation_set_climate_data <- merge(validation_set_interim,
                           noaa_stations_climate_dt %>% st_drop_geometry(),
                           all.x = TRUE,
                           by.x=c("DATE","shape_id_Nearest_Station"), # key columns to join to the data table
                           by.y=c("DATE","shape_id"), # original table key columns
                           suffix = c("","_Nearest"))

fx_saveCSV(validation_stations_points_dtsf,
           sample_output_path,
           prefix = paste0("sample_", counter, "_"),
           suffix = "")

# fx_fieldsView(validation_stations_climate_dtsf %>% st_drop_geometry(),
#               output_path = rds_output_path,
#               paste0("04_sample_", counter),
#               "validation_stations_climate_dtsf")

fx_saveSHP(ca_test_set_voronoi_dtsf, folder = sample_output_path,
           shpname = "ca_test_set_voronoi",
           prefix = "", suffix = paste0("_sample_", counter),
           subfolder = "ca_test_set_voronoi"
)

fx_saveSHP(validation_stations_points_dtsf, folder = sample_output_path,
           prefix = "",
           shpname = paste0("validation_set_stations_sample_", counter),
           subfolder = paste0("validation_set_sample_", counter)
)

validation_set_climate_data_diff <- validation_set_climate_data %>%
  mutate(Delta_AT_nearest =
           round(dm_apparent_temperature - dm_apparent_temperature_Nearest, 1))

# Apply spatial models over the selected samples

# Estimate interpolated surface ----

selected_day <- start_date
W <- as_Spatial(CA_counties_sf,
                cast = TRUE)

# Loop ----
## Create empty DF to collect and compile daily outputs ----
CA_blocks_joined_EHE <- NULL
CA_AT_Boundaries <- NULL
CA_AT_Vectorized_Grid <- NULL
CA_AT_Contours <- NULL
idw_estimates_collector <- NULL
i <- 1
loop_length <- time_length(end_date - start_date + 1, unit = "day")

test_stations_climate_valid <- test_stations_climate_dtsf[c("DATE", "shape_id",
                                                            "dm_apparent_temperature")] %>%
  filter(!is.na(dm_apparent_temperature)) %>%
  filter(DATE >= start_date & DATE <= end_date)

test_stations_sp <- as_Spatial(test_stations_climate_valid,
                               cast = TRUE)

# Replace point boundary extent with that of the state
test_stations_sp@bbox <- W@bbox

#length(unique(test_stations_sp@data$DATE))
length(unique(test_stations_sp@data$dm_apparent_temperature))
# Create an empty grid where n is the total number of cells
grd              <- as.data.frame(spsample(test_stations_sp, "regular",
                                           n = number_of_grid_cells))

names(grd)       <- c("X", "Y")
coordinates(grd) <- c("X", "Y")
gridded(grd)     <- TRUE  # Create SpatialPixel object
fullgrid(grd)    <- TRUE  # Create SpatialGrid object

# Add projection information to the empty grid
proj4string(test_stations_sp) <- proj4string(test_stations_sp)
proj4string(grd) <- proj4string(test_stations_sp)

gc()
while (selected_day <= end_date)
{
  #if (selected_day %in% AT_DATES[[2]])
  #{
  # Convert rasters to dataframes
  # Convert sf_points to dataframes with explicit coordinates

  test_stations_climate_daily <- test_stations_climate_dtsf[c("DATE", "shape_id",
                                                              "dm_apparent_temperature")] %>%
    filter(!is.na(dm_apparent_temperature)) %>%
    filter(DATE == selected_day)

  test_stations_sp <- as_Spatial(test_stations_climate_daily,
                                 cast = TRUE)

  test_stations_df <- data.frame(test_stations_sp)

  # Create an empty DF for parsing daily estimates from idw
  idw_estimates_collector_day <- validation_stations_points_dtsf["shape_id"] %>% st_drop_geometry()
  idw_estimates_collector_day$DATE <- selected_day

  # Create a voronoi layer to store and visualize estimates from the nearest station
  ca_test_set_voronoi_compiled_dtsf <-
    merge(ca_test_set_voronoi_dtsf,
          test_stations_climate_daily %>% st_drop_geometry(),
          all.x = TRUE,
          by.x=c("shape_id"), # key columns to join to the data table
          by.y=c("shape_id"), # original table key columns
          suffix = c("","_Nearest"))

  print(selected_day)
  print("interpolating temperature surface")

  #Interpolate the grid cells using a power value of 2 (idp=2.0) ----
  interpolated_idw <- gstat::idw(dm_apparent_temperature ~ 1,
                                 test_stations_sp,
                                 nmax=5,#
                                 newdata=grd,
                                 idp = idw)

  # Convert the surface into raster object and clip it to the Window of Analysis
  interpolated_idw_raster   <-   raster(interpolated_idw)

  interpolated_idw_raster_masked   <-   mask(interpolated_idw_raster, W)

  # ## Convert raster into SP and SF objects for visualization ----
  # interpolated_idw_raster_masked_sp <- as(interpolated_idw_raster_masked,'SpatialPolygonsDataFrame')
  # interpolated_idw_raster_masked_sf <- st_as_sf(interpolated_idw_raster_masked_sp,
  #                                               crs = spatial_projection) %>%
  #   rename_at(vars(contains("var1.pred")), ~"Estimated_Level") %>%
  #   mutate(DATE = as.Date(selected_day, "%a, %d %b %Y")) %>%
  #   mutate(Estimated_Level = round(Estimated_Level, 2))
  #
  # st_crs(interpolated_idw_raster_masked_sf) <- spatial_projection
  # print(st_crs(interpolated_idw_raster_masked_sf)$epsg)


  # ## Compile the selected day DF ----
  # AT_Vectorized_Grid_daily <- interpolated_idw_raster_masked_sf %>%
  #   #filter(Estimated_Level > AT_threshold) %>%
  #   mutate(computed_area = round(as.numeric(st_area(.)), 1)) %>%
  #   mutate(area_hectare = round((computed_area / 10000), 2)) %>%
  #   mutate(DATE = selected_day)
  #
  #
  # AT_Boundaries_daily <- AT_Vectorized_Grid_daily %>%
  #   st_union(.) %>% as_tibble() %>% st_as_sf() %>%
  #   #mutate(event_type = "EHE") %>%
  #   mutate(Estimated_Level = 1) %>%
  #   mutate(computed_area = round(as.numeric(st_area(.)), 1)) %>%
  #   mutate(area_hectare = round((computed_area / 10000), 2)) %>%
  #   mutate(DATE = selected_day)
  #
  #
  # blocks_joined_AT_daily <- st_join(
  #   CA_blocks_compiled,
  #   AT_Boundaries_daily,
  #
  #   join = st_intersects,
  #   suffix = c("", "_GIS"),
  #   left = FALSE) %>%
  #   mutate(DATE = as.Date(selected_day, "%a, %d %b %Y")) %>%
  #   st_as_sf() %>% as_tibble() %>% st_as_sf()


  # Resolve the issue with the spatially too small contours
  tryCatch({
    r2c <- rasterToContour(interpolated_idw_raster_masked, nlevels = 10)

  }, error = function(e){
    print(selected_day)
    r2c <<- r2c[0,]

    cat("ERROR :",conditionMessage(e), "\n")
    print(r2c)
    return(r2c)
  })

  AT_Contours_daily <- st_as_sf(r2c,
                                crs = spatial_projection) %>%
    mutate(DATE = selected_day)

  st_crs(AT_Contours_daily) <- spatial_projection
  #print(st_crs(AT_Contours_daily)$epsg)


  ## Visualization of the estimated AT based on the nearest station ----
  nearest_AT_plot <- ggplot() +
    geom_sf(data = ca_test_set_voronoi_compiled_dtsf,
            mapping = aes(fill = dm_apparent_temperature),
            colour = NA) +
    scale_fill_gradientn(colors = c("white",
                                    "lightblue",
                                    "yellow",
                                    "red", "brown")) +

    geom_sf(data = CA_counties_sf,
            fill = NA,
            color = "Black",
            lwd = .5
    ) +

    geom_point(data = validation_stations_points_df,
               color = "Blue", size = 2, shape = 22,
               aes(x=coords.x1, y=coords.x2)
    ) +

    geom_sf(data = test_stations_climate_dtsf,
            color = "Black", size = 1.5) +
    #scale_shape_manual(values = 23) +

    labs(title = paste0("California State (estimated temperature based on the nearest ",
                        #nrow(test_stations_df),
                        " test set stations)"
    )) +
    labs(subtitle = paste0(
      "Date: ",
      selected_day))

  plot_file_name <- paste0(vis_output_path, "/",
                           "Estimated_AT",
                           "_", #counter, "_",
                           selected_day,
                           ".jpg")
  ggsave(plot_file_name,
         plot = nearest_AT_plot,
         dpi = 300,
         width = 16, height = 22, units = "cm")


  ### Compile the iteration outputs ----
  #### Vector format
  # CA_AT_Vectorized_Grid <- rbind(CA_AT_Vectorized_Grid,
  #                                AT_Vectorized_Grid_daily)

  CA_AT_Contours <- rbind(CA_AT_Contours,
                          AT_Contours_daily)

  # CA_blocks_joined_EHE <- rbind(CA_blocks_joined_EHE,
  #                               blocks_joined_AT_daily)

  #### Raster format
  rasValue = round(extract(interpolated_idw_raster_masked,
                           validation_stations_points_dtsf), 2)

  idw_estimates_collector_day$dm_apparent_temperature_IDW <- rasValue
  idw_estimates_collector <- rbind(idw_estimates_collector,
                                   idw_estimates_collector_day)

  #### Index stacked Raster based on the reformatted date string
  raster_index <- paste0("X", format(selected_day, "%Y%m%d"))
  names(interpolated_idw_raster_masked) <- raster_index

  if (i < 2) { # apply to the initial round only
    stacked_interpolated_idw <- interpolated_idw_raster_masked
    cumulative_interpolated_idw <- interpolated_idw_raster_masked

    print(i)
  } else { # for the second round and afterwards
    print(i)
    ### Stacking the daily raster outputs ----
    stacked_interpolated_idw <- stack(stacked_interpolated_idw,
                                      projectRaster(
                                        interpolated_idw_raster_masked,
                                        stacked_interpolated_idw,
                                        #method = 'bilinear'
                                        method = 'ngb'
                                      ))

    # cumulative_interpolated_idw <- cumulative_interpolated_idw +
    #   projectRaster(
    #     interpolated_idw_raster_masked,
    #     stacked_interpolated_idw,
    #     #method = 'bilinear'
    #     method = 'ngb'
    #   )
  }
  i <- i + 1
  print("stacking interpolated surface")
  print(selected_day)

  cat(paste0(scales::percent(i/loop_length),
             " Completed", "\n\n"))

  selected_day <- selected_day + 1
gc()
} #

### IDW Error estimation ------
validation_set_climate_data_diff_2_interim <- merge(validation_set_climate_data_diff,
                                                    idw_estimates_collector,
                                                    all.x = TRUE,
                                                    by.x=c("DATE","shape_id"), # key columns to join to the data table
                                                    by.y=c("DATE","shape_id"), # original table key columns
                                                    suffix = c("","_IDW"))

validation_set_climate_data_diff_2 <- validation_set_climate_data_diff_2_interim %>%
  mutate(Delta_AT_idw =
           round(dm_apparent_temperature - dm_apparent_temperature_IDW, 1))

validation_results <- validation_set_climate_data_diff_2 %>%
  dplyr::select(!starts_with("EH") &
                  !starts_with("EC") &
                  !starts_with("x")
  ) %>%
  mutate(Sample_ID = paste0("sample_", counter)) %>%
  st_drop_geometry()

fx_saveCSV(validation_results, sample_output_path,
           prefix = paste0("sample_", counter, "_"),
           suffix = "")

fx_saveCSV(validation_results, rds_output_path,
           prefix = paste0("sample_", counter, "_"),
           suffix = "")

fx_appendCSV(validation_results, output_path)

fx_fieldsView(validation_results,
              output_path = sample_output_path,
              "09",
              "validation_results")

ObjSave(
  CA_AT_Contours,
  validation_set_climate_data_diff_2,
  validation_results,

  folder = sample_output_path)

###  Visualize the cumulative raster ----
# compute the average interpolated surface
# cumulative_interpolated_idw <- cumulative_interpolated_idw / loop_length
#
# class(cumulative_interpolated_idw)
# nlayers(cumulative_interpolated_idw)
# dim(cumulative_interpolated_idw)
# ##```
#
# ##```{r}
# str(cumulative_interpolated_idw@data)
# #glimpse(cumulative_interpolated_idw@data)
#
# # cumulative_interpolated_idw <- readRDS(here(input_dir,
# #                                  "ehe_cumulative_interpolated_idw.rds"))[[1]]
# length(cumulative_interpolated_idw[cumulative_interpolated_idw])
# length(cumulative_interpolated_idw[cumulative_interpolated_idw > 1])
# length(cumulative_interpolated_idw[cumulative_interpolated_idw < 1])
# length(cumulative_interpolated_idw[cumulative_interpolated_idw == 1])
# cumulative_AT_coverage <- 100 * round(
#   (length(cumulative_interpolated_idw[cumulative_interpolated_idw > 1]) / length(cumulative_interpolated_idw[cumulative_interpolated_idw]))
#   ,4)
#
# plot_file_name <- paste0(sample_output_path, "/_AT_",
#                          "00_cumulative_interpolated_idw_",
#                          cumulative_AT_coverage,
#                          "_percent.jpg")
#
# jpeg(plot_file_name, width = 880, height = 1200)
#
# print(plot(cumulative_interpolated_idw))
#
# dev.off()
#
# class(cumulative_interpolated_idw)
# plot_file_name <- paste0(sample_output_path, "/_AT_",
#                          "01_cumulative_interpolated_idw_",
#                          cumulative_AT_coverage,
#                          "_percent.jpg")
#
# jpeg(plot_file_name, width = 880, height = 1200)
#
# print(spplot(cumulative_interpolated_idw))
#
# dev.off()
# ##```
#
# ### Export spatial objects ----
# writeRaster(cumulative_interpolated_idw, paste0(sample_output_path, "/_AT_",
#                                                 "01_cumulative_interpolated_idw"), overwrite=TRUE)

writeRaster(stacked_interpolated_idw, paste0(sample_output_path, "/_AT_",
                                             "02_stacked_interpolated_idw"), overwrite=TRUE)

stacked_interpolated_idw_rb <- brick(stacked_interpolated_idw)
writeRaster(stacked_interpolated_idw_rb, paste0(sample_output_path, "/_AT_",
                                                "03_stacked_interpolated_idw_rb"), overwrite=TRUE)

ObjSave(
  #cumulative_interpolated_idw,
  stacked_interpolated_idw,
  stacked_interpolated_idw_rb,

  folder = sample_output_path)

gc()
source(here::here("validation", "idw_model_visualization.R"), local=T)

#} # CLOSE SAMPLING LOOP (serial)
} # CLOSE SAMPLING LOOP (parallel)

### closing parallel computing
print(showConnections())

print("Closing")
print(stopCluster(cl))
print(rm(cl))
gc()

print("Check")
print(showConnections())
epedram/ehmi documentation built on Sept. 25, 2023, 12:30 a.m.