prepare_climate_analytics_data.R

library(r2dii.physical.risk)
library(dplyr) # for %>% and the unprefixed dplyr verbs used below
library(sf) # for the unprefixed st_*() helpers used below

# =================================
# load distinct_geo_data, which will be used to subset the raw climate data
# =================================

distinct_geo_data <- r2dii.physical.risk:::load_distinct_geo_data()
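
# distinct_geo_data is expected to be an sf object with one point per distinct
# asset location (at least asset_id plus a point geometry); it is spatially
# joined to the climate raster cells further below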

# =================================
# load list of all country ISO codes
# =================================

all_countries <- rworldmap::countryRegions %>%
  dplyr::transmute(iso = ISO3) %>%
  dplyr::pull(iso)

# =================================
# set climate analytics parameters
# =================================

# cross parameters
api_paramter <- tidyr::crossing(
  region = c(
    # all_countries, # explodes number of parameters but single regions have higher resolution
    "AFRICA",
    "EUROPE",
    "NORTH_AMERICA",
    "SOUTH_AMERICA",
    "ASIA",
    "OCEANIA",
    "ATA" # antarctica
  ),
  indicator = c(
    "tasAdjust", # air temperature
    "tasminAdjust", # daily minimum air temperature
    "tasmaxAdjust", # daily maximum air temperature
    "prAdjust", # precipitation
    "hursAdjust", # relative humidity
    "prsnAdjust", # snowfall
    "hussAdjust", # specific humidity
    "sfcWindAdjust", # wind speed
    "ec4", # 1-in-100-year expected damage from tropical cyclones
    "ec2", # annual expected damage from river floods
    "ec3", # annual expected damage from tropical cyclones
    "ec1", # labour productivity due to heat stress
    "lec", # land fraction annually exposed to crop failures
    "leh", # land fraction annually exposed to heat waves
    "fldfrc", # land fraction annually exposed to river floods
    "lew", # land fraction annually exposed to wild fires,
    "flddph", # river flood depth
    "maxdis", # maximum daily river discharge
    "mindis", # minimum daily river discharge
    "dis", # river discharge
    "qs" # surface runoff
  ),
  scenario = c(
    "cat_current", # cat current policies
    "h_cpol", # NGFS current policies
    "d_delfrag", # NGFS delayed 2°
    "o_1p5c", # NGFS net zero
    "rcp26",
    "rcp45",
    "rcp60",
    "rcp85"
  ),
  year = c(
    2030,
    2050,
    2100
  )
)
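
# tidyr::crossing() builds every combination of the vectors above:
# 7 regions x 21 indicators x 8 scenarios x 3 years = 3528 parameter rows (one download link each)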

# create links
api_paramter <- api_paramter %>%
  dplyr::mutate(
    link = paste0(
      "https://cie-api.climateanalytics.org/api/map-difference/?iso=",
      region,
      "&var=",
      indicator,
      "&season=annual&format=csv&scenarios=",
      scenario,
      "&years=",
      year
    )
  )
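
# each row of api_paramter now holds a ready-to-download URL, for example:
# https://cie-api.climateanalytics.org/api/map-difference/?iso=AFRICA&var=tasAdjust&season=annual&format=csv&scenarios=rcp45&years=2030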

# =================================
# set up for loop over indicators
# =================================

# define starting variables + data frame
processed_links <- 0
processed_indicators <- 0
all_duration <- NA

for (sub_indicator in unique(api_paramter$indicator)) {

  # record the start time to estimate how much time is needed to load the data
  start_time <- Sys.time()

  # subset the links by indicator
  api_paramter_sub <- api_paramter %>%
    filter(indicator == sub_indicator)

  # create target data frame (placeholder row of NAs, dropped again after the loop)
  all_data <- data.frame(
    region = NA_character_,
    indicator = NA_character_,
    scenario = NA_character_,
    year = NA_integer_,
    link = NA_character_,
    variable_id = NA_character_,
    country = NA_character_,
    warming_level = NA_character_,
    reference = NA_character_,
    difference = NA_character_,
    gcm_gim_combinations = NA_character_,
    all_used_runs = NA_character_,
    lat = NA_character_,
    long = NA_character_,
    risk_level = NA_character_
  )

  # create lat and long start vectors (used to determine the raster cell size later)
  all_lats <- NA_integer_
  all_longs <- NA_integer_

  # count processed indicators
  processed_indicators <- processed_indicators + 1

  # =================================
  # download all links for one indicator with a for loop
  # =================================

  # scrape data
  for (sub_link in api_paramter_sub$link) {

    # count how many links have been processed so far
    processed_links <- processed_links + 1

    # message which link is currently processed
    message(paste0("Processing link Nr.", processed_links, " of ", nrow(api_paramter)))
    print(sub_link)

    # filter api_paramter_sub to get the relevant parameters for each link
    api_paramter_sub_sub <- api_paramter_sub %>%
      dplyr::filter(link == sub_link)

    # download data
    data <- vroom::vroom(sub_link, delim = ",")

    # in case a set of parameters is not available, the downloaded data will only have one column
    # --> only proceed with data wrangling if set of parameters is available
    if (ncol(data) == 2) {

      # as names of downloaded data vary based on parameters, create agnostic variables
      names(data) <- c("v1", "v2")

      # slice rows which contain parameter information
      summary <- data %>%
        dplyr::slice(c(1:7))

      # prep summary of parameter information
      summary <- summary %>%
        tidyr::pivot_wider(id_cols = "v2", names_from = "v1", values_from = "v2") %>%
        janitor::clean_names(case = "snake")
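
      # `summary` is now a one-row tibble whose columns hold the metadata of this
      # download, e.g. variable_id, country, warming_level, reference, difference,
      # gcm_gim_combinations and all_used_runs (see the target data frame above)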

      # drop the rows which contain parameter information
      data <- data %>%
        slice(-c(1:7))

      # slice the row which contains the longitudes
      index <- data %>%
        dplyr::select(v2) %>%
        dplyr::slice(1)

      # count the number of longitudes
      index <- stringr::str_count(index$v2, ",")

      # add 1 to the comma count, as there is always one comma less than the number of longitudes
      index <- index + 1
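
      # e.g. a longitude row such as "10.25,10.75,11.25" contains 2 commas,
      # i.e. 2 + 1 = 3 longitude columns to separate into below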

      # split the comma-separated values into one column per longitude
      data <- data %>%
        tidyr::separate(v2, into = as.character(c(1:index)), sep = ",")

      # use the longitudes as the column names
      colnames(data) <- data %>%
        dplyr::slice(1)

      # drop the row which contains the longitudes
      data <- data %>%
        dplyr::slice(-c(1))

      # rename latitude
      data <- data %>%
        dplyr::rename(lat = `lat \\ lon`)

      # pivot longer
      data <- data %>%
        tidyr::pivot_longer(cols = !lat, values_to = "risk_level", names_to = "long")
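
      # data is now in long format: one row per (lat, long) grid cell with its risk_level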

      # collect all unique lats and longs (sorted)
      all_lats <- unique(c(all_lats, as.numeric(data$lat))) %>% sort()
      all_longs <- unique(c(all_longs, as.numeric(data$long))) %>% sort()


      # drop coordinates without risk data (countries are not rectangular, so empty cells are expected)
      data <- data %>%
        dplyr::filter(risk_level != "")

      # add summary and parameter information to data
      data <- dplyr::bind_cols(
        api_paramter_sub_sub,
        summary,
        data
      )

      # bind data from previous loops to all data
      all_data <- rbind.data.frame(
        all_data,
        data
      )
    }

    # ensure that there is always a small break between downloads
    Sys.sleep(1)
  }

  # drop the placeholder starting row of the target data frame
  all_data <- all_data %>%
    filter(!is.na(link))

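  # compute the median grid spacing in lat / long; this is used further below as
  # the raster cell size when building square polygons around each centroid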
  diff_lats <- all_lats[-1] - all_lats[-length(all_lats)]
  median_diff_lats <- median(diff_lats, na.rm = TRUE)
  cat(crayon::red("Using", median_diff_lats, "as median lat", "\n"))

  diff_longs <- all_longs[-1] - all_longs[-length(all_longs)]
  median_diff_longs <- median(diff_longs, na.rm = TRUE)
  cat(crayon::red("Using", median_diff_longs, "as median long", "\n"))


  # only proceed if any data was downloaded for this indicator
  if (nrow(all_data) > 0) {
    all_data <- all_data %>%
      mutate(
        scenario = case_when(
          scenario == "cat_current" ~ "cat_current_policies",
          scenario == "h_cpol" ~ "ngfs_current_policies",
          scenario == "d_delfrag" ~ "ngfs_delayed_2_degrees",
          scenario == "o_1p5c" ~ "ngfs_net_zero",
          TRUE ~ scenario
        ),
        model = "Ensemble", # to many models -> just say ensemble
        provider = "ClimateAnalytics"
      ) %>%
      rename(
        hazard = indicator,
        period = year
      )

    # transform risk level from percentages to decimals (100% -> 1)
    all_data <- all_data %>%
      dplyr::mutate(risk_level = as.numeric(risk_level) / 100)

    # apply distinct to eliminate duplicate entries for the same centroids used at the borders of two regions / countries
    all_data <- all_data %>%
      distinct(scenario, hazard, period, long, lat, .keep_all = TRUE)

    # duplicate long / lat columns, as sf::st_as_sf() consumes the coordinate columns it builds the points from
    all_data <- all_data %>%
      mutate(
        duplicate_long = long,
        duplicate_lat = lat
      )

    # create sf coordinates / points
    all_data <- all_data %>%
      sf::st_as_sf(coords = c("duplicate_long", "duplicate_lat"))

    # hash geometry
    all_data <- all_data %>%
      dplyr::mutate(geometry_id = openssl::md5(as.character(geometry)))
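
    # the md5 hash of the geometry text gives a stable geometry_id per coordinate
    # pair, which is used below as the join key between assets and climate cells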

    # create a distinct table of coordinates and their geometry ids
    all_data_distinct_geo_data <- all_data %>%
      sf::st_drop_geometry() %>%
      distinct(long, lat, geometry_id) %>%
      as_tibble()

    # convert coordinates to numeric
    all_data_distinct_geo_data <- all_data_distinct_geo_data %>%
      mutate(
        long = as.numeric(long),
        lat = as.numeric(lat)
      )

    # calculate square cell coordinates following the Climate Analytics convention:
    # the reported centroid is used as the lower left hand corner of the cell
    # (double-checked against their website and replicated their "squares")
    all_data_distinct_geo_data <- all_data_distinct_geo_data %>%
      mutate(
        geometry_id = as.character(geometry_id),
        north_lat = lat + median_diff_lats,
        south_lat = lat,
        east_lng = long + median_diff_longs,
        west_lng = long
        # north_lat = lat + median_diff_lats/2,
        # south_lat = lat - median_diff_lats/2,
        # east_lng = long + median_diff_longs/2,
        # west_lng = long - median_diff_longs/2
      ) %>%
      as.data.frame()
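
    # illustration with hypothetical values: a centroid at long = 10, lat = 45 and a
    # median grid spacing of 0.5 yields the cell
    # south_lat = 45, north_lat = 45.5, west_lng = 10, east_lng = 10.5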


    # build polygons based on calculated "corners"
    all_data_distinct_geo_data_polygons <- lapply(
      1:nrow(all_data_distinct_geo_data), function(x) {
        ## create a matrix of coordinates that also 'close' the polygon
        res <- matrix(
          c(
            all_data_distinct_geo_data[x, "west_lng"], all_data_distinct_geo_data[x, "north_lat"],
            all_data_distinct_geo_data[x, "east_lng"], all_data_distinct_geo_data[x, "north_lat"],
            all_data_distinct_geo_data[x, "east_lng"], all_data_distinct_geo_data[x, "south_lat"],
            all_data_distinct_geo_data[x, "west_lng"], all_data_distinct_geo_data[x, "south_lat"],
            all_data_distinct_geo_data[x, "west_lng"], all_data_distinct_geo_data[x, "north_lat"] ## repeat the first point to close the polygon
          ),
          ncol = 2, byrow = TRUE
        )
        ## create polygon objects
        st_polygon(list(res))
      }
    )

    # combine the coordinate table with the polygon geometries into an sf object
    all_data_distinct_geo_data <- st_sf(
      all_data_distinct_geo_data,
      st_sfc(all_data_distinct_geo_data_polygons),
      crs = 4326
    )

    # spatially join asset locations to the climate raster cells
    asset_scenario_data <- sf::st_join(distinct_geo_data, all_data_distinct_geo_data)

    # keep only assets which fall into a raster cell with climate data
    asset_scenario_data <- asset_scenario_data %>%
      filter(!is.na(geometry_id))

    # asset point geometries
    asset_geometry <- asset_scenario_data %>%
      select(geometry)

    # raster centroid geometries, rebuilt from the joined long / lat columns
    scenario_geometry <- asset_scenario_data %>%
      sf::st_drop_geometry() %>%
      sf::st_as_sf(coords = c("long", "lat")) %>%
      select(geometry)

    sf::st_crs(scenario_geometry) <- 4326

    # distance between each asset and its matched raster centroid
    # (by_element = TRUE returns one distance per row instead of a full distance matrix)
    asset_scenario_data$dist <- sf::st_distance(
      asset_geometry,
      scenario_geometry,
      by_element = TRUE
    )

    # keep only the asset-to-cell mapping
    asset_scenario_data <- asset_scenario_data %>%
      sf::st_drop_geometry() %>%
      select(asset_id, geometry_id)

    # attach the climate data of each raster cell to the assets it contains
    asset_scenario_data <- asset_scenario_data %>%
      left_join(
        all_data %>%
          mutate(geometry_id = as.character(geometry_id)),
        by = c("geometry_id")
      )

    # shape the data into the final output schema
    asset_scenario_data <- asset_scenario_data %>%
      transmute(
        asset_id,
        provider,
        hazard,
        scenario,
        period,
        is_reference_period = FALSE,
        model,
        relative_change = risk_level, # TODO: maybe change name of the variable in the beginning already
        risk_level = NA,
        reference = NA,
        absolute_change = NA,
        geometry_id
      )


    # save data
    asset_scenario_data %>%
      r2dii.physical.risk:::save_climate_data(
        path_db_pr_climate_data = path_db_pr_climate_data,
        use_distinct_for_assets_between_two_rasters = TRUE,
        drop_any_NAs = FALSE
      )

    # save all downloaded raw data
    all_data %>%
      r2dii.physical.risk:::for_loops_climate_data(
        parent_path = fs::path(path_db_pr_climate_data_raw),
        fns = function(data, final_path) {
          readr::write_csv(
            data,
            fs::path(
              final_path,
              paste(scenario_sub, hazard_sub, model_sub, period_sub, sep = "_"),
              ext = "csv"
              )
          )
        }
      )

    # determine the end time
    end_time <- Sys.time()

    # calculate how long it took
    duration <- as.double(end_time) - as.double(start_time)
    all_duration <- c(all_duration, duration)

    # estimate how many minutes are left based on current processing time
    minutes_left <- round((length(unique(api_paramter$indicator)) - processed_indicators) * mean(all_duration, na.rm = TRUE) / 60, 0)

    # print estimated processing time which is left
    message("Roughly ", minutes_left, " minutes left")
  }

  Sys.sleep(2)
}