library(pangaear)
library(tidyverse)
library(stringr)
library(here)
library(sf)
library(parallel)
library(fastcluster)
library(dbscan)
library(progress)
library(purrr)
library(lwgeom)
library(glue)
library(measurements)
library(mco)
knitr::opts_chunk$set(echo = TRUE)
version <- "202201170917"
raw_data_path <- here::here('analysis', 'data', 'raw_data')
derived_data_path <- here::here('analysis', 'data', 'derived_data', version)
derived_tables_path <- glue::glue(derived_data_path, '/tables')
dir.create(raw_data_path, showWarnings = FALSE, recursive = TRUE)
dir.create(derived_data_path, showWarnings = FALSE, recursive = TRUE)
dir.create(derived_tables_path, showWarnings = FALSE, recursive = TRUE)

Download Ecoregions

if(!file.exists(glue::glue(raw_data_path, "/Ecoregions2017.zip"))){
  to <- getOption('timeout')
  options(timeout=600)
  download.file(url = "https://storage.googleapis.com/teow2016/Ecoregions2017.zip", 
              destfile = glue::glue(raw_data_path, "/Ecoregions2017.zip"))
  options(timeout=to)
  unzip(zipfile = glue::glue(raw_data_path, "/Ecoregions2017.zip"), 
      overwrite = TRUE, 
      exdir = raw_data_path)
}
ecoregions <- sf::st_read(glue::glue(raw_data_path, "/Ecoregions2017.shp")) %>% 
  dplyr::transmute(biome_name = BIOME_NAME,
                   biome_code = BIOME_NUM,
                   eco_name = ECO_NAME,
                   eco_code = ECO_ID) %>% 
  sf::st_make_valid()

Global mining areas

For the mining areas we use the global mining polygons data set available from 10.1594/PANGAEA.910894 and described in 10.1038/s41597-020-00624-w. The polygons cover the entire globe.

# pg_record <- pangaear::pg_data(doi = "10.1594/PANGAEA.910894", overwrite = TRUE, mssgs = TRUE)
# unzip(zipfile = pg_record[[1]]$path, 
#       files = "global_mining_polygons_v1.gpkg", 
#       overwrite = TRUE, 
#       exdir = raw_data_path)

mining_polygons <- sf::st_read(glue::glue(derived_data_path, "/global_mining_polygons_v2.gpkg"), fid_column_name = "fid") %>% 
  dplyr::transmute(id = stringr::str_pad(as.hexmode(fid), pad = "0", 20), 
                   area = AREA,
                   country_name = COUNTRY_NAME,
                   country_isoa3 = ISO3_CODE) %>% 
  dplyr::rename(geometry = geom)

Add biomes as attributes of mining polygons

file_path <- glue::glue(derived_tables_path, "/mining_polygons_ecoregions.geojson")
if(file.exists(file_path)){
  mining_polygons_ecoregions <- sf::st_read(file_path, stringsAsFactors = FALSE, fid_column_name = "id")
} else {
  nc <- parallel::detectCores()
  nr <- nrow(mining_polygons)
  system.time(
    mining_polygons_ecoregions <- mining_polygons %>% 
      split(., rep(1:ceiling(nr/nc), each=nc, length.out=nr)) %>% 
      parallel::mclapply(mc.cores = nc, FUN = sf::st_join, left = TRUE, y = ecoregions, join = sf::st_nearest_feature) %>% 
      dplyr::bind_rows() 
  )
  sf::st_write(mining_polygons_ecoregions, file_path)
}

Calculate distance matrix per country

In this step we calculate the distances among all mining features in meters. The results are saved to the output_dir for each country.

mine_properties <- glue::glue(raw_data_path, "/mining_commodities.gpkg") %>% 
  sf::st_read(stringsAsFactors = FALSE) %>% 
  dplyr::left_join(readr::read_csv(glue::glue(raw_data_path, "/snl_isoa3_country_tbl.csv")), 
                   by = c("country" = "snl_country")) %>% 
  dplyr::mutate(id = str_pad(as.hexmode(snl_id), pad = "0", 20)) %>% 
  dplyr::select(id, mine_name = mine, known_as, commodities_list = list_of_commodities,
                development_stage, operating_status, country_name = COUNTRY_NAME, country_isoa3 = ISO3_CODE, geometry = geom)

mine_polygons <- mining_polygons_ecoregions %>% 
  dplyr::transmute(dataset_id = id, dataset_name = "polygons", country_isoa3 = country_isoa3) 

mine_points <- mine_properties %>% 
  dplyr::transmute(dataset_id = id, dataset_name = "points", country_isoa3 = country_isoa3)

mine_features <- dplyr::bind_rows(mine_points, mine_polygons) %>% 
  dplyr::mutate(id = str_pad(dplyr::row_number(), pad = "0", 20))

sf::st_write(mine_features, glue::glue(derived_data_path, "/mine_features.geojson"), delete_dsn = TRUE)

mine_split <- split(mine_features, mine_features$country_isoa3)
pb <- progress::progress_bar$new(total = length(mine_split))
system.time(dist_matrices <- purrr::map_dfr(
  .x = mine_split,
  .f = mininglucc::calc_dist_matrix,
  split_att = "country_isoa3",
  output_dir = derived_data_path,
  pb = pb)
)
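
The helper mininglucc::calc_dist_matrix comes with the research compendium package and is not reproduced here. Below is a minimal sketch of what such a helper could look like, assuming it receives the features of a single country, computes pairwise sf::st_distance in meters, and caches the matrix as <ISO3>.rds under output_dir/dist_matrix; the function name and return value are illustrative, not the package's actual internals.

# Illustrative sketch only -- the real implementation lives in the mininglucc package.
calc_dist_matrix_sketch <- function(x, split_att = "country_isoa3", output_dir, pb = NULL) {
  if (!is.null(pb)) pb$tick()
  country <- unique(x[[split_att]])
  out_dir <- file.path(output_dir, "dist_matrix")
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  # Pairwise distances in meters, stored as a "dist" object so they can be fed
  # directly to fastcluster::hclust() and dbscan::dbscan() below.
  d <- stats::as.dist(matrix(as.numeric(sf::st_distance(x)), nrow = nrow(x)))
  saveRDS(d, file.path(out_dir, paste0(country, ".rds")))
  tibble::tibble(country_isoa3 = country, n_features = nrow(x))
}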

Create mining clusters per country

Below we cluster the mining features based on the distance matrix of each country. We set the cutting distance to 10 km, see ?fastcluster::hclust and ?stats::cutree. Features farther than 10 km from each other are grouped together only if other features connect them, see 10.1038/s41597-020-00624-w.

The DBSCAN algorithm works better for clustering mining points and polygons because of the heterogeneity of the data. We set minPts = 1 to allow clusters composed of a single feature, and eps to 20000 so that the algorithm is flexible enough to cluster distant mining features; in most cases the resulting clusters nevertheless remain small. The results were visually checked.
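
As a toy illustration (not part of the pipeline), both routes agree on a small synthetic distance matrix in meters: single-linkage clustering cut at 10 km and DBSCAN with eps = 10000 and minPts = 1 merge the two nearby features and leave the distant one in its own cluster.

toy_dist <- stats::as.dist(matrix(c(    0,  2000, 30000,
                                      2000,     0, 29000,
                                     30000, 29000,     0), nrow = 3))
cutree(fastcluster::hclust(toy_dist, method = "single"), h = 10000)   # 1 1 2
dbscan::dbscan(toy_dist, eps = 10000, minPts = 1)$cluster             # 1 1 2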

h <- units::set_units(10000, m)
dist_files <- dir(glue::glue(derived_data_path, "/dist_matrix"), full.names = TRUE)
names(dist_files) <- stringr::str_remove_all(basename(dist_files), "\\.rds$")
mine_clusters <- parallel::mclapply(names(mine_split), function(f){
  hcluster <- 1
  dbcluster <- 1
  if(f %in% names(dist_files)){
    dist_matrix <- readRDS(dist_files[[f]])
    hcluster <- fastcluster::hclust(dist_matrix, method = "single") %>% 
      cutree(h = as.numeric(units::set_units(h, m)))
    dbcluster <- dbscan::dbscan(dist_matrix, eps = as.numeric(h), minPts = 1)$cluster
  }
  mine_split[[f]] %>% 
    sf::st_drop_geometry() %>% 
    tibble::as_tibble() %>% 
    dplyr::transmute(hcluster_id = hcluster,
                     dbcluster_id = dbcluster,
                     dataset_id = dataset_id, 
                     country_isoa3 = country_isoa3,
                     dataset_name = dataset_name)
}) %>% 
  dplyr::bind_rows() %>% 
  dplyr::group_by(country_isoa3, hcluster_id) %>% 
  dplyr::mutate(hcluster_id = str_pad(cur_group_id(), pad = "0", 20)) %>% 
  dplyr::ungroup() %>% 
  dplyr::group_by(country_isoa3, dbcluster_id) %>%
  dplyr::mutate(dbcluster_id = str_pad(cur_group_id(), pad = "0", 20)) %>% 
  dplyr::ungroup() %>% 
  dplyr::select(hcluster_id, dbcluster_id, dataset_id, dataset_name) 
mining_polygons_ecoregions %>% 
  dplyr::select(id, country_isoa3, country_name, area, biome_name, biome_code, eco_name, eco_code) %>% 
  dplyr::left_join(dplyr::filter(mine_clusters, dataset_name == "polygons") %>% dplyr::select(-dataset_name), 
                   by = c("id" = "dataset_id")) %>% 
  sf::st_write(glue::glue(derived_tables_path, "/mine_polygons.geojson"), delete_dsn = TRUE)

mine_properties %>% 
  dplyr::select(id, commodities_list, development_stage, operating_status, country_name, country_isoa3) %>% 
  dplyr::left_join(dplyr::filter(mine_clusters, dataset_name == "points") %>% dplyr::select(-dataset_name), 
                   by = c("id" = "dataset_id")) %>% 
  sf::st_write(glue::glue(derived_tables_path, "/mine_properties.geojson"), delete_dsn = TRUE)

Mine production

glue::glue(raw_data_path, "/mine_properties") %>% 
  dir(pattern = "_production.Rdata", full.names = TRUE) %>% 
  lapply(function(f){
    load(f)
    tibble::as_tibble(x) %>% 
      dplyr::rename(id = snl_id, commodity_name = commodity, 
                    quantity = value, quantity_unit = unit) %>%
      dplyr::filter(!is.na(quantity)) %>% 
      dplyr::mutate(
        quantity_unit = ifelse(quantity_unit == "ct", "carat", quantity_unit),
        quantity_unit = ifelse(quantity_unit == "tonne", "metric_ton", quantity_unit),
        quantity_unit = ifelse(quantity_unit == "lb", "lbs", quantity_unit),
        quantity = purrr::map2_dbl(.x = quantity, 
                                   .y = quantity_unit, 
                                   .f = measurements::conv_unit, to = "metric_ton"),
        quantity_unit = "metric_ton") %>% 
      dplyr::mutate(id = str_pad(as.hexmode(id), pad = "0", 20))
  }) %>% 
  dplyr::bind_rows() %>% 
  readr::write_csv(glue::glue(derived_tables_path, "/mine_production.csv"))


load("data/raw_data/mine_properties/Ore.Rdata")
Ore %>% 
  dplyr::select(id = snl_id, year, quantity = value, quantity_unit = unit) %>% 
  dplyr::filter(!is.na(quantity)) %>% 
  dplyr::mutate(id = str_pad(as.hexmode(id), pad = "0", 20)) %>% 
  readr::write_csv(glue::glue(derived_tables_path, "/mine_ore_processed.csv"))

Additional tables

tibble::tribble(~biome_group, ~biome_name,
                "Boreal Forests/Taiga", "Boreal Forests/Taiga",
                "Other", "Deserts & Xeric Shrublands",
                "Other", "Flooded Grasslands & Savannas",
                "Other", "Mangroves",
                "Mediterranean", "Mediterranean Forests, Woodlands & Scrub",
                "Other", "Montane Grasslands & Shrublands",
                "Temperate", "Temperate Broadleaf & Mixed Forests",
                "Temperate", "Temperate Conifer Forests",
                "Temperate", "Temperate Grasslands, Savannas & Shrublands",
                "Tropical & Subtropical", "Tropical & Subtropical Coniferous Forests",
                "Tropical & Subtropical", "Tropical & Subtropical Dry Broadleaf Forests",
                "Tropical & Subtropical", "Tropical & Subtropical Grasslands, Savannas & Shrublands",
                "Tropical & Subtropical", "Tropical & Subtropical Moist Broadleaf Forests",
                "Other", "Tundra") %>% 
  readr::write_csv(glue::glue(derived_tables_path, "/biome_groups.csv"))

Development stage groups according to 10.1038/s41467-020-17928-5.

development_groups <- tibble::tribble(
  ~development_group,             ~development_stage,
  "Operational",      "Preproduction",
  "Operational",      "Construction Planned",
  "Operational",      "Construction Started",
  "Operational",      "Commissioning",
  "Operational",      "Operating",
  "Operational",      "Satellite",
  "Operational",      "Expansion",
  "Operational",      "Limited Production",
  "Operational",      "Residual Production",
  "Pre-operational",  "Grassroots",
  "Pre-operational",  "Exploration",
  "Pre-operational",  "Target Outline",
  "Pre-operational",  "Reserves Development",
  "Pre-operational",  "Advanced Exploration",
  "Pre-operational",  "Prefeas/Scoping",
  "Pre-operational",  "Feasibility",
  "Pre-operational",  "Feasibility Started",
  "Pre-operational",  "Feasibility Complete",
  "Closed",           "Closed"
)

readr::write_csv(development_groups, 
                 glue::glue(derived_tables_path, "/development_groups.csv"))

Aggregated tables

# Spatial layers
mine_properties <- sf::st_read(dsn = glue::glue(derived_tables_path, "/mine_properties.geojson"))

mine_polygons <- sf::st_read(dsn = glue::glue(derived_tables_path, "/mine_polygons.geojson"))

# Mining attribute tables 
mine_production <- readr::read_csv(glue::glue(derived_tables_path, "/mine_production.csv"))
mine_ore_processed <- readr::read_csv(glue::glue(derived_tables_path, "/mine_ore_processed.csv"))

# Concordance and groups 
development_groups <- readr::read_csv(glue::glue(derived_tables_path, "/development_groups.csv"))
biome_groups <- readr::read_csv(glue::glue(derived_tables_path, "/biome_groups.csv"))

fun_collapse_groups <- function(x){
  x <- unlist(x)
  glue::glue_collapse(glue::glue("{unique(x)}"), sep = ",", width = Inf)
}

fun_adjust_operation_status <- function(x){
  x <- fun_collapse_groups(x)
  if(is.na(x)) return("Unknown")
  if(stringr::str_detect(x, "Operational")) return("Operational")
  if(stringr::str_detect(x, "Pre-operational")) return("Pre-operational") 
  if(stringr::str_detect(x, "Closed")) return("Closed")
  return("Unknown")
}
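
For reference, the intended behaviour of these two helpers on small, illustrative inputs:

fun_collapse_groups(c("Gold", "Copper", "Gold"))         # "Gold,Copper"
fun_adjust_operation_status(c("Closed", "Operational"))  # "Operational"
fun_adjust_operation_status(c("Pre-operational", NA))    # "Pre-operational"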

mine_properties %>% 
  tidyr::separate_rows(commodities_list, sep = ",") %>% 
  dplyr::left_join(development_groups, by = c("development_stage" = "development_stage")) %>% 
  dplyr::group_by(hcluster_id) %>% 
  dplyr::summarise(
    country_isoa3 = unique(country_isoa3), 
    country_name = unique(country_name), 
    commodities_list = fun_collapse_groups(commodities_list),
    development_stage = fun_collapse_groups(development_stage),
    operating_status = fun_collapse_groups(operating_status),
    development_group = fun_adjust_operation_status(development_group), .groups = 'drop') %>% 
  sf::st_write(glue::glue(derived_tables_path, "/mine_properties_hcluster.geojson"), delete_dsn = TRUE)

mine_properties %>% 
  tidyr::separate_rows(commodities_list, sep = ",") %>% 
  dplyr::left_join(development_groups, by = c("development_stage" = "development_stage")) %>% 
  dplyr::group_by(dbcluster_id) %>% 
  dplyr::summarise(
    country_isoa3 = unique(country_isoa3), 
    country_name = unique(country_name), 
    commodities_list = fun_collapse_groups(commodities_list),
    development_stage = fun_collapse_groups(development_stage),
    operating_status = fun_collapse_groups(operating_status),
    development_group = fun_adjust_operation_status(development_group), .groups = 'drop') %>% 
  dplyr::mutate(development_group = purrr::map_chr(.x = development_group, .f = fun_adjust_operation_status)) %>% 
  sf::st_write(glue::glue(derived_tables_path, "/mine_properties_dbcluster.geojson"), delete_dsn = TRUE)

dplyr::group_by(mine_polygons, hcluster_id) %>% 
    dplyr::summarise(country_isoa3 = unique(country_isoa3), 
                   country_name = unique(country_name), 
                   biome_name = biome_name[which.max(area)],
                   biome_code = biome_code[which.max(area)],
                   eco_name = eco_name[which.max(area)],
                   eco_code = eco_code[which.max(area)],
                   area = sum(area, na.rm = TRUE), 
                   .groups = 'drop') %>% 
  dplyr::left_join(biome_groups, by = c("biome_name" = "biome_name")) %>% 
  sf::st_write(glue::glue(derived_tables_path, "/mine_polygons_hcluster.geojson"), delete_dsn = TRUE)

dplyr::group_by(mine_polygons, dbcluster_id) %>% 
  dplyr::summarise(country_isoa3 = unique(country_isoa3), 
                   country_name = unique(country_name), 
                   biome_name = biome_name[which.max(area)],
                   biome_code = biome_code[which.max(area)],
                   eco_name = eco_name[which.max(area)],
                   eco_code = eco_code[which.max(area)],
                   area = sum(area, na.rm = TRUE), 
                   .groups = 'drop') %>% 
  dplyr::left_join(biome_groups, by = c("biome_name" = "biome_name")) %>% 
  sf::st_write(glue::glue(derived_tables_path, "/mine_polygons_dbcluster.geojson"), delete_dsn = TRUE)

mine_properties %>% 
  sf::st_drop_geometry() %>% 
  tibble::as_tibble() %>% 
  dplyr::select(id, hcluster_id) %>% 
  dplyr::right_join(mine_production, by = c("id" = "id")) %>% 
  dplyr::group_by(hcluster_id, commodity_name, year) %>% 
  dplyr::summarise(quantity = sum(quantity, na.rm = TRUE),
                   quantity_unit = unique(quantity_unit), .groups = 'drop') %>% 
  readr::write_csv(glue::glue(derived_tables_path, "/mine_production_hcluster.csv"))

mine_properties %>% 
  sf::st_drop_geometry() %>% 
  tibble::as_tibble() %>% 
  dplyr::select(id, dbcluster_id) %>% 
  dplyr::right_join(mine_production, by = c("id" = "id")) %>% 
  dplyr::group_by(dbcluster_id, commodity_name, year) %>% 
  dplyr::summarise(quantity = sum(quantity, na.rm = TRUE),
                   quantity_unit = unique(quantity_unit), .groups = 'drop') %>% 
  readr::write_csv(glue::glue(derived_tables_path, "/mine_production_dbcluster.csv"))

mine_properties %>% 
  sf::st_drop_geometry() %>% 
  tibble::as_tibble() %>% 
  dplyr::select(id, hcluster_id) %>% 
  dplyr::right_join(mine_ore_processed, by = c("id" = "id")) %>% 
  dplyr::group_by(hcluster_id, year) %>% 
  dplyr::summarise(quantity = sum(quantity, na.rm = TRUE),
                   quantity_unit = unique(quantity_unit), .groups = 'drop') %>% 
  readr::write_csv(glue::glue(derived_tables_path, "/mine_ore_processed_hcluster.csv"))

mine_properties %>% 
  sf::st_drop_geometry() %>% 
  tibble::as_tibble() %>% 
  dplyr::select(id, dbcluster_id) %>% 
  dplyr::right_join(mine_ore_processed, by = c("id" = "id")) %>% 
  dplyr::group_by(dbcluster_id, year) %>% 
  dplyr::summarise(quantity = sum(quantity, na.rm = TRUE),
                   quantity_unit = unique(quantity_unit), .groups = 'drop') %>% 
  readr::write_csv(glue::glue(derived_tables_path, "/mine_ore_processed_dbcluster.csv"))
mine_polygons_hcluster <- sf::st_read(glue::glue(derived_tables_path, "/mine_polygons_hcluster.geojson")) %>% 
  sf::st_drop_geometry() %>% 
  tibble::as_tibble() 

mine_polygons_dbcluster <- sf::st_read(glue::glue(derived_tables_path, "/mine_polygons_dbcluster.geojson")) %>% 
  sf::st_drop_geometry() %>% 
  tibble::as_tibble() 

mine_production_dbcluster <- readr::read_csv(glue::glue(derived_tables_path, "/mine_production_dbcluster.csv"))
mine_production_hcluster <- readr::read_csv(glue::glue(derived_tables_path, "/mine_production_hcluster.csv"))

mine_production_hcluster %>% 
  dplyr::filter(2000 <= year, year <= 2019) %>% 
  dplyr::group_by(hcluster_id, commodity_name) %>% 
  # Mass of each commodity produced per cluster 
  dplyr::summarise(quantity = sum(quantity, na.rm = TRUE), quantity_unit = unique(quantity_unit), .groups = 'drop') %>% 
  dplyr::right_join(mine_polygons_hcluster, by = c("hcluster_id" = "hcluster_id")) %>% 
  dplyr::filter(!is.na(commodity_name), !commodity_name %in% c("NA", "", "Na", "na")) %>% 
  dplyr::group_by(hcluster_id) %>% 
  # Allocate area proportionally to the mass of each commodity produced per cluster 
  dplyr::mutate(area = quantity/sum(quantity, na.rm = TRUE) * sum(area, na.rm = TRUE),
                area_intensity = area / quantity,
                companion = factor(length(unique(commodity_name))>1, c(T, F), c("Companion", "Single host"))) %>% 
  dplyr::ungroup() %>% 
  readr::write_csv(glue::glue(derived_tables_path, "/mine_area_intensity_hcluster.csv"))

mine_production_dbcluster %>% 
  dplyr::filter(2000 <= year, year <= 2019) %>% 
  dplyr::group_by(dbcluster_id, commodity_name) %>% 
  # Mass of each commodity produced per cluster 
  dplyr::summarise(quantity = sum(quantity, na.rm = TRUE), quantity_unit = unique(quantity_unit), .groups = 'drop') %>% 
  dplyr::right_join(mine_polygons_dbcluster, by = c("dbcluster_id" = "dbcluster_id")) %>% 
  dplyr::filter(!is.na(commodity_name), !commodity_name %in% c("NA", "", "Na", "na")) %>% 
  dplyr::group_by(dbcluster_id) %>% 
  # Allocate area proportionally to the mass of each commodity produced per cluster 
  dplyr::mutate(area = quantity/sum(quantity, na.rm = TRUE) * sum(area, na.rm = TRUE),
                area_intensity = area / quantity,
                companion = factor(length(unique(commodity_name))>1, c(T, F), c("Companion", "Single host"))) %>% 
  dplyr::ungroup() %>% 
  readr::write_csv(glue::glue(derived_tables_path, "/mine_area_intensity_dbcluster.csv"))
fitness_hcluster <- function(x){

    hcluster <- 1

    fun_collapse_groups <- function(x){
      x <- unlist(x)
      glue::glue_collapse(glue::glue("{unique(x)}"), sep = ",", width = Inf)
    }

    f = "/home/maus/workspace/mininglucc/analysis/data/derived_data/20210317/dist_matrix/BRA.rds"
    dist_matrix <- readRDS(dist_files[[f]])
    hcluster <- fastcluster::hclust(dist_matrix, method = "single") %>% 
      cutree(h = x[1])

    cluster_ids <- mine_split[[f]] %>% 
      sf::st_drop_geometry() %>% 
      tibble::as_tibble() %>% 
      dplyr::transmute(hcluster_id = hcluster,
                       dataset_id = dataset_id,
                       country_isoa3 = country_isoa3,
                       dataset_name = dataset_name)

    mine_properties_hcluster <- mine_commodities %>% 
      dplyr::filter(country_isoa3 == f) %>% 
      dplyr::left_join(dplyr::filter(cluster_ids, dataset_name == "points"), by = c("id" = "dataset_id")) %>% 
      tidyr::separate_rows(commodities_list, sep = ",") %>% 
      dplyr::group_by(hcluster_id) %>% 
      dplyr::summarise(commodities_list = fun_collapse_groups(commodities_list), .groups = 'drop') %>% 
      dplyr::transmute(cluster_id = hcluster_id, commodities_list = commodities_list)

    mine_polygons_hcluster <- mine_polygons %>% 
      dplyr::filter(country_isoa3 == f) %>% 
      dplyr::left_join(dplyr::filter(cluster_ids, dataset_name == "polygons"), by = c("id" = "dataset_id")) %>% 
      dplyr::group_by(hcluster_id) %>% 
      dplyr::summarise(area = sum(area, na.rm = TRUE), .groups = 'drop') 

    res_hcluster_aux <- mine_properties_hcluster %>% 
      tidyr::separate_rows(commodities_list, sep = ",") %>% 
      dplyr::right_join(mine_polygons_hcluster, by = c("cluster_id" = "hcluster_id")) %>% 
      dplyr::group_by(cluster_id) %>% 
      dplyr::summarise(n = length(na.omit(commodities_list)), area = sum(area, na.rm = TRUE)) %>% 
      dplyr::mutate(commodities_list = ifelse(n == 0, "Unknown", ifelse(n == 1, "Single host", "Companion")))

    res_hcluster <- res_hcluster_aux %>% 
      dplyr::group_by(commodities_list) %>% 
      dplyr::summarise(n = dplyr::n(), area = sum(area)) %>% 
      dplyr::mutate(n.perc = n/sum(n)*100, area.perc = area/sum(area)*100)

    companion <- dplyr::filter(res_hcluster_aux, commodities_list == "Companion") %>% 
      dplyr::summarise(area = sum(area, na.rm = TRUE)) %>% 
      .$area

    unknown <- dplyr::filter(res_hcluster, commodities_list == "Unknown") %>% .$area
    single_host <- dplyr::filter(res_hcluster, commodities_list == "Single host") %>% .$area

    y <- numeric(2)
    y[1] <- companion 
    y[2] <- unknown
    # y[3] <- -single_host
    return(y)
}


get.elbow.points.indices <- function(x, y, threshold) {
  ids <- order(x)
  x <- x[ids]
  y <- y[ids]
  d1 <- diff(y) / diff(x) # first derivative
  d2 <- diff(d1) / diff(x[-1]) # second derivative
  # plot(d2)
  # indices <- which.max(abs(d2)) + 2
  # indices <- which(abs(d1)>threshold) + 1
  indices <- which.max(abs(d1)) + 1
  #d1 <- diff(y[indices]) / diff(x[indices]) # first derivative
  #d2 <- diff(d1) / diff(x[indices][-1]) # second derivative
  # indices <- which(abs(d2) > threshold)
  return(ids[indices])
}
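
A quick sanity check of this heuristic on made-up values (the threshold argument is only used by the commented-out variants): the index returned is the point right after the steepest first-derivative step.

toy_x <- 1:6
toy_y <- c(10, 9, 8, 2, 1.5, 1)
get.elbow.points.indices(toy_x, toy_y, threshold = 0.1)  # 4, i.e. the drop from 8 to 2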

mine_commodities <- mine_properties %>% 
  dplyr::select(id, country_isoa3, commodities_list) %>% 
  sf::st_drop_geometry() %>% 
  tibble::as_tibble()
mine_polygons <- mining_polygons_ecoregions %>% 
  sf::st_drop_geometry() %>%
  tibble::as_tibble() %>% 
  dplyr::select(id, country_isoa3, area)
dist_files <- dir(glue::glue(derived_data_path, "/dist_matrix"), full.names = TRUE)
names(dist_files) <- stringr::str_remove_all(basename(dist_files), "\\.rds$")

# f <- "USA"
# usar2 <- mco::nsga2(fn = fitness_hcluster, idim = 1, odim = 2, generations = 2, popsize = 100, 
#                  lower.bounds = c(1000), upper.bounds = c(15000))
# 
# f <- "BRA"
# bra2 <- mco::nsga2(fn = fitness_hcluster, idim = 1, odim = 2, generations = 2, popsize = 100,
#                  lower.bounds = c(1000), upper.bounds = c(15000))
# f <- "ZAF"
# zaf2 <- mco::nsga2(fn = fitness_hcluster, idim = 1, odim = 2, generations = 2, popsize = 100,
#                  lower.bounds = c(1000), upper.bounds = c(15000))

library(parallelMap)
library(ecr)
parallelStartSocket(4)    # start in socket mode and create 4 processes on localhost
parallelExport("f", "mine_split", "dist_files", "mine_commodities", "mine_polygons", "fun_collapse_groups")
parallelLibrary("magrittr")
lower = c(1000)
upper = c(15000)
system.time(
  zaf2 <- ecr::ecr(fitness.fun = fitness_hcluster, n.dim = 1, n.objectives = 2, representation = "float", 
                   lower = lower, upper = upper, mu = 8, lambda = 8, mutator = setup(mutGauss, lower = lower, upper = upper))
)
parallelStop()            # turn parallelization off again
plot(zaf2$pareto.front)

## 50 x 2
#  user  system elapsed 
# 5.172   2.137 228.185 


## 8 x 8 
#  user  system elapsed 
# 3.258   2.098 220.277 

plot(zaf2$pareto.front)

f <- "ZAF"
system.time(
zaf1 <- mco::nsga2(fn = fitness_hcluster, idim = 1, odim = 2, generations = 8, popsize = 8,
                   lower.bounds = c(1000), upper.bounds = c(15000))
)
#    user  system elapsed 
# 166.422  10.235 180.288 

library(caRamel)
f <- "BRA"
results <-
  caRamel::caRamel(nobj = 2, 
                   nvar = 1,
                   minmax = c(FALSE, FALSE) ,
                   bounds = matrix(data = c(1000, 15000), nrow = 1, ncol = 2),
                   func = fitness_hcluster,
                   popsize = 1000, # size of the genetic population
                   archsize = 1, # size of the archive for the Pareto front
                   maxrun = 100, # maximum number of calls
                   prec = matrix(100, nrow = 1, ncol = 2), # 100km2
                   carallel=FALSE, 
                   sensitivity=TRUE) # sensitivity required

results

r2 <- zaf1
aux <- r2$value %>% 
  as.data.frame() %>% 
  tibble::as_tibble() %>% 
  dplyr::mutate(id = dplyr::row_number(), par = r2$par[,1]) %>% 
  dplyr::group_by(V2) %>% 
  dplyr::summarise(id = id[which.min(V1)], par = par[which.min(V1)], V1 = min(V1)) %>% 
  dplyr::group_by(V1) %>% 
  dplyr::summarise(id = id[which.min(V1)], par = par[which.min(V2)], V2 = min(V2))

r2$value <- r2$value[aux$id,]
r2$par <- r2$par[aux$id,]
r2$pareto.optimal <- r2$pareto.optimal[aux$id]
#points(findCutoff(aux$V1, aux$V2, method = "curvature"), col = "green")
ids <- get.elbow.points.indices(x=aux$V1, y=aux$V2, threshold = 0.1)
plot(r2)
points(aux$V1[ids], aux$V2[ids], col = "blue")
points(aux$V1[18], aux$V2[18], col = "green")
r2$par[ids]
r2 <- bra2
x <- r2$value[, 1]
y <- r2$value[, 2]
plot(r2)
lines(x, predict(loess(y ~ x)), type = "l")
ids <- order(x)
KneeArrower:::findCutoffCurvature(x, predict(loess(y ~ x)))
points(KneeArrower::findCutoff(r2$value[ids, 1], r2$value[ids, 2], method = "curvature"), col = "green")
ids <- get.elbow.points.indices(x=r2$value[,1], y=r2$value[,2], threshold = 1e4)
points(r2$value[ids,1], r2$value[ids,2], col = "blue")

paretoSet(r2)
dominatedHypervolume(r2)


f <- "USA"
# debugonce(nsga2R)
hcluster_optim <- nsga2R::nsga2R(fn = fitness_hcluster, varNo = 2, objDim = 2, generations = 10, popSize = 100, 
                        lowerBounds = c(1000, 1), upperBounds = c(15000, 1))
par_optim <- hcluster_optim$parameters[hcluster_optim$paretoFrontRank == 1, ] %>% 
  as.data.frame() %>% 
  tibble::as_tibble() %>% 
  dplyr::transmute(h = V1)

hcluster_optim$objectives[hcluster_optim$paretoFrontRank == 1, ] %>% 
  as.data.frame() %>% 
  dplyr::transmute(Single_host = -V2, Unknown = V1) %>% 
  dplyr::bind_cols(par_optim) %>% 
  dplyr::group_by(Single_host, Unknown) %>% 
  dplyr::summarise(h = min(h), .groups = 'drop') %>% 
  dplyr::arrange(desc(Single_host), Unknown) %>% 
  dplyr::mutate(ds = c(NA, diff(Single_host)), du = c(NA, diff(Unknown)), r = du/ds) %>% 
  round(2) %>% 
  View()

plot(hcluster_optim$objectives[,1], hcluster_optim$objectives[,2])

library(rPref)

df <- hcluster_optim$objectives %>% 
  as.data.frame() %>% 
  tibble::as_tibble() %>% 
  dplyr::transmute(Unknown = V1, 
                   Single_host = V2,
                   h = hcluster_optim$parameters[, 1],
                   .level = hcluster_optim$paretoFrontRank)

sky <- rPref::psel(df, low(Unknown) * low(Single_host))

ggplot(df, aes(x = Unknown, y = Single_host)) + geom_point(shape = 21) + 
  geom_point(data = sky, size = 3) + geom_step(data = sky, direction = "vh") 

p <- low(Unknown) * low(Single_host)
res <- psel(df, p, top = nrow(df))

ggplot(res, aes(x = Unknown, y = Single_host, color = factor(.level))) +
  geom_point(size = 3) + 
  geom_step(direction = "vh") +
  scale_color_viridis_d()

