data-raw/metadata_bc.R

#' ---
#' title: "metadata_bc.R"
#' author: "Dean Koch"
#' date: "June 25, 2020"
#' output: github_document
#' ---
#'
#' R code for preparing metadata list
#' eventually this may be turned into a public component of the rasterbc package, allowing
#' users to recreate all the steps used in preparing the dataset, and/or add their new collections
#'
#' For now it serves to record the steps for getting from the rasterbc_src repo output to the metadata_BC
#' list, which is saved as external data (in subdirectory data/) of the rasterbc package with a link in the
#' documentation to the rasterbc_src repo, where there is ample documentation of the sourcing workflow.
#'
#' This script has some information on filepaths that is specific to my workstation, so it is not intended for
#' distribution in the rasterbc package.
#'

library(here)
library(devtools)
#library(rasterbc)
load_all()

# URL prefix for permanent links that automatically redirect to correct globus endpoint (November 2023)
# permalinks were added to FRDR in late August 2023 (see https://www.frdr-dfdr.ca/docs/en/change_log/)
cfg.frdr = 'https://www.frdr-dfdr.ca/repo/files/1/published/publication_278/submitted_data/'

# URL at FRDR where the files are currently hosted (updated Nov 2023 - permanent this time?)
#cfg.frdr = 'https://g-772fa5.cd4fe.0ec8.data.globus.org/1/published/publication_278/submitted_data/'

# URL at FRDR where the files are currently hosted (updated Nov 2021)
#cfg.frdr = 'https://g-624536.53220.5898.data.globus.org/1/published/publication_278/submitted_data/'

# ...which replaces the old URL...
# cfg.frdr = 'https://2a9f4.8443.dn.glob.us/1/published/publication_278/submitted_data/'

# ...replaces the old FRDR demo site links at...
#cfg.frdr = 'https://364cb.0ec8.dn.glob.us/1/published/publication_227/submitted_data/'

# current location on my machine to look for metadata generated by the scripts in the rasterbc_src repo
current.data.dir = 'L:/MPB_ARCHIVE/git-MPB/rasterbc_src/data/'

# path to original storage location from when metadata files were written (don't change!)
original.data.dir = 'H:/git-MPB/rasterbc/data/'

# for now, we only have the DEM data hosted, but this code should work for the whole collection
collections = unlist(strsplit(list.files(current.data.dir)[endsWith(list.files(current.data.dir), '.rds')], '.rds'))
metadata_bc = vector(mode='list', length=length(collections))
names(metadata_bc) = collections

# load the metadata and convert output filenames to web urls
for(collection in collections)
{
  # read in the metadata created by the src_*.R script
  cfg = readRDS(file.path(current.data.dir, paste0(collection, '.rds')))
  cfg.fname = listswap_bc(cfg$out$fname$tif, original.data.dir, '')
  cfg.src = listswap_bc(cfg$src[names(cfg$src) != 'dir'], original.data.dir, '')
  cfg.codes = cfg$out$code
  metadata_bc[[collection]] = list(source=cfg.src, frdr=cfg.frdr, fname=cfg.fname, metadata=list(df=NULL, year=NULL, coding=cfg.codes))
}

# the 'pine' layers have their cfg.src$years entry as character strings. Replace these with integers
metadata_bc[['pine']]$source$years = setNames(as.integer(metadata_bc[['pine']]$source$years), nm=names(metadata_bc[['pine']]$source$years))

# prepare the full list of available variables as a list of dataframes
for(collection in names(metadata_bc))
{
  # initially assign the variable names based on the block directory names
  collection.varnames = setNames(nm=names(metadata_bc[[collection]]$fname$block))
  #metadata_bc[[collection]]$metadata$varnames = collection.varnames

  # check if this collection is/contains a time-series (a year-indexed set of rasters)...
  collection.years = metadata_bc[[collection]]$source$years
  if(!is.null(collection.years))
  {
    # ... if so, check which variables are associated with time-series
    is.year = setNames(grepl('(yr)(\\d+)', collection.varnames), collection.varnames)

    # check which variables are available in which years
    lyrs.byyear = lapply(which(is.year), function(idx.year) names(metadata_bc[[collection]]$fname$block[[idx.year]]))
    ts.lyrs = unique(unlist(lyrs.byyear))
    lyr.year.matrix = sapply(ts.lyrs, function(lyr) sapply(lyrs.byyear, function(year) lyr %in% year))

    # for each variable, construct a string indicating the available years and also store them as integer vector
    timeseries.string = setNames(nm=ts.lyrs)
    collection.yearlist = setNames(vector(mode='list', length=length(timeseries.string)), ts.lyrs)
    for(lyr in timeseries.string)
    {
      available.years = collection.years[names(which(lyr.year.matrix[, lyr]))]
      collection.yearlist[[lyr]] = available.years

      # strings for printing time series info into console
      timeseries.string[[lyr]] = paste(available.years, collapse=', ')
      if(identical(unname(available.years), min(available.years):max(available.years)))
      {
        # if the years are sequential, use a short-form string (eg. "2001-2018")
        timeseries.string[[lyr]] = paste(range(available.years), collapse='-')
      }
    }

    # reorder the fids collection to group species together
    if(collection == 'fids')
    {
      # for loops with a counter seems to be the simplest way to do this
      new.index = rep(NA, length(timeseries.string))
      new.index.pointer = 0
      for(species in names(metadata_bc[[collection]]$source$spp.codes))
      {
        # the ordering coded in the for loop argument is the desired ordering (min first, then mid, ...)
        for(stat in c('min','mid','max', names(metadata_bc[[collection]]$source$sev.codes$post2003)))
        {
          new.index.pointer = new.index.pointer + 1
          new.index[new.index.pointer] = which(paste0(species, '_', stat) == names(timeseries.string))
        }
      }

      # reorder the relevant objects
      timeseries.string = timeseries.string[new.index]
      collection.yearlist = collection.yearlist[new.index]
    }

    # overwrite the variable names vector
    collection.varnames = setNames(nm=c(collection.varnames[!is.year], names(timeseries.string)))

    # create a vector of strings indicating the years
    collection.yearstrings = setNames(c(rep(NA, sum(!is.year)), timeseries.string), collection.varnames)

    # create a list of vectors indicating the years
    collection.yearlist = setNames(c(rep(NA, sum(!is.year)), collection.yearlist), names(collection.varnames))

  } else {

    # assign NA values to the years field when the variable is not yearly
    collection.yearstrings = setNames(rep(NA, length(collection.varnames)), collection.varnames)
    collection.yearlist = setNames(rep(list(NA), length(collection.varnames)), collection.varnames)
  }

  # write these results to the metadata list
  metadata_bc[[collection]]$metadata$df = data.frame(year=collection.yearstrings)
  metadata_bc[[collection]]$metadata$year = collection.yearlist

}

# With this skeleton defined, I add the rest of the metadata by hand...

# biogeoclimatic zone
metadata_bc[['bgcz']]$metadata$df$description = c(region = 'regional placename',
                                                  org = 'more specific placename',
                                                  cover = 'landcover type (water, ice, land)',
                                                  zone = 'biogeoclimatic zone classification',
                                                  subzone = 'subclassification of zone',
                                                  variant = 'subclassification of subzone')
metadata_bc[['bgcz']]$metadata$df$unit = rep('(integer code)', nrow(metadata_bc[['bgcz']]$metadata$df))


# geographical boundary and position
metadata_bc[['borders']]$metadata$df$description = c(prov = 'out-of-province mask',
                                                     latitude = 'geodetic latitude (south-north position)',
                                                     longitude = 'geodetic longitude (east-west position)')
metadata_bc[['borders']]$metadata$df$unit = c(prov = '(indicator)',
                                              latitude = '(degrees north)',
                                              longitude = '(degrees west)')

# forestry harvest activity
metadata_bc[['cutblocks']]$metadata$df$description = c(harvest = 'forest cover loss attributed to harvest')
metadata_bc[['cutblocks']]$metadata$df$unit = c(harvest = '(fraction of cell area)')


# digital elevation model
metadata_bc[['dem']]$metadata$df$description = c(dem = 'digital elevation map',
                                                 slope = 'derived from digital elevation map',
                                                 aspect = 'derived from digital elevation map')
metadata_bc[['dem']]$metadata$df$unit = c(dem = '(metres above sea level)',
                                          slope = '(degrees above horizontal)',
                                          aspect = '(degrees counterclockwise from north)')

# insect damage
stat.strings = paste(c('minimum', 'midpoint', 'maximum'), 'of total forest cover loss')
sev.strings = paste(c('trace', 'light', 'moderate', 'severe', 'very severe'), 'rated damage observation')
pest.strings = c(IBM = 'mountain pine beetle', IBS = 'spruce beetle', IBD = 'douglas-fir beetle', IBB = 'western balsam bark beetle')
metadata_bc[['fids']]$metadata$df$description = paste(paste(c(stat.strings, sev.strings), 'attributed to'), rep(pest.strings, each=8))
metadata_bc[['fids']]$metadata$df$unit = rep('(fraction of cell area)', nrow(metadata_bc[['fids']]$metadata$df))

# global forest change
metadata_bc[['gfc']]$metadata$df$description = c(treecover = 'tree cover estimate circa year 2000',
                                                 gain = 'indicator for forest regrowth during period 2000-2019',
                                                 mask = 'landcover classification (0=water, 1=land)',
                                                 loss = 'yearly indicator for forest loss')
metadata_bc[['gfc']]$metadata$df$unit = c('(fraction of cell area)', rep('(binary)', 3))


# wildfire
metadata_bc[['nfdb']]$metadata$df$description = c('proportion of area falling within a recorded wildfire boundary')
metadata_bc[['nfdb']]$metadata$df$unit = c('(fraction of cell area)')

# forest attributes
highlevel.strings = paste(c(paste(c('vegetation', 'tree', 'needle-leaf species'), 'cover'), 'stand age'),  'estimate')
species.strings = paste(c('whitebark', 'jack', 'lodgepole', 'western white', 'ponderosa', 'red', 'unidentified', 'eastern white', 'scots', 'total'), 'pine cover estimate')
metadata_bc[['pine']]$metadata$df$description = c(highlevel.strings, species.strings)
metadata_bc[['pine']]$metadata$df$unit = c(rep('(fraction of cell area)', 2), '(fraction of \"vegTreed\" area)', '(years)', rep('(fraction of \"needle\" area)', 10))


# xx = lapply(names(metadata_bc), function(x) cbind(collection=rep(x, nrow(metadata_bc[[x]]$metadata$df)), metadata_bc[[x]]$metadata$df))
# print(do.call(rbind, xx), row.names=FALSE)

# write to data directory for package
use_data(metadata_bc, overwrite=TRUE)
deankoch/rasterbc documentation built on Nov. 16, 2023, 2:29 a.m.