R/cohorts.R

Defines functions updateCohortData

Documented in updateCohortData

utils::globalVariables(c(
  ".", "..cols", "..colsToSubset", ".I", ":=", "..groupVar", "age", "age2", "aNPPAct",
  "cover", "coverOrig", "ecoregion", "ecoregionGroup", "hasBadAge",
  "imputedAge", "initialEcoregion", "initialEcoregionCode", "initialPixels",
  "lcc", "maxANPP", "maxB", "maxB_eco", "mortality",
  "newPossLCC", "noPixels", "oldSumB", "ord", "outBiomass", "oldEcoregionGroup",
  "pixelGroup2", "pixelIndex", "pixels", "planted", "Provenance", "possERC",
  "speciesposition", "speciesGroup", "speciesInt", "state", "sumB",
  "temppixelGroup", "toDelete", "totalBiomass", "totalBiomass2", "totalCover",
  "uniqueCombo", "uniqueComboByRow", "uniqueComboByPixelIndex", "V1", "year"
))

#' Add cohorts to `cohortData` and `pixelGroupMap`
#'
#' This is a wrapper for `generatePixelGroups`, `initiateNewCohort` and updates to
#' `pixelGroupMap` via assignment to new `pixelIndex` values in `newPixelCohortData`.
#' By running these all together, there is less chance that they will diverge.
#' There are some checks internally for consistency.
#'
#' Does the following:
#' \enumerate{
#'   \item add *new cohort* (not survivor) data into `cohortData`;
#'   \item assign initial `B` and `age` for new cohort;
#'   \item assign the new `pixelGroup` to the pixels that have new cohort;
#'   \item update the `pixelGroup` map.
#' }
#' Note that if `newPixelCohortData` is generated after a disturbance
#'   it must contain a `type` column indicating the origin of the cohorts
#'   (e.g. "survivor", "serotiny", "resprouting"). "Survivor" cohorts will
#'   not be added to the output objects, as they are assumed to be accounted
#'   for in the input `cohortData` and, correspondingly, `pixelGroupMap`.
#'
#' @template newPixelCohortData
#' @template cohortData
#' @template pixelGroupMap
#' @template currentTime
#' @template speciesEcoregion
#' @template cohortDefinitionCols
#'
#' @param treedFirePixelTableSinceLastDisp A data.table with at least 2 columns, `pixelIndex`
#'   and `pixelGroup`.
#'   This will be used in conjunction with `cohortData` and `pixelGroupMap`
#'   to ensure that everything matches correctly.
#' @template successionTimestep
#' @template verbose
#' @template initialB
#' @template doAssertion
#'
#' @return
#' A list of length 2, `cohortData` and `pixelGroupMap`, with
#' `newPixelCohortData` inserted.
#'
#' @export
#' @rdname updateCohortData
updateCohortData <- function(newPixelCohortData, cohortData, pixelGroupMap, currentTime,
                             speciesEcoregion, treedFirePixelTableSinceLastDisp = NULL,
                             successionTimestep,
                             cohortDefinitionCols = c("pixelGroup", "age", "speciesCode"),
                             initialB = 10,
                             verbose = getOption("LandR.verbose", TRUE),
                             doAssertion = getOption("LandR.assertions", TRUE)) {
  maxPixelGroup <- as.integer(maxFn(pixelGroupMap))

  if (!is.null(treedFirePixelTableSinceLastDisp)) {
    ## only set to 0 the pixels that became empty (have no survivors)
    emptyPix <- setdiff(
      treedFirePixelTableSinceLastDisp$pixelIndex,
      newPixelCohortData[type == "survivor", pixelIndex]
    )
    pixelGroupMap[emptyPix] <- 0L
  }
  relevantPixels <- pixelGroupMap[][newPixelCohortData$pixelIndex]
  zeroOnPixelGroupMap <- relevantPixels == 0

  if (!"age" %in% colnames(newPixelCohortData)) {
    newPixelCohortData[, age := 1L]
  }

  if (all(zeroOnPixelGroupMap)) {
    # Deal with pixels on the map that have no pixelGroup -- these are burned
    # pixels --> the entirely newly regenerated pixels do not require a
    # re-pixelGroupMaping  -- can just add to existing pixelGroup values
    if (verbose > 0) {
      message(crayon::green(
        "  Regenerating only burnt pixels with no survivors (i.e. resprouting & serotiny)"
      ))
    }
    columnsForPG <- c("ecoregionGroup", "speciesCode", "age") ## no Biomass b/c they all have zero
    cd <- newPixelCohortData[, c("pixelIndex", columnsForPG), with = FALSE]
    newPixelCohortData[, pixelGroup := generatePixelGroups(cd,
                                                           maxPixelGroup = maxPixelGroup,
                                                           columns = columnsForPG
    )] # ,
    # successionTimestep = successionTimestep)

    # Remove the duplicated pixels within pixelGroup (i.e., 2+ species in the same pixel)
    pixelsToChange <- unique(newPixelCohortData[, c("pixelIndex", "pixelGroup")],
                             by = c("pixelIndex")
    )
  } else {
    # This is for situations where there are some empty pixels being filled,
    #   and some occupied pixels getting infilling. This requires a wholesale
    #   re-pixelGroup
    if (verbose > 0) {
      message(crayon::green("  Regenerating open and pixels with B (likely after seed dispersal, or partial mortality following disturbance)"))
    }

    pixelIndex <- which(pixelGroupMap[] %in% cohortData$pixelGroup)

    # remove unnecessary columns before making cohortDataLong
    suppressWarnings(set(cohortData, j = "prevMortality", value = NULL))
    suppressWarnings(set(newPixelCohortData, j = "year", value = NULL))

    cohortDataPixelIndex <- data.table(
      pixelIndex = pixelIndex,
      pixelGroup = pixelGroupMap[][pixelIndex]
    )
    cdLong <- cohortDataPixelIndex[cohortData, on = "pixelGroup", allow.cartesian = TRUE]
    if (!is.null(newPixelCohortData$type)) {
      ## survivor cohorts are assumed to be in cohort data already.
      newPixelCohortData <- newPixelCohortData[type != "survivor"]
    }
    cohorts <- rbindlist(list(cdLong, newPixelCohortData), use.names = TRUE, fill = TRUE)

    columnsForPG <- c("ecoregionGroup", "speciesCode", "age", "B")
    cd <- cohorts[, c("pixelIndex", columnsForPG), with = FALSE]
    cohorts[, pixelGroup := generatePixelGroups(cd, maxPixelGroup = 0L, columns = columnsForPG)]

    # Bring to pixelGroup level -- this will squash the data.table
    if (is.null(cohorts[["sumB"]])) {
      cohorts[, sumB := sum(B, na.rm = TRUE), by = pixelGroup]
    }
    # Old way that does not preserve additional 'unknown' columns in cohortData
    # allCohortData <- cohorts[ , .(ecoregionGroup = ecoregionGroup[1],
    #                               mortality = mortality[1],
    #                               aNPPAct = aNPPAct[1],
    #                               sumB = sumB[1]),
    #                           by = uniqueCohortDefinition]

    # newer way that will potentially conflict with LandR.CS due to differing aNPP
    colsToSubset <- setdiff(colnames(cohortData), c("pixelIndex"))
    allCohortData <- cohorts[!duplicated(cohorts[, .(pixelGroup, speciesCode, ecoregionGroup, age)]),
                             ..colsToSubset]

    theNewOnes <- is.na(allCohortData$B)
    cohortData <- allCohortData[!theNewOnes]
    newPixelCohortData <- allCohortData[theNewOnes]

    # Remove the duplicated pixels within pixelGroup (i.e., 2+ species in the same pixel)
    pixelsToChange <- unique(cohorts[, c("pixelIndex", "pixelGroup")], by = c("pixelIndex"))
  }

  # update pixelGroupMap
  pixelGroupMap[pixelsToChange$pixelIndex] <- pixelsToChange$pixelGroup

  if (doAssertion) {
    if (!isTRUE(all(pixelsToChange$pixelGroup == pixelGroupMap[][pixelsToChange$pixelIndex]))) {
      stop("pixelGroupMap and newPixelCohortData$pixelGroupMap don't match in updateCohortData fn")
    }
  }

  ## give B in pixels that have serotiny/resprouting
  # newPixelCohortData[, sumB := sum(B, na.rm = TRUE), by = pixelGroup]

  ##########################################################
  # Add new cohorts and rm missing cohorts (i.e., those pixelGroups that are gone)
  ##########################################################
  cohortData <- .initiateNewCohorts(newPixelCohortData, cohortData,
                                    pixelGroupMap,
                                    currentTime = currentTime,
                                    speciesEcoregion = speciesEcoregion,
                                    successionTimestep = successionTimestep,
                                    initialB = initialB
  )

  outs <- rmMissingCohorts(cohortData, pixelGroupMap, cohortDefinitionCols = cohortDefinitionCols)

  if (!is.null(outs$cohortData$sumB)) {
    outs$cohortData[, sumB := NULL]
  }

  assertCohortData(outs$cohortData, outs$pixelGroupMap,
                   cohortDefinitionCols = cohortDefinitionCols,
                   doAssertion = doAssertion, verbose = verbose)

  if (doAssertion) {
    maxPixelGroupFromCohortData <- max(outs$cohortData$pixelGroup)
    maxPixelGroup <- as.integer(maxFn(outs$pixelGroupMap))
    test1 <- (!identical(maxPixelGroup, maxPixelGroupFromCohortData))
    if (test1) {
      stop(
        "The sim$pixelGroupMap and cohortData have unmatching pixelGroup.",
        " They must be matching.",
        " If this occurs, please contact the module developers"
      )
    }
  }

  if (verbose > 0) {
    nPixForest <- sum(!is.na(outs$pixelGroupMap[]))
    nPixGrps <- length(unique(outs$cohortData$pixelGroup))
    nPixNoPixGrp <- sum(outs$pixelGroupMap[] == 0, na.rm = TRUE)
    nPixTreed <- sum(outs$pixelGroupMap[] != 0, na.rm = TRUE)

    nDigits <- max(nchar(c(nPixForest, nPixGrps, nPixNoPixGrp))) + 3
    message(crayon::magenta(
      "NUMBER OF FORESTED PIXELS          :",
      paddedFloatToChar(nPixForest, padL = nDigits, pad = " ")
    ))
    message(crayon::magenta(
      "NUMBER OF PIXELS WITH TREES        :",
      paddedFloatToChar(nPixTreed, padL = nDigits, pad = " ")
    ))
    message(crayon::magenta(
      "NUMBER OF UNIQUE PIXELGROUPS       :",
      paddedFloatToChar(nPixGrps, padL = nDigits, pad = " ")
    ))
    message(crayon::magenta(
      "NUMBER OF PIXELS WITH NO PIXELGROUP:",
      paddedFloatToChar(nPixNoPixGrp, padL = nDigits, pad = " ")
    ))
  }

  return(list(
    cohortData = outs$cohortData,
    pixelGroupMap = outs$pixelGroupMap
  ))
}

#' Initiate new cohorts
#'
#' Calculate new values for `B`, add `age`, then `rbindlist` this
#' with `cohortData`.
#'
#' @template newPixelCohortData
#' @template cohortData
#' @template cohortDefinitionCols
#' @template initialB
#'
#' @return A `data.table` with a new `rbindlist`ed `cohortData`
#'
#' @rdname updateCohortData
.initiateNewCohorts <- function(newPixelCohortData, cohortData, pixelGroupMap, currentTime,
                                cohortDefinitionCols = c("pixelGroup", "speciesCode", "age"),
                                speciesEcoregion, successionTimestep, initialB = 10) {

  ## get spp "productivity traits" per ecoregion/present year
  ## calculate maximum B per ecoregion, join to new cohort data
  namesNCD <- names(newPixelCohortData)
  if (!isTRUE("pixelGroup" %in% namesNCD)) {
    if (isTRUE("pixelIndex" %in% namesNCD)) {
      newPixelCohortData[, pixelGroup := as.vector(pixelGroupMap[])[pixelIndex]]
    } else {
      stop("newPixelCohortData must have either pixelIndex or pixelGroup")
    }
  }

  ## simplifying to pixelGroup again
  if (!is.null(newPixelCohortData[["pixelIndex"]])) {
    set(newPixelCohortData, NULL, "pixelIndex", NULL)
  }
  newPixelCohortData <- newPixelCohortData[!duplicated(newPixelCohortData), ] ## faster than unique

  specieseco_current <- speciesEcoregionLatestYear(speciesEcoregion, currentTime)
  specieseco_current <- setkey(
    specieseco_current[, .(speciesCode, maxANPP, maxB, ecoregionGroup)],
    speciesCode, ecoregionGroup
  )

  ## Note that after the following join, some cohorts will be lost due to lack of
  ##  parameters in speciesEcoregion. These need to be modified in pixelGroupMap.
  # missingNewPixelCohortData <- newPixelCohortData[!specieseco_current,
  #                                                 on = uniqueSpeciesEcoregionDefinition]
  specieseco_current <- specieseco_current[!is.na(maxB)]
  specieseco_current[, maxB_eco := max(maxB), by = ecoregionGroup]
  newPixelCohortData <- specieseco_current[newPixelCohortData,
                                           on = uniqueSpeciesEcoregionDefinition]
  newPixelCohortData <- newPixelCohortData[!is.na(maxB)]

  if (any(newPixelCohortData$age > 1)) {
    stop("newPixelCohortData should only have new cohorts aged 1")
  }
  set(newPixelCohortData, NULL, "age", 1L) ## set age to 1

  ## Ceres: this was causing new cohorts to be initialized with maxANPP.
  ## instead, remove column and calculate total biomass of older cohorts in cohortData
  # set(newPixelCohortData, NULL, "sumB", 0L)
  suppressWarnings(set(newPixelCohortData, j = "sumB", value = NULL))

  cohortData[age >= successionTimestep, oldSumB := sum(B, na.rm = TRUE), by = "pixelGroup"]

  newPixelCohortData <- unique(cohortData[, .(pixelGroup, oldSumB)],
                               by = "pixelGroup"
  )[newPixelCohortData, on = "pixelGroup"]
  ## using set() is faster than [:=]
  set(newPixelCohortData, which(is.na(newPixelCohortData$oldSumB)), "oldSumB", 0)
  setnames(newPixelCohortData, "oldSumB", "sumB")
  set(cohortData, NULL, "oldSumB", NULL)

  if ("B" %in% names(newPixelCohortData)) {
    newPixelCohortData[, B := NULL]
  }

  if (isTRUE(is.na(initialB)) || is.null(initialB)) {
    set(
      newPixelCohortData, NULL, "B",
      asInteger(pmax(1, newPixelCohortData$maxANPP *
                       exp(-1.6 * newPixelCohortData$sumB / newPixelCohortData$maxB_eco)))
    )
    set(newPixelCohortData, NULL, "B",
        asInteger(pmin(newPixelCohortData$maxANPP, newPixelCohortData$B)))
  } else {
    ## 2022-02: change to 10 - maxANPP is unrealistic, particularly with
    ##          high maxANPP needed to produce realistic growth curves
    set(newPixelCohortData, NULL, "B", asInteger(initialB))
  }

  newPixelCohortData <- newPixelCohortData[, .(pixelGroup, ecoregionGroup, speciesCode, age, B,
                                               mortality = 0L, aNPPAct = 0L)]

  if (getOption("LandR.assertions")) {
    if (isTRUE(NROW(unique(newPixelCohortData,
                           by = cohortDefinitionCols)) != NROW(newPixelCohortData))) {
      stop("Duplicated new cohorts in a pixelGroup. Please debug LandR:::.initiateNewCohorts")
    }
  }

  cohortData <- rbindlist(list(cohortData, newPixelCohortData), fill = TRUE, use.names = TRUE)
  # cohortData[, sumB := sum(B, na.rm = TRUE), by = "pixelGroup"]  ## recalculate sumB
  # if (!is.integer(cohortData[["sumB"]]))
  #   set(cohortData, NULL, "sumB", asInteger(cohortData[["sumB"]]))

  return(cohortData)
}

#' Remove missing cohorts from `cohortData` based on `pixelGroupMap`
#'
#' @template cohortData
#'
#' @template cohortDefinitionCols
#'
#' @template pixelGroupMap
#'
#' @template doAssertion
#'
#' @return
#' A `list` with 2 `data.table` objects, `cohortData` and `pixelGroupMap`,
#' each updated based on missing `pixelGroups` in the other.
#'
#' @export
rmMissingCohorts <- function(cohortData, pixelGroupMap,
                             cohortDefinitionCols = c("pixelGroup", "age", "speciesCode"),
                             doAssertion = getOption("LandR.assertions", TRUE)) {
  pgmValues <- data.table(
    pixelGroup = as.vector(pixelGroupMap[]),
    pixelIndex = seq(ncell(pixelGroupMap))
  )

  pgmVals <- na.omit(pgmValues)
  pgmVals <- pgmVals[pixelGroup > 0]
  whPgsStillInCDGoneFromPGM <- !cohortData$pixelGroup %in% pgmVals$pixelGroup
  pgsStillInCDGoneFromPGM <- cohortData[whPgsStillInCDGoneFromPGM, ]
  whPgsStillInPGMGoneFromCD <- !pgmVals$pixelGroup %in% cohortData$pixelGroup
  pgsStillInPGMGoneFromCD <- pgmVals[whPgsStillInPGMGoneFromCD, ]

  # REMOVE lines in cohortData that are no longer in the pixelGroupMap
  cohortData <- cohortData[!pixelGroup %in% pgsStillInCDGoneFromPGM$pixelGroup]
  # REMOVE pixels in pixelGroupMap that are no longer in the cohortData
  pixelGroupMap[pgsStillInPGMGoneFromCD$pixelIndex] <- NA

  assertCohortData(cohortData, pixelGroupMap,
                   message = "rmMissingCohorts",
                   cohortDefinitionCols = cohortDefinitionCols,
                   doAssertion = doAssertion)

  if (NROW(unique(cohortData[pixelGroup == 67724]$ecoregionGroup)) > 1) stop()

  return(list(
    cohortData = cohortData,
    pixelGroupMap = pixelGroupMap
  ))
}

#' Add the correct `pixelGroups` to a `pixelDataTable` object
#'
#' Generates unique groupings of a `data.table` object where one or more rows can
#' all belong to the same `pixelIndex`. Pixel groups will be identical pixels based
#' on unique combinations of `columns`.
#'
#' @param pixelDataTable  A `data.table` with column-based descriptions.
#'   Must have a column called `pixelIndex`, which allows for multiple rows to be associated
#'   with a single pixel.
#' @param maxPixelGroup A length 1 numeric indicating the current maximum `pixelGroup` value;
#'    the `pixelGroup` numbers returned will start at `maxPixelGroup + 1`.
#' @param columns A character vector of column names to use as part of the generation of unique
#'   combinations of features. Default is `c("ecoregionGroup", "speciesCode", "age", "B")`
#'
#' @return
#' Returns a vector of `pixelGroup` in the original order of the input `pixelDataTable`.
#' This should likely be added to the `pixelDataTable` object immediately.
#'
#' @export
generatePixelGroups <- function(pixelDataTable, maxPixelGroup,
                                columns = c("ecoregionGroup", "speciesCode", "age", "B")) {
  columnsOrig <- columns
  columns <- columns[columns %in% names(pixelDataTable)]
  columns2 <- paste0(columns, "2")
  if (!all(columns == columnsOrig)) {
    message(
      "Creating pixelGroup values, but not using all columns requested. Only using, ",
      paste(columns, collapse = ", "), " instead of ", paste(columnsOrig, collapse = ", ")
    )
  }

  pcd <- pixelDataTable # no copy -- just for simpler name

  if (getOption("LandR.assertions")) {
    pcdOrig <- data.table::copy(pcd)
  }
  ## concatenate within rows:
  ## e.g., ecoregionCode_speciesCode_age_biomass or 647_11_Abie_sp_100_2000
  pcd[, uniqueComboByRow := do.call(paste, as.list(.SD)), .SDcols = columns]

  ## concatenate within pixelIndex
  pcd[, c("uniqueComboByPixelIndex") := paste(uniqueComboByRow, collapse = "__"), by = "pixelIndex"]
  pcd[, c("pixelGroup") := as.integer(maxPixelGroup) + as.integer(factor(uniqueComboByPixelIndex))]

  ## old algorithm:
  if (isTRUE(getOption("LandR.assertions"))) {
    ## prepare object 1 (pcd) for checking below
    pcd[, ord := seq_len(.N)]
    setorderv(pcd, c("pixelIndex"))
    uniqPG <- unique(pcd$pixelGroup)
    pcd[, pixelGroup2 := mapvalues2(pixelGroup, from = uniqPG, to = as.character(seq_along(uniqPG)))]
    # pcd[, pixelGroup2 := mapvalues(pixelGroup, from = unique(pixelGroup),
    #                                to = as.character(seq_along(unique(pixelGroup))))]
    setorderv(pcd, "ord")

    pcdOld <- data.table::copy(pcdOrig)

    # Convert to unique numeric
    pcdOld[, c(columns2) := lapply(.SD, function(x) {
      a <- as.integer(factor(x))
    }), .SDcols = columns]

    ## concatenate within rows:
    ## e.g., ecoregionCode_speciesCode_age_biomass or 647_11_Abie_sp_100_2000
    pcdOld[, uniqueComboByRow := as.integer(factor(do.call(paste, as.list(.SD)))),
           .SDcols = columns2
    ]

    ## concatenate within pixelIndex
    pcdOld[, c("uniqueComboByPixelIndex") := paste(uniqueComboByRow, collapse = "__"),
           by = "pixelIndex"]
    pcdOld[, c("pixelGroup") := as.integer(maxPixelGroup) +
             as.integer(factor(uniqueComboByPixelIndex))]
    ## prepare object 2 (pcdOld) for checking below
    pcdOld[, ord := seq_len(.N)]
    setorderv(pcdOld, c("pixelIndex"))

    uniqPG <- unique(pcdOld$pixelGroup)
    pcdOld[, pixelGroup2 := mapvalues2(pixelGroup,
                                       from = uniqPG,
                                       to = as.character(seq_along(uniqPG)))]

    setorderv(pcdOld, "ord")

    ## the check
    if (!identical(pcdOld$pixelGroup2, pcd$pixelGroup2)) {
      stop("new generatePixelGroups algorithm failing")
    }
  }

  return(pcd$pixelGroup)
}

#' Pull out the values from `speciesEcoregion` table for current time
#'
#' @template speciesEcoregion
#' @template currentTime
#'
#' @return
#' The `speciesEcoregion` input object, but with data from only one year, the year
#' that is less than or equal to the `currentTime`
#'
#' @export
speciesEcoregionLatestYear <- function(speciesEcoregion, currentTime) {
  spEco <- speciesEcoregion[year <= currentTime]
  spEco[year == max(year)]
}

#' @keywords internal
.ageRndUpSuccessionTimestep <- function(age, successionTimestep) {
  as.integer(ceiling(as.numeric(age) / successionTimestep) * successionTimestep)
}

#' The columns in a `cohortData` that define "unique"
#'
#' If two pixels have identical values in all of these columns, they are the same `pixelGroup`.
#'
#' @export
#' @rdname uniqueDefinitions
uniqueCohortDefinition <- c("pixelGroup", "speciesCode", "age", "B")

#' @export
#' @rdname uniqueDefinitions
uniqueSpeciesEcoregionDefinition <- c("speciesCode", "ecoregionGroup")

#' Summary for `cohortData`
#'
#' @template cohortData
#'
#' @export
describeCohortData <- function(cohortData) {
  vals <- c("B", "totalBiomass", "age", "cover")
  names(vals) <- vals
  out <- lapply(vals, function(val) {
    .cohortMessages(cohortData, val)
  })
  message(magenta("Pixels with non-NA cover:, ", cohortData[!is.na(cover), length(unique(pixelIndex))]))
}

#' @keywords internal
.cohortMessages <- function(cohortData, val) {
  out <- list()
  if (val %in% colnames(cohortData)) {
    pixelsNA <- NROW(cohortData[is.na(get(val)), unique("pixelIndex"), with = FALSE])
    message(magenta("Pixels with missing", val, ":", format(pixelsNA, big.mark = ",")))
    pixelsZero <- NROW(cohortData[, all(get(val) == 0), by = "pixelIndex"][get("V1") == TRUE])
    message(magenta("Pixels with all(", val, " == 0): ", format(pixelsZero, big.mark = ",")))
    pixelsBiomassNonZero <- NROW(cohortData[, any(get(val) > 0), by = "pixelIndex"][get("V1") == TRUE])
    message(magenta("Pixels with all(", val, " > 0): ", format(pixelsBiomassNonZero, big.mark = ",")))
    out <- list(pixelsNA = pixelsNA, pixelsZero = pixelsZero, pixelsBiomassNonZero = pixelsBiomassNonZero)
  }
  return(invisible(out))
}

#' Convert Land Cover Classes (LCC) to another value in its neighbourhood
#'
#' This will search around the pixels on `rstLCC` that have
#' `classesToReplace`, and search in iteratively increasing
#' radii outwards for other Land Cover Classes than the those indicated in
#' `classesToReplace`. This will constrain
#' It will then take the cohorts that were in pixels with `classesToReplace`
#' and assign them new values in the output object. This function will
#' also check that it must be an `ecoregionCode` that already exists in
#' `cohortData`, i.e., not create new `ecoregionCode` values. See Details.
#'
#' @details
#' This function is designed to be used in highly constrained situations, where it is not
#' just replacing a Land Cover Class by a neighbouring Land Cover Class. But it can
#' be used for the simpler cases of simply replacing a Land Cover Class.
#'
#' @param pixelClassesToReplace Deprecated. Use `classesToReplace`
#'
#' @param classesToReplace Integer vector of classes that are are to be replaced,
#'     e.g., 34, 35, 36 on LCC2005, which are burned young, burned 10 year, and cities.
#'
#' @param rstLCC LCC raster, e.g., LCC2005
#'
#' @param theUnwantedPixels An optional vector of pixel IDs that need to be changed.
#'   If not provided, then pixels to change will be taken from the match between
#'   `availableERC_by_Sp` and `classesToReplace`. Supplying this allows
#'   the user to only replace some of the pixels with a given class.
#'
#' @param ecoregionGroupVec Deprecated. Use `availableERC_by_Sp`
#'
#' @param speciesEcoregion Deprecated. Use `availableERC_by_Sp`
#'
#' @param availableERC_by_Sp A `data.table` or `data.frame` with 3 columns:
#'   `speciesCode`, `initialEcoregionCode` and `pixelIndex`.
#'   `pixelIndex` is the pixel id for each line in the `data.table`;
#'   `speciesCode` is the species name in the pixel (can have more than one
#'     species per pixel, so multiple rows per pixel); and,
#'   `initialEcoregionCode` is the unique codes that are "available" to be
#'     used as a replacement for `classesToReplace`. `initialEcoregionCode`
#'     must be a character vector, with one or no "_" used as a separator, with the last
#'     component being the Land Cover Class that matches `classesToReplace`, e.g.,
#'     `"242_18"`. If there is no "_" in this code, then the codes must match the
#'     `classesToReplace` exactly, e.g., `"11"`.
#'     If `pixelIndex` is missing, the function will fill it
#'     with `seq(ncell(rstLCC))`. If `speciesCode` is missing, the function
#'     will replace it with a dummy value (`"allSpecies"`).
#'
#' @template doAssertion
#'
#' @return
#' A `data.table` with two columns, `pixelIndex` and `ecoregionGroup`.
#' This represents the new codes to used in the `pixelIndex` locations.
#' These should have no values overlapping with `classesToReplace`.
#'
#' @author Eliot McIntire
#' @export
convertUnwantedLCC <- function(classesToReplace = 34:36, rstLCC,
                               availableERC_by_Sp, theUnwantedPixels,
                               ecoregionGroupVec, speciesEcoregion, pixelClassesToReplace,
                               doAssertion = getOption("LandR.assertions", TRUE)) {
  if (!missing(pixelClassesToReplace)) {
    stop("pixelClassesToReplace is deprecated. Please use classesToReplace")
  }
  if (!missing(ecoregionGroupVec)) {
    stop("ecoregionGroupVec is deprecated. Please use availableERC_by_Sp")
  }
  if (!missing(speciesEcoregion)) {
    stop("speciesEcoregion is deprecated. Please use availableERC_by_Sp")
  }

  if (!is.data.table(availableERC_by_Sp)) {
    if (is.data.frame(availableERC_by_Sp)) {
      availableERC_by_Sp <- as.data.table(availableERC_by_Sp)
    } else {
      stop("availableERC_by_Sp must be a data.table or data.frame")
    }
  }

  if (all(grepl("_", availableERC_by_Sp$initialEcoregionCode))) {
    hasPreDash <- TRUE
    availableERC_by_Sp[, ecoregion := gsub("_.*", "", initialEcoregionCode)]
  } else {
    hasPreDash <- FALSE
  }

  genericSpeciesName <- "allSpecies"
  if (!"speciesCode" %in% names(availableERC_by_Sp)) {
    availableERC_by_Sp[, speciesCode := genericSpeciesName]
  }
  if (!"pixelIndex" %in% names(availableERC_by_Sp)) {
    availableERC_by_Sp[, pixelIndex := seq(ncell(rstLCC))]
  }
  if (!is.character(availableERC_by_Sp$initialEcoregionCode) && hasPreDash) {
    availableERC_by_Sp[, initialEcoregionCode := as.character(initialEcoregionCode)]
  }

  # if (!all(grepl("_", availableERC_by_Sp$initialEcoregionCode))) {
  #   numChar <- max(nchar(availableERC_by_Sp$initialEcoregionCode), na.rm = TRUE)
  #   availableERC_by_Sp[!is.na(initialEcoregionCode),
  #                      initialEcoregionCode := paste0("1_", paddedFloatToChar(as.integer(initialEcoregionCode), numChar))]
  # }

  if (missing(theUnwantedPixels)) {
    theUnwantedRows <- gsub(".*_", "", availableERC_by_Sp$initialEcoregionCode) %in%
      as.character(classesToReplace)
    theUnwantedPixels <- sort(unique(availableERC_by_Sp[theUnwantedRows, "pixelIndex"])[[1]])
  }

  if (doAssertion) {
    #  stop("values of 34 and 35 on pixelCohortData and sim$LCC2005 don't match")
  }
  iterations <- 1
  # remove the lines that have the code "classesToReplace"
  availableERG2 <- if (hasPreDash) {
    availableERC_by_Sp[-which(gsub(".*_", "", initialEcoregionCode) %in% classesToReplace)]
  } else {
    availableERC_by_Sp[-which(initialEcoregionCode %in% classesToReplace)]
  }

  availableERG2 <- unique(availableERG2, by = c("speciesCode", "initialEcoregionCode"))
  if (doAssertion) {
    if (any(gsub(".*_", "", availableERG2$initialEcoregionCode) %in% classesToReplace)) {
      stop("classesToReplace are still considered 'available' forest classes")
    }
  }

  availableERG2[, `:=`(pixelIndex = NULL)]

  numCharIEC <- max(nchar(availableERC_by_Sp$initialEcoregionCode), na.rm = TRUE)
  if (hasPreDash) {
    numCharEcoregion <- if (nrow(availableERG2) > 0) {
      max(nchar(availableERG2$ecoregion), na.rm = TRUE)
    } else {
      ## above gives -Inf when no classes to replace, so use zero instead
      0
    }
    numCharLCCCodes <- numCharIEC - numCharEcoregion - 1 # minus 1 for the dash
    availableERG2[, `:=`(ecoregion = NULL)]
  } else {
    numCharLCCCodes <- numCharIEC
  }

  currentLenUnwantedPixels <- length(theUnwantedPixels)
  repeatsOnSameUnwanted <- 0

  while (length(theUnwantedPixels) > 0) {
    message("Converting unwanted LCCs: ", length(theUnwantedPixels), " pixels remaining.")
    out <- spread2(rstLCC,
                   start = theUnwantedPixels, asRaster = FALSE,
                   iterations = iterations, allowOverlap = TRUE, spreadProb = 1
    )
    out <- out[initialPixels != pixels] # rm pixels which are same as initialPixels --> these are known wrong
    iterations <- iterations + 1
    out[, lcc := as.vector(rstLCC[])[pixels]]
    out[lcc %in% c(classesToReplace), lcc := NA]
    out <- na.omit(out)
    out5 <- availableERC_by_Sp[out[, state := NULL],
                               allow.cartesian = TRUE,
                               on = c("pixelIndex" = "initialPixels"), nomatch = NA
    ] # join the availableERC_by_Sp which has initialEcoregionCode

    if (hasPreDash) {
      out5[, possERC := paste0(
        ecoregion, "_",
        paddedFloatToChar(as.integer(lcc), padL = numCharLCCCodes, padR = 0)
      )]
    } else {
      out5[, possERC := lcc]
    }
    out7 <- out5[availableERG2, on = c("speciesCode", "possERC" = "initialEcoregionCode"), nomatch = NA]
    out6 <- na.omit(out7)

    # These ones are missing at least something in the new possERC
    possERCToRm <- out5[!availableERG2, on = c("speciesCode", "possERC" = "initialEcoregionCode")]
    out6 <- out6[!possERC %in% unique(possERCToRm$possERC)]

    # sanity check -- don't let an infinite loop
    if (currentLenUnwantedPixels == length(theUnwantedPixels)) {
      repeatsOnSameUnwanted <- repeatsOnSameUnwanted + 1
    } else {
      currentLenUnwantedPixels <- length(theUnwantedPixels)
      repeatsOnSameUnwanted <- 0
    }

    if (repeatsOnSameUnwanted > 5) {
      out2 <- data.table(newPossLCC = NA, pixelIndex = theUnwantedPixels, ecoregionGroup = NA)
      message(
        "  removing ", NROW(theUnwantedPixels), " pixel of class ",
        paste(rstLCC[theUnwantedPixels], collapse = ", "), " because couldn't",
        " find a suitable replacement"
      )
      pixelsToNA <- theUnwantedPixels
      theUnwantedPixels <- integer()
    }

    if (NROW(out6) > 0) {
      ## take random sample of available, weighted by abundance
      rowsToKeep <- out6[, list(keep = .resample(.I, 1)), by = c("pixelIndex")]
      out8 <- out6[rowsToKeep$keep]
      out2 <- out8[, list(newPossLCC = lcc, pixelIndex)]
      if (hasPreDash) {
        out2[, initialEcoregion := substr(out8[, initialEcoregionCode], 1, numCharEcoregion)]
        out2[, ecoregionGroup := paste0(
          initialEcoregion, "_",
          paddedFloatToChar(as.integer(newPossLCC), padL = 2, padR = 0)
        )] # nolint
        out2[, initialEcoregion := NULL]
      } else {
        out2[, ecoregionGroup := as.integer(newPossLCC)] # nolint
      }

      ## remove combinations of ecoregionGroup and speciesCode that don't exist
      ## -- Now this excludes B = 0
      keepPixels <- unique(out2$pixelIndex)
      theUnwantedPixels <- theUnwantedPixels[!theUnwantedPixels %in% keepPixels]
      out2 <- unique(out2)

      if (!exists("out3")) {
        out3 <- out2
      } else {
        out3 <- rbindlist(list(out2, out3))
      }
    }
  }

  if (!exists("out3")) {
    out3 <- data.table(pixelIndex = NA, ecoregionGroup = NA)[!is.na(pixelIndex)]
  } else {
    # setnames(out3, c("initialPixels", "initialEcoregionCode"), c("pixelIndex", "ecoregionGroup"))
    out3[, `:=`(newPossLCC = NULL)]
    # out3 <- unique(out3, by = c("pixelIndex", "ecoregionGroup"))
    out3 <- unique(out3)
  }

  if (exists("pixelsToNA")) {
    ## make sure these pixels get an NA ecoregion by rm them in case they are present
    if (any(out3$pixelIndex %in% pixelsToNA)) {
      out3 <- out3[!pixelIndex %in% pixelsToNA]
    }
    out3 <- rbind(out3, data.table(pixelIndex = pixelsToNA, ecoregionGroup = NA))
  }

  if (doAssertion) {
    if (any(gsub(".*_", "", out3$ecoregionGroup) %in% classesToReplace)) {
      stop("classesToReplace we're not fully removed")
    }
  }

  out3
}


#' Assess non-forested pixels based on species cover data and land-cover
#'
#' @template speciesLayers
#'
#' @param omitNonTreedPixels logical. Should pixels with classes in `forestedLCCClasses` be
#'                           included as non-forested?
#'
#' @param forestedLCCClasses vector of non-forested land-cover classes in `rstLCC`
#'
#' @template rstLCC
#'
#' @return a logical vector of length `ncell(rstLCC)`
#'   where `TRUE` indicates non-forested pixels where there is no
#'   species cover data, or a non-forested land-cover class
#'
#' @export
nonForestedPixels <- function(speciesLayers, omitNonTreedPixels, forestedLCCClasses,
                              rstLCC) {
  # pixelsToRm <- rowSums(!is.na(sim$speciesLayers[])) == 0 # keep
  pixelsToRm <- is.na(as.vector(speciesLayers[[1]][])) # seems to be OK because seem to be NA on each layer for a given pixel

  ## remove non-forested if asked by user
  if (omitNonTreedPixels) {
    if (is.null(forestedLCCClasses))
      stop("No P(sim)$forestedLCCClasses provided, but P(sim)$omitNonTreedPixels is TRUE.
             \nPlease provide a vector of forested classes in P(sim)$forestedLCCClasses")
    lccPixelsRemoveTF <- !(as.vector(rstLCC[]) %in% forestedLCCClasses)
    pixelsToRm <- lccPixelsRemoveTF | pixelsToRm
  }
  pixelsToRm
}

#' Generate template `cohortData` table
#'
#' Internal function used by [makeAndCleanInitialCohortData()].
#'
#' @param inputDataTable A `data.table` with columns described above.
#'
#' @param sppColumns A vector of the names of the columns in `inputDataTable` that
#'   represent percent cover by species, rescaled to sum up to 100%%.
#'
#' @param minCoverThreshold minimum total cover percentage necessary to consider the pixel
#'  vegetated, or a cohort present in a pixel.
#'
#' @template doAssertion
#'
#' @param rescale Logical. If `TRUE`, the default, cover for each species will be rescaled
#'   so all cover in `pixelGroup` or pixel sums to 100.
#' @return `cohortData` (`data.table`) with attribute `"imputedPixID"`
#'
#' @keywords internal
.createCohortData <- function(inputDataTable, # pixelGroupBiomassClass,
                              sppColumns,
                              minCoverThreshold = 5,
                              doAssertion = getOption("LandR.assertions", TRUE), rescale = TRUE) {
  newCoverColNames <- gsub("cover\\.", "", sppColumns)
  setnames(inputDataTable, old = sppColumns, new = newCoverColNames)
  message(blue("Create initial cohortData object, with no pixelGroups yet"))
  message(green("-- Begin reconciling data inconsistencies"))

  imputedPixID <- integer(0)

  inputDataTable[, totalCover := rowSums(.SD), .SDcols = newCoverColNames]
  whEnoughCover <- inputDataTable$totalCover > minCoverThreshold
  message(green(
    "  -- Removing all pixels with totalCover <= minCoverThreshold (affects",
    sum(!whEnoughCover),
    "of", NROW(inputDataTable), "pixels)"
  ))
  message(green("     --> resulting in", sum(whEnoughCover), "pixels)"))
  inputDataTable <- inputDataTable[whEnoughCover]

  whAgeEqZero <- which(inputDataTable$age == 0)

  if (!is.null(inputDataTable[["totalBiomass"]])) {
    message(green(
      "  -- Setting TotalBiomass in pixel to 0 where age == 0 (affects", length(whAgeEqZero),
      "of", NROW(inputDataTable), "pixels)"
    ))
    message(green("     --> keeping ", NROW(inputDataTable), "pixels)"))
    ## correct B in a separate column to keep track of imputed pixels, then replace column
    inputDataTable[, `:=`(totalBiomass2 = totalBiomass)]
    inputDataTable[whAgeEqZero, `:=`(totalBiomass2 = 0)]
    imputedPixID <- inputDataTable[totalBiomass != totalBiomass2, pixelIndex]
    inputDataTable[, totalBiomass := totalBiomass2]
    inputDataTable[, totalBiomass2 := NULL]

    whTotalBEqZero <- which(inputDataTable$totalBiomass == 0)
    message(green(
      "  -- Setting age in pixel to 0 where totalBiomass == 0 (affects", length(whTotalBEqZero),
      "of", NROW(inputDataTable), "pixels)"
    ))
    message(green("     --> keeping ", NROW(inputDataTable), "pixels)"))
    ## correct age in a separate column to keep track of imputed pixels, then replace column
    inputDataTable[, `:=`(age2 = age)]
    inputDataTable[whTotalBEqZero, `:=`(age2 = 0)]
    imputedPixID <- c(imputedPixID, inputDataTable[age != age2, pixelIndex])
    inputDataTable[, age := age2]
    inputDataTable[, age2 := NULL]
  }

  cohortData <- data.table::melt(inputDataTable,
                                 value.name = "cover",
                                 measure.vars = newCoverColNames,
                                 variable.name = "speciesCode"
  )

  # Remove all cover <= minCoverThreshold
  whCoverGTMinCover <- which(cohortData$cover > minCoverThreshold)
  message(green(
    "  -- Removing all cohorts with cover <= minCoverThreshold (affects", NROW(cohortData) - length(whCoverGTMinCover),
    "of", NROW(cohortData), "cohorts"
  ))
  message(green("     --> resulting in", length(whCoverGTMinCover), "cohorts)"))
  cohortData <- cohortData[whCoverGTMinCover]
  message(green("     --> resulting in", length(unique(cohortData$pixelIndex)), "pixels)"))

  cohortData[, coverOrig := cover]
  if (isTRUE(doAssertion)) {
    if (any(duplicated(cohortData))) {
      warning(".createCohortData: cohortData contains duplicate rows.")
    }
  }

  # if (doAssertion)
  # describeCohortData(cohortData)
  # message(green("  -- Assign B = 0 and age = 0 for pixels where cover = 0,\n",
  #             "because cover is most reliable dataset"))

  # hasCover0 <- which(cohortData[["cover"]] == 0)
  cncd <- colnames(cohortData)
  # if (any(c("age", "logAge") %in% cncd)) {
  #   set(cohortData, hasCover0, "age", 0L)
  #   set(cohortData, hasCover0, "logAge", .logFloor(0))
  # }
  if (any(c("B", "totalBiomass") %in% cncd)) {
    # set(cohortData, hasCover0, "B", 0)
    # message(green("  -- Assign totalBiomass = 0 if sum(cover) = 0 in a pixel, ",
    #             "  because cover is most reliable dataset"))
    # cohortData <- cohortData[, sum(cover) == 0, by = "pixelIndex"][V1 == TRUE][
    #  cohortData, on = "pixelIndex"][V1 == TRUE, totalBiomass := 0L]
    # cohortData[, V1 := NULL]
  }

  ## CRAZY TODO: DIVIDE THE COVER BY 2 for DECIDUOUS -- will only affect mixed stands
  # message(crayon::green(paste("POSSIBLE ALERT:",
  #                            "assume deciduous cover is 1/2 the conversion to B as conifer")))
  # cohortData[speciesCode == "Popu_sp", cover := asInteger(cover / 2)]

  # Change temporarily to numeric for following calculation
  set(cohortData, NULL, "cover", as.numeric(cohortData[["cover"]]))
  if (isTRUE(rescale)) {
    cohortData[, totalCover := sum(cover), by = "pixelIndex"]
    cohortData[, cover := cover / totalCover * 100]
  }

  # cohortData[ , cover := {
  #   sumCover <- sum(cover)
  #   if (sumCover > 100) {
  #     cover <- cover / (sumCover + 0.0001) * 100L
  #   }
  #   cover
  # }, by = "pixelIndex"]

  # set(cohortData, NULL, "cover", asInteger(cohortData[["cover"]]))

  if (FALSE) {
    if (any(c("B", "totalBiomass") %in% cncd)) {
      # Biomass -- by cohort (NOTE: divide by 100 because cover is percent)
      # set(cohortData, NULL, "B", as.numeric(cohortData[["B"]]))
      set(cohortData, NULL, "B", cohortData[["totalBiomass"]] * cohortData[["cover"]] / 100)
      message(green("  -- Divide total B of each pixel by the relative cover of the cohorts"))

      # cohortData[ , B := mean(totalBiomass) * cover / 100, by = "pixelIndex"]
      # message(blue("Round B to nearest P(sim)$pixelGroupBiomassClass"))
      # cohortData[ , B := ceiling(B / pixelGroupBiomassClass) * pixelGroupBiomassClass]

      message(green("Set B to 0 where cover > 0 and age = 0, because B is least quality dataset"))
      cohortData[cover > 0 & age == 0, B := 0L]
      cohortData[, totalBiomass := asInteger(totalBiomass)]
      set(cohortData, NULL, "B", asInteger(cohortData[["B"]]))
    }
  }

  # clean up
  set(cohortData, NULL, c("totalCover", "coverOrig"), NULL)
  setattr(cohortData, "imputedPixID", imputedPixID)
  return(cohortData)
}

#' Generate initial `cohortData` table
#'
#' Takes a single `data.table` input, which has the following columns in addition to
#' others that will be labelled with species name, and contain percent cover of each:
#'
#' \itemize{
#'   \item `pixelIndex` (integer)
#'   \item `age` (integer)
#'   \item `logAge` (numeric)
#'   \item `initialEcoregionCode` (factor)
#'   \item `totalBiomass` (integer)
#'   \item `lcc` (integer)
#'   \item `rasterToMatch` (integer)
#'   \item `speciesCode` (factor)
#'   \item `cover` (integer)
#'   \item `coverOrig` (integer)
#'   \item `B` (integer)
#' }
#'
#' Several data correction/imputation operations are also performed. Namely, age is imputed
#' in pixels where age data is missing (but not cover) and where `cover == 0` but `age > 0`,
#' total biomass is zeroed if `age == 0`, and age is zeroed if `biomass == 0`.
#'
#' @param inputDataTable A `data.table` with columns described above.
#'
#' @param sppColumns A vector of the names of the columns in `inputDataTable` that
#'   represent percent cover by species, rescaled to sum up to 100%%.
#'
#' @param imputeBadAgeModel statistical model used to impute ages in pixels with missing
#'  data or with cover == 0. If set to NULL no imputation will be attempted, and pixels with
#'  missing age are excluded.
#'
#' @param minCoverThreshold minimum total cover percentage necessary to consider the pixel
#'  vegetated, or a cohort present in a pixel.
#'
#' @template doAssertion
#'
#' @param doSubset Turns on/off subsetting. Defaults to `TRUE`.
#'
#' @return a `cohortData` `data.table` with attribute `"imputedPixID"`
#'     (a vector of pixel IDs that suffered imputation).
#'
#' @author Eliot McIntire
#' @export
makeAndCleanInitialCohortData <- function(inputDataTable, sppColumns,
                                          # pixelGroupBiomassClass,
                                          # pixelGroupAgeClass = 1,
                                          imputeBadAgeModel = quote(lme4::lmer(age ~ B * speciesCode + cover * speciesCode + (1 | initialEcoregionCode))),
                                          minCoverThreshold,
                                          doAssertion = getOption("LandR.assertions", TRUE),
                                          doSubset = TRUE) {
  ### Create groupings
  if (doAssertion) {
    expectedColNames <- c(
      "age", "logAge", "initialEcoregionCode", "totalBiomass",
      "lcc", "pixelIndex"
    )
    if (!all(expectedColNames %in% colnames(inputDataTable))) {
      stop(
        "Column names for inputDataTable must include ",
        paste(expectedColNames, collapse = " ")
      )
    }
    if (!all(sppColumns %in% colnames(inputDataTable))) {
      stop("Species names are incorrect")
    }
    if (!all(unlist(lapply(
      inputDataTable[, sppColumns, with = FALSE],
      function(x) all(x >= 0 & x <= 100)
    )))) {
      stop(
        "Species columns are not percent cover between 0 and 100. This may",
        " be because they more NA values than the Land Cover raster"
      )
    }
  }

  ## .createCohortData deals with the following inconsistencies:
  ## 1. Pixels with total cover (across all species) < 5% are removed;
  ## 2. Pixels with 0 stand age, are assigned 0 stand biomass;
  ## 3. Pixels with 0 stand biomass, are assigned 0 stand age.
  ## 4. (after calcualting cohort B and age): if `cover > 0` and `age == 0`, `B` is set to 0
  cohortData <- Cache(.createCohortData,
                      inputDataTable = inputDataTable,
                      # pixelGroupBiomassClass = pixelGroupBiomassClass,
                      minCoverThreshold = minCoverThreshold,
                      sppColumns = sppColumns,
                      doAssertion = doAssertion
  )
  assertCohortDataAttr(cohortData, doAssertion = doAssertion)
  imputedPixID <- attr(cohortData, "imputedPixID")

  ######################################################
  # Impute missing ages on poor age dataset
  ######################################################
  # Cases:
  #  All species cover = 0 yet totalB > 0
  # (see other age inconsistencies solved above)

  cohortDataMissingAge <- cohortData[
    , hasBadAge :=
      # (age == 0 & cover > 0)#| # ok because cover can be >0 with biomass = 0
      (age > 0 & cover == 0) |
      is.na(age) #|
    # (B > 0 & age == 0) |
    # (B == 0 & age > 0)
  ][hasBadAge == TRUE] # , by = "pixelIndex"]

  if (NROW(cohortDataMissingAge) > 0) {
    if (!is.null(imputeBadAgeModel)) {
      cohortDataMissingAgeUnique <- unique(cohortDataMissingAge,
                                           by = c("initialEcoregionCode", "speciesCode")
      )[
        , .(initialEcoregionCode, speciesCode)
      ]
      cohortDataMissingAgeUnique <- cohortDataMissingAgeUnique[
        cohortData,
        on = c("initialEcoregionCode", "speciesCode"), nomatch = 0
      ]
      cohortDataMissingAgeUnique <- cohortDataMissingAgeUnique[!is.na(cohortDataMissingAgeUnique$age)]
      cohortDataMissingAgeUnique <- cohortDataMissingAgeUnique[, .(
        totalBiomass, age, speciesCode,
        initialEcoregionCode, cover
      )]
      zeros <- sapply(cohortDataMissingAgeUnique, function(x) sum(x == 0))
      if (sum(zeros, na.rm = TRUE)) {
        hasZeros <- zeros[zeros > 0]
        message(
          " ", paste(names(hasZeros), collapse = ", "), " had ",
          paste(hasZeros, collapse = ", "), " zeros, respectively"
        )
        warning(" These are being removed from the dataset. If this is not desired; please fix.")
        # terms <- strsplit(gsub(" ", "", as.character(imputeBadAgeModel)), split = "[[:punct:]]+")[[2]][-1] # remove response
        # terms <- unique(terms)
        # terms <- terms[terms %in% colnames(cohortDataMissingAgeUnique)]
        terms <- termsInData(imputeBadAgeModel, cohortDataMissingAgeUnique)
        lapply(terms, function(x) {
          cohortDataMissingAgeUnique <<- cohortDataMissingAgeUnique[get(x) != 0]
        })
      }
      cohortDataMissingAgeUnique <- subsetDT(cohortDataMissingAgeUnique,
                                             by = c("initialEcoregionCode", "speciesCode"),
                                             doSubset = doSubset
      )
      message(blue("Impute missing age values: started", Sys.time()))

      outAge <- Cache(statsModel,
                      modelFn = imputeBadAgeModel,
                      uniqueEcoregionGroups = .sortDotsUnderscoreFirst(
                        as.character(unique(cohortDataMissingAgeUnique$initialEcoregionCode))
                      ),
                      .specialData = cohortDataMissingAgeUnique,
                      omitArgs = ".specialData"
      )
      message(blue("                           completed", Sys.time()))

      # paste with capture.output keeps table structure intact
      messageDF(outAge$rsq, 3, "blue")

      ## allow.new.levels = TRUE because some groups will have only NA for age for all species
      cohortDataMissingAge[
        , imputedAge := pmax(0L, asInteger(predict(outAge$mod,
                                                   newdata = cohortDataMissingAge,
                                                   allow.new.levels = TRUE
        )))
      ]

      cohortData <- cohortDataMissingAge[, .(pixelIndex, imputedAge, speciesCode)][
        cohortData,
        on = c("pixelIndex", "speciesCode")
      ]
      cohortData[!is.na(imputedAge), `:=`(age = imputedAge, logAge = .logFloor(imputedAge))]
      imputedPixID <- c(imputedPixID, unique(cohortData[!is.na(imputedAge), pixelIndex]))
      cohortData[, `:=`(imputedAge = NULL)]
    } else {
      ## if not imputing bad ages, then exclude bad data entries.
      cohortData <- cohortData[!cohortDataMissingAge[, .(pixelIndex, speciesCode)], on = c("pixelIndex", "speciesCode")]
    }
  }
  # # Round ages to nearest pixelGroupAgeClass
  # set(cohortData, NULL, "age", asInteger(cohortData$age / pixelGroupAgeClass) *
  #       as.integer(pixelGroupAgeClass))

  cohortData[, `:=`(hasBadAge = NULL)]

  # #######################################################
  # # set B to zero if age is zero because B is lowest quality dataset
  # #######################################################
  # message(blue("Set recalculate totalBiomass as sum(B);",
  #              "many biomasses will have been set to 0 in previous steps"))
  # cohortData[cover > 0 & age == 0, B := 0L]
  # cohortData[, totalBiomass := asInteger(sum(B)), by = "pixelIndex"]

  # This was unused, but for beta regression, this can allow 0s and 1s without
  #   needing a separate model for zeros and ones
  # https://stats.stackexchange.com/questions/31300/dealing-with-0-1-values-in-a-beta-regression
  # cohortData[ , coverProp := (cover/100 * (NROW(cohortData) - 1) + 0.5) / NROW(cohortData)]

  setattr(cohortData, "imputedPixID", imputedPixID)
  return(cohortData)
}

#' Subset a `data.table` with random subsampling within `by` groups
#'
#' @param DT A `data.table`
#' @param by Character vector of column names to use for groups
#' @param doSubset Logical or numeric indicating the number of subsamples to use
#' @param indices Logical. If `TRUE`, this will return vector of row indices only. Defaults
#'   to `FALSE`, i.e., return the subsampled `data.table`
#'
#' @export
#' @examples
#' library(data.table)
#' dt <- data.table(Lett = sample(LETTERS, replace = TRUE, size = 1000), Nums = 1:100)
#' dt1 <- subsetDT(dt, by = "Lett", doSubset = 3)
subsetDT <- function(DT, by, doSubset = TRUE, indices = FALSE) {
  if (!is.null(doSubset)) {
    if (!isFALSE(doSubset)) {
      sam <- if (is.numeric(doSubset)) doSubset else 50
      message(
        "subsampling initial dataset for faster model estimation: ",
        "using maximum of ", sam, " samples per combination of ecoregionGroup and speciesCode. ",
        "Change 'doSubset' to a different number if this is not enough"
      )
      # subset -- add line numbers of those that were sampled
      a <- DT[, list(lineNum = .I[sample(.N, size = min(.N, sam))]), by = by]
      # Select only those row numbers from whole dataset
      if (isFALSE(indices)) {
        DT <- DT[a$lineNum]
      } else {
        DT <- a$lineNum
      }
    } else {
      if (isTRUE(indices)) {
        DT <- seq_len(nrow(DT))
      }
    }
  } else {
    if (isTRUE(indices)) {
      DT <- seq_len(nrow(DT))
    }
  }
  return(DT)
}

#' Drop factor term including interactions from a model formula
#'
#' Based on <https://stackoverflow.com/a/23382097/1380598>.
#'
#' @param form A model formula.
#' @param term Character vector giving the name of the term to drop.
#'
#' @return An updated model formula.
#'
#' @export
dropTerm <- function(form, term) {
  if (!is(form, "formula")) {
    form <- as.formula(form)
  }

  fterms <- terms(form)
  fac <- attr(fterms, "factors")

  new_form <- form
  for (tt in term) {
    idr <- grepl(tt, rownames(fac))
    idc <- which(as.logical(fac[idr, ]))
    toDrop <- names(fac[idr, ][idc])
    needsParenth <- vapply(paste0("(", toDrop, ")"), FUN = grepl, FUN.VALUE = logical(1),
                           x = as.character(new_form)[3], fixed = TRUE)
    if (any(needsParenth)) {
      toDrop[needsParenth] <- paste0("(", toDrop[needsParenth], ")")
    }

    new_form <- update(new_form, paste0(". ~ . -", paste(toDrop, collapse = " - ")))
  }

  return(new_form)
}

#' The generic statistical model to run (`lmer` or `glmer`)
#'
#' This does a few things including R squared, gets the fitted values.
#' It appears that running the models "as is" without this wrapper does not work with `Cache`.
#' The return of the model in a list solves this problem.
#' For Caching, the `.specialData` should be "omitted" via `omitArgs`, and
#' `uniqueEcoregionGroups` should not be omitted.
#'
#' @param modelFn A quoted expression of type `package::model(Y ~ X, ...)`, omitting
#'   the `data` argument. E.g. `lme4::glmer(Y ~ X + (X|G), family = poisson)`
#' @param uniqueEcoregionGroups Unique values of `ecoregionGroups`.
#'   This is the basis for the statistics, and can be used to optimize caching,
#'   e.g. ignore `.specialData` in `.omitArgs`.
#' @param sumResponse a sum of all the response variable values
#'   Also to be used to optimize caching, e.g. ignore `.specialData`
#'   in `.omitArgs`.
#' @param .specialData The custom dataset required for the model.
#'
#' @export
statsModel <- function(modelFn, uniqueEcoregionGroups, sumResponse, .specialData) {
  ## convert model call to vector of arguments
  modelArgs <- as.character(modelFn)
  names(modelArgs) <- names(modelFn)

  ## function name
  fun <- modelArgs[[1]]

  ## get formula and check
  form <- tryCatch(as.formula(modelArgs[2], env = .GlobalEnv), # .GlobalEnv keeps object small
                   error = function(e) {
                     stop(paste(
                       "Could not convert '", modelArgs[2], "'to formula.",
                       "Check if formula is of type 'Y ~ X'"
                     ))
                   }
  )

  ## check the no of grouping levels
  if (grepl("\\|", modelArgs[2])) {
    groupVar <- sub("\\).*", "", sub(".*\\| ", "", modelArgs[2]))
    keepGrouping <- NROW(unique(.specialData[, ..groupVar])) >= 2

    if (!keepGrouping) {
      form <- sub("\\+ \\(.*\\|.*\\)", "", modelArgs[2])
      form <- as.formula(form, env = .GlobalEnv)

      ## change to glm if dropping random effects
      fun <- "stats::glm"

      whereFam <- grep("family", names(modelArgs))
      modelFn2 <- if (length(whereFam)) {
        call(fun, form, family = modelArgs[[whereFam]])
      } else {
        call(fun, form)
      }

      message(
        blue("Grouping variable "), red("only has one level. "),
        blue(
          "Formula changed to\n",
          magenta(paste0(format(modelFn2, appendLF = FALSE), collapse = ""))
        )
      )
    }
  }

  ## get function and check
  fun <- .extractFunction(fun)
  if (!is.function(fun)) {
    stop(paste0(
      "Can't find the function '", modelArgs[1], "'.",
      " Is the function name correct and the package installed?"
    ))
  }

  ## prepare arguments, and strip function from list
  modelArgs <- as.list(modelArgs)
  modelArgs[[2]] <- form
  names(modelArgs)[2] <- "formula"
  modelArgs[[1]] <- NULL
  modelArgs$data <- quote(.specialData)

  isChar <- unlist(lapply(modelArgs, is.character))
  if (any(isChar)) {
    modelArgs[isChar] <- lapply(modelArgs[isChar], function(yyy) {
      eval(parse(text = yyy))
    })
  }

  ## drop factor terms with a single level
  singles <- names(which(sapply(lapply(.specialData, unique), length) == 1))
  keep <- which(unname(vapply(singles, grepl, x = paste(modelArgs$formula, collapse = " "), logical(1))))
  singles <- singles[keep]
  if (length(singles) > 0) {
    modelArgs$formula <- dropTerm(modelArgs$formula, singles)
  }

  mod <- do.call(fun, modelArgs)

  list(mod = mod, pred = fitted(mod), rsq = MuMIn::r.squaredGLMM(mod))
}

#' Default columns that define pixel groups
#'
#' @export
columnsForPixelGroups <- c("ecoregionGroup", "speciesCode", "age", "B")

#' Generate `cohortData` table per pixel:
#'
#' @template cohortData
#'
#' @template pixelGroupMap
#'
#' @template cohortDefinitionCols
#'
#' @template doAssertion
#'
#' @return
#' An expanded `cohortData` `data.table` with a new `pixelIndex` column.
#'
#' @export
addPixels2CohortData <- function(cohortData, pixelGroupMap,
                                 cohortDefinitionCols = c("pixelGroup", "age", "speciesCode"),
                                 doAssertion = getOption("LandR.assertions", TRUE)) {
  assertCohortData(cohortData, pixelGroupMap,
                   cohortDefinitionCols = cohortDefinitionCols,
                   doAssertion = doAssertion)

  pixelGroupTable <- na.omit(data.table(
    pixelGroup = as.vector(pixelGroupMap[]),
    pixelIndex = 1:ncell(pixelGroupMap)
  ))
  pixelCohortData <- cohortData[pixelGroupTable,
                                on = "pixelGroup",
                                nomatch = 0, allow.cartesian = TRUE
  ]

  assertPixelCohortData(pixelCohortData, pixelGroupMap, doAssertion = doAssertion)

  return(pixelCohortData)
}

#' Add number of pixels per `pixelGroup` and add it has a new column to `cohortData`
#'
#' @template cohortData
#'
#' @template pixelGroupMap
#'
#' @template cohortDefinitionCols
#'
#' @template doAssertion
#'
#'
#' @return
#' An `cohortData` `dat.table` with a new `noPixels`
#' column
#'
#' @export
addNoPixel2CohortData <- function(cohortData, pixelGroupMap,
                                  cohortDefinitionCols = c("pixelGroup", "age", "speciesCode"),
                                  doAssertion = getOption("LandR.assertions", TRUE)) {
  assertCohortData(cohortData, pixelGroupMap,
                   cohortDefinitionCols = cohortDefinitionCols, doAssertion = doAssertion)

  noPixelsXGroup <- data.table(
    noPixels = tabulate(pixelGroupMap[]),
    pixelGroup = c(1:maxFn(pixelGroupMap))
  )

  pixelCohortData <- cohortData[noPixelsXGroup, on = "pixelGroup", nomatch = 0]

  if (doAssertion) {
    test1 <- length(setdiff(pixelCohortData$pixelGroup, cohortData$pixelGroup)) > 0
    test2 <- sum(unique(pixelCohortData[, .(pixelGroup, noPixels)])$noPixels) !=
      sum(!is.na(pixelGroupMap[]) & pixelGroupMap[] != 0) ## 0's have no cohorts.

    if (test1 || test2) {
      stop("pixelGroups differ between pixelCohortData/pixelGroupMap and cohortData")
    }
  }

  return(pixelCohortData)
}

#' Make the `cohortData` table, while modifying the temporary
#' `pixelCohortData` that will be used to prepare other files.
#'
#' Takes a `pixelCohortData` table (see `makeAndCleanInitialCohortData`),
#'   the `speciesEcoregion` list and returns a modified `pixelCohortData` and
#'   the `cohortData` tables to be used in the simulation.
#'   This function mainly removes unnecessary columns from `pixelCohortData`,
#'   subsets pixels with `biomass > 0`, generates `pixelGroups`,
#'   and adds `ecoregionGroup` and `totalBiomass` columns to `pixelCohortData`.
#'   `cohortData` is then created by subsetting unique combinations of `pixelGroup` and
#'   whatever columns are listed in `columnsForPixelGroups`.
#'   The resulting `cohortData` table has the following columns:
#' \itemize{
#'   \item `speciesCode` (factor)
#'   \item `ecoregionGroup` (factor)
#'   \item `pixelGroup` (integer)
#'   \item `age` (integer)
#'   \item `B` (integer)
#' }
#'
#' @template pixelCohortData
#' @param columnsForPixelGroups Default columns that define pixel groups
#' @param pixelGroupAgeClass Integer. When assigning `pixelGroup` membership, this defines the
#'   resolution of ages that will be considered 'the same `pixelGroup`', e.g., if it is 10,
#'   then 6 and 14 will be the same.
#' @param pixelGroupBiomassClass Integer. When assigning `pixelGroup` membership, this defines
#'   the resolution of biomass that will be considered 'the same `pixelGroup`', e.g., if it is
#'   100, then 5160 and 5240 will be the same
#' @param minAgeForGrouping Minimum age for regrouping. This may be because there is a source of
#'   ages for young stands/trees that is very reliable, such as a fire database.
#'   Ages below this will not be grouped together. Defaults to -1, meaning treat all ages equally.
#'   If this is related to known ages from a high quality database, then use age of the oldest
#'   trees in that database.
#' @param pixelFateDT A `data.table` of `pixelFateDT`; if none provided, will make an empty one.
#' @param rmImputedPix Should imputed pixels be removed
#' @param imputedPixID a vector of IDs of pixels that suffered data imputation.
#'
#' @template speciesEcoregion
#'
#' @return A list with a modified `pixelCohortData`, `cohortData`, and `pixelFateDT`
#' `data.table`s.
#'
#' @export
makeCohortDataFiles <- function(pixelCohortData, columnsForPixelGroups, speciesEcoregion,
                                pixelGroupBiomassClass, pixelGroupAgeClass, minAgeForGrouping = 0,
                                rmImputedPix = FALSE, imputedPixID, pixelFateDT) {
  ## make ecoregioGroup a factor (again) and remove unnecessary cols.
  # refactor because the "_34" and "_35" ones are still levels
  pixelCohortData[, ecoregionGroup := factor(as.character(ecoregionGroup))]
  cols <- intersect(
    c(
      "logAge", "coverOrig", "totalBiomass",
      "initialEcoregionCode", "cover", "lcc"
    ),
    names(pixelCohortData)
  )
  set(pixelCohortData, j = cols, value = NULL)


  # Round ages to nearest pixelGroupAgeClass
  pixelCohortData[
    age > minAgeForGrouping,
    age := asInteger(age / pixelGroupAgeClass) *
      as.integer(pixelGroupAgeClass)
  ]

  # Round Biomass to nearest pixelGroupBiomassClass
  message(blue("Round B to nearest P(sim)$pixelGroupBiomassClass"))
  pixelCohortData[ # age > minAgeForGrouping,
    , B := asInteger(B / pixelGroupBiomassClass) * as.integer(pixelGroupBiomassClass)
  ]

  # Remove B == 0 cohorts after young removals
  message(green("  -- Removing cohorts with B = 0 and age > 0 -- these were likely poor predictions from updateYoungBiomasses"))
  whBEqZeroAgeGT0 <- which(pixelCohortData$B == 0 & pixelCohortData$age > 0)

  if (length(whBEqZeroAgeGT0) > 0) {
    pixelCohortData2 <- pixelCohortData[-whBEqZeroAgeGT0]
  } else {
    pixelCohortData2 <- pixelCohortData
  }

  lostPixels <- setdiff(pixelCohortData$pixelIndex, pixelCohortData2$pixelIndex)
  message(green("     affected", length(whBEqZeroAgeGT0), "cohorts, in", length(lostPixels), "pixels;"))
  lenUniquePix <- length(unique(pixelCohortData2$pixelIndex))
  message(green("     leaving", lenUniquePix, "pixels"))
  pixelFateDT <- pixelFate(pixelFateDT,
                           fate = "rm pixels with Biomass == 0, after updating young cohort B",
                           length(lostPixels), runningPixelTotal = lenUniquePix
  )

  pixelCohortData <- pixelCohortData2
  # # Set B to 0 if age is 0
  # whAgeZero <- which(pixelCohortData$age == 0)
  # if (length(whAgeZero)) {
  #   message(green("    -- There were", length(whAgeZero), "pixels with age = 0; forcing B to zero"))
  #   pixelCohortData[whAgeZero, B := 0L]
  # }

  ## select pixels with biomass and generate pixel groups
  # pixelCohortData <- pixelCohortData[B >= 0]

  ########################################################################
  ## rebuild ecoregion, ecoregionMap objects -- some initial ecoregions disappeared (e.g., 34, 35, 36)
  ## rebuild biomassMap object -- biomasses have been adjusted
  ## There will be some speciesCode * ecoregionGroups combinations for which we only had age = 0 and B = 0
  if (is.factor(speciesEcoregion$ecoregionGroup)) {
    ## we need to check available params for speciesXecoregionGroup combos, this will be achieved with a
    # ecoregionsWeHaveParametersFor <- levels(speciesEcoregion$ecoregionGroup)
  } else {
    stop(
      "speciesEcoregion$ecoregionGroup is supposed to be a factor; please go back in this",
      "module and correct this."
    )
  }

  message(blue("Removing some pixels because their species * ecoregionGroup combination has no age or B data to estimate ecoregion traits:"))
  # message(blue(paste(sort(unique(pixelCohortData[!ecoregionGroup %in% ecoregionsWeHaveParametersFor]$ecoregionGroup)), collapse = ", ")))
  cols <- c("speciesCode", "ecoregionGroup")
  messageDF(
    colour = "blue",
    pixelCohortData[!speciesEcoregion, on = cols][, ..cols][, list(numPixelsRemoved = .N), by = cols], # anti-join
  )

  ## REMOVE PIXELS IN ECOREGION GROUPS THAT ENDED UP WITHOUT PARAMS
  # pixelCohortData <- pixelCohortData[ecoregionGroup %in% ecoregionsWeHaveParametersFor] # keep only ones we have params for
  pixelCohortData <- speciesEcoregion[, ..cols][pixelCohortData, on = cols, nomatch = 0]

  pixelFateDT <- pixelFate(
    pixelFateDT, "removing ecoregionGroups without enough data to est. maxBiomass",
    tail(pixelFateDT$runningPixelTotal, 1) - NROW(unique(pixelCohortData$pixelIndex))
  )

  ## REMOVE IMPUTED PIXELS FROM THE SIMULATION IF NEED BE
  if (rmImputedPix) {
    pixelCohortData <- pixelCohortData[!pixelIndex %in% imputedPixID]

    pixelFateDT <- pixelFate(
      pixelFateDT, "removing pixels that suffered data imputation",
      tail(pixelFateDT$runningPixelTotal, 1) - NROW(unique(pixelCohortData$pixelIndex))
    )
  }

  # Lost some ecoregionGroups -- refactor
  pixelCohortData[, ecoregionGroup := factor(as.character(ecoregionGroup))]

  cd <- pixelCohortData[, .SD, .SDcols = c("pixelIndex", columnsForPixelGroups)]
  pixelCohortData[, pixelGroup := Cache(generatePixelGroups, cd,
                                        maxPixelGroup = 0,
                                        columns = columnsForPixelGroups
  )]

  pixelCohortData[, totalBiomass := asInteger(sum(B)), by = "pixelIndex"]

  cohortData <- unique(pixelCohortData, by = c("pixelGroup", columnsForPixelGroups))
  cohortData[, `:=`(pixelIndex = NULL)]

  assertUniqueCohortData(cohortData, c("pixelGroup", "ecoregionGroup", "speciesCode"))
  return(list(cohortData = cohortData, pixelCohortData = pixelCohortData, pixelFateDT = pixelFateDT))
}

#' Create new cohorts based on provenance table with unique `pixelGroup` and add to `cohortData`
#'
#' @param newPixelCohortData the cohorts that were harvested
#' @template cohortData
#' @template pixelGroupMap
#' @template currentTime
#' @template successionTimestep
#' @param trackPlanting adds column that tracks planted cohorts if `TRUE`
#' @param initialB the initial biomass of new cohorts. Defaults to ten.
#'
#' @return A `data.table` with a new `cohortData`
#'
#' @export
plantNewCohorts <- function(newPixelCohortData, cohortData, pixelGroupMap, initialB = 10,
                            currentTime, successionTimestep, trackPlanting = FALSE) {
  ## get spp "productivity traits" per ecoregion/present year

  namesNCD <- names(newPixelCohortData)
  if (!isTRUE("pixelGroup" %in% namesNCD)) {
    if (isTRUE("pixelIndex" %in% namesNCD)) {
      newPixelCohortData[, pixelGroup := as.vector(pixelGroupMap[])[pixelIndex]]
    } else {
      stop("newPixelCohortData must have either pixelIndex or pixelGroup")
    }
  }

  if (!is.null(newPixelCohortData[["pixelIndex"]])) {
    set(newPixelCohortData, NULL, "pixelIndex", NULL)
  }

  # remove duplicate pixel groups that were harvested
  duplicates <- duplicated(newPixelCohortData[, .(speciesCode, ecoregionGroup, pixelGroup, age)])
  newCohortData <- newPixelCohortData[!duplicates]

  # Plant trees
  newCohortData[, age := 2]

  ## Ceres: temporary workaround to ensure the default value is used even
  ## if LANDIS behaviour is being used to calculate initial B in non-planted cohorts
  if (isTRUE(is.na(initialB)) || is.null(initialB)) {
    initialB <- formals(plantNewCohorts)$initialB
  }
  newCohortData[, B := initialB]

  # Here we subset cohortData instead of setting added columns to NULL. However, as these are 'new' cohorts, this is okay
  newCohortData <- newCohortData[, .(pixelGroup, ecoregionGroup, speciesCode, age, B, Provenance,
                                     mortality = 0L, aNPPAct = 0L
  )]

  if (getOption("LandR.assertions")) {
    if (isTRUE(NROW(unique(newCohortData, by = c("pixelGroup", "age", "speciesCode", "Provenance")))
               != NROW(newCohortData))) {
      stop("Duplicated new cohorts in a pixelGroup. Please debug LandR:::.plantNewCohorts")
      # in this situation, it may be caused by not replanting all species. Not sure if this will come up.
    }
  }

  if (trackPlanting) {
    newCohortData[, planted := TRUE]
  }

  cohortData <- rbindlist(list(cohortData, newCohortData), fill = TRUE, use.names = TRUE)

  return(cohortData)
}

#' Add cohorts to `cohortData` and `pixelGroupMap`
#'
#' This is a wrapper for  `generatePixelGroups`, `initiateNewCohort` and updates to
#' `pixelGroupMap` via assignment to new `pixelIndex` values in `newPixelCohortData`.
#' By running these all together, there is less chance that they will diverge.
#' There are some checks internally for consistency.
#'
#' Does the following:
#' \enumerate{
#'   \item add new cohort data into `cohortData`;
#'   \item assign initial `B` and `age` for new cohort;
#'   \item assign the new `pixelGroup` to the pixels that have new cohort;
#'   \item update the `pixelGroup` map.
#' }
#'
#' @template newPixelCohortData
#' @template cohortData
#' @template pixelGroupMap
#' @template cohortDefinitionCols
#' @template currentTime
#' @template speciesEcoregion
#'
#' @param treedHarvestPixelTable A data.table with at least 2 columns, `pixelIndex` and `pixelGroup`.
#'   This will be used in conjunction with `cohortData` and `pixelGroupMap`
#'   to ensure that everything matches correctly.
#' @template successionTimestep
#' @param provenanceTable A `data.table` with three columns:
#' New cohorts are initiated at the `ecoregionGroup` `speciesEcoregion` from the
#' corresponding `speciesEcoregion` listed in the `Provenance` column
#' @param initialB the initial biomass of new cohorts. Defaults to ten, even if NA/NULL is passed.
#' @param trackPlanting if true, planted cohorts in `cohortData` are tracked with `TRUE`
#' in column 'planted'
#'
#' @template verbose
#'
#' @template doAssertion
#'
#' @return
#' A list of length 2, `cohortData` and `pixelGroupMap`, with
#' `newPixelCohortData` inserted.
#'
#' @export
#' @rdname updateCohortDataPostHarvest
updateCohortDataPostHarvest <- function(newPixelCohortData, cohortData, pixelGroupMap, currentTime,
                                        speciesEcoregion, treedHarvestPixelTable = NULL,
                                        successionTimestep, provenanceTable, trackPlanting = FALSE,
                                        initialB = 10,
                                        cohortDefinitionCols = c("pixelGroup", "age", "speciesCode"),
                                        verbose = getOption("LandR.verbose", TRUE),
                                        doAssertion = getOption("LandR.assertions", TRUE)) {
  cohortData <- copy(cohortData)
  provenanceTable <- copy(provenanceTable)

  maxPixelGroup <- as.integer(maxFn(pixelGroupMap))

  if (!is.null(treedHarvestPixelTable)) {
    pixelGroupMap[treedHarvestPixelTable$pixelIndex] <- 0L
  }
  relevantPixels <- pixelGroupMap[][newPixelCohortData$pixelIndex]
  zeroOnPixelGroupMap <- relevantPixels == 0

  newPixelCohortData[B == 0, age := 1] # this is necessary for now.
  # Can't lose the biomass of non-zero (non harvested) trees

  setkey(newPixelCohortData, ecoregionGroup, speciesCode)
  setkey(provenanceTable, ecoregionGroup, speciesCode)

  newPixelCohortData <- provenanceTable[newPixelCohortData]

  # ecoregionGroup should remain the same, Provenance will be a newly added column

  # NA in provenance means this area would not be planted with this species

  # Remove duplicate species. These were created by cohorts of different age but same species
  duplicates <- duplicated(newPixelCohortData[, .(speciesCode, ecoregionGroup, pixelIndex)])
  newPixelCohortData <- newPixelCohortData[!duplicates]

  # this block must be run here, in case some pixelGroups change due to no existing maxB
  specieseco_current <- speciesEcoregionLatestYear(speciesEcoregion, currentTime)
  specieseco_current <- setkey(
    specieseco_current[, .(speciesCode, maxANPP, maxB, ecoregionGroup)],
    speciesCode, ecoregionGroup
  )

  specieseco_current[, maxB_eco := max(maxB), by = ecoregionGroup]

  setkey(newPixelCohortData, speciesCode, ecoregionGroup)

  newPixelCohortData <- specieseco_current[newPixelCohortData]

  columnsForPG <- c("ecoregionGroup", "speciesCode", "age", "B", "maxB", "maxANPP", "Provenance")

  cd <- newPixelCohortData[, c("pixelIndex", columnsForPG), with = FALSE]

  newPixelCohortData[, pixelGroup := generatePixelGroups(cd,
                                                         maxPixelGroup = maxPixelGroup,
                                                         columns = columnsForPG
  )]

  # Remove the duplicated pixels within pixelGroup (i.e., 2+ species in the same pixel)
  pixelsToChange <- unique(newPixelCohortData[, c("pixelIndex", "pixelGroup")],
                           by = c("pixelIndex")
  )

  pixelGroupMap[pixelsToChange$pixelIndex] <- pixelsToChange$pixelGroup

  if (doAssertion) {
    if (!isTRUE(all(pixelsToChange$pixelGroup == pixelGroupMap[][pixelsToChange$pixelIndex]))) {
      stop("pixelGroupMap and newPixelCohortData$pixelGroupMap don't match in updateCohortData fn")
    }
  }


  ##########################################################
  # Add new cohorts and rm missing cohorts (i.e., those pixelGroups that are gone)
  ##########################################################

  cohortData <- plantNewCohorts(newPixelCohortData, cohortData,
                                pixelGroupMap,
                                currentTime = currentTime,
                                successionTimestep = successionTimestep,
                                initialB = initialB,
                                trackPlanting = trackPlanting
  )

  outs <- rmMissingCohorts(cohortData, pixelGroupMap, cohortDefinitionCols = cohortDefinitionCols)

  assertCohortData(outs$cohortData, outs$pixelGroupMap,
                   cohortDefinitionCols = cohortDefinitionCols,
                   doAssertion = doAssertion, verbose = verbose)

  if (doAssertion) {
    maxPixelGroupFromCohortData <- max(outs$cohortData$pixelGroup)
    maxPixelGroup <- as.integer(maxFn(outs$pixelGroupMap))
    test1 <- (!identical(maxPixelGroup, maxPixelGroupFromCohortData))
    if (test1) {
      stop(
        "The sim$pixelGroupMap and cohortData have unmatching pixelGroup.",
        " They must be matching.",
        " If this occurs, please contact the module developers"
      )
    }
  }

  if (verbose > 0) {
    nPixForest <- sum(!is.na(outs$pixelGroupMap[]))
    nPixGrps <- length(unique(outs$cohortData$pixelGroup))
    nPixNoPixGrp <- sum(outs$pixelGroupMap[] == 0, na.rm = TRUE)
    nPixTreed <- sum(outs$pixelGroupMap[] != 0, na.rm = TRUE)

    nDigits <- max(nchar(c(nPixForest, nPixGrps, nPixNoPixGrp))) + 3
    message(crayon::magenta(
      "NUMBER OF FORESTED PIXELS          :",
      paddedFloatToChar(nPixForest, padL = nDigits, pad = " ")
    ))
    message(crayon::magenta(
      "NUMBER OF PIXELS WITH TREES        :",
      paddedFloatToChar(nPixTreed, padL = nDigits, pad = " ")
    ))
    message(crayon::magenta(
      "NUMBER OF UNIQUE PIXELGROUPS       :",
      paddedFloatToChar(nPixGrps, padL = nDigits, pad = " ")
    ))
    message(crayon::magenta(
      "NUMBER OF PIXELS WITH NO PIXELGROUP:",
      paddedFloatToChar(nPixNoPixGrp, padL = nDigits, pad = " ")
    ))
  }

  return(list(
    cohortData = outs$cohortData,
    pixelGroupMap = outs$pixelGroupMap
  ))
}

#' Create or amend data to a `pixelFateDT` object
#'
#' @param pixelFateDT A `pixelFateDT` `data.table` with 3 columns: `fate`,
#'   `pixelsRemoted`, and `runningPixelTotal`.
#' @param fate A character string (length 1) describing in words the change
#' @param pixelsRemoved A numeric indicating how many pixels were removed due to the `fate`.
#' @param runningPixelTotal an optional numeric with new, running total. If not supplied,
#'   it will be calculated from the last row of `pixelFateDT` `runningTotal` minus the
#'   `pixelsRemoved`
#'
#' @return A `pixelFateDT` object, updated with one extra row.
#'
#' @export
pixelFate <- function(pixelFateDT, fate = NA_character_, pixelsRemoved = 0,
                      runningPixelTotal = NA_integer_) {
  if (missing(pixelFateDT)) {
    pixelFateDT <- data.table(fate = character(), pixelsRemoved = integer(), runningPixelTotal = integer())
  }
  if (is.na(runningPixelTotal)) {
    runningPixelTotal <- tail(pixelFateDT$runningPixelTotal, 1) - pixelsRemoved
  }
  pixelFateDT <- rbindlist(list(pixelFateDT, data.table(
    fate = fate, pixelsRemoved = pixelsRemoved,
    runningPixelTotal = runningPixelTotal
  )))
  pixelFateDT
}

#' Generate and add vegetation type column to `cohortData`
#'
#' This function is a simplification of `vegTypeMapGenerator`
#' and instead of generating a map, it adds the vegetation type column
#' to the `cohortData` table.
#'
#' @param x A `cohortData` object
#'
#' @template vegLeadingProportion
#'
#' @param mixedType An integer defining whether mixed stands are:
#'                  not differentiated (0), any kind of species admixture (1),
#'                  or deciduous mixed with conifer (2; default).
#'
#' @template sppEquiv
#'
#' @template sppEquivCol
#'
#' @param pixelGroupColName Name of the column in `pixelGroup` to use.
#'
#' @template doAssertion
#'
#' @param ... Additional arguments.
#'
#' @return `x` with a new column, 'leading', coding the vegetation type
#'    of each group defined by `pixelGroupColName`
#'
#' @author Eliot McIntire, Ceres Barros, Alex Chubaty
#' @export
#' @rdname vegTypeGenerator
#'
#' @examples
#' library(data.table)
#' x <- data.table(
#'   pixelGroup = rep(1:2, each = 2), B = c(100, 200, 20, 400),
#'   speciesCode = rep(c("Pice_Gla", "Popu_Tre"), 2)
#' )
#' vegTypeGenerator(x)
vegTypeGenerator <- function(x, vegLeadingProportion = 0.8,
                             mixedType = 2, sppEquiv = NULL, sppEquivCol,
                             pixelGroupColName = "pixelGroup",
                             doAssertion = getOption("LandR.assertions", TRUE), ...) {
  stopifnot(
    mixedType %in% 0:2,
    length(mixedType) == 1
  )

  nrowCohortData <- NROW(x)

  leadingBasedOn <- preambleVTG(x, vegLeadingProportion, doAssertion, nrowCohortData)

  if (mixedType == 2) {
    if (is.null(sppEquiv)) {
      sppEquiv <- get(data("sppEquivalencies_CA", package = "LandR", envir = environment()),
                      inherits = FALSE
      )

      # Find the sppEquivCol that best matches what you have in x
      sppEquivCol <- names(sort(sapply(sppEquiv, function(xx) sum(xx %in% unique(x$species))),
                                decreasing = TRUE
      )[1])
      message(paste0(
        "Using mixedType == 2, but no sppEquiv provided. ",
        "Attempting to use data('sppEquivalencies_CA', 'LandR') ",
        "and sppEquivCol = '", sppEquivCol, "'"
      ))
    }
  }

  ## use new vs old algorithm based on size of x. new one (2) is faster in most cases.
  ## enable assertions to view timings for each algorithm before deciding which to use.
  algo <- ifelse(nrowCohortData > 3.5e6, 1, 2)

  pgdAndSc <- c(pixelGroupColName, "speciesCode")
  pgdAndScAndLeading <- c(pgdAndSc, leadingBasedOn)
  totalOfLeadingBasedOn <- paste0("total", leadingBasedOn)
  speciesOfLeadingBasedOn <- paste0("speciesGroup", leadingBasedOn)
  if (algo == 1 || isTRUE(doAssertion)) {
    # slower -- older, but simpler Eliot June 5, 2019
    # 1. Find length of each pixelGroup -- don't include pixelGroups in "by" that have only 1 cohort: N = 1
    cohortData1 <- copy(x)
    systimePre1 <- Sys.time()
    pixelGroupData1 <- cohortData1[, list(N = .N), by = pixelGroupColName]

    # Calculate speciesProportion from cover or B
    pixelGroupData1 <- cohortData1[, ..pgdAndScAndLeading][pixelGroupData1, on = pixelGroupColName]
    set(pixelGroupData1, NULL, totalOfLeadingBasedOn, pixelGroupData1[[leadingBasedOn]])

    if (identical(leadingBasedOn, "cover")) {
      pixelGroupData1[N != 1, (totalOfLeadingBasedOn) := sum(cover, na.rm = TRUE), by = pixelGroupColName]
      pixelGroupData1 <- pixelGroupData1[, list(sum(cover, na.rm = TRUE), totalcover[1]), by = pgdAndSc]
    } else {
      pixelGroupData1[N != 1, (totalOfLeadingBasedOn) := sum(B, na.rm = TRUE), by = pixelGroupColName]
      pixelGroupData1 <- pixelGroupData1[, list(sum(B, na.rm = TRUE), totalB[1]), by = pgdAndSc]
    }
    setnames(pixelGroupData1, old = c("V1", "V2"), new = c(speciesOfLeadingBasedOn, totalOfLeadingBasedOn))

    set(pixelGroupData1, NULL, "speciesProportion", pixelGroupData1[[speciesOfLeadingBasedOn]] /
          pixelGroupData1[[totalOfLeadingBasedOn]])
    systimePost1 <- Sys.time()

    setorderv(pixelGroupData1, pixelGroupColName)
  }

  if (algo == 2 || isTRUE(doAssertion)) {
    # Replacement algorithm to calculate speciesProportion
    #  Logic is similar to above --
    #  1. sort by pixelGroup
    #  2. calculate N, use this to repeat itself (instead of a join above)
    #  3. calculate speciesProportion, noting to calculate with by only if N > 1, otherwise
    #     it is a simpler non-by calculation
    cohortData2 <- copy(x)
    systimePre2 <- Sys.time()
    setkeyv(cohortData2, pgdAndSc)
    # setorderv(x, pixelGroupColName)
    pixelGroupData2 <- cohortData2[, list(N = .N), by = pixelGroupColName]
    cohortData2 <- cohortData2[, ..pgdAndScAndLeading]

    N <- rep.int(pixelGroupData2$N, pixelGroupData2$N)
    wh1 <- N == 1
    set(cohortData2, which(wh1), totalOfLeadingBasedOn, cohortData2[[leadingBasedOn]][wh1])
    if (identical(leadingBasedOn, "cover")) {
      totalBNot1 <- cohortData2[!wh1, list(N = .N, totalcover = sum(cover, na.rm = TRUE)), by = pixelGroupColName]
    } else {
      totalBNot1 <- cohortData2[!wh1, list(N = .N, totalB = sum(B, na.rm = TRUE)), by = pixelGroupColName]
    }
    totalBNot1 <- rep.int(totalBNot1[[totalOfLeadingBasedOn]], totalBNot1[["N"]])
    set(cohortData2, which(!wh1), totalOfLeadingBasedOn, totalBNot1)

    b <- cohortData2[, list(N = .N), by = pgdAndSc]
    b <- rep.int(b[["N"]], b[["N"]])
    GT1 <- (b > 1)
    if (any(GT1)) {
      pixelGroupData2List <- list()
      cohortData2[GT1, speciesProportion := sum(B, na.rm = TRUE) / totalB[1], by = pgdAndSc]
      cohortData2[!GT1, speciesProportion := B / totalB]
      # pixelGroupData2List[[2]] <- cohortData2[!GT1]
      # pixelGroupData2 <- rbindlist(pixelGroupData2List)
    } else {
      # cols <- c(pixelGroupColName, "speciesCode", "speciesProportion")
      set(cohortData2, NULL, "speciesProportion", cohortData2[[leadingBasedOn]] /
            cohortData2[[totalOfLeadingBasedOn]])
      # pixelGroupData2[[NROW(pixelGroupData2) + 1]] <- cohortData2[!GT1, ..cols]
    }
    pixelGroupData2 <- cohortData2
    systimePost2 <- Sys.time()
  }

  if (isTRUE(doAssertion)) {
    ## slower -- older, but simpler Eliot June 5, 2019
    ## TODO: these algorithm tests should be deleted after a while. See date on prev line.
    if (!exists("oldAlgoVTM", envir = .pkgEnv)) .pkgEnv$oldAlgoVTM <- 0
    if (!exists("newAlgoVTM", envir = .pkgEnv)) .pkgEnv$newAlgoVTM <- 0
    .pkgEnv$oldAlgoVTM <- .pkgEnv$oldAlgoVTM + (systimePost1 - systimePre1)
    .pkgEnv$newAlgoVTM <- .pkgEnv$newAlgoVTM + (systimePost2 - systimePre2)
    message("LandR::vegTypeMapGenerator: new algo ", .pkgEnv$newAlgoVTM)
    message("LandR::vegTypeMapGenerator: old algo ", .pkgEnv$oldAlgoVTM)
    setorderv(pixelGroupData2, pgdAndSc)
    whNA <- unique(unlist(sapply(pixelGroupData2, function(xx) which(is.na(xx)))))
    pixelGroupData1 <- pixelGroupData1[!pixelGroupData2[whNA], on = pgdAndSc]
    setkeyv(pixelGroupData1, pgdAndSc)
    setkeyv(pixelGroupData2, pgdAndSc)
    aa <- pixelGroupData1[pixelGroupData2, on = pgdAndSc]
    if (!isTRUE(all.equal(aa[["speciesProportion"]], aa[["i.speciesProportion"]]))) {
      stop("Old algorithm in vegMapGenerator is different than new map")
    }
  }

  if (algo == 1) {
    pixelGroupData <- pixelGroupData1
    rm(pixelGroupData1)
  } else if (algo == 2) {
    pixelGroupData <- pixelGroupData2
    rm(pixelGroupData2)
  }

  ########################################################
  #### Determine "mixed"
  ########################################################
  if (mixedType == 0) {
    ## 1. sort on pixelGroup and speciesProportion, reverse so 1st row of each pixelGroup is the largest
    ## 2. Keep only first row in each pixelGroup
    pixelGroupData3 <- pixelGroupData[, list(speciesCode, get(pixelGroupColName), speciesProportion)]
    setnames(pixelGroupData3, "V2", pixelGroupColName)
    setorderv(pixelGroupData3, cols = c(pixelGroupColName, "speciesProportion"), order = -1L)
    set(pixelGroupData3, NULL, "speciesProportion", NULL)
    pixelGroupData3 <- pixelGroupData3[, .SD[1], by = pixelGroupColName]
    setnames(pixelGroupData3, "speciesCode", "leading")
  } else if (mixedType == 1) {
    ## create "mixed" class #    -- Eliot May 28, 2019 -- faster than previous below
    ## 1. anything with >= vegLeadingProportion is "pure"
    ## 2. sort on pixelGroup and speciesProportion, reverse so that 1st row of each pixelGroup is the largest
    ## 3. Keep only first row in each pixelGroup
    ## 4. change column names and convert pure to mixed ==> mixed <- !pure
    pixelGroupData3 <- pixelGroupData[, list(
      pure = speciesProportion >= vegLeadingProportion,
      speciesCode, get(pixelGroupColName), speciesProportion)]
    setnames(pixelGroupData3, "V3", pixelGroupColName)
    setorderv(pixelGroupData3, cols = c(pixelGroupColName, "speciesProportion"), order = -1L)
    set(pixelGroupData3, NULL, "speciesProportion", NULL)
    pixelGroupData3 <- pixelGroupData3[, .SD[1], by = pixelGroupColName]
    pixelGroupData3[pure == FALSE, speciesCode := "Mixed"]
    setnames(pixelGroupData3, "speciesCode", "leading")
    pixelGroupData3[, pure := !pure]
    setnames(pixelGroupData3, "pure", "mixed")

    ## Old algorithm for above, this is ~43 times slower
    # a2 <- Sys.time()
    # pixelGroupData2 <- pixelGroupData[, list(mixed = all(speciesProportion < vegLeadingProportion),
    #                                          leading = speciesCode[which.max(speciesProportion)]),
    #                                   by = pixelGroupColName]
    # pixelGroupData2[mixed == TRUE, leading := "Mixed"]
    # b2 <- Sys.time()
  } else if (mixedType == 2) {
    if (!sppEquivCol %in% colnames(sppEquiv)) {
      stop(sppEquivCol, " is not in sppEquiv. Please pass an existing sppEquivCol")
    }

    sppEq <- data.table(sppEquiv[[sppEquivCol]], sppEquiv[["Type"]])

    names(sppEq) <- c("speciesCode", "Type")
    setkey(pixelGroupData, speciesCode)

    # don't need all columns now
    colsToDelete <- c("rasterToMatch", leadingBasedOn, totalOfLeadingBasedOn)
    colsToDelete <- colsToDelete[colsToDelete %in% colnames(pixelGroupData)]
    set(pixelGroupData, NULL, colsToDelete, NULL)
    pixelGroupData3 <- merge(pixelGroupData, sppEq[!duplicated(sppEq)], all.x = TRUE)
    setkeyv(pixelGroupData, pgdAndSc)

    setkeyv(pixelGroupData3, pgdAndSc)
    mixedType2Condition <- quote(Type == "Deciduous" &
                                   speciesProportion < vegLeadingProportion &
                                   speciesProportion > 1 - vegLeadingProportion)
    pixelGroupData3[, mixed := FALSE]

    if (algo == 2 || isTRUE(doAssertion)) {
      b <- pixelGroupData3[, list(N = .N), by = pixelGroupColName]
      b <- rep.int(b[["N"]], b[["N"]])
      GT1 <- b > 1


      pgd3GT1 <- pixelGroupData3[GT1]
      pgd3NGT1 <- pixelGroupData3[!GT1]

      pgd3GT1[eval(mixedType2Condition), mixed := TRUE, by = pixelGroupColName]
      pgd3GT1[, mixed := any(mixed), by = pixelGroupColName]
      pixelGroupData3 <- rbindlist(list(pgd3NGT1, pgd3GT1))
    } else {
      pixelGroupData3[eval(mixedType2Condition), mixed := TRUE, by = pixelGroupColName]
      pixelGroupData3[, mixed := any(mixed), by = pixelGroupColName]
    }

    setorderv(pixelGroupData3, cols = c(pixelGroupColName, "speciesProportion"), order = -1L)
    set(pixelGroupData3, NULL, "speciesProportion", NULL)
    set(pixelGroupData3, NULL, "Type", NULL)
    pixelGroupData3 <- pixelGroupData3[, .SD[1], by = pixelGroupColName] ## sp. w/ highest prop. per pixelGroup
    pixelGroupData3[mixed == TRUE, speciesCode := "Mixed"]
    setnames(pixelGroupData3, "speciesCode", "leading")
    set(pixelGroupData3, NULL, "leading", factor(pixelGroupData3[["leading"]]))
  } else {
    stop("invalid mixedType! Must be one of '0', '1' or '2'.")
  }

  cols <- c(pixelGroupColName, "leading")
  xx <- pixelGroupData3[, ..cols][x, on = pixelGroupColName]

  return(xx)
}

preambleVTG <- function(x, vegLeadingProportion, doAssertion, nrowCohortData) {
  if (!inRange(vegLeadingProportion, 0, 1)) {
    stop("vegLeadingProportion must be a proportion")
  }

  leadingBasedOn <- if ("B" %in% colnames(x)) {
    message("Using B to derive leading type")
    "B"
  } else if ("cover" %in% colnames(x)) {
    message("Using cover to derive leading type, as there is no B column")
    "cover"
  } else {
    stop("x must have either B or cover to determine leading species/type")
  }
  if (!nrowCohortData > 0) stop("cohortData is empty")

  if (isTRUE(doAssertion)) {
    message("LandR::vegTypeMapGenerator: NROW(x) == ", nrowCohortData)
  }

  leadingBasedOn
}

# derived from plyr::mapvalues
mapvalues2 <- function(x, from, to) { #
  if (length(from) != length(to)) {
    stop("`from` and `to` vectors are not the same length.")
  }
  mapidx <- match(x, from)
  mapidxNA <- is.na(mapidx)
  from_found <- sort(unique(mapidx))
  x[!mapidxNA] <- to[mapidx[!mapidxNA]]
  x
}
PredictiveEcology/LandR documentation built on June 7, 2024, 4:16 p.m.