R/cohorts.R

Defines functions updateCohortData

Documented in updateCohortData

utils::globalVariables(c(
  ".", "..cols", "..colsToSubset", ".I", ":=", "..groupVar", "age", "aNPPAct", "cover", "coverOrig",
  "ecoregion", "ecoregionGroup", "hasBadAge",
  "imputedAge", "initialEcoregion", "initialEcoregionCode", "initialPixels",
  "lcc", "maxANPP", "maxB", "maxB_eco", "mortality",
  "newPossLCC", "noPixels", "oldSumB", "ord", "outBiomass", "oldEcoregionGroup",
  "pixelGroup2", "pixelIndex", "pixels", "Provenance", "possERC",
  "speciesposition", "speciesGroup", "speciesInt", "state", "sumB",
  "temppixelGroup", "toDelete", "totalBiomass", "totalCover",
  "uniqueCombo", "uniqueComboByRow", "uniqueComboByPixelIndex", "V1", "year"
))

#' Add cohorts to \code{cohortData} and \code{pixelGroupMap}
#'
#' This is a wrapper for  \code{generatePixelGroups}, \code{initiateNewCohort} and updates to
#' \code{pixelGroupMap} via assignment to new \code{pixelIndex} values in \code{newPixelCohortData}.
#' By running these all together, there is less chance that they will diverge.
#' There are some checks internally for consistency.
#'
#' Does the following:
#' \enumerate{
#'   \item add new cohort data into \code{cohortData};
#'   \item assign initial \code{B} and \code{age} for new cohort;
#'   \item assign the new \code{pixelGroup} to the pixels that have new cohort;
#'   \item update the \code{pixelGroup} map.
#' }
#'
#' @template newPixelCohortData
#' @template cohortData
#' @template pixelGroupMap
#' @template currentTime
#' @template speciesEcoregion
#' @template cohortDefinitionCols
#'
#' @param treedFirePixelTableSinceLastDisp A data.table with at least 2 columns, \code{pixelIndex} and \code{pixelGroup}.
#'   This will be used in conjunction with \code{cohortData} and \code{pixelGroupMap}
#'   to ensure that everything matches correctly.
#' @param successionTimestep The time between successive seed dispersal events.
#'   In LANDIS-II, this is called "Succession Timestep". This is used here
#' @param verbose Integer, where increasing number is increasing verbosity. Currently,
#'    only level 1 exists; but this may change.
#'
#' @template doAssertion
#'
#' @return
#' A list of length 2, \code{cohortData} and \code{pixelGroupMap}, with
#' \code{newPixelCohortData} inserted.
#'
#' @export
#' @rdname updateCohortData
#' @importFrom crayon green magenta
#' @importFrom data.table copy rbindlist set setkey
#' @importFrom raster getValues
#' @importFrom SpaDES.core paddedFloatToChar
#' @importFrom stats na.omit
updateCohortData <- function(newPixelCohortData, cohortData, pixelGroupMap, currentTime,
                             speciesEcoregion, treedFirePixelTableSinceLastDisp = NULL,
                             successionTimestep,
                             cohortDefinitionCols = c("pixelGroup", 'age', 'speciesCode'),
                             verbose = getOption("LandR.verbose", TRUE),
                             doAssertion = getOption("LandR.assertions", TRUE)) {

  maxPixelGroup <- as.integer(maxValue(pixelGroupMap))

  if (!is.null(treedFirePixelTableSinceLastDisp)) {
    ## only set to 0 the pixels that became empty (have no survivors)
    emptyPix <- setdiff(treedFirePixelTableSinceLastDisp$pixelIndex,
                        newPixelCohortData[type == "survivor", pixelIndex])
    pixelGroupMap[emptyPix] <- 0L
  }
  relevantPixels <- pixelGroupMap[][newPixelCohortData$pixelIndex]
  zeroOnPixelGroupMap <- relevantPixels == 0

  if (!"age" %in% colnames(newPixelCohortData))
    newPixelCohortData[, age := 1L]

  if (all(zeroOnPixelGroupMap)) {
    # Deal with pixels on the map that have no pixelGroup -- these are burned
    # pixels --> the entirely newly regenerated pixels do not require a
    # re-pixelGroupMaping  -- can just add to existing pixelGroup values
    if (verbose > 0)
      message(crayon::green("  Regenerating only burnt pixels with no survivors (i.e. resprouting & serotiny)"))
    columnsForPG <- c("ecoregionGroup", "speciesCode", "age") # no Biomass because they all have zero
    cd <- newPixelCohortData[,c("pixelIndex", columnsForPG), with = FALSE]
    newPixelCohortData[, pixelGroup := generatePixelGroups(cd, maxPixelGroup = maxPixelGroup,
                                                           columns = columnsForPG)]#,
    #successionTimestep = successionTimestep)

    # Remove the duplicated pixels within pixelGroup (i.e., 2+ species in the same pixel)
    pixelsToChange <- unique(newPixelCohortData[, c("pixelIndex", "pixelGroup")],
                             by = c("pixelIndex"))
  } else {
    # This is for situations where there are some empty pixels being filled,
    #   and some occupied pixels getting infilling. This requires a wholesale
    #   re-pixelGroup
    if (verbose > 0)
      message(crayon::green("  Regenerating open and pixels with B (likely after seed dispersal, or partial mortality following disturbance)"))

    pixelIndex <- which(pixelGroupMap[] %in% cohortData$pixelGroup)

    #remove unnecessary columns before making cohortDataLong
    cohortData[, prevMortality := NULL]
    newPixelCohortData[, year := NULL]

    cohortDataPixelIndex <- data.table(pixelIndex = pixelIndex,
                                       pixelGroup = pixelGroupMap[][pixelIndex])
    cdLong <- cohortDataPixelIndex[cohortData, on = "pixelGroup", allow.cartesian = TRUE]
    cohorts <- rbindlist(list(cdLong, newPixelCohortData), use.names = TRUE, fill = TRUE)

    columnsForPG <- c("ecoregionGroup", "speciesCode", "age", "B")
    cd <- cohorts[, c("pixelIndex", columnsForPG), with = FALSE]
    cohorts[, pixelGroup := generatePixelGroups(cd, maxPixelGroup = 0L, columns = columnsForPG)]

    # Bring to pixelGroup level -- this will squash the data.table
    if (is.null(cohorts[["sumB"]])) {
      cohorts[, sumB := sum(B, na.rm = TRUE), by = pixelGroup]
    }
    #Old way that does not preserve additional 'unknown' columns in cohortData
    # allCohortData <- cohorts[ , .(ecoregionGroup = ecoregionGroup[1],
    #                               mortality = mortality[1],
    #                               aNPPAct = aNPPAct[1],
    #                               sumB = sumB[1]),
    #                           by = uniqueCohortDefinition]

    #newer way that will potentially conflict with LandR.CS due to differing aNPP
    colsToSubset <- setdiff(colnames(cohortData), c("pixelIndex"))
    allCohortData <- cohorts[!duplicated(cohorts[, .(pixelGroup, speciesCode, ecoregionGroup, age)]), ..colsToSubset]

    theNewOnes <- is.na(allCohortData$B)
    cohortData <- allCohortData[!theNewOnes]
    newPixelCohortData <- allCohortData[theNewOnes]

    # Remove the duplicated pixels within pixelGroup (i.e., 2+ species in the same pixel)
    pixelsToChange <- unique(cohorts[, c("pixelIndex", "pixelGroup")], by = c("pixelIndex"))
  }

  # update pixelGroupMap
  pixelGroupMap[pixelsToChange$pixelIndex] <- pixelsToChange$pixelGroup

  if (doAssertion) {
    if (!isTRUE(all(pixelsToChange$pixelGroup == pixelGroupMap[][pixelsToChange$pixelIndex])))
      stop("pixelGroupMap and newPixelCohortData$pixelGroupMap don't match in updateCohortData fn")
  }

  ## give B in pixels that have serotiny/resprouting
  # newPixelCohortData[, sumB := sum(B, na.rm = TRUE), by = pixelGroup]

  ##########################################################
  # Add new cohorts and rm missing cohorts (i.e., those pixelGroups that are gone)
  ##########################################################
  cohortData <- .initiateNewCohorts(newPixelCohortData, cohortData,
                                      pixelGroupMap, currentTime = currentTime,
                                      speciesEcoregion = speciesEcoregion,
                                      successionTimestep = successionTimestep)

  outs <- rmMissingCohorts(cohortData, pixelGroupMap, cohortDefinitionCols = cohortDefinitionCols)

  if (!is.null(outs$cohortData$sumB)) {
    outs$cohortData[, sumB := NULL]
  }

  assertCohortData(outs$cohortData, outs$pixelGroupMap,
                   cohortDefinitionCols = cohortDefinitionCols,
                   doAssertion = doAssertion, verbose = verbose)

  if (doAssertion) {
    maxPixelGroupFromCohortData <- max(outs$cohortData$pixelGroup)
    maxPixelGroup <- as.integer(maxValue(outs$pixelGroupMap))
    test1 <- (!identical(maxPixelGroup, maxPixelGroupFromCohortData))
    if (test1) {
      stop("The sim$pixelGroupMap and cohortData have unmatching pixelGroup.",
           " They must be matching.",
           " If this occurs, please contact the module developers")
    }
  }

  if (verbose > 0) {
    nPixForest <- sum(!is.na(outs$pixelGroupMap[]))
    nPixGrps <- length(unique(outs$cohortData$pixelGroup))
    nPixNoPixGrp <- sum(outs$pixelGroupMap[] == 0, na.rm = TRUE)
    nPixTreed <- sum(outs$pixelGroupMap[] != 0, na.rm = TRUE)

    nDigits <- max(nchar(c(nPixForest, nPixGrps, nPixNoPixGrp))) + 3
    message(crayon::magenta("NUMBER OF FORESTED PIXELS          :",
                            paddedFloatToChar(nPixForest, padL = nDigits, pad = " ")))
    message(crayon::magenta("NUMBER OF PIXELS WITH TREES        :",
                            paddedFloatToChar(nPixTreed, padL = nDigits, pad = " ")))
    message(crayon::magenta("NUMBER OF UNIQUE PIXELGROUPS       :",
                            paddedFloatToChar(nPixGrps, padL = nDigits, pad = " ")))
    message(crayon::magenta("NUMBER OF PIXELS WITH NO PIXELGROUP:",
                            paddedFloatToChar(nPixNoPixGrp, padL = nDigits, pad = " ")))
  }

  return(list(cohortData = outs$cohortData,
              pixelGroupMap = outs$pixelGroupMap))
}

#' Initiate new cohorts
#'
#' Calculate new values for \code{B}, add \code{age}, then \code{rbindlist} this
#' with \code{cohortData}.
#'
#' @template newPixelCohortData
#' @template cohortData
#' @template cohortDefinitionCols
#'
#' @return A \code{data.table} with a new \code{rbindlist}ed \code{cohortData}
#'
#' @importFrom data.table copy rbindlist set setkey
#' @importFrom raster getValues
#' @importFrom stats na.omit
#' @rdname updateCohortData
.initiateNewCohorts <- function(newPixelCohortData, cohortData, pixelGroupMap, currentTime,
                                cohortDefinitionCols = c("pixelGroup", 'speciesCode', 'age'),
                                speciesEcoregion, successionTimestep) {

  ## get spp "productivity traits" per ecoregion/present year
  ## calculate maximum B per ecoregion, join to new cohort data
  namesNCD <- names(newPixelCohortData)
  if (!isTRUE("pixelGroup" %in% namesNCD)) {
    if (isTRUE("pixelIndex" %in% namesNCD)) {
      newPixelCohortData[, pixelGroup := getValues(pixelGroupMap)[pixelIndex]]
    } else {
      stop("newPixelCohortData must have either pixelIndex or pixelGroup")
    }
  }

  ## simplifying to pixelGroup again
  if (!is.null(newPixelCohortData[["pixelIndex"]]))
    set(newPixelCohortData, NULL, "pixelIndex", NULL)
  newPixelCohortData <- newPixelCohortData[!duplicated(newPixelCohortData), ]   ## faster than unique

  specieseco_current <- speciesEcoregionLatestYear(speciesEcoregion, currentTime)
  specieseco_current <- setkey(specieseco_current[, .(speciesCode, maxANPP, maxB, ecoregionGroup)],
                               speciesCode, ecoregionGroup)

  ## Note that after the following join, some cohorts will be lost due to lack of
  ##  parameters in speciesEcoregion. These need to be modified in pixelGroupMap.
  #missingNewPixelCohortData <- newPixelCohortData[!specieseco_current, on = uniqueSpeciesEcoregionDefinition]
  specieseco_current <- specieseco_current[!is.na(maxB)]
  specieseco_current[, maxB_eco := max(maxB), by = ecoregionGroup]
  newPixelCohortData <- specieseco_current[newPixelCohortData, on = uniqueSpeciesEcoregionDefinition]
  newPixelCohortData <- newPixelCohortData[!is.na(maxB)]

  if (any(newPixelCohortData$age > 1))
    stop("newPixelCohortData should only have new cohorts aged 1")
  set(newPixelCohortData, NULL, "age", 1L)  ## set age to 1

  ## Ceres: this was causing new cohorts to be initialized with maxANPP.
  ## instead, calculate total biomass of older cohorts
  # set(newPixelCohortData, NULL, "sumB", 0L)
  if (!is.null(newPixelCohortData[["sumB"]]))
    set(newPixelCohortData, NULL, "sumB", NULL)
  cohortData[age >= successionTimestep, oldSumB := sum(B, na.rm = TRUE), by = "pixelGroup"]

  newPixelCohortData <- unique(cohortData[, .(pixelGroup, oldSumB)],
                               by = "pixelGroup")[newPixelCohortData, on = "pixelGroup"]
  set(newPixelCohortData, which(is.na(newPixelCohortData$oldSumB)), "oldSumB", 0)   ## faster than [:=]
  setnames(newPixelCohortData, "oldSumB", "sumB")
  set(cohortData, NULL, "oldSumB", NULL)

  if ("B" %in% names(newPixelCohortData))
    newPixelCohortData[, B := NULL]
  set(newPixelCohortData, NULL, "B",
      asInteger(pmax(1, newPixelCohortData$maxANPP *
                       exp(-1.6 * newPixelCohortData$sumB / newPixelCohortData$maxB_eco))))
  set(newPixelCohortData, NULL, "B", asInteger(pmin(newPixelCohortData$maxANPP, newPixelCohortData$B)))

  newPixelCohortData <- newPixelCohortData[, .(pixelGroup, ecoregionGroup, speciesCode, age, B,
                                               mortality = 0L, aNPPAct = 0L)]

  if (getOption("LandR.assertions")) {
    if (isTRUE(NROW(unique(newPixelCohortData, by = cohortDefinitionCols)) != NROW(newPixelCohortData)))
      stop("Duplicated new cohorts in a pixelGroup. Please debug LandR:::.initiateNewCohorts")
  }

  cohortData <- rbindlist(list(cohortData, newPixelCohortData), fill = TRUE, use.names = TRUE)
  # cohortData[, sumB := sum(B, na.rm = TRUE), by = "pixelGroup"]  ## recalculate sumB
  # if (!is.integer(cohortData[["sumB"]]))
  #   set(cohortData, NULL, "sumB", asInteger(cohortData[["sumB"]]))

  return(cohortData)
}

#' Remove missing cohorts from \code{cohortData} based on \code{pixelGroupMap}
#'
#' @template cohortData
#' @template cohortDefinitionCols
#' @template pixelGroupMap
#'
#' @template doAssertion
#'
#' @return
#' A \code{list} with 2 \code{data.table} objects, \code{cohortData} and \code{pixelGroupMap},
#' each updated based on missing \code{pixelGroups} in the other.
#'
#' @export
#' @importFrom data.table rbindlist set setkey
#' @importFrom raster getValues
#' @importFrom stats na.omit
rmMissingCohorts <- function(cohortData, pixelGroupMap,
                             cohortDefinitionCols = c("pixelGroup", 'age', 'speciesCode'),
                             doAssertion = getOption("LandR.assertions", TRUE)) {

  pgmValues <- data.table(pixelGroup = getValues(pixelGroupMap),
                          pixelIndex = seq(ncell(pixelGroupMap)))

  pgmVals <- na.omit(pgmValues)
  pgmVals <- pgmVals[pixelGroup > 0]
  whPgsStillInCDGoneFromPGM <- !cohortData$pixelGroup %in% pgmVals$pixelGroup
  pgsStillInCDGoneFromPGM <- cohortData[whPgsStillInCDGoneFromPGM,] #setdiff(unique(cohortData$pixelGroup), unique(pgmVals$pixelGroup))
  #pgsStillInPGMGoneFromCD <- setdiff(unique(pgmVals$pixelGroup), unique(cohortData$pixelGroup))
  whPgsStillInPGMGoneFromCD <- !pgmVals$pixelGroup %in% cohortData$pixelGroup
  pgsStillInPGMGoneFromCD <- pgmVals[whPgsStillInPGMGoneFromCD,]

  # REMOVE lines in cohortData that are no longer in the pixelGroupMap
  cohortData <- cohortData[!pixelGroup %in% pgsStillInCDGoneFromPGM$pixelGroup]
  # REMOVE pixels in pixelGroupMap that are no longer in the cohortData
  pixelGroupMap[pgsStillInPGMGoneFromCD$pixelIndex] <- NA

  assertCohortData(cohortData, pixelGroupMap, message = "rmMissingCohorts",
                   cohortDefinitionCols = cohortDefinitionCols,
                   doAssertion = doAssertion)

  if (NROW(unique(cohortData[pixelGroup == 67724]$ecoregionGroup)) > 1) stop()

  return(list(cohortData = cohortData,
              pixelGroupMap = pixelGroupMap))
}

#' Add the correct \code{pixelGroups} to a \code{pixelDataTable} object
#'
#' Generates unique groupings of a \code{data.table} object where one or more rows can
#' all belong to the same \code{pixelIndex}. Pixel groups will be identical pixels based
#' on unique combinations of \code{columns}.
#'
#' @param pixelDataTable  A \code{data.table} with column-based descriptions.
#'   Must have a column called \code{pixelIndex}, which allows for multiple rows to be associated
#'   with a single pixel.
#' @param maxPixelGroup A length 1 numeric indicating the current maximum \code{pixelGroup} value;
#'    the \code{pixelGroup} numbers returned will start at \code{maxPixelGroup + 1}.
#' @param columns A character vector of column names to use as part of the generation of unique
#'   combinations of features. Default is \code{c("ecoregionGroup", "speciesCode", "age", "B")}
#'
#' @return
#' Returns a vector of \code{pixelGroup} in the original order of the input \code{pixelDataTable}.
#' This should likely be added to the \code{pixelDataTable} object immediately.
#'
#' @export
#' @importFrom data.table setkey setorderv
#' @importFrom plyr mapvalues
#' @importFrom SpaDES.core paddedFloatToChar
generatePixelGroups <- function(pixelDataTable, maxPixelGroup,
                                columns = c("ecoregionGroup", "speciesCode", "age", "B")) {
  columnsOrig <- columns
  columns <- columns[columns %in% names(pixelDataTable)]
  columns2 <- paste0(columns, "2")
  if (!all(columns == columnsOrig))
    message("Creating pixelGroup values, but not using all columns requested. Only using, ",
            paste(columns, collapse = ", "), " instead of ", paste(columnsOrig, collapse = ", "))

  pcd <- pixelDataTable # no copy -- just for simpler name

  if (getOption("LandR.assertions")) {
    pcdOrig <- data.table::copy(pcd)
  }
  # concatenate within rows -- e.g., ecoregionCode_speciesCode_age_biomass or 647_11_Abie_sp_100_2000
  pcd[, uniqueComboByRow := do.call(paste, as.list(.SD)), .SDcols = columns]

  # concatenate within pixelIndex
  pcd[ , c("uniqueComboByPixelIndex") := paste(uniqueComboByRow, collapse = "__"), by = "pixelIndex"]
  pcd[ , c("pixelGroup") := as.integer(maxPixelGroup) + as.integer(factor(uniqueComboByPixelIndex))]

  if (getOption("LandR.assertions")) { # old algorithm
    # prepare object 1 (pcd) for checking below
    pcd[, ord := 1:.N]
    setorderv(pcd, c("pixelIndex"))
    pcd[, pixelGroup2 := mapvalues(pixelGroup, from = unique(pixelGroup), to = as.character(seq_along(unique(pixelGroup))))]
    setorderv(pcd, "ord")

    pcdOld <- data.table::copy(pcdOrig)

    # Convert to unique numeric
    pcdOld[ , c(columns2) := lapply(.SD, function(x) {
      a <- as.integer(factor(x))
    }), .SDcols = columns]

    # concatenate within rows -- e.g., ecoregionCode_speciesCode_age_biomass or 647_11_Abie_sp_100_2000
    pcdOld[, uniqueComboByRow := as.integer(factor( do.call(paste, as.list(.SD)))),
           .SDcols = columns2]

    # concatenate within pixelIndex
    pcdOld[ , c("uniqueComboByPixelIndex") := paste(uniqueComboByRow, collapse = "__"),
            by = "pixelIndex"]
    pcdOld[ , c("pixelGroup") := as.integer(maxPixelGroup) +
              as.integer(factor(uniqueComboByPixelIndex))]
    # prepare object 2 (pcdOld) for checking below
    pcdOld[, ord := 1:.N]
    setorderv(pcdOld, c("pixelIndex"))
    pcdOld[, pixelGroup2 := mapvalues(pixelGroup, from = unique(pixelGroup),
                                      to = as.character(seq_along(unique(pixelGroup))))]
    setorderv(pcdOld, "ord")

    # The check
    if (!identical(pcdOld$pixelGroup2, pcd$pixelGroup2))
      stop("new generatePixelGroups algorithm failing")
  }

  return(pcd$pixelGroup)
}

#' Pull out the values from \code{speciesEcoregion} table for current time
#'
#' @template speciesEcoregion
#' @template currentTime
#'
#' @return
#' The \code{speciesEcoregion} input object, but with data from only one year, the year
#' that is less than or equal to the \code{currentTime}
#'
#' @export
speciesEcoregionLatestYear <- function(speciesEcoregion, currentTime) {
  spEco <- speciesEcoregion[year <= currentTime]
  spEco[year == max(year)]
}

#' @keywords internal
.ageRndUpSuccessionTimestep <- function(age, successionTimestep) {
  as.integer(ceiling(as.numeric(age) / successionTimestep) * successionTimestep)
}

#' The columns in a \code{cohortData} that define "unique"
#'
#' If two pixels have identical values in all of these columns, they are the same \code{pixelGroup}.
#'
#' @export
#' @rdname uniqueDefinitions
uniqueCohortDefinition <- c("pixelGroup", "speciesCode", "age", "B")

#' @export
#' @rdname uniqueDefinitions
uniqueSpeciesEcoregionDefinition <- c("speciesCode", "ecoregionGroup")

#' Summary for \code{cohortData}
#'
#' @template cohortData
#'
#' @export
describeCohortData <- function(cohortData) {
  vals <- c("B", "totalBiomass", "age", "cover")
  names(vals) <- vals
  out <- lapply(vals, function(val)
    .cohortMessages(cohortData, val)
  )
  message(magenta("Pixels with non-NA cover:, ", cohortData[!is.na(cover), length(unique(pixelIndex))]))
}

#' @keywords internal
.cohortMessages <- function(cohortData, val) {
  out <- list()
  if (val %in% colnames(cohortData)) {
    pixelsNA <- NROW(cohortData[is.na(get(val)), unique("pixelIndex"), with = FALSE])
    message(magenta("Pixels with missing", val, ":", format(pixelsNA, big.mark = ",")))
    pixelsZero <- NROW(cohortData[, all(get(val) == 0), by = "pixelIndex"][get("V1") == TRUE])
    message(magenta("Pixels with all(",val," == 0): ", format(pixelsZero, big.mark = ",")))
    pixelsBiomassNonZero <- NROW(cohortData[, any(get(val) > 0), by = "pixelIndex"][get("V1") == TRUE])
    message(magenta("Pixels with all(",val," > 0): ", format(pixelsBiomassNonZero, big.mark = ",")))
    out <- list(pixelsNA = pixelsNA, pixelsZero = pixelsZero, pixelsBiomassNonZero = pixelsBiomassNonZero)
  }
  return(invisible(out))
}

#' Convert Land Cover Classes (LCC) to another value in its neighbourhood
#'
#' This will search around the pixels on \code{rstLCC} that have
#' \code{classesToReplace}, and search in iteratively increasing
#' radii outwards for other Land Cover Classes than the those indicated in
#' \code{classesToReplace}. This will constrain
#' It will then take the cohorts that were in pixels with \code{classesToReplace}
#' and assign them new values in the output object. This function will
#' also check that it must be an \code{ecoregionCode} that already exists in
#' \code{cohortData}, i.e., not create new \code{ecoregionCode} values. See Details.
#'
#' @details
#' This function is designed to be used in highly constrained situations, where it is not
#' just replacing a Land Cover Class by a neighbouring Land Cover Class. But it can
#' be used for the simpler cases of simply replacing a Land Cover Class.
#'
#' @param pixelClassesToReplace Deprecated. Use \code{classesToReplace}
#'
#' @param classesToReplace Integer vector of classes that are are to be replaced,
#'     e.g., 34, 35, 36 on LCC2005, which are burned young, burned 10 year, and cities.
#'
#' @param rstLCC LCC raster, e.g., LCC2005
#'
#' @param theUnwantedPixels An optional vector of pixel IDs that need to be changed.
#'   If not provided, then pixels to change will be taken from the match between
#'   \code{availableERC_by_Sp} and \code{classesToReplace}. Supplying this allows
#'   the user to only replace some of the pixels with a given class.
#'
#' @param ecoregionGroupVec Deprecated. Use \code{availableERC_by_Sp}
#'
#' @param speciesEcoregion Deprecated. Use \code{availableERC_by_Sp}
#'
#' @param availableERC_by_Sp A \code{data.table} or \code{data.frame} with 3 columns:
#'   \code{speciesCode}, \code{initialEcoregionCode} and \code{pixelIndex}.
#'   \code{pixelIndex} is the pixel id for each line in the \code{data.table};
#'   \code{speciesCode} is the species name in the pixel (can have more than one
#'     species per pixel, so multiple rows per pixel); and,
#'   \code{initialEcoregionCode} is the unique codes that are "available" to be
#'     used as a replacement for \code{classesToReplace}. \code{initialEcoregionCode}
#'     must be a character vector, with one or no "_" used as a separator, with the last
#'     component being the Land Cover Class that matches \code{classesToReplace}, e.g.,
#'     \code{"242_18"}. If there is no "_" in this code, then the codes must match the
#'     \code{classesToReplace} exactly, e.g., \code{"11"}.
#'     If \code{pixelIndex} is missing, the function will fill it
#'     with \code{seq(ncell(rstLCC))}. If \code{speciesCode} is missing, the function
#'     will replace it with a dummy value (\code{"allSpecies"}).
#'
#' @template doAssertion
#'
#' @return
#' A \code{data.table} with two columns, \code{pixelIndex} and \code{ecoregionGroup}.
#' This represents the new codes to used in the \code{pixelIndex} locations.
#' These should have no values overlapping with \code{classesToReplace}.
#'
#' @author Eliot McIntire
#' @export
#' @importFrom data.table as.data.table is.data.table rbindlist setnames
#' @importFrom raster raster
#' @importFrom SpaDES.core paddedFloatToChar
#' @importFrom SpaDES.tools spread2
convertUnwantedLCC <- function(classesToReplace = 34:36, rstLCC,
                               availableERC_by_Sp, theUnwantedPixels,
                               ecoregionGroupVec, speciesEcoregion, pixelClassesToReplace,
                               doAssertion = getOption("LandR.assertions", TRUE)) {
  if (!missing(pixelClassesToReplace))
    stop("pixelClassesToReplace is deprecated. Please use classesToReplace")
  if (!missing(ecoregionGroupVec))
    stop("ecoregionGroupVec is deprecated. Please use availableERC_by_Sp")
  if (!missing(speciesEcoregion))
    stop("speciesEcoregion is deprecated. Please use availableERC_by_Sp")

  if (!is.data.table(availableERC_by_Sp))
    if (is.data.frame(availableERC_by_Sp)) {
      availableERC_by_Sp <- as.data.table(availableERC_by_Sp)
    } else {
      stop("availableERC_by_Sp must be a data.table or data.frame")
    }

  if (all(grepl("_", availableERC_by_Sp$initialEcoregionCode))) {
    hasPreDash <- TRUE
    availableERC_by_Sp[, ecoregion := gsub("_.*", "", initialEcoregionCode)]
  } else {
    hasPreDash <- FALSE
  }

  genericSpeciesName <- "allSpecies"
  if (!"speciesCode" %in% names(availableERC_by_Sp)) {
    availableERC_by_Sp[, speciesCode := genericSpeciesName]
  }
  if (!"pixelIndex" %in% names(availableERC_by_Sp)) {
    availableERC_by_Sp[, pixelIndex := seq(ncell(rstLCC))]
  }
  if (!is.character(availableERC_by_Sp$initialEcoregionCode) && hasPreDash) {
    availableERC_by_Sp[, initialEcoregionCode := as.character(initialEcoregionCode)]
  }

  # if (!all(grepl("_", availableERC_by_Sp$initialEcoregionCode))) {
  #   numChar <- max(nchar(availableERC_by_Sp$initialEcoregionCode), na.rm = TRUE)
  #   availableERC_by_Sp[!is.na(initialEcoregionCode),
  #                      initialEcoregionCode := paste0("1_", paddedFloatToChar(as.integer(initialEcoregionCode), numChar))]
  # }

  if (missing(theUnwantedPixels)) {
      theUnwantedRows <- gsub(".*_", "", availableERC_by_Sp$initialEcoregionCode) %in%
        as.character(classesToReplace)
      theUnwantedPixels <- sort(unique(availableERC_by_Sp[theUnwantedRows, "pixelIndex"])[[1]])
  }

  if (doAssertion) {
    #  stop("values of 34 and 35 on pixelCohortData and sim$LCC2005 don't match")
  }
  iterations <- 1
  # remove the lines that have the code "classesToReplace"
  availableERG2 <- if (hasPreDash) {
    availableERC_by_Sp[-which(gsub(".*_", "", initialEcoregionCode) %in% classesToReplace)]
  } else {
    availableERC_by_Sp[-which(initialEcoregionCode %in% classesToReplace)]
  }

  availableERG2 <- unique(availableERG2, by = c("speciesCode", "initialEcoregionCode"))
  if (doAssertion) {
    if (any(gsub(".*_", "", availableERG2$initialEcoregionCode) %in% classesToReplace))
      stop("classesToReplace are still considered 'available' forest classes")
  }

  availableERG2[, `:=`(pixelIndex = NULL)]

  numCharIEC <- max(nchar(availableERC_by_Sp$initialEcoregionCode), na.rm = TRUE)
  if (hasPreDash) {
    numCharEcoregion <- max(nchar(availableERG2$ecoregion), na.rm = TRUE)
    numCharLCCCodes <- numCharIEC - numCharEcoregion - 1 # minus 1 for the dash
    availableERG2[, `:=`(ecoregion = NULL)]
  } else {
    numCharLCCCodes <- numCharIEC
  }

  currentLenUnwantedPixels <- length(theUnwantedPixels)
  repeatsOnSameUnwanted <- 0

  while (length(theUnwantedPixels) > 0) {
    message("Converting unwanted LCCs: ", length(theUnwantedPixels), " pixels remaining.")
    out <- spread2(rstLCC, start = theUnwantedPixels, asRaster = FALSE,
                   iterations = iterations, allowOverlap = TRUE, spreadProb = 1)
    out <- out[initialPixels != pixels] # rm pixels which are same as initialPixels --> these are known wrong
    iterations <- iterations + 1
    out[, lcc := rstLCC[][pixels]]
    out[lcc %in% c(classesToReplace), lcc := NA]
    out <- na.omit(out)
    out5 <- availableERC_by_Sp[out[, state := NULL], allow.cartesian = TRUE,
                               on = c("pixelIndex" = "initialPixels"), nomatch = NA] # join the availableERC_by_Sp which has initialEcoregionCode

    if (hasPreDash) {
      out5[, possERC := paste0(ecoregion, "_",
                               paddedFloatToChar(as.integer(lcc), padL = numCharLCCCodes, padR = 0))]
    } else {
      out5[, possERC := lcc]
    }
    out7 <- out5[availableERG2, on = c("speciesCode", "possERC" = "initialEcoregionCode"), nomatch = NA]
    out6 <- na.omit(out7)

    # These ones are missing at least something in the new possERC
    possERCToRm <- out5[!availableERG2, on = c("speciesCode", "possERC" = "initialEcoregionCode")]
    out6 <- out6[!possERC %in% unique(possERCToRm$possERC)]

    # sanity check -- don't let an infinite loop
    if (currentLenUnwantedPixels == length(theUnwantedPixels)) {
      repeatsOnSameUnwanted <- repeatsOnSameUnwanted + 1
    } else {
      currentLenUnwantedPixels <- length(theUnwantedPixels)
      repeatsOnSameUnwanted <- 0
    }

    if (repeatsOnSameUnwanted > 5) {
      out2 <- data.table(newPossLCC = NA, pixelIndex = theUnwantedPixels, ecoregionGroup = NA)
      message("  removing ", NROW(theUnwantedPixels), " pixel of class ",
              paste(rstLCC[theUnwantedPixels], collapse = ", "), " because couldn't",
              " find a suitable replacement")
      pixelsToNA <- theUnwantedPixels
      theUnwantedPixels <- integer()
    }

    if (NROW(out6) > 0) {
      ## take random sample of available, weighted by abundance
      rowsToKeep <- out6[, list(keep = .resample(.I, 1)), by = c("pixelIndex")]
      out8 <- out6[rowsToKeep$keep]
      out2 <- out8[, list(newPossLCC = lcc, pixelIndex)]
      if (hasPreDash) {
        out2[, initialEcoregion := substr(out8[, initialEcoregionCode], 1, numCharEcoregion)]
        out2[, ecoregionGroup := paste0(initialEcoregion, "_",
                                        paddedFloatToChar(as.integer(newPossLCC), padL = 2, padR = 0))] #nolint
        out2[, initialEcoregion := NULL]
      } else {
        out2[, ecoregionGroup := as.integer(newPossLCC)] #nolint
      }

      ## remove combinations of ecoregionGroup and speciesCode that don't exist
      ## -- Now this excludes B = 0
      keepPixels <- unique(out2$pixelIndex)
      theUnwantedPixels <- theUnwantedPixels[!theUnwantedPixels %in% keepPixels]
      out2 <- unique(out2)

      if (!exists("out3")) {
        out3 <- out2
      } else {
        out3 <- rbindlist(list(out2, out3))
      }
    }
  }

  if (!exists("out3")) {
    out3 <- data.table(pixelIndex = NA, ecoregionGroup = NA)[!is.na(pixelIndex)]
  } else {
    # setnames(out3, c("initialPixels", "initialEcoregionCode"), c("pixelIndex", "ecoregionGroup"))
    out3[, `:=`(newPossLCC = NULL)]
    # out3 <- unique(out3, by = c("pixelIndex", "ecoregionGroup"))
    out3 <- unique(out3)
  }

  if (exists("pixelsToNA")) {
    ## make sure these pixels get an NA ecoregion by rm them in case they are present
    if (any(out3$pixelIndex %in% pixelsToNA))
      out3 <- out3[!pixelIndex %in% pixelsToNA]
    out3 <- rbind(out3, data.table(pixelIndex = pixelsToNA, ecoregionGroup = NA))
  }

  if (doAssertion) {
    if (any(gsub(".*_", "", out3$ecoregionGroup) %in% classesToReplace))
      stop("classesToReplace we're not fully removed")
  }

  out3
}

#' Generate template \code{cohortData} table
#'
#' Internal function used by \code{\link{makeAndCleanInitialCohortData}}.
#'
#' @param inputDataTable A \code{data.table} with columns described above.
#'
#' @template doAssertion
#'
#' @param rescale Logical. If \code{TRUE}, the default, cover for each species will be rescaled
#'   so all cover in \code{pixelGroup} or pixel sums to 100.
#'
#' @importFrom crayon blue
#' @importFrom data.table melt setnames
#' @keywords internal
.createCohortData <- function(inputDataTable, # pixelGroupBiomassClass,
                              doAssertion = getOption("LandR.assertions", TRUE), rescale = TRUE,
                              minCoverThreshold = 5) {
  coverColNames <- grep(colnames(inputDataTable), pattern = "cover", value = TRUE)
  newCoverColNames <- gsub("cover\\.", "", coverColNames)
  setnames(inputDataTable, old = coverColNames, new = newCoverColNames)
  message(blue("Create initial cohortData object, with no pixelGroups yet"))
  message(green("-- Begin reconciling data inconsistencies"))

  inputDataTable[, totalCover := rowSums(.SD), .SDcols = newCoverColNames]
  whEnoughCover <- inputDataTable$totalCover > minCoverThreshold
  message(green("  -- Removing all pixels with totalCover <= minCoverThreshold (affects",
                sum(!whEnoughCover),
                "of", NROW(inputDataTable),"pixels)"))
  message(green("     --> resulting in", sum(whEnoughCover), "pixels)"))
  inputDataTable <- inputDataTable[whEnoughCover]

  whAgeEqZero <- which(inputDataTable$age == 0)
  message(green("  -- Setting TotalBiomass in pixel to 0 where age == 0 (affects", length(whAgeEqZero),
                "of", NROW(inputDataTable),"pixels)"))
  message(green("     --> keeping ", NROW(inputDataTable), "pixels)"))
  inputDataTable[whAgeEqZero, `:=`(totalBiomass = 0)]

  whTotalBEqZero <- which(inputDataTable$totalBiomass == 0)
  message(green("  -- Setting age in pixel to 0 where totalBiomass == 0 (affects", length(whTotalBEqZero),
                "of", NROW(inputDataTable),"pixels)"))
  message(green("     --> keeping ", NROW(inputDataTable), "pixels)"))
  inputDataTable[whTotalBEqZero, `:=`(age = 0)]

  cohortData <- data.table::melt(inputDataTable,
                                 value.name = "cover",
                                 measure.vars = newCoverColNames,
                                 variable.name = "speciesCode")

  # Remove all cover <= minCoverThreshold
  whCoverGTMinCover <- which(cohortData$cover > minCoverThreshold)
  message(green("  -- Removing all cohorts with cover <= minCoverThreshold (affects", NROW(cohortData) - length(whCoverGTMinCover),
                "of", NROW(cohortData), "cohorts"))
  message(green("     --> resulting in", length(whCoverGTMinCover), "cohorts)"))
  cohortData <- cohortData[whCoverGTMinCover]
  message(green("     --> resulting in", length(unique(cohortData$pixelIndex)), "pixels)"))

  cohortData[, coverOrig := cover]
  if (isTRUE(doAssertion))
    if (any(duplicated(cohortData)))
      warning(".createCohortData: cohortData contains duplicate rows.")

  #if (doAssertion)
  #describeCohortData(cohortData)
  #message(green("  -- Assign B = 0 and age = 0 for pixels where cover = 0,\n",
  #             "because cover is most reliable dataset"))

  # hasCover0 <- which(cohortData[["cover"]] == 0)
  cncd <- colnames(cohortData)
  # if (any(c("age", "logAge") %in% cncd)) {
  #   set(cohortData, hasCover0, "age", 0L)
  #   set(cohortData, hasCover0, "logAge", .logFloor(0))
  # }
  if (any(c("B", "totalBiomass") %in% cncd)) {
    #set(cohortData, hasCover0, "B", 0)
    #message(green("  -- Assign totalBiomass = 0 if sum(cover) = 0 in a pixel, ",
    #             "  because cover is most reliable dataset"))
    #cohortData <- cohortData[, sum(cover) == 0, by = "pixelIndex"][V1 == TRUE][
    #  cohortData, on = "pixelIndex"][V1 == TRUE, totalBiomass := 0L]
    #cohortData[, V1 := NULL]
  }

  ## CRAZY TODO: DIVIDE THE COVER BY 2 for DECIDUOUS -- will only affect mixed stands
  # message(crayon::green(paste("POSSIBLE ALERT:",
  #                            "assume deciduous cover is 1/2 the conversion to B as conifer")))
  # cohortData[speciesCode == "Popu_sp", cover := asInteger(cover / 2)]

  # Change temporarily to numeric for following calculation
  set(cohortData, NULL, "cover", as.numeric(cohortData[["cover"]]))
  if (isTRUE(rescale)) {
    cohortData[, totalCover := sum(cover), by = "pixelIndex"]
    cohortData[, cover := cover/totalCover*100]
  }

    # cohortData[ , cover := {
    #   sumCover <- sum(cover)
    #   if (sumCover > 100) {
    #     cover <- cover / (sumCover + 0.0001) * 100L
    #   }
    #   cover
    # }, by = "pixelIndex"]

  # set(cohortData, NULL, "cover", asInteger(cohortData[["cover"]]))

  if (FALSE) {
    if (any(c("B", "totalBiomass") %in% cncd)) {
      # Biomass -- by cohort (NOTE: divide by 100 because cover is percent)
      #set(cohortData, NULL, "B", as.numeric(cohortData[["B"]]))
      set(cohortData, NULL, "B", cohortData[["totalBiomass"]] * cohortData[["cover"]] / 100)
      message(green("  -- Divide total B of each pixel by the relative cover of the cohorts"))

      #cohortData[ , B := mean(totalBiomass) * cover / 100, by = "pixelIndex"]
      # message(blue("Round B to nearest P(sim)$pixelGroupBiomassClass"))
      # cohortData[ , B := ceiling(B / pixelGroupBiomassClass) * pixelGroupBiomassClass]

      message(green("Set B to 0 where cover > 0 and age = 0, because B is least quality dataset"))
      cohortData[cover > 0 & age == 0, B := 0L]
      cohortData[ , totalBiomass := asInteger(totalBiomass)]
      set(cohortData, NULL, "B", asInteger(cohortData[["B"]]))
    }
  }

  # clean up
  set(cohortData, NULL, c("totalCover", "coverOrig"), NULL)
  return(cohortData)
}

#' Generate initial \code{cohortData} table
#'
#' Takes a single \code{data.table} input, which has the following columns in addition to
#'    others that will be labelled with species name, and contain percent cover of each:
#' \itemize{
#'   \item \code{pixelIndex} (integer)
#'   \item \code{age} (integer)
#'   \item \code{logAge} (numeric)
#'   \item \code{initialEcoregionCode} (factor)
#'   \item \code{totalBiomass} (integer)
#'   \item \code{lcc} (integer)
#'   \item \code{rasterToMatch} (integer)
#'   \item \code{speciesCode} (factor)
#'   \item \code{cover} (integer)
#'   \item \code{coverOrig} (integer)
#'   \item \code{B} (integer)
#' }
#'
#' @param inputDataTable A \code{data.table} with columns described above.
#'
#' @param sppColumns A vector of the names of the columns in \code{inputDataTable} that
#'   represent percent cover by species, rescaled to sum up to 100\%.
#'
#' @param imputeBadAgeModel DESCRIPTION NEEDED
#'
#' @param minCoverThreshold DESCRIPTION NEEDED
#'
#' @template doAssertion
#'
#' @param doSubset Turns on/off subsetting. Defaults to \code{TRUE}.
#'
#' @author Eliot McIntire
#' @export
#' @importFrom crayon blue green
#' @importFrom data.table melt setnames
#' @importFrom reproducible Cache .sortDotsUnderscoreFirst
#' @importFrom pemisc termsInData messageDF
#' @rdname makeAndCleanInitialCohortData
makeAndCleanInitialCohortData <- function(inputDataTable, sppColumns,
                                          #pixelGroupBiomassClass,
                                          #pixelGroupAgeClass = 1,
                                          imputeBadAgeModel = quote(lme4::lmer(age ~ B * speciesCode + cover * speciesCode + (1 | initialEcoregionCode))),
                                          minCoverThreshold,
                                          doAssertion = getOption("LandR.assertions", TRUE),
                                          doSubset = TRUE) {
  ### Create groupings
  if (doAssertion) {
    expectedColNames <- c("age", "logAge", "initialEcoregionCode", "totalBiomass",
                          "lcc", "pixelIndex")
    if (!all(expectedColNames %in% colnames(inputDataTable)))
      stop("Column names for inputDataTable must include ",
           paste(expectedColNames, collapse = " "))
    if (!all(sppColumns %in% colnames(inputDataTable)))
      stop("Species names are incorrect")
    if (!all(unlist(lapply(inputDataTable[, sppColumns, with = FALSE],
                           function(x) all(x >= 0 & x <= 100)))))
      stop("Species columns are not percent cover between 0 and 100. This may",
           " be because they more NA values than the Land Cover raster")
  }

  cohortData <- Cache(.createCohortData, inputDataTable = inputDataTable,
                      # pixelGroupBiomassClass = pixelGroupBiomassClass,
                      minCoverThreshold = minCoverThreshold,
                      doAssertion = doAssertion)

  ######################################################
  # Impute missing ages on poor age dataset
  ######################################################
  # Cases:
  #  All species cover = 0 yet totalB > 0

  cohortDataMissingAge <- cohortData[, hasBadAge :=
                                       #(age == 0 & cover > 0)#| # ok because cover can be >0 with biomass = 0
                                       (age > 0 & cover == 0) |
                                       is.na(age) #|
                                       #(B > 0 & age == 0) |
                                       #(B == 0 & age > 0)
  ][hasBadAge == TRUE]#, by = "pixelIndex"]

  if (NROW(cohortDataMissingAge) > 0) {
    cohortDataMissingAgeUnique <- unique(cohortDataMissingAge,
                                         by = c("initialEcoregionCode", "speciesCode"))[
                                           , .(initialEcoregionCode, speciesCode)]
    cohortDataMissingAgeUnique <- cohortDataMissingAgeUnique[
      cohortData, on = c("initialEcoregionCode", "speciesCode"), nomatch = 0]
    cohortDataMissingAgeUnique <- cohortDataMissingAgeUnique[!is.na(cohortDataMissingAgeUnique$age)]
    cohortDataMissingAgeUnique <- cohortDataMissingAgeUnique[, .(totalBiomass, age, speciesCode,
                                                                 initialEcoregionCode, cover)]
    zeros <- sapply(cohortDataMissingAgeUnique, function(x) sum(x == 0))
    if (sum(zeros, na.rm = TRUE)) {
      hasZeros <- zeros[zeros > 0]
      message(" ", paste(names(hasZeros), collapse = ", "), " had ",
              paste(hasZeros, collapse = ", "), " zeros, respectively")
      warning(" These are being removed from the dataset. If this is not desired; please fix.")
      # terms <- strsplit(gsub(" ", "", as.character(imputeBadAgeModel)), split = "[[:punct:]]+")[[2]][-1] # remove response
      # terms <- unique(terms)
      # terms <- terms[terms %in% colnames(cohortDataMissingAgeUnique)]
      terms <- termsInData(imputeBadAgeModel, cohortDataMissingAgeUnique)
      lapply(terms, function(x) {
        cohortDataMissingAgeUnique <<- cohortDataMissingAgeUnique[get(x) != 0]
      })

    }
    cohortDataMissingAgeUnique <- subsetDT(cohortDataMissingAgeUnique,
                                           by = c("initialEcoregionCode", "speciesCode"),
                                           doSubset = doSubset)
    message(blue("Impute missing age values: started", Sys.time()))

    outAge <- Cache(statsModel, modelFn = imputeBadAgeModel,
                    uniqueEcoregionGroups = .sortDotsUnderscoreFirst(
                      as.character(unique(cohortDataMissingAgeUnique$initialEcoregionCode))
                    ),
                    .specialData = cohortDataMissingAgeUnique,
                    omitArgs = ".specialData")
    message(blue("                           completed", Sys.time()))

    # paste with capture.output keeps table structure intact
    messageDF(outAge$rsq, 3, "blue")

    ## allow.new.levels = TRUE because some groups will have only NA for age for all species
    cohortDataMissingAge[
      , imputedAge := pmax(0L, asInteger(predict(outAge$mod,
                                                 newdata = cohortDataMissingAge,
                                                 allow.new.levels = TRUE)))]

    cohortData <- cohortDataMissingAge[, .(pixelIndex, imputedAge, speciesCode)][
      cohortData, on = c("pixelIndex", "speciesCode")]
    cohortData[!is.na(imputedAge), `:=`(age = imputedAge, logAge = .logFloor(imputedAge))]
    cohortData[, `:=`(imputedAge = NULL)]

  }
  # # Round ages to nearest pixelGroupAgeClass
  # set(cohortData, NULL, "age", asInteger(cohortData$age / pixelGroupAgeClass) *
  #       as.integer(pixelGroupAgeClass))

  cohortData[, `:=`(hasBadAge = NULL)]

  # #######################################################
  # # set B to zero if age is zero because B is lowest quality dataset
  # #######################################################
  # message(blue("Set recalculate totalBiomass as sum(B);",
  #              "many biomasses will have been set to 0 in previous steps"))
  # cohortData[cover > 0 & age == 0, B := 0L]
  # cohortData[, totalBiomass := asInteger(sum(B)), by = "pixelIndex"]

  # This was unused, but for beta regression, this can allow 0s and 1s without
  #   needing a separate model for zeros and ones
  # https://stats.stackexchange.com/questions/31300/dealing-with-0-1-values-in-a-beta-regression
  # cohortData[ , coverProp := (cover/100 * (NROW(cohortData) - 1) + 0.5) / NROW(cohortData)]

  return(cohortData)
}

#' Subset a \code{data.table} with random subsampling within \code{by} groups
#'
#' @param DT A \code{data.table}
#' @param by Character vector of column names to use for groups
#' @param doSubset Logical or numeric indicating the number of subsamples to use
#' @param indices Logical. If \code{TRUE}, this will return vector of row indices only. Defaults
#'   to \code{FALSE}, i.e., return the subsampled \code{data.table}
#'
#' @export
#' @examples
#' library(data.table)
#' dt <- data.table(Lett = sample(LETTERS, replace = TRUE, size = 1000), Nums = 1:100)
#' dt1 <- subsetDT(dt, by = "Lett", doSubset = 3)
subsetDT <- function(DT, by, doSubset = TRUE, indices = FALSE) {
  if (!is.null(doSubset)) {
    if (!isFALSE(doSubset)) {
      sam <- if (is.numeric(doSubset)) doSubset else 50
      message("subsampling initial dataset for faster estimation of maxBiomass parameter: ",
              "using maximum of ", sam, " samples per combination of ecoregionGroup and speciesCode. ",
              "Change 'doSubset' to a different number if this is not enough")
      # subset -- add line numbers of those that were sampled
      a <- DT[, list(lineNum = .I[sample(.N, size = min(.N, sam))]), by = by]
      # Select only those row numbers from whole dataset
      if (isFALSE(indices)) {
        DT <- DT[a$lineNum]
      } else {
        DT <- a$lineNum
      }
    }
  }
  return(DT)
}

#' The generic statistical model to run (\code{lmer} or \code{glmer})
#'
#' This does a few things including R squared, gets the fitted values.
#' It appears that running the models "as is" without this wrapper does not work with \code{Cache}.
#' The return of the model in a list solves this problem.
#' For Caching, the \code{.specialData} should be "omitted" via \code{omitArgs}, and
#' \code{uniqueEcoregionGroups} should not be omitted.
#'
#' @param modelFn A quoted expression of type \code{package::model(Y ~ X, ...)}, omitting
#'   the \code{data} argument. E.g. \code{lme4::glmer(Y ~ X + (X|G), family = poisson)}
#' @param uniqueEcoregionGroups Unique values of \code{ecoregionGroups}.
#'   This is the basis for the statistics, and can be used to optimize caching,
#'   e.g. ignore \code{.specialData} in \code{.omitArgs}.
#' @param sumResponse a sum of all the response variable values
#'   Also to be used to optimize caching, e.g. ignore \code{.specialData}
#'   in \code{.omitArgs}.
#' @param .specialData The custom dataset required for the model.
#'
#' @export
#' @importFrom crayon blue magenta red
#' @importFrom lme4 glmer lmer
#' @importFrom MuMIn r.squaredGLMM
#' @importFrom stats as.formula glm fitted predict
statsModel <- function(modelFn, uniqueEcoregionGroups, sumResponse, .specialData) {
  ## convert model call to vector of arguments
  modelArgs <- as.character(modelFn)
  names(modelArgs) <- names(modelFn)

  ## function name
  fun <- modelArgs[[1]]

  ## get formula and check
  form <- tryCatch(as.formula(modelArgs[2]),
                   error = function(e)
                     stop(paste("Could not convert '", modelArgs[2], "'to formula.",
                                "Check if formula is of type 'Y ~ X'")))

  ## check the no of grouping levels
  if (grepl("\\|", modelArgs[2])) {
    groupVar <- sub("\\).*", "", sub(".*\\| ", "",  modelArgs[2]))
    keepGrouping <- NROW(unique(.specialData[, ..groupVar])) >= 2

    if (!keepGrouping) {
      form <- sub("\\+ \\(.*\\|.*\\)", "", modelArgs[2])
      form <- as.formula(form)

      ## change to glm if dropping random effects
      fun <- "stats::glm"

      whereFam <- grep("family", names(modelArgs))
      modelFn2 <- if (length(whereFam))
        call(fun, form, family = modelArgs[[whereFam]]) else
          call(fun, form)

      message(blue("Grouping variable "), red("only has one level. "),
              blue("Formula changed to\n",
                   magenta(paste0(format(modelFn2, appendLF = FALSE), collapse = ""))))
    }
  }

  ## get function and check
  fun <- reproducible:::.extractFunction(fun)
  if (!is.function(fun))
    stop(paste0("Can't find the function '", modelArgs[1], "'.",
                " Is the function name correct and the package installed?"))

  ## prepare arguments, and strip function from list
  modelArgs <- as.list(modelArgs)
  modelArgs[[2]] <- form
  names(modelArgs)[2] <- "formula"
  modelArgs[[1]] <- NULL
  modelArgs$data <- quote(.specialData)

  isChar <- unlist(lapply(modelArgs, is.character))
  if (any(isChar))
    modelArgs[isChar] <- lapply(modelArgs[isChar], function(yyy) {
      eval(parse(text = yyy))
    })

  mod <- do.call(fun, modelArgs)

  list(mod = mod, pred = fitted(mod), rsq = MuMIn::r.squaredGLMM(mod))
}

#' Default columns that define pixel groups
#'
#' @export
columnsForPixelGroups <- c("ecoregionGroup", "speciesCode", "age", "B")

#' Generate \code{cohortData} table per pixel:
#'
#' @template cohortData
#'
#' @template pixelGroupMap
#'
#' @template cohortDefinitionCols
#'
#' @template doAssertion
#'
#' @return
#' An expanded \code{cohortData} \code{data.table} with a new \code{pixelIndex} column.
#'
#' @export
#' @importFrom raster getValues ncell
addPixels2CohortData <- function(cohortData, pixelGroupMap,
                                 cohortDefinitionCols = c("pixelGroup", 'age', 'speciesCode'),
                                 doAssertion = getOption("LandR.assertions", TRUE)) {
  assertCohortData(cohortData, pixelGroupMap, cohortDefinitionCols = cohortDefinitionCols,
                   doAssertion = doAssertion)

  pixelGroupTable <- na.omit(data.table(pixelGroup = getValues(pixelGroupMap),
                                        pixelIndex = 1:ncell(pixelGroupMap)))
  pixelCohortData <- cohortData[pixelGroupTable, on = "pixelGroup",
                                nomatch = 0, allow.cartesian = TRUE]

  assertPixelCohortData(pixelCohortData, pixelGroupMap, doAssertion = doAssertion)

  return(pixelCohortData)
}

#' Add number of pixels per \code{pixelGroup} and add it has a new column to \code{cohortData}
#'
#' @template cohortData
#'
#' @template pixelGroupMap
#' @template cohortDefinitionCols
#' @template doAssertion
#'
#' @return
#' An \code{cohortData} \code{dat.table} with a new \code{noPixels}
#' column
#'
#' @export
#' @importFrom data.table data.table
#' @importFrom raster maxValue
addNoPixel2CohortData <- function(cohortData, pixelGroupMap,
                                  cohortDefinitionCols = c("pixelGroup", 'age', 'speciesCode'),
                                  doAssertion = getOption("LandR.assertions", TRUE)) {
  assertCohortData(cohortData, pixelGroupMap,
                   cohortDefinitionCols = cohortDefinitionCols, doAssertion = doAssertion)

  noPixelsXGroup <- data.table(noPixels = tabulate(pixelGroupMap[]),
                               pixelGroup = c(1:maxValue(pixelGroupMap)))

  pixelCohortData <- cohortData[noPixelsXGroup, on = "pixelGroup", nomatch = 0]

  if (doAssertion) {
    test1 <- length(setdiff(pixelCohortData$pixelGroup, cohortData$pixelGroup)) > 0
    test2 <- sum(unique(pixelCohortData[, .(pixelGroup, noPixels)])$noPixels) !=
      sum(!is.na(pixelGroupMap[]) & pixelGroupMap[] != 0)  ## 0's have no cohorts.

    if (test1 | test2)
      stop("pixelGroups differ between pixelCohortData/pixelGroupMap and cohortData")
  }

  return(pixelCohortData)
}

#' Make the \code{cohortData} table, while modifying the temporary
#' \code{pixelCohortData} that will be used to prepare other files.
#'
#' Takes a \code{pixelCohortData} table (see \code{makeAndCleanInitialCohortData}),
#'   the \code{speciesEcoregion} list and returns a modified \code{pixelCohortData} and
#'   the \code{cohortData} tables to be used in the simulation.
#'   This function mainly removes unnecessary columns from \code{pixelCohortData},
#'   subsets pixels with \code{biomass > 0}, generates \code{pixelGroups},
#'   and adds \code{ecoregionGroup} and \code{totalBiomass} columns to \code{pixelCohortData}.
#'   \code{cohortData} is then created by subsetting unique combinations of \code{pixelGroup} and
#'   whatever columns are listed in \code{columnsForPixelGroups}.
#'   The resulting \code{cohortData} table has the following columns:
#' \itemize{
#'   \item \code{speciesCode} (factor)
#'   \item \code{ecoregionGroup} (factor)
#'   \item \code{pixelGroup} (integer)
#'   \item \code{age} (integer)
#'   \item \code{B} (integer)
#' }
#'
#' @template pixelCohortData
#' @param columnsForPixelGroups Default columns that define pixel groups
#' @param pixelGroupAgeClass Integer. When assigning \code{pixelGroup} membership, this defines the
#'   resolution of ages that will be considered 'the same \code{pixelGroup}', e.g., if it is 10,
#'   then 6 and 14 will be the same.
#' @param pixelGroupBiomassClass Integer. When assigning \code{pixelGroup} membership, this defines
#'   the resolution of biomass that will be considered 'the same \code{pixelGroup}', e.g., if it is
#'   100, then 5160 and 5240 will be the same
#' @param minAgeForGrouping Minimum age for regrouping. This may be because there is a source of
#'   ages for young stands/trees that is very reliable, such as a fire database.
#'   Ages below this will not be grouped together. Defaults to -1, meaning treat all ages equally.
#'   If this is related to known ages from a high quality database, then use age of the oldest
#'   trees in that database.
#' @param pixelFateDT A \code{data.table} of \code{pixelFateDT}; if none provided, will make an empty one.
#'
#' @template speciesEcoregion
#'
#' @return A list with a modified \code{pixelCohortData}, \code{cohortData}, and \code{pixelFateDT}
#' \code{data.table}s.
#'
#' @export
#' @importFrom data.table melt setnames set
#' @importFrom reproducible Cache
#' @importFrom utils tail
makeCohortDataFiles <- function(pixelCohortData, columnsForPixelGroups, speciesEcoregion,
                                pixelGroupBiomassClass, pixelGroupAgeClass, minAgeForGrouping = 0,
                                pixelFateDT) {
  ## make ecoregioGroup a factor (again) and remove unnecessary cols.
  # refactor because the "_34" and "_35" ones are still levels
  pixelCohortData[, ecoregionGroup := factor(as.character(ecoregionGroup))]
  cols <- intersect(c("logAge", "coverOrig", "totalBiomass",
                      "initialEcoregionCode", "cover", "lcc"),
                    names(pixelCohortData))
  set(pixelCohortData, j = cols, value = NULL)


  # Round ages to nearest pixelGroupAgeClass
  pixelCohortData[age > minAgeForGrouping,
                  age := asInteger(age / pixelGroupAgeClass) *
                    as.integer(pixelGroupAgeClass)]

  # Round Biomass to nearest pixelGroupBiomassClass
  message(blue("Round B to nearest P(sim)$pixelGroupBiomassClass"))
  pixelCohortData[#age > minAgeForGrouping,
                  , B := asInteger(B / pixelGroupBiomassClass) * as.integer(pixelGroupBiomassClass)]

  # Remove B == 0 cohorts after young removals
  message(green("  -- Removing cohorts with B = 0 and age > 0 -- these were likely poor predictions from updateYoungBiomasses"))
  whBEqZeroAgeGT0 <- which(pixelCohortData$B == 0 & pixelCohortData$age > 0)

  if (length(whBEqZeroAgeGT0) > 0)
    pixelCohortData2 <- pixelCohortData[-whBEqZeroAgeGT0]
  else
    pixelCohortData2 <- pixelCohortData

  lostPixels <- setdiff(pixelCohortData$pixelIndex, pixelCohortData2$pixelIndex)
  message(green("     affected", length(whBEqZeroAgeGT0), "cohorts, in", length(lostPixels), "pixels;"))
  lenUniquePix <- length(unique(pixelCohortData2$pixelIndex))
  message(green("     leaving", lenUniquePix, "pixels"))
  pixelFateDT <- pixelFate(pixelFateDT, fate = "rm pixels with Biomass == 0, after updating young cohort B",
                           length(lostPixels), runningPixelTotal = lenUniquePix)

  pixelCohortData <- pixelCohortData2
  # # Set B to 0 if age is 0
  # whAgeZero <- which(pixelCohortData$age == 0)
  # if (length(whAgeZero)) {
  #   message(green("    -- There were", length(whAgeZero), "pixels with age = 0; forcing B to zero"))
  #   pixelCohortData[whAgeZero, B := 0L]
  # }

  ## select pixels with biomass and generate pixel groups
  #pixelCohortData <- pixelCohortData[B >= 0]

  ########################################################################
  ## rebuild ecoregion, ecoregionMap objects -- some initial ecoregions disappeared (e.g., 34, 35, 36)
  ## rebuild biomassMap object -- biomasses have been adjusted
  ## There will be some speciesCode * ecoregionGroups combinations for which we only had age = 0 and B = 0
  if (is.factor(speciesEcoregion$ecoregionGroup)) {
    ## we need to check available params for speciesXecoregionGroup combos, this will be achieved with a
    # ecoregionsWeHaveParametersFor <- levels(speciesEcoregion$ecoregionGroup)
  } else {
    stop("speciesEcoregion$ecoregionGroup is supposed to be a factor; please go back in this",
         "module and correct this.")
  }

  message(blue("Removing some pixels because their species * ecoregionGroup combination has no age or B data to estimate ecoregion traits:"))
  # message(blue(paste(sort(unique(pixelCohortData[!ecoregionGroup %in% ecoregionsWeHaveParametersFor]$ecoregionGroup)), collapse = ", ")))
  cols <- c("speciesCode", "ecoregionGroup")
  messageDF(
    colour = "blue",
    pixelCohortData[!speciesEcoregion, on = cols][, ..cols][, list(numPixelsRemoved = .N), by = cols], # anti-join
  )

  # pixelCohortData <- pixelCohortData[ecoregionGroup %in% ecoregionsWeHaveParametersFor] # keep only ones we have params for
  pixelCohortData <- speciesEcoregion[, ..cols][pixelCohortData, on = cols, nomatch = 0]

  # Lost some ecoregionGroups -- refactor
  pixelCohortData[, ecoregionGroup := factor(as.character(ecoregionGroup))]

  cd <- pixelCohortData[, .SD, .SDcols = c("pixelIndex", columnsForPixelGroups)]
  pixelCohortData[, pixelGroup := Cache(generatePixelGroups, cd, maxPixelGroup = 0,
                                        columns = columnsForPixelGroups)]

  pixelCohortData[, totalBiomass := asInteger(sum(B)), by = "pixelIndex"]

  cohortData <- unique(pixelCohortData, by = c("pixelGroup", columnsForPixelGroups))
  cohortData[, `:=`(pixelIndex = NULL)]

  pixelFateDT <- pixelFate(pixelFateDT, "removing ecoregionGroups without enough data to est. maxBiomass",
                           tail(pixelFateDT$runningPixelTotal, 1) - NROW(unique(pixelCohortData$pixelIndex)))

  assertUniqueCohortData(cohortData, c("pixelGroup", "ecoregionGroup", "speciesCode"))
  return(list(cohortData = cohortData, pixelCohortData = pixelCohortData, pixelFateDT = pixelFateDT))
}

#' Create new cohorts based on provenance table with unique \code{pixelGroup} and add to \code{cohortData}
#'
#' @param newPixelCohortData the cohorts that were harvested
#' @template cohortData
#' @template pixelGroupMap
#' @template currentTime
#' @param successionTimestep succession timestep used in the simulation
#'
#' @return A \code{data.table} with a new \code{cohortData}
#'
#' @importFrom data.table copy rbindlist set setkey
#' @importFrom raster getValues
#' @importFrom stats na.omit
#' @export
plantNewCohorts <- function(newPixelCohortData, cohortData, pixelGroupMap,
                            currentTime, successionTimestep) {
   ## get spp "productivity traits" per ecoregion/present year

  namesNCD <- names(newPixelCohortData)
  if (!isTRUE("pixelGroup" %in% namesNCD)) {
    if (isTRUE("pixelIndex" %in% namesNCD)) {
      newPixelCohortData[, pixelGroup := getValues(pixelGroupMap)[pixelIndex]]
    } else {
      stop("newPixelCohortData must have either pixelIndex or pixelGroup")
    }
  }

  if (!is.null(newPixelCohortData[["pixelIndex"]]))
    set(newPixelCohortData, NULL, "pixelIndex", NULL)

  #remove duplicate pixel groups that were harvested
  duplicates <- duplicated(newPixelCohortData[, .(speciesCode, ecoregionGroup, pixelGroup, age)])
  newCohortData <- newPixelCohortData[!duplicates]

  #Plant trees
  newCohortData[, age := 2]
  #Give the planted trees 2 * maxANPP - newly regenerating cohorts receive 1x maxANPP. 2x seems overkill
  newCohortData[, B := asInteger(2 * maxANPP)]

  #Here we subset cohortData instead of setting added columns to NULL. However, as these are 'new' cohorts, this is okay
  newCohortData <- newCohortData[, .(pixelGroup, ecoregionGroup, speciesCode, age, B, Provenance,
                                     mortality = 0L, aNPPAct = 0L)]

  if (getOption("LandR.assertions")) {
    if (isTRUE(NROW(unique(newCohortData, by = c("pixelGroup", 'age', 'speciesCode', 'Provenance')))
               != NROW(newCohortData))) {

      stop("Duplicated new cohorts in a pixelGroup. Please debug LandR:::.plantNewCohorts")
    #in this situation, it may be caused by not replanting all species. Not sure if this will come up.
    }
  }

  cohortData <- rbindlist(list(cohortData, newCohortData), fill = TRUE, use.names = TRUE)

  return(cohortData)
}

#' Add cohorts to \code{cohortData} and \code{pixelGroupMap}
#'
#' This is a wrapper for  \code{generatePixelGroups}, \code{initiateNewCohort} and updates to
#' \code{pixelGroupMap} via assignment to new \code{pixelIndex} values in \code{newPixelCohortData}.
#' By running these all together, there is less chance that they will diverge.
#' There are some checks internally for consistency.
#'
#' Does the following:
#' \enumerate{
#'   \item add new cohort data into \code{cohortData};
#'   \item assign initial \code{B} and \code{age} for new cohort;
#'   \item assign the new \code{pixelGroup} to the pixels that have new cohort;
#'   \item update the \code{pixelGroup} map.
#' }
#'
#' @template newPixelCohortData
#' @template cohortData
#' @template pixelGroupMap
#' @template cohortDefinitionCols
#' @template currentTime
#' @template speciesEcoregion
#'
#' @param treedHarvestPixelTable A data.table with at least 2 columns, \code{pixelIndex} and \code{pixelGroup}.
#'   This will be used in conjunction with \code{cohortData} and \code{pixelGroupMap}
#'   to ensure that everything matches correctly.
#' @param successionTimestep The time between successive seed dispersal events.
#'   In LANDIS-II, this is called "Succession Timestep". This is used here
#' @param provenanceTable A \code{data.table} with three columns:
#' New cohorts are initiated at the \code{ecoregionGroup} \code{speciesEcoregion} from the
#' corresponding \code{speciesEcoregion} listed in the \code{Provenance} column
#'
#' @param verbose Integer, where increasing number is increasing verbosity. Currently,
#'    only level 1 exists; but this may change.
#'
#' @template doAssertion
#'
#' @return
#' A list of length 2, \code{cohortData} and \code{pixelGroupMap}, with
#' \code{newPixelCohortData} inserted.
#'
#' @export
#' @importFrom crayon green magenta
#' @importFrom data.table copy rbindlist set setkey
#' @importFrom raster getValues
#' @importFrom SpaDES.core paddedFloatToChar
#' @importFrom stats na.omit
#' @rdname updateCohortDataPostHarvest
updateCohortDataPostHarvest <- function(newPixelCohortData, cohortData, pixelGroupMap, currentTime,
                                        speciesEcoregion, treedHarvestPixelTable = NULL,
                                        successionTimestep, provenanceTable,
                                        cohortDefinitionCols =c("pixelGroup", 'age', 'speciesCode'),
                                        verbose = getOption("LandR.verbose", TRUE),
                                        doAssertion = getOption("LandR.assertions", TRUE)) {


  cohortData <- copy(cohortData)
  provenanceTable <- copy(provenanceTable)

  maxPixelGroup <- as.integer(maxValue(pixelGroupMap))

  if (!is.null(treedHarvestPixelTable)) {
    pixelGroupMap[treedHarvestPixelTable$pixelIndex] <- 0L
  }
  relevantPixels <- pixelGroupMap[][newPixelCohortData$pixelIndex]
  zeroOnPixelGroupMap <- relevantPixels == 0

  newPixelCohortData[B == 0, age := 1] #this is necessary for now.
  #Can't lose the biomass of non-zero (non harvested) trees

  setkey(newPixelCohortData, ecoregionGroup, speciesCode)
  setkey(provenanceTable, ecoregionGroup, speciesCode)

  newPixelCohortData <- provenanceTable[newPixelCohortData]

  #ecoregionGroup should remain the same, Provenance will be a newly added column

  #NA in provenance means this area would not be planted with this species

  #Remove duplicate species. These were created by cohorts of different age but same species
  duplicates <- duplicated(newPixelCohortData[, .(speciesCode, ecoregionGroup, pixelIndex)])
  newPixelCohortData <- newPixelCohortData[!duplicates]

  #this block must be run here, in case some pixelGroups change due to no existing maxB
  specieseco_current <- speciesEcoregionLatestYear(speciesEcoregion, currentTime)
  specieseco_current <- setkey(specieseco_current[, .(speciesCode, maxANPP, maxB, ecoregionGroup)],
                               speciesCode, ecoregionGroup)

  specieseco_current[, maxB_eco := max(maxB), by = ecoregionGroup]

  setkey(newPixelCohortData, speciesCode, ecoregionGroup)

  newPixelCohortData <- specieseco_current[newPixelCohortData]

  columnsForPG <- c("ecoregionGroup", "speciesCode", "age", "B", 'maxB', 'maxANPP', 'Provenance')

  cd <- newPixelCohortData[,c("pixelIndex", columnsForPG), with = FALSE]

  newPixelCohortData[, pixelGroup := generatePixelGroups(cd, maxPixelGroup = maxPixelGroup,
                                                         columns = columnsForPG)]

  # Remove the duplicated pixels within pixelGroup (i.e., 2+ species in the same pixel)
  pixelsToChange <- unique(newPixelCohortData[, c("pixelIndex", "pixelGroup")],
                           by = c("pixelIndex"))

  pixelGroupMap[pixelsToChange$pixelIndex] <- pixelsToChange$pixelGroup

  if (doAssertion) {
    if (!isTRUE(all(pixelsToChange$pixelGroup == pixelGroupMap[][pixelsToChange$pixelIndex])))
      stop("pixelGroupMap and newPixelCohortData$pixelGroupMap don't match in updateCohortData fn")
  }


  ##########################################################
  # Add new cohorts and rm missing cohorts (i.e., those pixelGroups that are gone)
  ##########################################################

  cohortData <- plantNewCohorts(newPixelCohortData, cohortData,
                                pixelGroupMap, currentTime = currentTime,
                                successionTimestep = successionTimestep)

  outs <- rmMissingCohorts(cohortData, pixelGroupMap, cohortDefinitionCols = cohortDefinitionCols)

  assertCohortData(outs$cohortData, outs$pixelGroupMap,
                   cohortDefinitionCols = cohortDefinitionCols,
                   doAssertion = doAssertion, verbose = verbose)

  if (doAssertion) {
    maxPixelGroupFromCohortData <- max(outs$cohortData$pixelGroup)
    maxPixelGroup <- as.integer(maxValue(outs$pixelGroupMap))
    test1 <- (!identical(maxPixelGroup, maxPixelGroupFromCohortData))
    if (test1) {
      stop("The sim$pixelGroupMap and cohortData have unmatching pixelGroup.",
           " They must be matching.",
           " If this occurs, please contact the module developers")
    }
  }

  if (verbose > 0) {
    nPixForest <- sum(!is.na(outs$pixelGroupMap[]))
    nPixGrps <- length(unique(outs$cohortData$pixelGroup))
    nPixNoPixGrp <- sum(outs$pixelGroupMap[] == 0, na.rm = TRUE)
    nPixTreed <- sum(outs$pixelGroupMap[] != 0, na.rm = TRUE)

    nDigits <- max(nchar(c(nPixForest, nPixGrps, nPixNoPixGrp))) + 3
    message(crayon::magenta("NUMBER OF FORESTED PIXELS          :",
                            paddedFloatToChar(nPixForest, padL = nDigits, pad = " ")))
    message(crayon::magenta("NUMBER OF PIXELS WITH TREES        :",
                            paddedFloatToChar(nPixTreed, padL = nDigits, pad = " ")))
    message(crayon::magenta("NUMBER OF UNIQUE PIXELGROUPS       :",
                            paddedFloatToChar(nPixGrps, padL = nDigits, pad = " ")))
    message(crayon::magenta("NUMBER OF PIXELS WITH NO PIXELGROUP:",
                            paddedFloatToChar(nPixNoPixGrp, padL = nDigits, pad = " ")))
  }

  return(list(cohortData = outs$cohortData,
              pixelGroupMap = outs$pixelGroupMap))
}

#' Create or amend data to a \code{pixelFateDT} object
#'
#' @param pixelFateDT A \code{pixelFateDT} \code{data.table} with 3 columns: \code{fate},
#'   \code{pixelsRemoted}, and \code{runningPixelTotal}.
#' @param fate A character string (length 1) describing in words the change
#' @param pixelsRemoved A numeric indicating how many pixels were removed due to the \code{fate}.
#' @param runningPixelTotal an optional numeric with new, running total. If not supplied,
#'   it will be calculated from the last row of \code{pixelFateDT} \code{runningTotal} minus the
#'   \code{pixelsRemoved}
#'
#' @return A \code{pixelFateDT} object, updated with one extra row.
#'
#' @export
pixelFate <- function(pixelFateDT, fate = NA_character_, pixelsRemoved = 0,
                      runningPixelTotal = NA_integer_) {
  if (missing(pixelFateDT))
    pixelFateDT <- data.table(fate = character(), pixelsRemoved = integer(), runningPixelTotal = integer())
  if (is.na(runningPixelTotal))
    runningPixelTotal <- tail(pixelFateDT$runningPixelTotal,1) - pixelsRemoved
  pixelFateDT <- rbindlist(list(pixelFateDT, data.table(fate = fate, pixelsRemoved = pixelsRemoved,
                                                        runningPixelTotal = runningPixelTotal)))
  pixelFateDT
}

#' Generate and add vegetation type column to \code{cohortData}
#'
#' This function is a simplification of \code{vegTypeMapGenerator}
#' and instead of generating a map, it adds the vegetation type column
#' to the \code{cohortData} table.
#'
#' @param x A \code{cohortData} object
#'
#' @param vegLeadingProportion Numeric between 0-1, determining the relative biomass
#'                             threshold a species needs to pass to be considered "leading".
#'
#' @param mixedType An integer defining whether mixed stands are of any kind of species
#'                  admixture (1), or only when deciduous mixed with conifer (2).
#'                  Defaults to 2.
#'
#' @template sppEquiv
#'
#' @template sppEquivCol
#'
#' @param pixelGroupColName Name of the column in \code{pixelGroup} to use.
#'
#' @template doAssertion
#'
#' @param ... Additional arguments.
#'
#' @return \code{x} with a new column, 'leading', coding the vegetation type
#'    of each group defined by \code{pixelGroupColName}
#'
#' @author Eliot McIntire, Ceres Barros, Alex Chubaty
#' @export
#' @importFrom assertthat assert_that
#' @importFrom data.table copy data.table setkey setorderv
#' @importFrom SpaDES.tools inRange
#' @importFrom utils data
#'
#' @rdname vegTypeGenerator
vegTypeGenerator <- function(x, vegLeadingProportion = 0.8,
                             mixedType = 2, sppEquiv = NULL, sppEquivCol,
                             pixelGroupColName = "pixelGroup",
                             doAssertion = getOption("LandR.assertions", TRUE), ...) {
  if (!inRange(vegLeadingProportion, 0, 1))
    stop("vegLeadingProportion must be a proportion")

  nrowCohortData <- NROW(x)

  leadingBasedOn <- if ("B" %in% colnames(x)) {
    message("Using B to derive leading type")
    "B"
  } else if ("cover" %in% colnames(x)) {
    message("Using cover to derive leading type, as there is no B column")
    "cover"
  } else {
    stop("x must have either B or cover to determine leading species/type")
  }
  assert_that(nrowCohortData > 0)

  if (isTRUE(doAssertion))
    message("LandR::vegTypeMapGenerator: NROW(x) == ", nrowCohortData)

  if (mixedType == 2) {
    if (is.null(sppEquiv)) {
      sppEquiv <- get(data("sppEquivalencies_CA", package = "LandR", envir = environment()),
                      inherits = FALSE)

      # Find the sppEquivCol that best matches what you have in x
      sppEquivCol <- names(sort(sapply(sppEquiv, function(xx) sum(xx %in% unique(x$species))),
                                decreasing = TRUE)[1])
      message(paste0("Using mixedType == 2, but no sppEquiv provided. ",
                     "Attempting to use data('sppEquivalencies_CA', 'LandR') ",
                     "and sppEquivCol == '", sppEquivCol, "'"))
    }
  }

  ## use new vs old algorithm based on size of x. new one (2) is faster in most cases.
  ## enable assertions to view timings for each algorithm before deciding which to use.
  algo <- ifelse(nrowCohortData > 3.5e6, 1, 2)

  pgdAndSc <- c(pixelGroupColName, "speciesCode")
  pgdAndScAndLeading <- c(pgdAndSc, leadingBasedOn)
  totalOfLeadingBasedOn <- paste0("total", leadingBasedOn)
  speciesOfLeadingBasedOn <- paste0("speciesGroup", leadingBasedOn)
  if (algo == 1 || isTRUE(doAssertion)) {
    # slower -- older, but simpler Eliot June 5, 2019
    # 1. Find length of each pixelGroup -- don't include pixelGroups in "by" that have only 1 cohort: N = 1
    cohortData1 <- copy(x)
    systimePre1 <- Sys.time()
    pixelGroupData1 <- cohortData1[, list(N = .N), by = pixelGroupColName]

    # Calculate speciesProportion from cover or B
    pixelGroupData1 <- cohortData1[, ..pgdAndScAndLeading][pixelGroupData1, on = pixelGroupColName]
    set(pixelGroupData1, NULL, totalOfLeadingBasedOn, pixelGroupData1[[leadingBasedOn]])

    if (identical(leadingBasedOn, "cover")) {
      pixelGroupData1[N != 1, (totalOfLeadingBasedOn) := sum(cover, na.rm = TRUE), by = pixelGroupColName]
      pixelGroupData1 <- pixelGroupData1[, list(sum(cover, na.rm = TRUE), totalcover[1]), by = pgdAndSc]
    } else {
      pixelGroupData1[N != 1, (totalOfLeadingBasedOn) := sum(B, na.rm = TRUE), by = pixelGroupColName]
      pixelGroupData1 <- pixelGroupData1[, list(sum(B, na.rm = TRUE), totalB[1]), by = pgdAndSc]
    }
    setnames(pixelGroupData1, old = c("V1", "V2"), new = c(speciesOfLeadingBasedOn, totalOfLeadingBasedOn))

    set(pixelGroupData1, NULL, "speciesProportion", pixelGroupData1[[speciesOfLeadingBasedOn]] /
          pixelGroupData1[[totalOfLeadingBasedOn]])
    systimePost1 <- Sys.time()

    setorderv(pixelGroupData1, pixelGroupColName)
  }

  if (algo == 2 || isTRUE(doAssertion)) {
    # Replacement algorithm to calculate speciesProportion
    #  Logic is similar to above --
    #  1. sort by pixelGroup
    #  2. calculate N, use this to repeat itself (instead of a join above)
    #  3. calculate speciesProportion, noting to calculate with by only if N > 1, otherwise
    #     it is a simpler non-by calculation
    cohortData2 <- copy(x)
    systimePre2 <- Sys.time()
    setkeyv(cohortData2, pgdAndSc)
    # setorderv(x, pixelGroupColName)
    pixelGroupData2 <- cohortData2[, list(N = .N), by = pixelGroupColName]
    cohortData2 <- cohortData2[, ..pgdAndScAndLeading]

    N <- rep.int(pixelGroupData2$N, pixelGroupData2$N)
    wh1 <- N == 1
    set(cohortData2, which(wh1), totalOfLeadingBasedOn, cohortData2[[leadingBasedOn]][wh1])
    if (identical(leadingBasedOn, "cover")) {
      totalBNot1 <- cohortData2[!wh1, list(N = .N, totalcover = sum(cover, na.rm = TRUE)), by = pixelGroupColName]
    } else {
      totalBNot1 <- cohortData2[!wh1, list(N = .N, totalB = sum(B, na.rm = TRUE)), by = pixelGroupColName]
    }
    totalBNot1 <- rep.int(totalBNot1[[totalOfLeadingBasedOn]], totalBNot1[["N"]])
    set(cohortData2, which(!wh1), totalOfLeadingBasedOn, totalBNot1)

    b <- cohortData2[, list(N = .N), by = pgdAndSc]
    b <- rep.int(b[["N"]], b[["N"]])
    GT1 <- (b > 1)
    if (any(GT1)) {
      pixelGroupData2List <- list()
      cohortData2[GT1, speciesProportion := sum(B, na.rm = TRUE) / totalB[1], by = pgdAndSc]
      cohortData2[!GT1, speciesProportion := B / totalB]
      #pixelGroupData2List[[2]] <- cohortData2[!GT1]
      #pixelGroupData2 <- rbindlist(pixelGroupData2List)
    } else {
      # cols <- c(pixelGroupColName, "speciesCode", "speciesProportion")
      set(cohortData2, NULL, "speciesProportion", cohortData2[[leadingBasedOn]] /
            cohortData2[[totalOfLeadingBasedOn]])
      # pixelGroupData2[[NROW(pixelGroupData2) + 1]] <- cohortData2[!GT1, ..cols]
    }
    pixelGroupData2 <- cohortData2
    systimePost2 <- Sys.time()
  }

  if (isTRUE(doAssertion)) {
    ## slower -- older, but simpler Eliot June 5, 2019
    ## TODO: these algorithm tests should be deleted after a while. See date on prev line.
    if (!exists("oldAlgoVTM", envir = .pkgEnv)) .pkgEnv$oldAlgoVTM <- 0
    if (!exists("newAlgoVTM", envir = .pkgEnv)) .pkgEnv$newAlgoVTM <- 0
    .pkgEnv$oldAlgoVTM <- .pkgEnv$oldAlgoVTM + (systimePost1 - systimePre1)
    .pkgEnv$newAlgoVTM <- .pkgEnv$newAlgoVTM + (systimePost2 - systimePre2)
    message("LandR::vegTypeMapGenerator: new algo ", .pkgEnv$newAlgoVTM)
    message("LandR::vegTypeMapGenerator: old algo ", .pkgEnv$oldAlgoVTM)
    setorderv(pixelGroupData2, pgdAndSc)
    whNA <- unique(unlist(sapply(pixelGroupData2, function(xx) which(is.na(xx)))))
    pixelGroupData1 <- pixelGroupData1[!pixelGroupData2[whNA], on = pgdAndSc]
    setkeyv(pixelGroupData1, pgdAndSc)
    setkeyv(pixelGroupData2, pgdAndSc)
    aa <- pixelGroupData1[pixelGroupData2, on = pgdAndSc]
    if (!isTRUE(all.equal(aa[["speciesProportion"]], aa[["i.speciesProportion"]])))
      stop("Old algorithm in vegMapGenerator is different than new map")
  }

  if (algo == 1) {
    pixelGroupData <- pixelGroupData1
    rm(pixelGroupData1)
  } else if (algo == 2) {
    pixelGroupData <- pixelGroupData2
    rm(pixelGroupData2)
  }

  ########################################################
  #### Determine "mixed"
  ########################################################
  if (mixedType == 1) {
    ## create "mixed" class #    -- Eliot May 28, 2019 -- faster than previous below
    ## 1. anything with >= vegLeadingProportion is "pure"
    ## 2. sort on pixelGroup and speciesProportion, reverse so that 1st row of each pixelGroup is the largest
    ## 3. Keep only first row in each pixelGroup
    ## 4. change column names and convert pure to mixed ==> mixed <- !pure
    pixelGroupData3 <- pixelGroupData[, list(pure = speciesProportion >= vegLeadingProportion,
                                             speciesCode, pixelGroup, speciesProportion)]
    setorderv(pixelGroupData3, cols = c(pixelGroupColName, "speciesProportion"), order = -1L)
    set(pixelGroupData3, NULL, "speciesProportion", NULL)
    pixelGroupData3 <- pixelGroupData3[, .SD[1], by = pixelGroupColName]
    pixelGroupData3[pure == FALSE, speciesCode := "Mixed"]
    setnames(pixelGroupData3, "speciesCode", "leading")
    pixelGroupData3[, pure := !pure]
    setnames(pixelGroupData3, "pure", "mixed")

    ## Old algorithm for above, this is ~43 times slower
    # a2 <- Sys.time()
    # pixelGroupData2 <- pixelGroupData[, list(mixed = all(speciesProportion < vegLeadingProportion),
    #                                          leading = speciesCode[which.max(speciesProportion)]),
    #                                   by = pixelGroupColName]
    # pixelGroupData2[mixed == TRUE, leading := "Mixed"]
    # b2 <- Sys.time()
  } else if (mixedType == 2) {
    if (!sppEquivCol %in% colnames(sppEquiv))
      stop(sppEquivCol, " is not in sppEquiv. Please pass an existing sppEquivCol")

    sppEq <- data.table(sppEquiv[[sppEquivCol]], sppEquiv[["Type"]])

    names(sppEq) <- c("speciesCode", "Type")
    setkey(pixelGroupData, speciesCode)

    # don't need all columns now
    colsToDelete <- c("rasterToMatch", leadingBasedOn, totalOfLeadingBasedOn)
    colsToDelete <- colsToDelete[colsToDelete %in% colnames(pixelGroupData)]
    set(pixelGroupData, NULL, colsToDelete, NULL)
    pixelGroupData3 <- merge(pixelGroupData, sppEq[!duplicated(sppEq)], all.x = TRUE)
    setkeyv(pixelGroupData, pgdAndSc)

    setkeyv(pixelGroupData3, pgdAndSc)
    mixedType2Condition <- quote(Type == "Deciduous" &
                                   speciesProportion < vegLeadingProportion &
                                   speciesProportion > 1 - vegLeadingProportion)
    pixelGroupData3[, mixed := FALSE]

    if (algo == 2 || isTRUE(doAssertion)) {
      b <- pixelGroupData3[, list(N = .N), by = pixelGroupColName]
      b <- rep.int(b[["N"]], b[["N"]])
      GT1 <- b > 1


      pgd3GT1 <- pixelGroupData3[GT1]
      pgd3NGT1 <- pixelGroupData3[!GT1]

      pgd3GT1[eval(mixedType2Condition), mixed := TRUE, by = pixelGroupColName]
      pgd3GT1[, mixed := any(mixed), by = pixelGroupColName]
      pixelGroupData3 <- rbindlist(list(pgd3NGT1, pgd3GT1))
    } else {
      pixelGroupData3[eval(mixedType2Condition), mixed := TRUE, by = pixelGroupColName]
      pixelGroupData3[, mixed := any(mixed), by = pixelGroupColName]
    }

    setorderv(pixelGroupData3, cols = c(pixelGroupColName, "speciesProportion"), order = -1L)
    set(pixelGroupData3, NULL, "speciesProportion", NULL)
    set(pixelGroupData3, NULL, "Type", NULL)
    pixelGroupData3 <- pixelGroupData3[, .SD[1], by = pixelGroupColName] ## sp. w/ highest prop. per pixelGroup
    pixelGroupData3[mixed == TRUE, speciesCode := "Mixed"]
    setnames(pixelGroupData3, "speciesCode", "leading")
    set(pixelGroupData3, NULL, "leading", factor(pixelGroupData3[["leading"]]))
  } else {
    stop("invalid mixedType! Must be one of '1' or '2'.")
  }

  cols <- c(pixelGroupColName, "leading")
  xx <- pixelGroupData3[, ..cols][x, on = pixelGroupColName]

  return(xx)
}
PredictiveEcology/LandR documentation built on Jan. 24, 2021, 12:52 a.m.