R/extract_soils_NRCS_SDA.R

Defines functions extract_soils_NRCS_SDA fetch_soils_from_NRCS_SDA fetch_mukeys_spatially_NRCS_SDA calculate_soil_depth_NRCS is_NRCS_horizon_organic sqlQuote create_reference_for_NRCS_SDA

Documented in calculate_soil_depth_NRCS extract_soils_NRCS_SDA fetch_mukeys_spatially_NRCS_SDA fetch_soils_from_NRCS_SDA is_NRCS_horizon_organic

create_reference_for_NRCS_SDA <- function() {
  paste0(
    "Soil Survey Staff, Natural Resources Conservation Service, ",
    "United States Department of Agriculture. Web Soil Survey. ",
    "Available online at https://websoilsurvey.nrcs.usda.gov/. ",
    "Accessed [",
    format(as.POSIXlt(Sys.Date()), "%Y-%b-%e"),
    "]"
  )
}


# Single quotes for T-SQL
# `shQuote()` returns double quotes on windows OS
sqlQuote <- function(x) {
  paste0("'", x, "'", recycle0 = TRUE)
}


#' Check whether a \var{NRCS} soil horizon is organic
#'
#' Based on function \code{CheckTexture()}, version 2020-08-31, of the
#' \var{SoilDataDevelopmentToolbox}.
#'
#' @param x A two-dimensional character object. Required columns are
#'   \var{taxorder}, \var{taxsubgrp}, \var{desgnmaster}, \var{texture},
#'   and \var{lieutex}. Each horizon is represented by exactly one row.
#'
#' @return A logical vector representing each horizon. \code{TRUE} indicates
#'   that a horizon is considered organic; default is \code{FALSE}
#'   representing mineral soils. \code{NA}s in the input propagate.
#'
#' @references Code based on \var{CheckTexture()} version \var{2020-Aug-31} from
# nolint start: line_length_linter
#'   \url{https://github.com/ncss-tech/SoilDataDevelopmentToolbox/blob/master/SDA_Valu2Table.py}
# nolint end
#'
#' @examples
#' x <- data.frame(
#'   taxorder = c("Histosols", "x", "x", "x", "x", "x", NA),
#'   taxsubgrp = c("x", "histic", "x", "x", "x", "x", NA),
#'   desgnmaster = c("L", "L", "O", "x", "x", "x", NA),
#'   texture = c("x", "x", "x", "CE", "x", "x", NA),
#'   lieutex = c("x", "x", "x", "x", "Muck", "x", NA)
#' )
#'
#' cbind(
#'   organic = is_NRCS_horizon_organic(x),
#'   expected_organic = c(FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, NA)
#' )
#'
#' \dontrun{
#' if (curl::has_internet()) {
#'   x <- fetch_soils_from_NRCS_SDA(bind_params = c(471168, 1606800))
#'   is_NRCS_horizon_organic(x)
#' }
#' }
#'
#' @export
is_NRCS_horizon_organic <- function(x) {

  vars <- c("taxorder", "taxsubgrp", "desgnmaster", "texture", "lieutex")
  cns <- colnames(x)

  for (var in vars) {
    if (!(var %in% cns)) {
      stop(shQuote(var), " is a required column name, but cannot be found.")
    }
  }

  lieuList <- c(
    "Slightly decomposed plant material",
    "Moderately decomposed plant material",
    "Highly decomposed plant material",
    "Undecomposed plant material",
    "Muck",
    "Mucky peat",
    "Peat",
    "Coprogenous earth"
  )

  txList <- c(
    "CE", "COP-MAT", "HPM", "MPM", "MPT", "MUCK", "PDOM", "PEAT",
    "SPM", "UDOM"
  )

  # propagate NAs conservatively
  is_mineral <-
    x[, "taxorder"] == "Histosols" |
    grepl("histic", x[, "taxsubgrp"], ignore.case = TRUE)

  is_organic <- cbind(
    desgnmaster = x[, "desgnmaster"] %in% c("O", "L"),
    texture = x[, "texture"] %in% txList,
    lieutex = x[, "lieutex"] %in% lieuList
  )
  is_organic[is.na(x[, c("desgnmaster", "texture", "lieutex")])] <- NA

  !is_mineral & apply(is_organic, 1, any)
}


#' Determine soil depth for \var{NRCS} soil data
#'
#' @param x A \code{data.frame} or \code{matrix}.
#'   Soil horizons/layers are organized in rows
#'   and soil texture variables in columns.
#' @param target_site_ids A vector. The unique location identifiers, e.g.,
#'   \var{cokey} values, for which soil depth is to be determined.
#' @param restrict_by_ec_or_ph A logical value. Include depth restrictions
#'   by \code{ph <= 3.5} or \code{ec >= 16}. This option requires additional
#'   variables.
#' @param var_site_id A character string. The column name of \code{x} which
#'   contains the unique location identifiers.
#' @param var_horizon A character string. The column name of \code{x} which
#'   contains the horizon/layer numbers where the shallowest horizon/layer
#'   is number one.
#' @param var_horizon_lower_depth A character string. The column name of
#'   \code{x} that contains the lower depth limit of horizons/layers.
#' @param var_restrictions A vector of character strings. The column names of
#'   \code{x} that contain depth restrictions, e.g., bedrock.
#' @param var_soiltexture A vector of character strings. The column names of
#'   \code{x} that contain the three soil texture variables sand, clay,
#'   and silt.
#'
#'
#' @return A \code{data.frame} with at least three columns. Each row represents
#'   one value of \code{target_cokeys}. The columns are: \itemize{
#'     \item \var{N_horizons}: The number of soil horizons/layers.
#'     \item \var{SoilDepth_cm}: The soil depth in centimeters.
#'     \item \var{depth_Lx}: The lower depth of soil horizon/layer \var{x}.
#'       Note: \var{depth_L1} may vary from \code{x[, "hzdepb_r"]}.
#'   }
#'
#' @references Code based on \var{CalcRZDepth()} version 2020-08-31:
# nolint start: line_length_linter
#'   \url{https://github.com/ncss-tech/SoilDataDevelopmentToolbox/blob/master/SDA_Valu2Table.py}
# nolint end
#'   Note: currently ignores "dense" layer restrictions
#'
#' @examples
#' \dontrun{
#' if (curl::has_internet()) {
#'   var_stxt3 <- c("sandtotal_r", "claytotal_r", "silttotal_r")
#'
#'   x <- fetch_soils_from_NRCS_SDA(bind_params = c(471168, 1606800))
#'
#'   calculate_soil_depth_NRCS(
#'     x,
#'     restrict_by_ec_or_ph = FALSE,
#'     var_soiltexture = var_stxt3
#'   )
#'
#'   x2 <- cbind(x, organic = is_NRCS_horizon_organic(x))
#'   calculate_soil_depth_NRCS(
#'     x2,
#'     restrict_by_ec_or_ph = TRUE,
#'     var_soiltexture = var_stxt3
#'   )
#' }
#' }
#'
#' @export
calculate_soil_depth_NRCS <- function(
  x,
  target_site_ids,
  restrict_by_ec_or_ph = TRUE,
  var_site_id = "COKEY",
  var_horizon = "Horizon_No",
  var_horizon_lower_depth = "hzdepb_r",
  var_restrictions =
    c("Horizon_depth", "RootZoneRestriction_depth", "Bedrock_depth"),
  var_soiltexture = c("sand", "clay", "silt")
) {

  vars <- c(
    var_site_id,
    var_horizon, var_horizon_lower_depth,
    var_restrictions,
    var_soiltexture
  )

  if (restrict_by_ec_or_ph) {
    vars <- c(
      vars,
      "hzdept_r", "taxorder", "taxsubgrp", "organic", "ec_r", "ph1to1h2o_r"
    )
  }


  cns <- colnames(x)

  for (var in vars) {
    if (!(var %in% cns)) {
      stop(shQuote(var), " is a required column name, but cannot be found.")
    }
  }

  if (missing(target_site_ids)) {
    target_site_ids <- x[, var_site_id]
  }

  # Check that sites and horizons are unique combinations
  stopifnot(anyDuplicated(x[, c(var_site_id, var_horizon)]) == 0)


  # Determine number of soil horizons with complete/partial soil texture values
  tmp <- aggregate(
    x = apply(
      x[, var_soiltexture, drop = FALSE],
      MARGIN = 1,
      FUN = function(x) sum(!is.na(x))
    ),
    by = list(id = x[, var_site_id]),
    FUN = function(x) c(complete = sum(x == 3), partial = sum(x > 0))
  )

  ids <- match(target_site_ids, tmp[["id"]], nomatch = NA)
  target_complete_soiltexture <- tmp[["x"]][ids, "complete"]
  target_partial_soiltexture <- tmp[["x"]][ids, "partial"]


  # Calculate additional restrictions
  if (restrict_by_ec_or_ph) {
    # nolint start: scalar_in_linter.
    x[, "is_organic"] <- x[, "organic"] %in% TRUE
    x[, "is_histosol_histic"] <-
      x[, "taxorder"] %in% "Histosols" |
      grepl("histic", x[, "taxsubgrp"], ignore.case = TRUE)
    # nolint end: scalar_in_linter.

    # Restrictions pH < 3.5 or EC > 16 apply only
    # if horizon is non-organic and not a histosol/histic soil
    x[, "check"] <- !x[, "is_organic"] & !x[, "is_histosol_histic"]

    tmp <- by(
      data = x,
      INDICES = x[, var_site_id],
      FUN = function(xc) {
        ids_ec_restricted <- which(
          xc[, "check"] & !is.na(xc[, "ec_r"]) & xc[, "ec_r"] >= 16
        )
        ids_ph_restricted <- which(
          xc[, "check"] & !is.na(xc[, "ph1to1h2o_r"]) &
            xc[, "ph1to1h2o_r"] <= 3.5
        )

        c(
          ec_restriction_depth = if (length(ids_ec_restricted) > 0) {
            xc[ids_ec_restricted[[1L]], "hzdept_r"]
          } else {
            NA
          },
          ph_restriction_depth = if (length(ids_ph_restricted) > 0) {
            xc[ids_ph_restricted[[1L]], "hzdept_r"]
          } else {
            NA
          }
        )
      },
      simplify = FALSE
    )

    restriction2_depth <- matrix(
      data = unlist(tmp[match(x[, var_site_id], names(tmp))]),
      ncol = 2,
      byrow = TRUE
    )
  }


  # Convert to wide format (one row for each point location)
  tmp_depths <- reshape2::acast(
    data = as.data.frame(
      x[, c(var_site_id, var_horizon, var_horizon_lower_depth)]
    ),
    formula = formula(paste(var_site_id, "~", var_horizon)),
    value.var = var_horizon_lower_depth
  )

  ids <- match(target_site_ids, rownames(tmp_depths), nomatch = NA)
  locs_table_depths <- tmp_depths[ids, , drop = FALSE]
  colnames(locs_table_depths) <- paste0("depth_L", colnames(locs_table_depths))
  rownames(locs_table_depths) <- target_site_ids


  # Determine soil depth as depth of shallowest restriction
  tmp <- x[, var_restrictions, drop = FALSE]

  if (restrict_by_ec_or_ph) {
    tmp <- cbind(tmp, restriction2_depth)
  }

  soil_depth_cm <- apply(
    X = tmp,
    MARGIN = 1,
    FUN = min,
    na.rm = TRUE
  )

  ids <- match(target_site_ids, x[, var_site_id], nomatch = NA)
  soil_depth_cm <- unname(round(soil_depth_cm)[ids])


  # Case:
  #   * soil depth is missing and
  #   * no complete soil texture available
  # ==> adjust soil depth to 0
  ids <-
    !is.finite(soil_depth_cm) &
    target_complete_soiltexture == 0
  soil_depth_cm[ids] <- 0

  # Adjust soil depth and/or layers where needed
  locs_table_depths <- cbind(
    SoilDepth_cm = soil_depth_cm,
    depth_L = locs_table_depths
  )


  # Case:
  #   * soil_depth disagrees with horizon_depth
  #   * soil_depth > 0 and N_horizons_tmp > 0
  # ==> adjust depth_Lx to soil_depth
  L_at_soildepth <- apply(
    X = locs_table_depths,
    MARGIN = 1,
    FUN = function(x) {
      findInterval(x[[1L]], c(0, na.exclude(x[-1])), left.open = TRUE)
    }
  )
  ids <- which(!apply(
    X = locs_table_depths,
    MARGIN = 1,
    FUN = function(x) x[[1L]] == 0 || all(is.na(x[-1])) || x[[1L]] %in% x[-1]
  ))
  locs_table_depths[cbind(ids, 1 + L_at_soildepth[ids])] <-
    locs_table_depths[ids, "SoilDepth_cm"]

  # Case:
  #   * soil_depth > 0 and horizon_depth L1 in (0, NA) and
  #   * target_complete_soiltexture > 0
  # ==> adjust depth_L1 to soil_depth
  ids <- apply(
    X = cbind(
      target_complete_soiltexture,
      locs_table_depths
    ),
    MARGIN = 1,
    FUN = function(x) {
      !anyNA(x[1:2]) & x[[1L]] > 0 & x[[2L]] > 0 & x[[3L]] %in% c(0, NA)
    }
  )
  locs_table_depths[ids, "depth_L1"] <- locs_table_depths[ids, "SoilDepth_cm"]

  # Case:
  #   * soil_depth > 0 and horizon_depth L1 in (0, NA) and
  #   * target_complete_soiltexture == 0
  # ==> adjust soil_depth to 0
  ids <- apply(
    X = cbind(
      target_complete_soiltexture,
      locs_table_depths
    ),
    MARGIN = 1,
    FUN = function(x) {
      !anyNA(x[1:2]) & x[[1L]] == 0 & x[[2L]] > 0 & x[[3L]] %in% c(0, NA)
    }
  )
  locs_table_depths[ids, "SoilDepth_cm"] <- 0

  # Case:
  #  * soil_depth > 0
  #  * target_partial_soiltexture == 0
  # ==> adjust soil_depth to 0
  ids <- apply(
    X = cbind(
      target_partial_soiltexture,
      locs_table_depths
    ),
    MARGIN = 1,
    FUN = function(x) !is.na(x[[1L]]) & x[[1L]] == 0 & x[[2L]] > 0
  )
  locs_table_depths[ids, "SoilDepth_cm"] <- 0

  # Case:
  #   * soil_depth == 0 and horizon_depth L1 > 0 and
  #   * target_complete_soiltexture > 0
  # ==> adjust soil_depth to depth_L1
  ids <- apply(
    X = cbind(
      target_complete_soiltexture,
      locs_table_depths
    ),
    MARGIN = 1,
    FUN = function(x) {
      !is.na(x[[1L]]) &
        x[[1L]] > 0 &
        x[[2L]] == 0 &
        !is.na(x[[3L]]) &
        x[[3L]] > 0
    }
  )
  locs_table_depths[ids, "SoilDepth_cm"] <- locs_table_depths[ids, "depth_L1"]


  # Clean layer depths > soil depth
  ids <- locs_table_depths[, "SoilDepth_cm"] < locs_table_depths[, -1]
  locs_table_depths[, -1][ids] <- NA


  # Put together for output
  cbind(
    N_horizons = apply(
      X = locs_table_depths[, -1, drop = FALSE],
      MARGIN = 1,
      function(x) as.integer(sum(!is.na(x)))
    ),
    depth_L = locs_table_depths
  )
}



#' Spatially query \var{mukey} values for point locations from \var{NRCS}
#' web-based \var{SDA} service
#'
#' @inheritParams rSW2st::as_points
#' @param db A character string. Query \var{mukey} from the \var{SSURGO} or
#'   from the \var{STATSGO} soil database.
#' @param ... Currently ignored.
#' @param chunk_size An integer value. The size of chunks into which
#'   \code{locations} is broken up and looped over for processing.
#' @param progress_bar A logical value. Display a progress bar as the code
#'   loops over the chunks?
#'
#' @return A named list with two elements: \itemize{
#'   \item{\var{ref}} The data reference.
#'   \item{\var{mukeys}} A vector with a \var{mukey} value for each
#'     \code{locations}.
#' }
#'
#' @examples
#' \dontrun{
#' if (curl::has_internet()) {
#'   locations <- matrix(
#'     data = c(-120.325, -111.245, 39.855, 36.753),
#'     nrow = 2
#'   )
#'
#'   fetch_mukeys_spatially_NRCS_SDA(locations)
#' }
#' }
#'
#' @export
fetch_mukeys_spatially_NRCS_SDA <- function(
  x,
  crs = 4326,
  db = c("SSURGO", "STATSGO"),
  ...,
  chunk_size = 50L,
  progress_bar = FALSE
) {

  stopifnot(
    requireNamespace("soilDB"),
    curl::has_internet()
  )


  #------ Make sure inputs are correctly formatted
  db <- match.arg(db)

  locations <- rSW2st::as_points(x, to_class = "sf", crs = crs)
  nxlocs <- nrow(locations)


  #--- Extract mukeys for each point location
  res <- list()

  ids_chunks <- rSW2utils::make_chunks(
    nx = nxlocs,
    chunk_size = chunk_size
  )

  N_chunks <- length(ids_chunks)
  progress_bar <- progress_bar && N_chunks > 2

  if (progress_bar) {
    message("Fetch 'mukey' values from ", shQuote(db))

    has_progress_bar <- requireNamespace("utils")

    if (has_progress_bar) {
      pb <- utils::txtProgressBar(max = N_chunks, style = 3)
    } else {
      warning("Progress bar requested but package 'utils' is not available.")
    }

  } else {
    has_progress_bar <- FALSE
  }

  for (k in seq_len(N_chunks)) {
    res_mukeys <- try(
      soilDB::SDA_spatialQuery(
        geom = locations[ids_chunks[[k]], ], # sf since soilDB v2.6.10
        db = db,
        what = "mupolygon" # since soilDB v2.6.3
      ),
      silent = FALSE
    )

    if (inherits(res_mukeys, "try-error")) {
      warning("Spatial SDA query produced error: chunk = ", k)
      res[[k]] <- rep(NA, length(ids_chunks[[k]]))

    } else if (inherits(res_mukeys, c("SpatialPolygons", "sf"))) {
      # Extract mukey for each location because
      # return values of `SDA_spatialQuery` are not ordered by input `geom`
      # (unless `byFeature = TRUE` since v2.6.10)
      tmp <- sf::st_intersects(locations[ids_chunks[[k]], ], res_mukeys)
      ltmp <- lengths(tmp)

      if (any(ltmp == 0L, ltmp > 1L)) {
        stop("Spatial SDA query return no or more than one result.")
      }

      res[[k]] <- as.vector(
        res_mukeys[unlist(unclass(tmp)), "mukey", drop = TRUE]
      )

    } else {
      warning("Spatial SDA query returned non-spatial object: chunk = ", k)
      res[[k]] <- rep(NA, length(ids_chunks[[k]]))
    }

    if (has_progress_bar) {
      utils::setTxtProgressBar(pb, k)
    }
  }

  if (has_progress_bar) {
    close(pb)
  }

  list(
    ref = create_reference_for_NRCS_SDA(),
    mukeys = unlist(res)
  )
}



#' Download soil data from `NRCS` `SDA` web service
#'
#' @param mukeys_unique An integer vector with unique `mukey` values.
#'   Deprecated; use `bind_params` instead.
#' @param bind_params A vector or 2-dimensional container with parameter values
#'   to bind to the `T-SQL` query via `injection_format`.
#' @param sql_template A character vector.
#'   A valid `T-SQL` query with a `"WHERE"` clause so that the code can
#'   inject chunks of `bind_params` values via format `injection_format`.
#'   If `NA`, then the default query is loaded, see examples.
#' @param injection_format A character vector that identifies the location
#'   (format specifier) in `sql_template` for parameter value injection/binding.
#' @param majcompflag A character string. `"subset"` keeps
#'   the `"WHERE"` clause `component.majcompflag = 'Yes'` that is contained in
#'   `sql_template`; `"ignore"` removes it from the query. Note that
#'   the field `"majcompflag"` exists only in the `SSURGO` version
#'   of the `"component"` table, but not in the `STATSGO` version.
#' @param only_soilcomp A logical value. If `TRUE`, then query restricts
#'   to soil components. If `FALSE`, then query includes
#'   all components including `"Miscellaneous areas"` and `"NOTCOM"`
#'   (not complete) components.
#' @param chunk_size An integer value. The size of chunks into which
#'   `bind_params` is broken up and looped over for processing.
#' @param progress_bar A logical value. Display a progress bar as the code
#'   loops over the chunks?
#'
#' @return A `data.frame` according to the specifications of `sql` or
#'   `NULL` if the query returns empty.
#'
#' @section Notes: A live internet connection is required to access `SDA`.
#'
#' @section Notes: This is a function with minimal functionality;
#' use [extract_soils_NRCS_SDA()] for a user-friendly interface.
#'
#' @seealso [soilDB::SDA_query()]
#'
#' @examples
#' \dontrun{
#' if (curl::has_internet()) {
#'   # Query soils of dominant component of soil map unit
#'   fetch_soils_from_NRCS_SDA(bind_params = 67616)
#'
#'   # As of 2022-March-15, mukey 2479921 contained one "NOTCOM" component
#'   fetch_soils_from_NRCS_SDA(bind_params = 2479921)
#'   fetch_soils_from_NRCS_SDA(bind_params = 2479921, only_soilcomp = FALSE)
#'
#'   sql <- readLines(
#'     system.file("NRCS", "nrcs_sql_template.sql", package = "rSW2exter")
#'   )
#'
#'   fetch_soils_from_NRCS_SDA(bind_params = 67616, sql_template = sql)
#'
#'   # This will return NULL because -1 is not an existing mukey value
#'   fetch_soils_from_NRCS_SDA(bind_params = -1, sql_template = sql)
#'
#'   # Query soils of a specific component of a soil map unit
#'   sql2 <- readLines(
#'     system.file("NRCS", "nrcs_sql_template2.sql", package = "rSW2exter")
#'   )
#'
#'   fetch_soils_from_NRCS_SDA(
#'     bind_params = data.frame(mukey = 398856, compname = "Waupaca"),
#'     sql_template = sql2,
#'     injection_format = "(VALUES %s) AS t (mm, cn)"
#'   )
#' }
#' }
#'
#' @md
#' @export
fetch_soils_from_NRCS_SDA <- function(
  mukeys_unique,
  bind_params = mukeys_unique,
  sql_template = NA,
  injection_format = "mukey IN (%s)",
  majcompflag = c("subset", "ignore"),
  only_soilcomp = TRUE,
  chunk_size = 1000L,
  progress_bar = FALSE
) {

  stopifnot(requireNamespace("soilDB"), curl::has_internet())

  majcompflag <- match.arg(majcompflag)

  if (!missing(mukeys_unique)) {
    .Deprecated(
      msg = "Argument 'mukeys_unique' is deprecated; please use 'bind_params'."
    )
  }

  stopifnot(missing(mukeys_unique) || identical(bind_params, mukeys_unique))
  if (length(dim(bind_params)) != 2L) {
    bind_params <- data.frame(bind_params)
  }
  stopifnot(!anyDuplicated(bind_params))

  hasMultiParams <- ncol(bind_params) > 1L

  ids_chunks <- rSW2utils::make_chunks(
    nx = nrow(bind_params),
    chunk_size = chunk_size
  )

  N_chunks <- length(ids_chunks)

  if (isTRUE(is.na(sql_template))) {
    sql_template <- readLines(
      system.file(
        "NRCS", "nrcs_sql_template.sql",
        package = "rSW2exter",
        mustWork = TRUE
      )
    )
  }


  #--- Extract soil horizon data for mukeys (chunked)
  res <- list()

  # trim off comments at top of file
  tmp_ids <- seq_len(grep("SELECT", sql_template, fixed = TRUE)[[1L]] - 1L)
  sql_base <- if (length(tmp_ids) > 0) {
    sql_template[-tmp_ids]
  } else {
    sql_template
  }

  # remove majcompflag (may be necessary for STATSGO)
  if (majcompflag == "ignore") {
    txt_majcompflag <- "AND component.majcompflag = 'Yes'"
    tmp <- regexpr(txt_majcompflag, sql_base, fixed = TRUE)
    iline <- which(tmp > 0)[[1L]]
    sql_base[iline] <- sub(txt_majcompflag, "", sql_base[iline])
  }

  # handle non-soil components
  if (!only_soilcomp) {
    txt_nosoilflag <- "compkind NOT IN"
    tmp <- regexpr(txt_nosoilflag, sql_base, fixed = TRUE)
    iline <- which(tmp > 0)[[1L]]
    sql_base[iline] <- ""
  }

  progress_bar <- progress_bar && N_chunks > 2

  if (progress_bar) {
    print("Fetch soil information from NRCS SDA:")

    has_progress_bar <- requireNamespace("utils")

    if (has_progress_bar) {
      pb <- utils::txtProgressBar(max = N_chunks, style = 3)
    } else {
      warning("Progress bar requested but package 'utils' is not available.")
    }

  } else {
    has_progress_bar <- FALSE
  }

  # Identify lines where mukey values are injected
  tmp <- regexpr(injection_format, sql_base, fixed = TRUE)
  iline <- which(tmp > 0)[[1L]]

  stopifnot(length(iline) == 1L)


  for (k in seq_along(ids_chunks)) {
    # Prepare SQL query for SDA
    sql <- sql_base

    # Insert requested parameter values
    tmp <- bind_params[ids_chunks[[k]], , drop = !hasMultiParams]

    sql[iline] <- sprintf(
      fmt = sql[iline],

      if (hasMultiParams) {
        paste0(
          "(",
          apply(tmp, 1, function(x) toString(sqlQuote(x))),
          ")",
          collapse = ","
        )

      } else {
        paste(sqlQuote(tmp), collapse = ",")
      }
    )

    # Send query to SDA
    # Suppress messages about returning a data.frame
    # (but returned value could be NULL)
    tmp_sql <- paste(sql, collapse = " ")
    res[[k]] <- suppressMessages(soilDB::SDA_query(tmp_sql))

    if (length(res) >= k && inherits(res[[k]], "try-error")) {
      message(
        "Error produced during call to `soilDB::SDA_query`; ",
        "result will be set to NULL; query leading to error was: ",
        tmp_sql
      )
      warning(res[[k]])
      res[[k]] <- NULL
    }

    if (has_progress_bar) {
      utils::setTxtProgressBar(pb, k)
    }
  }

  if (has_progress_bar) {
    close(pb)
  }

  do.call(rbind, res)
}


#' Extract soil information from the Soil Data Access \var{SDA} service by
#' \var{NRCS} for \pkg{SOILWAT2} applications
#'
#' @inheritParams rSW2st::as_points
#' @inheritParams fetch_mukeys_spatially_NRCS_SDA
#' @param mukeys A character or integer vector. List of soil map unit keys
#'   for which soil information should be extracted. Provide \code{locations}
#'   or \code{mukeys}.
#' @param method A character string. Method indicating whether \var{SDA}
#'   should query \var{"SSURGO"}, \var{"STATSGO"}, or
#'   \var{"SSURGO_then_STATSGO"} which attempts to replace
#'   \code{locations} without \var{"SSURGO"} soil information,
#'   e.g., due to unmapped areas, with \var{"STATSGO"} data.
#' @param only_majcomp A logical value. If \code{TRUE} and extraction is
#'   from \var{"SSURGO"}, then the query extracts only major components.
#'   If \code{FALSE}, then the query extracts components that are major and
#'   those that are not (see \var{majcompflag} of
#'   \code{\link{fetch_soils_from_NRCS_SDA}}).
#'   Ignored if extraction is from \var{"STATSGO"}.
#' @param remove_organic_horizons A character string. Method
#'   indicating how to deal with organic horizons as determined by
#'   function \code{\link{is_NRCS_horizon_organic}}. See details.
#' @param replace_missing_fragvol_with_zero A character string. Method
#'   indicating how missing/null values of rock/gravel fragment fractions
#'   should be interpreted;
#'   passed to \code{\link[rSW2data]{set_missing_soils_to_value}}.
#'   The options are one of
#'   \describe{
#'     \item{\var{"all"}}{
#'       Missing/null values of rack/gravel fragments are
#'       replaced by 0 %.
#'       See also argument \var{nullFragsAreZero} of
#'       function \code{\link[soilDB]{fetchSDA}}.
#'     }
#'     \item{\var{"at_surface"}}{
#'       Missing/null values of rack/gravel fragments are
#'       replaced by 0 % in the shallowest horizon only.
#'       Note, remaining missing values in deeper horizons
#'       can subsequently be imputed by argument \code{impute}.
#'     }
#'     \item{\var{"none"}}{
#'        Missing/null values remain unmodified unless
#'        argument \code{impute} is activated.
#'     }
#'   }
#' @param estimate_missing_bulkdensity A logical value. Estimate missing
#'   bulk density values from saturated water content and gravel volume.
#'   See \code{\link[rSW2data]{estimate_bulkdensity}}.
#' @param impute A logical value. Impute missing values with a
#'   shallow-depth value carried deeper approach (in analogy to \var{LOCF}).
#'   Consequently, missing values in the shallowest horizon are not imputed.
#'   See \code{\link[rSW2utils]{impute_df}}.
#' @param digits An integer value. The number of digits to which soil texture
#'   variables are rounded. Skip rounding if \code{NA} or \code{NULL}.
#' @param verbose A logical value.
#' @inheritParams fetch_soils_from_NRCS_SDA
#' @inheritParams calculate_soil_depth_NRCS
#'
#' @section Details: \var{NRCS} soil datasets \var{SSURGO} and \var{STATSGO} are
#'   organized in soil map units \var{mukey} that are
#'   spatially explicit (i.e., we can query their values by geographic location)
#'   and within each \var{mukey} into soil map unit components \var{cokey}
#'   which have no explicit spatial arrangement within a soil map unit.
#'   Because soil texture information is specific to soil map unit components,
#'   geographic location alone is insufficient to query soil texture.
#'
#'   This function relies that soil information of exactly one \var{cokey}
#'   per each \code{location} or \code{mukeys} is returned by
#'   \code{\link{fetch_soils_from_NRCS_SDA}}. The default \var{SQL} template
#'   \var{"nrcs_sql_template.sql"} extracts the "dominant component".
#'   The dominant component is defined as the the first \var{cokey} with the
#'   highest representative component percent \var{comppct_r} that is a
#'   soil component.
#'   See \var{GetDominantComponent.py}
#'   from \url{https://github.com/ncss-tech/SoilDataDevelopmentToolbox}.
#'
#'   Whereas \var{mukey} is expected to identify the same soil map unit
#'   across different releases of the \var{NRCS} database, a component
#'   cannot be tracked by its \var{cokey} across releases. A component of
#'   a soil map unit is best identified by a unique combination of
#'   \var{compname}, \var{comppct_r}, and \var{localphase}.
#'
#' @section Details:
#'   The argument \code{remove_organic_horizons} is one of
#'   \describe{
#'     \item{\var{"all"}}{
#'       All organic layers (at surface or buried) are removed and horizon
#'       number and depths are recalculated.
#'     }
#'     \item{\var{"at_surface"}}{
#'       Organic layer(s) at the soil surface are removed and horizon
#'       number and depths are recalculated. Buried organic horizons remain
#'       unmodified.
#'     }
#'     \item{\var{"none"}}{
#'        Horizons are not modified.
#'     }
#'   }
#'
#' @section Notes: A live internet connection is required to access \var{SDA}.
#'
#' @seealso \code{\link[soilDB]{fetchSDA}} and \code{\link[soilDB]{SDA_query}}
#'
#' @examples
#' \dontrun{
#' if (curl::has_internet()) {
#'   locations <- matrix(
#'     data = c(-120.325, -111.245, 39.855, 36.753),
#'     nrow = 2
#'   )
#'
#'   # Example 1: extract soils by mukey values
#'   extract_soils_NRCS_SDA(mukeys = c(471168, 1606800))
#'
#'   # Example 2: extract soils by geographic location
#'   extract_soils_NRCS_SDA(x = locations)
#'
#'   # Example 3: first identify mukey values by geographic location,
#'   # then query soils from SSURGO by mukey,
#'   # but still pass locations in case we need to query STATSGO as well
#'
#'   mukeys <- fetch_mukeys_spatially_NRCS_SDA(
#'     x = locations,
#'     db = "SSURGO",
#'     progress_bar = TRUE
#'   )
#'
#'   extract_soils_NRCS_SDA(
#'     x = locations,
#'     mukeys = mukeys[["mukeys"]],
#'     method = "SSURGO_then_STATSGO",
#'     remove_organic_horizons = "at_surface",
#'     replace_missing_fragvol_with_zero = "at_surface",
#'     estimate_missing_bulkdensity = TRUE,
#'     restrict_by_ec_or_ph = FALSE,
#'     impute = TRUE,
#'     progress_bar = TRUE,
#'     verbose = TRUE
#'   )
#' }
#' }
#'
#' @export
extract_soils_NRCS_SDA <- function(
  x,
  crs = 4326,
  mukeys = NULL,
  method = c("SSURGO", "STATSGO", "SSURGO_then_STATSGO"),
  sql_template = NA,
  only_majcomp = TRUE,
  only_soilcomp = TRUE,
  remove_organic_horizons = c("none", "all", "at_surface"),
  replace_missing_fragvol_with_zero = c("none", "all", "at_surface"),
  estimate_missing_bulkdensity = FALSE,
  restrict_by_ec_or_ph = TRUE,
  impute = FALSE,
  digits = 3L,
  chunk_size = 1000L,
  progress_bar = FALSE,
  verbose = FALSE
) {

  if (!missing(x)) {
    x <- rSW2st::as_points(x, to_class = "sf", crs = crs)
  }

  stopifnot(
    !(missing(x) && is.null(mukeys)),
    curl::has_internet(),
    missing(x) || is.null(mukeys) || nrow(x) == length(mukeys)
  )

  method <- match.arg(method)

  if (missing(x) && method == "SSURGO_then_STATSGO") {
    warning(
      "'method' == \"SSURGO_then_STATSGO\" has no effect ",
      "if locations are missing"
    )
  }

  db <- if (method == "STATSGO") "STATSGO" else "SSURGO"

  locs_keys <- data.frame(
    row_id = NA,
    unit_id = NA,
    source = db,
    mukey = if (is.null(mukeys)) {
      fetch_mukeys_spatially_NRCS_SDA(
        x = x,
        crs = crs,
        db = db,
        progress_bar = progress_bar
      )[["mukeys"]]

    } else {
      mukeys
    },
    cokey = NA,
    compname = NA,
    compkind = NA,
    comppct_r = NA,
    localphase = NA
  )

  stopifnot(!anyNA(locs_keys[["mukey"]]))

  locs_keys[, "row_id"] <- seq_len(nrow(locs_keys)) # extraction ID

  #--- Identify unique soil units, here, unique mukeys, (and define unit_id)
  locs_keys[, "unit_id"] <- match(
    locs_keys[, "mukey"],
    unique(locs_keys[, "mukey"])
  )


  # nolint start: object_usage_linter, unreachable_code_linter.
  if (FALSE) {
    # e.g., unique soil units defined by mukey-component combinations
    tmp_tag <- apply(
      locs_keys[, c("mukey", "compname", "comppct_r", "localphase")],
      MARGIN = 1,
      FUN = function(x) paste0(as.integer(x[[1L]]), "_", x[[2L]])
    )
    locs_keys[, "unit_id"] <- match(tmp_tag, unique(tmp_tag))
  }
  # nolint end


  #--- Download soil data from NRCS SDA web service (for unique "mukeys")
  res <- fetch_soils_from_NRCS_SDA(
    bind_params = as.integer(unique(locs_keys[["mukey"]])),
    sql_template = sql_template,
    majcompflag = if (only_majcomp) {
      switch(db, SSURGO = "subset", STATSGO = "ignore")
    } else {
      "ignore"
    },
    only_soilcomp = only_soilcomp,
    chunk_size = chunk_size,
    progress_bar = progress_bar
  )

  # Assign unique soil unit IDs to `res`
  ids <- match(res[, "MUKEY"], locs_keys[, "mukey"])
  res[, "unit_id"] <- locs_keys[ids, "unit_id"]

  # nolint start: object_usage_linter, unreachable_code_linter.
  if (FALSE) {
    # e.g., unique soil units defined by mukey-compname combinations
    tmp_tag2 <- apply(
      res[, c("MUKEY", "compname", "comppct_r", "localphase")],
      MARGIN = 1,
      FUN = function(x) paste0(as.integer(x[[1L]]), "_", x[[2L]])
    )
    ids <- match(tmp_tag2, tmp_tag)
    res[, "unit_id"] <- locs_keys[ids, "unit_id"]
  }
  # nolint end


  # Copy extracted soil information to table `locs_keys` (based on unit_id)
  ids <- match(locs_keys[, "unit_id"], res[, "unit_id"], nomatch = 0)
  tmp_vars <- c("compname", "compkind", "comppct_r", "localphase")
  tmp_vars <- intersect(tmp_vars, colnames(res))
  locs_keys[ids > 0, c("cokey", tmp_vars)] <- res[ids, c("COKEY", tmp_vars)]

  if (verbose && any(ids == 0)) {
    message("Some requests returned without soils: n = ", sum(ids == 0))
  }


  #--- Identify which variables are fixed per soil unit
  var_stxt3 <- c("sandtotal_r", "claytotal_r", "silttotal_r")
  var_stxt <- intersect(var_stxt3, colnames(res))
  var_output <- c(
    "dbovendry_r",
    "fragvol_r",
    var_stxt,
    "organic", "om_r"
  )

  tmp <- by(
    data = res,
    INDICES = res[["unit_id"]],
    FUN = function(x) {
      vapply(
        X = x,
        FUN = function(v) {
          if (is.numeric(v)) {
            var(v, na.rm = TRUE) > 0
          } else {
            nlevels(factor(v)) > 1
          }
        },
        FUN.VALUE = NA
      )
    },
    simplify = FALSE
  )

  tmp <- matrix(unlist(tmp), ncol = ncol(res), byrow = TRUE)

  fx_per_unit <- colnames(res)[!apply(tmp, 2, any, na.rm = TRUE)]

  fx_per_unit <- union(fx_per_unit, c("MUKEY", "COKEY"))
  fx_per_unit <- setdiff(fx_per_unit, c("hzdept_r", "hzdepb_r", var_output))

  stopifnot("unit_id" %in% fx_per_unit)


  #--- Deduce soil texture iff one of three values is missing
  if (all(var_stxt3 %in% colnames(res))) {
    res <- rSW2data::deduce_complete_soil_texture(
      x = res,
      var_stxt = var_stxt,
      val_total = 100,
      ignore_le = 5
    )
  }


  #--- Interpret missing values for rock/gravel fragments as 0 %
  if ("fragvol_r" %in% colnames(res)) {
    replace_missing_fragvol_with_zero <- match.arg(
      replace_missing_fragvol_with_zero
    )

    res <- rSW2data::set_missing_soils_to_value(
      x = res,
      variable = "fragvol_r",
      value = 0,
      where = replace_missing_fragvol_with_zero,
      horizon = "Horizon_No",
      verbose = verbose
    )
  }


  #--- Estimate soil bulk density if missing
  if (estimate_missing_bulkdensity) {
    is_missing_bd <- !is.finite(res[, "dbovendry_r"])

    if (any(is_missing_bd)) {
      res[is_missing_bd, "dbovendry_r"] <- rSW2data::estimate_bulkdensity(
        theta_saturated = res[is_missing_bd, "wsatiated_r"] / 100,
        gravel_volume = res[is_missing_bd, "fragvol_r"] / 100
      )

      if (verbose) {
        tmp <- is.finite(res[is_missing_bd, "dbovendry_r"])
        nm <- sum(!tmp)

        message(
          "Missing bulk density values estimated: n = ",
          sum(tmp),
          if (nm) paste0("; however, n = ", nm, " remain missing.")
        )
      }
    }
  }


  #--- Remove organic horizons
  # nolint start: scalar_in_linter.
  res[, "organic"] <- is_NRCS_horizon_organic(res) %in% TRUE
  # nolint end: scalar_in_linter.

  remove_organic_horizons <- match.arg(remove_organic_horizons)

  if (remove_organic_horizons %in% c("all", "at_surface")) {
    is_organic <- res[, "organic"]

    if (remove_organic_horizons == "all") {
      is_remove <- is_organic

      n_remove <- sum(is_remove)
      if (verbose && n_remove > 0) {
        message("Removed organic horizons: n = ", n_remove)
      }

    } else if (remove_organic_horizons == "at_surface") {
      #--- Organic horizon at the surface
      is_remove <- is_organic & res[, "Horizon_No"] == 1

      # Check whether more than one surface horizon is organic
      unit_w_osurf <- unique(res[is_remove, "unit_id"])
      ids_check <- which(res[["unit_id"]] %in% unit_w_osurf)

      if (length(ids_check) > 0) {
        tmp <- tapply(
          X = is_organic[ids_check],
          INDEX = res[ids_check, "unit_id"],
          FUN = function(x) {
            # First element corresponds to TRUE
            # because we only look at soil units with an organic surface horizon
            n <- rle(x)[["lengths"]][[1L]]
            c(rep(TRUE, n), rep(FALSE, length(x) - n))
          },
          simplify = FALSE
        )

        is_remove[ids_check] <- unlist(tmp[match(unit_w_osurf, names(tmp))])
      }


      n_remove <- sum(is_remove)
      if (verbose && n_remove > 0) {
        n_oburied <- sum(is_organic & !is_remove)

        message(
          "Removed organic horizons at surface: n = ", n_remove,
          " for n = ", length(unit_w_osurf), " unique soils",
          if (n_oburied > 0) {
            paste0("; n = ", n_oburied, " buried organic horizons remain.")
          } else {
            "."
          }
        )
      }
    }


    #--- Remove identified organic horizons

    if (any(is_remove)) {
      # Identify soil units with horizons to be removed
      unit_affected <- unique(res[is_remove, "unit_id"])
      ids_affected <- which(res[["unit_id"]] %in% unit_affected)

      res_affected <- cbind(
        res[ids_affected, ],
        remove = is_remove[ids_affected]
      )

      # Recalculate horizon ranks and all depth values
      tmp <- by(
        data = res_affected,
        INDICES = res_affected[["unit_id"]],
        FUN = function(x) {
          ids_keep <- !x[["remove"]]

          xnew <- x[ids_keep, , drop = FALSE]
          n <- nrow(xnew)

          if (n > 0) {
            tmp <- rep(0, length(x[["remove"]]))
            tmp[x[["remove"]]] <-
              x[x[["remove"]], "hzdepb_r"] - x[x[["remove"]], "hzdept_r"]
            removed_total <- sum(tmp)
            removed_widths <- cumsum(tmp)[ids_keep]

            # Re-calculate horizon rank
            xnew[, "Horizon_No"] <- seq_len(n)

            # Re-calculate upper/lower horizon depth limits
            ids <- grep("hzdep", colnames(xnew), fixed = TRUE)
            xnew[, ids] <- xnew[, ids] - removed_widths

            # Re-calculate depth restrictions
            ids <- grep("_depth", colnames(xnew), fixed = TRUE)
            xnew[, ids] <- xnew[, ids] - removed_total

          } else {
            # We need at least one entry per soil unit
            # Copy values of first row of those columns that are fixed per
            # soil unit and set others to NA
            xnew <- x[1, , drop = FALSE]
            xnew[, setdiff(colnames(xnew), fx_per_unit)] <- NA
          }

          xnew
        },
        simplify = FALSE
      )

      tmp <- do.call(rbind, tmp)

      # Put data back together
      res <- rbind(
        res[-ids_affected, ],
        tmp[, !grepl("remove", colnames(tmp), fixed = TRUE)]
      )
    }
  }

  res[, "organic"] <- as.integer(res[, "organic"])


  #--- Calculate soil depth of profile (per unique soil unit)
  # and convert depth table to wide-format for output
  locs_table_depths <- calculate_soil_depth_NRCS(
    x = res,
    target_site_ids = locs_keys[, "unit_id"],
    restrict_by_ec_or_ph = restrict_by_ec_or_ph,
    var_site_id = "unit_id",
    var_horizon = "Horizon_No",
    var_horizon_lower_depth = "hzdepb_r",
    var_restrictions =
      c("Horizon_depth", "RootZoneRestriction_depth", "Bedrock_depth"),
    var_soiltexture = c("sandtotal_r", "claytotal_r", "silttotal_r")
  )

  # Transfer final soil depth and (potentially adjusted depth_L1)
  ids <- match(res[["unit_id"]], rownames(locs_table_depths))
  res[, "SoilDepth_cm"] <- locs_table_depths[ids, "SoilDepth_cm"]
  res[, "N_horizons"] <- locs_table_depths[ids, "N_horizons"]

  tmp <- res[, "Horizon_No"] == 1 | res[, "N_horizons"] == 0
  is_shallowest <- tmp & !is.na(tmp)
  res[is_shallowest, "hzdepb_r"] <-
    locs_table_depths[ids[is_shallowest], "depth_L1"]

  # Fix rownames of depth table
  rownames(locs_table_depths) <- locs_keys[, "row_id"]


  #--- Last step: impute remaining missing values for each soil unit
  # by shallow-depth value carried deeper (in analogy to LOCF)
  # but do not impute missing values in the shallowest horizon
  if (impute) {
    res <- rSW2data::impute_soils(
      x = res,
      var_site_id = "unit_id",
      var_horizon = "Horizon_No",
      var_values = var_output,
      verbose = verbose
    )
  }


  #--- Convert units & rounding
  # Convert % to fraction
  var_pct_to_fraction <- intersect(
    c("fragvol_r", var_stxt3),
    colnames(res)
  )
  res[, var_pct_to_fraction] <- res[, var_pct_to_fraction] / 100

  # Round texture
  if (is.finite(digits)) {

    if (all(var_stxt3 %in% colnames(res))) {
      has_vals <-
        complete.cases(res[, var_stxt3]) &
        rowSums(res[, var_stxt3, drop = FALSE], na.rm = TRUE) > 0

      res[has_vals, var_stxt3] <- rSW2utils::scale_rounded_by_sum(
        x = res[has_vals, var_stxt3],
        digits = digits,
        icolumn_adjust = 3
      )

      var_others2 <- "fragvol_r"

    } else {
      var_others2 <- c("fragvol_r", var_stxt)
    }

    var_others2 <- intersect(var_others2, colnames(res))
    res[, var_others2] <- round(res[, var_others2], digits)
  }


  #--- Create texture table
  # Convert to wide format (one row for each point location)
  tmp_texture <- reshape2::acast(
    data = reshape2::melt(
      data = cbind(
        res[, c("Horizon_No", "unit_id", var_output)]
      ),
      id.vars = c("Horizon_No", "unit_id")
    ),
    formula = unit_id ~ Horizon_No + variable
  )

  ids <- match(locs_keys[, "unit_id"], rownames(tmp_texture), nomatch = NA)
  locs_table_texture <- tmp_texture[ids, , drop = FALSE]

  colnames(locs_table_texture) <- vapply(
    X = strsplit(colnames(locs_table_texture), split = "_", fixed = TRUE),
    FUN = function(x) paste0(x[[2L]], "_L", x[[1L]]),
    FUN.VALUE = NA_character_
  )
  rownames(locs_table_texture) <- locs_keys[, "row_id"]



  #--- Attempt to replace nosoil-sites with STATSGO
  ids_h0 <- which(locs_table_depths[, "N_horizons"] == 0)
  if (
    length(ids_h0) > 0 &&
      !missing(x) &&
      method == "SSURGO_then_STATSGO"
  ) {

    # Call again for nosoil rows and extract from STATSGO instead of SSURGO
    res0 <- Recall(
      x = x[ids_h0, ],
      method = "STATSGO",
      remove_organic_horizons = remove_organic_horizons,
      replace_missing_fragvol_with_zero = replace_missing_fragvol_with_zero,
      estimate_missing_bulkdensity = estimate_missing_bulkdensity,
      restrict_by_ec_or_ph = restrict_by_ec_or_ph,
      impute = impute,
      digits = digits,
      chunk_size = chunk_size,
      progress_bar = progress_bar,
      verbose = verbose
    )

    # Check that we have no longer nosoils
    is_res0_good <-
      !is.na(res0[["table_depths"]][, "N_horizons"]) &
      res0[["table_depths"]][, "N_horizons"] > 0
    ids_h0_replace <- ids_h0[is_res0_good]

    # Update nosoil data with soils from STATSGO
    if (length(ids_h0_replace) > 0) {
      locs_keys[ids_h0_replace, ] <- res0[["table_keys"]][is_res0_good, ]

      ivars <- seq_len(
        min(ncol(locs_table_depths), ncol(res0[["table_depths"]]))
      )
      locs_table_depths[ids_h0_replace, ivars] <-
        res0[["table_depths"]][is_res0_good, ivars]

      ivars <- seq_len(
        min(ncol(locs_table_texture), ncol(res0[["table_texture"]]))
      )
      locs_table_texture[ids_h0_replace, ivars] <-
        res0[["table_texture"]][is_res0_good, ivars]
    }
  }


  #--- Return tables
  list(
    ref = create_reference_for_NRCS_SDA(),
    table_keys = locs_keys,
    table_depths = locs_table_depths,
    table_texture = locs_table_texture
  )
}
DrylandEcology/rSW2exter documentation built on May 4, 2024, 10:53 p.m.