R/fetchNASIS_pedons.R

Defines functions .fetchNASIS_pedons

# get NASIS site/pedon/horizon/diagnostic feature data
.fetchNASIS_pedons <- function(SS = TRUE,
                               fill = FALSE,
                               rmHzErrors = FALSE,
                               nullFragsAreZero = TRUE,
                               soilColorState = 'moist',
                               mixColors = TRUE,
                               lab = FALSE,
                               stringsAsFactors = NULL,
                               dsn = NULL
) {
  
  if (!missing(stringsAsFactors) && is.logical(stringsAsFactors)) {
    .Deprecated(msg = sprintf("stringsAsFactors argument is deprecated.\nSetting package option with `NASISDomainsAsFactor(%s)`", stringsAsFactors))
    NASISDomainsAsFactor(stringsAsFactors)
  }
  
  # check if NASIS local DB instance/ODBC data source is available
  .soilDB_test_NASIS_connection(dsn = dsn)
  
  # sanity check
  if (!soilColorState %in% c('dry', 'moist'))
    stop('soilColorState must be either `dry` or `moist`', call. = FALSE)

  ## load data in pieces
  # these fail gracefully when no data in local DB | selected set
  site_data  <- get_site_data_from_NASIS_db(SS = SS, dsn = dsn)
  hz_data    <- get_hz_data_from_NASIS_db(SS = SS, fill = fill, dsn = dsn)
  color_data <- get_colors_from_NASIS_db(SS = SS, mixColors = mixColors, dsn = dsn)

  ## ensure there are enough data to create an SPC object
  ds <- ifelse(SS, "NASIS selected set", "NASIS local database")
  if (nrow(site_data) == 0) {
    stop('No site/site observation records in ', ds, call. = FALSE)
  }
  if (nrow(hz_data) == 0) {
    stop('No horizon records in ', ds,'. Use `fill = TRUE` to include pedons without horizons.', call. = FALSE)
  }
  
  ## https://github.com/ncss-tech/soilDB/issues/44
  # optionally load phlabresults table
  if (lab) {
    phlabresults <- .get_phlabresults_data_from_NASIS_db(SS = SS, dsn = dsn)
    
    # TODO: perform phlabresults aggregation method(s) here
    
    hz_data <- merge(hz_data,
                     phlabresults,
                     by = c("peiid", "phiid"),
                     all = TRUE,
                     sort = FALSE)
  }
  
  # data that cannot be effectively flattened in SQL

  extended_data <- get_extended_data_from_NASIS_db(SS = SS,
                                                   nullFragsAreZero = nullFragsAreZero,
                                                   dsn = dsn)

  ## fix some common problems

  # replace missing lower boundaries
  missing.lower.depth.idx <- which(!is.na(hz_data$hzdept) & is.na(hz_data$hzdepb))

  # keep track of affected pedon IDs (if none, this will have zero length)
  assign('missing.bottom.depths', value = unique(hz_data$pedon_id[missing.lower.depth.idx]), envir = get_soilDB_env())

  if (length(missing.lower.depth.idx) > 0) {
    message(paste0('replacing missing lower horizon depths with top depth + 1cm ... [', length(missing.lower.depth.idx), ' horizons]'))

    # make edit
    hz_data$hzdepb[missing.lower.depth.idx] <- hz_data$hzdept[missing.lower.depth.idx] + 1
  }

  # top == bottom ? bottom <- bottom + 1
  top.eq.bottom.idx <- which(hz_data$hzdept == hz_data$hzdepb)

  # keep track of affected pedon IDs (if none, this will have zero length)
  assign('top.bottom.equal', value = unique(hz_data$pedon_id[	top.eq.bottom.idx]), envir = get_soilDB_env())

  if (length(top.eq.bottom.idx) > 0) {
    message(paste0('top/bottom depths equal, adding 1cm to bottom depth ... [', length(top.eq.bottom.idx), ' horizons]'))

    # make the edit
    hz_data$hzdepb[top.eq.bottom.idx] <- hz_data$hzdepb[top.eq.bottom.idx] + 1
  }

  #  aqp uses data.table for efficient logic checking
  filled.ids <- character(0)
  if (rmHzErrors) {

    # get overall validity (combination of 4 logic tests applied to each peiid)
    h.test <- aqp::checkHzDepthLogic(hz_data, c("hzdept","hzdepb"), "peiid", fast = TRUE)
    
    # fill=TRUE adds horizons with NA phiid will have NA depths -- will not pass logic check
    filled.idx <- which(is.na(hz_data$phiid))
    if(length(filled.idx) > 0) {
      filled.ids <- as.character(hz_data$peiid[filled.idx])
      #print(dput(filled.ids))
    }
    
    # which are the good (valid) ones?
    good.ids <- as.character(h.test$peiid[which(h.test$valid)])
    bad.ids <- as.character(h.test$peiid[which(!h.test$valid)])
    bad.horizons <- hz_data[hz_data$peiid %in% h.test$peiid[which(!h.test$valid)], 
                            c("peiid", "phiid", "pedon_id", "hzname", "hzdept", "hzdepb")]
    bad.pedon.ids <- site_data$pedon_id[which(site_data$peiid %in% bad.ids)]
    
    # handle fill=TRUE
    if(length(filled.ids) > 0) {
      good.ids <- unique(c(good.ids, filled.ids))
      bad.ids <- unique(bad.ids[!bad.ids %in% filled.ids])
    }
    
    # optionally filter pedons WITH NO horizonation inconsistencies
    if (rmHzErrors) {
      hz_data <- hz_data[which(hz_data$peiid %in% good.ids), ]
    }
    
    # keep track of those pedons with horizonation errors
    assign('bad.pedon.ids', value = bad.pedon.ids, envir = get_soilDB_env())
    assign("bad.horizons", value = data.frame(bad.horizons), envir = get_soilDB_env())
  }

  # convert pedon and horizon unique ID to character
  hz_data$peiid <- as.character(hz_data$peiid)
  hz_data$phiid <- as.character(hz_data$phiid)

  # upgrade to SoilProfilecollection
  depths(hz_data) <- peiid ~ hzdept + hzdepb

  # move pedon_id into @site
  site(hz_data) <- ~ pedon_id

  ## copy pre-computed colors into a convenience field for plotting
  # moist colors
  if (soilColorState == 'moist') {
    color_data$soil_color <- color_data$moist_soil_color
  }
  
  # dry colors
  if (soilColorState == 'dry') {
    color_data$soil_color <- color_data$dry_soil_color
  }
  
  horizons(hz_data) <- color_data

  # check for empty fragment summary and nullFragsAreZero
  if(nullFragsAreZero & all(is.na(unique(extended_data$frag_summary$phiid)))) {
    extended_data$frag_summary <- cbind(phiid = unique(hz_data$phiid), extended_data$frag_summary[, -1])
  }
  
  ## join hz + fragment summary
  horizons(hz_data) <- extended_data$frag_summary

  # check for empty artifact summary and nullFragsAreZerod
  if (nullFragsAreZero & all(is.na(unique(extended_data$art_summary$phiid)))) {
    extended_data$art_summary <- cbind(phiid = unique(hz_data$phiid), extended_data$art_summary[, -1])
  }
  
  # join hz + artifact summary
  horizons(hz_data) <- extended_data$art_summary

  # add site data to object
  # remove 'pedon_id' column from site_data
  site_data$pedon_id <- NULL
  
  # TODO: duplicating surface fine gravel column with old name for backward compatibility
  site_data$surface_fgravel <- site_data$surface_fine_gravel
  
  # left-join via peiid
  # < 0.1 second for ~ 4k pedons
  site(hz_data) <- site_data

  # load best-guess optimal records from taxhistory
  # method is added to the new field called 'selection_method'
  # assumes that classdate is a datetime class object!
  # 2019-01-31: converting to base functions
  # 2020-02-17: converting to data.table

  # define data.table globals for R CMD CHECK
  .BY <- NULL
  .SD <- NULL

  ed.tax <- data.table::as.data.table(extended_data$taxhistory)
  best.tax.data <- ed.tax[, .pickBestTaxHistory(.SD),
                          by = list(peiid = ed.tax$peiid)]
  site(hz_data) <- as.data.frame(best.tax.data)

  # get "best" ecosite data (most recent correlation, or most complete if no date)
  site(hz_data) <- extended_data$ecositehistory

  ## TODO: NA in diagnostic boolean columns are related to pedons with no diagnostic features
  ## https://github.com/ncss-tech/soilDB/issues/59
  # add diagnostic boolean data into @site
  site(hz_data) <- extended_data$diagHzBoolean

  ## optionally convert NA fragvol to 0
  if (nullFragsAreZero) {
    # this is the "total fragment volume" per NASIS calculation
    hz_data$fragvoltot <- ifelse(is.na(hz_data$fragvoltot), 0, hz_data$fragvoltot)

    # this is computed by soilDB::simplifyFragmentData()
    hz_data$total_frags_pct <- ifelse(is.na(hz_data$total_frags_pct), 0, hz_data$total_frags_pct)

    # this is computed by soilDB::simplifyFragmentData()
    # no para-frags
    hz_data$total_frags_pct_nopf <- ifelse(is.na(hz_data$total_frags_pct_nopf), 0, hz_data$total_frags_pct_nopf)

    # this is computed by soilDB::simplifyArtifactData()
    hz_data$total_art_pct <- ifelse(is.na(hz_data$total_art_pct), 0, hz_data$total_art_pct)
  }

  ## 2021-11-05: converted surface frag summary to simplifyFragmentData() in get_site_data_from_NASIS_db()

  # load diagnostic horizons into @diagnostic:
  # supress warnings: diagnostic_hz() <- is noisy when not all profiles have diagnostic hz data
  suppressWarnings(diagnostic_hz(hz_data) <- extended_data$diagnostic)

  # add restrictions to SPC
  # required new setter in aqp SPC object (AGB added 2019/12/23)
  suppressWarnings(restrictions(hz_data) <- extended_data$restriction)

  # join-in landform string w/ ampersand as separator for hierarchy
  # .formatLandformString <- soilDB:::.formatLandformString
  # .formatParentMaterialString <- soilDB:::.formatParentMaterialString
  ed.lf <- data.table::as.data.table(extended_data$geomorph)
  lf <- ed.lf[, .formatLandformString(.SD, uid = .BY$peiid, name.sep = ' & '),
              by = list(peiid = ed.lf$peiid)]

  if (ncol(lf) > 1) {
    site(hz_data) <- as.data.frame(lf[, c("peiid", "landform_string", "landscape_string", "microfeature_string", "geomicrorelief_string")])
  }
  
  ed.pm <- data.table::as.data.table(extended_data$pm)
  pm <- ed.pm[, .formatParentMaterialString(.SD, uid = .BY$siteiid, name.sep = ' & '),
              by = list(siteiid = ed.pm$siteiid)]
  
  if (ncol(pm) > 2) {
    site(hz_data) <- as.data.frame(pm[, c("siteiid", "pmkind", "pmorigin")])
  }
  
  # set metadata
  m <- metadata(hz_data)
  m$origin <- 'NASIS pedons'
  m$created <- Sys.time()
  metadata(hz_data) <- m

  # print any messages on possible data quality problems:
  if (exists('sites.missing.pedons', envir = get_soilDB_env())) {
    if (length(get('sites.missing.pedons', envir = get_soilDB_env())) > 0) {
      message(
        "-> QC: sites without pedons: \n\tUse `get('sites.missing.pedons', envir=get_soilDB_env())` for site record IDs (siteiid)"
      )
    }
  }
  if (exists('dup.pedon.ids', envir = get_soilDB_env()))
    if (length(get('dup.pedon.ids', envir = get_soilDB_env())) > 0)
      message("-> QC: duplicate pedons: \n\tUse `get('dup.pedon.ids', envir=get_soilDB_env())` for pedon record IDs (peiid)")

  # set NASIS component specific horizon identifier
  if (!fill & length(filled.ids) == 0) {
    res <- try(hzidname(hz_data) <- 'phiid')
    if (inherits(res, 'try-error')) {
      if (!rmHzErrors) {
        warning("cannot set `phiid` as unique pedon horizon key -- duplicate horizons present with rmHzErrors=FALSE")
      } else {
        warning("cannot set `phiid` as unique pedon horizon key -- defaulting to `hzID`")
      }
    }
  } else {
    warning("cannot set `phiid` as unique pedon horizon key - `NA` introduced by fill=TRUE", call.=F)
  }
  
  # set hz designation and texture fields -- NB: chose to use calculated texture -- more versatile
  # functions designed to use hztexclname() should handle presence of in-lieu, modifiers, etc.
  hzdesgnname(hz_data) <- "hzname"
  hztexclname(hz_data) <- "texture"

  if (exists('bad.pedon.ids', envir = get_soilDB_env()))
    if (length(get('bad.pedon.ids', envir = get_soilDB_env())) > 0)
      message("-> QC: horizon errors detected:\n\tUse `get('bad.pedon.ids', envir=get_soilDB_env())` for pedon record IDs (peiid)\n\tUse `get('bad.horizons', envir=get_soilDB_env())` for horizon designations")

  if (exists('missing.bottom.depths', envir = get_soilDB_env()))
    if (length(get('missing.bottom.depths', envir = get_soilDB_env())) > 0)
      message("-> QC: pedons missing bottom hz depths:\n\tUse `get('missing.bottom.depths', envir=get_soilDB_env())` for pedon record IDs (peiid)")

  if (exists('top.bottom.equal', envir = get_soilDB_env()))
    if (length(get('top.bottom.equal', envir = get_soilDB_env())) > 0)
      message("-> QC: equal hz top and bottom depths:\n\tUse `get('top.bottom.equal', envir=get_soilDB_env())` for pedon record IDs (peiid)")

  # done
  return(hz_data)
}
ncss-tech/soilDB documentation built on April 25, 2024, 12:29 a.m.