R/createSSURGO.R

Defines functions .gpkg_create_contents .gpkg_has_contents .gpkg_delete_contents .gpkg_add_contents createSSURGO downloadSSURGO

Documented in createSSURGO downloadSSURGO

#' Get SSURGO ZIP files from Web Soil Survey 'Download Soils Data'
#' 
#' Download ZIP files containing spatial (ESRI shapefile) and tabular (TXT) files with standard SSURGO format; optionally including the corresponding SSURGO Template Database with `include_template=TRUE`.
#' 
#' To specify the Soil Survey Areas you would like to obtain data you use a `WHERE` clause for query of `sacatalog` table such as `areasymbol = 'CA067'`, `"areasymbol IN ('CA628', 'CA067')"` or  `areasymbol LIKE 'CT%'`.
#'
#' @param WHERE A SQL `WHERE` clause expression used to filter records in `sacatalog` table. Alternately `WHERE` can be any spatial object supported by `SDA_spatialQuery()` for defining the target extent.
#' @param areasymbols Character vector of soil survey area symbols e.g. `c("CA067", "CA077")`. Used in lieu of `WHERE` argument.
#' @param destdir Directory to download ZIP files into. Default `tempdir()`.
#' @param exdir Directory to extract ZIP archives into. May be a directory that does not yet exist. Each ZIP file will extract to a folder labeled with `areasymbol` in this directory. Default: `destdir`
#' @param include_template Include the (possibly state-specific) MS Access template database? Default: `FALSE`
#' @param db Either `"SSURGO"` (default; detailed soil map) or `"STATSGO"` (general soil map).
#' @param extract Logical. Extract ZIP files to `exdir`? Default: `TRUE`
#' @param remove_zip Logical. Remove ZIP files after extracting? Default: `FALSE` 
#' @param overwrite Logical. Overwrite by re-extracting if directory already exists? Default: `FALSE`
#' @param quiet Logical. Passed to `curl::curl_download()`.
#' @details
#' When `db="STATSGO"` the `WHERE` argument is not supported. Allowed `areasymbols` include `"US"` and two-letter state codes e.g. `"WY"` for the Wyoming general soils map.
#' 
#' @export
#' 
#' @details Pipe-delimited TXT files are found in _/tabular/_ folder extracted from a SSURGO ZIP. The files are named for tables in the SSURGO schema. There is no header / the files do not have column names. See the _Soil Data Access Tables and Columns Report_: \url{https://sdmdataaccess.nrcs.usda.gov/documents/TablesAndColumnsReport.pdf} for details on tables, column names and metadata including the default sequence of columns used in TXT files. The function returns a `try-error` if the `WHERE`/`areasymbols` arguments result in
#' 
#' Several ESRI shapefiles are found in the _/spatial/_ folder extracted from a SSURGO ZIP. These have prefix `soilmu_` (mapunit), `soilsa_` (survey area), `soilsf_` (special features). There will also be a TXT file with prefix `soilsf_` describing any special features. Shapefile names then have an `a_` (polygon), `l_` (line), `p_` (point) followed by the soil survey area symbol.
#' 
#' @return Character. Paths to downloaded ZIP files (invisibly). May not exist if `remove_zip = TRUE`.
downloadSSURGO <- function(WHERE = NULL, 
                           areasymbols = NULL,
                           destdir = tempdir(), 
                           exdir = destdir, 
                           include_template = FALSE,
                           db = c('SSURGO', 'STATSGO'),
                           extract = TRUE, 
                           remove_zip = FALSE,
                           overwrite = FALSE,
                           quiet = FALSE) {
  
  db <- match.arg(toupper(db), c('SSURGO', 'STATSGO'))
  
  if (!is.null(WHERE) && db == "STATSGO") {
    stop('custom WHERE clause not supported with db="STATSGO"', call. = FALSE)
  }
  
  if (!is.null(areasymbols) && db == "STATSGO") {
    WHERE <- areasymbols
  }
  
  if (is.null(WHERE) && is.null(areasymbols)) {
    stop('must specify either `WHERE` or `areasymbols` argument', call. = FALSE)
  }
    
  if (is.null(WHERE) && !is.null(areasymbols)) {
    WHERE <- sprintf("areasymbol IN %s", format_SQL_in_statement(areasymbols))
  }
  
  if (!is.character(WHERE)) {
    # attempt passing WHERE to SDA_spatialQuery
    res <- suppressMessages(SDA_spatialQuery(WHERE, what = 'areasymbol'))
    WHERE <- paste("areasymbol IN", format_SQL_in_statement(res$areasymbol))
  }
  
  # make WSS download URLs from areasymbol, template, date
  urls <- .make_WSS_download_url(WHERE, include_template = include_template, db = db)
  
  if (inherits(urls, 'try-error')) {
    message(urls[1])
    return(invisible(urls))
  }
  
  if (!dir.exists(destdir)) {
    dir.create(destdir, recursive = TRUE)
  }
  
  # download files
  for (i in seq_along(urls)) {
    destfile <- file.path(destdir, basename(urls[i]))
    if (!file.exists(destfile)) {
      try(curl::curl_download(urls[i], destfile = destfile, quiet = quiet, mode = "wb", handle = .soilDB_curl_handle()), silent = quiet)
    }
  }
  
  paths <- list.files(destdir, pattern = "\\.zip$", full.names = TRUE)
  paths2 <- paths[grep(".*wss_(SSA|gsmsoil)_(.*)_.*", paths)]
  
  if  (extract) {
    if (!quiet) {
      message("Extracting downloaded ZIP files...")
    }
    
    if (length(paths2) == 0) {
      stop("Could not find SSURGO ZIP files in `destdir`: ", destdir, call. = FALSE)
    }
    
    if (!dir.exists(exdir)) {
      dir.create(exdir, recursive = TRUE)
    }
    
    for (i in seq_along(paths2)) {
      ssa <- gsub(".*wss_SSA_(.*)_.*", "\\1", paths2[i])
      if ((!dir.exists(file.path(exdir, ssa)) || overwrite) && 
          length(utils::unzip(paths2[i], exdir = exdir)) == 0) {
        message(paste('Invalid zipfile:', paths2[i]))
      }
    }
    
    if (remove_zip) {
      file.remove(paths2)
    }
  }
  
  invisible(paths2)
}
 
#' Create a SQLite database or GeoPackage from one or more SSURGO Exports
#'
#' @param filename Output file name (e.g. `'db.sqlite'` or `'db.gpkg'`)
#' @param exdir Path containing containing SSURGO spatial (.shp) and tabular (.txt) files. 
#' @param pattern Character. Optional regular expression to use to filter subdirectories of `exdir`. Default: `NULL` will search all subdirectories for SSURGO export files.
#' @param include_spatial Logical. Include spatial data layers in database? Default: `TRUE`. 
#' @param overwrite Logical. Overwrite existing layers? Default `FALSE` will append to existing tables/layers.
#' @param header Logical. Passed to `read.delim()` for reading pipe-delimited (`|`) text files containing tabular data.
#' @param quiet Logical. Suppress messages and other output from database read/write operations?
#' @param ... Additional arguments passed to `write_sf()` for writing spatial layers.
#'
#' @return Character. Vector of layer/table names in `filename`.
#' @export
#' @examples
#' \dontrun{
#'  downloadSSURGO("areasymbol IN ('CA067', 'CA077', 'CA632')", destdir = "SSURGO_test")
#'  createSSURGO("test.gpkg", "SSURGO_test")
#' }
createSSURGO <- function(filename,
                         exdir,
                         pattern = NULL,
                         include_spatial = TRUE,
                         overwrite = FALSE,
                         header = FALSE,
                         quiet = TRUE,
                         ...) {
  
  if (missing(filename) || length(filename) == 0) {
    stop("`filename` should be a path to a .gpkg or .sqlite file to create or append to.")
  }
  
  IS_GPKG <- grepl("\\.gpkg$", filename, ignore.case = TRUE)[1]
  
  f <- list.files(exdir, recursive = TRUE, pattern = pattern, full.names = TRUE)
  
  if (!requireNamespace("sf"))
    stop("package `sf` is required to write spatial datasets to SSURGO SQLite databases", call. = FALSE)
  
  if (!requireNamespace("RSQLite"))
    stop("package `RSQLite` is required to write tabular datasets to SSURGO SQLite databases", call. = FALSE)
  
  if (isTRUE(overwrite) && file.exists(filename)) {
    file.remove(filename)
  }
  
  # create and add combined vector datasets:
  #   "soilmu_a", "soilmu_l", "soilmu_p", "soilsa_a", "soilsf_l", "soilsf_p" 
  f.shp <- f[grepl(".*\\.shp$", f)]
  shp.grp <- do.call('rbind', strsplit(gsub(".*soil([musfa]{2})_([apl])_([a-z]{2}\\d{3}|[a-z]{2})\\.shp", "\\1;\\2;\\3", f.shp), ";", fixed = TRUE))
  
  layer_names <- c(`mu_a` = "mupolygon", `mu_l` = "muline",   `mu_p` = "mupoint", 
                   `sa_a` = "sapolygon", `sf_l` = "featline", `sf_p` = "featpoint")
  
  if (nrow(shp.grp) >= 1 && ncol(shp.grp) == 3 && include_spatial) {
    f.shp.grp <- split(f.shp, list(feature = shp.grp[,1], geom = shp.grp[,2]))
    
    lapply(seq_along(f.shp.grp), function(i) {
      lapply(seq_along(f.shp.grp[[i]]), function(j){
        lnm <- layer_names[match(gsub(".*soil([musfa]{2}_[apl])_.*", "\\1", f.shp.grp[[i]][j]),
                                 names(layer_names))]
        
        if (overwrite && j == 1) {
          sf::write_sf(sf::read_sf(f.shp.grp[[i]][j]), filename, layer = lnm, overwrite = TRUE, ...)
        } else sf::write_sf(sf::read_sf(f.shp.grp[[i]][j]), filename, layer = lnm, append = TRUE, ...)
        NULL
      })
    })
  }
  
  # create and add combined tabular datasets
  f.txt <- f[grepl(".*\\.txt$", f)]
  txt.grp <- gsub("\\.txt", "", basename(f.txt))
  
  # explicit handling special feature descriptions -> "featdesc" table
  txt.grp[grepl("soilsf_t_", txt.grp)] <- "featdesc"
  
  f.txt.grp <- split(f.txt, txt.grp)
  
  # get table, column, index lookup tables
  mstabn <- f.txt.grp[[which(names(f.txt.grp) %in% c("mstab", "mdstattabs", "MetadataTable"))[1]]][[1]]
  mstabcn <- f.txt.grp[[which(names(f.txt.grp) %in% c("mstabcol", "mdstattabcols", "MetadataColumnLookup"))[1]]][[1]]
  msidxdn <- f.txt.grp[[which(names(f.txt.grp) %in% c("msidxdet", "mdstatidxdet", "MetadataIndexDetail"))[1]]][[1]]
  
  if (length(mstabn) >= 1) {
    mstab <- read.delim(mstabn[1], sep = "|", stringsAsFactors = FALSE, header = header)
    mstab_lut <- mstab[[1]]
    names(mstab_lut) <- mstab[[5]]
  } else {
    mstab_lut <- names(f.txt.grp)
    names(mstab_lut) <- names(f.txt.grp)
  }
  
  if (length(mstabcn) >= 1) {
    mstabcol <- read.delim(mstabcn[1], sep = "|", stringsAsFactors = FALSE, header = header)
  }
  
  if (length(msidxdn) >= 1) {
    msidxdet <- read.delim(msidxdn[1], sep = "|", stringsAsFactors = FALSE, header = header)
  }
  
  con <- RSQLite::dbConnect(RSQLite::SQLite(), filename, loadable.extensions = TRUE)  
  on.exit(RSQLite::dbDisconnect(con))
  
  lapply(names(f.txt.grp), function(x) {
    
    if (!is.null(mstabcol)) {
      newnames <- mstabcol[[3]][mstabcol[[1]] == mstab_lut[x]]
    }
    
    if (!is.null(msidxdet)) {
      indexPK <- na.omit(msidxdet[[4]][msidxdet[[1]] == mstab_lut[x] & grepl("PK_", msidxdet[[2]])])
      indexDI <- na.omit(msidxdet[[4]][msidxdet[[1]] == mstab_lut[x] & grepl("DI_", msidxdet[[2]])])
    }
    
    d <- try(as.data.frame(data.table::rbindlist(lapply(seq_along(f.txt.grp[[x]]), function(i) {
        
        y <- read.delim(f.txt.grp[[x]][i], sep = "|", stringsAsFactors = FALSE, header = header)
        
        if (length(y) == 1) {
          y <- data.frame(content = y)
        } else {
          if (!is.null(mstab) && !header) { # preserve headers if present 
            colnames(y) <- newnames
          }
        }
        y
    }))), silent = quiet)
    
    if (length(mstab_lut[x]) && is.na(mstab_lut[x])) {
      mstab_lut[x] <- x
    }
    
    if (length(mstab_lut[x]) && !is.na(mstab_lut[x]) && inherits(d, 'data.frame') && nrow(d) > 0) {
      # remove repeated records/metadata
      if (ncol(d) > 1) {
        d <- unique(d) 
      }
      
      # write tabular data to file
      try({
        if (overwrite) {
          RSQLite::dbWriteTable(con, mstab_lut[x], d, overwrite = TRUE)
        } else RSQLite::dbWriteTable(con, mstab_lut[x], d, append = TRUE)
      }, silent = quiet)
      
      # create pkey indices
      if (!is.null(indexPK) && length(indexPK) > 0) {
        try({
          RSQLite::dbExecute(con, sprintf("CREATE UNIQUE INDEX IF NOT EXISTS %s ON %s (%s)", 
                                          paste0('PK_', mstab_lut[x]), mstab_lut[x], paste0(indexPK, collapse = ",")))
        }, silent = quiet)
      }
      
      # create key indices
      if (!is.null(indexDI) && length(indexDI) > 0) {
        for (i in seq_along(indexDI)) {
          try({
            RSQLite::dbExecute(con, sprintf("CREATE INDEX IF NOT EXISTS %s ON %s (%s)", 
                                            paste0('DI_', mstab_lut[x]), mstab_lut[x], indexDI[i]))
          }, silent = quiet)
        }
      }
      
      # for GPKG output, add gpkg_contents (metadata for features and attributes)
      if (IS_GPKG) {
        if (!.gpkg_has_contents(con)) {
          # if no spatial data inserted, there will be no gpkg_contents table initally
          try(.gpkg_create_contents(con))
        }
        # update gpkg_contents table entry
        try(.gpkg_delete_contents(con, mstab_lut[x]))
        try(.gpkg_add_contents(con, mstab_lut[x]))
      }
      
      # TODO: other foreign keys/relationships? ALTER TABLE/ADD CONSTRAINT not available in SQLite
      #  the only way to add a foreign key is via CREATE TABLE which means refactoring above two
      #  steps into a single SQL statement (create table with primary and foreign keys)
    }
  })
  
  res <- RSQLite::dbListTables(con)
  invisible(res)
}

## From https://github.com/brownag/gpkg -----

#' Add, Remove, Update and Create `gpkg_contents` table and records
#' @description `gpkg_add_contents()`: Add a record to `gpkg_contents`
#' @param con A _geopackage_
#' @param table_name Name of table to add or remove record for in _gpkg_contents_
#' @param description Default `""`
#' @param template Default `NULL` uses global EPSG:4326 with bounds -180,-90:180,90
#' @return Result of `RSQLite::dbExecute()`
#' @noRd
#' @keywords internal
.gpkg_add_contents <- function(con, table_name, description = "", template = NULL) {
  
  stopifnot(requireNamespace("RSQLite"))
  
  if (!missing(template) &&
      !is.null(template) &&
      is.list(template) &&
      all(c("ext", "srsid") %in% names(template))) {
    ex <- template$ext
    cr <- as.integer(template$srsid)
  } else {
    ex <- c(-180, -90, 180, 90)
    cr <- 4326
  }
  
  # append to gpkg_contents
  RSQLite::dbExecute(con,
                    paste0(
                      "INSERT INTO gpkg_contents (table_name, data_type, identifier, 
                                  description, last_change,
                                  min_x, min_y, max_x, max_y, srs_id) 
       VALUES ('",
       table_name ,
       "', 'attributes', '",
       table_name,
       "', '",
       description,
       "','",
       strftime(Sys.time(), '%Y-%m-%dT%H:%M:%OS3Z'),
       "', ", ex[1], ", ", ex[2], ", ",
       ex[3], ", ", ex[4], ", ",
       cr, "
                      );"
                    )
  )
}

#' @description `.gpkg_delete_contents()`: Delete a record from `gpkg_contents` based on table name
#' @noRd
#' @keywords internal
.gpkg_delete_contents <- function(con, table_name) {
  stopifnot(requireNamespace("RSQLite"))
  RSQLite::dbExecute(con, paste0("DELETE FROM gpkg_contents WHERE table_name = '", table_name, "'"))
}

#' @description `.gpkg_has_contents()`: Determine if a database has table named `"gpkg_contents"` 
#' @noRd
#' @keywords internal
.gpkg_has_contents <- function(con) {
  stopifnot(requireNamespace("RSQLite"))
  isTRUE("gpkg_contents" %in% RSQLite::dbListTables(con))
}

#' @description `.gpkg_has_contents()`: Determine if a database has table named `"gpkg_contents"` 
#' @noRd
#' @keywords internal
.gpkg_create_contents <- function(con) {
    stopifnot(requireNamespace("RSQLite"))
    q <- "CREATE TABLE gpkg_contents (
      table_name TEXT NOT NULL PRIMARY KEY,
      data_type TEXT NOT NULL,
      identifier TEXT UNIQUE,
      description TEXT DEFAULT '',
      last_change DATETIME NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ','now')),
      min_x DOUBLE,
      min_y DOUBLE,
      max_x DOUBLE,
      max_y DOUBLE,
      srs_id INTEGER,
      CONSTRAINT fk_gc_r_srs_id FOREIGN KEY (srs_id) REFERENCES gpkg_spatial_ref_sys(srs_id)
    )"
    
    if (!.gpkg_has_contents(con)) {
      RSQLite::dbExecute(con, q)
    } else return(1)
}
ncss-tech/soilDB documentation built on May 5, 2024, 2:21 a.m.