# R/get_OSD.R

#' Get Official Series Description Data from JSON, HTML or TXT sources
#'
#' @param series A character vector of Official Series names, e.g. `"Chewacla"`
#' @param result Select `"json"`, `"html"`, or `"txt"` output
#' @param base_url Optional: alternate JSON/HTML/TXT repository path. Default: `NULL` uses `"https://github.com/ncss-tech/SoilKnowledgeBase"` for `result="json"` and `"https://soilseriesdesc.sc.egov.usda.gov/OSD_Docs/"` for `result="html"`
#' @param fix_ocr_errors Default: `FALSE`. Applies only to `result='json'`; convert clear cases of Optical Character Recognition (OCR) errors to their likely intended values.
#' @param verbose Print errors and warning messages related to HTTP requests? Default: `FALSE` 
#'
#' @details The default `base_url` for `result="json"` points to JSON files stored in a GitHub repository that is regularly updated from the official source of Series Descriptions. JSON files follow the format `https://raw.githubusercontent.com/ncss-tech/SoilKnowledgeBase/main/inst/extdata/OSD/{LETTER}/{SERIES}.json`, and the HTML files used for `result="html"` (the official source) follow `https://soilseriesdesc.sc.egov.usda.gov/OSD_Docs/{LETTER}/{SERIES}.html`.
#'
#' `fix_ocr_errors` is turned off (`FALSE`) by default. When `TRUE`, assume that in color data (hue/value/chroma) a lowercase "L" (`"l"`) is a `1` and a capital "O" is a `0`. Also, in horizon designations, assume a lowercase "L" is a `1` and a leading `0` is the capital letter `"O"`.
#' @return For `result="json"`: a `data.frame` with 1 row per series and 1 column per "section" in the OSD, as defined in the National Soil Survey Handbook. For `result="txt"` or `result="html"`: a list of character vectors containing the OSD text, with 1 element per series and one value per line.
#' @export
#' @aliases get_OSD_JSON
#' @examplesIf curl::has_internet()
#' \donttest{
#'   series <- c("Musick", "Hector", "Chewacla")
#'   get_OSD(series)
#' }
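#'
#' \donttest{
#'   # illustrative only: the same lookup returning raw OSD text parsed from the
#'   # official HTML pages (requires the `rvest` package)
#'   if (requireNamespace("rvest", quietly = TRUE)) {
#'     str(get_OSD("Chewacla", result = "html"))
#'   }
#' }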
get_OSD <- function(series, base_url = NULL, result = c("json","html","txt"), fix_ocr_errors = FALSE, verbose = FALSE) {
  result <- match.arg(tolower(result), c("json","html","txt"))

  a_url <- NULL
  if (!is.null(base_url)) {
    a_url <- base_url
  }

  switch(result,
         "json" = { .get_OSD_JSON(series, base_url = a_url, verbose = verbose, fix_ocr_errors = fix_ocr_errors) },
         "html" = { .get_OSD_HTML(series, base_url = a_url, verbose = verbose) },
         "txt" =  { .get_OSD_TXT(series, verbose = verbose)  })
}


## DEB: there are funky white-space characters (a0 created by &nbsp;) between location / state on line 2
## DEB: there is no new-line between date and "XXX SERIES" because an <H1> element is used to create vertical space

.get_OSD_HTML <- function(series, base_url = NULL, verbose = FALSE) {
  if (!requireNamespace('rvest', quietly = TRUE))
    stop('please install the `rvest` package', call. = FALSE)

  if (missing(base_url) || is.null(base_url))
    base_url <- 'https://soilseriesdesc.sc.egov.usda.gov/OSD_Docs/'

  # get HTML content and strip blank / NA lines
  res <- sapply(.seriesNameToURL(series, base_url = base_url), function(x) {

    # if the URL is bad a warning with 404 will be generated
    u <- suppressWarnings(try(url(x, "rb"), silent = TRUE))
    if (inherits(u, 'try-error'))
      return(NULL)

    # HTML results as xml_document
    # NOTE: read_html() has no `silent` argument; wrap in try() so a failed
    # request or parse does not abort the loop
    html.raw <- try(rvest::read_html(u), silent = !verbose)

    # close connection
    close(u)

    if (inherits(html.raw, 'try-error'))
      return(NULL)

    ## not quite right, this removes the title but is otherwise a better candidate for future
    # # 2021-09-13: explicit conversion of &nbsp; -> simple white space
    # textResult <- rvest::html_text2(html.raw, preserve_nbsp = FALSE)

    # NOTE: not enough line-breaks between Date and "<H1>XXX SERIES</H1>", lines are joined
    textResult <- rvest::html_text(html.raw)

    # remove extra line-breaks and other cruft
    textResult <- .stripOSDContents(readLines(textConnection(textResult)))

    # line 5 does not break after the date: "09/2021FRESNO SERIES"
    # repair it
    txt.part <- strsplit(textResult[5], "[0-9]+\\/[0-9]+")[[1]][2]
    date.part <- gsub(txt.part, '', textResult[5], fixed = TRUE)

    # correct contents of line 5
    textResult[5] <- date.part

    # insert the correct contents of line 6, shift everything forwards
    c(textResult[1:5], txt.part, textResult[6:length(textResult)])

  })

  names(res) <- toupper(series)
  return(res)
}
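
# illustrative sketch (not run), assuming an internet connection and the `rvest`
# package: the result is a named list keyed by upper-case series name, with one
# character vector of cleaned OSD lines per element, e.g.
#   txt <- .get_OSD_HTML(c("Musick", "Chewacla")); head(txt[["MUSICK"]])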

.get_OSD_TXT <- function(series, base_url = "", verbose = FALSE) {
  sapply(series, function(x) {
    fp <- .seriesNameToURL(x, base_url = base_url, extension = "txt")

    if (!file.exists(fp))
      return(NULL)

    # remove empty lines and fix other markup
    try(.stripOSDContents(readLines(fp)), silent = !verbose)
  })
}
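
# illustrative sketch (not run), assuming a hypothetical local directory of OSD
# text files organized as {base_url}{LETTER}/{SERIES}.txt (base_url is prefixed
# verbatim, so include a trailing slash):
#   .get_OSD_TXT("Chewacla", base_url = "osd-text/") reads "osd-text/C/CHEWACLA.txt"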

#' @export
#' @rdname get_OSD
get_OSD_JSON <- function(series, base_url = NULL) {
  # .Deprecated("get_OSD")
  .get_OSD_JSON(series, base_url)
}

.get_OSD_JSON <- function(series, base_url = NULL, verbose = FALSE, fix_ocr_errors = FALSE) {

  # http://github.com/ncss-tech/SoilKnowledgeBase is default JSON repository path
  if (missing(base_url) || is.null(base_url))
    base_url <- "https://raw.githubusercontent.com/ncss-tech/SoilKnowledgeBase/main/inst/extdata/OSD"

  if (!requireNamespace("jsonlite"))
    stop("package `jsonlite` is required", call. = FALSE)

  ## TODO convert to use .seriesNameToFileName()
  
  # convert series name to upper case and remove NA
  series <- toupper(na.omit(series))
  
  # remove white space on left / right side
  series <- trimws(series, which = 'both')
  
  # convert spaces -> underscore
  series <- gsub(pattern = ' ', replacement = '_', x = series, fixed = TRUE)
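  # e.g. " san joaquin " -> "SAN_JOAQUIN"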

  ## TODO if not using convenience function, don't forget apos.
  
  # get first letter of each taxon (if any)
  if (length(series) > 0 && all(nchar(series) > 1)) {
    firstLetter <- substr(series, 1, 1)
  } else stop("argument `series` should be character vector of existing official series names", call. = FALSE)

  # construct URL
  path <- file.path(base_url, firstLetter, paste0(series, ".json"))
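  # e.g. for series "CHEWACLA" with the default base_url:
  #   https://raw.githubusercontent.com/ncss-tech/SoilKnowledgeBase/main/inst/extdata/OSD/C/CHEWACLA.json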

  # query, handle errors, return 'tidy' data.frame result
  data.frame(data.table::rbindlist(lapply(seq_along(path), function(i) {

    p <- path[i]
    jsp <- try(jsonlite::read_json(p), silent = TRUE)

    # warning will be generated for non-existent URL
    if (inherits(jsp, 'try-error'))
      return(NULL)

    jspn <- names(jsp)[!names(jsp) %in% c('SITE','HORIZONS')]
    res <- try({
      data.table::as.data.table(lapply(jspn, function(m) {
        x <- jsp[[m]]
        res2 <- x[[length(x)]]
        if (is.null(res2))
          res2 <- NA
        res2
      }))
    }, silent = FALSE)

    # handle weird cases where a section cannot be converted
    if (inherits(res, 'try-error'))
      return(NULL)

    colnames(res) <- jspn

    jsp$SITE[[1]][[1]]$id <- i
    res$SITE <- list(data.frame(data.table::rbindlist(lapply(jsp$SITE[[1]], data.frame), fill = TRUE)))
    res$HORIZONS <- list(data.frame(data.table::rbindlist(lapply(jsp$HORIZONS[[1]], data.frame), fill = TRUE)))

    # replace NAs ( from toJSON(na='string') )
    # handle type conversion (mixed NA/numerics will be CHARACTER without conversion)
    res <- type.convert(res, na.strings = "NA", as.is = TRUE)
    res$SITE[[1]] <- type.convert(res$SITE[[1]], na.strings = "NA", as.is = TRUE)
    res$HORIZONS[[1]] <- type.convert(res$HORIZONS[[1]], na.strings = "NA", as.is = TRUE)
    
    if (fix_ocr_errors) {
      # specific "easy" fixes for common OCR errors in character columns
      ocr_cols_color <- c('dry_hue', 'dry_value', 'dry_chroma', 'moist_hue', 'moist_value', 'moist_chroma')
      ocr_cols_hzdesgn <- c('name')
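      # e.g. (illustrative cases): dry_hue "l0YR" -> "10YR", dry_value "O" -> "0";
      #      horizon name "0i" -> "Oi", "Btl" -> "Bt1"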
      if (nrow(res$HORIZONS[[1]]) > 0) {
        res$HORIZONS[[1]][ocr_cols_color] <- lapply(res$HORIZONS[[1]][ocr_cols_color], function(chr) {
          # assume in colors hue/value/chroma lowercase L is a 1, and a capital "O" is a zero
          gsub("l", 1, gsub("O", 0, chr))
        })
        res$HORIZONS[[1]][ocr_cols_hzdesgn] <- lapply(res$HORIZONS[[1]][ocr_cols_hzdesgn], function(chr) {
          # assume in horizon desgn lowercase L is a 1, and starts with 0 is the capital letter O
          gsub("l", 1, gsub("^0", "O", chr))
        })
      }
    }
    
    return(res)
  })))

}

## Migrated / adapted from parseOSD repo
## 2021-06-21
## D.E. Beaudette

# generate a link to the OSD for a vector of series names
.seriesNameToURL <- function(s, base_url = 'https://soilseriesdesc.sc.egov.usda.gov/OSD_Docs/',
                             extension = 'html') {
  paste0(base_url, .seriesNameToFileName(s, extension = extension))
}
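
# e.g. (illustration): .seriesNameToURL("Chewacla") ->
#   "https://soilseriesdesc.sc.egov.usda.gov/OSD_Docs/C/CHEWACLA.html"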

# prepare a file name and capitalized-first-letter folder based on a series name
.seriesNameToFileName <- function(s, extension = 'txt') {

  # remove any accidental white space
  s <- trimws(s, which = 'both')
  
  # convert single space to underscore
  s <- gsub(pattern = ' ', replacement = '_', s, fixed = TRUE)

  # caps
  s <- toupper(s)
  
  # TODO: convert apostrophe
  
  res <- file.path(
    substr(s, 1, 1), 
    sprintf('%s.%s', s, extension)
  )
  
  return(res)
}
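
# e.g. (illustration): .seriesNameToFileName("San Joaquin") -> "S/SAN_JOAQUIN.txt"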

# remove empty lines and NA values; convert double-quote inch symbols to " inches"
.stripOSDContents <- function(x) {
  x <- x[which(x != '')]
  x <- x[which(!is.na(x))]

  gsub('"', ' inches', x)

  # convert &nbsp; (\ua0) >- ' '
  res <- gsub('\ua0', ' ', x)

  return(res)
}
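
# e.g. (illustration): .stripOSDContents(c("", 'depth: 10"', NA)) -> "depth: 10 inches"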
