# R/importWaterML1.R

#' Function to return data from the NWISWeb WaterML1.1 service
#'
#' This function accepts a url parameter that already contains the desired
#' NWIS site, parameter code, statistic, startdate and enddate.
#'
#' @param obs_url character or raw, containing the url for the retrieval or a
#' file path to the data file, or raw XML.
#' @param asDateTime logical, if \code{TRUE} returns date and time as POSIXct, if \code{FALSE}, Date
#' @param tz character to set timezone attribute of datetime. Default converts the datetimes to UTC
#' (properly accounting for daylight savings times based on the data's provided tz_cd column).
#' Recommended US values include "UTC", "America/New_York", "America/Chicago", "America/Denver",
#' "America/Los_Angeles", "America/Anchorage", "America/Honolulu", "America/Jamaica", "America/Managua",
#' "America/Phoenix", and "America/Metlakatla".
#' For a complete list, see \url{https://en.wikipedia.org/wiki/List_of_tz_database_time_zones}
#' @return A data frame with the following columns:
#' \tabular{lll}{
#' Name \tab Type \tab Description \cr
#' agency_cd \tab character \tab The NWIS code for the agency reporting the data\cr
#' site_no \tab character \tab The USGS site number \cr
#' dateTime \tab POSIXct \tab The date and time of the value converted to UTC (if asDateTime = TRUE), \cr
#' \tab character \tab or raw character string (if asDateTime = FALSE) \cr
#' tz_cd \tab character \tab The time zone code for dateTime \cr
#' code \tab character \tab Any codes that qualify the corresponding value\cr
#' value \tab numeric \tab The numeric value for the parameter \cr
#' }
#' Note that code and value columns are repeated for each parameter requested. Their names are of the form
#' X_D_P_S, where X is literal,
#' D is an optional description of the parameter,
#' P is the parameter code,
#' and S is the statistic code (if applicable).
#' For example, a daily mean discharge column would be named X_00060_00003.
#'
#' There are also several useful attributes attached to the data frame:
#' \tabular{lll}{
#' Name \tab Type \tab Description \cr
#' url \tab character \tab The url used to generate the data \cr
#' siteInfo \tab data.frame \tab A data frame containing information on the requested sites \cr
#' variableInfo \tab data.frame \tab A data frame containing information on the requested parameters \cr
#' statisticInfo \tab data.frame \tab A data frame containing information on the requested statistics on the data \cr
#' queryTime \tab POSIXct \tab The time the data was returned \cr
#' }
#'
#' @seealso \code{\link{renameNWISColumns}}
#' @export
#' @examplesIf is_dataRetrieval_user()
#' site_id <- "02177000"
#' startDate <- "2012-09-01"
#' endDate <- "2012-10-01"
#' offering <- "00003"
#' property <- "00060"
#' obs_url <- constructNWISURL(site_id, property, startDate, endDate, "dv")
#' \donttest{
#' data <- importWaterML1(obs_url, asDateTime = TRUE)
#'
#' unitDataURL <- constructNWISURL(
#'   site_id, property,
#'   "2013-11-03", "2013-11-03", "uv"
#' )
#' unitData <- importWaterML1(unitDataURL, TRUE)
#'
#' # Two sites, two pcodes, one site has two data descriptors:
#' siteNumber <- c("01480015", "04085427")
#' obs_url <- constructNWISURL(
#'   siteNumber, c("00060", "00010"),
#'   startDate, endDate, "dv"
#' )
#' data <- importWaterML1(obs_url)
#' data$dateTime <- as.Date(data$dateTime)
#' data <- renameNWISColumns(data)
#' names(attributes(data))
#' attr(data, "url")
#' attr(data, "disclaimer")
#'
#' inactiveSite <- "05212700"
#' inactiveSite <- constructNWISURL(inactiveSite, "00060", 
#'                                  "2014-01-01", "2014-01-10", "dv")
#' inactiveSite <- importWaterML1(inactiveSite)
#'
#' inactiveAndActive <- c("07334200", "05212700")
#' inactiveAndActive <- constructNWISURL(inactiveAndActive,
#'                          "00060", "2014-01-01", "2014-01-10", "dv")
#' inactiveAndActive <- importWaterML1(inactiveAndActive)
#'
#' # Timezone change with specified local timezone:
#' tzURL <- constructNWISURL("04027000", c("00300", "63680"), 
#'                           "2011-11-05", "2011-11-07", "uv")
#' tzIssue <- importWaterML1(tzURL,
#'   asDateTime = TRUE, tz = "America/Chicago"
#' )
#'
#' # raw XML
#' url <- constructNWISURL(
#'   service = "dv", siteNumber = "02319300", parameterCd = "00060",
#'   startDate = "2014-01-01", endDate = "2014-01-01"
#' )
#' raw <- httr::content(httr::GET(url), as = "raw")
#' rawParsed <- importWaterML1(raw)
#' }
#' filePath <- system.file("extdata", package = "dataRetrieval")
#' fileName <- "WaterML1Example.xml"
#' fullPath <- file.path(filePath, fileName)
#' importFile <- importWaterML1(fullPath, TRUE)
#'
importWaterML1 <- function(obs_url, asDateTime = FALSE, tz = "UTC") {
  # note: obs_url is a legacy argument name; the input does not have to be a URL or file path

  returnedDoc <- check_if_xml(obs_url)
  raw <- !is.character(obs_url)

  if (tz == "") { # check tz is valid if supplied
    tz <- "UTC"
  }
  tz <- match.arg(tz, OlsonNames())
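  # A quick sketch of the validation above (values illustrative):
  #   match.arg("America/Chicago", OlsonNames()) # returns "America/Chicago"
  #   match.arg("America/Chcago", OlsonNames())  # stops with an informative error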

  timeSeries <- xml2::xml_find_all(returnedDoc, ".//ns1:timeSeries") # each parameter/site combo

  # some initial attributes
  queryNodes <- xml2::xml_children(xml2::xml_find_all(returnedDoc, ".//ns1:queryInfo"))
  notes <- queryNodes[xml2::xml_name(queryNodes) == "note"]
  noteTitles <- xml2::xml_attrs(notes)
  noteText <- xml2::xml_text(notes)
  noteList <- as.list(noteText)
  names(noteList) <- noteTitles
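  # noteList is now a named list keyed by note title; titles and contents vary
  # by query, but (hypothetically) might look like:
  #   noteList[["filter:sites"]] # e.g. "02177000"
  #   noteList[["disclaimer"]]   # the provisional-data disclaimer text, used below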

  if (length(timeSeries) == 0) {
    df <- data.frame(
      agency_cd = character(),
      site_no = character(),
      dateTime = as.POSIXct(character()),
      tz_cd = character()
    )
    attr(df, "queryInfo") <- noteList
    if (!raw) {
      attr(df, "url") <- obs_url
    }
    return(df)
  }

  mergedDF <- NULL

  for (t in timeSeries) {
    # check if there are multiple time series (ie with different descriptors)
    # descriptor will be appended to col name if so
    valParents <- xml2::xml_find_all(t, ".//ns1:values")
    obsDF <- NULL
    useMethodDesc <- FALSE
    if (length(valParents) > 1) useMethodDesc <- TRUE # append the method description to colnames later

    sourceInfo <- xml2::xml_children(xml2::xml_find_all(t, ".//ns1:sourceInfo"))
    variable <- xml2::xml_children(xml2::xml_find_all(t, ".//ns1:variable"))
    agency_cd <- xml2::xml_attr(sourceInfo[xml2::xml_name(sourceInfo) == "siteCode"], "agencyCode")
    pCode <- xml2::xml_text(variable[xml2::xml_name(variable) == "variableCode"])
    statCode <- xml2::xml_attr(xml2::xml_find_all(variable, ".//ns1:option"), "optionCode")

    # site info
    srsNode <- xml2::xml_find_all(sourceInfo, ".//ns1:geogLocation")
    srs <- xml2::xml_attr(srsNode, "srs")
    locNodes <- xml2::xml_children(srsNode)
    locNames <- xml2::xml_name(locNodes)
    locText <- as.numeric(xml2::xml_text(locNodes))
    names(locText) <- sub("longitude", "dec_lon_va", sub("latitude", "dec_lat_va", locNames))
    sitePropNodes <- sourceInfo[xml2::xml_name(sourceInfo) == "siteProperty"]
    siteProp <- xml2::xml_text(sitePropNodes)
    names(siteProp) <- xml2::xml_attr(sitePropNodes, "name")
    tzInfo <- unlist(xml2::xml_attrs(xml2::xml_find_all(sourceInfo, "ns1:defaultTimeZone")))
    siteName <- xml2::xml_text(sourceInfo[xml2::xml_name(sourceInfo) == "siteName"])
    siteCodeNode <- sourceInfo[xml2::xml_name(sourceInfo) == "siteCode"]
    site_no <- xml2::xml_text(siteCodeNode)
    siteCodeAtt <- unlist(xml2::xml_attrs(siteCodeNode))
    siteDF <- cbind.data.frame(t(locText), t(tzInfo),
      station_nm = siteName,
      t(siteCodeAtt), srs, t(siteProp),
      site_no, stringsAsFactors = FALSE
    )
    defaultTZ <- xml2::xml_attr(xml2::xml_find_all(sourceInfo, ".//ns1:defaultTimeZone"), "zoneAbbreviation")

    for (v in valParents) {
      obsColName <- paste(pCode, statCode, sep = "_")
      obs <- xml2::xml_find_all(v, ".//ns1:value")
      values <- as.numeric(xml2::xml_text(obs)) # actual observations

      nObs <- length(values)
      qual <- xml2::xml_attr(obs, "qualifiers")


      dateTime <- xml2::xml_attr(obs, "dateTime")
      if (asDateTime) {
        numChar <- nchar(dateTime)
        dateTime <- lubridate::parse_date_time(dateTime, c(
          "%Y", "%Y-%m-%d", "%Y-%m-%dT%H:%M",
          "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%OS",
          "%Y-%m-%dT%H:%M:%OS%z"
        ), exact = TRUE)
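        # parse_date_time() with exact = TRUE requires each string to fully match
        # one of the formats above. A minimal sketch (illustrative input):
        #   lubridate::parse_date_time("2012-09-01T10:15:00.000-05:00",
        #                              "%Y-%m-%dT%H:%M:%OS%z", exact = TRUE)
        #   #> "2012-09-01 15:15:00 UTC"  (the %z offset is folded into UTC)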
        if (any(numChar < 20) && any(numChar > 16)) {
          # not sure there is still a case for this (no offset on times)?
          dateTime[numChar < 20 & numChar > 16] <- dateTime[numChar < 20 & numChar > 16] +
            offsetLibrary[offsetLibrary$code == defaultTZ, "offset"] * 60 * 60
          warning(paste("site", site_no[1], "had data without time zone offsets, so DST could not be accounted for"))
        }

        # ^^setting tz in as.POSIXct just sets the attribute, does not convert the time!
        # Set to UTC now, convert the whole thing to tz later.
        attr(dateTime, "tzone") <- "UTC"
        tzCol <- rep(tz, nObs)
      } else {
        tzCol <- rep(defaultTZ, nObs)
      }
      # create column names, addressing if methodDesc is needed
      if (useMethodDesc) {
        methodDesc <- xml2::xml_text(xml2::xml_find_all(v, ".//ns1:methodDescription"))
        # this keeps column names consistent with old version
        methodDesc <- gsub("\\[|\\]| |\\(|\\)", ".", methodDesc)

        # sometimes methodDesc is empty
        if (nchar(methodDesc) > 0) {
          obsColName <- paste("X", methodDesc, obsColName, sep = "_")
        } else {
          obsColName <- paste("X", obsColName, sep = "_")
        }
      } else {
        obsColName <- paste("X", obsColName, sep = "_")
      }
      qualColName <- paste(obsColName, "cd", sep = "_")
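      # Sketch of the resulting names for daily mean discharge (pCode 00060,
      # statCode 00003) with no method description:
      #   obsColName:  "X_00060_00003"
      #   qualColName: "X_00060_00003_cd"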

      valParentDF <- cbind.data.frame(dateTime, values, qual, tzCol, stringsAsFactors = FALSE)
      names(valParentDF) <- c("dateTime", obsColName, qualColName, "tz_cd")
      # drop the qualifier column if it is entirely NA
      if (all(is.na(valParentDF[[qualColName]]))) {
        valParentDF <- valParentDF[, c("dateTime", obsColName, "tz_cd")]
      }
      if (nrow(valParentDF) > 0) {
        if (is.null(obsDF)) {
          obsDF <- valParentDF
        } else {
          obsDF <- merge(
            x = obsDF,
            y = valParentDF,
            by = c("dateTime", "tz_cd"),
            all = TRUE
          )
        }
      } else {
        # need column names for joining later
        # but don't overwrite:
        if (is.null(obsDF)) {
          obsDF <- data.frame(dateTime = character(0), tz_cd = character(0), stringsAsFactors = FALSE)
          if (asDateTime) {
            obsDF$dateTime <- as.POSIXct(obsDF$dateTime)
            attr(obsDF$dateTime, "tzone") <- "UTC"
          }
        }
      }
    }

    if (is.null(obsDF)) {
      mergedSite <- data.frame()
      next
    }
    nObs <- nrow(obsDF)

    # statistic info
    options <- xml2::xml_find_all(variable, "ns1:option")
    stat <- options[xml2::xml_attr(options, "name") == "Statistic"]
    stat_nm <- xml2::xml_text(stat) # the option text holds the statistic name
    statCd <- xml2::xml_attr(stat, "optionCode")
    statDF <- cbind.data.frame(
      statisticCd = statCd,
      statisticName = stat_nm,
      stringsAsFactors = FALSE
    )

    # variable info
    varText <- as.data.frame(t(xml2::xml_text(variable)), stringsAsFactors = FALSE)
    varNames <- xml2::xml_name(variable)
    varNames <- sub("unit", "param_unit", varNames) # rename to stay consistent with orig importWaterMl1
    names(varText) <- varNames

    # replace no data vals with NA, change attribute df
    noDataVal <- as.numeric(varText$noDataValue)
    if (nObs > 0 && obsColName %in% names(obsDF)) {
      obsDF[[obsColName]][obsDF[[obsColName]] == noDataVal] <- NA
    }
    varText$noDataValue <- NA

    # rep site no & agency, combine into DF
    obsDFrows <- nrow(obsDF)
    df <- cbind.data.frame(
      agency_cd = rep(agency_cd, obsDFrows),
      site_no = rep(site_no, obsDFrows),
      obsDF, stringsAsFactors = FALSE
    )

    # join by site no
    # append siteInfo, stat, and variable if they don't match a previous one
    if (is.null(mergedDF)) {
      mergedDF <- df
      mergedSite <- siteDF
      mergedVar <- varText
      mergedStat <- statDF
    } else {
      if (nrow(df) > 0) {
        # merge separately with any same site nos, then recombine
        sameSite <- mergedDF[mergedDF$site_no == site_no, ]
        if (nrow(sameSite) > 0) {
          diffSite <- mergedDF[mergedDF$site_no != site_no, ]
          # first need to delete the obs and qual columns if they have already been filled with NA
          deleteCols <- grepl(obsColName, colnames(sameSite))
          sameSite <- sameSite[, !deleteCols]
          sameSite_simNames <- intersect(colnames(sameSite), colnames(df))
          sameSite <- merge(
            x = sameSite,
            y = df,
            by = sameSite_simNames,
            all = TRUE
          )
          na.omit.unique <- function(x) {
            if (all(is.na(x))) NA else stats::na.omit(x)
          }
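          # na.omit.unique() collapses duplicate rows in the aggregate() call below:
          # it keeps the non-missing values of a column, or NA when all are missing.
          # Illustrative behavior:
          #   na.omit.unique(c(NA, 5))  # 5
          #   na.omit.unique(c(NA, NA)) # NA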

          if (any(duplicated(sameSite[, c(
            "agency_cd", "site_no",
            "dateTime", "tz_cd"
          )]))) {
            types <- lapply(sameSite, class)
            sameSite <- stats::aggregate(. ~ agency_cd + site_no + dateTime + tz_cd,
              data = sameSite,
              FUN = na.omit.unique, na.action = NULL
            )
            sameSite[, which(types == "numeric")] <- sapply(sameSite[, which(types == "numeric")], as.numeric)
            sameSite[, which(types == "character")] <- sapply(sameSite[, which(types == "character")], as.character)
          }

          sameSite <- sameSite[order(as.Date(sameSite$dateTime)), ]

          mergedDF <- r_bind_dr(sameSite, diffSite)
        } else {
          similarNames <- intersect(colnames(mergedDF), colnames(df))
          mergedDF <- merge(
            x = mergedDF,
            y = df,
            by = similarNames,
            all = TRUE
          )
        }
      }
      mergedSite <- merge(
        x = mergedSite,
        y = siteDF,
        by = colnames(mergedSite),
        all = TRUE
      )
      mergedVar <- merge(
        x = mergedVar,
        y = varText,
        by = colnames(mergedVar),
        all = TRUE
      )
      mergedStat <- merge(
        x = mergedStat,
        y = statDF,
        by = colnames(mergedStat),
        all = TRUE
      )
    }
  }

  if (!is.null(mergedSite)) {
    # keep attribute df names the same as old version
    names(mergedSite) <- c(
      "dec_lat_va", "dec_lon_va", "timeZoneOffset", "timeZoneAbbreviation",
      "station_nm", "network", "agency_cd", "srs", "siteTypeCd",
      "hucCd", "stateCd", "countyCd", "site_no"
    )
    mergedSite <- mergedSite[c(
      "station_nm", "site_no", "agency_cd", "timeZoneOffset",
      "timeZoneAbbreviation", "dec_lat_va", "dec_lon_va", "srs", "siteTypeCd",
      "hucCd", "stateCd", "countyCd", "network"
    )]
  }

  # move tz column to far right and sort by increasing site number to be consistent with old version
  mergedNames <- names(mergedDF)
  tzLoc <- grep("tz_cd", names(mergedDF))
  attr(mergedDF$dateTime, "tzone") <- tz
  mergedDF <- mergedDF[c(mergedNames[-tzLoc], mergedNames[tzLoc])]

  mergedDF <- mergedDF[order(mergedDF$site_no, mergedDF$dateTime), ]
  ###############################################################
  names(mergedDF) <- make.names(names(mergedDF))

  # attach other site info etc as attributes of mergedDF
  if (!raw) {
    attr(mergedDF, "url") <- obs_url
  }
  attr(mergedDF, "siteInfo") <- mergedSite
  attr(mergedDF, "variableInfo") <- mergedVar
  attr(mergedDF, "disclaimer") <- noteText[noteTitles == "disclaimer"]
  attr(mergedDF, "statisticInfo") <- mergedStat
  attr(mergedDF, "queryTime") <- Sys.time()

  return(mergedDF)
}

r_bind_dr <- function(df1, df2) {

  # Note: this function doesn't retain factors/levels.
  # That is not a problem for any dataRetrieval function,
  # but if this function gets used elsewhere,
  # that should be addressed.
  df1 <- add_empty_col(df1, df2, setdiff(names(df2), names(df1)))
  df2 <- add_empty_col(df2, df1, setdiff(names(df1), names(df2)))

  df3 <- rbind(df1, df2)

  return(df3)
}
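
# A minimal sketch of r_bind_dr() with illustrative data: columns missing from
# one data frame are added (filled with NA) before the rbind, so data frames
# with mismatched columns still bind cleanly.
#   df1 <- data.frame(site_no = "01", X_00060_00003 = 5, stringsAsFactors = FALSE)
#   df2 <- data.frame(site_no = "02", X_00010_00003 = 12.5, stringsAsFactors = FALSE)
#   r_bind_dr(df1, df2)
#   #>   site_no X_00060_00003 X_00010_00003
#   #> 1      01             5            NA
#   #> 2      02            NA          12.5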

add_empty_col <- function(df, df_ref, col_names) {
  if (length(col_names) > 0) {
    if (nrow(df) > 0) {
      df[, col_names] <- NA
    } else {
      for (i in col_names) {
        column_type <- class(df_ref[[i]])
        df[i] <- empty_col(column_type)
      }
    }
  }
  return(df)
}
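
# When df already has rows, plain NA padding is fine; when df has zero rows,
# empty_col() builds a typed zero-length column from the reference data frame
# so the column classes survive. A sketch (hypothetical inputs):
#   add_empty_col(data.frame(a = character(0)), data.frame(b = Sys.time()), "b")
#   # a zero-row data frame with character column "a" and POSIXct column "b"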

empty_col <- function(column_type) {
  if (all(column_type %in% c("POSIXct", "POSIXt"))) {
    column_type <- "POSIXct"
  }

  col_return <- switch(column_type,
    "numeric" = as.numeric(),
    "factor" = as.factor(),
    "list" = list(),
    "integer" = as.integer(),
    "Date" = as.Date(numeric(), origin = "1970-01-01"),
    "POSIXct" = as.POSIXct(numeric(), origin = "1970-01-01"),
    "POSIXlt" = as.POSIXlt(numeric(), origin = "1970-01-01"),
    "character" = as.character()
  )

  return(col_return)
}
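
# Illustrative returns, all zero-length vectors of the requested class:
#   empty_col("numeric")              # numeric(0)
#   empty_col(c("POSIXct", "POSIXt")) # zero-length POSIXct vector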

check_if_xml <- function(obs_url) {
  if (is.character(obs_url) && file.exists(obs_url)) {
    returnedDoc <- xml2::read_xml(obs_url)
  } else if (is.raw(obs_url)) {
    returnedDoc <- xml2::read_xml(obs_url)
  } else if (inherits(obs_url, c("xml_node", "xml_nodeset"))) {
    returnedDoc <- obs_url
  } else {
    doc <- getWebServiceData(obs_url, encoding = "gzip")
    if (is.null(doc)) {
      return(invisible(NULL))
    }
    returnedDoc <- xml2::xml_root(doc)
  }
  return(returnedDoc)
}
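
# check_if_xml() dispatch, sketched with the packaged example file (the same
# file the roxygen examples above use):
#   fullPath <- file.path(system.file("extdata", package = "dataRetrieval"),
#                         "WaterML1Example.xml")
#   check_if_xml(fullPath)                  # existing file path -> xml2::read_xml()
#   check_if_xml(xml2::read_xml(fullPath))  # xml document/node passes through unchanged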