R/importWaterML1.R

Defines functions check_if_xml empty_col add_empty_col r_bind_dr importWaterML1

Documented in importWaterML1

#' Function to return data from the NWISWeb WaterML1.1 service
#'
#' This function accepts a url parameter that already contains the desired
#' NWIS site, parameter code, statistic, startdate and enddate.
#'
#' @param obs_url character or raw, containing the url for the retrieval or a
#' file path to the data file, or raw XML.
#' @param asDateTime logical, if `TRUE` returns date and time as POSIXct, if `FALSE`, Date
#' @param tz character to set timezone attribute of datetime. Default converts the datetimes to UTC
#' (properly accounting for daylight savings times based on the data's provided tz_cd column).
#' Recommended US values include "UTC", "America/New_York", "America/Chicago", "America/Denver",
#' "America/Los_Angeles", "America/Anchorage", "America/Honolulu", "America/Jamaica", "America/Managua",
#' "America/Phoenix", and "America/Metlakatla".
#' For a complete list, see <https://en.wikipedia.org/wiki/List_of_tz_database_time_zones>
#' @return A data frame with the following columns:
#' \tabular{lll}{
#' Name \tab Type \tab Description \cr
#' agency_cd \tab character \tab The NWIS code for the agency reporting the data\cr
#' site_no \tab character \tab The USGS site number \cr
#'  \tab POSIXct \tab The date and time of the value converted to UTC (if asDateTime = TRUE), \cr
#' \tab character \tab or raw character string (if asDateTime = FALSE) \cr
#' tz_cd \tab character \tab The time zone code for  \cr
#' code \tab character \tab Any codes that qualify the corresponding value\cr
#' value \tab numeric \tab The numeric value for the parameter \cr
#' }
#' Note that code and value are repeated for the parameters requested. The names are of the form
#' X_D_P_S, where X is literal,
#' D is an option description of the parameter,
#' P is the parameter code,
#' and S is the statistic code (if applicable).
#'
#' There are also several useful attributes attached to the data frame:
#' \tabular{lll}{
#' Name \tab Type \tab Description \cr
#' url \tab character \tab The url used to generate the data \cr
#' siteInfo \tab data.frame \tab A data frame containing information on the requested sites \cr
#' variableInfo \tab data.frame \tab A data frame containing information on the requested parameters \cr
#' statisticInfo \tab data.frame \tab A data frame containing information on the requested statistics on the data \cr
#' queryTime \tab POSIXct \tab The time the data was returned \cr
#' }
#'
#' @seealso [renameNWISColumns()]
#' @export
#' @examplesIf is_dataRetrieval_user()
#' site_id <- "02177000"
#' startDate <- "2012-09-01"
#' endDate <- "2012-10-01"
#' offering <- "00003"
#' property <- "00060"
#' obs_url <- constructNWISURL(site_id, property, startDate, endDate, "dv")
#' \donttest{
#' data <- importWaterML1(obs_url, asDateTime = TRUE)
#'
#'
#' unitDataURL <- constructNWISURL(
#'   site_id, property,
#'   "2013-11-03", "2013-11-03", "uv"
#' )
#' unitData <- importWaterML1(unitDataURL, TRUE)
#'
#' # Two sites, two pcodes, one site has two data descriptors:
#' siteNumber <- c("01480015", "04085427")
#' obs_url <- constructNWISURL(
#'   siteNumber, c("00060", "00010"),
#'   startDate, endDate, "dv"
#' )
#' data <- importWaterML1(obs_url)
#' data$dateTime <- as.Date(data$dateTime)
#' data <- renameNWISColumns(data)
#' names(attributes(data))
#' attr(data, "url")
#' attr(data, "disclaimer")
#'
#' inactiveSite <- "05212700"
#' inactiveSite <- constructNWISURL(inactiveSite, "00060",
#'                                  "2014-01-01", "2014-01-10", "dv")
#' inactiveSite <- importWaterML1(inactiveSite)
#'
#' inactiveAndAcitive <- c("07334200", "05212700")
#' inactiveAndAcitive <- constructNWISURL(inactiveAndAcitive,
#'                          "00060", "2014-01-01", "2014-01-10", "dv")
#' inactiveAndAcitive <- importWaterML1(inactiveAndAcitive)
#'
#' # Timezone change with specified local timezone:
#' tzURL <- constructNWISURL("04027000", c("00300", "63680"),
#'                           "2011-11-05", "2011-11-07", "uv")
#' tzIssue <- importWaterML1(tzURL,
#'   asDateTime = TRUE, tz = "America/Chicago"
#' )
#'
#' }
#' filePath <- system.file("extdata", package = "dataRetrieval")
#' fileName <- "WaterML1Example.xml"
#' fullPath <- file.path(filePath, fileName)
#' importFile <- importWaterML1(fullPath, TRUE)
#'
importWaterML1 <- function(obs_url, asDateTime = FALSE, tz = "UTC") {
  # note: obs_url is a dated name, does not have to be a url/path

  returnedDoc <- check_if_xml(obs_url)
  raw <- !is.character(obs_url) & !("httr2_request" %in% class(obs_url))

  if (tz == "") {
    # check tz is valid if supplied
    tz <- "UTC"
  }

  tz <- match.arg(tz, OlsonNames())

  timeSeries <- xml2::xml_find_all(returnedDoc, ".//ns1:timeSeries") # each parameter/site combo

  # some initial attributes
  queryNodes <- xml2::xml_children(xml2::xml_find_all(
    returnedDoc,
    ".//ns1:queryInfo"
  ))
  notes <- queryNodes[xml2::xml_name(queryNodes) == "note"]
  noteTitles <- xml2::xml_attrs(notes)
  noteText <- xml2::xml_text(notes)
  noteList <- as.list(noteText)
  names(noteList) <- noteTitles

  if (0 == length(timeSeries)) {
    df <- data.frame(
      agency_cd = character(),
      site_no = character(),
      dateTime = as.POSIXct(character()),
      tz_cd = character()
    )
    attr(df, "queryInfo") <- noteList
    if (!raw) {
      attr(df, "url") <- obs_url$url
    }
    return(df)
  }

  mergedDF <- NULL

  for (t in timeSeries) {
    # check if there are multiple time series (ie with different descriptors)
    # descriptor will be appended to col name if so
    valParents <- xml2::xml_find_all(t, ".//ns1:values")
    obsDF <- NULL
    useMethodDesc <- FALSE
    if (length(valParents) > 1) {
      useMethodDesc <- TRUE
    } # append the method description to colnames later

    sourceInfo <- xml2::xml_children(xml2::xml_find_all(t, ".//ns1:sourceInfo"))
    variable <- xml2::xml_children(xml2::xml_find_all(t, ".//ns1:variable"))
    agency_cd <- xml2::xml_attr(
      sourceInfo[xml2::xml_name(sourceInfo) == "siteCode"],
      "agencyCode"
    )
    pCode <- xml2::xml_text(variable[
      xml2::xml_name(variable) == "variableCode"
    ])
    statCode <- xml2::xml_attr(
      xml2::xml_find_all(variable, ".//ns1:option"),
      "optionCode"
    )

    # site info
    srsNode <- xml2::xml_find_all(sourceInfo, ".//ns1:geogLocation")
    srs <- xml2::xml_attr(srsNode, "srs")
    locNodes <- xml2::xml_children(srsNode)
    locNames <- xml2::xml_name(locNodes)
    locText <- as.numeric(xml2::xml_text(locNodes))
    names(locText) <- sub(
      "longitude",
      "dec_lon_va",
      sub("latitude", "dec_lat_va", locNames)
    )
    sitePropNodes <- sourceInfo[xml2::xml_name(sourceInfo) == "siteProperty"]
    siteProp <- xml2::xml_text(sitePropNodes)
    names(siteProp) <- xml2::xml_attr(sitePropNodes, "name")
    tzInfo <- unlist(xml2::xml_attrs(xml2::xml_find_all(
      sourceInfo,
      "ns1:defaultTimeZone"
    )))
    siteName <- xml2::xml_text(sourceInfo[
      xml2::xml_name(sourceInfo) == "siteName"
    ])
    siteCodeNode <- sourceInfo[xml2::xml_name(sourceInfo) == "siteCode"]
    site_no <- xml2::xml_text(siteCodeNode)
    siteCodeAtt <- unlist(xml2::xml_attrs(siteCodeNode))
    siteDF <- cbind.data.frame(
      t(locText),
      t(tzInfo),
      station_nm = siteName,
      t(siteCodeAtt),
      srs,
      t(siteProp),
      site_no,
      stringsAsFactors = FALSE
    )
    defaultTZ <- xml2::xml_attr(
      xml2::xml_find_all(sourceInfo, ".//ns1:defaultTimeZone"),
      "zoneAbbreviation"
    )

    for (v in valParents) {
      obsColName <- paste(pCode, statCode, sep = "_")
      obs <- xml2::xml_find_all(v, ".//ns1:value")
      values <- as.numeric(xml2::xml_text(obs)) # actual observations

      nObs <- length(values)
      qual <- xml2::xml_attr(obs, "qualifiers")

      dateTime <- xml2::xml_attr(obs, "dateTime")
      if (asDateTime) {
        numChar <- nchar(dateTime)
        dateTime <- lubridate::parse_date_time(
          dateTime,
          c(
            "%Y",
            "%Y-%m-%d",
            "%Y-%m-%dT%H:%M",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%OS",
            "%Y-%m-%dT%H:%M:%OS%z"
          ),
          exact = TRUE
        )
        if (any(numChar < 20) && any(numChar > 16)) {
          # not sure there is still a case for this (no offset on times)?
          dateTime[numChar < 20 & numChar > 16] <- dateTime[
            numChar < 20 & numChar > 16
          ] +
            offsetLibrary[offsetLibrary$code == defaultTZ, "offset"] * 60 * 60
          warning(paste(
            "site",
            site_no[1],
            "had data without time zone offsets, so DST could not be accounted for"
          ))
        }

        # ^^setting tz in as.POSIXct just sets the attribute, does not convert the time!
        # Set to UTC now, convert the whole thing to tz later.
        attr(dateTime, "tzone") <- "UTC"
        tzCol <- rep(tz, nObs)
      } else {
        tzCol <- rep(defaultTZ, nObs)
      }
      # create column names, addressing if methodDesc is needed
      if (useMethodDesc) {
        methodDesc <- xml2::xml_text(xml2::xml_find_all(
          v,
          ".//ns1:methodDescription"
        ))
        # this keeps column names consistent with old version
        methodDesc <- gsub("\\[|\\]| |\\(|\\)", ".", methodDesc)

        # sometimes methodDesc is empty
        if (nchar(methodDesc) > 0) {
          obsColName <- paste("X", methodDesc, obsColName, sep = "_")
        } else {
          obsColName <- paste("X", obsColName, sep = "_")
        }
      } else {
        obsColName <- paste("X", obsColName, sep = "_")
      }
      qualColName <- paste(obsColName, "cd", sep = "_")

      valParentDF <- cbind.data.frame(
        dateTime,
        values,
        qual,
        tzCol,
        stringsAsFactors = FALSE
      )
      names(valParentDF) <- c("dateTime", obsColName, qualColName, "tz_cd")
      # delete qual column if all NA
      if (all(is.na(valParentDF[, eval(qualColName)]))) {
        valParentDF <- subset(
          valParentDF,
          select = c("dateTime", eval(obsColName), "tz_cd")
        )
      }
      if (nrow(valParentDF) > 0) {
        if (is.null(obsDF)) {
          obsDF <- valParentDF
        } else {
          obsDF <- merge(
            x = obsDF,
            y = valParentDF,
            by = c("dateTime", "tz_cd"),
            all = TRUE
          )
        }
      } else {
        # need column names for joining later
        # but don't overwrite:
        if (is.null(obsDF)) {
          obsDF <- data.frame(
            dateTime = character(0),
            tz_cd = character(0),
            stringsAsFactors = FALSE
          )
          if (asDateTime) {
            obsDF$dateTime <- as.POSIXct(obsDF$dateTime)
            attr(obsDF$dateTime, "tzone") <- "UTC"
          }
        }
      }
    }

    if (is.null(obsDF)) {
      mergedSite <- data.frame()
      next
    }
    nObs <- nrow(obsDF)

    # statistic info
    options <- xml2::xml_find_all(variable, "ns1:option")
    stat <- options[xml2::xml_attr(options, "name") == "Statistic"]
    stat_nm <- xml2::xml_text(options[
      xml2::xml_attr(stat, "name") == "Statistic"
    ])
    statCd <- xml2::xml_attr(stat, "optionCode")
    statDF <- cbind.data.frame(
      statisticCd = statCd,
      statisticName = stat_nm,
      stringsAsFactors = FALSE
    )

    # variable info
    varText <- as.data.frame(
      t(xml2::xml_text(variable)),
      stringsAsFactors = FALSE
    )
    varNames <- xml2::xml_name(variable)
    varNames <- sub("unit", "param_unit", varNames) # rename to stay consistent with orig importWaterMl1
    names(varText) <- varNames

    # replace no data vals with NA, change attribute df
    noDataVal <- as.numeric(varText$noDataValue)
    if (nObs > 0 && obsColName %in% names(obsDF)) {
      obsDF[[obsColName]][obsDF[[obsColName]] == noDataVal] <- NA
    }
    varText$noDataValue <- NA

    # rep site no & agency, combine into DF
    obsDFrows <- nrow(obsDF)
    df <- cbind.data.frame(
      agency_cd = rep(agency_cd, obsDFrows),
      site_no = rep(site_no, obsDFrows),
      obsDF,
      stringsAsFactors = FALSE
    )

    # join by site no
    # append siteInfo, stat, and variable if they don't match a previous one
    if (is.null(mergedDF)) {
      mergedDF <- df
      mergedSite <- siteDF
      mergedVar <- varText
      mergedStat <- statDF
    } else {
      if (nrow(df) > 0) {
        # merge separately with any same site nos, then recombine
        sameSite <- mergedDF[mergedDF$site_no == site_no, ]
        if (nrow(sameSite) > 0) {
          diffSite <- mergedDF[mergedDF$site_no != site_no, ]
          # first need to delete the obs and qual columns if they have already been filled with NA
          deleteCols <- grepl(obsColName, colnames(sameSite))
          sameSite <- sameSite[, !deleteCols]
          sameSite_simNames <- intersect(colnames(sameSite), colnames(df))
          sameSite <- merge(
            x = sameSite,
            y = df,
            by = sameSite_simNames,
            all = TRUE
          )
          na.omit.unique <- function(x) {
            if (all(is.na(x))) NA else stats::na.omit(x)
          }

          if (
            any(duplicated(sameSite[, c(
              "agency_cd",
              "site_no",
              "dateTime",
              "tz_cd"
            )]))
          ) {
            types <- lapply(sameSite, class)
            sameSite <- stats::aggregate(
              . ~ agency_cd + site_no + dateTime + tz_cd,
              data = sameSite,
              FUN = na.omit.unique,
              na.action = NULL
            )
            sameSite[, which(types == "numeric")] <- sapply(
              sameSite[, which(types == "numeric")],
              as.numeric
            )
            sameSite[, which(types == "character")] <- sapply(
              sameSite[, which(types == "character")],
              as.character
            )
          }

          sameSite <- sameSite[order(as.Date(sameSite$dateTime)), ]

          mergedDF <- r_bind_dr(sameSite, diffSite)
        } else {
          similarNames <- intersect(colnames(mergedDF), colnames(df))
          mergedDF <- merge(
            x = mergedDF,
            y = df,
            by = similarNames,
            all = TRUE
          )
        }
      }
      mergedSite <- merge(
        x = mergedSite,
        y = siteDF,
        by = colnames(mergedSite),
        all = TRUE
      )
      mergedVar <- merge(
        x = mergedVar,
        y = varText,
        by = colnames(mergedVar),
        all = TRUE
      )
      mergedStat <- merge(
        x = mergedStat,
        y = statDF,
        by = colnames(mergedStat),
        all = TRUE
      )
    }
  }

  if (!is.null(mergedSite)) {
    # keep attribute df names the same as old version
    names(mergedSite) <- c(
      "dec_lat_va",
      "dec_lon_va",
      "timeZoneOffset",
      "timeZoneAbbreviation",
      "station_nm",
      "network",
      "agency_cd",
      "srs",
      "siteTypeCd",
      "hucCd",
      "stateCd",
      "countyCd",
      "site_no"
    )
    mergedSite <- mergedSite[c(
      "station_nm",
      "site_no",
      "agency_cd",
      "timeZoneOffset",
      "timeZoneAbbreviation",
      "dec_lat_va",
      "dec_lon_va",
      "srs",
      "siteTypeCd",
      "hucCd",
      "stateCd",
      "countyCd",
      "network"
    )]
  }

  # move tz column to far right and sort by increasing site number to be consistent with old version
  mergedNames <- names(mergedDF)
  tzLoc <- grep("tz_cd", names(mergedDF))
  attr(mergedDF$dateTime, "tzone") <- tz
  mergedDF <- mergedDF[c(mergedNames[-tzLoc], mergedNames[tzLoc])]

  mergedDF <- mergedDF[order(mergedDF$site_no, mergedDF$dateTime), ]
  ###############################################################
  names(mergedDF) <- make.names(names(mergedDF))

  # attach other site info etc as attributes of mergedDF
  if ("httr2_request" %in% class(obs_url)) {
    attr(mergedDF, "url") <- obs_url$url
  }
  attr(mergedDF, "siteInfo") <- mergedSite
  attr(mergedDF, "variableInfo") <- mergedVar
  attr(mergedDF, "disclaimer") <- noteText[noteTitles == "disclaimer"]
  attr(mergedDF, "statisticInfo") <- mergedStat
  attr(mergedDF, "queryTime") <- Sys.time()

  return(mergedDF)
}

r_bind_dr <- function(df1, df2) {
  # Note...this function doesn't retain factors/levels
  # That is not a problem with any dataRetrieval function
  # but, if this function gets used else-where,
  # that should be addressed.
  df1 <- add_empty_col(df1, df2, setdiff(names(df2), names(df1)))
  df2 <- add_empty_col(df2, df1, setdiff(names(df1), names(df2)))

  df3 <- rbind(df1, df2)

  return(df3)
}

add_empty_col <- function(df, df_ref, col_names) {
  if (length(col_names) > 0) {
    if (nrow(df) > 0) {
      df[, col_names] <- NA
    } else {
      for (i in col_names) {
        column_type <- class(df_ref[[i]])
        df[i] <- empty_col(column_type)
      }
    }
  }
  return(df)
}

empty_col <- function(column_type) {
  if (all(column_type %in% c("POSIXct", "POSIXt"))) {
    column_type <- "POSIXct"
  }

  col_return <- switch(
    column_type,
    "numeric" = as.numeric(),
    "factor" = as.factor(),
    "list" = list(),
    "integer" = as.integer(),
    "Date" = as.Date(numeric(), origin = "1970-01-01"),
    "POSIXct" = as.POSIXct(numeric(), origin = "1970-01-01"),
    "POSIXlt" = as.POSIXlt(numeric(), origin = "1970-01-01"),
    "character" = as.character()
  )

  return(col_return)
}

check_if_xml <- function(obs_url) {
  if (is.character(obs_url) && file.exists(obs_url)) {
    returnedDoc <- xml2::read_xml(obs_url)
  } else if (is.raw(obs_url)) {
    returnedDoc <- xml2::read_xml(obs_url)
  } else if (inherits(obs_url, c("xml_node", "xml_nodeset"))) {
    returnedDoc <- obs_url
  } else {
    doc <- getWebServiceData(obs_url)
    if (is.null(doc)) {
      return(invisible(NULL))
    }
    returnedDoc <- xml2::xml_root(doc)
  }
  return(returnedDoc)
}

Try the dataRetrieval package in your browser

Any scripts or data that you put into this service are public.

dataRetrieval documentation built on March 31, 2026, 5:07 p.m.