R/utils.R
In worldmet: Import Surface Meteorological Data from NOAA Integrated Surface Database (ISD)

#' Copy of [openair::timeAverage()] for hourly averaging, simplified for
#' specific use in worldmet
#' @noRd
worldmet_time_average <- function(
  mydata,
  avg.time = "day",
  type = "default"
) {
  # extract variables of interest
  vars <- unique(c("date", names(mydata)))
  mydata <- check_prep(
    mydata,
    vars,
    type = "default",
    remove.calm = FALSE
  )

  # time zone of data (replace missing w/ GMT)
  TZ <- attr(mydata$date, "tzone") %||% "GMT"

  # function for getting mean
  FUN <- function(x) {
    if (all(is.na(x))) {
      NA
    } else {
      mean(x, na.rm = TRUE)
    }
  }

  # function to calculate means
  #
  # need to check whether avg.time is > or < actual time gap of data
  # then data will be expanded or aggregated accordingly
  calc.mean <- function(mydata) {
    # obtain time parameters; seconds in the avg.time interval and seconds in
    # the data interval
    time_params <- get_time_parameters(mydata = mydata, avg.time = avg.time)
    seconds_data_interval <- time_params$seconds_data_interval
    seconds_avgtime_interval <- time_params$seconds_avgtime_interval

    # check to see if we need to expand data rather than aggregate it
    # i.e., chosen time interval less than that of data
    if (seconds_avgtime_interval < seconds_data_interval) {
      # Get time interval from data
      interval <- find_time_interval(mydata$date)

      # get time interval as days; used for refinement
      days <- as.numeric(strsplit(interval, split = " ")[[1]][1]) / 24 / 3600

      # refine interval for common situations
      interval <- if (inherits(mydata$date, "Date")) {
        paste(days, "day")
      } else if (days %in% c(30, 31)) {
        "month"
      } else if (days %in% c(365, 366)) {
        "year"
      } else {
        interval
      }

      # calculate full series of dates by the data interval
      date_range <- range(mydata$date)
      allDates <- seq(date_range[1], date_range[2], by = interval)
      allDates <- c(allDates, max(allDates) + seconds_data_interval)

      # recalculate full series of dates by the avg.time interval
      allData <- data.frame(date = seq(min(allDates), max(allDates), avg.time))

      # merge with original data, which leaves gaps to fill
      mydata <-
        mydata |>
        dplyr::full_join(allData, by = dplyr::join_by("date")) |>
        dplyr::arrange(date)

      # make sure missing types are inserted
      mydata <- tidyr::fill(
        mydata,
        dplyr::all_of(type),
        .direction = c("downup")
      )

      return(mydata)
    }

    # calculate Uu & Vv if "wd" (& "ws") are in mydata
    mydata <- calculate_wind_components(mydata = mydata)

    ## Aggregate data

    ## variables to split by
    vars <- c(type, "date")
    mydata$date <-
      cut(mydata$date, avg.time) |>
      as.character() |>
      (function(x) {
        id <- !grepl("00:00", x)
        x[id] <- paste(x[id], "00:00:00")
        x
      })() |>
      as.POSIXct(tz = TZ)

    avmet <-
      mydata |>
      dplyr::group_by(dplyr::across(dplyr::all_of(vars))) |>
      dplyr::summarise(dplyr::across(
        dplyr::where(function(x) {
          !is.factor(x) && !is.character(x)
        }),
        function(x) {
          FUN(x)
        }
      ))

    avmet
  }

  # type will always be a factor
  for (i in type) {
    mydata[i] <- factor(mydata[[i]])
  }

  # select date, type, and all non-factor/character columns
  mydata <-
    dplyr::select(
      mydata,
      dplyr::all_of(c("date", type)),
      dplyr::where(function(x) {
        !is.character(x) && !is.factor(x)
      })
    )

  # calculate averages
  mydata <-
    mydata |>
    split(mydata[type], drop = TRUE) |>
    purrr::map(
      calc.mean
    ) |>
    purrr::list_rbind() |>
    dplyr::as_tibble()

  # drop default column if it exists
  if ("default" %in% names(mydata)) {
    mydata$default <- NULL
  }

  # return
  return(mydata)
}


#' Pad the data
#' @noRd
pad_dates_timeavg <- function(mydata, type = NULL, interval = "month") {
  # assume by the time we get here the data have been split into types
  # This means we just need to pad out the missing types based on first
  # line.

  start.date <- min(mydata$date, na.rm = TRUE)
  end.date <- max(mydata$date, na.rm = TRUE)

  # interval is in seconds, so convert to days if Date class and not POSIXct
  if (class(mydata$date)[1] == "Date") {
    interval <- paste(
      as.numeric(strsplit(interval, " ")[[1]][1]) / 3600 / 24,
      "days"
    )
  }

  all.dates <- data.frame(date = seq(start.date, end.date, by = interval))
  mydata <- dplyr::full_join(mydata, all.dates, by = "date")

  # add in missing types if gaps are made
  if (!is.null(type)) {
    mydata[type] <- mydata[1, type]
  }

  # make sure order is correct
  mydata <- dplyr::arrange(mydata, date)

  return(mydata)
}

#' Get the intervals in the original data and in the avg.time period
#' @noRd
get_time_parameters <- function(mydata, avg.time) {
  # Time diff in seconds of original data
  seconds_data_interval <- as.numeric(strsplit(
    find_time_interval(mydata$date),
    " "
  )[[1]][1])

  # Parse time diff of new interval
  by2 <- strsplit(avg.time, " ", fixed = TRUE)[[1]]

  seconds_avgtime_interval <- 1
  if (length(by2) > 1) {
    seconds_avgtime_interval <- as.numeric(by2[1])
  }
  units <- by2[length(by2)]

  # Convert units to seconds
  int <- switch(
    units,
    "sec" = 1,
    "min" = 60,
    "hour" = 3600,
    "day" = 3600 * 24,
    "week" = 3600 * 24 * 7,
    "month" = 3600 * 24 * 31,
    "quarter" = 3600 * 24 * 31 * 3,
    "season" = 3600 * 24 * 31 * 3,
    "year" = 3600 * 8784
  )

  if (length(int) == 0L) {
    opts <-
      c(
        "sec",
        "min",
        "hour",
        "day",
        "week",
        "month",
        "quarter",
        "season",
        "year"
      )
    cli::cli_abort(c(
      "x" = "Date unit '{units}' not recognised.",
      "i" = "Possible options: {.code {opts}}."
    ))
  }

  seconds_avgtime_interval <- seconds_avgtime_interval * int # interval in seconds
  if (is.na(seconds_data_interval)) {
    seconds_data_interval <- seconds_avgtime_interval # when only one row
  }

  return(
    list(
      seconds_data_interval = seconds_data_interval,
      seconds_avgtime_interval = seconds_avgtime_interval
    )
  )
}

#' Calculate Uu and Vv if wd & ws are in the data
#' @noRd
calculate_wind_components <- function(mydata) {
  if ("wd" %in% names(mydata)) {
    if (is.numeric(mydata$wd)) {
      if ("ws" %in% names(mydata)) {
        mydata <- dplyr::mutate(
          mydata,
          Uu = .data$ws * sin(2 * pi * .data$wd / 360),
          Vv = .data$ws * cos(2 * pi * .data$wd / 360)
        )
      } else {
        mydata <- dplyr::mutate(
          mydata,
          Uu = sin(2 * pi * .data$wd / 360),
          Vv = cos(2 * pi * .data$wd / 360)
        )
      }
    }
  }

  return(mydata)
}

#' Copy of `check_prep()` from `openair`
#' @noRd
check_prep <- function(
  mydata,
  Names,
  type,
  remove.calm = TRUE,
  remove.neg = TRUE,
  wd = "wd"
) {
  # deal with conditioning variable if present, if user-defined, must exist in
  # data pre-defined types existing conditioning variables that only depend on
  # date (which is checked)
  conds <- c(
    "default",
    "year",
    "hour",
    "month",
    "season",
    "weekday",
    "week",
    "weekend",
    "monthyear",
    "gmtbst",
    "bstgmt",
    "dst",
    "daylight",
    "yearseason",
    "seasonyear"
  )
  all.vars <- unique(c(names(mydata), conds))

  # names we want to be there
  varNames <- c(Names, type)
  matching <- varNames %in% all.vars

  if (any(!matching)) {
    # not all variables are present
    stop(cat("Can't find the variable(s)", varNames[!matching], "\n"))
  }

  # add type to names if not in pre-defined list
  if (any(type %in% conds == FALSE)) {
    ids <- which(type %in% conds == FALSE)
    Names <- c(Names, type[ids])
  }

  # if type already present in data frame
  if (any(type %in% names(mydata))) {
    ids <- which(type %in% names(mydata))
    Names <- unique(c(Names, type[ids]))
  }

  # just select data needed
  mydata <- mydata[, Names]

  # if site is in the data set, check none are missing
  # seems to be a problem for some KCL data...
  if ("site" %in% names(mydata)) {
    # split by site

    # remove any NA sites
    if (anyNA(mydata$site)) {
      id <- which(is.na(mydata$site))
      mydata <- mydata[-id, ]
    }
  }

  # sometimes ratios are considered which can results in infinite values
  # make sure all infinite values are set to NA
  mydata[] <- lapply(mydata, function(x) {
    replace(x, x == Inf | x == -Inf, NA)
  })

  if ("ws" %in% Names) {
    if ("ws" %in% Names && is.numeric(mydata$ws)) {
      # check for negative wind speeds
      if (any(sign(mydata$ws[!is.na(mydata$ws)]) == -1)) {
        if (remove.neg) {
          # remove negative ws only if TRUE
          warning("Wind speed <0; removing negative data")
          mydata$ws[mydata$ws < 0] <- NA
        }
      }
    }
  }

  # round wd to make processing obvious
  # data already rounded to nearest 10 degress will not be affected
  # data not rounded will be rounded to nearest 10 degrees
  # assumes 10 is average of 5-15 etc
  if (wd %in% Names) {
    if (wd %in% Names && is.numeric(mydata[, wd])) {
      # check for wd <0 or > 360
      if (
        any(
          sign(mydata[[wd]][!is.na(mydata[[wd]])]) == -1 |
            mydata[[wd]][!is.na(mydata[[wd]])] > 360
        )
      ) {
        warning("Wind direction < 0 or > 360; removing these data")
        mydata[[wd]][mydata[[wd]] < 0] <- NA
        mydata[[wd]][mydata[[wd]] > 360] <- NA
      }

      if (remove.calm) {
        if ("ws" %in% names(mydata)) {
          mydata[[wd]][mydata$ws == 0] <- NA # set wd to NA where there are calms
          mydata$ws[mydata$ws == 0] <- NA # remove calm ws
        }
        mydata[[wd]][mydata[[wd]] == 0] <- 360 # set any legitimate wd to 360

        # round wd for use in functions - except windRose/pollutionRose
        mydata[[wd]] <- 10 * ceiling(mydata[[wd]] / 10 - 0.5)
        mydata[[wd]][mydata[[wd]] == 0] <- 360 # angles <5 should be in 360 bin
      }
      mydata[[wd]][mydata[[wd]] == 0] <- 360 # set any legitimate wd to 360
    }
  }

  # make sure date is ordered in time if present
  if ("date" %in% Names) {
    if ("POSIXlt" %in% class(mydata$date)) {
      stop("date should be in POSIXct format not POSIXlt")
    }

    # try and work with a factor date - but probably a problem in original data
    if (is.factor(mydata$date)) {
      warning("date field is a factor, check date format")
      mydata$date <- as.POSIXct(mydata$date, "GMT")
    }

    mydata <- dplyr::arrange(mydata, date)

    # make sure date is the first field
    if (names(mydata)[1] != "date") {
      mydata <- mydata[c("date", setdiff(names(mydata), "date"))]
    }

    # check to see if there are any missing dates, stop if there are
    ids <- which(is.na(mydata$date))
    if (length(ids) > 0) {
      mydata <- mydata[-ids, ]
      warning(
        paste("Missing dates detected, removing", length(ids), "lines"),
        call. = FALSE
      )
    }
  }

  # return data frame
  return(mydata)
}

#' From openair
#' @noRd
find_time_interval <- function(dates) {
  # could have several sites, dates may be unordered
  # find the most common time gap in all the data
  dates <- unique(dates) # make sure they are unique

  # work out the most common time gap of unique, ordered dates
  id <- which.max(table(diff(as.numeric(unique(dates[order(dates)])))))
  seconds <- as.numeric(names(id))

  if ("POSIXt" %in% class(dates)) {
    seconds <- paste(seconds, "sec")
  }

  if (class(dates)[1] == "Date") {
    seconds <- seconds * 3600 * 24
    seconds <- paste(seconds, "sec")
  }

  seconds
}

Any scripts or data that you put into this service are public.

worldmet documentation built on Nov. 7, 2025, 5:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

worldmet
Import Surface Meteorological Data from NOAA Integrated Surface Database (ISD)

R/utils.R
In worldmet: Import Surface Meteorological Data from NOAA Integrated Surface Database (ISD)

Defines functions find_time_interval check_prep calculate_wind_components get_time_parameters pad_dates_timeavg worldmet_time_average

Try the worldmet package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

worldmet Import Surface Meteorological Data from NOAA Integrated Surface Database (ISD)

R/utils.R In worldmet: Import Surface Meteorological Data from NOAA Integrated Surface Database (ISD)

Defines functions find_time_interval check_prep calculate_wind_components get_time_parameters pad_dates_timeavg worldmet_time_average

Try the worldmet package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

worldmet
Import Surface Meteorological Data from NOAA Integrated Surface Database (ISD)

R/utils.R
In worldmet: Import Surface Meteorological Data from NOAA Integrated Surface Database (ISD)