# R/wrcc_createDataDataframe.R
# In PWFSLSmoke: Utilities for Working with Air Quality Monitoring Data

# Documented in wrcc_createDataDataframe

#' @keywords WRCC
#' @export
#' @import MazamaCoreUtils
#'
#' @title Create WRCC data dataframe
#'
#' @param tbl single site WRCC tibble created by \code{wrcc_clustering()}.
#' Must contain \code{deploymentID}, \code{datetime} and the PM2.5 column
#' \code{ConcRT}.
#' @param meta WRCC meta dataframe created by \code{wrcc_createMetaDataframe()}.
#' Must contain \code{monitorType} (a single type, either \code{'EBAM'} or
#' \code{'ESAM'}) and \code{monitorID}. The first sorted \code{instrumentID}
#' is combined with each \code{deploymentID} to build the monitorIDs
#' assigned to the rows of \code{tbl}.
#' @description After quality control has been applied to a WRCC tibble,
#' we can extract the PM2.5 values and store them in a \code{data} tibble
#' organized as time-by-deployment (aka time-by-site).
#'
#' The first column of the returned dataframe is named \code{'datetime'} and
#' contains a \code{POSIXct} time in UTC. Additional columns contain data
#' for each separate deployment of a monitor, ordered as in
#' \code{meta$monitorID}. The time axis is a complete hourly sequence from
#' the earliest to the latest \code{tbl$datetime}; hours without data are
#' filled with \code{NA}. When several measurements share the same hour and
#' monitorID a warning is logged and their median is used.
#'
#' The function stops with an error when a required column is missing, when
#' \code{meta} contains zero or multiple monitor types, when the monitor
#' type is unsupported, or when a monitorID found in \code{meta} has no
#' matching data in \code{tbl}.
#'
#' @return A \code{data} dataframe for use in a \emph{ws_monitor} object.

wrcc_createDataDataframe <- function(tbl, meta) {

  logger.debug(" ----- wrcc_createDataDataframe() ----- ")

  # ----- Validate inputs ------------------------------------------------------

  # Sanity check -- tbl must have deploymentID
  if ( !'deploymentID' %in% names(tbl) ) {
    logger.error("No 'deploymentID' column found in 'tbl' tibble with columns: %s", paste0(names(tbl), collapse=", "))
    stop(paste0("No 'deploymentID' column found in 'tbl' tibble.  Have you run addClustering()?"))
  }

  # Sanity check -- tbl must have datetime
  if ( !'datetime' %in% names(tbl) ) {
    logger.error("No 'datetime' column found in 'tbl' tibble with columns: %s", paste0(names(tbl), collapse=", "))
    stop(paste0("No 'datetime' column found in 'tbl' tibble."))
  }

  # Sanity check -- meta must have a monitorType
  if ( !'monitorType' %in% names(meta) ) {
    logger.error("No 'monitorType' column found in 'meta' dataframe with columns: %s", paste0(names(meta), collapse=", "))
    stop(paste0("No 'monitorType' column found in 'meta' dataframe."))
  }

  # Sanity check -- meta must have a monitorID
  # NOTE:  monitorIDs from 'meta' define the order of the data columns below.
  if ( !'monitorID' %in% names(meta) ) {
    logger.error("No 'monitorID' column found in 'meta' dataframe with columns: %s", paste0(names(meta), collapse=", "))
    stop(paste0("No 'monitorID' column found in 'meta' dataframe."))
  }

  # NOTE:  as.character() guards against factor columns
  monitorType <- unique(as.character(meta$monitorType))

  # Sanity check -- at least one monitorType is required (e.g. 'meta' has no rows)
  if ( length(monitorType) == 0 ) {
    logger.error("No monitor types found in 'meta' dataframe")
    stop(paste0("No monitor types found in 'meta' dataframe."))
  }

  # Sanity check -- only a single monitorType is allowed
  if ( length(monitorType) > 1 ) {
    logger.error("Multiple monitor types found in 'meta' dataframe: %s", paste0(monitorType, collapse=", "))
    stop(paste0("Multiple monitor types found in 'meta' dataframe."))
  }

  # ----- Assign monitorIDs ----------------------------------------------------

  # Create monitorID the same way we did in wrcc_createMetaDataframe()
  # Should only have a single instrumentID
  instrumentIDs <- sort(unique(meta$instrumentID))
  if ( length(instrumentIDs) > 1 ) {
    logger.warn('Multiple instrumentIDs encountered: %s', paste0(instrumentIDs,collapse=", "))
  }
  instrumentID <- instrumentIDs[1]
  tbl$monitorID <- paste(as.character(tbl$deploymentID), instrumentID, sep='_')

  # ----- Choose PM2.5 variable ------------------------------------------------

  # NOTE:  %in% returns FALSE for NA so a missing monitorType is reported as
  # NOTE:  unsupported instead of breaking the 'if' statement.
  if ( monitorType %in% c('EBAM', 'ESAM') ) {
    pm25Var <- 'ConcRT'
    # No unit conversion is needed for either type. Multiplying by this factor
    # still coerces integer values to double.
    unitFactor <- 1
  } else {
    logger.error("Dataframe creation is not supported for %s", monitorType)
    stop(paste0("Dataframe creation is not supported for ", monitorType))
  }

  # Sanity check -- tbl must have the PM2.5 variable
  if ( !pm25Var %in% names(tbl) ) {
    logger.error("No '%s' column found in 'tbl' tibble with columns: %s", pm25Var, paste0(names(tbl), collapse=", "))
    stop(paste0("No '", pm25Var, "' column found in 'tbl' tibble."))
  }

  # ----- Reshape to [datetime, monitorID] -------------------------------------

  # Create minimal subset with the variables we need for rows, columns and data
  subTbl <- tbl[,c('datetime','monitorID',pm25Var)]
  melted <- reshape2::melt(subTbl, id.vars=c('datetime','monitorID'), measure.vars=pm25Var)

  # Unit conversion as needed
  melted$value <- melted$value * unitFactor

  # Use median if multiple values are found

  # Sanity check -- only one pm25DF measure per hour
  valueCountPerCell <- reshape2::dcast(melted, datetime ~ monitorID, length)
  maxCount <- max(valueCountPerCell[,-1])
  if (maxCount > 1) logger.warn("Up to %s measurements per hour -- median used",maxCount)

  # NOTE:  The resulting dataframe is [datetime,monitorIDs] with monitorIDs in alphabetical order
  pm25DF <- reshape2::dcast(melted, datetime ~ monitorID, stats::median)

  # Sanity check -- every monitorID in 'meta' must have a data column
  # NOTE:  as.character() prevents factor monitorIDs from being turned into
  # NOTE:  integer codes when combined with 'datetime'.
  metaMonitorIDs <- as.character(meta$monitorID)
  missingIDs <- setdiff(metaMonitorIDs, names(pm25DF))
  if ( length(missingIDs) > 0 ) {
    logger.error("monitorIDs in 'meta' have no data in 'tbl': %s", paste0(missingIDs, collapse=", "))
    stop(paste0("monitorIDs in 'meta' have no data in 'tbl': ", paste0(missingIDs, collapse=", ")))
  }

  # Reorder data columns to match the order of monitorIDs in 'meta'
  pm25DF <- pm25DF[, c('datetime', metaMonitorIDs), drop=FALSE]

  # ----- Put data on a complete hourly axis -----------------------------------

  # Create an empty hourlyDF dataframe with a full time axis (no missing hours)
  datetime <- seq(min(tbl$datetime), max(tbl$datetime), by="hours")
  hourlyDF <- data.frame(datetime=datetime)

  # Merge pm25DF into the hourlyDF dataframe, inserting NA's where necessary
  # NOTE:  dplyr returns objects of class "tbl_df" which can be confusing. We undo that.
  data <- as.data.frame( dplyr::left_join(hourlyDF, pm25DF, by='datetime'), stringsAsFactors=FALSE )

  logger.trace("Created 'data' dataframe with %d rows and %d columns", nrow(data), ncol(data))

  return(data)

}