R/baseFillUp.R

Defines functions demo_hsFillUp .reorderedAndSelectedColumns .newColumnNames .interpolateAllColumns .showStartAndEndOfBlock fillup .warnOnNoStepMultiples .stopOnBadArguments hsFillUp

Documented in demo_hsFillUp fillup hsFillUp

# hsFillUp ---------------------------------------------------------------------

#' Fill-up with Missing Timestamps
#'
#' Based on the time series given in "tseries" a new time series is generated in
#' which timestamps that are lacking in tseries are included. Optionally missing
#' values are generated by interpolation between existing values.
#'
#' @param tseries data frame representing a time-series of values
#' @param tsField Name of timestamp column in tseries. Default: name of first
#'   available POSIXt-column
#' @param step_s Time step in seconds that shall lie between consecutive
#'   timestamps. Non-existing timestamps are generated.
#' @param forceStep If TRUE, only timestamps that are multiples of the time step
#'   given in step_s are put into the result data frame
#' @param limits data frame or matrix with exactly two columns containing first
#'   and last timestamp of intervals for which timestamps are to be generated.
#' @param interpol if TRUE, the returned data frame will for each value column
#'   contain a corresponding column containing interpolated values
#' @param includeOrig if TRUE and interpol = TRUE the original columns will
#'   remain in the output in addition to the columns containing all (original
#'   plus interpolated) values.
#' @param default value to be used if there are not enough (at least to) non-NA
#'   values to be used for interpolation. Default: NA
#' @param dbg If TRUE, debug messages are shown
#' @export
#' @return Returns a data.frame
#' @importFrom kwb.utils catIf posixColumnAtPosition
hsFillUp <- function(
  tseries, tsField = names(tseries)[kwb.utils::posixColumnAtPosition(tseries)[1]], 
  step_s = 60, forceStep = TRUE, limits = NULL, interpol = TRUE, 
  includeOrig = TRUE, default = NA, dbg = FALSE
)
{
  # Stop if there are unexpected argument values
  .stopOnBadArguments(tseries, step_s, limits)
  
  ## Check if there are timestamps that are not at exact minutes (seconds = 0)
  .warnOnNoStepMultiples(timestamps = tseries[[tsField]], step_s = step_s)
  
  # If no limits are given use the full time range of the timestamps in the
  # given data frame as "artificial" limits  
  if (is.null(limits)) {
    timeRange <- range(tseries[[tsField]])
    limits <- data.frame(from = timeRange[1], to = timeRange[2])
  }
  
  ## Initialise result (will become a data.frame)
  result <- NULL

  ## Call fillup for each data block defined by the intervals given in limits 
  # (may be only one interval)
  for (i in seq_len(nrow(limits))) {
    
    tbeg <- limits[i, 1]
    tend <- limits[i, 2]
    
    kwb.utils::catIf(dbg, sprintf("tbeg: %s, tend: %s\n", tbeg, tend))
    
    ## Cut block of rows representing timestamps between tbeg and tend from
    ## tseries
    selected <- tseries[[tsField]] >= tbeg & tseries[[tsField]] <= tend
    
    # Skip empty areas
    if (any(selected)) {

      ## Call fillup for the data block, not giving limits
      blockResult <- fillup(
        tseries = tseries[selected, ],
        tsField = tsField, 
        step_s = step_s, 
        forceStep = forceStep, 
        interpol = interpol, 
        includeOrig = includeOrig, 
        default = default, 
        dbg = dbg      
      )
      
      ## add filled-up data block to result data.frame
      result <- rbind(result, blockResult)     
    }
    else {
      warning("No data available between the given limits: ",
              tbeg, " and ", tend, "!")
    }
  }    
  
  result  
}

# .stopOnBadArguments ----------------------------------------------------------
.stopOnBadArguments <- function(tseries, step_s, limits)
{
  ## The timeseries must be given as a data.frame
  if (class(tseries)[1] != "data.frame") {
    stop(paste("In tseries, a data.frame must be given, containing timestamps",
               "in the first column.\n"))
  }
  
  ## The time-step must be given as a number
  if (! is.numeric(step_s)) {
    stop(sprintf("step_s must be numeric (is %s).", class(step_s)))
  }
  
  ## If limits are given they must be of type matrix or data.frame
  if (! is.null(limits) && class(limits) != "data.frame" 
      && class(limits) != "matrix") {
    stop(sprintf("limits must be data.frame or matrix (is %s).", class(limits)))
  }  
}

# .warnOnNoStepMultiples -------------------------------------------------------
.warnOnNoStepMultiples <- function(timestamps, step_s)
{
  isNoStepMultiple <- as.integer(timestamps) %% step_s != 0
  numberOfNoStepMultiples <- sum(isNoStepMultiple)
  
  if (numberOfNoStepMultiples > 0) {
    
    cat("There are", numberOfNoStepMultiples, "timestamps",
        "(out of a total of", length(timestamps), ") that are not multiples",
        "of the timestep (", step_s, "seconds ):\n")
    
    print(utils::head(timestamps[isNoStepMultiple]))
  } 
}

# fillup -----------------------------------------------------------------------

#' Fill-up with Missing Timestamps
#'
#' Based on the time series given in "tseries" a new time series is generated in
#' which timestamps that are lacking in tseries are included. Optionally missing
#' values are generated by interpolation between existing values.
#'
#' @param tseries data frame representing a time-series of values
#' @param tsField Name of timestamp column in tseries. Default: name of first
#'   available POSIXt-column
#' @param step_s Time step in seconds that shall lie between consecutive
#'   timestamps. Non-existing timestamps are generated.
#' @param forceStep If TRUE, only timestamps that are multiples of the time step
#'   given in step_s are put into the result data frame
#' @param interpol if TRUE, the returned data frame will for each value column
#'   contain a corresponding column containing interpolated values
#' @param includeOrig if TRUE and interpol = TRUE the original columns will
#'   remain in the output in addition to the columns containing all (original
#'   plus interpolated) values.
#' @param default value to be used if there are not enough (at least to) non-NA
#'   values to be used for interpolation. Default: NA
#' @param dbg If TRUE, debug messages are shown
#'
#' @return Returns a data.frame
#' @export
#' @importFrom kwb.datetime roundTime
fillup <- function(
  tseries, tsField, step_s, forceStep, interpol, includeOrig, default = NA, 
  dbg = FALSE   
)
{
  ## Initialise result (will become a data.frame)
  result <- NULL
  
  ## Fill-up between the first and the last timestamp of the given time series
  timeRange <- range(tseries[[tsField]])
  
  ## If needed, get lower minimum or greater maximum representing multiples
  ## of the time step.
  tbeg <- kwb.datetime::roundTime(timeRange[1], step_s, 1)
  tend <- kwb.datetime::roundTime(timeRange[2], step_s, 0)      
  
  ## Generate the complete series of "regular" timestamps (multiples of 
  ## time-step) between tbeg and tend
  
  ## Handle the special case of only one value
  if (tbeg == tend) {
    timestamps <- tbeg
  } else {
    timestamps <- seq(tbeg, tend, by = step_s)    
  }
  
  tsBlock <- tseries
  
  if (dbg) {
    
    .showStartAndEndOfBlock(tsBlock, tbeg, tend, timestamps)
  }
  
  ## Merge all timestamps with time series block by joining the regular 
  ## timestamps (multiples of time-step) with the timestamps of the given 
  ## timeseries block:
  ## - If forcStep is TRUE we do a "left join", where the result only contains
  ##   the "regular" timestamps (multiples of time-step).
  ## - If forcStep is FALSE we do a "left or right join", where the result 
  ##   contains both all "regular" timestamps (multiples of time-step) and all
  ##   timestamps contained in the original timeseries.
  tsBlock <- merge(
    x = data.frame(timestamps = timestamps), 
    y = tsBlock, 
    by.x = "timestamps", 
    by.y = tsField, 
    all.x = TRUE, 
    all.y = ! forceStep
  )
  
  ## Interpolate values, if desired
  if (interpol) {
    tsBlock <- .interpolateAllColumns(
      tsBlock = tsBlock, 
      tseries = tseries, 
      tsField = tsField, 
      default = default, 
      dbg = dbg
    )
  }  
  
  ## Set result data frame
  result <- tsBlock
  
  ## Get column names without timestamp column
  dataColumnNames <- setdiff(colnames(tseries), tsField)
  
  ## Set column names in result data.frame
  names(result) <- .newColumnNames(
    columnNames = dataColumnNames, 
    tsField = tsField, 
    interpol = interpol, 
    dbg = dbg
  )
  
  columns <- .reorderedAndSelectedColumns(
    numberOfColumns = ncol(result),
    numberOfDataColumns = length(dataColumnNames), 
    interpol = interpol, 
    includeOrig = includeOrig
  )
  
  result[, columns]
}

# .showStartAndEndOfBlock ------------------------------------------------------
.showStartAndEndOfBlock <- function(tsBlock, tbeg, tend, timestamps)
{
  cat(sprintf("Block of timeseries within %s and %s (full multiples of timestep)\n", tbeg, tend))
  print(utils::head(tsBlock))
  cat("...\n")
  print(utils::tail(tsBlock))
  cat(sprintf("Generated timestamps between %s and %s:\n", tbeg, tend))
  print(utils::head(timestamps))
  cat("...\n")
  print(utils::tail(timestamps))
  cat(sprintf("Available timestamps: %10d\n", nrow(tsBlock)))
  cat(sprintf("All timestamps:       %10d\n", length(timestamps)))      
}

# .interpolateAllColumns -------------------------------------------------------
.interpolateAllColumns <- function(
  tsBlock, tseries, tsField, default = NA, dbg = FALSE
)
{
  # Skip the timestamp field itself
  fields <- setdiff(colnames(tseries), tsField)
  
  ## For each value field
  for (field in fields) {
    
    kwb.utils::catIf(dbg, sprintf("Interpolating field: %s\n", field))
    
    interpolated <- .getInterpolatedValues(
      timestamps = tseries[[tsField]], 
      values = tseries[[field]], 
      requiredTimestamps = tsBlock$timestamps, 
      default = default, 
      dbg = dbg
    )
    
    ## Add column with interpolated values to the result data.frame
    tsBlock <- cbind(tsBlock, interpolated)
  }
  
  tsBlock
}

# .getInterpolatedValues -------------------------------------------------------
#' @keywords internal
#' @noRd
#' @noMd
#' @importFrom stats approx
.getInterpolatedValues <- function
(
  timestamps, values, requiredTimestamps, default = NA, dbg = FALSE
)
{
  ## We need at least two non-NA values to interpolate
  if (sum(!is.na(values)) > 1 ) {
    
    # Calculate interpolation for this field. approx returns a list with
    # components x and y of which we use the y component.
    interpolated <- stats::approx(
      timestamps, values, xout = requiredTimestamps
    )$y
  }
  else {
    
    # Determine a subsitute value. Either the value itself if there is only
    # one value or the given default value!
    substituteValue <- ifelse(length(values) == 1, values, default)
    
    kwb.utils::catIf(dbg, "Not at least two non-NA values available for interpolation!\n",
          "Using one value for all timestamps:", substituteValue, "\n")
    
    interpolated <- rep(substituteValue, length(requiredTimestamps))
  }
  
  interpolated
  ### list with components \emph{x} (requiredTimestamps) and \emph{y}
  ### (interpolated values)
}

# .newColumnNames --------------------------------------------------------------
.newColumnNames <- function(columnNames, tsField, interpol, dbg = FALSE)
{
  if (interpol) {
    columnNames <- c(paste(columnNames, "orig", sep = "_"), columnNames)
  }
  
  columnNames <- c(tsField, columnNames)
  
  kwb.utils::catIf(dbg, sprintf("Column names: %s\n", paste(columnNames, collapse = ", ")))
  
  columnNames
}

# .reorderedAndSelectedColumns -------------------------------------------------
.reorderedAndSelectedColumns <- function(
  numberOfColumns, numberOfDataColumns, interpol, includeOrig
)
{
  ## Reorder columns
  columnNumbers <- 1
  
  if (interpol) {
    columnNumbers <- c(columnNumbers, (numberOfDataColumns + 2):numberOfColumns)
  }
  
  ## If desired, include original columns  
  if (!interpol || includeOrig) {
    columnNumbers <- c(columnNumbers, 2:(numberOfDataColumns + 1))
  }
  
  columnNumbers
}

# demo_hsFillUp ----------------------------------------------------------------

#' Create Plot Demonstrating hsFillUp()
#' 
#' @return demo plot hsFillUp
#' @export
#' @importFrom stats rnorm
#' @importFrom  graphics lines plot
demo_hsFillUp <- function()
{
  message(
    "See the code creating the plot by typing 'demo_hsFillUp' and pressing ", 
    "Enter."
  )
  
  tstamps <- hsMkTimestamps("2010-03-27", to = "2010-03-30", step.s = 3600)
  
  df_1 <- data.frame(
    DateTimeUTC = tstamps,
    DataValue = stats::rnorm(n = length(tstamps))
  )
  
  limits <- data.frame(
    from = kwb.datetime::hsToPosix("2010-03-28"), to = kwb.datetime::hsToPosix("2010-03-29")
  )
  
  df_2 <- hsFillUp(df_1, includeOrig = FALSE, limits = limits)
  
  graphics::plot(df_1, main = "hsFillUp() missing values within a range")
  
  graphics::lines(df_2, xlim = range(df_1$DateTimeUTC), type = "b", cex = 0.3)
}
KWB-R/kwb.base documentation built on June 19, 2022, 3:30 a.m.