R/approx.R

Defines functions approx_dt

Documented in approx_dt

#' Approximate missing values in a data.table.
#'
#' Similar to, but not quite like, `stats::approx`.
#' Does only support constant extrapolation and linear interpolation.
#' The resulting `data.table` only contains the range provided by `xdata` along `xcol`.
#' Without extrapolation, `xcol` in the resulting `data.table` may not
#' cover the range given by `xdata`.
#'
#' @param dt a data.table.
#' @param xdata the range to interpolate to. This is the range the result will have along the dimension `xcol`.
#' @param xcol name of the column for interpolation.
#' @param ycol name of the column that contains the value to be interpolated.
#' @param idxcols columns that identify a row (besides xcol), i.e., the remaining index dimensions.
#' @param keepna keep NA values for rows that can not be interpolated (since they are outside of [min(xcol), max(xcol)]), default is FALSE.
#' @param extrapolate use the closest values to fill `ycol` outside of the interpolation domain, default is FALSE. This will also work if there is only one value along `ycol`, i.e., no interpolation is taking place.
#' @return a data.table with the range given by `xdata` along `xcol`. Columns not given in `idxcols` will be kept but NAs will appear on extrapolated and interpolated rows.
#'
#' @import data.table
#' @importFrom stats approx
#' @export
#' @examples
#' dt <- as.data.table(ChickWeight)
#' ## delete all values but 1
#' dt[Chick == 1 & Time > 0, weight := NA]
#' ## delete all values but 2
#' dt[Chick == 2 & Time > 2, weight := NA]
#'
#' ## extrapolation from 1 value
#' approx_dt(dt, 0:21, "Time", "weight", idxcols=c("Chick", "Diet"), extrapolate = TRUE)[Chick == 1]
#' ## extrapolation and interpolation
#' approx_dt(dt, 0:21, "Time", "weight", idxcols=c("Chick", "Diet"), extrapolate = TRUE)[Chick == 2]
#' ## column not in idxcols
#' approx_dt(dt, 0:21, "Time", "weight", idxcols="Chick", extrapolate = TRUE)[Chick == 2]
#'
#' dt <- as.data.table(ChickWeight)
#' ## interpolation only
#' approx_dt(dt, 0:21, "Time", "weight", idxcols=c("Chick", "Diet"))[Chick == 2]

approx_dt <- function(dt, xdata, xcol, ycol,
                      idxcols = NULL,
                      keepna = FALSE,
                      extrapolate = FALSE){

    dummycol <- target <- xrange <- NULL

    ## assert that there is some overlap between given xdata and the values in xcol
    if(!any(between(dt[[xcol]], min(xdata), max(xdata)))){
        stop("Given xdata and range in the xcol column of the table are not overlapping.")
    }

    if(is.null(idxcols)){
      idxcols <- names(dt)
      idxcols <- idxcols[!idxcols %in% c(xcol, ycol)]
    }

    ## create a datatable based on the index columns and the new xdata
    target <- unique(dt[, ..idxcols])[, dummycol := "new xdata"]
    xrange <- data.table(xcol = xdata)
    names(xrange) <- xcol
    xrange[, dummycol := "new xdata"]
    target <- merge(target, xrange, by = "dummycol", allow.cartesian = TRUE)[, dummycol := NULL]

    ## for the missing xdata we expand the full idx range
    result <- merge(dt, target, by = c(idxcols, xcol), all = T)

    if(extrapolate){
        result[, (ycol) := if(sum(!is.na(.SD[[ycol]])) > 1){
                               ## if there are at least two non-NA values, we interpolate
                               approx(.SD[[xcol]], .SD[[ycol]], xout = .SD[[xcol]], rule = 2)$y
                           }else{
                               ## if there is only one value, we use it on the whole column
                               sum(.SD[[ycol]], na.rm = T)
                           },
               by = idxcols]
    }else{
        if(max(dt[[xcol]]) < max(xdata) || min(dt[[xcol]]) > min(xdata)){
            stop("Error: interpolation range out of bounds.")
        }
        result[, (ycol) := approx(.SD[[xcol]], .SD[[ycol]], xout = .SD[[xcol]], rule = 1)$y,
               by=idxcols]
        if(!keepna){
            ## in case no extrapolation is taking place, we might want to
            ## remove NAs from the result
            result <- result[!is.na(get(ycol))]
        }
    }

    ## we will filter the result using join
    jdt <- data.table(xdata)
    setnames(jdt, xcol)
    return(result[jdt, on=xcol])
}
pik-piam/rmndt documentation built on April 21, 2024, 4:31 a.m.