R/recode_replace.R

Defines functions pad pad_atomic replace_outliers mad_trans mad_limits sd_limits Crepoutl replace_inf replace_na na_focb_ph na_locf_ph na_focb na_locf recode_char recode_num scv

Documented in na_focb na_locf pad recode_char recode_num replace_inf replace_na replace_outliers

# Note: don't change the order of these arguments !!!
scv <- function(x, v, r, set = FALSE, inv = FALSE, vind1 = FALSE) .Call(C_setcopyv, x, v, r, inv, set, vind1)

# inspired by ?dplyr::recode
# Think about adopting this code for as_numeric_factor and as_character_factor
recode_num <- function(X, ..., default = NULL, missing = NULL, set = FALSE) {
  if(missing(...)) stop("recode_num requires arguments of the form: value = replacement")
  args <- list(...)
  nam <- as.numeric(names(args))
  # nzchar(names(args)) ... check non-empty names ? -> nah, this package is not for dummies
  if(anyNA(nam)) stop(paste("Non-numeric arguments:", paste(names(args)[is.na(nam)], collapse = ", ")))
  arglen <- length(args)
  missingl <- !is.null(missing)
  if(missingl && any(nam == missing))  warning(paste0("To improve performance missing values are replaced prior to recode, so this replaces all missing values with ",
                                               missing, " and those are then again replaced with ", args[[which(nam == missing)]], ". If this is not desired, call replace_na after recode with missing = NULL."))
  if(arglen == 1L) {
    args <- args[[1L]]
    if(is.null(default)) {
      if(missingl) {
        repfun <- function(y) if(is.numeric(y)) {
          z <- scv(y, NA, missing, set) # y[is.na(y)] <- missing
          scv(z, nam, args, TRUE) # `[<-`(y, y == nam, value = args)
          } else y
      } else {
        repfun <- function(y) if(is.numeric(y)) scv(y, nam, args, set) else y # `[<-`(y, y == nam, value = args)
      }
    } else {
      nr <- if(is.atomic(X)) NROW(X) else fnrow(X)
      if(missingl) {
        repfun <- function(y) if(is.numeric(y)) {
          nas <- is.na(y)
          z <- scv(y, nas, missing, set, vind1 = TRUE)
          ind <- whichv(z, nam)
          scv(z, nas, default, TRUE, TRUE, vind1 = TRUE) # duplAttributes(alloc(default, nr), y)
          scv(z, ind, args, TRUE, vind1 = TRUE) # y == nam
        } else y
      } else {
        repfun <- function(y) if(is.numeric(y)) scv(scv(y, nam, default, set, TRUE), nam, args, TRUE) else y # `[<-`(duplAttributes(alloc(default, nr), y), y == nam, value = args)
      }
    }
  } else {
    seqarg <- seq_len(arglen)
    if(is.null(default)) {
      repfun <- function(y) if(is.numeric(y)) {
        if(missingl) y <- scv(y, NA, missing, set) # y[is.na(y)] <- missing
        else if(!set) y <- scv(y, 1L, y[1L], vind1 = TRUE) # copy
        z <- scv(y, 1L, y[1L], vind1 = TRUE) # copy
        for(i in seqarg) scv(y, whichv(z, nam[i]), args[[i]], TRUE, vind1 = TRUE)
        y
      } else y
      # repfun <- function(y) if(is.numeric(y)) {
      #   if(missingl) y <- scv(y, NA, missing, set) # y[is.na(y)] <- missing
      #   if(set) { # Note: not strictly the way this should work...
      #     for(i in seqarg) scv(y, nam[i], args[[i]], TRUE)
      #     return(y)
      #   }
      #   z <- scv(y, 1L, y[1L], vind1 = TRUE) # copy
      #   for(i in seqarg) scv(z, whichv(y, nam[i]), args[[i]], TRUE, vind1 = TRUE)
      #   z
      # } else y
    } else {
      nr <- if(is.atomic(X)) NROW(X) else fnrow(X)
      if(missingl) {
        repfun <- function(y) if(is.numeric(y)) {
          nas <- is.na(y)
          y <- scv(y, nas, missing, set, vind1 = TRUE)
          z <- scv(y, 1L, y[1L], vind1 = TRUE) # Copy
          scv(y, nas, default, TRUE, TRUE, vind1 = TRUE)
          for(i in seqarg) scv(y, whichv(z, nam[i]), args[[i]], TRUE, vind1 = TRUE)
          y
        } else y
      } else {
        repfun <- function(y) if(is.numeric(y)) {
          z <- scv(y, 1L, y[1L], vind1 = TRUE) # Copy
          y <- scv(y, nam[1L], default, set, TRUE) # duplAttributes(alloc(default, nr), y)
          scv(y, nam[1L], args[[1L]], TRUE)
          for(i in seqarg[-1L]) scv(y, whichv(z, nam[i]), args[[i]], TRUE, vind1 = TRUE)
          y
        } else y
      }
    }
  }

  if(is.list(X)) {
    if(set) {
      lapply(unattrib(X), repfun)
      return(invisible(X))
    }
    res <- duplAttributes(lapply(unattrib(X), repfun), X)
    return(if(inherits(X, "data.table")) alc(res) else res)
  }
  if(!is.numeric(X)) stop("X needs to be numeric or a list")
  res <- repfun(X)
  return(if(set) invisible(res) else res)
}

recode_char <- function(X, ..., default = NULL, missing = NULL, regex = FALSE,
                        ignore.case = FALSE, fixed = FALSE, set = FALSE) {
  if(missing(...)) stop("recode_char requires arguments of the form: value = replacement")
  args <- list(...)
  nam <- names(args)
  arglen <- length(args)
  missingl <- !is.null(missing)
  if(missingl && any(nam == missing))  warning(paste0("To improve performance missing values are replaced prior to recode, so this replaces all missing values with ",
                                                      missing, " and those are then again replaced with ", args[[which(nam == missing)]], ". If this is not desired, call replace_na after recode with missing = NULL."))
  if(regex) {
    if(arglen == 1L) {
      args <- args[[1L]]
      if(is.null(default)) {
        if(missingl) {
          repfun <- function(y) if(is.character(y)) {
            y <- scv(y, NA, missing, set)  # y[is.na(y)] <- missing
            scv(y, grepl(nam, y, ignore.case, FALSE, fixed), args, TRUE, vind1 = TRUE)
          } else y
        } else {
          repfun <- function(y) if(is.character(y)) scv(y, grepl(nam, y, ignore.case, FALSE, fixed), args, set, vind1 = TRUE) else y
        }
      } else {
        nr <- if(is.atomic(X)) NROW(X) else fnrow(X)
        if(missingl) {
          repfun <- function(y) if(is.character(y)) {
            nas <- is.na(y)
            z <- scv(y, nas, missing, set, vind1 = TRUE)
            ind <- grepl(nam, z, ignore.case, FALSE, fixed)
            scv(z, nas, default, TRUE, TRUE, vind1 = TRUE) # duplAttributes(alloc(default, nr), y)
            scv(z, ind, args, TRUE, vind1 = TRUE)
          } else y
        } else {
          repfun <- function(y) if(is.character(y)) {
            ind <- grepl(nam, y, ignore.case, FALSE, fixed)
            scv(scv(y, ind, default, set, TRUE, vind1 = TRUE), ind, args, TRUE, vind1 = TRUE)
          } else y
        }
      }
    } else {
      seqarg <- seq_len(arglen)
      if(is.null(default)) {
        repfun <- function(y) if(is.character(y)) {
          if(missingl) y <- scv(y, NA, missing, set) # y[is.na(y)] <- missing
          else if(!set) y <- scv(y, 1L, y[1L], vind1 = TRUE) # copy
          z <- scv(y, 1L, y[1L], vind1 = TRUE) # copy
          for(i in seqarg) scv(y, grepl(nam[i], z, ignore.case, FALSE, fixed), args[[i]], TRUE, vind1 = TRUE)
          y
        } else y
      } else {
        nr <- if(is.atomic(X)) NROW(X) else fnrow(X)
        if(missingl) {
          repfun <- function(y) if(is.character(y)) {
            nas <- is.na(y)
            y <- scv(y, nas, missing, set, vind1 = TRUE)
            z <- scv(y, 1L, y[1L], vind1 = TRUE) # Copy
            scv(y, nas, default, TRUE, TRUE, vind1 = TRUE)
            for(i in seqarg) scv(y, grepl(nam[i], z, ignore.case, FALSE, fixed), args[[i]], TRUE, vind1 = TRUE)
            y
          } else y
        } else {
          repfun <- function(y) if(is.character(y)) {
            z <- scv(y, 1L, y[1L], vind1 = TRUE) # Copy
            y <- scv(y, seq_along(y), default, set, vind1 = TRUE)  # Initialize all to default
            for(i in seqarg) scv(y, grepl(nam[i], z, ignore.case, FALSE, fixed), args[[i]], TRUE, vind1 = TRUE)
            y
          } else y
        }
      }
    }

  } else {
    if(arglen == 1L) {
      args <- args[[1L]]
      if(is.null(default)) {
        if(missingl) {
          repfun <- function(y) if(is.character(y)) {
            z <- scv(y, NA, missing, set) # y[is.na(y)] <- missing
            scv(z, nam, args, TRUE) # `[<-`(y, y == nam, value = args)
          } else y
        } else {
          repfun <- function(y) if(is.character(y)) scv(y, nam, args, set) else y # `[<-`(y, y == nam, value = args)
        }
      } else {
        nr <- if(is.atomic(X)) NROW(X) else fnrow(X)
        if(missingl) {
          repfun <- function(y) if(is.character(y)) {
            nas <- is.na(y)
            z <- scv(y, nas, missing, set, vind1 = TRUE)
            ind <- whichv(z, nam)
            scv(z, nas, default, TRUE, TRUE, vind1 = TRUE) # duplAttributes(alloc(default, nr), y)
            scv(z, ind, args, TRUE, vind1 = TRUE) # y == nam
          } else y
        } else {
          repfun <- function(y) if(is.character(y)) scv(scv(y, nam, default, set, TRUE), nam, args, TRUE) else y # `[<-`(duplAttributes(alloc(default, nr), y), y == nam, value = args)
        }
      }
    } else {
      seqarg <- seq_len(arglen)
      if(is.null(default)) {
        repfun <- function(y) if(is.character(y)) {
          if(missingl) y <- scv(y, NA, missing, set) # y[is.na(y)] <- missing
          else if(!set) y <- scv(y, 1L, y[1L], vind1 = TRUE) # copy
          z <- scv(y, 1L, y[1L], vind1 = TRUE) # copy
          for(i in seqarg) scv(y, whichv(z, nam[i]), args[[i]], TRUE, vind1 = TRUE)
          y
        } else y
      } else {
        nr <- if(is.atomic(X)) NROW(X) else fnrow(X)
        if(missingl) {
          repfun <- function(y) if(is.character(y)) {
            nas <- is.na(y)
            y <- scv(y, nas, missing, set, vind1 = TRUE)
            z <- scv(y, 1L, y[1L], vind1 = TRUE) # Copy
            scv(y, nas, default, TRUE, TRUE, vind1 = TRUE)
            for(i in seqarg) scv(y, whichv(z, nam[i]), args[[i]], TRUE, vind1 = TRUE)
            y
          } else y
        } else {
          repfun <- function(y) if(is.character(y)) {
            z <- scv(y, 1L, y[1L], vind1 = TRUE) # Copy
            y <- scv(y, nam[1L], default, set, TRUE) # duplAttributes(alloc(default, nr), y)
            scv(y, nam[1L], args[[1L]], TRUE)
            for(i in seqarg[-1L]) scv(y, whichv(z, nam[i]), args[[i]], TRUE, vind1 = TRUE)
            y
          } else y
        }
      }
    }
  }

  if(is.list(X)) {
    if(set) {
      lapply(unattrib(X), repfun)
      return(invisible(X))
    }
    res <- duplAttributes(lapply(unattrib(X), repfun), X)
    return(if(inherits(X, "data.table")) alc(res) else res)
  }
  if(!is.character(X)) stop("X needs to be character or a list")
  res <- repfun(X)
  return(if(set) invisible(res) else res)
}


na_locf <- function(x, set = FALSE) .Call(C_na_locf, x, set)
na_focb <- function(x, set = FALSE) .Call(C_na_focb, x, set)

na_locf_ph <- function(x, ph1, ph2, set = FALSE) .Call(C_na_locf, x, set)
na_focb_ph <- function(x, ph1, ph2, set = FALSE) .Call(C_na_focb, x, set)

replace_na <- function(X, value = 0L, cols = NULL, set = FALSE, type = "const") {
  FUN <- switch(type, const =, value = scv, locf = na_locf_ph, focb = na_focb_ph,
                stop("Unknown type:", type))
  if(set) {
    if(is.list(X)) {
      if(is.null(cols)) {
        lapply(unattrib(X), FUN, NA, value, TRUE)
      } else if(is.function(cols)) {
        lapply(unattrib(X), function(y) if(cols(y)) FUN(y, NA, value, TRUE) else y)
      } else {
        cols <- cols2int(cols, X, attr(X, "names"), FALSE)
        lapply(unattrib(X)[cols], FUN, NA, value, TRUE)
      }
    } else FUN(X, NA, value, TRUE) # `[<-`(X, is.na(X), value = value)
    return(invisible(X))
  }
  if(is.list(X)) {
    if(is.null(cols)) return(condalc(duplAttributes(lapply(unattrib(X), FUN, NA, value), X), inherits(X, "data.table"))) # function(y) `[<-`(y, is.na(y), value = value)
    if(is.function(cols)) return(condalc(duplAttributes(lapply(unattrib(X),
      function(y) if(cols(y)) FUN(y, NA, value) else y), X), inherits(X, "data.table")))
    clx <- oldClass(X)
    oldClass(X) <- NULL
    cols <- cols2int(cols, X, names(X), FALSE)
    X[cols] <- lapply(unattrib(X[cols]), FUN, NA, value) #  function(y) `[<-`(y, is.na(y), value = value)
    return(condalc(`oldClass<-`(X, clx), any(clx == "data.table")))
  }
  FUN(X, NA, value) # `[<-`(X, is.na(X), value = value)
}

replace_NA <- replace_na

# Remove Inf (Infinity) and NaN (Not a number) from vectors or data frames:
replace_inf <- function(X, value = NA, replace.nan = FALSE, set = FALSE) {
  if(set) {
    if(is.list(X)) {
      lapply(unattrib(X), if(replace.nan) (function(y) if(is.numeric(y)) scv(y, is.infinite(y) | is.nan(y), value, TRUE, vind1 = TRUE) else y) else
                                          (function(y) if(is.numeric(y)) scv(y, is.infinite(y), value, TRUE, vind1 = TRUE) else y))
    }
    if(!is.numeric(X)) stop("Infinite values can only be replaced in numeric objects!")
    if(replace.nan) scv(X, is.infinite(X) | is.nan(X), value, TRUE, vind1 = TRUE) else scv(X, is.infinite(X), value, TRUE, vind1 = TRUE)
    return(invisible(X))
  }
  if(is.list(X)) {
    # if(!inherits(X, "data.frame")) stop("replace_non_finite only works with atomic objects or data.frames")
    res <- duplAttributes(lapply(unattrib(X),
             if(replace.nan) (function(y) if(is.numeric(y)) scv(y, is.infinite(y) | is.nan(y), value, vind1 = TRUE) else y) else
                             (function(y) if(is.numeric(y)) scv(y, is.infinite(y), value, vind1 = TRUE) else y)), X)
    return(if(inherits(X, "data.table")) alc(res) else res)
  }
  if(!is.numeric(X)) stop("Infinite values can only be replaced in numeric objects!")
  if(replace.nan) return(scv(X, is.infinite(X) | is.nan(X), value, vind1 = TRUE)) #  !is.finite(X) also replaces NA
  scv(X, is.infinite(X), value, vind1 = TRUE)
}

replace_Inf <- replace_inf

# replace_non_finite <- function(X, value = NA, replace.nan = TRUE) {
#   .Deprecated("replace_Inf")
#   replace_Inf(X, value, replace.nan)
# }

Crepoutl <- function(x, limits, value, single_limit, set = FALSE) .Call(C_replace_outliers, x, limits, value, single_limit, set)

sd_limits <- function(x, limits) {
  st <- fbstatsCpp(x, stable.algo = FALSE, setn = FALSE)
  st[2L] + st[3L] * c(-limits, limits)
}

mad_limits <- function(x, limits) {
  med <- fmedian.default(x)
  mad <- fmedian.default(abs(x - med))
  med + mad * c(-limits, limits)
}

# scaling data using MAD
mad_trans <- function(x) {
  if(inherits(x, c("pseries", "pdata.frame"))) {
    g <- GRP(x)
    tmp <- fmedian(x, g, TRA = "-")
    tmp %/=% fmedian(if(is.list(tmp)) lapply(tmp, abs) else abs(tmp), g, TRA = "fill", set = TRUE)
    return(tmp)
  }
  tmp <- fmedian(x, TRA = "-")
  tmp %/=% fmedian(if(is.list(tmp)) dapply(tmp, abs) else abs(tmp), TRA = "fill", set = TRUE)
  return(tmp)
}

replace_outliers <- function(X, limits, value = NA,
                             single.limit = c("sd", "mad", "min", "max"),
                             ignore.groups = FALSE,
                             set = FALSE) {

  if(length(limits) == 1L) {
   # "overall_" arguments are legacy, now accommodated via the ignore.groups argument
   sl <- switch(single.limit[1L], SDs = 4L, min = 2L, max = 3L,
                overall_SDs = 5L, sd = 4L, mad = 6L,
                MADs = 6L, overall_MADs = 7L, # Just in case
                stop("Unknown single.limit option: ", single.limit[1L]))
   if(sl == 5L || sl == 7L) ignore.groups <- TRUE
  } else sl <- 0L

  if(sl > 3L) { # Outliers according to standard deviation or MAD threshold
    if(is.list(X)) {
      if(!ignore.groups && inherits(X, c("grouped_df", "pdata.frame"))) {
        if(is.character(value)) stop("clipping is not yet supported with grouped/panel data and SDs/MADs thresholds.")
        num <- .Call(C_vtypes, X, 1L) # vapply(unattrib(X), is.numeric, TRUE)
        num <- if(inherits(X, "grouped_df")) num & !fgroup_vars(X, "logical") else
          num & attr(findex(X), "names") %!in% attr(X, "names")
        clx <- oldClass(X)
        STDXnum <- if(sl > 5L) mad_trans(fcolsubset(X, num)) else fscale(fcolsubset(X, num))
        oldClass(X) <- NULL
        res <- .mapply(function(z, y) scv(z, abs(y) > limits, value, set, vind1 = TRUE),
                       list(unattrib(X[num]), unattrib(STDXnum)), NULL)
        if(set) return(invisible(X))
        X[num] <- res
        res <- `oldClass<-`(X, clx)
      } else {
        limit_fun <- if(sl > 5L) mad_limits else sd_limits
        res <- lapply(unattrib(X), function(y) if(is.numeric(y)) Crepoutl(y, limit_fun(y, limits), value, sl, set) else y)
        if(set) return(invisible(X))
        res <- duplAttributes(res, X)
      }
      return(if(inherits(res, "data.table")) alc(res) else res)
    }
    if(is.matrix(X)) {
      if(is.character(value)) stop("clipping is not yet supported with matrices and SDs/MADs thresholds.")
      res <- scv(X, abs(if(sl > 5L) mad_trans(X) else fscale(X)) > limits, value, set, vind1 = TRUE)
    } else {
      res <- Crepoutl(X, if(sl > 5L) mad_limits(X, limits) else sd_limits(X, limits), value, sl, set)
    }
    return(if(set) invisible(res) else res)
  }

  # Standard cases
  if(set) {
    if(is.list(X)) lapply(unattrib(X), function(y) if(is.numeric(y)) Crepoutl(y, limits, value, sl, set) else y) else
      Crepoutl(X, limits, value, sl, set)
    return(invisible(X))
  }

  if(is.list(X)) {
    res <- duplAttributes(lapply(unattrib(X), function(y) if(is.numeric(y)) Crepoutl(y, limits, value, sl, set) else y), X)
    return(if(inherits(res, "data.table")) alc(res) else res)
  }

  Crepoutl(X, limits, value, sl, set)
}



# pad or fpad? x is vector, matrix or data.frame
pad_atomic <- function(x, i, n, value) {
  ax <- attributes(x)
  tx <- typeof(x)
  if(typeof(value) != tx) value <- as.vector(value, tx)
  if(is.matrix(x)) {
    k <- dim(x)[2L]
    m <- .Call(C_alloc, value, n * k, TRUE)  # matrix(value, n, k)
    dim(m) <- c(n, k)
    m[i, ] <- x
    if(length(ax) == 1L) return(m)
    ax[["dim"]] <- c(n, k)
    # Could also pad row-names? perhaps with names of i ??
    if(length(ax[["dimnames"]][[1L]])) ax[["dimnames"]] <- list(NULL, ax[["dimnames"]][[2L]])
    if(is.object(x)) ax[["class"]] <- NULL
    return(`attributes<-`(m, ax)) # fastest ??
  }
  r <- .Call(C_alloc, value, n, TRUE) # matrix(value, n) # matrix is faster than rep_len !!!!
  r[i] <- x
  if(is.null(ax)) return(r)
  if(length(names(x))) {
    if(length(ax) == 1L) return(r)
    ax[["names"]] <- NULL
  }
  return(`attributes<-`(r, ax))
}

# microbenchmark::microbenchmark(x[-i] <- ri, x[i2] <- ri)
# Unit: milliseconds
# expr       min       lq     mean   median       uq       max neval cld
# x[-i] <- ri 255.16654 420.7083 491.7369 446.0340 476.3324 1290.7396   100   b
# x[i2] <- ri  80.18755 136.8012 157.0027 146.8156 166.7158  311.5526   100  a
# microbenchmark::microbenchmark(seq_along(x)[-i])
# Unit: milliseconds
# expr      min       lq     mean   median       uq      max neval
# seq_along(x)[-i] 506.0745 541.7975 605.0245 567.8115 585.8384 1341.035   100

pad <- function(X, i, value = NA, method = c("auto", "xpos", "vpos")) { # 1 - i is same length as X, fill missing, 2 - i is positive: insert missing values in positions
  ilog <- is.logical(i)
  ineg <- i[1L] < 0L
  n <- if(is.list(X) || is.matrix(X)) fnrow(X) else length(X)
  xpos <- switch(method[1L], auto = if(ilog) bsum(i) == n else if(ineg) FALSE else length(i) == n,
                 xpos = TRUE, vpos = FALSE, stop("Unknown method: ", method[1L]))
  n <- if(ilog) length(i) else if(xpos && !ineg) bmax(i) else n + length(i)
  if(is.atomic(X)) return(pad_atomic(X, if(xpos || ineg) i else if(ilog) !i else -i, n, value))
  if(!is.list(X)) stop("X must be atomic or a list")
  if(ilog) {
    i <- if(xpos) which(i) else whichv(i, FALSE)
  } else if(!xpos) {
    i <- seq_len(n)[if(ineg) i else -i]
  }
  ax <- attributes(X)
  attributes(X) <- NULL
  res <- lapply(X, pad_atomic, i, n, value)
  if(length(ax[["row.names"]])) ax[["row.names"]] <- .set_row_names(n)
  return(condalcSA(res, ax, any(ax[["class"]] == "data.table")))
}

# Something like this already exists?? -> should work with lists as well...
SebKrantz/collapse documentation built on Dec. 16, 2024, 7:26 p.m.