collapse: Advanced and Fast Data Transformation

# todo speed up: use internals of order ??
collap <- function(X, by, FUN = fmean, catFUN = fmode, cols = NULL, custom = NULL,
                   keep.by = TRUE, keep.col.order = TRUE, sort.row = TRUE,
                   parallel = FALSE, mc.cores = 1L,
                   return = c("wide","list","long","long_dupl"), give.names = "auto", ...) {

  return <- switch(return[1L], wide = 1L, list = 2L, long = 3L, long_dupl = 4L, stop("Unknown return output option"))
  widel <- return == 1L
  customl <- !is.null(custom)
  if(!inherits(X, "data.frame")) X <- qDF(X)
  ax <- attributes(X)
  class(X) <- NULL
  nam <- names(X)
  # attributes(X) <- NULL
  # attr(X, "class") <- "data.frame" # class needed for method dispatch of fast functions, not for BY !!

  aplyfun <- if(parallel) function(...) mclapply(..., mc.cores = mc.cores) else lapply

  # identifying by and cols
  vl <- TRUE
  bycalll <- is.call(by)
  if(bycalll) {
      if(length(by) == 3L) {
        v <- logical(length(X))
        v[ckmatch(all.vars(by[[2L]]), nam)] <- TRUE  # nam %in% all.vars(by[[2L]])
        namby <- all.vars(by[[3L]])
        numby <- ckmatch(namby, nam)
      } else {
        namby <- all.vars(by)
        numby <- ckmatch(namby, nam)
        if(!customl) {
          v <- if(is.null(cols)) !logical(length(X)) else cols2log(cols, X, nam)
          v[numby] <- FALSE
        }
      }
      if(length(numby) == 1L) by <- qF(X[[numby]], ordered = sort.row, na.exclude = FALSE) else
                              by <- GRP.default(X, numby, sort = sort.row, return.groups = keep.by)
  } else if(is.atomic(by)) {
    namby <- deparse(substitute(by))
    numby <- 1L
    if(!customl) if(is.null(cols)) vl <- FALSE else v <- cols2log(cols, X, nam)
    if(!is.factor(by)) by <- qF(by, ordered = sort.row, na.exclude = FALSE)
  } else {
    if(!customl) if(is.null(cols)) vl <- FALSE else v <- cols2log(cols, X, nam)
    if(!is.GRP(by)) {
      numby <- seq_along(by)
      namby <- attr(by, "names") # faster if and only if by is a data.frame
      if(is.null(namby)) namby <- paste0("GRP.", numby)
      by <- GRP.default(by, numby, sort = sort.row, return.groups = keep.by)
    } else {
      namby <- by[[5L]]
      if(is.null(namby)) namby <- paste0("GRP.", seq_along(by[[4L]])) # necessary ??
      numby <- seq_along(namby)
    }
  }

  if(!customl) {

    # identifying data
    nu <- vapply(X, is.numeric, TRUE, USE.NAMES = FALSE)
    if(vl) {
      nnu <- which(!nu & v) # faster way ??
      nu <- which(nu & v)
    } else {
      nnu <- which(!nu)
      nu <- which(nu)
    }
    nul <- length(nu) > 0L
    nnul <- length(nnu) > 0L

    # Identifying FUN and catFUN:
    if(nul) if(is.character(FUN)) {
      # FUN <- unlist(strsplit(FUN,",",fixed = TRUE), use.names = FALSE)
      namFUN <- FUN
      FUN <- if(length(FUN) > 1L) lapply(FUN, match.fun, descend = FALSE) else
                                  match.fun(FUN, descend = FALSE)
    } else if(is.list(FUN)) {
      namFUN <- names(FUN)
      if(is.null(namFUN)) namFUN <- all.vars(substitute(FUN))
    } else namFUN <- l1orn(as.character(substitute(FUN)), "FUN") # Faster !!

    if(nnul) if(is.character(catFUN)) {
      # catFUN <- unlist(strsplit(catFUN,",",fixed = TRUE), use.names = FALSE)
      namcatFUN <- catFUN
      catFUN <- if(length(catFUN) > 1L) lapply(catFUN, match.fun, descend = FALSE) else
                                        match.fun(catFUN, descend = FALSE)
    } else if(is.list(catFUN)) {
      namcatFUN <- names(catFUN)
      if(is.null(namcatFUN)) namcatFUN <- all.vars(substitute(catFUN))
    } else namcatFUN <- l1orn(as.character(substitute(catFUN)), "catFUN") # Faster !!

    if(give.names == "auto") give.names <- !widel || length(FUN) > 1L || length(catFUN) > 1L

    # Aggregator function ! # drop level of nesting i.e. make rest length(by)+length(FUN)+length(catFUN)  ??
    agg <- function(xnu, xnnu) { #by, FUN, namFUN, catFUN, namcatFUN, drop.by
      lr <- nul + nnul + keep.by
      res <- vector("list", lr)
        if(keep.by) {
          res[[1L]] <- if(is.atomic(by)) list(`names<-`(list(unique_factor(by)), namby)) else list(by[[4L]]) # nah... could add later using "c" ??
          ind <- 2L
        } else ind <- 1L
        if(nul) {
          fFUN <- namFUN %in% .FAST_STAT_FUN
          if(is.list(FUN))
            res[[ind]] <- condsetn(aplyfun(seq_along(namFUN), function(i)
                          if(fFUN[i]) FUN[[i]](xnu, by, ..., use.g.names = FALSE) else
                          BY.data.frame(xnu, by, FUN[[i]], ..., use.g.names = FALSE)), namFUN, give.names) else
            res[[ind]] <- if(fFUN) condsetn(list(FUN(xnu, by, ..., use.g.names = FALSE)), namFUN, give.names) else # give.names || !widel
                          condsetn(list(BY.data.frame(xnu, by, FUN, ..., use.g.names = FALSE, parallel = parallel, mc.cores = mc.cores)), namFUN, give.names) # give.names || !widel
        }
        if(nnul) {
          fcatFUN <- namcatFUN %in% .FAST_STAT_FUN
          if(is.list(catFUN))
            res[[lr]] <- condsetn(aplyfun(seq_along(namcatFUN), function(i)
                         if(fcatFUN[i]) catFUN[[i]](xnnu, by, ..., use.g.names = FALSE) else
                         BY.data.frame(xnnu, by, catFUN[[i]], ..., use.g.names = FALSE)), namcatFUN, give.names) else
            res[[lr]] <- if(fcatFUN) condsetn(list(catFUN(xnnu, by, ..., use.g.names = FALSE)), namcatFUN, give.names) else # give.names || !widel
                         condsetn(list(BY.data.frame(xnnu, by, catFUN, ..., use.g.names = FALSE, parallel = parallel, mc.cores = mc.cores)), namcatFUN, give.names) # give.names || !widel
        }
      return(res)
    } # fastest isung res list ?? or better combine at the end ??
    res <- agg(if(nul) `oldClass<-`(X[nu], "data.frame") else NULL, if(nnul) `oldClass<-`(X[nnu], "data.frame") else NULL) #colsubset(X, nu)

    if(keep.col.order && widel) o <- forder.int(c(if(!keep.by) NULL else if(!bycalll) rep(0L,length(numby)) else numby,
                                                  if(nul) rep(nu,length(FUN)) else NULL,
                                                  if(nnul) rep(nnu,length(catFUN)) else NULL)) # , method = "radix")

  } else { # custom aggregation:
    if(give.names == "auto") give.names <- TRUE
    namFUN <- names(custom)
    if(!is.list(custom) || is.null(namFUN)) stop("custom needs to be a named list, see ?collap")
    fFUN <- namFUN %in% .FAST_STAT_FUN
    if(!keep.by) {
      res <- vector("list", 1L)
      ind <- 1L
    } else {
      res <- vector("list", 2L)
      res[[1L]] <- if(is.atomic(by)) list(`names<-`(list(unique_factor(by)), namby)) else list(by[[4L]]) # nah... could add later using "c" ??
      ind <- 2L
    }
    custom <- lapply(custom, cols2int, X, nam) # could integrate below, but then reorder doesn't work !!
    #lx <- length(X)
    # custom <- lapply(custom, function(x) if(is.numeric(x) && max(abs(x)) <= lx)
    #                          x else if(is.character(x)) anyNAerror(match(x, nam), "Unknown column names!") else
    #                          stop("custom list content must be variable names or suitable column indices"))

    res[[ind]] <- condsetn(aplyfun(seq_along(namFUN), function(i)
            if(fFUN[i]) match.fun(namFUN[i])(`oldClass<-`(X[custom[[i]]], "data.frame"), by, ..., use.g.names = FALSE) else
            BY.data.frame(X[custom[[i]]], by, namFUN[i], ..., use.g.names = FALSE)), namFUN, give.names)
    if(keep.col.order && widel) {
      o <- unlist(custom, use.names = FALSE)
      o <- forder.int(c(if(!keep.by) NULL else if(!bycalll) rep(0L,length(numby)) else numby, o)) #, method = "radix")
    }
  }
  if(widel) res <- unlist(unlist(res, FALSE), FALSE) else {
    if(length(FUN) > 1L || length(catFUN) > 1L || length(custom) > 1L) {
      res <- unlist(res, FALSE)
      if(return == 2L) {
        ax[["row.names"]] <- if(is.list(by)) .set_row_names(by[[1L]]) else .set_row_names(length(res[[1L]]))  # fnlevels(by) best ??
        if(!keep.by) return(lapply(res, function(e) {
          ax[["names"]] <- names(e)
          return(setAttributes(e, ax)) })) else
          return(lapply(res[-1L], function(e) {
              ax[["names"]] <- c(namby, names(e))
              setAttributes(c(res[[1L]], e), ax) }))
      } else {
        if(return != 4L) {
          res <- if(!keep.by) .Call(C_rbindlist, res, TRUE, TRUE, "Function") else # data.table:::Crbindlist
                 .Call(C_rbindlist, lapply(res[-1L], function(e) c(res[[1L]], e)), TRUE, TRUE, "Function")
        } else {
          if(!(nul && nnul) || customl) stop("long_dupl is only meaningful for aggregations with both numeric and categorical data, and multiple functions used for only one of the two data types!")
          mFUN <- length(FUN) > 1L
          nid <- if(mFUN) length(res) else 2L-!keep.by
          if(!keep.by) {
            res <- if(mFUN) lapply(res[-nid], function(e) c(e, res[[nid]])) else
                            lapply(res[-nid], function(e) c(res[[nid]], e))
          } else res <- if(mFUN) lapply(res[-c(nid, 1L)], function(e) c(res[[1L]], e, res[[nid]])) else
                                 lapply(res[-c(nid, 1L)], function(e) c(res[[1L]], res[[nid]], e))
          res <- .Call(C_rbindlist, res, FALSE, FALSE, "Function")
        }
        if(keep.col.order)  o <- forder.int(c(0L, if(!keep.by) NULL else if(!bycalll) rep(0L,length(numby)) else numby, nu, nnu)) # , method = "radix")
      }
    } else message("return options other than 'wide' are only meaningful if multiple functions are used!")
  }

  if(keep.col.order) .Call(C_setcolorder, res, o) # data.table:::Csetcolorder
  ax[["names"]] <- names(res)
  ax[["row.names"]] <- .set_row_names(length(res[[1L]]))
  return(setAttributes(res, ax))
}

# collapv: allows vector input to by !!!
collapv <- function(X, by, FUN = fmean, catFUN = fmode, cols = NULL, custom = NULL,
                   keep.by = TRUE, keep.col.order = TRUE, sort.row = TRUE,
                   parallel = FALSE, mc.cores = 1L,
                   return = c("wide","list","long","long_dupl"), give.names = "auto", ...) {

  return <- switch(return[1L], wide = 1L, list = 2L, long = 3L, long_dupl = 4L, stop("Unknown return output option"))
  widel <- return == 1L
  customl <- !is.null(custom)
  if(!inherits(X, "data.frame")) X <- qDF(X)
  ax <- attributes(X)
  class(X) <- NULL
  nam <- names(X)
  # attributes(X) <- NULL
  # attr(X, "class") <- "data.frame" # class needed for method dispatch of fast functions, not for BY !!

  aplyfun <- if(parallel) function(...) mclapply(..., mc.cores = mc.cores) else lapply

  # identifying by
  numby <- cols2int(by, X, nam)
  namby <- nam[numby]
  by <- if(length(numby) == 1L) qF(X[[numby]], ordered = sort.row, na.exclude = FALSE) else
         GRP.default(X, numby, sort = sort.row, return.groups = keep.by)

  if(!customl) {

    v <- if(is.null(cols)) !logical(length(X)) else cols2log(cols, X, nam)
    v[numby] <- FALSE

    # identifying data
    nu <- vapply(X, is.numeric, TRUE, USE.NAMES = FALSE)
    nnu <- which(!nu & v) # faster way ??
    nu <- which(nu & v)
    nul <- length(nu) > 0L
    nnul <- length(nnu) > 0L

    # Identifying FUN and catFUN:
    if(nul) if(is.character(FUN)) {
      # FUN <- unlist(strsplit(FUN,",",fixed = TRUE), use.names = FALSE)
      namFUN <- FUN
      FUN <- if(length(FUN) > 1L) lapply(FUN, match.fun, descend = FALSE) else
        match.fun(FUN, descend = FALSE)
    } else if(is.list(FUN)) {
      namFUN <- names(FUN)
      if(is.null(namFUN)) namFUN <- all.vars(substitute(FUN))
    } else namFUN <- l1orn(as.character(substitute(FUN)), "FUN") # Faster !!

    if(nnul) if(is.character(catFUN)) {
      # catFUN <- unlist(strsplit(catFUN,",",fixed = TRUE), use.names = FALSE)
      namcatFUN <- catFUN
      catFUN <- if(length(catFUN) > 1L) lapply(catFUN, match.fun, descend = FALSE) else
        match.fun(catFUN, descend = FALSE)
    } else if(is.list(catFUN)) {
      namcatFUN <- names(catFUN)
      if(is.null(namcatFUN)) namcatFUN <- all.vars(substitute(catFUN))
    } else namcatFUN <- l1orn(as.character(substitute(catFUN)), "catFUN") # Faster !!

    if(give.names == "auto") give.names <- !widel || length(FUN) > 1L || length(catFUN) > 1L

    # Aggregator function ! # drop level of nesting i.e. make rest length(by)+length(FUN)+length(catFUN)  ??
    agg <- function(xnu, xnnu) { #by, FUN, namFUN, catFUN, namcatFUN, drop.by
      lr <- nul + nnul + keep.by
      res <- vector("list", lr)
      if(keep.by) {
        res[[1L]] <- if(is.atomic(by)) list(`names<-`(list(unique_factor(by)), namby)) else list(by[[4L]]) # nah... could add later using "c" ??
        ind <- 2L
      } else ind <- 1L
      if(nul) {
        fFUN <- namFUN %in% .FAST_STAT_FUN
        if(is.list(FUN))
          res[[ind]] <- condsetn(aplyfun(seq_along(namFUN), function(i)
            if(fFUN[i]) FUN[[i]](xnu, by, ..., use.g.names = FALSE) else
              BY.data.frame(xnu, by, FUN[[i]], ..., use.g.names = FALSE)), namFUN, give.names) else
                res[[ind]] <- if(fFUN) condsetn(list(FUN(xnu, by, ..., use.g.names = FALSE)), namFUN, give.names) else # give.names || !widel
                  condsetn(list(BY.data.frame(xnu, by, FUN, ..., use.g.names = FALSE, parallel = parallel, mc.cores = mc.cores)), namFUN, give.names) # give.names || !widel
      }
      if(nnul) {
        fcatFUN <- namcatFUN %in% .FAST_STAT_FUN
        if(is.list(catFUN))
          res[[lr]] <- condsetn(aplyfun(seq_along(namcatFUN), function(i)
            if(fcatFUN[i]) catFUN[[i]](xnnu, by, ..., use.g.names = FALSE) else
              BY.data.frame(xnnu, by, catFUN[[i]], ..., use.g.names = FALSE)), namcatFUN, give.names) else
                res[[lr]] <- if(fcatFUN) condsetn(list(catFUN(xnnu, by, ..., use.g.names = FALSE)), namcatFUN, give.names) else # give.names || !widel
                  condsetn(list(BY.data.frame(xnnu, by, catFUN, ..., use.g.names = FALSE, parallel = parallel, mc.cores = mc.cores)), namcatFUN, give.names) # give.names || !widel
      }
      return(res)
    } # fastest isung res list ?? or better combine at the end ??
    res <- agg(if(nul) `oldClass<-`(X[nu], "data.frame") else NULL, if(nnul) `oldClass<-`(X[nnu], "data.frame") else NULL) #colsubset(X, nu)

    if(keep.col.order && widel) o <- forder.int(c(if(!keep.by) NULL else numby,
                                                  if(nul) rep(nu,length(FUN)) else NULL,
                                                  if(nnul) rep(nnu,length(catFUN)) else NULL)) # , method = "radix")

  } else { # custom aggregation:
    if(give.names == "auto") give.names <- TRUE
    namFUN <- names(custom)
    if(!is.list(custom) || is.null(namFUN)) stop("custom needs to be a named list, see ?collap")
    fFUN <- namFUN %in% .FAST_STAT_FUN
    if(!keep.by) {
      res <- vector("list", 1L)
      ind <- 1L
    } else {
      res <- vector("list", 2L)
      res[[1L]] <- if(is.atomic(by)) list(`names<-`(list(unique_factor(by)), namby)) else list(by[[4L]]) # nah... could add later using "c" ??
      ind <- 2L
    }
    custom <- lapply(custom, cols2int, X, nam) # could integrate below, but then reorder doesn't work !!
    #lx <- length(X)
    # custom <- lapply(custom, function(x) if(is.numeric(x) && max(abs(x)) <= lx)
    #                          x else if(is.character(x)) anyNAerror(match(x, nam), "Unknown column names!") else
    #                          stop("custom list content must be variable names or suitable column indices"))

    res[[ind]] <- condsetn(aplyfun(seq_along(namFUN), function(i)
      if(fFUN[i]) match.fun(namFUN[i])(`oldClass<-`(X[custom[[i]]], "data.frame"), by, ..., use.g.names = FALSE) else
        BY.data.frame(X[custom[[i]]], by, namFUN[i], ..., use.g.names = FALSE)), namFUN, give.names)
    if(keep.col.order && widel) {
      o <- unlist(custom, use.names = FALSE)
      o <- forder.int(c(if(!keep.by) NULL else numby, o)) # , method = "radix")
    }
  }
  if(widel) res <- unlist(unlist(res, FALSE), FALSE) else {
    if(length(FUN) > 1L || length(catFUN) > 1L || length(custom) > 1L) {
      res <- unlist(res, FALSE)
      if(return == 2L) {
        ax[["row.names"]] <- if(is.list(by)) .set_row_names(by[[1L]]) else .set_row_names(length(res[[1L]]))  # fnlevels(by) best ??
        if(!keep.by) return(lapply(res, function(e) {
          ax[["names"]] <- names(e)
          return(setAttributes(e, ax)) })) else
            return(lapply(res[-1L], function(e) {
              ax[["names"]] <- c(namby, names(e))
              setAttributes(c(res[[1L]], e), ax) }))
      } else {
        if(return != 4L) {
          res <- if(!keep.by) .Call(C_rbindlist, res, TRUE, TRUE, "Function") else # data.table:::Crbindlist
            .Call(C_rbindlist, lapply(res[-1L], function(e) c(res[[1L]], e)), TRUE, TRUE, "Function")
        } else {
          if(!(nul && nnul) || customl) stop("long_dupl is only meaningful for aggregations with both numeric and categorical data, and multiple functions used for only one of the two data types!")
          mFUN <- length(FUN) > 1L
          nid <- if(mFUN) length(res) else 2L-!keep.by
          if(!keep.by) {
            res <- if(mFUN) lapply(res[-nid], function(e) c(e, res[[nid]])) else
              lapply(res[-nid], function(e) c(res[[nid]], e))
          } else res <- if(mFUN) lapply(res[-c(nid, 1L)], function(e) c(res[[1L]], e, res[[nid]])) else
            lapply(res[-c(nid, 1L)], function(e) c(res[[1L]], res[[nid]], e))
          res <- .Call(C_rbindlist, res, FALSE, FALSE, "Function")
        }
        if(keep.col.order)  o <- forder.int(c(0L, if(!keep.by) NULL else numby, nu, nnu)) # , method = "radix")
      }
    } else message("return options other than 'wide' are only meaningful if multiple functions are used!")
  }

  if(keep.col.order) .Call(C_setcolorder, res, o) # data.table:::Csetcolorder
  ax[["names"]] <- names(res)
  ax[["row.names"]] <- .set_row_names(length(res[[1L]]))
  return(setAttributes(res, ax))
}

# For dplyr integration !!!
collapg <- function(X, FUN = fmean, catFUN = fmode, cols = NULL, custom = NULL,
                    keep.group_vars = TRUE, keep.col.order = TRUE, sort.row = TRUE,
                    parallel = FALSE, mc.cores = 1L,
                    return = c("wide","list","long","long_dupl"), give.names = "auto", ...) {
  by <- GRP.grouped_df(X)
  ngn <- attr(X, "names") %!in% by[[5L]] # Note: this always leaves grouping columns on the left still !!
  clx <- class(X)
  attr(X, "groups") <- NULL
  oldClass(X) <- clx[clx != "grouped_df"]
  if(is.function(FUN)) FUN <- `names<-`(list(FUN), l1orn(as.character(substitute(FUN)), "FUN")) else
    if(is.list(FUN) && is.null(names(FUN))) names(FUN) <- all.vars(substitute(FUN))
  if(is.function(catFUN)) catFUN <- `names<-`(list(catFUN), l1orn(as.character(substitute(catFUN)), "catFUN")) else
    if(is.list(catFUN) && is.null(names(catFUN))) names(catFUN) <- all.vars(substitute(catFUN))

  return(collap(fcolsubset(X, ngn), by, FUN, catFUN, cols, custom,
                keep.group_vars, keep.col.order, sort.row, parallel,
                mc.cores, return, give.names, ...))
}
SebKrantz/collapse documentation built on June 12, 2025, 1:44 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
SebKrantz/collapse
Advanced and Fast Data Transformation

misc/legacy/sorted out 1.0.0 - 1.1.0/collap_with_qF.R
In SebKrantz/collapse: Advanced and Fast Data Transformation

R Package Documentation

Browse R Packages

We want your feedback!

SebKrantz/collapse Advanced and Fast Data Transformation

misc/legacy/sorted out 1.0.0 - 1.1.0/collap_with_qF.R In SebKrantz/collapse: Advanced and Fast Data Transformation

R Package Documentation

Browse R Packages

We want your feedback!

SebKrantz/collapse
Advanced and Fast Data Transformation

misc/legacy/sorted out 1.0.0 - 1.1.0/collap_with_qF.R
In SebKrantz/collapse: Advanced and Fast Data Transformation