R/aggregate_by_pkg.R
In SSBtools: Statistics Norway's Miscellaneous Tools

Documented in aggregate_by_pkg

#' Aggregate by base R or data.table
#'
#' This function aggregates data by specified grouping variables, using either base R or `data.table`.
#'
#' @param data A data frame
#' @param by A character vector specifying the column names to group by.
#' @param var A character vector specifying the column names of the variables to be aggregated.
#' @param pkg A character string indicating which package to use for aggregation. 
#' Must be either `"base"` for base R or `"data.table"` for `data.table`. Default is `"base"`.
#' @param include_na A logical value indicating whether `NA` values in the grouping variables should be included in the aggregation. Default is `FALSE`.
#' @param fun The function to be applied for aggregation. Default is `sum`.
#' @param base_order A logical value indicating whether to attempt to return the results in the same order as base R when using `data.table`. 
#'                   Note that while the function strives to maintain this order, it cannot be guaranteed due to potential variations in 
#'                   sorting behavior across different systems. Default is `TRUE`.
#' @param ... Further arguments passed to \code{\link{aggregate}} when `pkg` is `"base"`
#'
#' @return A data.frame containing the aggregated results.
#'
#' @export
#' @importFrom stats complete.cases
#' 
#' @examples
#' d <- SSBtoolsData("d2")[1:20, ]
#' d[[2]] <- as.numeric(d[[2]])
#' d$y <- as.numeric(1:20)
#' d$y[2] <- NA
#' d$county[8:9] <- NA
#' d$main_income[11:12] <- NA
#' d$k_group[19:20] <- NA
#' by <- c("main_income", "county", "k_group")
#'
#' a1 <- aggregate_by_pkg(d, by = by, var = c("y", "freq"))
#' a2 <- aggregate_by_pkg(d, by = by, var = c("y", "freq"), 
#'                        include_na = TRUE)
#' a3 <- aggregate_by_pkg(d, by = by, var = c("y", "freq"), 
#'                        include_na = TRUE, fun = function(x) list(x))
#'  
#' if (requireNamespace("data.table", quietly = TRUE)) {  
#'                        
#'   b1 <- aggregate_by_pkg(d, by = by, var = c("y", "freq"), pkg = "data.table")
#'   b2 <- aggregate_by_pkg(d, by = by, var = c("y", "freq"), pkg = "data.table", 
#'                          include_na = TRUE)
#'   b3 <- aggregate_by_pkg(d, by = by, var = c("y", "freq"), pkg = "data.table", 
#'                          include_na = TRUE, fun = function(x) list(x))                        
#'                        
#'   print(identical(a1, b1))   # TRUE when base_order succeeds
#'   print(identical(a2, b2))
#'   print(identical(a3, b3))
#'   
#' }  else {
#'    print("The 'data.table' package is not installed.")
#' }
#'                         
aggregate_by_pkg <- function(data, 
                             by, 
                             var, 
                             pkg = "base", 
                             include_na = FALSE, 
                             fun = sum, 
                             base_order = TRUE, 
                             ...) {
  if (pkg == "base") {
    
    na_included <- rep(FALSE, length(by))
    
    if (include_na) {
      for (i in seq_along(by)) {
        if (anyNA(data[[by[i]]])) {
          na_included[i] <- TRUE
          if (is.integer(data[[by[i]]])) {
            # Replace NA values in integer columns with a very large integer value
            # This value is set close to the maximum integer value that R can handle.
            data[[by[i]]][is.na(data[[by[i]]])] <- .Machine$integer.max - 3L
          } else if (is.numeric(data[[by[i]]])) {
            # Replace NA values in numeric columns with a very large numeric value
            # This value is set close to the maximum numeric value that R can handle.
            data[[by[i]]][is.na(data[[by[i]]])] <- 1.789e+308
          } else {
            if (is.factor(data[[by[i]]]) | is.logical(data[[by[i]]])) {
              # Convert factor or logical columns to character type to handle NA replacement
              data[[by[i]]] <- as.character(data[[by[i]]])
              warning(paste(names(data)[i], "changed from", class(data[[by[i]]]), "to character"))
            }
            # Replace NA values in character (or converted factor/logical) columns with a string
            # This string is chosen so that it will likely be sorted at the end.
            data[[by[i]]][is.na(data[[by[i]]])] <- "~~~~~~M"
          }
        }
      }
    }
    
    result <- aggregate(data[var], data[by], fun, ...)
    
    if (any(na_included)) {
      for (i in seq_along(by)) {
        if (na_included[i]) {
          if (is.integer(result[[by[i]]])) {
            # Revert the large integer value back to NA after aggregation
            result[[by[i]]][result[[by[i]]] == (.Machine$integer.max - 3L)] <- NA
          } else if (is.numeric(result[[by[i]]])) {
            # Revert the large numeric value back to NA after aggregation
            result[[by[i]]][result[[by[i]]] == 1.789e+308] <- NA
          } else {
            # Revert the placeholder string back to NA after aggregation
            result[[by[i]]][result[[by[i]]] == "~~~~~~M"] <- NA
          }
        }
      }
    }
    
    return(result)
  }
  
  if (pkg == "data.table") {
    
    if (!requireNamespace("data.table", quietly = TRUE)) {
      stop("The 'data.table' package is required but is not installed. Please install it first.")
    }
    
    dt <- data.table::as.data.table(data)  
    if (!include_na) {
      # Remove rows where any of the by-columns have NA if include_na is FALSE
      dt <- dt[complete.cases(dt[, ..by]), ]
    }
    
    # Perform aggregation using data.table, grouping by the specified columns
    dt <- dt[, lapply(.SD, fun), by = by, .SDcols = var]
    
    if (base_order) {
      # Optional sorting to mimic base R order
      sort_order <- do.call(order, dt[, rev(by), with = FALSE])
      dt <- dt[sort_order]
    }
    
    return(as.data.frame(dt))
  }
  
  stop('pkg must be "base" or "data.table"')
}

# To avoid problems when data.table not in Depends
.datatable.aware <- TRUE 

# To avoid check problems
utils::globalVariables(c("..by", ".SD"))