R/helper_functions.R
In fabricatr: Imagine Your Data Before You Collect It

Documented in get_unique_variables_by_level recycle

#' Expands data to a given length through recycling.
#'
#' This function is a helper function designed call \code{rep_len} to expand the
#' length of a data vector, but which can dynamically retrieve N from the
#' surrounding level call for use in fabricatr.
#'
#' @param x Data to recycle into length \code{N}
#' @param .N the length to recycle the data to, typically provided implicitly by
#' a or fabricate call wrapped around the function call.
#' @return A vector of data padded to length \code{N}
#'
#' @examples
#'
#' fabricate(
#'   N = 15,
#'   month = recycle(month.abb)
#' )
#' @export
recycle <- function(x, .N = NULL) {
  if(is.null(.N)) {
    .N <- tryCatch({
      dynGet("N")
    }, error = function(e) {
      NULL
    })

    if(is.null(.N)) {
      stop("You must supply a `.N` argument to `recycle` or run ",
           "`recycle` inside a level call to implicit supply `.N`.")
    }
  }

  rep_len(x, length.out = .N)
}

makeUU <- function() {
  sprintf("%X-%X", as.integer(Sys.time()), sample.int(.Machine$integer.max, 1))
}


import_data_list <- function(data) {
  workspace <- new_environment()

  if(is.null(data)) return(workspace)

  if(is.data.frame(data) || !is.list(data)) data <- list(data)

  for(df in data) {
    # Sanity check that the data we're bringing in is good.
    df <- handle_data(data = df)

    uu <- makeUU()

    workspace[[uu]] <- df

  }

  structure(workspace, active_df=uu, insertion_order=c())
}


#' @importFrom rlang is_quosure
get_symbols_from_quosures <- function(quosures) {

  extract <- function(l_arg) {
    # We have some sort of language expression in R, let's extract
    # the symbols it's going to refer to

    if (is_quosure(l_arg)){
      extract(get_expr(l_arg)) # extract from expression
    } else if (is.symbol(l_arg)) {
      # If it's a symbol, return the symbol as character
      as.character(l_arg)
    } else if (is.language(l_arg)) {
      # If it's a language call, then we need to unpack some more
      # Extract the args from the call, (drop names on arguments)
      recurse <- unname(call_args(l_arg))
      # For each arg, extract
      temp <- lapply(recurse, extract)
      unlist(temp)
    } else {
      # It's something else? This might happen if the base level call
      # is numeric or whatever. We are only interested in variable nanes.
    }
  }

  # For each quosure, what symbols will that quosure attempt to read when it
  # is evaluated?
  meta_results <- lapply(quosures, extract)

  # remove duplicates
  meta_results <- Reduce(union, meta_results)

  return(meta_results)
}


#' Find which variables are unique at a given level in hierarchical data
#'
#' @param data a data.frame
#' @param ID_label the ID label to split upon
#' @param superset Superset contains a vector of character strings that contain variables
#' the modify level call is going to write. Some of these may be columns
#' in the data frame, others might not be. If superset is specified,
#' then we definitely only want to check those variables
#'
#' @return a character vector enumerating the unique variables
#' @keywords internal
get_unique_variables_by_level <- function(data, ID_label, superset=NULL) {
  names_to_check <- setdiff(colnames(data), ID_label)

  if (is_empty(names_to_check)) return(names_to_check)

  # Iterate through each column of interest
  # Per column, split that column's data into a list. The split indices come
  # from the level indicator. Now, run a function which checks the unique
  # length of each tranch. Unlist the result to get a vector of TRUE or FALSE
  # for each tranch of the list. If all tranches are TRUE, then the column has
  # unique values based on the level's level. Take the results per column,
  # unlist those, strip the names (if any) from the variables. Now extract the
  # column names for the columns for which this was true. Return as a vector.

  # Performance is around 22% faster than existing code for small dataset
  level_variables <-
    vapply(
      data[names_to_check],
      function(x) {
        all(
          vapply(
            split(x, data[, ID_label]),
            function(x) {
              length(unique(x)) == 1
            },
            FALSE
          )
        )
      },
      FALSE
    )

  names_to_check[level_variables]
}


# Checks if an ID label is sane, warns or errors if not.
# Generates an ID label if there isn't one provided.
handle_id <- function(ID_label, data=NULL) {
  # If the user passed a symbol, we should evaluate the symbol forcibly and
  # error if they were assuming NSE substitution of an undefined symbol.
  tryCatch(
    force(ID_label),
    error = function(e) {
      stop(
        "The `ID_label` provided is a reference to an undefined variable. ",
        "Please enclose `ID_label` in quotation marks if you intended to ",
        "provide `ID_label` as a character vector."
      )
    }
  )

  if(is.null(ID_label)) return(synthetic_ID(data))

  if(!is_scalar_character(ID_label) && !any(is.na(ID_label)))
    stop("Provided `ID_label` must be a string.")

  ID_label
}

synthetic_ID <- function(data) {

  candidates <- c("ID",  paste0("fab_ID_", 1:5) )

  for(candidate_label in setdiff(candidates, names(data))) {
    return(candidate_label)
  }

  stop(
    "No `ID_label` specified for level and supply of default ID ",
    "labels -- ID, fab_ID_1, fab_ID_2, fab_ID_3, fab_ID_4, fab_ID_5",
    " -- are all used for data columns. Please specify an `ID_label` ",
    "for this level."
  )



}

# Checks if a supplied N is sane for the context it's in
handle_n <- function(N, add_level=TRUE, working_environment, parent_frame_levels=1) {
  # Error handling for user-supplied N

  df <- active_df(working_environment)
  # First, evaluate the N in the context of the working environment's working
  # data frame. Why do we need to do this? Because N could be a function of
  # variables.
  N <- eval_tidy(N, data = df)

  if (!is_integerish(N) || any(N < 0))
    stop("Provided `N` must be positive integers.")

  if(add_level) {
      if(length(N) == 1 && N == 0) stop("New level has N == 0")
      if(!is_scalar_integerish(N)) stop("New level has length(N) > 1 ")
  }
  else { # If not adding then nesting
      if(! length(N) %in% c(1, nrow(df)))
          stop("Nested levels must have `N` of length 1 or same size as parent data frame.")
  }

  N
}

# Checks if the user-provided data is sane
# errors if not.
handle_data <- function(data) {
  if (!is.null(data) & !missing(data) & !"data.frame" %in% class(data)) {
    # User provided data, but it's not 2D
    if (is.null(dim(data))) {
      stop(
        "User provided `data` must be a data frame. Provided `data` was low ",
        "dimensional."
      )
    }

    # User provided data, but it's not a data frame, and they didn't provide
    # it explicitly, so this is probably a mess-up with an implicit argument
    if (!"data" %in% names(sys.call()) &&
      !"data" %in% names(sys.call(-1))) {
      stop(
        "The `data` argument must be a data object. The argument call, ",
        deparse(substitute(data)),
        ", was not a data object (e.g. a data.frame, tibble, sf object, or ",
        "convertible matrix)."
      )
    }

    # Convert user data to a data frame
    tryCatch({
      data <- data.frame(data, stringsAsFactors = FALSE)
    }, error = function(e) {
      # We can't make it a data frame -- this should probably never happen,
      # since it relies on something with a dim attribute not converting to
      # a data frame.
      stop(
        "User provided `data` could not convert to a data frame."
      )
    })
  }
  return(data)
}

# Function to check if something is a level call
call_not_level_call <- function(calls) {
  !vapply(calls, function(i) {
      is_call_simple(i) && is_level_token(call_name(i))
    }, FALSE)
}

is_level_token <- function(x) x %in% c(
  "level",
  "add_level",
  "nest_level",
  "modify_level",
  "cross_levels",
  "link_levels",
  "sac_level"
)


# Function to check if every argument in a quosure options
# is a level call.
check_all_levels <- function(options) {
  # Passing the options quosures
  # There were no levels, or indeed arguments, at all
  if (length(options) == 0) return(FALSE)

  # is_call_simple checks whether an expression or quosure has a
  # function name
  is_function <- vapply(options, is_call_simple, FALSE)

  # call_name gets function name from a quosure
  func_names <- vapply(options[is_function], call_name, "")

  # Check to see if the function names are one of the valid level operations
  is_level <- is_level_token(func_names)

  # Return false if we have no level calls
  if (!any(is_level)) return(FALSE)

  # If some calls are levels and some aren't, we're unhappy
  if (!all(is_level)) {
    stop(
      "Arguments passed to `...` must either all be calls to create or modify ",
      "levels, or else none of them must be."
    )
  }

  # Confirm they're all levels
  length(is_level) == length(options)
}



# Generates IDs from 1:N with zero left padding for visual display.
generate_id_pad <- function(N,zero=c("0", "")) {
  sprintf(paste0("%", match.arg(zero), nchar(N), "d"), 1:N)
}



#' @importFrom rlang f_rhs
expand_or_error <- function(vector_data, N, variable_name, call_string) {
  # NULL data means deleting a variable -- this is OK
  if(is.null(vector_data)) { return(NULL) }
  vector_dims <- dim(vector_data)
  if(length(vector_dims) > 1) {
    if(vector_dims[1] == N){
      return(vector_data)
    } else if (vector_dims[1] == 1) {
      return(vector_data[rep(seq_len(nrow(vector_data)), each = N), , drop = FALSE])
    } else {
      stop(simpleError(paste0("Nested structures must have `N.` rows ",
                              "In this call, `N` = ", N, " while the variable ",
                              variable_name, " is length ", vector_dims[1]),
                       call = f_rhs(call_string)))
    }

  }


  # Error if it's neither N nor 1
  if(!length(vector_data) %in% c(1, N)) {
    stop(simpleError(paste0("Variable lengths must all be equal to `N.` ",
                            "In this call, `N` = ", N, " while the variable ",
                            variable_name, " is length ", length(vector_data)),
                     call = f_rhs(call_string)))
  }

  # Recycle if it's 1, if not return data as-is.
  if(length(vector_data) == 1) { return(rep(vector_data, N)) }
  else { return(vector_data) }
}



# Try to overwrite R's recycling of vector operations to ensure the initial
# data is rectangular -- needs an N to ensure that constants do get recycled.
check_rectangular <- function(working_data_list, N) {

  for (i in seq_along(working_data_list)) {
    wdl_i <- working_data_list[[i]]
    d <- dim(wdl_i)
    if(length(d)  %in% 0:1) {
      len <- length(wdl_i)
      if (len == 1) {
        # Variable is a constant -- repeat it N times
        working_data_list[[i]] <- rep_len(wdl_i, N)
      } else if (len != N) {
        # Variable is not of length N. Oops.
        stop("Variables  must all be length `N.` ",
             "In this call, `N` = ", N, " while the variable `",
             names(working_data_list)[i], "` is length ", len)
      }
    }
    else if(length(d) == 2){
      if(d[1] != N) {
        stop("Nested structures must all have `N.` rows. ",
             "In this call, `N` = ", N, " while the variable `",
             names(working_data_list)[i], "` has ", d[1], " rows.")


      }

    }
  }
  return(working_data_list)
}





do_internal <- function(N = NULL, ..., FUN, from, by = NULL) {
  dots <- quos(...)
  if(!has_name(dots, "working_environment_")){
    # This happens if either call is run external to a fabricate
    # call OR if add_level is the only argument to a fabricate call and
    # the data argument tries to resolve an add_level call.
    stop(
      "`", from, "()` calls must be used inside `fabricate()` calls."
    )
  }

  working_environment_ <- get_expr(dots[["working_environment_"]])
  dots[["working_environment_"]] <- NULL


  if (has_name(dots, "ID_label")) {
    ID_label <- get_expr(dots[["ID_label"]])
    dots[["ID_label"]] <- NULL
  }

  # worse is better :()
  if(has_name(formals(FUN), "by")){
    FUN(
      N = N, ID_label = ID_label, by = by,
      workspace = working_environment_,
      data_arguments = dots
    )
  } else {
    FUN(
      N = N, ID_label = ID_label,
      workspace = working_environment_,
      data_arguments = dots
    )
  }
}


# Dummy helper function that just extracts the working data frame from the
# environment. This exists because we may in the future want to return something
# that is not a data frame.
report_results <- function(workspace) {
  df <- active_df(workspace)

  attr_names <- names(attributes(df))

  attributes(df)[grep("^fabricatr::", attr_names)] <- NULL

  df
}


active_df <- function(workspace) {
  uu <- attr(workspace, "active_df")
  if(is.null(uu)) NULL else workspace[[uu]]
}

activate <- function(workspace, data_name){
  attr(workspace, "active_df") <- data_name
  workspace
}

# Helper function to check for variable naming errors.
check_variables_named <- function(data_arguments, call_type = "add_level") {
  nm <- names(data_arguments)
  if(any(nm == "")) {
    # Generate some debug to help the user. Which was unnamed?
    nm[nm == ""] <- "<unnamed>"

    stop("All variables within a level call must be named; recieved variables named:",
         sprintf("\n - '%s'", nm))

  }
}


append_child <- function(workspace, child, parents=NULL, child_df=NULL) {

  parent <- parents %||% attr(workspace, "active_df")

  ATTR <- "fabricatr::children"

  for(p in parents) {
    siblings <- attr(workspace[[p]], ATTR)

    attr(workspace[[p]], ATTR) <- append(siblings, child)
  }

  if(!is.null(child_df)) {
    workspace[[child]] <- structure(
      data.frame(child_df, stringsAsFactors = FALSE, row.names = NULL),
      "fabricatr::parent_df" = parents,
      "fabricatr::ID_label"  = child
      )
  }



  workspace
}