R/Data_Dictionary.R

Defines functions dataDict

Documented in dataDict

# Allow convenient use of functions from other packages
#' @include Pipes.R
NULL

# Avoid "undefined variable" notes in package checking
globalVariables(c("dfEnv"))


#' Create a Data Dictionary
#' 
#' Create a data dictionary object which holds information on the columns of a data.frame-like object.
#' 
#' The \code{df} parameter should be the name of a data.frame-like object, as opposed to an expression.
#' E.g. call the function as \code{dataDict(mtcars)}, instead of \code{dataDict(as_tibble(mtcars))}.
#' This will allow for more functionality, and will help avoid errors, especially when lazy table mode is active.
#' If you want to pass an expression, simple ones should work, but do so at your own risk.
#' 
#' \code{tableMode} can be one of three values: c(TRUE, FALSE, "lazy").
#' Lazy table mode means that columns are not tabulated until requested.
#' When they are tabulated, they can be done only when accessed (\code{tableMode = "lazy"}),
#'   or all together up front (\code{tableMode = true}).
#' This setting can be deactivated entirely (\code{tableMode = FALSE}), but this is not suggested.
#' This setting is useful since tabulation can take a long time for large datasets.
#' 
#' If \code{verbose > 0} (or \code{verbose != FALSE}), a message will be printed out when lazy table mode is active,
#'   as well as when \code{df} is passed as an expression.
#' If \code{verbose > 1}, a message will be printed out saying what object the \code{dataDict} is based on.
#' 
#' @param df The data.frame-like object to calculate information for (should be a name, not an expression).
#' @param tableMode What mode to use for tabulating each column (see \code{details} for more) (character/logical scalar).
#' @param verbose How verbose you want the function to be (higher prints more information) (integer scalar).
#' 
#' @return A \code{dataDict} object.
#' @export
#' 
#' @examples
#' dd1 <- dataDict(mtcars)
#' dd2 <- dataDict(mtcars, tableMode = "lazy")
#' dd3 <- dataDict(mtcars, tableMode = TRUE)
#' dd5 <- dataDict(mtcars, tableMode = FALSE)
#' dd4 <- dataDict(mtcars, tableMode = "lazy", verbose = 0)
#' dd4 <- dataDict(mtcars, tableMode = TRUE, verbose = 0)
#' dd4 <- dataDict(mtcars, tableMode = FALSE, verbose = 0)
#' 
#' # The line below works, since the expression is rather simple, but should be avoided
#' # It is better to declare use something like: `df <- as_tibble(mtcars); dd <- dataDict(df)`
#' dd <- dataDict(tibble::as_tibble(mtcars))
#' 
dataDict <- function(df, tableMode = "lazy", verbose = Inf) {
  
  # Ensure that we have a data.frame-like object
  if (!inherits(df, "data.frame")) stop("dataDict: `df` must inherit from a `data.frame`.")
  
  # Create the data dictionary object, & set the call/verbosity attribute
  dict <- new.env()
  call <- match.call()
  dfCall <- substitute(df)
  
  # Set some attributes of the data dictionary object
  attributes(dict) <- dplyr::lst(
    
    # Set simple attributes (calculated outside)
    verbose, call, dfCall
    
    # Set the environment of the `df` object, & get the environment name/address
    , dfEnv = parent.frame(2)
    , dfEnvName = environmentName(dfEnv) %>% ifelse(. == "", pryr::address(dfEnv), .)
    
    # Get the dimensions/dimension names from the original df object
    , dims = dim(df)
    , dimNames = dimnames(df)
    
  )
  
  # Attempt to set the name of the original data
  if (length(dfCall) > 1) {
    info("`dataDict`: `df` has been passed as an expression (", deparse(dfCall), ").\n", strrep(" ", 12),
         "This may result in problems when using this `dataDict` object, but should be fine.", sep = "")
    attr(dict, "dfName") <- dfCall[purrr::map_lgl(dfCall, ~ is.data.frame(eval(.x)))][[1]] %>% deparse()
  } else {
    attr(dict, "dfName") <- deparse(dfCall)
  }
  
  # If desired & relevant, print out a message about not changing the original object
  if ((tableMode == "lazy") && (verbose > 1)) info(
    "`dataDict`: This `dataDict` will be based off of the object named '", attr(dict, "dfName"), "'.\n", strrep(" ", 12),
    "To ensure that this `dataDict` will continue to work, do not change the name of the object,\n", strrep(" ", 12),
    "You can alternatively use the `updateDD` function to update this `dataDict` after the object changes.",
    sep = ""
  )
  
  # Add various values to the dictionary
  evalq(envir = dict, expr = {
    
    # Set the class of the original df object, & the classes of each column
    dfClass <- class(df)
    classes <- purrr::map_chr(df, class)
    
    # Get the number of missing values
    numMissing <- purrr::map_int(df, num_missing)
    numNA <- purrr::map_int(df, ~ sum(is.na(.x)))
    
    # Summarize numeric columns
    means <- dplyr::select_if(df, is.numeric) %>% purrr::map_dbl(mean, na.rm = TRUE)
    medians <- dplyr::select_if(df, is.numeric) %>% purrr::map_dbl(stats::median, na.rm = TRUE)
    # modes <- dplyr::select_if(df, is.numeric) %>% purrr::map_dbl(mode_stat)
    
    # Get the number of unique elements, & the column tables object
    uniqueVals <- purrr::map(df, ~ sort(unique(.x), na.last = TRUE))
    numUnique <- purrr::map_int(uniqueVals, length)
    colTables <- columnTables(dict, df, tableMode)
    
  })
  
  # Set the dataDict class, & return it
  class(dict) <- c("dataDict", class(dict))
  return(dict)
  
}


#' Extract Elements from a \code{dataDict} Object
#' 
#' @param dict A \code{dataDict} object.
#' @param elem The element to extract (character scalar).
#' @param cols If \code{elem == "colTables"}, the column(s) to extract (character vector).
#' 
#' @return The desired element extracted from the \code{dataDict} object.
#' @export
#' 
#' @examples
#' dd <- dataDict(mtcars)
#' dd["numUnique"]
#' dd["colTables", "mpg"]
#' dd["colTables", NA]
#' dd["colTables"]
#' 
#' dd2 <- dataDict(mtcars, tableMode = TRUE)
#' dd2["colTables", "mpg"]
#' dd2["colTables", NA]
#' 
#' # The following lines work, but cause automated testing problems
#' # dd["colTables", c("mpg", "cyl")]
#' # dd2["colTables", c("mpg", "cyl")]
#' 
`[.dataDict` <- function(dict, elem, cols = NULL) {
  
  # If more than one element was requested, print a message, & take the first one
  if (length(elem) > 1) {
    warning("`[.dataDict`: `length(elem) > 1`, so only the first one will be used.")
    elem <- elem[1]
  }
  
  # Try to return the desired element, else throw an error
  if (tolower(elem) == "coltables") {
    
    # Get the desired tables
    return(dict$colTables[cols])
    
  } else if (elem %in% ls(dict)) {
    
    # Return the desired element
    return(get(x = elem, envir = dict))
    
  } else if (elem %in% names(attributes(dict))) {
    
    # Return the desired element
    return(attr(dict, elem))
    
  } else if (elem %in% colnames(dict)) {
    
    # Return data on the desired column
    return(list(
      class = dict$classes %>% .[names(.) == elem] %>% unname()
      , numMissing = dict$numMissing %>% .[names(.) == elem] %>% unname()
      , numNA = dict$numNA %>% .[names(.) == elem] %>% unname()
      , uniqueVals = dict$uniqueVals %>% .[names(.) == elem] %>% .[[1]]
      , numUnique = dict$numUnique %>% .[names(.) == elem] %>% unname()
      , table = dict$colTables[elem][[1]]
    ))
    
  } else {
    
    # Throw an error if the element requested does not exist
    stop("`[.dataDict`: The requested element (", elem, ") does not exist in this `dataDict` object (",
         deparse(substitute(dict)), ".")
    
  }
  
}


#' Summarize a \code{dataDict} Object
#' 
#' @param object A \code{dataDict} object.
#' @param ... Currently not used.
#' 
#' @return A summary of \code{object}.
#' @export
#' 
#' @examples
#' summary(dataDict(mtcars))
#' 
summary.dataDict <- function(object, ...) {
  
  # Cretae a tibble of the column names, classes, number of unique elements,
  #   & whether or not the column has been tabulated
  return(tibble::tibble(
    Column = colnames(object)
    , Class = object$classes
    , NumUnique = object$numUnique
    , Tabulated = colnames(object) %in% names(object$colTables)
  ))
  
}


#' Print a \code{dataDict} Object
#' 
#' @param x A \code{dataDict} object.
#' @param ... Currently not used.
#' 
#' @return The summary of \code{x}, invisibly (i.e. \code{summary(x)}).
#' @export
#' 
#' @examples
#' dataDict(mtcars)
#' print(dataDict(mtcars))
#' dd <- dataDict(mtcars)
#' print(dd)
#' 
print.dataDict <- function(x, ...) {
  
  # Print some information about the data that the `dataDict` is based off of
  dictName <- deparse(substitute(x)) %>% {ifelse(. == "x", "", paste0("(", ., ") "))}
  cat(
    "This `dataDict` object ", dictName, "was based off of '", attr(x, "dfName"),
    "' (a '", x$dfClass[1], "') in the '", attr(x, "dfEnvName"), "' environment,\n",
    "  which had ", nrow(x), " rows and ", ncol(x), " columns when this `dataDict` was constructed.\n",
    "The `tableMode` mode is set to '", attr(x, "tableMode"), "', and the `verbose` level is '", attr(x, "verbose"), "'.\n",
    sep = ""
  )
  
  # Print the summary object
  return(print(summary(x)))
  
}


#' Dimensions of a \code{dataDict}
#' 
#' Generic function to retrieve the dimensions of the data.frame-like object
#'   that the \code{dataDict} is based on.
#' `ncol` and `nrow` will also work, since they call `dim` in their implementation.
#' 
#' @param x A \code{dataDict} object.
#' 
#' @return The dimensions of the original data.frame-like object.
#' @export
#' 
#' @examples
#' dd <- dataDict(mtcars)
#' dim(dd)
#' ncol(dd)
#' nrow(dd)
#' 
dim.dataDict <- function(x) {
  return(attr(x, "dims"))
}


#' Dimension Names of a \code{dataDict}
#' 
#' Generic function to retrieve the dimension names of the data.frame-like object
#'   that the \code{dataDict} is based on.
#' `colnames` and `rownames` will also work, since they call `dimnames` in their implementation.
#' 
#' @param x A \code{dataDict} object.
#' 
#' @return \code{dimnames}: The dimension names of the original data.frame-like object.
#' @export
#' @name dimension_names
#' 
#' @examples
#' dd <- dataDict(mtcars)
#' dimnames(dd)
#' colnames(dd) # Same as names(dd)
#' rownames(dd)
#' names(dd) # Same as colnames(dd)
#' 
dimnames.dataDict <- function(x) {
  return(attr(x, "dimNames"))
}


#' @return \code{names}: The column names of the original data.frame-like object.
#' @export
#' @rdname dimension_names
#' 
names.dataDict <- function(x) {
  return(colnames(x))
}


#' Update a \code{dataDict} Object
#' 
#' @param dict A \code{dataDict} object.
#' @param df The data.frame-like object to calculate information for.
#' @param resetTables Whether or not to reset the \code{colTables} in \code{dict} (logical scalar).
#' 
#' @export
#' 
#' @examples
#' df <- mtcars
#' dd <- dataDict(df)
#' dd$colTables$mpg
#' df$mpg <- df$mpg * 2
#' updateDD(dd, df)
#' dd$colTables$mpg
#' 
updateDD <- function(dict, df, resetTables = TRUE) {
  message("`updateDD`: This function has not yet been implemented.")
  return(NULL)
}



#' Create a \code{columnTables} Object
#' 
#' Create an object that is used to store the results of tabulating columns in a data.frame-like object.
#' This is currently only called from with a \code{dataDict} object.
#' 
#' @param dict A \code{dataDict} object.
#' @param df The data.frame-like object to calculate information for (should be a name, not an expression).
#' @param tableMode What mode to use for tabulating each column (see \code{details} for more) (character/logical scalar).
#' 
#' @return A \code{columnTables} object.
#' 
columnTables <- function(dict, df, tableMode) {
  
  # Standardize the `tableMode` parameter
  tableMode <- switch(tolower(tableMode), lazy = "lazy", t = , true = TRUE, f = , false = FALSE)
  
  # Tabulate the results, if desired, & set the `tableMode` attribute
  attr(dict, "tableMode") <- tableMode
  if (tableMode == "lazy") {
    if (attr(dict, "verbose") > 0) info("`columnTables`: Lazy table mode active.", sep = "")
    colTables <- vector(mode = "list", length = attr(dict, "dims")[[2]]) %>%
      stats::setNames(., attr(dict, "dimNames")[[2]]) %>% as.environment()
  } else if (isTRUE(tableMode)) {
    colTables <- purrr::map(df, table, useNA = "ifany", dnn = NULL) %>% as.environment()
  } else if (isFALSE(tableMode)) {
    colTables <- new.env()
  } else {
    if (attr(dict, "verbose") > 1)
      warning("`columnTables`: Invalid table mode (", tableMode, "). Lazy table mode will be used instead.")
    attr(dict, "tableMode") <- "lazy"
    colTables <- vector(mode = "list", length = attr(dict, "dims")[[2]]) %>%
      stats::setNames(., attr(dict, "dimNames")[[2]]) %>% as.environment()
  }
  
  # Create the columnTables object, holding the dataDict, & the list of the tables
  attr(colTables, "dict") <- dict
  
  # Set the dataDict class, & return it
  class(colTables) <- c("columnTables", class(colTables))
  return(colTables)
  
}


#' Extract Elements from a \code{columnTables} Object
#' 
#' @param colTables A \code{columnTables} object.
#' @param col The column to extract the table for (character scalar).
#' 
#' @return A \code{table} of the desired column.
#' @export
#' 
#' @examples
#' dd <- dataDict(mtcars)
#' dd$colTables$mpg
#' 
`$.columnTables` <- function(colTables, col = NULL) {
  colTables[col][[1]]
}


#' Extract Elements from a \code{columnTables} Object
#' 
#' @param colTables A \code{columnTables} object.
#' @param cols The columns to extract the tables for (character vector).
#' 
#' @return A list of \code{table}s of the desired columns.
#' @export
#' 
#' @examples
#' dd <- dataDict(mtcars)
#' dd$colTables["mpg"]
#' 
#' # The following line works, but causes automated checking/testing problems
#' # dd$colTables[c("mpg", "cyl", "am")]
#' 
`[.columnTables` <- function(colTables, cols = NULL) {
  
  # If the call comes from RStudio's auto-complete feature, handle it specially
  if (deparse(sys.status()$sys.calls[[2]][[1]]) == ".rs.getCompletionsDollar") {
    colStatus <- try(colTables[[cols]])
    if (inherits(colStatus, "try-error")) return(NULL) else return(colStatus)
  }
  
  # Get the `dict` object (for convenience), & deal with cols being NULL or NA
  dict <- attr(colTables, "dict")
  if (is.null(cols) || is.na(cols)) cols <- colnames(dict)
  
  # Determine which columns have already been tabulated, & which we need to calculate
  tabulatedCols <- names(colTables) # ls(colTables)[!sapply(colTables, is.null)]
  colsToCalc <- intersect(cols, colnames(dict)) %>% setdiff(tabulatedCols)
  
  # Determine which columns are valid/invalid
  invalidCols <- setdiff(cols, colnames(dict))
  validCols <- setdiff(cols, invalidCols)
  
  # If there are any invalid columns (i.e. ones not in the data), print a warning
  if (length(invalidCols) > 0)
    warning("`[.columnTables`: Some invalid columns were selected:\n\t", paste0(invalidCols, collapse = ", "))
  
  # If there are any columns we need to calculate, try to get them, else return the existing tables
  if (length(colsToCalc) > 0) {
    
    # If the original object that this dataDict was based off of still exists, get it, else throw an error
    if ((!is.null(attr(dict, "dfEnv"))) && exists(attr(dict, "dfName"), where = attr(dict, "dfEnv"))) {
      df <- eval(attr(dict, "dfCall"), attr(dict, "dfEnv"))
    } else {
      stop(
        "`[.dataDict`: The object that this `dataDict` was based off (",
        attr(dict, "dfName"), ") no longer exists in its original environment (", attr(dict, "dfEnvName"), ").\n",
        "Please update the reference using `updateDD(dict, df)`."
      )
    }
    
    # Calculate the new column tables, & save them to the tables environment
    purrr::walk(colsToCalc, ~ table(df[[.x]], useNA = "ifany", dnn = NULL) %>% assign(.x, ., colTables))
    
  }
  
  # Return the tables for the desired valid columns
  return(purrr::map(validCols, ~ get(.x, colTables)) %>% stats::setNames(., validCols))
  
}


#' Summarize a \code{dataDict} Object
#' 
#' @param object A \code{columnTables} object.
#' @param ... Currently not used.
#' 
#' @return A summary of \code{object}.
#' @export
#' 
#' @examples
#' dd <- dataDict(mtcars)
#' summary(dd$colTables)
#' 
summary.columnTables <- function(object, ...) {
  message("`summary.columnTables`: This function has not yet been implemented.")
  return(NULL)
}


#' Print a \code{dataDict} Object
#' 
#' @param x A \code{columnTables} object.
#' @param ... Currently not used.
#' 
#' @return A summary of \code{x}, invisibly (i.e. \code{summary(x)}).
#' @export
#' 
#' @examples
#' dd <- dataDict(mtcars)
#' dd$colTables
#' print(dd$colTables)
#' 
print.columnTables <- function(x, ...) {
  message("`print.columnTables`: This function has not yet been implemented.")
  return(NULL)
}


#' Get the Names of a \code{columnTables} Object
#' 
#' Get the names of the columns from a \code{columnTables} object that have
#'   been tabulated already (character vector).
#' 
#' @param x A \code{columnTables} object.
#' 
#' @return The names of the columns that have been tabulated already (character vector).
#' @export
#' 
#' @examples
#' dd <- dataDict(mtcars)
#' names(dd$colTables)
#' 
names.columnTables <- function(x) {
  # return(ls(x)[!sapply(x, is.null)])
  return(ls(x) %>% .[!sapply(., function(y) is.null(get(y, x)))])
}
KO112/KO documentation built on Oct. 2, 2020, 9:21 a.m.