R/variables.manual.review.R

# **export**
# allvariables.manual.review    -> a list of pairs (varname, vartype)
# allvariables.of.type          -> a vector of variable names
# allvariables.type.change      -> a modified data frame
# change.variable.type.in.list  -> a modified list of pairs (varname, vartype)
#
# **private**
# all.types               -> a vector of variable type names
# variable.manual.review  -> a pair (varname, vartype)
# variable.type.change    -> a vector (column) of data

#################################################################################

#' Manual review of a data frame's variables
#'
#' @param data A data frame
#' @param clear.console (\emph{defaults to TRUE}) If TRUE, the console is cleared before each new variable's summary is printed
#' @param max.unique.factor (\emph{defaults to 50}) When a variable's number of unique values is lower than this, the guessed type is \code{factor}
#'
#' @return A list of pairs (variable,type)
#' @export
#'
#' @examples
#' allvariables.manual.review(iris)   # all default guesses are good
#' allvariables.manual.review(mtcars) # some variables could be re-classified as integer or factor
allvariables.manual.review <- function(data,  # a data frame
                                       clear.console = TRUE,
                                       max.unique.factor = min(50,nrow(data)/10)) {
    lapply(names(data),
           function(v) variable.manual.review(data,v,clear.console,max.unique.factor))
}

#################################################################################

all.types <- function() c("factor",
                          "ordinal",
                          "numeric",
                          "integer",
                          "character",
                          "Date",
                          "unclear")

#################################################################################

variable.manual.review <- function(data,  # a data frame
                                   var,   # a variable name
                                   clear.console = TRUE,
                                   max.unique.factor) {
    TYPES  <- all.types()
    NTYPES <- length(TYPES)
    values <- data[[var]]
    guess  <- if (length(unique(values)) <= max.unique.factor) {
                  "factor"
              } else {
                  class(data[[var]])
              }
    guess.index <- if (guess %in% TYPES) { which(guess == TYPES) }
                   else { NTYPES }
    table  <- data.frame(type  = TYPES,
                         guess = factor("", levels = c("","*")))
    table[guess.index, "guess"] <- "*"

    prompt_ <- "Enter type (type nothing for default guess, X for unclear and D, S or T for more detail): "
    read <- "-1"

    while (!(read %in% c("","X") || as.integer(read) %in% 1:NTYPES)) {
        if (clear.console) { cat("\014") }
        print(paste0("Variable: ",var))
        if (read == "D") {
            str(sample(data[[var]]), vec.len = 20)
        } else if (read == "S") {
            print(summary(data[[var]]))
        } else if (read == "T" || read == "t") {
            table_ <- table(data[[var]])
            if (read == "t") { table_ <- table_[order(table_, decreasing = TRUE)] }
            lengt_ <- length(table_)
            if (lengt_ <= max.unique.factor) { print(table_) } else {
                print(table_[1:max.unique.factor])
                print(paste0("Variable ",var, " has too many unique values (",
                             lengt_,"); showing only the first ", max.unique.factor))
            }
        } else {
            str(data[[var]], vec.len = 8)
        }
        print(table, right = FALSE)
        read <- readline(prompt_)
    }

    type <- if (read == "") { TYPES[guess.index] }
            else if (read == "X") { TYPES[NTYPES] }
            else {TYPES[as.integer(read)]}
    return(c(var,type))
}

#################################################################################

#' Extracts all variables of a given from a pair list
#'
#' @param vars.list A list of pairs (variable.name,variable.type) such as those produced by \code{allvariables.manual.review}
#' @param type The name of a type, e.g. \code{"numeric"}
#'
#' @return A vector of variable names
#' @export
#'
#' @examples
#' list <- lapply(names(mtcars), function(v) c(v,class(mtcars[[v]])))
#' allvariables.of.type(list, "numeric")
allvariables.of.type <- function(vars.list, # a list of (varname,vartype) pairs
                                 type) {    # a type name
    unlist(lapply(vars.list,
                  function(pair) if (pair[2] == type) return(pair[1])))
}

#################################################################################

#' Manually changes an element in a (name,type) list
#'
#' @param vars.list A list of pairs (variable.name,variable.type) such as those produced by \code{allvariables.manual.review}
#' @param varname A variable name
#' @param vartype A variable type
#'
#' @return A modified version of \code{list}
#' @export
#'
#' @examples
#' list <- lapply(names(mtcars), function(v) c(v,class(mtcars[[v]])) )
#' change.variable.type.in.list(list, "am", "factor")
change.variable.type.in.list <- function(vars.list,  # a list of (varname,vartype) pairs
                                         varname,    # a variable name
                                         vartype) {  # a variable type
    lapply(vars.list, function(pair) {
        if (pair[1] == varname) { c(varname,vartype) } else { pair }
    })
}

#################################################################################

#' Changing a data frame's variable types in bulk
#'
#' @param data A data frame
#' @param vars.list A list of pairs (variable.name,variable.type) such as those produced by \code{allvariables.manual.review}
#' @param data2 A second data frame (optional) from which levels will be pulled and added when coercing a variable into a factor
#'
#' @return A modified version of \code{data}
#' @export
#'
#' @examples
#' list_ <- list(c("cyl","factor"),c("am","factor"),c("vs","factor"),c("gear","factor"))
#' str(allvariables.type.change(mtcars,list_))
allvariables.type.change <- function(data,      # a data frame
                                     vars.list, # a list of (varname,vartype) pairs
                                     data2) {   # another data frame (optional)
    data.variables <- names(data)
    for (var.pair in vars.list) {
        varname <- var.pair[1]
        if (!varname %in% data.variables) { next }
        vartype <- var.pair[2]
        if (class(data[[varname]]) != vartype) {
            data[[varname]] <- variable.type.change(data,varname,vartype,data2)
        }
    }
    return(data)
}

#################################################################################

variable.type.change <- function(data, varname, vartype, data2) {
    if (vartype == "factor") {
        return(factor(data[[varname]],
                      ordered = FALSE,
                      exclude = NULL,
                      levels  = if (missing(data2)) { unique(data[[varname]]) } else {
                          union(unique(data[[varname]]), unique(data2[[varname]]))
                      }))
    } else if (vartype == "ordinal") {
        return(factor(data[[varname]],
                      ordered = TRUE,
                      exclude = NULL,
                      levels  = sort(
                          if (missing(data2)) { unique(data[[varname]]) } else {
                              union(unique(data[[varname]]), unique(data2[[varname]]))
                          }
                      )))
    } else if (vartype == "numeric") {
        return(as.numeric(data[[varname]]))
    } else if (vartype == "integer") {
        return(as.integer(data[[varname]]))
    } else if (vartype == "character") {
        return(as.character(data[[varname]]))
    } else if (vartype == "Date") {
        return(as.Date(data[[varname]]))
    } else if (vartype == "unclear") {
        warning(paste0("Variable ",varname," has commanded type 'unclear'"))
        return(data[[varname]])
    } else {
        stop(paste0("Bug: ",vartype," is not a recognized type."))
    }
}
ahdxb/data.exploration documentation built on May 11, 2019, 11:31 p.m.