#' Relevel using a levelChanges list
#'
#' Relevels the columns of a data frame, as as prescribed by a level changes
#' list, of the form generated by \code{\link{generateFactorRenamer}}.
#'
#' Suppose we have a data frame \code{x}, composed of a number of factor
#' variables (in columns).
#' These factor variables are poorly named. For example, they may just be
#' named by by integers. We would like to rename them so that their names
#' are more helpful and intuitive.
#'
#' To do this in a clear and repeatable manner, we will create an R list.
#' Each component of the list is given the name of a column of the
#' data.frame \code{x}. Each of these components is a character vector, each
#' component of which refers to a level of the corresponding factor variable
#' in \code{x}. Each of these component of the vector is a character, taking
#' the default value of the current name of the factor level.
#'
#' The function \code{\link{generateFactorRenamer}} can be used to create
#' such a list. This can then be altered, and then processed using this
#' function.
#'
#' @param x a data frame
#' @param levelChanges A list, of the form generated by
#' \code{\link{generateFactorRenamer}}
#' @param allowMissingCols A logical or a character vector. If it is a
#' logical, it indicates whether to allow some columns of x
#' not to be included in levelChanges. If TRUE, only a warning is issue if
#' there are columns missing. If FALSE, an error is given.
#' If it is a character vector, it contains the names of any columns that
#' are allowed not to be included in levelChanges.
#' @param verbose A logical, indicating whether to show progress
#' @return The data frame \code{x}, with factor levels releveled as
#' prescribed by \code{levelChanges}.
#' @examples
#' dat <- esoph[, 1:3]
#' generateFactorRenamer(dat)
#'
#' # we can then change this into the following
#' levelChanges <- list(
#' agegp = c(
#' "25-34" = "Young",
#' "35-44" = "Young",
#' "45-54" = "Middle-aged",
#' "55-64" = "Middle-aged",
#' "65-74" = "Old",
#' "75+" = "Old"),
#' alcgp = c(
#' "0-39g/day" = "0-39g/day",
#' "40-79" = "40-79",
#' "80-119" = "80-119",
#' "120+" = "120+"),
#' tobgp = c(
#' "0-9g/day" = "Light",
#' "10-19" = "Medium",
#' "20-29" = "Heavy",
#' "30+" = "Heavy")
#' )
#' changeLevels(dat, levelChanges)
#' # this return the new data frame
#' @export
changeLevels <- function(x,
levelChanges,
allowMissingCols = F,
verbose = T){
stopifnot(is.consistent.changeLevels(x, levelChanges, allowMissingCols),
is.logical(verbose),
length(verbose) == 1)
if (require(utils)){
colNames <- names(x)
# get the original levels, to avoid processing columns that do
# not change
actualLevelChanges <- levelChanges
xWithChanges <- x[, names(levelChanges)]
originalLevels <- utils:::capture.output(generateFactorRenamer(xWithChanges))
eval(parse(file = "", text = originalLevels))
originalLevels <- levelChanges
levelChanges <- actualLevelChanges
if (isTRUE(verbose)){
cat("Processing: ")
}
availableCols <- colNames %in% names(levelChanges)
colsToProcess <- colNames[availableCols]
nCols <- length(names(x))
colSeq <- seq_len(nCols)
if (verbose){
progress <- create_progress_bar("text")
progress$init(nCols)
}
out <- data.frame(lapply(colSeq, function(i){
colName <- colNames[i]
if (verbose){
progress$step()
}
shouldProcessCol <- colName %in% colsToProcess
levelsAreDifferent <- !identical(originalLevels[colName],
levelChanges[colName])
if (shouldProcessCol && levelsAreDifferent){
changeLevelsColName(colName, x, levelChanges, verbose)
} else {
x[, colName]
}
}))
if (isTRUE(verbose)){
cat("\n\n")
}
names(out) <- colNames
out
}
}
#' Generate a factor renamer
#'
#' Creates a template from which a specification of the renaming of the
#' levels of factor variables in a data.frame can be.
#'
#' Suppose we have a data frame \code{x}, composed of a number of factor
#' variables (in columns).
#' These factor variables are poorly named. For example, they may just be
#' named by by integers. We would like to rename them so that their names
#' are more helpful and intuitive.
#'
#' To do this in a clear and repeatable manner, we will create an R list.
#' Each component of the list is given the name of a column of the
#' data.frame \code{x}. Each of these components is a character vector, each
#' component of which refers to a level of the corresponding factor variable
#' in \code{x}. Each of these component of the vector is a character, taking
#' the default value of the current name of the factor level.
#'
#' This function output such a list.
#' The list can then be modified to specify the new level names, and the
#' changes made using function \code{\link{changeLevels}}.
#'
#' @param x a data frame
#' @param newlines A character variable. Either \code{"variable"} or
#' \code{"level"}. In the former case, each variable is one a separate
#' line in the result. In the latter case, newlines are additionally
#' placed after each level.
#' @param sort A logical indicating whether the levels should be sorted,
#' according to their numeric order
#' @return The R code to generate the list is output to the terminal.
#' @examples
#'
#' dat <- esoph[, 1:3]
#' generateFactorRenamer(dat)
#'
#' # we can then change this into the following
#' levelChanges <- list(
#' agegp = c(
#' "25-34" = "Young",
#' "35-44" = "Young",
#' "45-54" = "Middle-aged",
#' "55-64" = "Middle-aged",
#' "65-74" = "Old",
#' "75+" = "Old"),
#' alcgp = c(
#' "0-39g/day" = "0-39g/day",
#' "40-79" = "40-79",
#' "80-119" = "80-119",
#' "120+" = "120+"),
#' tobgp = c(
#' "0-9g/day" = "Light",
#' "10-19" = "Medium",
#' "20-29" = "Heavy",
#' "30+" = "Heavy")
#' )
#' changeLevels(dat, levelChanges)
#' # this return the new data frame
#' @export
generateFactorRenamer <- function(x, newlines = "level", sort = F){
stringComponents <- lapply(names(x),
levelsStringForColName, x, newlines, sort)
out <- paste(unlist(stringComponents), collapse = "")
# remove trailing comma and newline
nc <- nchar(out)
out <- substr(out, 1, nc - 2)
cat("levelChanges <- list(\n")
cat(out)
cat("\n)")
}
#' Relevel a column using a levelChanges list
#'
#' Relevels a factor column of a data frame, as as prescribed by a level
#' changes list, of the form generated by
#' \code{\link{generateFactorRenamer}}.
#'
#' @param colName A character vector of length 1, indicating which column of
#' \code{x} the string should be created for.
#' @param x a data frame
#' @param levelChanges A list, of the form generated by
#' \code{\link{generateFactorRenamer}}
#' @param verbose A logical, indicating whether to show progress
#' @return The data from \code{x[, levelChanges]}, with as prescribed by
#' \code{levelChanges}.
changeLevelsColName <- function(colName, x, levelChanges, verbose = T){
stopifnot(class(x) == "data.frame",
class(colName) == "character",
"factor" %in% class(x[, colName]),
class(levelChanges) == "list",
class(verbose) == "logical",
length(verbose) == 1)
colCurrentLevels <- levels(x[, colName])
colLevelChanges <- levelChanges[colName][[1]]
matchLevels <- match(colCurrentLevels, names(colLevelChanges))
# ensure all levels are covered
stopifnot(
length(colLevelChanges) == length(colCurrentLevels),
!any(is.na(matchLevels))
)
sortedNewLevels <- colLevelChanges[matchLevels]
levels(x[, colName]) <- sortedNewLevels
x[, colName] <- factor(x[, colName], levels = unique(colLevelChanges))
x[, colName]
}
#' Check levelChanges consistency
#'
#' Check that the levels provided in a levelChanges list match those that
#' are already present.
#'
#' @param x a data frame
#' @param levelChanges A list, of the form generated by
#' \code{\link{generateFactorRenamer}}
#' @param allowMissingCols A logical or a character vector. If it is a
#' logical, it indicates whether to allow some columns of x
#' not to be included in levelChanges. If TRUE, only a warning is issue if
#' there are columns missing. If FALSE, an error is given.
#' If it is a character vector, it contains the names of any columns that
#' are allowed not to be included in levelChanges.
#' @return The data from \code{x[, levelChanges]}, with as prescribed by
#' \code{levelChanges}.
is.consistent.changeLevels <- function(x, levelChanges, allowMissingCols){
if (require(utils)){
colNames <- names(x)
availableCols <- colNames %in% names(levelChanges)
missingCols <- colNames[!availableCols]
areMissingCols <- length(missingCols) > 0
colsOk <- TRUE
if (areMissingCols){
if (class(allowMissingCols) == "character"){
missingColsButOK <- missingCols %in% allowMissingCols
missingColsButNotOK <- missingCols[!missingColsButOK]
areNotOKMissingCols <- length(missingColsButNotOK) > 0
if (isTRUE(areNotOKMissingCols)){
missingString <- paste(missingColsButNotOK, collapse = ", ")
missingDput <- utils:::capture.output(dput(missingColsButNotOK))
msg <- paste("The following columns are missing from levelChanges:",
" ", missingString, "\n As a vector: ", missingDput,
sep = "")
stop(msg)
colsOk <- FALSE
} else {
colsOk <- TRUE
}
}
else {
missingString <- paste(missingCols, collapse = ", ")
missingDput <- utils:::capture.output(dput(missingCols))
msg <- paste("The following columns are missing from levelChanges: ",
missingString, "\n As a vector: ", missingDput, sep = "")
if (isTRUE(allowMissingCols)){
warning(msg)
colsOk <- TRUE
}
else {
stop(msg)
colsOk <- FALSE
}
}
} else {
colsOk <- TRUE
}
colsToProcess <- colNames[availableCols]
differences <- lapply(colsToProcess, function(colName){
colCurrentLevels <- levels(x[, colName])
colProposedLevels <- names(levelChanges[[colName]])
missingFromProposed <- setdiff(colCurrentLevels, colProposedLevels)
unneededButProposed <- setdiff(colProposedLevels, colCurrentLevels)
nMissingFromProposed <- length(missingFromProposed)
nUnneededButProposed <- length(unneededButProposed)
anyDifferences <- nMissingFromProposed + nUnneededButProposed > 0
if (!anyDifferences){
NA
}
else {
list(missingFromProposed = missingFromProposed,
unneededButProposed = unneededButProposed)
}
})
all.na <- function(x) all(is.na(x))
columnsWithUnmatchedLevels <- !sapply(differences, all.na)
allColumnsOK <- !any(columnsWithUnmatchedLevels)
if (!allColumnsOK){
problemCols <- colsToProcess[columnsWithUnmatchedLevels]
differencesSubset <- differences[columnsWithUnmatchedLevels]
diffSeq <- seq_along(differencesSubset)
differencesString <- sapply(diffSeq, function(i){
thisDifference <- differencesSubset[[i]]
paste("* Variable: ", problemCols[i], "\n",
"Missing from proposed: ", thisDifference$missingFromProposed,
"\nUnneeded in proposed: ", thisDifference$unneededButProposed,
"\n\n", sep = "")
})
differencesString <- paste(differencesString, collapse = "\n")
msg <- paste("There is a mismatch between the existing levels and the ",
"levelChanges of the following columns: ",
paste(problemCols, collapse = ", "),
"\n\n", differencesString,
sep = "")
stop(msg)
}
all(colsOk, allColumnsOK)
}
}
#' Generate factor renamer for column
#'
#' Generate a factor renamer for an individual column of a data.frame
#'
#' @param colName A character vector of length 1, indicating which column
#' of \code{x} the string should be created for.
#' @param x a data frame
#' @param newlines A character variable. Either \code{"variable"} or
#' \code{"level"}. In the former case, each variable is one a separate
#' line in the result. In the latter case, newlines are additionally
#' placed after each level.
#' @param sort A logical indicating whether the levels should be sorted,
#' according to their numeric order
#' @return A character vector. This is an individual component of the list
#' described in \code{\link{generateFactorRenamer}}.
levelsStringForColName <- function(colName,
x,
newlines = "level",
sort = F){
# get the levels of the relevant column
colLevels <- levels(x[, colName])
if (sort){
o <- order(as.numeric(colLevels))
colLevels <- colLevels[o]
}
# build up the character variable
quotedLevels <- sapply(colLevels, function(q){
paste("\"", q, "\"", sep = "")
})
collapse_string <- if (newlines == "variable"){
", "
} else if (newlines == "level") {
",\n "
} else {
warning("Unrecognised 'newline' value. Using default of 'level'")
",\n "
}
allQuotedLevels <- paste(quotedLevels, " = ", quotedLevels,
collapse = collapse_string, sep = "")
allQuotedLevels <- paste("c(\n ", allQuotedLevels, ")", sep = "")
# nc <- nchar(allQuotedLevels)
# allQuotedLevels <- allQuotedLevels[-(nc-1:nc)]
# cat("\t\"", colName, "\" = c(")
names(allQuotedLevels) <- colName
allQuotedLevels
paste("`", colName, "` = ", allQuotedLevels, ",\n", sep = "")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.