R/data.R

Defines functions changeLevels generateFactorRenamer changeLevelsColName is.consistent.changeLevels levelsStringForColName

Documented in changeLevels changeLevelsColName generateFactorRenamer is.consistent.changeLevels levelsStringForColName

#' Relevel using a levelChanges list
#' 
#' Relevels the columns of a data frame, as as prescribed by a level changes 
#' list, of the form generated by \code{\link{generateFactorRenamer}}.
#' 
#' Suppose we have a data frame \code{x}, composed of a number of factor 
#' variables (in columns).
#' These factor variables are poorly named. For example, they may just be 
#' named by by integers. We would like to rename them so that their names 
#' are more helpful and intuitive.
#'
#' To do this in a clear and repeatable manner, we will create an R list. 
#' Each component of the list is given the name of a column of the 
#' data.frame \code{x}. Each of these components is a character vector, each 
#' component of which refers to a level of the corresponding factor variable 
#' in \code{x}. Each of these component of the vector is a character, taking 
#' the default value of the current name of the factor level.
#' 
#' The function \code{\link{generateFactorRenamer}} can be used to create 
#' such a list. This can then be altered, and then processed using this 
#' function.
#' 
#' @param x a data frame
#' @param levelChanges A list, of the form generated by 
#'   \code{\link{generateFactorRenamer}}
#' @param allowMissingCols A logical or a character vector. If it is a 
#'   logical, it indicates whether to allow some columns of x 
#'   not to be included in levelChanges. If TRUE, only a warning is issue if 
#'   there are columns missing. If FALSE, an error is given.
#'   If it is a character vector, it contains the names of any columns that 
#'   are allowed not to be included in levelChanges.
#' @param verbose A logical, indicating whether to show progress
#' @return The data frame \code{x}, with factor levels releveled as 
#'   prescribed by \code{levelChanges}.
#' @examples
#' dat <- esoph[, 1:3]
#' generateFactorRenamer(dat)
#' 
#' # we can then change this into the following
#' levelChanges <- list(
#' agegp = c(
#'   "25-34" = "Young",
#'   "35-44" = "Young",
#'   "45-54" = "Middle-aged",
#'   "55-64" = "Middle-aged",
#'   "65-74" = "Old",
#'   "75+" = "Old"),
#' alcgp = c(
#'   "0-39g/day" = "0-39g/day",
#'   "40-79" = "40-79",
#'   "80-119" = "80-119",
#'   "120+" = "120+"),
#' tobgp = c(
#'   "0-9g/day" = "Light",
#'   "10-19" = "Medium",
#'   "20-29" = "Heavy",
#'   "30+" = "Heavy")
#' )
#' changeLevels(dat, levelChanges)
#' # this return the new data frame
#' @export
changeLevels <- function(x,
                         levelChanges,
                         allowMissingCols = F,
                         verbose = T){
  stopifnot(is.consistent.changeLevels(x, levelChanges, allowMissingCols),
            is.logical(verbose),
            length(verbose) == 1)
  if (require(utils)){
  colNames <- names(x)

  # get the original levels, to avoid processing columns that do
  # not change
  actualLevelChanges <- levelChanges
  xWithChanges <- x[, names(levelChanges)]
  originalLevels <- utils:::capture.output(generateFactorRenamer(xWithChanges))
  eval(parse(file = "", text = originalLevels))
  originalLevels <- levelChanges
  levelChanges <- actualLevelChanges

  if (isTRUE(verbose)){
    cat("Processing: ")
  }
  availableCols <- colNames %in% names(levelChanges)
  colsToProcess <- colNames[availableCols]

  nCols <- length(names(x))
  colSeq <- seq_len(nCols)

  if (verbose){
    progress <- create_progress_bar("text")
    progress$init(nCols)
  }

  out <- data.frame(lapply(colSeq, function(i){
    colName <- colNames[i]

    if (verbose){
      progress$step()
    }

    shouldProcessCol <- colName %in% colsToProcess
    levelsAreDifferent <- !identical(originalLevels[colName],
                                    levelChanges[colName])

    if (shouldProcessCol && levelsAreDifferent){
      changeLevelsColName(colName, x, levelChanges, verbose)
    } else {
      x[, colName]
    }
  }))
  if (isTRUE(verbose)){
    cat("\n\n")
  }
  names(out) <- colNames
  out
  }
}


#' Generate a factor renamer
#' 
#' Creates a template from which a specification of the renaming of the 
#' levels of factor variables in a data.frame can be.
#'
#' Suppose we have a data frame \code{x}, composed of a number of factor 
#' variables (in columns).
#' These factor variables are poorly named. For example, they may just be 
#' named by by integers. We would like to rename them so that their names 
#' are more helpful and intuitive.
#'
#' To do this in a clear and repeatable manner, we will create an R list. 
#' Each component of the list is given the name of a column of the 
#' data.frame \code{x}. Each of these components is a character vector, each 
#' component of which refers to a level of the corresponding factor variable 
#' in \code{x}. Each of these component of the vector is a character, taking 
#' the default value of the current name of the factor level.
#' 
#' This function output such a list.
#' The list can then be modified to specify the new level names, and the 
#' changes made using function \code{\link{changeLevels}}.
#' 
#' @param x a data frame
#' @param newlines A character variable. Either \code{"variable"} or
#'   \code{"level"}. In the former case, each variable is one a separate
#'   line in the result. In the latter case, newlines are additionally
#'   placed after each level.
#' @param sort A logical indicating whether the levels should be sorted, 
#'   according to their numeric order
#' @return The R code to generate the list is output to the terminal.
#' @examples
#' 
#' dat <- esoph[, 1:3]
#' generateFactorRenamer(dat)
#' 
#' # we can then change this into the following
#' levelChanges <- list(
#' agegp = c(
#'   "25-34" = "Young",
#'   "35-44" = "Young",
#'   "45-54" = "Middle-aged",
#'   "55-64" = "Middle-aged",
#'   "65-74" = "Old",
#'   "75+" = "Old"),
#' alcgp = c(
#'   "0-39g/day" = "0-39g/day",
#'   "40-79" = "40-79",
#'   "80-119" = "80-119",
#'   "120+" = "120+"),
#' tobgp = c(
#'   "0-9g/day" = "Light",
#'   "10-19" = "Medium",
#'   "20-29" = "Heavy",
#'   "30+" = "Heavy")
#' )
#' changeLevels(dat, levelChanges)
#' # this return the new data frame
#' @export
generateFactorRenamer <- function(x, newlines = "level", sort = F){
  stringComponents <- lapply(names(x),
                             levelsStringForColName, x, newlines, sort)
  out <- paste(unlist(stringComponents), collapse = "")

  # remove trailing comma and newline
  nc <- nchar(out)
  out <- substr(out, 1, nc - 2)

  cat("levelChanges <- list(\n")
  cat(out)
  cat("\n)")
}


#' Relevel a column using a levelChanges list
#' 
#' Relevels a factor column of a data frame, as as prescribed by a level 
#' changes list, of the form generated by 
#' \code{\link{generateFactorRenamer}}.
#' 
#' @param colName A character vector of length 1, indicating which column of 
#'   \code{x} the string should be created for.
#' @param x a data frame
#' @param levelChanges A list, of the form generated by 
#'   \code{\link{generateFactorRenamer}}
#' @param verbose A logical, indicating whether to show progress
#' @return The data from \code{x[, levelChanges]}, with as prescribed by 
#'   \code{levelChanges}.
changeLevelsColName <- function(colName, x, levelChanges, verbose = T){
  stopifnot(class(x)            == "data.frame",
            class(colName)      == "character",
            "factor"            %in% class(x[, colName]),
            class(levelChanges) == "list",
            class(verbose)      == "logical",
            length(verbose)     == 1)

  colCurrentLevels <- levels(x[, colName])
  colLevelChanges <- levelChanges[colName][[1]]

  matchLevels <- match(colCurrentLevels, names(colLevelChanges))

  # ensure all levels are covered
  stopifnot(
    length(colLevelChanges) == length(colCurrentLevels),
    !any(is.na(matchLevels))
  )

  sortedNewLevels <- colLevelChanges[matchLevels]
  levels(x[, colName]) <- sortedNewLevels
  x[, colName] <- factor(x[, colName], levels = unique(colLevelChanges))
  x[, colName]
}


#' Check levelChanges consistency
#' 
#' Check that the levels provided in a levelChanges list match those that 
#' are already present.
#' 
#' @param x a data frame
#' @param levelChanges A list, of the form generated by 
#'   \code{\link{generateFactorRenamer}}
#' @param allowMissingCols A logical or a character vector. If it is a 
#'   logical, it indicates whether to allow some columns of x 
#'   not to be included in levelChanges. If TRUE, only a warning is issue if 
#'   there are columns missing. If FALSE, an error is given.
#'   If it is a character vector, it contains the names of any columns that 
#'   are allowed not to be included in levelChanges.
#' @return The data from \code{x[, levelChanges]}, with as prescribed by 
#'   \code{levelChanges}.
is.consistent.changeLevels <- function(x, levelChanges, allowMissingCols){
  if (require(utils)){
  colNames <- names(x)
  availableCols <- colNames %in% names(levelChanges)
  missingCols <- colNames[!availableCols]
  areMissingCols <- length(missingCols) > 0

  colsOk <- TRUE

  if (areMissingCols){
    if (class(allowMissingCols) == "character"){
      missingColsButOK <- missingCols %in% allowMissingCols
      missingColsButNotOK <- missingCols[!missingColsButOK]

      areNotOKMissingCols <- length(missingColsButNotOK) > 0
      if (isTRUE(areNotOKMissingCols)){
        missingString <- paste(missingColsButNotOK, collapse = ", ")
        missingDput <- utils:::capture.output(dput(missingColsButNotOK))
        msg <- paste("The following columns are missing from levelChanges:",
                     " ", missingString, "\n As a vector: ", missingDput,
                     sep = "")
        stop(msg)
        colsOk <- FALSE
      } else {
        colsOk <- TRUE
      }
    }
    else {
      missingString <- paste(missingCols, collapse = ", ")
      missingDput <- utils:::capture.output(dput(missingCols))
      msg <- paste("The following columns are missing from levelChanges: ",
                   missingString, "\n As a vector: ", missingDput, sep = "")

      if (isTRUE(allowMissingCols)){
        warning(msg)
        colsOk <- TRUE
      }
      else {
        stop(msg)
        colsOk <- FALSE
      }
    }
  } else {
    colsOk <- TRUE
  }

  colsToProcess <- colNames[availableCols]

  differences <- lapply(colsToProcess, function(colName){
    colCurrentLevels <- levels(x[, colName])
    colProposedLevels <- names(levelChanges[[colName]])

    missingFromProposed <- setdiff(colCurrentLevels, colProposedLevels)
    unneededButProposed <- setdiff(colProposedLevels, colCurrentLevels)

    nMissingFromProposed <- length(missingFromProposed)
    nUnneededButProposed <- length(unneededButProposed)

    anyDifferences <- nMissingFromProposed + nUnneededButProposed > 0
    if (!anyDifferences){
      NA
    }
    else {
      list(missingFromProposed = missingFromProposed,
           unneededButProposed = unneededButProposed)
    }
  })

  all.na <- function(x) all(is.na(x))
  columnsWithUnmatchedLevels <- !sapply(differences, all.na)
  allColumnsOK <- !any(columnsWithUnmatchedLevels)

  if (!allColumnsOK){
    problemCols <- colsToProcess[columnsWithUnmatchedLevels]

    differencesSubset <- differences[columnsWithUnmatchedLevels]
    diffSeq <- seq_along(differencesSubset)

    differencesString <- sapply(diffSeq, function(i){
      thisDifference <- differencesSubset[[i]]
      paste("* Variable: ", problemCols[i], "\n",
            "Missing from proposed: ", thisDifference$missingFromProposed,
            "\nUnneeded in proposed: ", thisDifference$unneededButProposed,
            "\n\n", sep = "")
    })
    differencesString <- paste(differencesString, collapse = "\n")

    msg <- paste("There is a mismatch between the existing levels and the ",
                 "levelChanges of the following columns: ",
                 paste(problemCols, collapse = ", "),
                 "\n\n", differencesString,
                 sep = "")
    stop(msg)
  }

  all(colsOk, allColumnsOK)
}
}

#' Generate factor renamer for column
#' 
#' Generate a factor renamer for an individual column of a data.frame
#' 
#' @param colName A character vector of length 1, indicating which column 
#'   of \code{x} the string should be created for.
#' @param x a data frame
#' @param newlines A character variable. Either \code{"variable"} or
#'   \code{"level"}. In the former case, each variable is one a separate
#'   line in the result. In the latter case, newlines are additionally
#'   placed after each level.
#' @param sort A logical indicating whether the levels should be sorted, 
#'   according to their numeric order
#' @return A character vector. This is an individual component of the list 
#'   described in \code{\link{generateFactorRenamer}}.
levelsStringForColName <- function(colName,
                                   x,
                                   newlines = "level",
                                   sort = F){
  # get the levels of the relevant column
  colLevels <- levels(x[, colName])

  if (sort){
    o <- order(as.numeric(colLevels))
    colLevels <- colLevels[o]
  }

  # build up the character variable
  quotedLevels <- sapply(colLevels, function(q){
    paste("\"", q, "\"", sep = "")
  })

  collapse_string <- if (newlines == "variable"){
    ", "
  } else if (newlines == "level") {
    ",\n  "
  } else {
    warning("Unrecognised 'newline' value. Using default of 'level'")
    ",\n  "
  }

  allQuotedLevels <- paste(quotedLevels, " = ", quotedLevels,
                           collapse = collapse_string, sep = "")
  allQuotedLevels <- paste("c(\n  ", allQuotedLevels, ")", sep = "")

  # nc <- nchar(allQuotedLevels)
  # allQuotedLevels <- allQuotedLevels[-(nc-1:nc)]
  # cat("\t\"", colName, "\" = c(")

  names(allQuotedLevels) <- colName
  allQuotedLevels
  paste("`", colName, "` = ", allQuotedLevels, ",\n", sep = "")
}
rjbgoudie/utils.rjbg documentation built on May 27, 2019, 9:13 a.m.