#' Convert dataframe column types.
#'
#' Given a configuration dataframe with column names and the required types,
#' convert the column data types in a separate dataframe.
#'
#' Column metadata may be stored separately from the code needed to perform the
#' conversion. As an example, a dataframe with 2,000 columns could have its
#' metadata stored in a separate \code{csv} or \code{xlsx} file. The metadata
#' file would be read in as a dataframe and \code{convert_cols} would update the
#' data types en masse.
#'
#' \code{convert_cols} is trading the need to maintain a separate metadata file
#' for concise code that performs the data type conversion. The alternative is
#' to have verbose code that details each data type conversion to take place.
#'
#' \code{convert_cols} is a closure that contains two parts.
#' \itemize{
#' \item An inner function, \code{convert_}, that converts a set of columns
#' that are to be converted to the same type using \link[purrr]{dmap_at}.
#' \item The enclosing environment that contains the dataframe that will be
#' updated and contains a call to \link[purrr]{pmap} to drive
#' iteration.
#' }
#'
#' Finally, the function is missing cases to convert to the following types.
#' The types includes should cover the most simple use cases.
#' \itemize{
#' \item list
#' \item factor
#' \item raw
#' \item complex
#' }
#'
#' @param df A dataframe that will have its column data types converted.
#' @param types_df A dataframe containing the column to convert and the data
#' type to convert to. The expectation is that \code{types_df} has columns
#' named \code{col_name} and \code{col_type}. If \code{types_df} has columns
#' named differently, this is easy to convert using \link[dplyr]{rename}.
#'
#' @return A copy of the original dataframe, df, with the column types
#' converted.
#'
#' @examples
#' ## dataframe to convert
#' id <- c(1L, 2L, 3L)
#' date_var <- c("2016-01-02", "2015-12-24", "2016-05-05")
#' datetime_var <- c("2016-01-02T11:11:04",
#' "2015-12-24T08:11:41",
#' "2016-05-05T05:05:49")
#' double_var1 <- c("46.41", "118.11", "84.68")
#' double_var2 <- c("68.48", "-248.99", "194")
#' numeric_var <- c("78.61", "593.1", "123")
#' logical_var <- c("true", "true", "false")
#' character_var <- c(49, 88, 104)
#' integer_var <- c("77", "84", "4949")
#'
#' test_df <- data.frame(id,
#' date_var,
#' datetime_var,
#' double_var1,
#' double_var2,
#' numeric_var,
#' logical_var,
#' character_var,
#' integer_var)
#'
#' ## print the structure
#' str(test_df)
#'
#' ## create a dataframe with columns named col_name and col_type (if you name
#' ## them something else you can always use dplyr::rename) with the column
#' ## name and the type you would like it converted to
#' types_df <- tibble::tribble(
#' ~col_name, ~col_type,
#' "date_var", "date",
#' "datetime_var", "datetime",
#' "double_var1", "double",
#' "double_var2", "double",
#' "numeric_var", "numeric",
#' "logical_var", "logical",
#' "character_var", "character",
#' "integer_var", "integer"
#' )
#'
#' ## convert the columns
#' test_df_converted <- convert_cols(test_df, types_df)
#'
#' @export
convert_cols <- function(df, types_df) {
# trailing underscore represents a "private" function
convert_ <- function(col_type, col_names, ...) {
# swallow dots when using as part of pmap
# used in case types_df has other columns contained within
dots <- list(...)
# type check as well as setting the function to perform conversion
if (col_type %in% c("character",
"date",
"datetime",
"double",
"integer",
"logical")) {
conv_func <- switch(col_type,
"character" = as.character,
"date" = lubridate::ymd,
"datetime" = lubridate::ymd_hms,
"double" = as.double,
"integer" = as.integer,
"logical" = as.logical)
} else if (col_type %in% ("numeric")) {
message("For col_type = 'numeric' utilizing type double")
conv_func <- as.double
} else {
stop("col_type must be one of:\n character, date, datetime, double, integer, or logical")
}
# col_names is a list, convert to a character vector
col_names_chr <- col_names %>% purrr::flatten_chr()
# name check
# if not all columns to convert are in df_converted
if (!all(tibble::has_name(df_converted, col_names_chr))) {
# discard those columns that are not in df_converted
col_names_chr_not <- purrr::discard(col_names_chr,
tibble::has_name(df_converted,
col_names_chr))
# keep just those columns that are in df_converted
col_names_chr <- purrr::keep(col_names_chr,
tibble::has_name(df_converted,
col_names_chr))
message(paste0("Ignoring the following columns:\n ",
paste(col_names_chr_not, collapse = "\n "))
)
}
# perform the conversion
# because using a closure, able to use <<- to update df_converted
df_converted <<- purrr::dmap_at(.d = df_converted,
.at = col_names_chr,
.f = conv_func)
}
# the dataframe to update
df_converted <- df
# utilize tidyr::nest so that conversions can happen in groups
# e.g. all character conversions happen at the same time in a single
# purrr::dmap_at call rather than one at a time
types_df_nest <- types_df %>%
# make sure that all columns are character and not factors
dplyr::mutate_if(is.factor, as.character) %>%
tidyr::nest_("col_name", key_col = "col_names")
# iterate over the types dataframe, which tells us how to convert columns
# in df
# if there are columns within types_df other than col_name and col_type,
# they are swallowed by the fact that convert_ takes dots (...)
purrr::pmap(.l = types_df_nest,
.f = convert_)
# return the new dataframe
df_converted
}
#' Update a readr column specification.
#'
#' Given a \link[readr]{cols} specification, update the specification in place
#' by explicitly declaring the appropriate type in a dataframe.
#'
#' This is of value when importing an extremely wide dataframe with many
#' columns. It may not make sense to explicitly declare the type of every
#' column by hand. Nor should one have to copy/paste an exisiting column
#' specification into a text editor in order to update the specification.
#'
#' A final use case is when problems occur and a \link[readr]{problems} tibble
#' is created. The tibble can be the starting point to decide upon the columns
#' that require updating.
#'
#' \code{spec_update} is a closure that contains two parts.
#' \itemize{
#' \item An inner function, \code{update_}, that updates the specification by
#' setting the \code{class} attribute for the given column.
#' \item The enclosing environment that contains the specification to be
#' updated and contains a call to \link[purrr]{pmap} to drive
#' iteration.
#' }
#'
#' @param col_spec A \link[readr]{cols} specification. Most commonly produced
#' as part of a call to \link[readr]{read_csv}.
#' @param col_spec_df A dataframe containing the column name and type to update
#' within the specification. The expectation is that \code{col_spec_df} has
#' columns named \code{col_name} and \code{col_type}. If \code{col_spec_df}
#' has columns named differently, this is easy to convert using
#' \link[dplyr]{rename}.
#'
#' @return A copy of the original specification with updated column types.
#'
#' @examples
#' test_df <- readr::read_csv("a,b,c\n1,2,3\n4,5,6")
#' test_spec <- readr::spec(test_df)
#'
#' ## returns
#' ## a = col_integer()
#' ## b = col_integer()
#' ## c = col_integer()
#'
#' ## update columns a and b to be doubles instead of integers
#' col_spec_df <- tibble::tribble(
#' ~col_name, ~col_type,
#' "a", "double",
#' "b", "double"
#' )
#'
#' ## update the specification
#' test_spec_updated <- spec_update(test_spec, col_spec_df)
#'
#' ## re-read with new column spec
#' test_updated <- readr::read_csv("a,b,c\n1,2,3\n4,5,6",
#' col_types = test_spec_updated)
#'
#' @export
spec_update <- function(col_spec, col_spec_df) {
# trailing underscore represents a "private" function
update_ <- function(col_name, col_type, ...) {
# swallow dots when using as part of pmap
# used in case col_spec_df has other columns contained within
dots <- list(...)
# type check as well as create the collector type to utilize to set the
# class attribute
if (col_type %in% c("character",
"date",
"datetime",
"double",
"euro_double",
"integer",
"logical",
"number",
"numeric",
"time",
"skip",
"guess")) {
if (col_type == "numeric") {
col_type_full <- c("collector_number", "collector")
} else {
col_type_full <- c(paste0("collector_", col_type), "collector")
}
} else {
stop("col_type must be one of:\n character, date, datetime, double, euro_double, integer, logical, number, time, skip, or guess")
}
# check that col_name is part of the specification
if (col_name %in% names(col_spec_updated$cols)) {
if (col_type %in% c("date", "datetime", "time")) {
col_spec_updated$cols[col_name][[1]] <<- list(format = "")
} else {
col_spec_updated$cols[col_name][[1]] <<- list()
}
class(col_spec_updated$cols[col_name][[1]]) <<- col_type_full
} else {
message(paste0("Ignoring as the following column name ",
"is not part of the specification: ", col_name))
}
}
# create a copy of col_spec
col_spec_updated <- col_spec
# iterate over the col_spec_df dataframe, which tells us how to update the
# column specification
purrr::pmap(.l = col_spec_df,
.f = update_)
# return the updated spec
col_spec_updated
}
#' Create a \link[readr]{cols} specification from a dataframe.
#'
#' Given a dataframe with columns named \code{col_name} and \code{col_type},
#' construct a \link[readr]{cols} specification.
#'
#' This allows one to create a \link[readr]{cols} specification from an existing
#' dataframe. Executing \link{types_df} on a dataframe will return a new
#' dataframe containing the column names and types needed to create a
#' \link[readr]{cols} specification.
#'
#' The inverse operation is \link{spec_to_df}.
#'
#' @param df A dataframe with column names \code{col_name} and \code{col_type}.
#'
#' @return A \link[readr]{cols} specification.
#'
#' @examples
#' ## create an artificial df
#' test_df <- readr::read_csv(paste0("a,b,c,d\n",
#' "1,two,3.0,2016-05-01T11:40:44\n",
#' "4,five,6.0,2014-12-01T06:12:23"))
#'
#' ## write the spec to disk as a mere CSV
#' spec_df <- spec_to_df(readr::spec(test_df))
#' tmp_dir <- tempdir()
#' readr::write_csv(spec_df,
#' file.path(tmp_dir, "test_df_spec.csv"))
#' readr::write_csv(test_df,
#' file.path(tmp_dir, "test_df.csv"))
#'
#' ## create a specification and then read original CSV back in
#' spec_df_from_csv <- readr::read_csv(file.path(tmp_dir, "test_df_spec.csv"))
#' spec_from_csv <- spec_from_df(spec_df_from_csv)
#' test_df_from_csv <- readr::read_csv(file.path(tmp_dir, "test_df.csv"),
#' col_types = spec_from_csv)
#' ## returns TRUE
#' assertthat::are_equal(test_df, test_df_from_csv)
#'
#' @export
spec_from_df <- function(df) {
if (!all(tibble::has_name(df, c("col_name", "col_type")))) {
stop(paste0("df must have columns named 'col_name' and 'col_type'. ",
"dplyr::rename may be used for column renaming."))
} else {
col_spec_ <- function(col_name, col_type, ...) {
}
type_list_update_ <- function(col_name, col_type, ...) {
type_list[[col_name]] <<- switch(col_type,
"_" = ,
"-" = readr::col_skip(),
"?" = readr::col_guess(),
"character" = readr::col_character(),
"date" = readr::col_date(),
"double" = readr::col_double(),
"euro_double" = readr::col_euro_double(),
"integer" = readr::col_integer(),
"logical" = readr::col_logical(),
"number" = readr::col_number(),
"numeric" = readr::col_number(),
"datetime" = readr::col_datetime(),
"time" = readr::col_time())
}
# initial list
type_list <- list()
# iterate over the df dataframe, which tells us what type each column
# should be
purrr::pmap(.l = df,
.f = type_list_update_)
# return the updated spec using a "private" function from readr
readr:::col_spec(type_list)
}
}
#' Create a dataframe from a \link[readr]{cols} specification.
#'
#' Given a \link[readr]{cols} specificaton, convert it to a dataframe.
#'
#' One use case is to write the returned dataframe as a \code{csv} in lieu of
#' writing out a \link[readr]{cols} specification as an \code{rds} object. This
#' is useful for cases when reading a \link[readr]{cols} specification as an
#' \code{rds} throws an error.
#'
#' The inverse operation is \link{spec_from_df}.
#'
#' @param spec A \link[readr]{cols} specification to be written as a dataframe.
#'
#' @return A \link[tibble]{tibble} representation of a \link[readr]{cols}
#' specification.
#'
#' @export
spec_to_df <- function(spec) {
# from https://github.com/hadley/readr/blob/master/R/collectors.R
# collectors can be...
# col_logical
# col_integer
# col_double
# col_character
# col_number
# col_factor
# col_guess
# col_skip
# col_datetime
# col_date
# col_time
# function for lmap
extract_ <- function(x) {
list(
list("col_name" = names(x),
"col_type" = class(x[[1]]) %>%
strsplit("collector_") %>%
unlist() %>%
.[2])
)
}
# extract_ returns a list of lists that are then row bound
# in order to return a dataframe
purrr::lmap(spec$cols, extract_) %>%
dplyr::bind_rows()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.