#' Functions to help with Data Cleaning
#'
#' @param DT A data table to operate on
#' @param cols columns of a data.table to focus the operation on
#' @param value In the case of booleanize, a value to look for throughout the table
#' @param not A boolean. Negates the output of \code{booleanize}
#' @param index In the case of \code{CleanRows}, if \code{index} is TRUE,
#' an Index column will be created to track which rows were removed
#' @param verbose A boolean indicating whether to print information on the console
#' @param aslast A boolean indicating whether arrange the non-order columns before or after (asLast = TRUE)
#' the ordered ones
#' @param digits A numeric indicating the number of digits to round in \code{RoundCols}
#' @param igncols A character vector indicating which columns to ignore
#' @param b_skip A boolean indicating whether to skip non-numeric columns that \code{RoundCols} encounters
#' @param b_copy A boolean indicating whether to make a copy of the data, or operate on by reference
#'
#' @name easycleaning
NULL
#' @describeIn easycleaning A function to remove all columns that have ONLY NA values
#' @export
CleanCols <- function(DT){
f <- function(c){
if(sum(is.na(c)) == length(c))
return(FALSE)
return(TRUE)
}
DT[, sapply(DT, f), with=FALSE]
}
#' @describeIn easycleaning A function to remove na rows in specified columes
#' @export
CleanRows <- function(DT, cols = NULL, index = FALSE){
DT <- CleanCols(DT)
if(index) DT[, Ind := .I]
if(is.null(cols)) cols <- colnames(DT)
DT[Reduce("&", Booleanize(DT, cols, NA, TRUE))]
}
#' @describeIn easycleaning A function to remove duplicates across all columns (default),
#' or a given set of columns
#' @export
ddup <- function(DT, cols=NULL, verbose=TRUE){
cDT <- copy(DT)
cnames <- colnames(cDT)
setkeyv(cDT, cnames) # set all but change if needed below
if(!is.null(cols)){
# if none of cols exist - error. If some exist- warning
if(!any(cols %in% cnames))
stop("Provided names not valid columns in the data", call. = FALSE)
if(!all(cols %in% cnames))
warning("Some cols not in data. Using those that exist", call. = FALSE)
setkeyv(cDT, cols[which(cols %in% cnames)]) # will capture all if warning is not relevent
}
del <- nrow(DT) - nrow(cDT)
print(paste0("Removed ", del, " rows from total ", nrow(DT), " rows"))
return(cDT)
}
#' @describeIn easycleaning A function that turns a data.table into all logical values based on finding the value arg
#' in all or selected columns
#' @export
Booleanize <- function(DT=NULL, cols=NULL, value=NULL, not=FALSE){
if(is.null(value)) stop("function needs value argument")
if(!is.data.table(DT)) stop("DT must be of class data.table")
if(is.null(cols)) cols <- colnames(DT)
if(is.na(value))
e <- substitute(is.na(get(i, DT)))
else
e <- substitute(get(i, DT) == value)
if(not) DT[, sapply(cols, function(i) !eval(e), simplify = FALSE)]
else DT[, sapply(cols, function(i) eval(e), simplify = FALSE)]
}
#' @describeIn easycleaning A convienience wrapper for \code{round} that applies to all or a subset of cols
#' @export
RoundCols <- function(DT, digits = 2, igncols = NULL, cols = NULL, b_skip=FALSE, b_copy=TRUE){
# digits = 2
# igncols = c("SummaryGrp", "NumObs_tn")
dat <- data.table::copy(DT)
# ADD THIS FUNCTIONALITY (recursive call if copy = TRUE)
# if(copy){
# return(RoundCols(cdt, digits, cols, b_skip, copy=FALSE))
# }
allcols <- colnames(dat)
if(!is.null(igncols))
allcols <- allcols[!allcols %in% igncols]
if(!is.null(cols))
cols <- cols[cols %in% allcols]
else
cols <- allcols
if(!data.table::is.data.table(dat))
stop("arg not class data.table. Use: 'as.data.table'", call. = FALSE)
tmp <- sapply(cols, function(c) is.numeric(get(c, dat)))
if(!all(tmp) & !b_skip)
stop("some calls are not numeric. To skip non-numerics, use b_skip=TRUE")
numCols <- tmp[which(tmp==TRUE)]
skipped <- tmp[which(tmp==FALSE)]
if(!length(numCols))
stop(paste0("No numeric cols. All skipped: ", paste0(skipped, collapse = ", ")), call. = FALSE)
for(c in cols)
data.table::set(dat, j = c, value = round(get(c, dat), digits))
return(dat)
}
#' @describeIn easycleaning A convienience wrapper for \code{data.table::setcolorder} that makes it easy
#' set the order of a subset of columns
#' @export
p_setcolorder <- function(DT, cols=NULL, aslast=TRUE, verbose = FALSE){
if(is.null(cols))
stop("provide a vector of column names to set order")
cnam <- colnames(DT)
othercols <- cnam[!cnam %in% cols]
if(aslast)
setcolorder(DT, c(othercols, cols))
else
setcolorder(DT, c(cols, othercols))
content <- paste0(1:length(colnames(DT)), ". ", colnames(DT), collapse = "\n")
if(verbose) print("New column order set", content = content)
}
# #' @describeIn dataclean A function to expand elements of a timestamp into
# #' numerous columns in a new data.table
# #' @export
# xDate <- function(date_vec){
# DateDT <- data.table(
# Wkday = lubridate::wday(date_vec), # day of week
# Monthday = lubridate::day(date_vec), # day of month
# Yrday = lubridate::yday(date_vec), # day of year
# Week = lubridate::week(date_vec), # Week of year
# Month = lubridate::month(date_vec), # month of year
# Qtr = lubridate::quarter(date_vec), # quarter of year
# Year = lubridate::year(date_vec), # Year
# Hour = lubridate::hour(date_vec),
# Minute = lubridate::minute(date_vec),
# Second = lubridate::second(date_vec)
# )
# return(DateDT)
# }
#
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.