## ----------------------------------------------------------------------
## FUNCTIONS OPERATING ON DATES
## ----------------------------------------------------------------------
## TODO:
## - re-write as generic to both allow df and individual columns
## name it: convert.date
## - possibly extract "convert.unknown.date.to.na" as individual function
#' Convert all date columns of a data.frame to Date vectors.
#'
#' This function identifies all date columns in a data frame and converts them
#' to vectors of Date objects. Optionally, unknown dates (e.g. coded as '1900-01-01')
#' can be converted to \code{NA}.
#'
#' @param dat data frame to convert dates
#' @param .format format of the dates (default: '\%Y-\%m-\%d')
#' @param convert.unknown.date.to.na flag for conversion of unknown dates
#' @param unknown.date.string string specifying the coding of unknown dates
#' @param partial.date.handling how to handle columns containing entries that cannot be converted (default: skip; see Details)
#' @param partial.date.string string used for unknown data components (e.g. 'Unknown' or '--')
#' @param silent hide verbose output
#' @return Date frame with date columns converted.
#' @details \code{partial.date.handling}: By default, potential data columns containing entries that cannot be converted are skipped (option 'skip'). With 'force.conversion' fields incompatible with the date format are converted to NA. 'fill.partial.dates' keeps the orginal column untouched and adds a new column to the data frame with the name <colname>.partial.dates.processed in which partial are filled using \code{\link{fill.partial.date}} (e.g. Unknown.01.2013 -> 15.01.2013). With 'fill.partial.dates.and.keep.original' partial dates are processed directly and the original data is copied to <colname>.original.
#' @export
#' @seealso fill.partial.date
#' @examples
#' convert.all.dates(data.frame(date=c("2014-01-Unknown", "1900-01-01")),
#' partial.date.handling="fill.partial.dates", convert.unknown.date.to.na = TRUE,
#' unknown.date.string="1900-01-01")
#' ## date date.partial.dates.processed
#' ##1 2014-01-Unknown 2014-01-15
#' ##2 1900-01-01 <NA>
#' @author Pascal Benkert
convert.all.dates <- function(dat, .format="%Y-%m-%d", convert.unknown.date.to.na = FALSE, unknown.date.string=NULL, partial.date.handling="skip", partial.date.string="Unknown", silent=FALSE) {
for(i in names(dat)) {
## since multiple date formats may be check, skip if already converted to Date
if (inherits(dat[[i]], "Date")) next
## handle error, e.g. Error in strptime(x, format, tz = "GMT"): input string is too long
possibleError <- tryCatch(x <- as.Date(as.character(dat[[i]]),
format=.format),
error = function(e) {
warning(paste0("Column '", i, "': ", e, " Skipped..."))
})
if(inherits(possibleError, "error")) next
##x <- as.Date(as.character(dat[[i]]), format=.format)
text.length <- as.numeric(sapply(as.character(dat[[i]])[!is.na(dat[[i]])],FUN=nchar))
## non-date columns have only NA entries after conversion (others are date fields)
## FIXME: "Unknown." is used in the in the MS cohort as a placeholder which is a 7 digits string:
## To avoid that comment columns with a by change date entry are converted which check the
## overall string length. alternatively we may convert prior to all other steps "Unknown" (i.e. unknown.date.string to a generic generic placeholder "00" or "--"
if(!all(is.na(x)==TRUE) & (all(text.length<=10 & text.length>=4) | length(grep("Unknown", dat[[i]], fixed=TRUE)) != 0)) {
## identify 3-component-fields
is.three.component.field <- FALSE
date.string.length <- 8 + (nchar(.format) - 6)
##
if(nchar(partial.date.string) > 2 & length(grep(partial.date.string, dat[[i]])) > 0) is.three.component.field <- TRUE
## handle dates in freetext fields (e.g. comments: "23.05.2013: positive")
## none of the entries is allowed to be longer than the format string
## calculate date string length: len("%Y%m%d")=8, len("%Y.%m.%d")=10
if(is.three.component.field == FALSE & TRUE %in% (as.numeric(sapply(as.character(dat[[i]]),FUN=nchar)) > date.string.length)) {
if(silent==FALSE) cat(" *", i,"identified as potential date column, but contains \nstrings longer than date format\n")
next
}
if(silent==FALSE) cat(" *", i,"identified as potential date column\n")
converted <- TRUE
col.original <- dat[[i]]
## handle missing dates coded as string (unknown.date.string)
if(!is.null(unknown.date.string)) {
nr.missing.dates <- length(dat[[i]][!is.na(dat[[i]]) & dat[[i]] == unknown.date.string])
if(nr.missing.dates > 0) {
if(convert.unknown.date.to.na == TRUE) {
dat[[i]][!is.na(dat[[i]]) & dat[[i]] == unknown.date.string] <- NA
if(silent==FALSE) cat(" *", nr.missing.dates,"date(s) coded as", unknown.date.string, "set to NA\n")
}
else {
if(silent==FALSE) cat(" *", nr.missing.dates,"date(s) coded as", unknown.date.string, "identified\n")
}
}
}
## fill up trimed dates
text.length <- as.numeric(sapply(as.character(dat[[i]])[!is.na(dat[[i]])],FUN=nchar))
if ((partial.date.string == "" | partial.date.string == " ") & nchar(.format) == 6 & length(text.length) > 0 & all(text.length<=8 & text.length>=4)) {
## %Y%m%d: fill right
if (.format == "%Y%m%d") dat[[i]] <- sprintf("%-8s", dat[[i]])
## %d%m%Y: fill right
else if (.format == "%d%m%Y") dat[[i]] <- sprintf("%8s", dat[[i]])
else {
warning(paste(" *", "incomplete dates of the format", .format, "and no placeholder cannot be handeled."), immediate.=TRUE)
next
}
## restore NA and ""
dat[[i]][trimws(dat[[i]]) == "NA" | trimws(dat[[i]]) == ""] <- NA
partial.date.string = " "
}
## convert dates
col.before.conversion <- dat[[i]] # unknown.date.string already processed
dat[[i]] <- as.Date(as.character(dat[[i]]), format=.format)
## handle entries that cannot be converted
## is NA after conversion but was not NA before
corrupt.entires <- (is.na(dat[[i]]) & (!is.na(col.before.conversion)))
if(TRUE %in% corrupt.entires) {
if(partial.date.handling == "skip") {
dat[[i]] <- col.before.conversion
warning(paste(" *", length(which(corrupt.entires)),"could not be converted - conversion skipped."),immediate.=TRUE)
cat(" *", length(which(corrupt.entires)),"could not be converted - conversion skipped.\n")
converted <- FALSE
}
else if(partial.date.handling == "fill.partial.dates") {
## restore original
dat[[i]] <- col.original
tryCatch({
new.col.name <- paste(i, ".partial.dates.processed", sep="")
dat[[new.col.name]] <- as.Date(as.character(unlist(lapply(col.before.conversion, FUN= function(x,y,z) fill.partial.date(x, partial.date.string, .format) ))), format=.format)
dat <- move.column.after(dat, new.col.name, i)
cat(" * ", length(which(corrupt.entires))," partial dates identified - new column", new.col.name, "\n")
converted <- TRUE
}, error = function(err) {
##stop()
warning(paste(" *", length(which(corrupt.entires)),"could not be converted - conversion skipped."),immediate.=TRUE)
cat(" *", length(which(corrupt.entires)),"could not be converted - conversion skipped.\n")
converted <- FALSE
})
}
else if(partial.date.handling == "fill.partial.dates.and.keep.original") {
## restore original
tryCatch({
new.col.name <- paste(i, ".original", sep="")
dat[[new.col.name]] <- col.original
dat[[i]] <- as.Date(as.character(unlist(lapply(col.before.conversion, FUN= function(x,y,z) fill.partial.date(x, partial.date.string, .format) ))), format=.format)
dat <- move.column.after(dat, new.col.name, i)
cat(" * ", length(which(corrupt.entires))," partial dates identified - dates converted and original data stored in column", new.col.name, "\n")
converted <- TRUE
}, error = function(err) {
##stop()
warning(paste(" *", length(which(corrupt.entires)),"could not be converted - conversion skipped."),immediate.=TRUE)
cat(" *", length(which(corrupt.entires)),"could not be converted - conversion skipped.\n")
converted <- FALSE
})
}
else {
warning(paste(" *", length(which(corrupt.entires)),"could not be converted. Fields converted to NA."), immediate.=TRUE)
cat(" *", length(which(corrupt.entires)),"could not be converted. Fields converted to NA.\n")
}
}
cat(" *", "column", ifelse(converted, "","NOT"), "converted to date\n")
}
}
return(dat)
}
## ----------------------------------------------------------------------
#' Fill partial dates.
#'
#' This function convert a partial date of the form Unknown.mm.YYYY or
#' Unknown.Unknown.YYYY to the typical placeholders 15.mm.YYYY and 01.07.YYYY,
#' respectively. Other \code{partial.date.string} can be specified.
#' Alternatively, the 'exact' midpoint of the month/year can be used, i.e.
#' \code{floor(ndays/2)} and 02.07.YYYY.
#'
#' @param date.string string containing a partial date
#' @param partial.date.string string used for unknown data components (e.g. 'Unknown' or '--')
#' @param .format format of the dates (default: '\%Y-\%m-\%d')
#' @param exact use the exact modpoints \code{floor(ndays/2)} and 02.07.YYYY
#' @return Date string, converted if applicable.
#' @export fill.partial.date
#' @examples
#' fill.partial.date("Unknown.02.2014", "Unknown", .format="%d.%m.%Y", exact=TRUE) # 14.02.2014
#' fill.partial.date("2014/--/--", "--", .format="%Y/%m/%d") # 2014/07/01
#' @author Pascal Benkert
fill.partial.date <- function(date.string, partial.date.string="Unknown", .format="%Y-%m-%d", exact=FALSE) {
## skip NA's
if(is.na(date.string)) return(NA)
####################
## german format: ##
####################
## any separator or none allowed
if(length(grep("%d?.%m?.%Y", .format)) != 0) {
separator <- ""
## check whether same separator used or ne separator
if(substr(.format,3,3) == substr(.format,6,6)) {
separator <- substr(.format,3,3)
}
else {
## e.g. %d.%m/%Y
if(nchar(.format) != 6) stop(paste0("Wrong date format '",.format,"'."))
}
if(date.string == paste(partial.date.string, partial.date.string, partial.date.string, sep=separator)) return(NA)
## handle Unknown day+month
unk.day.month.placeholder <- paste("01",separator,"07",separator,sep="")
if(exact == TRUE) unk.day.month.placeholder <- paste("02",separator,"07",separator,sep="")
date.string <- sub(paste(partial.date.string, separator, partial.date.string, separator, sep=""), unk.day.month.placeholder , date.string)
## handle Unknown day
if(regexpr(partial.date.string, date.string)[1] == 1) {
tryCatch(month.days <- days.of.month(as.Date(gsub(partial.date.string,"01",date.string), format=.format)), error = function(e) stop(paste0("Date '",date.string,"' does not match format '", .format, "'.")))
if(exact == TRUE) date.string <- gsub(partial.date.string,floor(month.days/2),date.string) else date.string <- gsub(partial.date.string,"15",date.string)
}
## final check that output is correct date
if(is.na(as.Date(date.string, format=.format))) stop(paste0("Date '",date.string,"' cannot be processed using format '", .format, "'."))
return(date.string)
}
####################
## english format:##
####################
## any separator or none allowed
if(length(grep("%Y?.%m?.%d", .format)) != 0) {
separator <- ""
## check whether same separator used or ne separator
if(substr(.format,3,3) == substr(.format,6,6)) separator <- substr(.format,3,3)
else {
## e.g. %d.%m/%Y
if(nchar(.format) != 6) stop(paste0("Wrong date format '",.format,"'."))
}
if(date.string == paste(partial.date.string, partial.date.string, partial.date.string, sep=separator)) return(NA)
## handle Unknown day+month
unk.day.month.placeholder <- paste(separator, "07", separator, "01", sep="")
if(exact == TRUE) unk.day.month.placeholder <- paste(separator, "07", separator, "02", sep="")
date.string <- sub(paste(separator, partial.date.string, separator, partial.date.string, sep=""), unk.day.month.placeholder , date.string)
## handle Unknown day
if(regexpr(partial.date.string, date.string)[1] == (nchar(date.string) - nchar(partial.date.string) + 1)) {
tryCatch(month.days <- days.of.month(as.Date(gsub(partial.date.string,"01",date.string), format=.format)), error = function(e) stop(paste0("Date '",date.string,"' does not match format '", .format, "'.")))
if(exact == TRUE) date.string <- gsub(partial.date.string,floor(month.days/2),date.string) else date.string <- gsub(partial.date.string,"15",date.string)
}
## final check that output is correct date
if(is.na(as.Date(date.string, format=.format))) stop(paste0("Date '",date.string,"' cannot be processed using format '", .format, "'."))
return(date.string)
}
else stop(paste0("Wrong date format '",.format,"'."))
}
## ----------------------------------------------------------------------
#' Number of days of given month.
#'
#' This function returns the number of days for a given month as specified by
#' as Date object generated by \code{\link{as.Date}}.
#'
#' @param date Date object
#' @return Number of days.
#'
#' @examples
#' days.of.month(as.Date("01.02.2013", "%d.%m.%Y")) # 28
#' @export
#' @author Pascal Benkert
days.of.month <- function(date) {
assertthat::assert_that(inherits(date, "Date") & !is.na(date))
m <- format(date, format="%m")
while (format(date, format="%m") == m) {
date <- date + 1
}
return(as.integer(format(date - 1, format="%d")))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.