#' Extract and parse date-times within strings such as file names
#'
#' @description
#' Extract and parse date-times within strings such as filenames. Currently only
#' handles format "yyyy?mm?dd?HH?MM?(SS)", where "?" represents an optional
#' space or punctuation character, and the seconds slot (SS) is optional.
#'
#' @param x Vector of strings such as file names
#' @param remove_path Logical indicating whether to strip file paths from `x`
#' using \code{\link[base]{basename}} (up to and including the last path
#' separator) prior to searching for date values. Defaults to `TRUE`.
#' @param year_min Minimum year value expected. Targeting an expected year range
#' helps avoid false positive matches to number sequences that are not dates.
#' Defaults to `2000`.
#' @param year_max Maximum year value expected. Defaults to the current year
#' based on `Sys.date`.
#' @param if_multiple If multiple date-times within a string, return only the
#' first ("use_first"), or only the last ("use_last"). Defaults to "use_last".
#'
#' @return
#' Vector of date-times (of class POSIXct/POSIXt). Returns NA when no date-time
#' found within a given string.
#'
#' @examples
#' files <- c(
#' "~/Documents/2020-02-01/myfile-2020-06-30-1230.csv",
#' "2020_06_30_052051_Database_Complete_v1230.csv",
#' "~/Desktop/data__cleaned1.xlsx",
#' "~/Desktop/data__cleaned__2020-07-01_16-25.xlsx",
#' "22062020-covid19-cases.xlsm",
#' "/Documents/2015/PhD.Data.20091020_1247.Final.xls",
#' "/exports/2015-03-05/export-2015-03-05_1352.xls"
#' )
#'
#' extract_datetime(files)
#'
#' @importFrom stringr str_extract_all
#' @importFrom lubridate as_datetime
#' @export extract_datetime
extract_datetime <- function(x,
remove_path = TRUE,
year_min = 2000,
year_max = as.integer(format(Sys.Date(), "%Y")),
if_multiple = "use_last") {
if_multiple <- match.arg(if_multiple, c("use_first", "use_last"))
if (remove_path) {
x <- basename(x)
}
# extract date-time
regex_y <- prep_regex(seq(year_min, year_max, by = 1L))
regex_m <- prep_regex(1:12)
regex_d <- prep_regex(1:31)
regex_hr <- prep_regex(0:23)
regex_mi <- prep_regex(0:60)
regex_se <- paste0(prep_regex(0:62), "?")
regex_sep <- "[[:punct:]|[:space:]]?"
r_datetime <- paste(
regex_y, regex_m, regex_d, regex_hr, regex_mi, regex_se, sep = regex_sep
)
dt_extract <- stringr::str_extract_all(x, r_datetime)
# if multiple date-times within string, select either first-only or last-only
if (if_multiple == "use_last") {
dt_extract <- vapply(dt_extract, function (x) ifelse(length(x) > 0, x[length(x)], NA_character_), "")
} else {
dt_extract <- vapply(dt_extract, function (x) ifelse(length(x) > 0, x[1L], NA_character_), "")
}
# strip punctuation and space characters
dt_extract_std <- gsub("[[:space:]]|[[:punct:]]", "", dt_extract)
# convert to date-time, trying both hm and hms formats
dt_hm <- lubridate::as_datetime(dt_extract_std, format = "%Y%m%d%H%M")
dt_hms <- lubridate::as_datetime(dt_extract_std, format = "%Y%m%d%H%M%S")
# coalesce hm and hms formats
coalesce_dates(x = dt_hms, y = dt_hm, prefer = "x")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.