llutils: Linelist Utilities

Documented in extract_datetime

#' Extract and parse date-times within strings such as file names
#'
#' @description
#' Extract and parse date-times within strings such as filenames. Currently only
#' handles format "yyyy?mm?dd?HH?MM?(SS)", where "?" represents an optional
#' space or punctuation character, and the seconds slot (SS) is optional.
#'
#' @param x Vector of strings such as file names
#' @param remove_path Logical indicating whether to strip file paths from `x`
#'   using \code{\link[base]{basename}} (up to and including the last path
#'   separator) prior to searching for date values. Defaults to `TRUE`.
#' @param year_min Minimum year value expected. Targeting an expected year range
#'   helps avoid false positive matches to number sequences that are not dates.
#'   Defaults to `2000`.
#' @param year_max Maximum year value expected. Defaults to the current year
#'   based on `Sys.date`.
#' @param if_multiple If multiple date-times within a string, return only the
#'   first ("use_first"), or only the last ("use_last"). Defaults to "use_last".
#'
#' @return
#' Vector of date-times (of class POSIXct/POSIXt). Returns NA when no date-time
#' found within a given string.
#'
#' @examples
#' files <- c(
#'   "~/Documents/2020-02-01/myfile-2020-06-30-1230.csv",
#'   "2020_06_30_052051_Database_Complete_v1230.csv",
#'   "~/Desktop/data__cleaned1.xlsx",
#'   "~/Desktop/data__cleaned__2020-07-01_16-25.xlsx",
#'   "22062020-covid19-cases.xlsm",
#'   "/Documents/2015/PhD.Data.20091020_1247.Final.xls",
#'   "/exports/2015-03-05/export-2015-03-05_1352.xls"
#' )
#'
#' extract_datetime(files)
#'
#' @importFrom stringr str_extract_all
#' @importFrom lubridate as_datetime
#' @export extract_datetime
extract_datetime <- function(x,
                             remove_path = TRUE,
                             year_min = 2000,
                             year_max = as.integer(format(Sys.Date(), "%Y")),
                             if_multiple = "use_last") {

  if_multiple <- match.arg(if_multiple, c("use_first", "use_last"))

  if (remove_path) {
    x <- basename(x)
  }

  # extract date-time
  regex_y <- prep_regex(seq(year_min, year_max, by = 1L))
  regex_m <- prep_regex(1:12)
  regex_d <- prep_regex(1:31)
  regex_hr <- prep_regex(0:23)
  regex_mi <- prep_regex(0:60)
  regex_se <- paste0(prep_regex(0:62), "?")
  regex_sep <- "[[:punct:]|[:space:]]?"


  r_datetime <- paste(
    regex_y, regex_m, regex_d, regex_hr, regex_mi, regex_se, sep = regex_sep
  )

  dt_extract <- stringr::str_extract_all(x, r_datetime)

  # if multiple date-times within string, select either first-only or last-only
  if (if_multiple == "use_last") {
    dt_extract <- vapply(dt_extract, function (x) ifelse(length(x) > 0, x[length(x)], NA_character_), "")
  } else {
    dt_extract <- vapply(dt_extract, function (x) ifelse(length(x) > 0, x[1L], NA_character_), "")
  }

  # strip punctuation and space characters
  dt_extract_std <- gsub("[[:space:]]|[[:punct:]]", "", dt_extract)

  # convert to date-time, trying both hm and hms formats
  dt_hm <- lubridate::as_datetime(dt_extract_std, format = "%Y%m%d%H%M")
  dt_hms <- lubridate::as_datetime(dt_extract_std, format = "%Y%m%d%H%M%S")

  # coalesce hm and hms formats
  coalesce_dates(x = dt_hms, y = dt_hm, prefer = "x")
}