##' @title Grep by Individual Constrained in Time
##' @description in a dataset with one or more variables (typically containing
##' text) associated with a date, find matches on those variables for
##' specific individuals within specifed time frames
##' @param pattern a vector of search strings (regular expressions) (the names
##' attribute will be used as alias if it exists)
##' @param x names of variables to search in (given in order of importance)
##' @param data a data frame
##' @param id name of id variable (in 'data')
##' @param date name of associated date variable (in 'data')
##' @param units a data frame containing id's as well as (but optionally)
##' 'begin' and 'end' variables
##' @param units.id variable name in 'units' to use as id (by default the same
##' as 'id') N.B. a unit can appear several times, and will be identified
##' alongside 'begin' and 'end' (a soft warning will be given if these 3
##' variables are not enough for uniqueness)
##' @param begin variable name in 'units' to use as begin, if missing will be
##' set to earliest date in data
##' @param end variable name in 'units' to use as end, if missing will be set to
##' latest date in data
##' @param include length 2 logical vector specifying if lower (first entry) and
##' upper (second entry) bounds are inclusive (\code{TRUE}) or not
##' (\code{FALSE})
##' @param ... arguments passed to \code{data.table::like} for identifying matches
##' @param data.keep character vector of variables you want to keep from 'data'
##' @param verbose if \code{TRUE} the function will give helpful and/or annoying
##' messages
##' @import data.table
##' @return A data frame with
##' \itemize{
##' \item id the id variable
##' \item alias the name of pattern searched for (else p1, p2, etc)
##' \item date the date of assicated match
##' \item time days from 'begin' to 'date'
##' \item event indicator for a match
##' \item begin the begin date (could be individual)
##' \item end the end date (could be individual)
##' \item match.in the variable the match was found in
##' \item match the match found
##' \item first.id indicator for first occurence of associated
##' id/begin/end-combination
##' \item first.id_date indicator for first occurence of associated
##' id/begin/end- AND date combination
##' \item pattern the pattern searched for
##' \item ... variables selected with 'data.keep'.
##' }
##'
##' Note that any individual can have more than one match. See the vignette for
##' examples.
##' @export
gict <- function(pattern, x, data, id, date,
units, units.id = id, begin = NULL, end = NULL,
include = c(TRUE, TRUE), ...,
data.keep = NULL, verbose = TRUE){
## CHECK ARGUMENTS ---------------------------------------------------------
.Deprecated("survivalist::itc_grep")
properties(x = verbose, class = "logical", length = 1, na.ok = FALSE)
V <- verbose
if(V) message("[gict is verbose] Checking arguments")
properties(x = pattern, class = "character", na.ok =FALSE)
properties(x = names(pattern), nm = "names of 'pattern'",
class = c("NULL", "character"), na.ok =FALSE)
if(is.null(names(pattern))){
alias <- sprintf("alias %s", 1:length(pattern))
pattern <- setNames(object = pattern,
nm = alias)
if(V) message("pattern doesn't have names, alias(es) {",
paste(alias, collapse = ", "),"} is/are used")
}
properties(x = x, class = "character", na.ok = FALSE)
properties(x = data, class = "data.frame")
properties(x = id, class = "character", length = 1, na.ok = FALSE)
properties(x = date, class = "character", length = 1, na.ok = FALSE)
properties(x = units, class = "data.frame")
properties(x = units.id, class = "character", length = 1, na.ok = FALSE)
properties(x = begin, class = c("NULL", "character"),
length = 0:1, na.ok = FALSE)
properties(x = end, class = c("NULL", "character"),
length = 0:1, na.ok = FALSE)
properties(x = include, class = "logical", length = 2, na.ok = FALSE)
properties(x = data.keep, class = c("NULL", "character"), na.ok = FALSE)
inclusion(x = names(data), nm = "'data'",
include = c(id, date, x, data.keep))
inclusion(x = names(units), nm = "'units'", include = c(units.id, begin, end))
avoidance(x = x, avoid = c("id", "date"))
## GET DATA READY ----------------------------------------------------------
if(V) message("Preparing the data")
rk <- renaming_key(x = c(data.keep), nm = "'data'",
avoid = c("id", "date"))
data.select <- c(id, date, x, data.keep)
DATA <- as.data.table(data)[, data.select, with = FALSE]
setnames(DATA, old = c(id, date, names(rk)), new = c("id", "date", rk))
setkey(DATA, id, date)
## GET UNITS READY ---------------------------------------------------------
units.select <- c(units.id, begin, end)
UNITS <- as.data.table(units)[, units.select, with = FALSE]
setnames(UNITS, old = units.id, new = "id")
setkey(UNITS, id)
if(is.null(begin)){
UNITS[, begin := DATA[, min(date, na.rm = TRUE)]]
} else setnames(UNITS, old = begin, new = "begin")
if(is.null(end)){
UNITS[, end := DATA[, max(date, na.rm = TRUE)]]
} else setnames(UNITS, old = end, new = "end")
if(anyDuplicated(UNITS)){
s <- paste0(" | *soft warning*: units {id, begin, end} contains duplicates")
warning(s)
}
## this seems like inelegant coding, but I don't know how to best keep
## at copy of 'begin' and 'end' after the non-equi join creating X:
UNITS[, `:=`(start = identity(begin), stop = identity(end))]
## this seems cumbersome, but no better solution as of yet:
X <- if(include[1]){
if(include[2]){
UNITS[DATA, on = .(id, start <= date, stop >= date),
nomatch = NULL]
} else {
UNITS[DATA, on = .(id, start <= date, stop > date),
nomatch = NULL]
}
} else {
if(include[2]){
UNITS[DATA, on = .(id, start < date, stop >= date),
nomatch = NULL]
} else {
UNITS[DATA, on = .(id, start < date, stop > date),
nomatch = NULL]
}
}
X[, stop := NULL]
setnames(X, old = "start", new = "date")
## NEED TO STORE A MISSING VERSION OF 'data.keep'-VARIABLES ----------------
## (when filling in OTHER below, else rbindlist might fail due to wrong
## class)
keepNA <- DATA[1, data.keep, with = FALSE]
keepNA[1] <- NA
## GO THROUGH ALL PATTERNS AND SEARCH VARIABLES ----------------------------
L <- as.list(NULL)
for(j in seq_along(pattern)){
if(V) message("Now searching for ", names(pattern)[j], " (",
j, "/", length(pattern), ") in variable:", sep = "")
## GET MATCHES FOR PATTERN IN ALL 'x'-VARIABLES - - - - - - - - - - - -
LL <- as.list(NULL)
for(i in seq_along(x)){
if(V) cat(" * [ ] ", x[i], sep = "")
TMP <- X[like(vector = eval(parse(text = x[i])),
pattern = pattern[j])] ## , ...]
if(length(x) > 1) TMP[, setdiff(x, x[i]) := NULL]
TMP[, `:=`(event = 1,
time = as.numeric(date-begin),
match.in = factor(x[i], levels = x))]
setnames(TMP, old = x[i], new = "match")
LL[[i]] <- TMP
if(V) cat("\r * [done] ", x[i], "\n", sep = "")
}
MATCH <- rbindlist(LL)
## ADD THOSE WITHOUT MATCH USING AN ANTI-JOIN - - - - - - - - - - - - -
OTHER <- UNITS[!MATCH, on = .(id, begin, end),
.(id, begin, end, date = end, match = NA_character_)]
if(!is.null(data.keep)) OTHER[, (data.keep):= keepNA]
OTHER[, `:=`(event = 0,
time = as.numeric(date-begin),
match.in = NA_character_)]
L[[j]] <- rbind(MATCH, OTHER)[order(id)][
, `:=`(alias = names(pattern[j]))
]
}
## FIX OUTPUT --------------------------------------------------------------
if(V) message("Pattern search complete. Preparing output")
R <- rbindlist(L)[order(alias, id, begin, date, match.in)][
, `:=`(first.id = as.integer(rowid(alias, id, begin, end) == 1),
first.id_date = as.integer(rowid(alias, id, begin, end, date) == 1))
]
nams <- c("id", "alias", "date", "time", "event", "begin", "end",
"match.in", "match", "first.id", "first.id_date")
setcolorder(R, neworder = c(nams, data.keep))
setattr(R, name = "pattern", value = pattern)
if(dt.active()) R else setDF(R)
}
dt.active <- function() "data.table" %in% .packages()
##' widen gict result
##'
##' Apply \code{data.table::dcast} to turn the result from gict into wide format
##' (one row per {id, begin, end}) with time and event component for each alias.
##' @param x the return object from \code{gict}
##' @param event.only logical; event only data, else time and event components
##' for each alias
##' @return Wide form of gict. If \code{event.only=FALSE} each alias has the
##' match associated with \code{first.id == 1} turned into variables
##' 'ev.alias' (value of 'event') and 't.alias' (value of time). If
##' \code{event.only=FALSE}, each alias gets a column each with the value of
##' 'event' associated with \code{first.id == 1}
##' @export
gict2wide <- function(x, event.only = FALSE){
.Deprecated("survivalist::itc_grep")
properties(x, class = "data.frame")
properties(event.only, class = "logical", length = 1, na.ok = FALSE)
inclusion(names(x), nm = "x",
include = c("id", "begin", "end", "time",
"event", "alias", "first.id"))
if(!"data.table" %in% class(x)) x <- as.data.table(x)
a <- unique(x[['alias']])
R <- if(event.only){
dcast(data = x[first.id == 1,
.(id, begin, end, event, alias)],
formula = id + begin + end ~ alias,
value.var = c("event"))
} else {
dc = dcast(data = x[first.id == 1,
.(id, begin, end, time, event, alias)],
formula = id + begin + end ~ alias,
value.var = c("time", "event"))
g1 <- gsub(pattern = "time_", replacement = "t.",
x = names(dc), fixed = TRUE)
g2 <- gsub(pattern = "event_", replacement = "ev.",
x = g1, fixed = TRUE)
setnames(dc, old = names(dc), new = g2)
colorder <- c("id", "begin", "end",
paste0(c("t.", "ev."), rep(a, each = 2)))
setcolorder(dc, neworder = colorder)
}
if(dt.active()) R else setDF(R)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.