link_impl <- function(x,
y,
by,
start_time,
end_time,
y_time,
offset_before,
offset_after,
add_before,
add_after,
name) {
  # Force these arguments to be evaluated now; otherwise they may not be found later on
  # (they are promises and may be evaluated in a different environment, e.g. a furrr worker)
force(add_before)
force(add_after)
force(name)
# Filter y to keep only `by` instances that occur in x
if (!rlang::is_null(by) && length(by) != 0) {
y <- dplyr::semi_join(y, x, by = by)
}
# Prepare x
  # Create a row_id for rematching later
x <- x |>
tibble::as_tibble() |>
mutate(.row_id = dplyr::row_number())
# If no end_time is specified, calculate the start and end time of the interval using offset
# Else, take the start time and end time as specified by the user
if (is.null(end_time) || length(end_time) == 0) {
data <- x |>
mutate(across(.env$start_time, as.integer, .names = ".x_time")) |>
mutate(.start_time = .data$.x_time - offset_before) |>
mutate(.end_time = .data$.x_time + offset_after) |>
select({{ by }}, ".start_time", ".end_time", ".row_id")
} else {
    # Rename to ensure column names of x and y do not clash
    # (also easier to work with)
data <- x |>
dplyr::rename(.start_time = .env$start_time) |>
dplyr::rename(.end_time = .env$end_time) |>
mutate(.start_time = as.integer(.data$.start_time)) |>
mutate(.end_time = as.integer(.data$.end_time)) |>
select({{ by }}, ".start_time", ".end_time", ".row_id")
}
# Match sensing data with ESM using a left join
data <- data |>
dplyr::left_join(y, by = by, multiple = "all", relationship = "many-to-many") |>
mutate(across(dplyr::all_of(y_time), as.integer, .names = ".y_time")) |>
tidyr::drop_na(".start_time", ".end_time")
# The main data, i.e. data exactly within the interval
data_main <- data |>
filter(.data$.y_time >= .data$.start_time & .data$.y_time <= .data$.end_time) |>
arrange(across(c({{ by }}, ".y_time"))) |>
select(-".y_time") |>
nest({{ name }} := !c({{ by }}, ".start_time", ".end_time", ".row_id")) |>
select(dplyr::all_of(c(".row_id", name)))
# Merge back with original data
  # Note: if this join happened after merging data_before and data_after, those would be lost
  # whenever nothing was retained in data_main, as all the row_ids would be deleted as well
data_main <- x |>
dplyr::left_join(data_main, by = ".row_id", multiple = "all", relationship = "many-to-many")
# Add the last measurement before start_time
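  # attr() uses partial matching, so "tz" retrieves the "tzone" attribute of the POSIXct column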
tz <- attr(pull(y, {{ y_time }}), "tz")
if (add_before) {
# Calculate in which groups there is a measurement that equals start_time
equal_to_start <- data |>
filter(.data$.y_time == .data$.start_time) |>
distinct(.data$.row_id)
data_before <- data |>
dplyr::anti_join(equal_to_start, by = ".row_id") |>
filter(.data$.y_time < .data$.start_time) |>
group_by(.data$.row_id) |>
dplyr::slice_max(order_by = .data$.y_time, n = 1, with_ties = TRUE) |>
ungroup() |>
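      # Copy the original y timestamps into original_time before snapping them to the interval start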
mutate(across(dplyr::all_of(y_time), .names = "original_time")) |>
mutate({{ y_time }} := lubridate::as_datetime(.data$.start_time, tz = tz)) |>
select(-".y_time") |>
nest(data_before = !c({{ by }}, ".start_time", ".end_time", ".row_id")) |>
select(".row_id", "data_before")
# Add to the main result
data_main <- data_main |>
dplyr::left_join(data_before, by = ".row_id", multiple = "all", relationship = "many-to-many") |>
mutate({{ name }} := purrr::map2(data_before, !!rlang::ensym(name), bind_rows)) |>
select(-"data_before")
}
# Add the first measurements after end_time
if (add_after) {
    # Calculate in which groups there is a measurement that equals end_time
equal_to_end <- data |>
filter(.data$.y_time == .data$.end_time) |>
distinct(.data$.row_id)
data_after <- data |>
dplyr::anti_join(equal_to_end, by = ".row_id") |>
filter(.data$.y_time > .data$.end_time) |>
group_by(.data$.row_id) |>
dplyr::slice_min(order_by = .data$.y_time, n = 1, with_ties = TRUE) |>
ungroup() |>
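      # Copy the original y timestamps into original_time before snapping them to the interval end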
mutate(across(dplyr::all_of(y_time), .names = "original_time")) |>
mutate({{ y_time }} := lubridate::as_datetime(.data$.end_time, tz = tz)) |>
select(-".y_time") |>
nest(data_after = !c({{ by }}, ".start_time", ".end_time", ".row_id")) |>
select(".row_id", "data_after")
# Add to the main result
data_main <- data_main |>
dplyr::left_join(data_after, by = ".row_id", multiple = "all", relationship = "many-to-many") |>
mutate({{ name }} := purrr::map2(!!rlang::ensym(name), data_after, bind_rows)) |>
select(-"data_after")
}
  # Create an empty tibble (prototype) with the same columns as y by taking zero rows
# This is needed to fill in the data entries where there would otherwise be nothing left
# because nothing matched within the start_time and end_time
proto <- tibble::as_tibble(y[0, ]) |>
select(-{{ by }})
if (add_before || add_after) {
proto$original_time <- as.POSIXct(vector(mode = "double"))
    # If data_main is empty, the loop below would produce NA in the next step, causing
    # proto not to be applied (since the value would not be NULL)
if (nrow(data_main) > 0) {
# Add column original_time in cases where it's missing
for (i in seq_len(nrow(data_main))) {
        if (!("original_time" %in% colnames(data_main[[name]][[i]]))) {
          data_main[[name]][[i]]$original_time <- as.POSIXct(NA, tz = tz)
}
}
}
}
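  # Replace missing or empty matches with the zero-row prototype so the list column stays type-stable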
res <- data_main |>
mutate({{ name }} := ifelse(test = lapply(
X = !!rlang::ensym(name),
FUN = \(x) {
is.null(x) || identical(x, NA) || nrow(x) == 0
}
),
yes = list(proto),
no = !!rlang::ensym(name)
)) |>
select(-".row_id")
res
}
#' Link y to the time scale of x
#'
#' @description `r lifecycle::badge("stable")`
#'
#' One of the key tasks in analysing mobile sensing data is being able to link it to other data.
#' For example, when analysing physical activity data, it could be of interest to know how much
#' time a participant spent exercising before or after an ESM beep to evaluate their stress level.
#' [link()] allows you to map two data frames to each other that are on different time scales,
#' based on a pre-specified offset before and/or after. The timestamp columns of `x` and `y`
#' (specified through `time` and `y_time`) must contain \link[base]{DateTimeClasses}.
#'
#' @details `y` is matched to the time scale of `x` by means of time windows. These time windows are
#' defined as the period between `x - offset_before` and `x + offset_after`. Note that either
#' `offset_before` or `offset_after` can be 0, but not both. The "interval" of the measurements is
#' therefore the associated time window for each measurement of `x` and the data of `y` that also
#' falls within this period. For example, an `offset_before` of
#' \code{\link[lubridate]{minutes}(30)} means to match all data of `y` that occurred *before* each
#' measurement in `x`. An `offset_after` of 900 (i.e. 15 minutes) means to match all data of `y`
#' that occurred *after* each measurement in `x`. When both `offset_before` and `offset_after` are
#' specified, it means all data of `y` is matched in an interval of 30 minutes before and 15
#' minutes after each measurement of `x`, thus combining the two arguments.
#'
#' The arguments `add_before` and `add_after` let you decide whether you want to add the last
#' measurement before the interval and/or the first measurement after the interval respectively.
#' This could be useful when you want to know which type of event occurred right before or after
#' the interval of the measurement. For example, at `offset_before = "30 minutes"`, the data may
#'   indicate that a participant was running 20 minutes before a measurement in `x`. However, with
#'   just that information there is no way of knowing what the participant was doing in the first 10
#' minutes of the interval. The same principle applies to after the interval. When `add_before` is
#' set to `TRUE`, the last measurement of `y` occurring before the interval of `x` is added to the
#' output data as the first row, having the **`time` of \code{x - offset_before}** (i.e. the start
#' of the interval). When `add_after` is set to `TRUE`, the first measurement of `y` occurring
#' after the interval of `x` is added to the output data as the last row, having the **`time` of
#' `x + offset_after`** (i.e. the end of the interval). This way, it is easier to calculate the
#' difference to other measurements of `y` later (within the same interval). Additionally, an
#' extra column (`original_time`) is added in the nested `data` column, which is the original time
#'   of the `y` measurement and `NA` for every other observation. This may be useful to check if
#' the added measurement isn't too distant (in time) from the others. Note that multiple rows may
#' be added if there were multiple measurements in `y` at exactly the same time. Also, if there
#' already is a row with a timestamp exactly equal to the start of the interval (for `add_before =
#'   TRUE`) or to the end of the interval (`add_after = TRUE`), no extra row is added.
#'
#' @section Warning: Note that `add_before` and `add_after` each add one row to each nested
#'   \code{tibble} of the `data` column. Thus, if you are only interested in the total count (e.g.
#'   the number of total screen changes), remember to set these arguments to `FALSE` or make sure
#'   to filter out rows that do _not_ have an `original_time`. Simply subtracting 1 or 2 does not
#'   work, as not all measurements in `x` may have a measurement in `y` before or after the
#'   interval (in which case no row is added).
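#'
#'   As a sketch (assuming a [link()] result called `linked`, created with the default
#'   `name = "data"` and `add_before` and/or `add_after` set): only the added rows carry a
#'   non-missing `original_time`, so the number of measurements genuinely within each interval
#'   can be counted as
#'
#'   ```r
#'   linked$n_within <- purrr::map_int(
#'     linked$data,
#'     \(d) sum(is.na(d$original_time)) # only added rows have a non-missing original_time
#'   )
#'   ```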
#'
#'
#' @param x,y A pair of data frames or data frame extensions (e.g. a tibble). Both `x` and `y`
#'   must contain a timestamp column (see the `time` and `y_time` arguments).
#' @param by A character vector indicating the variable(s) to match by, typically the participant
#' IDs. If NULL, the default, `*_join()` will perform a natural join, using all variables in
#' common across `x` and `y`. Therefore, all data will be mapped to each other based on the time
#' stamps of `x` and `y`. A message lists the variables so that you can check they're correct;
#' suppress the message by supplying by explicitly.
#'
#' To join by different variables on `x` and `y`, use a named vector. For example, `by = c('a' =
#' 'b')` will match `x$a` to `y$b`.
#'
#' To join by multiple variables, use a vector with `length > 1`. For example, `by = c('a', 'b')`
#' will match `x$a` to `y$a` and `x$b` to `y$b`. Use a named vector to match different variables
#' in `x` and `y`. For example, `by = c('a' = 'b', 'c' = 'd')` will match `x$a` to `y$b` and `x$c`
#' to `y$d`.
#'
#' To perform a cross-join (when `x` and `y` have no variables in common), use `by = character()`.
#' Note that the `split` argument will then be set to 1.
#' @param time The name of the column containing the timestamps in `x`.
#' @param end_time Optionally, the name of the column containing the end time in `x`. If specified,
#' it means `time` defines the start time of the interval and `end_time` the end time. Note that
#' this cannot be used at the same time as `offset_before` or `offset_after`.
#' @param y_time The name of the column containing the timestamps in `y`.
#' @param offset_before The time before each measurement in `x` that denotes the period in which `y`
#' is matched. Must be convertible to a period by [lubridate::as.period()].
#' @param offset_after The time after each measurement in `x` that denotes the period in which `y`
#' is matched. Must be convertible to a period by [lubridate::as.period()].
#' @param add_before Logical value. Do you want to add the last measurement before the start of each
#' interval?
#' @param add_after Logical value. Do you want to add the first measurement after the end of each
#' interval?
#' @param name The name of the column containing the nested `y` data.
#' @param split An optional grouping variable to split the computation by. When working with large
#' data sets, the computation can grow so large it no longer fits in your computer's working
#' memory (after which it will probably fall back on the swap file, which is very slow). Splitting
#' the computation trades some computational efficiency for a large decrease in RAM usage. This
#' argument defaults to `by` to automatically suppress some of its RAM usage.
#'
#' @returns A tibble with the data of `x` with a new column `data` with the matched data of `y`
#' according to `offset_before` and `offset_after`.
#'
#' @export
#'
#' @examples
#' # Define some data
#' x <- data.frame(
#' time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 3), 2),
#' participant_id = c(rep("12345", 3), rep("23456", 3)),
#' item_one = rep(c(40, 50, 60), 2)
#' )
#'
#' # Define some data that we want to link to x
#' y <- data.frame(
#' time = rep(seq.POSIXt(as.POSIXct("2021-11-14 12:50:00"), by = "5 min", length.out = 30), 2),
#' participant_id = c(rep("12345", 30), rep("23456", 30)),
#' x = rep(1:30, 2)
#' )
#'
#' # Now link y within 30 minutes before each row in x
#' # until the measurement itself:
#' link(
#' x = x,
#' y = y,
#' by = "participant_id",
#' time = time,
#' y_time = time,
#' offset_before = "30 minutes"
#' )
#'
#' # We can also link y to a period both before and after
#' # each measurement in x.
#' # Also note that time, end_time and y_time accept both
#' # quoted names as well as character names.
#' link(
#' x = x,
#' y = y,
#' by = "participant_id",
#' time = "time",
#' y_time = "time",
#' offset_before = "15 minutes",
#' offset_after = "15 minutes"
#' )
#'
#' # It can be important to also know the measurements
#' # just preceding the interval or just after the interval.
#' # This adds an extra column called 'original_time' in the
#' # nested data, containing the original time stamp. The
#' # actual timestamp is set to the start time of the interval.
#' link(
#' x = x,
#' y = y,
#' by = "participant_id",
#' time = time,
#' y_time = time,
#' offset_before = "15 minutes",
#' offset_after = "15 minutes",
#' add_before = TRUE,
#' add_after = TRUE
#' )
#'
#' # If participant_id is not important to you
#' # (i.e. the measurements are interchangeable),
#' # you can ignore it by leaving `by` empty.
#' # However, in this case we would receive a warning
#' # since x and y have no other columns in common
#' # (except time, of course). Thus, we perform
#' # a cross-join instead:
#' link(
#' x = x,
#' y = y,
#' by = character(),
#' time = time,
#' y_time = time,
#' offset_before = "30 minutes"
#' )
#'
#' # Alternatively, we can specify custom intervals.
#' # That is, we can create variable intervals
#' # without using fixed offsets.
#' x <- data.frame(
#' start_time = rep(
#' x = as.POSIXct(c(
#' "2021-11-14 12:40:00",
#' "2021-11-14 13:30:00",
#' "2021-11-14 15:00:00"
#' )),
#' times = 2
#' ),
#' end_time = rep(
#' x = as.POSIXct(c(
#' "2021-11-14 13:20:00",
#' "2021-11-14 14:10:00",
#' "2021-11-14 15:30:00"
#' )),
#' times = 2
#' ),
#' participant_id = c(rep("12345", 3), rep("23456", 3)),
#' item_one = rep(c(40, 50, 60), 2)
#' )
#' link(
#' x = x,
#' y = y,
#' by = "participant_id",
#' time = start_time,
#' end_time = end_time,
#' y_time = time,
#' add_before = TRUE,
#' add_after = TRUE
#' )
link <- function(x,
y,
by = NULL,
time,
end_time = NULL,
y_time,
offset_before = 0,
offset_after = 0,
add_before = FALSE,
add_after = FALSE,
name = "data",
split = by) {
check_arg(x, type = "data.frame")
check_arg(y, type = "data.frame")
check_arg(by, type = "character", allow_null = TRUE)
check_arg(add_before, type = "logical")
check_arg(add_after, type = "logical")
check_arg(name, type = "character")
if (missing(time)) {
lifecycle::deprecate_warn(
when = "1.1.2",
what = "link(time = 'must not be missing')",
details = c(
i = paste(
"Due to backwards compatiblity, `time` defaults to",
"'time' for now."
),
i = paste(
"Please make this argument explicit to prevent your",
"code from breaking in a future version."
)
)
)
time <- "time"
}
if (missing(y_time)) {
lifecycle::deprecate_warn(
when = "1.1.2",
what = "link(y_time = 'must not be missing')",
details = c(
i = paste(
"Due to backwards compatiblity, `y_time` defaults to",
"'time' for now."
),
i = paste(
"Please make this argument explicit to prevent your",
"code from breaking in a future version."
)
)
)
y_time <- "time"
}
  # Check that end_time is not used together with offset_before or offset_after
if (!missing(end_time) && (!missing(offset_before) || !missing(offset_after))) {
abort("`end_time` and `offset_before` or `offset_after` cannot be used at the same time.")
}
  # Check the offsets if end_time is missing
if (missing(end_time)) {
offsets <- check_offset(offset_before, offset_after)
offset_before <- offsets$offset_before
offset_after <- offsets$offset_after
}
# Do not perform matching when x and y are identical
if (identical(x, y) || isTRUE(all.equal(x, y))) {
abort("`x` and `y` are identical.")
}
# Get the start_time, end_time, and y_time as characters and check their validity
x <- ungroup(x)
y <- ungroup(y)
start_time <- colnames(select(x, {{ time }}))
if (!missing(end_time)) {
end_time <- colnames(select(x, {{ end_time }}))
}
y_time <- colnames(select(y, {{ y_time }}))
by <- colnames(select(x, {{ by }}))
check_arg(start_time, "character", n = 1)
check_arg(end_time, "character", n = 1, allow_null = TRUE)
check_arg(y_time, "character", n = 1)
# Check the time columns
check_arg(pull(x, start_time), "POSIXt", arg = "time")
if (!is.null(end_time)) {
check_arg(pull(x, end_time), "POSIXt", arg = "end_time")
}
check_arg(pull(y, y_time), "POSIXt", arg = "y_time")
  # Split up the data for computational efficiency, either into a fixed number of chunks (numeric)
  # or by a grouping variable
if (!is.null(split)) {
if (is.numeric(split)) {
x <- split(x, rep(1:split, each = ceiling(nrow(x) / split), length.out = nrow(x)))
} else {
      x <- dplyr::group_split(x, across(dplyr::all_of(split)))
}
} else {
x <- list(x)
}
# Temporarily override future global max size options
old <- options(future.globals.maxSize = .Machine$double.xmax)
on.exit(options(old))
x |>
furrr::future_map(
~ link_impl(
x = .x,
y = y,
by = by,
start_time = start_time,
end_time = end_time,
y_time = y_time,
offset_before = offset_before,
offset_after = offset_after,
add_before = add_before,
add_after = add_after,
name = name
),
.options = furrr::furrr_options(seed = TRUE)
) |>
bind_rows()
}
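# Usage sketch for the `split` argument (hypothetical `x` and `y`; not executed here).
# With a future plan set, each chunk of `x` is processed in parallel through furrr:
#
#   future::plan(future::multisession)
#   link(x, y,
#     by = "participant_id", time = "time", y_time = "time",
#     offset_before = "30 minutes",
#     split = 4 # divide x into 4 roughly equal chunks to bound memory usage
#   )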
#' Link two sensors OR one sensor and an external data frame using an mpathsenser database
#'
#' @description `r lifecycle::badge("deprecated")`
#'
#' This function is specific to mpathsenser databases. It is a wrapper around [link()] but
#' extracts data in the database for you. It is now soft deprecated as I feel this function's use
#' is limited in comparison to [link()].
#'
#' @inheritParams get_data
#' @inheritParams link
#' @param sensor_one The name of a primary sensor. See \link[mpathsenser]{sensors} for a list of
#' available sensors.
#' @param sensor_two The name of a secondary sensor. See \link[mpathsenser]{sensors} for a list of
#' available sensors. Cannot be used together with `external`.
#' @param external Optionally, specify an external data frame. Cannot be used at the same time as a
#' second sensor. This data frame must have a column called `time`.
#' @param external_time The name of the column containing the timestamps in `external`.
#' @param reverse Switch `sensor_one` with either `sensor_two` or `external`? Particularly useful in
#' combination with `external`.
#' @param ignore_large Safety override to prevent long wait times. Set to `TRUE` to run this
#'   function on large amounts of data.
#'
#' @seealso [link()]
#'
#' @returns A tibble with the data of `sensor_one` with a new column `data` with the matched data of
#' either `sensor_two` or `external` according to `offset_before` or `offset_after`. The other way
#' around when `reverse = TRUE`.
#' @export
#'
#' @examples
#' \dontrun{
#' # Open a database
#' db <- open_db("path/to/db")
#'
#' # Link two sensors
#' link_db(db, "accelerometer", "gyroscope", offset_before = 300, offset_after = 300)
#'
#' # Link a sensor with an external data frame
#' link_db(db, "accelerometer",
#' external = my_external_data,
#' external_time = "time", offset_before = 300, offset_after = 300
#' )
#' }
link_db <- function(db,
sensor_one,
sensor_two = NULL,
external = NULL,
external_time = "time",
offset_before = 0,
offset_after = 0,
add_before = FALSE,
add_after = FALSE,
participant_id = NULL,
start_date = NULL,
end_date = NULL,
reverse = FALSE,
ignore_large = FALSE) {
# Soft deprecate warning
lifecycle::deprecate_warn("1.1.2", "link_db()", "link()")
check_db(db)
check_arg(sensor_one, type = "character", n = 1)
check_arg(sensor_two, type = "character", n = 1, allow_null = TRUE)
check_arg(external, type = "data.frame", allow_null = TRUE)
check_arg(external_time, type = "character", n = 1)
check_arg(participant_id, type = "character", allow_null = TRUE)
check_arg(reverse, type = "logical", n = 1)
check_arg(ignore_large, type = "logical", n = 1)
if ((is.null(external) && is.null(sensor_two)) || (!is.null(external) && !is.null(sensor_two))) {
abort("Either a second sensor or an external data frame must be supplied.")
}
  # Check that the data is not unreasonably large
if (!ignore_large) {
n <- sum(
get_nrows(db, c(sensor_one, sensor_two), participant_id, start_date, end_date),
nrow(external)
)
if (n > 1e+05) {
abort("the total number of rows is higher than 100000. Use ignore_large = TRUE to continue")
}
}
if (!is.null(sensor_two)) {
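    # Reconstruct POSIXct timestamps (UTC, as stored in the database) from the date and time columns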
dat_two <- get_data(db, sensor_two, participant_id, start_date, end_date) |>
mutate(time = paste(.data$date, .data$time)) |>
select(-"date") |>
collect() |>
mutate(time = as.POSIXct(.data$time, format = "%F %H:%M:%OS", tz = "UTC"))
y_time <- "time"
} else {
check_arg(pull(external, external_time), "POSIXt")
if (any(format(pull(external, external_time), "%Z") != "UTC")) {
warn(c(
"`external` is not using UTC as a time zone, unlike the data in the database.",
i = "Consider converting the time column to UTC."
))
}
y_time <- external_time
dat_two <- external
}
  # Get the dates of dat_two to shrink dat_one as much as possible
  dates <- unique(as.Date(pull(dat_two, y_time)))
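  # Fetch sensor_one data for those dates and reconstruct POSIXct timestamps (UTC)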
dat_one <- get_data(db, sensor_one, participant_id, start_date, end_date) |>
filter(.data$date %in% dates) |>
mutate(time = paste(.data$date, .data$time)) |>
select(-"date") |>
collect() |>
mutate(time = as.POSIXct(.data$time, format = "%F %H:%M:%OS", "UTC"))
time <- "time"
if (reverse) {
tmp_time <- time
time <- y_time
y_time <- tmp_time
}
if (is.null(external) && reverse) {
tmp <- dat_one
dat_one <- dat_two
dat_two <- tmp
} else if (!is.null(external) && !reverse) {
tmp <- dat_one
dat_one <- external
dat_two <- tmp
}
link(
x = dat_one,
y = dat_two,
by = "participant_id",
time = {{ time }},
y_time = {{ y_time }},
offset_before = offset_before,
offset_after = offset_after,
add_before = add_before,
add_after = add_after
)
}
#' Link gaps to (ESM) data
#'
#' @description `r lifecycle::badge("stable")`
#'
#' Gaps in mobile sensing data typically occur when the app is stopped by the operating system or
#' the user. While small gaps may not pose problems with analyses, greater gaps may cause bias or
#' skew your data. As a result, gap data should be considered in order to inspect and limit their
#' influence. This function, analogous to [link()], allows you to connect gaps to other data
#' (usually ESM/EMA data) within a user-specified time range.
#'
#' @param data A data frame or an extension to a data frame (e.g. a tibble). While gap data can be
#' linked to any other type of data, ESM data is most commonly used.
#' @param gaps A data frame (extension) containing the gap data. See [identify_gaps()] for
#' retrieving gap data from an mpathsenser database. It should at least contain the columns `from`
#' and `to` (both in a date-time format), as well as any specified columns in `by`.
#' @inheritParams link
#' @param raw_data Whether to include the raw data (i.e. the matched gap data) in the output as
#'   `gap_data`.
#'
#' @seealso [bin_data()] for linking two sets of intervals to each other; [identify_gaps()] for
#'   finding gaps in the sampling; [add_gaps()] for adding gaps to sensor data.
#'
#' @returns The original `data` with an extra column `gap` indicating the total gap duration within
#'   the interval in seconds, and, if `raw_data` is `TRUE`, an extra column called `gap_data`
#'   containing the gaps within the interval. The function ensures all durations and gap time
#'   stamps are within the range of the interval.
#' @export
#'
#' @examples
#' # Create some data
#' x <- data.frame(
#' time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 3), 2),
#' participant_id = c(rep("12345", 3), rep("23456", 3)),
#' item_one = rep(c(40, 50, 60), 2)
#' )
#'
#' # Create some gaps
#' gaps <- data.frame(
#' from = as.POSIXct(c("2021-11-14 13:00:00", "2021-11-14 14:00:00")),
#' to = as.POSIXct(c("2021-11-14 13:30:00", "2021-11-14 14:30:00")),
#' participant_id = c("12345", "23456")
#' )
#'
#' # Link the gaps to the data
#' link_gaps(x, gaps, by = "participant_id", offset_before = 0, offset_after = 1800)
#'
#' # Link the gaps to the data and include the raw data
#' link_gaps(
#' x,
#' gaps,
#' by = "participant_id",
#' offset_before = 0,
#' offset_after = 1800,
#' raw_data = TRUE
#' )
link_gaps <- function(
data,
gaps,
by = NULL,
offset_before = 0,
offset_after = 0,
raw_data = FALSE) {
# Argument checking
check_arg(data, type = "data.frame")
check_arg(gaps, type = "data.frame")
check_arg(by, type = "character", allow_null = TRUE)
check_arg(raw_data, type = "logical", n = 1)
offsets <- check_offset(offset_before, offset_after)
offset_before <- offsets$offset_before
offset_after <- offsets$offset_after
# Check for time column in data
if (!("time" %in% colnames(data))) {
abort("Column `time` must be present in `data`")
}
  # Check for from and to columns in gaps
  if (!("from" %in% colnames(gaps) && "to" %in% colnames(gaps))) {
    abort("Columns `from` and `to` must be present in `gaps`.")
}
if (!lubridate::is.POSIXct(data$time)) {
abort("Column `time` in `data` must be a POSIXct.")
}
# Check that gap or gap_data is not already present in data
if ("gap" %in% colnames(data)) {
abort("column 'gap' should not already be present in data")
}
if (raw_data && "gap_data" %in% colnames(data)) {
abort("column 'gap_data' should not already be present in data")
}
# Calculate the start and end time of the interval (in seconds) of each row in data
# Only retain gaps that are (partially) within or span over the interval
data_gaps <- data |>
select({{ by }}, "time") |>
mutate(time_int = as.integer(.data$time)) |>
mutate(start_interval = .data$time_int - offset_before) |>
mutate(end_interval = .data$time_int + offset_after)
data_gaps <- data_gaps |>
dplyr::left_join(gaps, by = by, multiple = "all", relationship = "many-to-many") |>
mutate(across(c("from", "to"), as.integer)) |>
filter(.data$from < .data$end_interval & .data$to > .data$start_interval)
  # Clamp gap time stamps outside the interval to the interval's bounds
data_gaps <- data_gaps |>
mutate(from = ifelse(.data$from < .data$start_interval,
.data$start_interval,
.data$from
)) |>
mutate(to = ifelse(.data$to > .data$end_interval, .data$end_interval, .data$to)) |>
mutate(gap = .data$to - .data$from)
if (raw_data) {
# Transform from and to back to POSIXct and nest the data
data_gaps <- data_gaps |>
select({{ by }}, "time", "from", "to", "gap") |>
mutate(from = as.POSIXct(.data$from,
origin = "1970-01-01",
tz = attr(gaps$from, "tzone")
)) |>
mutate(to = as.POSIXct(.data$to,
origin = "1970-01-01",
tz = attr(gaps$to, "tzone")
)) |>
group_by(across(c({{ by }}, "time"))) |>
nest(gap_data = c("from", "to", "gap")) |>
ungroup()
# Add gaps at beep level
data_gaps$gap <- vapply(data_gaps$gap_data, function(x) sum(x$gap, na.rm = TRUE),
FUN.VALUE = double(1)
)
# Create empty data frame in case no results are found
proto <- tibble(
from = as.POSIXct(vector(mode = "double"),
origin = "1970-01-01",
tz = attr(gaps$from, "tzone")
),
to = as.POSIXct(vector(mode = "double"),
origin = "1970-01-01", tz =
attr(gaps$from, "tzone")
),
gap = integer(0)
)
} else {
data_gaps <- data_gaps |>
group_by(across(c({{ by }}, "time"))) |>
summarise(gap = sum(.data$gap), .groups = "drop")
}
# Merge with ESM data
data <- data |>
tibble::as_tibble() |>
dplyr::left_join(data_gaps, by = c(by, "time"), multiple = "all", relationship = "many-to-many") |>
mutate(gap = ifelse(is.na(.data$gap), 0, .data$gap))
if (raw_data) {
data <- data |>
mutate(gap_data = ifelse(lapply(.data$gap_data, is.null), list(proto), .data$gap_data))
}
data
}
# Link intervals of y within intervals of x
link_intervals <- function(
x, x_start, x_end,
y, y_start, y_end,
by = NULL,
name = "data") {
check_arg(x, "data.frame")
check_arg(y, "data.frame")
check_arg(by, "character", allow_null = TRUE)
check_arg(name, "character", n = 1)
tz <- attr(pull(y, {{ y_start }}), "tz")
# Calculate which values in y are within x's bounds
if (length(by) == 0 && utils::packageVersion("dplyr") >= "1.1.0") {
res <- dplyr::cross_join(x, y)
} else {
join_by <- dplyr::join_by(
!!!by
)
res <- dplyr::left_join(x, y, by = join_by, multiple = "all", relationship = "many-to-many")
}
res <- res |>
mutate(across(c({{ y_start }}, {{ y_end }}), as.integer)) |>
filter(
(
        (is.na({{ y_end }}) & {{ y_start }} >= {{ x_start }} & {{ y_start }} < {{ x_end }}) |
          (is.na({{ y_start }}) & {{ y_end }} >= {{ x_start }} & {{ y_end }} < {{ x_end }})
) |
({{ y_start }} < {{ x_end }} & {{ y_end }} > {{ x_start }})
)
  # Clamp y time stamps outside x's interval to the interval's bounds
res <- res |>
mutate({{ y_start }} := ifelse({{ y_start }} < {{ x_start }},
{{ x_start }},
{{ y_start }}
)) |>
mutate({{ y_end }} := ifelse({{ y_end }} > {{ x_end }},
{{ x_end }},
{{ y_end }}
)) |>
mutate(across(c({{ y_start }}, {{ y_end }}), \(.x) lubridate::as_datetime(.x, tz = tz)))
out <- x |>
dplyr::nest_join(res,
by = c(
by,
colnames(mutate(ungroup(x), {{ x_start }}, .keep = "used")),
colnames(mutate(ungroup(x), {{ x_end }}, .keep = "used"))
),
name = name
)
out
}
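# Illustration (hypothetical data; not executed here): given integer bin bounds in `x`
# and POSIXct intervals in `y`, link_intervals() nests the part of each y interval that
# overlaps a bin into the list column given by `name`, clamped to the bin's bounds:
#
#   x <- tibble::tibble(id = 1, bin_start = 0L, bin_end = 3600L)
#   y <- tibble::tibble(
#     id = 1,
#     from = as.POSIXct("1970-01-01 00:30:00", tz = "UTC"),
#     to = as.POSIXct("1970-01-01 01:30:00", tz = "UTC")
#   )
#   link_intervals(x, x_start = bin_start, x_end = bin_end,
#                  y = y, y_start = from, y_end = to, by = "id")
#   # -> x with a nested `data` column in which `to` is clamped to the end of the bin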
#' Create bins in variable time series
#'
#' @description `r lifecycle::badge("stable")`
#'
#' In time series with variable measurements, an often recurring task is calculating the total time
#' spent (i.e. the duration) in fixed bins, for example per hour or day. However, this may be
#' difficult when two subsequent measurements are in different bins or span over multiple bins.
#'
#' @param data A data frame or tibble containing the time series.
#' @param start_time The column name of the start time of the interval, a POSIXt.
#' @param end_time The column name of the end time of the interval, a POSIXt.
#' @param by A binning specification: either one of `"sec"`, `"min"`, `"hour"`, or `"day"`, or,
#'   when `fixed = FALSE`, a numeric bin width in seconds.
#' @param fixed Whether to create fixed bins. If `TRUE`, bins will be rounded to, for example,
#' whole hours or days (depending on `by`). If `FALSE`, bins will be created based on the
#' first timestamp.
#'
#' @seealso [link_gaps()] for linking gaps to data.
#' @returns A tibble containing the group columns (if any), the start time of each bin (`bin`), and
#'   a nested column `bin_data` with the data falling within each bin.
#' @export
#'
#' @examples
#' library(dplyr)
#'
#' data <- tibble(
#' participant_id = 1,
#' datetime = c(
#' "2022-06-21 15:00:00", "2022-06-21 15:55:00",
#' "2022-06-21 17:05:00", "2022-06-21 17:10:00"
#' ),
#' confidence = 100,
#' type = "WALKING"
#' )
#'
#' # get bins per hour, even if the interval is longer than one hour
#' data |>
#' mutate(datetime = as.POSIXct(datetime)) |>
#' mutate(lead = lead(datetime)) |>
#' bin_data(
#' start_time = datetime,
#' end_time = lead,
#' by = "hour"
#' )
#'
#' # Alternatively, you can give an integer value to by to create custom-sized
#' # bins, but only if fixed = FALSE. Note that these bins are not rounded
#' # (to, in this example, whole 30-minute marks), but rather depend on the
#' # earliest time in the group.
#' data |>
#' mutate(datetime = as.POSIXct(datetime)) |>
#' mutate(lead = lead(datetime)) |>
#' bin_data(
#' start_time = datetime,
#' end_time = lead,
#' by = 1800L,
#' fixed = FALSE
#' )
#'
#' # More complicated data for showcasing grouping:
#' data <- tibble(
#' participant_id = 1,
#' datetime = c(
#' "2022-06-21 15:00:00", "2022-06-21 15:55:00",
#' "2022-06-21 17:05:00", "2022-06-21 17:10:00"
#' ),
#' confidence = 100,
#' type = c("STILL", "WALKING", "STILL", "WALKING")
#' )
#'
#' # binned_intervals also takes into account the prior grouping structure
#' out <- data |>
#' mutate(datetime = as.POSIXct(datetime)) |>
#' group_by(participant_id) |>
#' mutate(lead = lead(datetime)) |>
#' group_by(participant_id, type) |>
#' bin_data(
#' start_time = datetime,
#' end_time = lead,
#' by = "hour"
#' )
#' print(out)
#'
#' # To get the duration for each bin (adapt the variable names inside sum as needed):
#' purrr::map_dbl(
#' out$bin_data,
#' ~ sum(as.double(.x$lead) - as.double(.x$datetime),
#' na.rm = TRUE
#' )
#' )
#'
#' # Or:
#' out |>
#' tidyr::unnest(bin_data, keep_empty = TRUE) |>
#' mutate(duration = .data$lead - .data$datetime) |>
#' group_by(bin, .add = TRUE) |>
#' summarise(duration = sum(.data$duration, na.rm = TRUE), .groups = "drop")
bin_data <- function(data,
start_time,
end_time,
by = c("sec", "min", "hour", "day"),
fixed = TRUE) {
check_arg(data, "data.frame")
check_arg(fixed, "logical")
group_vars <- dplyr::group_vars(data)
if (!is.null(by) && is.character(by)) {
by <- match.arg(by, c("sec", "min", "hour", "day"))
by_duration <- c(sec = 1L, min = 60L, hour = 3600L, day = 86400L)
    by_duration <- by_duration[[by]]
} else if (is.numeric(by) && !fixed) {
by_duration <- by
} else {
abort(paste(
"`by` must be one of 'sec', 'min', 'hour', or 'day',",
"or a numeric value if `fixed = FALSE`."
))
}
tz <- attr(pull(data, {{ start_time }}), "tz")
# check that start_time and end_time are a datetime, or try to convert
if (!lubridate::is.POSIXt(pull(data, {{ start_time }})) ||
!lubridate::is.POSIXt(pull(data, {{ end_time }}))) {
data <- data |>
mutate({{ start_time }} := as.POSIXct({{ start_time }}, origin = "1970-01-01")) |>
mutate({{ end_time }} := as.POSIXct({{ end_time }}, origin = "1970-01-01"))
}
  # Generate the output structure with unique bin start times, keeping the grouping structure
out <- data |>
tidyr::pivot_longer(
cols = c({{ start_time }}, {{ end_time }}),
names_to = NULL,
values_to = "bin_start"
)
if (fixed) {
out <- out |>
mutate(bin_start = trunc(.data$bin_start, by))
# mutate(bin_start = lubridate::floor_date(.data$bin_start, by))
}
out <- out |>
distinct() |>
drop_na("bin_start")
if (utils::packageVersion("dplyr") >= "1.1.0") { # nocov start
groups <- dplyr::group_vars(out)
out <- out |>
dplyr::reframe(bin_start = seq.POSIXt(
from = min(.data$bin_start, na.rm = TRUE),
to = max(.data$bin_start, na.rm = TRUE) + by_duration,
by = by_duration
))
# Regroup after reframe
if (length(groups) > 0) {
out <- group_by(out, dplyr::pick(dplyr::all_of(groups)))
}
} else {
out <- out |>
summarise(bin_start = seq.POSIXt(
from = min(.data$bin_start, na.rm = TRUE),
to = max(.data$bin_start, na.rm = TRUE) + by_duration,
by = by_duration
))
} # nocov end
if (by == "day") {
out <- out |>
mutate(bin_start = round.POSIXt(.data$bin_start, units = by))
}
out <- out |>
mutate(bin_start = as.integer(as.POSIXct(.data$bin_start)))
out <- out |>
mutate(bin_end = lead(.data$bin_start)) |>
drop_na("bin_end")
out <- link_intervals(
x = out, x_start = .data$bin_start, x_end = .data$bin_end,
y = data, y_start = {{ start_time }}, y_end = {{ end_time }},
by = group_vars,
name = "bin_data"
)
out <- out |>
mutate(bin_start = lubridate::as_datetime(.data$bin_start, tz = tz)) |>
dplyr::rename(bin = "bin_start") |>
select(-"bin_end")
out
}