tests/testthat/test-linking.R

## link ===============
test_that("link", {
  dat1 <- data.frame(
    time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 3), 2),
    participant_id = c(rep("12345", 3), rep("23456", 3)),
    item_one = rep(c(40, 50, 60), 2)
  )

  dat2 <- data.frame(
    time = rep(seq.POSIXt(as.POSIXct("2021-11-14 12:50:00"), by = "5 min", length.out = 30), 2),
    participant_id = c(rep("12345", 30), rep("23456", 30)),
    x = rep(1:30, 2)
  )

  res <- link(
    x = dat1,
    y = dat2,
    by = "participant_id",
    time = "time",
    y_time = "time",
    offset_before = 1800,
    split = NULL
  )
  true <- tibble::tibble(
    time = rep(c(
      as.POSIXct("2021-11-14 13:00:00"), as.POSIXct("2021-11-14 14:00:00"),
      as.POSIXct("2021-11-14 15:00:00")
    ), 2),
    participant_id = c(rep("12345", 3), rep("23456", 3)),
    item_one = rep(c(40, 50, 60), 2),
    data = rep(list(
      tibble::tibble(
        time = seq.POSIXt(
          from = as.POSIXct("2021-11-14 12:50:00"),
          length.out = 3, by = "5 min"
        ),
        x = 1:3
      ),
      tibble::tibble(
        time = seq.POSIXt(
          from = as.POSIXct("2021-11-14 13:30:00"),
          length.out = 7, by = "5 min"
        ),
        x = 9:15
      ),
      tibble::tibble(
        time = seq.POSIXt(
          from = as.POSIXct("2021-11-14 14:30:00"),
          length.out = 7, by = "5 min"
        ),
        x = 21:27
      )
    ), 2)
  )
  expect_equal(res, true)

  # Test warning if time and y_time are not specified
  lifecycle::expect_deprecated(
    link(
      x = dat1,
      y = dat2,
      by = "participant_id",
      y_time = "time",
      offset_before = 1800,
      split = NULL
    ),
    "The `time` argument of `link\\(\\)` must not be missing as of mpathsenser 1.1.2."
  )

  lifecycle::expect_deprecated(
    link(
      x = dat1,
      y = dat2,
      by = "participant_id",
      time = "time",
      offset_before = 1800,
      split = NULL
    ),
    "The `y_time` argument of `link\\(\\)` must not be missing as of mpathsenser 1.1.2."
  )

  # Test x and y identical
  expect_error(
    link(
      x = dat1,
      y = dat1,
      by = "participant_id",
      time = "time",
      y_time = "time",
      offset_before = 1800,
      split = NULL
    ),
    "`x` and `y` are identical."
  )

  # Test without offset bu using time and end_time
  res2 <- dat1 %>%
    dplyr::rename(end_time = time) %>%
    mutate(start_time = end_time - 1800) %>%
    link(
      x = .,
      y = dat2,
      by = "participant_id",
      time = start_time,
      end_time = end_time,
      y_time = time
    )
  true2 <- true
  true2$start_time <- true2$time - 1800
  true2 <- select(true2, end_time = time, participant_id, item_one, start_time, data)
  expect_equal(res2, true2)

  # Test that end_time and offset_before and offset_after cannot be used at the same time.
  expect_error(
    dat1 %>%
      dplyr::rename(end_time = time) %>%
      mutate(start_time = end_time - 1800) %>%
      link(
        x = .,
        y = dat2,
        by = "participant_id",
        time = start_time,
        end_time = end_time,
        y_time = time,
        offset_before = 1800
      ),
    "`end_time` and `offset_before` or `offset_after` cannot be used at the same time."
  )

  # Test split argument
  res <- link(
    x = dat1,
    y = dat2,
    by = "participant_id",
    time = time,
    y_time = time,
    offset_before = 1800,
    split = 6
  )
  expect_equal(res, true)

  # Scrambled test
  scramble <- function(data) {
    idx <- sample(seq_along(data[, 1]), nrow(data))
    data[idx, ]
  }
  res <- link(
    x = scramble(dat1),
    y = scramble(dat2),
    by = "participant_id",
    time = time,
    y_time = time,
    offset_before = 1800
  ) %>%
    arrange(participant_id, time)
  expect_equal(res, true)

  res <- link(
    x = dat1,
    y = dat2,
    by = "participant_id",
    time = time,
    y_time = time,
    offset_after = 1800
  )
  true <- tibble::tibble(
    time = rep(c(
      as.POSIXct("2021-11-14 13:00:00"), as.POSIXct("2021-11-14 14:00:00"),
      as.POSIXct("2021-11-14 15:00:00")
    ), 2),
    participant_id = c(rep("12345", 3), rep("23456", 3)),
    item_one = rep(c(40, 50, 60), 2),
    data = rep(list(
      tibble::tibble(
        time = seq.POSIXt(
          from = as.POSIXct("2021-11-14 13:00:00"),
          length.out = 7, by = "5 min"
        ),
        x = 3:9
      ),
      tibble::tibble(
        time = seq.POSIXt(
          from = as.POSIXct("2021-11-14 14:00:00"),
          length.out = 7, by = "5 min"
        ),
        x = 15:21
      ),
      tibble::tibble(
        time = seq.POSIXt(
          from = as.POSIXct("2021-11-14 15:00:00"),
          length.out = 4, by = "5 min"
        ),
        x = 27:30
      )
    ), 2)
  )
  expect_equal(res, true)

  res <- link(
    x = scramble(dat1),
    y = scramble(dat2),
    by = "participant_id",
    time = time,
    y_time = time,
    offset_after = 1800
  ) %>%
    arrange(participant_id, time)
  expect_equal(res, true)

  # Test add_before and add_after
  # Add one minute to dat2 time as otherwise a row would not added before
  # This is due to new functionality where a row is not added if the first measurements in an
  # interval is equal to the start time
  dat2$time <- dat2$time + lubridate::minutes(1)
  res <- link(
    x = dat1,
    y = dat2,
    by = "participant_id",
    time = time,
    y_time = time,
    offset_before = 1800,
    add_before = TRUE,
    add_after = TRUE
  )
  true <- tibble::tibble(
    time = rep(c(
      as.POSIXct("2021-11-14 13:00:00"), as.POSIXct("2021-11-14 14:00:00"),
      as.POSIXct("2021-11-14 15:00:00")
    ), 2),
    participant_id = c(rep("12345", 3), rep("23456", 3)),
    item_one = rep(c(40, 50, 60), 2),
    data = rep(list(
      tibble::tibble(
        time = c(
          seq.POSIXt(
            from = as.POSIXct("2021-11-14 12:51:00"),
            length.out = 2, by = "5 min"
          ),
          as.POSIXct("2021-11-14 13:00:00")
        ),
        x = c(1:3),
        original_time = c(
          rep(as.POSIXct(NA), 2),
          as.POSIXct("2021-11-14 13:01:00")
        )
      ),
      tibble::tibble(
        time = c(
          as.POSIXct("2021-11-14 13:30:00"),
          seq.POSIXt(
            from = as.POSIXct("2021-11-14 13:31:00"),
            length.out = 6, by = "5 min"
          ),
          as.POSIXct("2021-11-14 14:00:00")
        ),
        x = c(8, 9:14, 15),
        original_time = c(
          as.POSIXct("2021-11-14 13:26:00"),
          rep(as.POSIXct(NA), 6),
          as.POSIXct("2021-11-14 14:01:00")
        )
      ),
      tibble::tibble(
        time = c(
          as.POSIXct("2021-11-14 14:30:00"),
          seq.POSIXt(
            from = as.POSIXct("2021-11-14 14:31:00"),
            length.out = 6, by = "5 min"
          ),
          as.POSIXct("2021-11-14 15:00:00")
        ),
        x = c(20, 21:26, 27),
        original_time = c(
          as.POSIXct("2021-11-14 14:26:00"),
          rep(lubridate::`NA_POSIXct_`, 6),
          as.POSIXct("2021-11-14 15:01:00")
        )
      )
    ), 2)
  )
  expect_equal(res, true, ignore_attr = TRUE)

  # Without offset, using time, end_time, and y_time
  res2 <- dat1 %>%
    dplyr::rename(end_time = time) %>%
    mutate(start_time = end_time - 1800) %>%
    link(
      x = .,
      y = dat2,
      by = "participant_id",
      time = start_time,
      end_time = end_time,
      y_time = time,
      add_before = TRUE,
      add_after = TRUE
    )
  true$start_time <- true$time - 1800
  true <- select(true, end_time = time, participant_id, item_one, start_time, data)
  expect_equal(res2, true)

  # Bug #6: Test whether original_time is present in all nested data columns
  # Create some data to use
  dat1 <- data.frame(
    time = c(
      rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 3), 2),
      as.POSIXct("2021-11-14 13:00:00"), as.POSIXct("2021-11-14 13:00:00")
    ),
    participant_id = c(rep("12345", 3), rep("23456", 3), "45678", "56789"),
    item_one = c(rep(c(40, 50, 60), 2), 40, 40)
  )

  dat2 <- data.frame(
    time = c(
      rep(seq.POSIXt(as.POSIXct("2021-11-14 12:50:00"), by = "5 min", length.out = 30), 2),
      seq.POSIXt(as.POSIXct("2021-11-14 12:30:00"), by = "5 min", length.out = 6)
    ),
    participant_id = c(rep("12345", 30), rep("23456", 30), rep("45678", 6)),
    x = c(rep(1:30, 2), 1:6)
  )

  # Link together, make sure to include rows before and after
  res <- link(
    x = dat1,
    y = dat2,
    by = "participant_id",
    time = time,
    y_time = time,
    offset_before = 1800,
    add_before = TRUE,
    add_after = TRUE
  )

  # Use the results, e.g. to filter out the extra added rows by offset_before and offset_after
  # (as you have to do for the number of screen unlocks).
  expect_true(all(purrr::map_lgl(res$data, ~ "original_time" %in% colnames(.x))))

  # Bug: add_before and add_after were ignored (or rather lost) if data_main is empty
  dat1 <- data.frame(
    time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 3), 2),
    participant_id = c(rep("12345", 3), rep("23456", 3)),
    item_one = rep(c(40, 50, 60), 2)
  )
  dat2 <- data.frame(
    time = rep(c(as.POSIXct("2021-11-14 11:50:00"), as.POSIXct("2021-11-14 16:50:00")), 2),
    participant_id = c(rep("12345", 2), rep("23456", 2)),
    x = rep(1:2, 2)
  )

  res <- link(
    x = dat1,
    y = dat2,
    by = "participant_id",
    time = time,
    y_time = time,
    offset_before = 1800L,
    add_before = TRUE,
    add_after = TRUE
  )

  expect_true(all(purrr::map_int(res$data, nrow) != 0))

  # Check that timezones stay consistent, even with add_before and add_after
  dat1 <- data.frame(
    time = rep(
      seq.POSIXt(as.POSIXct("2021-11-14 13:00:00", tz = "Europe/Brussels"),
        by = "1 hour",
        length.out = 3
      ),
      2
    ),
    participant_id = c(rep("12345", 3), rep("23456", 3)),
    item_one = rep(c(40, 50, 60), 2)
  )

  dat2 <- data.frame(
    time = rep(
      seq.POSIXt(as.POSIXct("2021-11-14 11:00:00", tz = "UTC"),
        by = "10 mins",
        length.out = 40
      ),
      2
    ),
    participant_id = c(rep("12345", 40), rep("23456", 40)),
    x = rep(1:2, 2)
  )
  res <- link(
    x = dat1,
    y = dat2,
    by = "participant_id",
    time = time,
    y_time = time,
    offset_before = 1800L,
    add_before = TRUE,
    add_after = TRUE
  )
  expect_equal(attr(res$time, "tz"), "Europe/Brussels")
  expect_equal(unique(map_chr(res$data, ~ attr(.x$time, "tz"))), "UTC")
  expect_equal(unique(map_chr(res$data, ~ attr(.x$original_time, "tz"))), "UTC")

  # Make sure link does not add an extra row if first measurement equal start of the interval or
  # vice versa for the end of the interval.
  dat1 <- data.frame(
    time = as.POSIXct(c("2021-11-14 11:00:00", "2021-11-14 12:00:00")),
    participant_id = "12345",
    item_one = c(40, 50)
  )

  dat2 <- data.frame(
    time = as.POSIXct(c(
      "2021-11-14 10:20:00", "2021-11-14 10:30:00", "2021-11-14 10:40:00",
      "2021-11-14 11:00:00", "2021-11-14 11:10:00", "2021-11-14 11:30:01",
      "2021-11-14 11:40:00", "2021-11-14 12:10:00"
    )),
    participant_id = "12345",
    x = 1:8
  )

  true <- tibble::tibble(
    time = as.POSIXct(c("2021-11-14 11:00:00", "2021-11-14 12:00:00")),
    participant_id = "12345",
    item_one = c(40, 50),
    data = list(
      tibble::tibble(
        time = as.POSIXct(c("2021-11-14 10:30:00", "2021-11-14 10:40:00", "2021-11-14 11:00:00")),
        x = c(2, 3, 4),
        original_time = as.POSIXct(c(NA, NA, NA))
      ),
      tibble::tibble(
        time = as.POSIXct(c(
          "2021-11-14 11:30:00", "2021-11-14 11:30:01", "2021-11-14 11:40:00",
          "2021-11-14 12:00:00"
        )),
        x = c(5, 6, 7, 8),
        original_time = as.POSIXct(c("2021-11-14 11:10:00", NA, NA, "2021-11-14 12:10:00"))
      )
    )
  )

  res <- link(
    x = dat1,
    y = dat2,
    by = "participant_id",
    time = time,
    y_time = time,
    offset_before = 1800L,
    add_before = TRUE,
    add_after = TRUE
  )

  expect_equal(true, res)

  # Check what should be done when any of the times is missing
  # Preferably, we just want to create a 0-row tibble with `proto`, as still adding before or after
  # is kind of strange. Additionally, the column names should be consistent with the rest of the
  # output.
  dat1 <- dat1 |>
    mutate(end_time = lead(time))

  res <- link(
    x = dat1,
    y = dat2,
    by = "participant_id",
    time = time,
    end_time = end_time,
    y_time = time,
    add_before = TRUE,
    add_after = TRUE
  )

  true <- tibble::tibble(
    time = as.POSIXct(c("2021-11-14 11:00:00", "2021-11-14 12:00:00")),
    participant_id = "12345",
    item_one = c(40, 50),
    end_time = as.POSIXct(c("2021-11-14 12:00:00", NA)),
    data = list(
      tibble::tibble(
        time = as.POSIXct(c(
          "2021-11-14 11:00:00", "2021-11-14 11:10:00", "2021-11-14 11:30:00",
          "2021-11-14 11:40:00", "2021-11-14 12:00:00"
        )),
        x = c(4, 5, 6, 7, 8),
        original_time = as.POSIXct(c(NA, NA, NA, NA, "2021-11-14 12:10:00"))
      ),
      tibble::tibble(
        time = as.POSIXct(double(0)),
        x = integer(0),
        original_time = as.POSIXct(double(0))
      )
    )
  )
  expect_equal(res, true)
})

## link_db ===============
test_that("link_db", {
  path <- system.file("testdata", package = "mpathsenser")
  db <- open_db(path, "test.db")
  dat1 <- data.frame(
    time = as.POSIXct(c("2021-11-14 13:00:00", "2021-11-14 14:00:00", "2021-11-14 15:00:00"),
      tz = "UTC"
    ),
    participant_id = "12345",
    item_one = c(40, 50, 60)
  )

  # Check deprecation
  lifecycle::expect_deprecated(link_db(db, "Activity", "Connectivity", offset_after = 1800L))

  # Disable deprecation check
  rlang::local_options(lifecycle_verbosity = "quiet")

  # Check basic functionality
  res <- link_db(db, "Activity", "Connectivity", offset_after = 1800)
  true <- tibble::tibble(
    measurement_id = c(
      "fbf85cd7-6d37-53a8-5c44-ad8fe13ef7ac",
      "ef96364c-d1f4-5f73-ce40-277f078e3d0f",
      "5ba54e77-4bcf-c8d1-17ff-71b9ed908897"
    ),
    participant_id = "12345",
    time = as.POSIXct(c("2021-11-14 13:59:59", "2021-11-14 14:00:00", "2021-11-14 14:00:01"),
      tz = "UTC"
    ),
    confidence = c(NA, 100L, 99L),
    type = c(NA, "WALKING", "STILL"),
    data = list(
      tibble::tibble(
        measurement_id = c(
          "27a5777a-ec41-80de-afa4-d2e7f6b02fcf",
          "2d430c2a-5b16-1dce-0e2f-c049c44e3729"
        ),
        time = as.POSIXct(c("2021-11-14 14:00:00", "2021-11-14 14:01:00"), tz = "UTC"),
        connectivity_status = c("wifi", NA)
      ),
      tibble::tibble(
        measurement_id = c(
          "27a5777a-ec41-80de-afa4-d2e7f6b02fcf",
          "2d430c2a-5b16-1dce-0e2f-c049c44e3729"
        ),
        time = as.POSIXct(c("2021-11-14 14:00:00", "2021-11-14 14:01:00"), tz = "UTC"),
        connectivity_status = c("wifi", NA)
      ),
      tibble::tibble(
        measurement_id = "2d430c2a-5b16-1dce-0e2f-c049c44e3729",
        time = as.POSIXct("2021-11-14 14:01:00", tz = "UTC"),
        connectivity_status = NA_character_
      )
    )
  )
  expect_equal(res, true)

  # Check reverse
  res <- link_db(db, "Activity", "Connectivity", offset_after = 1800, reverse = TRUE)
  true <- tibble::tibble(
    measurement_id = c(
      "27a5777a-ec41-80de-afa4-d2e7f6b02fcf",
      "2d430c2a-5b16-1dce-0e2f-c049c44e3729"
    ),
    participant_id = "12345",
    time = as.POSIXct(c("2021-11-14 14:00:00", "2021-11-14 14:01:00"), tz = "UTC"),
    connectivity_status = c("wifi", NA),
    data = list(
      tibble::tibble(
        measurement_id = c(
          "ef96364c-d1f4-5f73-ce40-277f078e3d0f",
          "5ba54e77-4bcf-c8d1-17ff-71b9ed908897"
        ),
        time = as.POSIXct(c("2021-11-14 14:00:00", "2021-11-14 14:00:01"), tz = "UTC"),
        confidence = c(100L, 99L),
        type = c("WALKING", "STILL"),
      ),
      tibble::tibble(
        measurement_id = character(0),
        time = structure(numeric(0), tzone = "UTC", class = c("POSIXct", "POSIXt")),
        confidence = integer(0),
        type = character(0)
      )
    )
  )
  expect_equal(res, true)

  # Check with external data
  res <- link_db(db, "Activity", external = dat1, offset_after = 1800)
  true <- tibble::tibble(
    dat1,
    data = list(
      tibble::tibble(
        measurement_id = character(0),
        time = structure(numeric(0), tzone = "UTC", class = c("POSIXct", "POSIXt")),
        confidence = integer(0L),
        type = character(0)
      ),
      tibble::tibble(
        measurement_id = c(
          "ef96364c-d1f4-5f73-ce40-277f078e3d0f",
          "5ba54e77-4bcf-c8d1-17ff-71b9ed908897"
        ),
        time = as.POSIXct(c("2021-11-14 14:00:00", "2021-11-14 14:00:01"), tz = "UTC"),
        confidence = c(100L, 99L),
        type = c("WALKING", "STILL")
      ),
      tibble::tibble(
        measurement_id = character(0),
        time = structure(numeric(0), tzone = "UTC", class = c("POSIXct", "POSIXt")),
        confidence = integer(0),
        type = character(0)
      )
    )
  )
  expect_equal(res, true)

  # Argument checks
  expect_error(
    link_db(db, "Activity", "Bluetooth", offset_before = 1800, external = dat1),
    "Either a second sensor or an external data frame must be supplied."
  )
  expect_error(
    link_db(db, "Activity", offset_before = 1800),
    "Either a second sensor or an external data frame must be supplied."
  )

  # Check time zone differences
  dat1$time <- .POSIXct(dat1$time, tz = "Europe/Brussels")
  expect_warning(
    link_db(db, "Activity", external = dat1, offset_after = 1800),
    "`external` is not using UTC as a time zone, unlike the data in the database."
  )
  dat1$time <- .POSIXct(dat1$time, tz = "UTC")
  dbDisconnect(db)

  # Check if ignore_large works
  filename <- tempfile("big", fileext = ".db")
  db <- create_db(NULL, filename)

  # Populate database
  add_study(db, study_id = "test-study", data_format = "CARP")
  add_participant(db, participant_id = "12345", study_id = "test-study")

  sens_value <- seq.int(0, 10, length.out = 50001)
  time_value <- seq.POSIXt(as.POSIXct("2021-11-14 14:00:00.000", format = "%F %H:%M:%OS"),
    by = "sec",
    length.out = 50001
  )
  acc <- data.frame(
    measurement_id = paste0("id_", 1:50001),
    participant_id = "12345",
    date = "2021-11-14",
    time = strftime(time_value, format = "%H:%M:%OS3"),
    x = sens_value,
    y = sens_value,
    z = sens_value
  )

  DBI::dbWriteTable(db, "Accelerometer", acc, overwrite = TRUE)
  DBI::dbWriteTable(db, "Gyroscope", acc, overwrite = TRUE)

  expect_error(
    link_db(db, "Accelerometer", "Gyroscope", offset_after = 30),
    "the total number of rows is higher than 100000. Use ignore_large = TRUE to continue"
  )
  expect_error(
    link_db(db, "Accelerometer", "Gyroscope", offset_after = 30, ignore_large = TRUE),
    "`x` and `y` are identical"
  )

  # Cleanup
  dbDisconnect(db)
  file.remove(filename)
})

## link_gaps =================
test_that("link_gaps", {
  dat1 <- data.frame(
    participant_id = c(rep("12345", 6), rep("23456", 6)),
    time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 6), 2),
    item_one = rep(seq.int(10, by = 10, length.out = 6), 2)
  )

  # Test with two participants to ensure link takes different groups into account
  # Test both before and after each beep
  # 1. the gap falls completely inside the beep interval
  # 2. the start of the gap falls inside the beep interval, but the end does not
  # 3. the start of the gap falls outside of the beep interval, but the end of the gap falls inside
  # 4. the gap spans over the entire interval
  # 5. the gap occurs entirely before the interval
  # 6. the gap occurs entirely after the interval
  dat2 <- data.frame(
    participant_id = c(rep("12345", 46), rep("23456", 46)),
    from = rep(c(
      seq.POSIXt(as.POSIXct("2021-11-14 12:40:00"), by = "1 min", length.out = 20), # 1 before
      seq.POSIXt(as.POSIXct("2021-11-14 13:10:00"), by = "1 min", length.out = 20), # 1 after
      as.POSIXct(c(
        "2021-11-14 13:55:00", # 2 before, 3 after
        "2021-11-14 14:25:00", # 3 before, 2 after
        "2021-11-14 15:25:00", # 2 after
        "2021-11-14 15:30:00", # 4 before, after
        "2021-11-14 12:15:00", # 5 before, after
        "2021-11-14 18:35:00" # 6 before, after
      ))
    ), 2),
    to = rep(c(
      seq.POSIXt(as.POSIXct("2021-11-14 12:41:00"), by = "1 min", length.out = 20), # 1 before
      seq.POSIXt(as.POSIXct("2021-11-14 13:11:00"), by = "1 min", length.out = 20), # 1 after
      as.POSIXct(c(
        "2021-11-14 14:05:00", # 2 before, 3 after
        "2021-11-14 14:40:00", # 3 before
        "2021-11-14 15:30:00", # 2 after
        "2021-11-14 16:30:00", # 4 before, after
        "2021-11-14 12:25:00", # 5 before, after
        "2021-11-14 18:40:00" # 6 before, after
      ))
    ), 2)
  )

  # Test difference types of input for offset_before
  # Integer vs double
  expect_equal(
    link_gaps(dat1, dat2, by = "participant_id", offset_before = 1800L),
    link_gaps(dat1, dat2, by = "participant_id", offset_before = 1800)
  )

  expect_equal(
    link_gaps(dat1, dat2, by = "participant_id", offset_before = 1800L),
    link_gaps(dat1, dat2, by = "participant_id", offset_before = lubridate::minutes(30))
  )

  expect_equal(
    link_gaps(dat1, dat2, by = "participant_id", offset_before = 1800L),
    link_gaps(dat1, dat2, by = "participant_id", offset_before = "30 minutes")
  )

  expect_equal(
    link_gaps(dat1, dat2, by = "participant_id", offset_before = 1800L),
    link_gaps(dat1, dat2, by = "participant_id", offset_before = "1800 seconds")
  )

  # Offset after
  expect_equal(
    link_gaps(dat1, dat2, by = "participant_id", offset_after = 1800L),
    link_gaps(dat1, dat2, by = "participant_id", offset_after = 1800)
  )

  expect_equal(
    link_gaps(dat1, dat2, by = "participant_id", offset_after = 1800L),
    link_gaps(dat1, dat2, by = "participant_id", offset_after = lubridate::minutes(30))
  )

  expect_equal(
    link_gaps(dat1, dat2, by = "participant_id", offset_after = 1800L),
    link_gaps(dat1, dat2, by = "participant_id", offset_after = "30 minutes")
  )

  expect_equal(
    link_gaps(dat1, dat2, by = "participant_id", offset_after = 1800L),
    link_gaps(dat1, dat2, by = "participant_id", offset_after = "1800 seconds")
  )

  # Offset_before, raw_data = TRUE
  res_raw <- link_gaps(
    data = dat1,
    gaps = dat2,
    by = "participant_id",
    offset_before = 1800L,
    offset_after = 0L,
    raw_data = TRUE
  )
  true <- tibble::tibble(
    participant_id = c(rep("12345", 6), rep("23456", 6)),
    time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 6), 2),
    item_one = rep(seq.int(10, by = 10, length.out = 6), 2),
    gap_data = rep(list(
      tibble::tibble(
        from = seq.POSIXt(as.POSIXct("2021-11-14 12:40:00"), by = "1 min", length.out = 20),
        to = seq.POSIXt(as.POSIXct("2021-11-14 12:41:00"), by = "1 min", length.out = 20),
        gap = rep(60, 20)
      ), # 1
      tibble::tibble(
        from = as.POSIXct("2021-11-14 13:55:00"),
        to = as.POSIXct("2021-11-14 14:00:00"),
        gap = 300
      ), # 3
      tibble::tibble(
        from = as.POSIXct("2021-11-14 14:30:00"),
        to = as.POSIXct("2021-11-14 14:40:00"),
        gap = 600
      ), # 3
      tibble::tibble(
        from = as.POSIXct("2021-11-14 15:30:00"),
        to = as.POSIXct("2021-11-14 16:00:00"),
        gap = 1800
      ), # 4
      tibble::tibble(
        from = as.POSIXct(double(0), tz = ""),
        to = as.POSIXct(double(0), tz = ""),
        gap = integer(0)
      ), # 5
      tibble::tibble(
        from = as.POSIXct(double(0), tz = ""),
        to = as.POSIXct(double(0), tz = ""),
        gap = integer(0)
      ) # 6
    ), 2),
    gap = rep(c(1200, 300, 600, 1800, 0, 0), 2)
  )
  expect_equal(res_raw, true)

  # Check accidental double link
  expect_error(
    link_gaps(
      data = res_raw,
      gaps = dat2,
      by = "participant_id",
      offset_before = 1800L,
      raw_data = TRUE
    ),
    "column 'gap' should not already be present in data"
  )

  expect_error(
    link_gaps(
      data = res_raw %>% dplyr::select(-gap),
      gaps = dat2,
      by = "participant_id",
      offset_before = 1800L,
      raw_data = TRUE
    ),
    "column 'gap_data' should not already be present in data"
  )

  expect_error(
    link_gaps(
      data = res_raw %>% dplyr::select(-gap),
      gaps = dat2,
      by = "participant_id",
      offset_before = 1800L,
      raw_data = FALSE
    ),
    NA
  )

  # Scrambled test
  scramble <- function(data) {
    idx <- sample(seq_along(data[, 1]), nrow(data))
    data[idx, ]
  }
  res <- link_gaps(
    data = scramble(dat1),
    gaps = scramble(dat2),
    by = "participant_id",
    offset_before = 1800,
    raw_data = TRUE
  ) %>%
    arrange(participant_id, time)
  expect_equal(res_raw, true)

  # offset_before, raw_data = FALSE
  res <- link_gaps(
    data = dat1,
    gaps = dat2,
    by = "participant_id",
    offset_before = 1800L,
    offset_after = 0L
  )
  true <- tibble::tibble(
    participant_id = c(rep("12345", 6), rep("23456", 6)),
    time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 6), 2),
    item_one = rep(seq.int(10, by = 10, length.out = 6), 2),
    gap = rep(c(1200, 300, 600, 1800, 0, 0), 2)
  )
  expect_equal(res, true)

  # Test whether results from raw_data = FALSE and TRUE are the same
  expect_equal(
    res_raw %>%
      mutate(gap = purrr::map_dbl(gap_data, ~ sum(.x$gap))) %>%
      dplyr::select(-gap_data),
    res
  )

  # Offset_after, raw_data = TRUE
  res_raw <- link_gaps(
    data = dat1,
    gaps = dat2,
    by = "participant_id",
    offset_before = 0L,
    offset_after = 1800L,
    raw_data = TRUE
  )
  true <- tibble::tibble(
    participant_id = c(rep("12345", 6), rep("23456", 6)),
    time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 6), 2),
    item_one = rep(seq.int(10, by = 10, length.out = 6), 2),
    gap_data = rep(list(
      tibble::tibble(
        from = seq.POSIXt(as.POSIXct("2021-11-14 13:10:00"), by = "1 min", length.out = 20),
        to = seq.POSIXt(as.POSIXct("2021-11-14 13:11:00"), by = "1 min", length.out = 20),
        gap = rep(60, 20)
      ), # 1
      tibble::tibble(
        from = c(as.POSIXct("2021-11-14 14:00:00"), as.POSIXct("2021-11-14 14:25:00")),
        to = c(as.POSIXct("2021-11-14 14:05:00"), as.POSIXct("2021-11-14 14:30:00")),
        gap = 300
      ), # 2
      tibble::tibble(
        from = as.POSIXct("2021-11-14 15:25:00"),
        to = as.POSIXct("2021-11-14 15:30:00"),
        gap = 300
      ), # 2
      tibble::tibble(
        from = as.POSIXct("2021-11-14 16:00:00"),
        to = as.POSIXct("2021-11-14 16:30:00"),
        gap = 1800
      ), # 4
      tibble::tibble(
        from = as.POSIXct(double(0), tz = ""),
        to = as.POSIXct(double(0), tz = ""),
        gap = integer(0)
      ), # 5
      tibble::tibble(
        from = as.POSIXct(double(0), tz = ""),
        to = as.POSIXct(double(0), tz = ""),
        gap = integer(0)
      ) # 6
    ), 2),
    gap = rep(c(1200, 600, 300, 1800, 0, 0), 2)
  )
  expect_equal(res_raw, true)

  # offset_after, raw_data = FALSE
  res <- link_gaps(
    data = dat1,
    gaps = dat2,
    by = "participant_id",
    offset_before = 0L,
    offset_after = 1800L
  )
  true <- tibble::tibble(
    participant_id = c(rep("12345", 6), rep("23456", 6)),
    time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 6), 2),
    item_one = rep(seq.int(10, by = 10, length.out = 6), 2),
    gap = rep(c(1200, 600, 300, 1800, 0, 0), 2)
  )
  expect_equal(res, true)

  # Test whether results from raw_data = FALSE and TRUE are the same
  expect_equal(
    res_raw %>%
      mutate(gap = purrr::map_dbl(gap_data, ~ sum(.x$gap))) %>%
      dplyr::select(-gap_data),
    res
  )

  # Offset both
  res_raw <- link_gaps(
    data = dat1,
    gaps = dat2,
    by = "participant_id",
    offset_before = 1800L,
    offset_after = 1800L,
    raw_data = TRUE
  )
  true <- tibble::tibble(
    participant_id = c(rep("12345", 6), rep("23456", 6)),
    time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 6), 2),
    item_one = rep(seq.int(10, by = 10, length.out = 6), 2),
    gap_data = rep(list(
      tibble::tibble(
        from = c(
          seq.POSIXt(as.POSIXct("2021-11-14 12:40:00"), by = "1 min", length.out = 20),
          seq.POSIXt(as.POSIXct("2021-11-14 13:10:00"), by = "1 min", length.out = 20)
        ),
        to = c(
          seq.POSIXt(as.POSIXct("2021-11-14 12:41:00"), by = "1 min", length.out = 20),
          seq.POSIXt(as.POSIXct("2021-11-14 13:11:00"), by = "1 min", length.out = 20)
        ),
        gap = rep(60, 40)
      ), # 1
      tibble::tibble(
        from = c(as.POSIXct("2021-11-14 13:55:00"), as.POSIXct("2021-11-14 14:25:00")),
        to = c(as.POSIXct("2021-11-14 14:05:00"), as.POSIXct("2021-11-14 14:30:00")),
        gap = c(600, 300)
      ), # 2
      tibble::tibble(
        from = c(as.POSIXct("2021-11-14 14:30:00"), as.POSIXct("2021-11-14 15:25:00")),
        to = c(as.POSIXct("2021-11-14 14:40:00"), as.POSIXct("2021-11-14 15:30:00")),
        gap = c(600, 300)
      ), # 2
      tibble::tibble(
        from = as.POSIXct("2021-11-14 15:30:00"),
        to = as.POSIXct("2021-11-14 16:30:00"),
        gap = 3600
      ), # 4
      tibble::tibble(
        from = as.POSIXct(double(0), tz = ""),
        to = as.POSIXct(double(0), tz = ""),
        gap = integer(0)
      ), # 5
      tibble::tibble(
        from = as.POSIXct(double(0), tz = ""),
        to = as.POSIXct(double(0), tz = ""),
        gap = integer(0)
      ) # 6
    ), 2),
    gap = rep(c(2400, 900, 900, 3600, 0, 0), 2)
  )
  expect_equal(res_raw, true)

  # offset_both
  res <- link_gaps(
    data = dat1,
    gaps = dat2,
    by = "participant_id",
    offset_before = 1800L,
    offset_after = 1800L
  )
  true <- tibble::tibble(
    participant_id = c(rep("12345", 6), rep("23456", 6)),
    time = rep(seq.POSIXt(as.POSIXct("2021-11-14 13:00:00"), by = "1 hour", length.out = 6), 2),
    item_one = rep(seq.int(10, by = 10, length.out = 6), 2),
    gap = rep(c(2400, 900, 900, 3600, 0, 0), 2)
  )
  expect_equal(res, true)

  # Test whether results from raw_data = FALSE and TRUE are the same
  expect_equal(
    res_raw %>%
      mutate(gap = purrr::map_dbl(gap_data, ~ sum(.x$gap))) %>%
      dplyr::select(-gap_data),
    res
  )

  # Argument checks
  expect_error(
    link_gaps(
      data = dat1[, -2],
      gaps = dat2,
      by = "participant_id",
      offset_after = 1800
    ),
    "Column `time` must be present in `data`"
  )
  expect_error(
    link_gaps(
      data = dat1,
      gaps = dat2[, -2],
      by = "participant_id",
      offset_after = 1800
    ),
    "Column `from` and `to` must be present in `gaps`."
  )
  expect_error(
    link_gaps(
      data = mutate(dat1, time = as.character(time)),
      gaps = dat2,
      by = "participant_id",
      offset_after = 1800
    ),
    "Column `time` in `data` must be a POSIXct."
  )
})

## bin_data =================
test_that("bin_data", {
  data <- tibble::tibble(
    participant_id = 1,
    datetime = c(
      "2022-06-21 15:00:00", "2022-06-21 15:55:00",
      "2022-06-21 17:05:00", "2022-06-21 17:10:00"
    ),
    confidence = 100,
    type = "WALKING"
  )

  # get bins per hour, even if the interval is longer than one hour
  res <- data %>%
    mutate(datetime = as.POSIXct(datetime)) %>%
    mutate(lead = lead(datetime)) %>%
    bin_data(
      start_time = datetime,
      end_time = lead,
      by = "hour"
    )

  true <- tibble::tibble(
    bin = as.POSIXct(c("2022-06-21 15:00:00", "2022-06-21 16:00:00", "2022-06-21 17:00:00")),
    bin_data = list(
      tibble::tibble(
        participant_id = 1,
        datetime = as.POSIXct(c("2022-06-21 15:00:00", "2022-06-21 15:55:00")),
        confidence = 100,
        type = "WALKING",
        lead = as.POSIXct(c("2022-06-21 15:55:00", "2022-06-21 16:00:00"))
      ),
      tibble::tibble(
        participant_id = 1,
        datetime = as.POSIXct(c("2022-06-21 16:00:00")),
        confidence = 100,
        type = "WALKING",
        lead = as.POSIXct("2022-06-21 17:00:00")
      ),
      tibble::tibble(
        participant_id = 1,
        datetime = as.POSIXct(c(
          "2022-06-21 17:00:00", "2022-06-21 17:05:00",
          "2022-06-21 17:10:00"
        )),
        confidence = 100,
        type = "WALKING",
        lead = as.POSIXct(c("2022-06-21 17:05:00", "2022-06-21 17:10:00", NA))
      )
    )
  )
  expect_equal(res, true)

  # Alternatively, you can give an integer value to by to create custom-sized
  # bins, but only if fixed = FALSE. Not that these bins are not rounded to,
  # as in this example 30 minutes, but rather depends on the earliest time
  # in the group.
  res <- data %>%
    mutate(datetime = as.POSIXct(datetime)) %>%
    mutate(lead = lead(datetime)) %>%
    bin_data(
      start_time = datetime,
      end_time = lead,
      by = 1800L,
      fixed = FALSE
    )
  true <- tibble::tibble(
    bin = as.POSIXct(c(
      "2022-06-21 15:00:00", "2022-06-21 15:30:00", "2022-06-21 16:00:00",
      "2022-06-21 16:30:00", "2022-06-21 17:00:00"
    )),
    bin_data = list(
      tibble::tibble(
        participant_id = 1,
        datetime = as.POSIXct(c("2022-06-21 15:00:00")),
        confidence = 100,
        type = "WALKING",
        lead = as.POSIXct(c("2022-06-21 15:30:00"))
      ),
      tibble::tibble(
        participant_id = 1,
        datetime = as.POSIXct(c("2022-06-21 15:30:00", "2022-06-21 15:55:00")),
        confidence = 100,
        type = "WALKING",
        lead = as.POSIXct(c("2022-06-21 15:55:00", "2022-06-21 16:00:00"))
      ),
      tibble::tibble(
        participant_id = 1,
        datetime = as.POSIXct(c("2022-06-21 16:00:00")),
        confidence = 100,
        type = "WALKING",
        lead = as.POSIXct(c("2022-06-21 16:30:00"))
      ),
      tibble::tibble(
        participant_id = 1,
        datetime = as.POSIXct(c("2022-06-21 16:30:00")),
        confidence = 100,
        type = "WALKING",
        lead = as.POSIXct(c("2022-06-21 17:00:00"))
      ),
      tibble::tibble(
        participant_id = 1,
        datetime = as.POSIXct(c(
          "2022-06-21 17:00:00", "2022-06-21 17:05:00",
          "2022-06-21 17:10:00"
        )),
        confidence = 100,
        type = "WALKING",
        lead = as.POSIXct(c("2022-06-21 17:05:00", "2022-06-21 17:10:00", NA))
      )
    )
  )
  expect_equal(res, true)

  # More complicated data for showcasing grouping:
  data <- tibble::tibble(
    participant_id = c(rep(1, 4), rep(2, 4)),
    datetime = rep(c(
      "2022-06-21 15:00:00", "2022-06-21 15:55:00",
      "2022-06-21 17:05:00", "2022-06-21 17:10:00"
    ), 2),
    confidence = 100,
    type = rep(c("STILL", "WALKING", "STILL", "WALKING"), 2)
  )

  # binned_intervals also takes into account the prior grouping structure
  res <- data %>%
    mutate(datetime = as.POSIXct(datetime)) %>%
    group_by(participant_id) %>%
    mutate(lead = lead(datetime)) %>%
    group_by(participant_id, type) %>%
    bin_data(
      start_time = datetime,
      end_time = lead,
      by = "hour"
    )
  true <- tibble::tibble(
    participant_id = c(rep(1, 6), rep(2, 6)),
    type = rep(c("STILL", "STILL", "STILL", "WALKING", "WALKING", "WALKING"), 2),
    bin = rep(as.POSIXct(c(
      "2022-06-21 15:00:00", "2022-06-21 16:00:00",
      "2022-06-21 17:00:00"
    )), 4),
    bin_data = rep(list(
      # STILL
      tibble::tibble(
        datetime = as.POSIXct(c("2022-06-21 15:00:00")),
        confidence = 100,
        lead = as.POSIXct(c("2022-06-21 15:55:00"))
      ),
      tibble::tibble(
        datetime = as.POSIXct(double(0)),
        confidence = double(0),
        lead = as.POSIXct(double(0))
      ),
      tibble::tibble(
        datetime = as.POSIXct(c("2022-06-21 17:05:00")),
        confidence = 100,
        lead = as.POSIXct(c("2022-06-21 17:10:00"))
      ),
      # WALKING
      tibble::tibble(
        datetime = as.POSIXct(c("2022-06-21 15:55:00")),
        confidence = 100,
        lead = as.POSIXct(c("2022-06-21 16:00:00"))
      ),
      tibble::tibble(
        datetime = as.POSIXct(c("2022-06-21 16:00:00")),
        confidence = 100,
        lead = as.POSIXct(c("2022-06-21 17:00:00"))
      ),
      tibble::tibble(
        datetime = as.POSIXct(c("2022-06-21 17:00:00", "2022-06-21 17:10:00")),
        confidence = 100,
        lead = as.POSIXct(c("2022-06-21 17:05:00", NA))
      )
    ), 2)
  ) %>%
    group_by(participant_id, type)
  expect_equal(res, true)

  # To get the duration for each bin (note to change the variable names in sum):
  duration <- purrr::map_dbl(
    .x = res$bin_data,
    .f = ~ sum(as.double(.x$lead) - as.double(.x$datetime),
      na.rm = TRUE
    ) / 60
  )

  # Or:
  duration2 <- res %>%
    unnest(bin_data, keep_empty = TRUE) %>%
    mutate(duration = .data$lead - .data$datetime) %>%
    group_by(bin, .add = TRUE) %>%
    summarise(duration = sum(.data$duration, na.rm = TRUE), .groups = "drop")

  true <- c(55, 0, 5, 5, 60, 5, 55, 0, 5, 5, 60, 5)
  expect_equal(duration, as.double(duration2$duration))
  expect_equal(duration, true)
  expect_equal(as.double(duration2$duration), true)

  # Argument checks
  data <- data %>%
    mutate(datetime = as.POSIXct(datetime)) %>%
    mutate(lead = lead(datetime))

  expect_error(
    bin_data(
      data = data,
      start_time = datetime,
      end_time = lead,
      by = TRUE
    ),
    "`by` must be one of 'sec', 'min', 'hour', or 'day', or a numeric value if `fixed = FALSE`."
  )
  expect_error(
    data %>%
      mutate(datetime = as.character(datetime)) %>%
      bin_data(
        start_time = datetime,
        end_time = lead,
        by = "hour"
      ),
    NA
  )

  # Test bug #8: bin_data() incorrectly rounded off days after DST change
  data <- tibble::tibble(
    participant_id = 1,
    datetime = as.POSIXct(
      c(
        "2022-10-30 15:00:00", "2022-10-30 15:55:00",
        "2022-10-31 17:05:00", "2022-10-31 17:10:00"
      ),
      tz = "Europe/Brussels"
    ),
    confidence = 100,
    type = "WALKING"
  )

  res <- data %>%
    mutate(datetime = as.POSIXct(datetime)) %>%
    mutate(lead = lead(datetime)) %>%
    bin_data(
      start_time = datetime,
      end_time = lead,
      by = "day"
    )

  expect_equal(lubridate::hour(res$bin), c(0, 0))
})

Try the mpathsenser package in your browser

Any scripts or data that you put into this service are public.

mpathsenser documentation built on May 29, 2024, 9:11 a.m.