Nothing
#' Session Length Check
#'
#' This function takes a dataframe as input as well as the minimum number of webpages in your web experiment or study,
#' that a participant should have seen in order to be considered a complete (or plausible) participation.
#' It flags the most extreme 10% of complete participations in the data as very slow or very fast.
#'
#' @param dataframe A dataframe containing WEXTOR data (needs to contain variables `page_trail` for the trail of
#' webpages in the study and `session_length` for the overall time each participant spent on the study)
#' @param min_pages Numeric. The minimum number of pages a participant must have
#' visited in the study for their participation to be considered plausible. Defaults to `6`.
#'
#' @returns A dataframe with added variables `sess_length_clean` for only completed session lengths, otherwise NA
#' and `check_sess_length` for a flag of the most extreme 10% of time needed to finish participation
#' @export
#' @importFrom stats quantile
#'
#' @examples
#'
#' data <- read_WEXTOR(path_to_file("BiFiX_data_raw.csv"))
#' new_data <- sess_length_check(data)
#'
sess_length_check <- function(dataframe, min_pages = 6){
if(any(grepl(".wx.", names(dataframe)))){
new <- removepref(dataframe)
} else {
new <- dataframe
}
new$true_trail <- strsplit(new$page_trail, " - ") |> lengths()
new$check_page <- ifelse(new$true_trail >= min_pages, "ok", "incomplete")
new$sess_length_clean <- ifelse(new$check_page == "incomplete", NA, new$session_length)
new$check_sess_length <- ifelse(new$sess_length_clean < quantile(new$sess_length_clean, 0.05, na.rm = T),
"very fast",
ifelse(new$sess_length_clean > quantile(new$sess_length_clean, 0.95, na.rm = T),
"very slow",
"ok")
)
return(new)
# }
# # old version
# new2 <- dataframe |>
# dplyr::mutate(
# sess_length_clean = ifelse(check_page == "incomplete", NA, session_length)
# )
#
# new2$check_sess_length <- ifelse(between(new2$sess_length_clean,
# quantile(new2$sess_length_clean, 0.05, na.rm = T),
# quantile(new2$sess_length_clean, 0.95, na.rm = T)),
# "ok", "very slow or fast")
#
# return(new2)
}
#' Check for duplicate IP addresses
#'
#' `ip_check()` identifies potential duplicate participations in a dataset
#' by checking for repeated IP addresses. Cases with duplicated IPs are
#' flagged as `"possible duplicate"`, while unique IPs are labeled `"ok"`.
#' Keep in mind that in some cases, duplicate IP addresses should not lead to exclusion,
#' for example when different participants (such as siblings or flatmates) plausibly
#' use the same device.
#'
#' If WEXTOR-style prefixes (i.e. `.wx.`) are detected in the variable names,
#' they are removed prior to performing the check.
#'
#' @param dataframe A data frame containing the study data. Must include a
#' variable named `ip` representing participants' IP addresses.
#'
#' @returns A data frame with an additional column `check_ip`, indicating
#' whether each case has a unique IP (`"ok"`) or is a potential duplicate
#' (`"possible duplicate"`).
#'
#' @details The function uses `duplicated()` to flag repeated IP addresses.
#' Only subsequent occurrences are marked as duplicates; the first instance
#' of each IP is treated as valid.
#' @export
#'
#' @examples
#' data <- read_WEXTOR(path_to_file("BiFiX_data_raw.csv"))
#' # The example data does not contain real IPs (data protection), so we will use simulate ones
#' data$ip <- sample(1:1000, nrow(data), replace = TRUE)
#'
#' new_data <- ip_check(data)
#'
ip_check <- function(dataframe){
if(any(grepl(".wx.", names(dataframe)))){
new3 <- removepref(dataframe)
} else {
new3 <- dataframe
}
new3$check_ip <- ifelse(duplicated(new3$ip), "possible duplicate", "ok")
return(new3)
}
#' Plausibility check on WEXTOR study data
#'
#' `plausicheck()` performs basic plausibility checks on a study dataset to
#' identify potentially invalid or suspicious participation. The function can
#' check whether participants visited a minimum number of pages, whether the
#' recorded session length appears plausible, and whether IP addresses indicate
#' duplicate participation.
#'
#' If WEXTOR prefixes are detected in the variable names, they are removed before
#' the plausibility checks are applied.
#'
#' @param dataframe A data frame containing the study data (needs to contain variables `page_trail` for the trail of
#' webpages in the study & `session_length` for the overall time each participant spent on the study (if
#' session length is to be checked) and `ip` for participants' IP addresses (if IP is to be checked)).
#' @param min_pages Numeric. The minimum number of pages a participant must have
#' visited in the study for their participation to be considered plausible.
#' @param check_sess_length Logical. Should the session length plausibility check
#' be performed? Defaults to `TRUE`.
#' @param check_ip Logical. Should the IP address plausibility check be performed?
#' Defaults to `TRUE`.
#'
#' @returns A data frame with additional plausibility check variables. The final
#' variable `check_plausibility` indicates whether all selected checks were passed
#' (`"all ok"`) or whether the case should be excluded (`"exclude"`). Keep in mind
#' that researchers are advised to make sure that the `"exclude"`-cases were correctly
#' identified and are indeed of poorer data quality to avoi unnecessary data loss.
#' @export
#'
#' @examples
#'
#' data <- read_WEXTOR(path_to_file("BiFiX_data_raw.csv"))
#' # The example data does not contain real IPs (data protection), so we will use simulate ones
#' data$ip <- sample(1:1000, nrow(data), replace = TRUE)
#'
#' plausi_data <- plausicheck(dataframe = data,
#' min_pages = 6,
#' check_sess_length = TRUE,
#' check_ip = TRUE)
#'
plausicheck <- function(dataframe,
min_pages,
check_sess_length = TRUE,
check_ip = TRUE){
# If no Plausicheck is selected, exit and select at least one
if(!any(c(check_sess_length, check_ip))){
stop("Please select at least one plausibility check to be performed!
You can choose `sess_length_check` and `ip_check`.")
} else{
if(any(grepl(".wx.", names(dataframe)))){
plausi <- removepref(dataframe)
} else {
plausi <- dataframe
}
if(is.null(min_pages)){
warning("Please define the minimum number of pages that a participant of your study should have visited for a plausible participation, the default is 6.")
} else if(check_sess_length){
plausi <- sess_length_check(plausi, min_pages)
}
if(check_ip){
plausi <- ip_check(plausi)
}
plausi$check_plausibility <- ifelse(check_ok(plausi, "check_page") &
check_ok(plausi, "check_sess_length") &
check_ok(plausi, "check_ip"),
"all ok",
"exclude"
)
return(plausi)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.