R/artifact_detection.R

Defines functions plot_artifacts predict_multiclass_classifier choose_between_classes predict_binary_classifier get_kernel compute_features2 add_chunk_group compute_wavelet_coefficients compute_wavelet_decomposition max_per_n compute_amplitude_features compute_derivative_features get_second_derivative get_derivative sd median sum mean min max

Documented in add_chunk_group choose_between_classes compute_amplitude_features compute_derivative_features compute_features2 compute_wavelet_coefficients compute_wavelet_decomposition get_derivative get_kernel get_second_derivative max_per_n plot_artifacts predict_binary_classifier predict_multiclass_classifier

# Package-local shadows of common summary functions: every unqualified call
# to max()/min()/mean()/sum()/median()/sd() in this file ignores NA values.
# suppressWarnings() on max()/min() hides the "no non-missing arguments"
# warning raised when the input is empty or all-NA (result is then -Inf/Inf).
# NOTE(review): never pass na.rm explicitly to these wrappers — it would be
# forwarded through `...` and clash with the hard-coded na.rm = TRUE.
max <- function(...) suppressWarnings(base::max(..., na.rm = TRUE))
min <- function(...) suppressWarnings(base::min(..., na.rm = TRUE))
mean <- function(...) base::mean(..., na.rm = TRUE)
sum <- function(...) base::sum(..., na.rm = TRUE)
median <- function(...) stats::median(..., na.rm = TRUE)
sd <- function(...) stats::sd(..., na.rm = TRUE)

# ---- Package data objects (documented for roxygen2) ----

#' Configuration of the SVM algorithm for binary classification
#'
#' Pre-trained SVM parameters (kernel transformation matrix, sigma,
#' coefficients, intercept) consumed by predict_binary_classifier().
#'
#' @author Sara Taylor \email{sataylor@@mit.edu}
#' @references \url{https://eda-explorer.media.mit.edu/}
"binary_classifier_config"

#' Configuration of the SVM algorithm for ternary classification
#'
#' Pre-trained SVM parameters consumed by predict_multiclass_classifier()
#' and choose_between_classes().
#'
#' @author Sara Taylor \email{sataylor@@mit.edu}
#' @references \url{https://eda-explorer.media.mit.edu/}
"multiclass_classifier_config"

#' First derivative
#'
#' Central-difference estimate of the first derivative: the mean of the
#' forward and backward one-step slopes at each interior point.
#'
#' @param values vector of numbers
#' @return a length-1 list holding the derivative vector (length
#'   `length(values) - 2`), or `list(NaN)` when fewer than 3 values are given
get_derivative <- function(values) {
  n <- length(values)
  if (n < 3) {
    return(list(NaN))
  }
  mid <- values[2:(n - 1)]
  forward_avg <- (mid + values[3:n]) / 2
  backward_avg <- (mid + values[1:(n - 2)]) / 2
  list(forward_avg - backward_avg)
}

#' Second derivative
#'
#' Discrete second derivative via the standard three-point stencil:
#' f''(i) ~ f(i + 1) - 2 f(i) + f(i - 1).
#'
#' @param values vector of numbers
#' @return a length-1 list holding the second-derivative vector (length
#'   `length(values) - 2`), or `list(NaN)` when fewer than 3 values are given
get_second_derivative <- function(values) {
  n <- length(values)
  if (n < 3) {
    return(list(NaN))
  }
  upper <- values[3:n]
  middle <- values[2:(n - 1)]
  lower <- values[1:(n - 2)]
  list(upper - 2 * middle + lower)
}


#' Derivative features
#'
#' Summarize a derivative vector into four scalar features: maximum, minimum,
#' absolute maximum and mean absolute value, each prefixed with
#' `feature_name`.
#'
#' @param derivative vector of derivatives
#' @param feature_name name of feature (used as column-name prefix)
#' @return a one-row data frame with columns `<feature_name>_max`, `_min`,
#'   `_abs_max` and `_abs_avg`
compute_derivative_features <- function(derivative, feature_name) {
  absolute <- abs(derivative)
  summaries <- list(
    max(derivative),
    min(derivative),
    max(absolute),
    mean(absolute)
  )
  names(summaries) <- paste0(
    feature_name,
    c("_max", "_min", "_abs_max", "_abs_avg")
  )
  as.data.frame(summaries)
}


#' Amplitude features
#'
#' Compute amplitude features.
#'
#' Summarizes the raw (`EDA`) and filtered (`filtered_eda`) signals per chunk
#' (identified by the `group` column, see `add_chunk_group()`): the plain
#' mean of each signal plus max / min / absolute max / absolute mean of the
#' first and second discrete derivatives of both signals.
#'
#' @param data data frame of amplitude values with columns `EDA`,
#'   `filtered_eda` and `group`
#' @return one row per chunk; the `group` column is dropped
#' @importFrom dplyr across .data
compute_amplitude_features <- function(data) {
  data %>%
    group_by(.data$group) %>%
    # get_derivative()/get_second_derivative() return a length-1 list, which
    # mutate() recycles onto every row of the group, producing list-columns;
    # the numeric vectors inside are unpacked via unlist() in the across()
    # summaries below.
    mutate(
      raw_derivative = get_derivative(.data$EDA),
      raw_second_derivative = get_second_derivative(.data$EDA),
      filtered_derivative = get_derivative(.data$filtered_eda),
      filtered_second_derivative = get_second_derivative(.data$filtered_eda)
    ) %>%
    summarize(
      raw_mean = mean(.data$EDA),
      filtered_mean = mean(.data$filtered_eda),
      # Produces columns named like raw_derivative_max,
      # filtered_second_derivative_abs_avg, etc.
      across(
        c(
          .data$raw_derivative, .data$raw_second_derivative,
          .data$filtered_derivative, .data$filtered_second_derivative
        ),
        list(
          max = ~ max(unlist(.x)),
          min = ~ min(unlist(.x)),
          abs_max = ~ max(abs(unlist(.x))),
          abs_avg = ~ mean(abs(unlist(.x)))
        )
      ),
      .groups = "drop"
    ) %>%
    select(-.data$group)
}

#' Max value per segment of length n
#'
#' Give the maximum absolute value of `values` per consecutive segment of
#' length `n`.
#'
#' @param values array of numbers
#' @param n length of each segment (number of input values folded into one
#'   output value)
#' @param output_length number of segments to produce; if
#'   `n * output_length` exceeds `length(values)` the trailing positions are
#'   read as `NA` (the package-local `max()` wrapper then drops them via
#'   `na.rm = TRUE`)
#' @return numeric vector of `output_length` per-segment maxima of
#'   `abs(values)`
max_per_n <- function(values, n, output_length) {
  if (n == 1) {
    # One value per segment: the per-segment max is just the absolute value.
    # seq_len() (instead of 1:output_length) yields an empty index for
    # output_length == 0 rather than the bogus c(1, 0).
    abs(values[seq_len(output_length)])
  } else {
    # Fold the input column-wise into an n x output_length matrix, one
    # segment per column, then take each column's absolute maximum.
    segment_matrix <- matrix(values[seq_len(n * output_length)],
      nrow = n,
      byrow = FALSE
    )
    as.double(apply(abs(segment_matrix), 2, max))
  }
}

#' Wavelet decomposition
#'
#' Run a 3-level Haar discrete wavelet transform with periodic boundary
#' handling on the signal, truncated to the largest multiple of 8 samples.
#'
#' @param data vector of values
#' @return list with the detail coefficients of levels 1-3 (`level1`,
#'   `level2`, `level3`)
#' @importFrom waveslim dwt
compute_wavelet_decomposition <- function(data) {
  usable_length <- (length(data) %/% 8) * 8

  transform <- waveslim::dwt(data[1:usable_length], "haar", 3, "periodic")

  list(
    level1 = transform$d1,
    level2 = transform$d2,
    level3 = transform$d3
  )
}


#' Wavelet coefficients
#'
#' Compute per-interval maxima of the wavelet detail coefficients of the
#' `EDA` signal, aggregated over 1-second and 0.5-second windows.
#'
#' @param data data with an EDA element
#' @return list with two data frames: `one_second_features`
#'   (levels 1-3, folded with n = 4 / 2 / 1) and `half_second_features`
#'   (levels 1-2, folded with n = 2 / 1)
compute_wavelet_coefficients <- function(data) {
  wavelets <- compute_wavelet_decomposition(data$EDA)

  n_one_second <- ceiling(nrow(data) / 8)
  n_half_second <- ceiling(nrow(data) / 4)

  list(
    one_second_features = data.frame(
      one_second_level_1 = max_per_n(wavelets$level1, 4, n_one_second),
      one_second_level_2 = max_per_n(wavelets$level2, 2, n_one_second),
      one_second_level_3 = max_per_n(wavelets$level3, 1, n_one_second)
    ),
    half_second_features = data.frame(
      half_second_level_1 = max_per_n(wavelets$level1, 2, n_half_second),
      half_second_level_2 = max_per_n(wavelets$level2, 1, n_half_second)
    )
  )
}

#' Addition of chunk groups
#'
#' partition data into chunks of a fixed number of rows in order to calculate
#'   aggregated features per chunk. The `group` label of each chunk equals
#'   the row index at which the chunk starts (1, 1 + rows_per_chunk, ...).
#'
#' @param data df to partition into chunks
#' @param rows_per_chunk size of a chunk
#' @return `data` with a `group` column added; additionally, the first row
#'   of every chunk after the first is duplicated into the preceding chunk
#' @importFrom magrittr "%>%"
#' @importFrom dplyr arrange bind_rows mutate .data
add_chunk_group <- function(data, rows_per_chunk) {
  old_part <-
    data %>%
    # Chunk labels 1, 1 + rows_per_chunk, 1 + 2 * rows_per_chunk, ..., each
    # repeated rows_per_chunk times (length.out trims the final, possibly
    # partial, chunk).
    dplyr::mutate(group = rep(
      seq(1,
        by = rows_per_chunk,
        length.out = nrow(data) / rows_per_chunk
      ),
      each = rows_per_chunk,
      length.out = nrow(data)
    ))

  # Because each group label equals the index of its chunk's first row, using
  # the labels (minus the first) as ROW indices selects exactly the first row
  # of chunks 2..k. Relabelling those rows to the preceding chunk makes every
  # chunk overlap the next one by a single sample — presumably so derivative
  # windows can span the chunk boundary (TODO confirm intent).
  new_part <-
    old_part[tail(unique(old_part$group), -1), ] %>%
    dplyr::mutate(group = .data$group - rows_per_chunk)

  dplyr::bind_rows(old_part, new_part) %>%
    dplyr::arrange(.data$group)
}

#' Compute Features for SVM
#'
#' This function computes features for support vector machines (SVM) analysis. It processes input data, 
#' applies wavelet coefficients computation, and aggregates various statistical measures over 
#' defined time chunks. The function is designed to work with data frames containing electrodermal 
#' activity (EDA) data, filtered EDA, and timestamps.
#'
#' @param data A data frame with columns for electrodermal activity (eda), filtered electrodermal 
#' activity (filtered eda), and timestamps (DateTime). It is assumed that the data frame has 
#' already been preprocessed appropriately for feature computation.
#'
#' @return A data frame with computed features. Each row corresponds to a time chunk, and columns 
#' include statistical measures (like maximum, mean, standard deviation, median, and sum of positive 
#' values) of wavelet coefficients computed over different intervals (1 second, 0.5 seconds, and 
#' based on amplitude features).
#'
#' @details The function internally computes wavelet coefficients and then divides the data into 
#' chunks of specified durations (1 second, 0.5 seconds, and a custom duration based on amplitude 
#' features). For each chunk, it calculates various statistical measures. The resulting data frame 
#' includes timestamps and the computed features, which can be used for further analysis or as input 
#' to machine learning models like SVM.
#'
#' @examples
#' \dontrun{
#' # Assuming 'my_data' is a preprocessed data frame with the necessary columns
#' features <- compute_features2(my_data)
#' }
#' @importFrom magrittr "%>%"
#' @importFrom rlang .data
#' @importFrom dplyr group_by summarize select mutate across everything
#' @export
compute_features2 <- function(data) {
  # NOTE(review): presumably declared to silence an R CMD check note about
  # the non-standard-evaluation use of `id` below — confirm.
  id <- NULL 
  sec_per_chunk <- 5
  
  coefficients <- compute_wavelet_coefficients(data)
  
  # Per-chunk summary statistics applied to every wavelet-coefficient column.
  # max/mean/std/median use the package-local NA-ignoring wrappers defined at
  # the top of this file.
  fun_lis <- list(
    max = max,
    mean = mean,
    std = sd,
    median = median,
    positive = ~ sum(.x > 0)
  )
  
  # One-second coefficients: chunks of sec_per_chunk rows (1 row per second).
  out_1sec <- coefficients$one_second_features %>%
    add_chunk_group(sec_per_chunk) %>%
    dplyr::group_by(.data$group) %>%
    dplyr::summarize(across(.cols = everything(), .fns = fun_lis), .groups = "drop") %>%
    dplyr::select(-.data$group)
  
  # Half-second coefficients: 2x the rows per chunk (2 rows per second).
  out_05sec <- coefficients$half_second_features %>%
    add_chunk_group(2 * sec_per_chunk) %>%
    dplyr::group_by(.data$group) %>%
    dplyr::summarize(across(.cols = everything(), .fns = fun_lis), .groups = "drop") %>%
    dplyr::select(-.data$group)
  
  # Raw samples: 8 rows per second, so 8 * sec_per_chunk rows per chunk —
  # consistent with an 8 Hz EDA signal (TODO confirm sample rate assumption).
  amplitude_features <-
    data %>%
    add_chunk_group(8 * sec_per_chunk) %>%
    compute_amplitude_features()
  
  # One timestamp per chunk: start time plus multiples of sec_per_chunk
  # seconds.
  timestamps <-
    data$DateTime[1] +
    sec_per_chunk * (1:nrow(amplitude_features) - 1)
  
  as.data.frame(cbind(
    id = timestamps,
    out_1sec,
    out_05sec,
    amplitude_features
  ))
}

#' SVM kernel
#'
#' Generate kernel needed for SVM: an RBF kernel matrix between the stored
#' transformation matrix and the feature columns.
#'
#' @param kernel_transformation Data matrix used to transform EDA features
#'   into kernel values
#' @param sigma The inverse kernel width used by the kernel
#' @param columns Features computed from EDA signal
#' @return kernel matrix (one column per row of `columns`)
get_kernel <- function(kernel_transformation, sigma, columns) {
  rbf_kernel <- kernlab::rbfdot(sigma = sigma)
  feature_matrix <- as.matrix(columns)
  kernlab::kernelMatrix(rbf_kernel, kernel_transformation, feature_matrix)
}


#' Binary classifiers
#'
#' Generate classifiers (artifact, no artifact)
#'
#' Scores each chunk of features against the pre-trained binary SVM stored
#' in `wearables::binary_classifier_config`; the sign of the decision value
#' becomes the label (this file's plotting code treats -1 as artifact).
#'
#' @param data features from EDA signal
#' @return data frame with one `id`/`label` pair per chunk
#' @export
predict_binary_classifier <- function(data) {
  feature_names <- c(
    "raw_mean",
    "raw_derivative_abs_max",
    "raw_second_derivative_max",
    "raw_second_derivative_abs_avg",
    "filtered_mean",
    "filtered_second_derivative_min",
    "filtered_second_derivative_abs_max",
    "one_second_level_1_max",
    "one_second_level_1_mean",
    "one_second_level_1_std",
    "one_second_level_2_std",
    "one_second_level_3_std",
    "one_second_level_3_median"
  )
  relevant_columns <- data[feature_names]

  config <- wearables::binary_classifier_config

  # NOTE: "kernel_tranformation" (sic) matches the key name used in the
  # stored configuration object.
  kernel <- unname(as.data.frame(get_kernel(
    config$kernel_tranformation,
    config$sigma,
    relevant_columns
  )))

  # One decision value per chunk (one kernel column per chunk); the sign of
  # the decision is the predicted label.
  labels <- vapply(
    kernel,
    function(kernel_column) {
      decision <- sum(config$coefficients * kernel_column) + config$intercept
      as.integer(sign(decision))
    },
    integer(1)
  )

  data.frame(
    id = data$id,
    label = labels
  )
}


#' Choice between two classes
#'
#' Make choice between two classes based on kernel values: evaluates the
#' pairwise SVM decision function for each observation and returns whichever
#' of the two class codes wins.
#'
#' @param class_a Number by which class a is indicated
#' @param class_b Number by which class b is indicated
#' @param kernels Kernel values from SVM
#' @return integer vector of winning class codes, one per observation
#' @export
choose_between_classes <- function(class_a, class_b, kernels) {
  config <- wearables::multiclass_classifier_config

  # NOTE: "coeffcients" / "constrasted" (sic) mirror the key names stored in
  # the configuration object and must not be corrected here.
  coef_a <- config$coeffcients[[paste0(class_a, "_constrasted_with_", class_b)]]
  coef_b <- config$coeffcients[[paste0(class_b, "_constrasted_with_", class_a)]]
  pair_intercept <- config$intercept[paste0(class_a, "_and_", class_b)]
  kernel_a <- kernels[[paste0("class_", class_a)]]
  kernel_b <- kernels[[paste0("class_", class_b)]]

  vapply(
    seq_len(ncol(kernel_a)),
    function(column) {
      # Decision value for this observation; positive favours class_a.
      decision <- sum(coef_a * kernel_a[, column]) +
        sum(coef_b * kernel_b[, column]) +
        pair_intercept
      if (decision > 0) as.integer(class_a) else as.integer(class_b)
    },
    integer(1)
  )
}


#' Ternary classifiers
#'
#' Generate classifiers (artifact, unclear, no artifact)
#'
#' Runs three pairwise (one-vs-one) SVM decisions between the classes -1, 0
#' and 1 and combines them by majority vote.
#'
#' @param data features from EDA signal
#' @return data frame with one `id`/`label` pair per chunk (-1, 0 or 1)
#' @export
predict_multiclass_classifier <- function(data) {
  relevant_columns <-
    data[c(
      "filtered_second_derivative_abs_max",
      "filtered_second_derivative_min",
      "one_second_level_1_std",
      "raw_second_derivative_max",
      "raw_mean",
      "one_second_level_1_max",
      "raw_second_derivative_abs_max",
      "raw_second_derivative_abs_avg",
      "filtered_second_derivative_max",
      "filtered_mean"
    )]

  config <- wearables::multiclass_classifier_config

  # One kernel matrix per class. NOTE: "kernel_tranformation" (sic) is the
  # key name used inside the stored configuration object.
  kernels <- lapply(
    config$kernel_tranformation,
    get_kernel,
    config$sigma,
    relevant_columns
  )

  # Pairwise decisions: each column holds, per chunk, the winner of one
  # class-vs-class contest.
  label_predictions <- cbind(
    `class -1 and 0` = choose_between_classes(-1, 0, kernels),
    `class -1 and 1` = choose_between_classes(-1, 1, kernels),
    `class 0 and 1` = choose_between_classes(0, 1, kernels)
  )

  # Majority vote per chunk: values[duplicated(values)] extracts the class
  # that won two contests. The three pairs {-1,0}, {-1,1}, {0,1} share no
  # common class, so all three winners can never be equal and at most one
  # duplicate exists; if all three winners differ, fall back to 0
  # ("unclear").
  label_majority_votes <- apply(
    label_predictions,
    1,
    function(values) {
      out <- values[duplicated(values)]
      if (length(out) == 0) out <- 0
      out
    }
  )

  data.frame(
    id = data$id,
    label = label_majority_votes
  )
}


#' Artifact plots
#'
#' Plot artifacts after eda_data is classified
#'
#' Draws the EDA trace over time (in minutes since the first sample) and
#' overlays a red vertical line at every chunk labelled -1 (artifact).
#'
#' @param labels labels with artifact classification (data frame with `id`
#'   timestamps and `label` values; -1 marks an artifact)
#' @param eda_data data upon which the labels are plotted (needs `DateTime`
#'   and `EDA` columns)
#' @return a ggplot object
#' @export
#' @importFrom ggplot2 ggplot aes geom_vline geom_line
#' @importFrom dplyr .data
plot_artifacts <- function(labels, eda_data) {
  # Offsets (in minutes) of artifact chunks relative to recording start.
  # NOTE(review): force_tz() reinterprets the label timestamps in the local
  # timezone — this assumes labels$id was parsed without timezone info;
  # confirm against the caller.
  binaries <-
    labels %>%
    dplyr::filter(.data$label == -1) %>%
    mutate(min = as.numeric(lubridate::force_tz(.data$id, Sys.timezone()) - eda_data$DateTime[1],
                            units = "mins")) %>%
    dplyr::pull(.data$min)

  # NOTE(review): `size` for line geoms is deprecated since ggplot2 3.4 in
  # favour of `linewidth`; left unchanged to avoid raising the required
  # ggplot2 version.
  eda_data %>%
    mutate(min = as.numeric(.data$DateTime - .data$DateTime[1], units = "mins")) %>%
    ggplot(aes(.data$min, .data$EDA)) +
    geom_vline(xintercept = binaries, colour = "red", size = 4) +
    geom_line(size = 1)
}
PCdLf/wearables documentation built on Nov. 19, 2024, 5:57 p.m.