# R/extrapolate.R
#
# Defines functions: .extrapolate_interpolate .fit_weighted .fit .extrapolate_fit .extrapolate_fitline .shift .extrapolate_neighbor .extrapolate_edges .mark_threshold .optimize_gamma .mark_gamma .extrapolate_mark .extrapolate_oversampling extrapolate_single_col extrapolate
#
# Documented in: extrapolate, extrapolate_single_col

#' Extrapolate input multi-channel accelerometer data
#'
#' \code{extrapolate} applies the extrapolation algorithm to a multi-channel
#' accelerometer data, trying to reconstruct the true movement from the
#' maxed-out samples.
#'
#' This function first linearly interpolates the input signal to 100Hz, and then
#' applies the extrapolation algorithm (see the manuscript) to recover the
#' maxed-out samples. Maxed-out samples are samples that are cut off because the
#' intensity of the underlying movement exceeds the dynamic range of the device.
#'
#' \code{extrapolate} processes a dataframe of a multi-channel accelerometer
#' signal. \code{extrapolate_single_col} processes a single-channel signal with
#' its timestamps and values specified in the first and second arguments.
#'
#' @section How is it used in MIMS-unit algorithm?: This function is the first
#'   step during MIMS-unit algorithm, applied before filtering.
#'
#' @param df dataframe. Input multi-channel accelerometer data. Used in
#'   \code{\link{extrapolate}}.  The first column should be the date/time
#' @param ... see following parameter list.
#' @param t POSIXct or numeric vector. Input index or timestamp sequence. Used in
#'   \code{\link{extrapolate_single_col}}.
#' @param value numeric vector. Value vector used in
#'   \code{\link{extrapolate_single_col}}.
#' @param range numeric vector. The dynamic ranges of the input signal. Should
#'   be a 2-element numeric vector. \code{c(low, high)}, where \code{low} is
#'   the negative max value the device can reach and \code{high} is the positive
#'   max value the device can reach.
#' @param noise_level number. The tolerable noise level in \eqn{g} unit, should
#'   be between 0 and 1. Default is 0.03, which applies to most devices.
#' @param k number. Duration of neighborhood to be used in local spline
#'   regression for each side, in seconds. Default is 0.05, as optimized by
#'   MIMS-unit algorithm.
#' @param spar number. Between 0 and 1, to control how smooth we want to fit
#'   local spline regression, 0 is linear and 1 matches all local points.
#'   Default is 0.6, as optimized by MIMS-unit algorithm.
#' @return \code{extrapolate} returns a dataframe with extrapolated multi-channel
#'   signal. \code{extrapolate_single_col} returns a dataframe with extrapolated
#'   single-channel signal, the timestamp col is in numeric values instead of
#'   POSIXct format.
#' @family extrapolation related functions
#' @name extrapolate
#' @examples
#'   # Use the maxed-out data for the conceptual diagram
#'   df = conceptual_diagram_data[
#'           conceptual_diagram_data['GRANGE'] == 4,
#'           c("HEADER_TIME_STAMP", "X")]
#'
#'   # Plot input
#'   illustrate_signal(df, range=c(-4, 4))
#'
#'   # Use the default parameter settings as in the MIMS-unit algorithm
#'   # The dynamic range of the input data is -4g to 4g.
#'   output = extrapolate(df, range=c(-4, 4))
#'
#'   # Plot output
#'   illustrate_signal(output, range=c(-4, 4))
NULL

#' @rdname extrapolate
#' @export
extrapolate <- function(df, ...) {
  # Remember the input timezone so it can be restored at the end:
  # the per-column extrapolation returns numeric timestamps.
  time_zone <- lubridate::tz(df[[1, 1]])
  t <- df[[1]]
  values <- df[2:ncol(df)]
  # Extrapolate each value column independently against the shared
  # timestamp column.
  result <- plyr::alply(values,
    .margins = 2, function(col_data) {
      col_name <- names(col_data)[1]
      output <- extrapolate_single_col(t, col_data[[1]], ...)
      colnames(output) <- c(colnames(df)[1], col_name)
      return(output)
    }
  )
  # Merge the per-column results back into one dataframe on the
  # timestamp column.
  result <- Reduce(
    function(x, y) {
      return(merge(x, y, by = colnames(x)[1]))
    },
    result
  )
  # BUG FIX: prefix the value columns with "EXTRAPOLATED_". The original
  # wrote `names(result[2:ncol(result)]) <- ...`, which assigns names to a
  # temporary sub-dataframe and leaves `result` unchanged; the assignment
  # must target `names(result)[...]`.
  names(result)[2:ncol(result)] <-
    paste("EXTRAPOLATED", names(result)[2:ncol(result)], sep = "_")
  # Restore POSIXct timestamps in the original timezone.
  result[, 1] <-
    as.POSIXct(result[, 1], origin = "1970-01-01", tz = time_zone)
  return(result)
}

#' @rdname extrapolate
#' @export
extrapolate_single_col <-
  function(t,
           value,
           range,
           noise_level = 0.03,
           k = 0.05,
           spar = 0.6) {
    # Oversample the signal to 100Hz with natural spline interpolation.
    # DRY FIX: delegate to the shared .extrapolate_oversampling helper
    # instead of duplicating its body inline; the helper additionally
    # preserves the timezone of POSIXct inputs, which the inline copy
    # dropped when converting back with as.POSIXct().
    dat_over <- .extrapolate_oversampling(t, value)
    t <- dat_over$t
    value <- dat_over$value
    rm(dat_over); gc()

    # Soft-mark maxed-out samples (gamma-CDF marker in [-1, 1]).
    marker_fun <- .extrapolate_mark("gamma")
    marker <- marker_fun(t, value, range[1], range[2], noise_level)

    # Expand each maxed-out region edge into a k-second fitting
    # neighborhood on each side (at 100Hz).
    neighbors <- .extrapolate_neighbor(marker, 100, k)

    # Fit local weighted spline regressions on both sides of each region
    # and extrapolate the peak points.
    points_ex <-
      .extrapolate_fit(t, value, neighbors, marker, spar, 100, k)

    # Merge the extrapolated points with the reliable samples and
    # resample onto a regular 100Hz grid.
    dat_interp <-
      .extrapolate_interpolate(t, value, marker, points_ex, 100)

    return(dat_interp)
  }

.extrapolate_oversampling <- function(t, value) {
  # Resample the signal onto a regular 100Hz grid using natural spline
  # interpolation, preserving the timezone of POSIXct inputs and dropping
  # duplicated timestamps. Returns list(t = ..., value = ...).
  tz_in <- lubridate::tz(t[1])
  grid_100hz <- seq(t[1], t[length(t)], by = 1 / 100)

  interp <- data.frame(
    stats::spline(
      x = t,
      y = value,
      xout = grid_100hz,
      method = "natural"
    )
  )

  # spline() returns numeric x; restore POSIXct (with the original
  # timezone) when the input timestamps were POSIXct.
  if (lubridate::is.POSIXct(t[1])) {
    interp[1] <- as.POSIXct(interp[[1]], origin = "1970-01-01", tz = tz_in)
  }

  # Sort by time and keep only the first occurrence of each timestamp.
  interp <- interp[order(interp$x), ]
  keep <- !duplicated(interp$x)
  list(t = interp$x[keep], value = interp$y[keep])
}

.extrapolate_mark <- function(method = "gamma") {
  # Select the marking strategy for maxed-out regions: the soft gamma-CDF
  # marker or the hard threshold marker. Unknown methods yield NULL,
  # matching the original switch() semantics.
  if (method == "gamma") {
    .mark_gamma
  } else if (method == "threshold") {
    .mark_threshold
  } else {
    NULL
  }
}

.mark_gamma <-
  function(t, value, range_low, range_high, noise_sd = 0.03) {
    # Soft-mark maxed-out samples with a gamma CDF: values near the
    # positive rail approach +1, values near the negative rail approach
    # -1, values well inside the dynamic range stay near 0.
    # BUG FIX: the default used to be the self-referential
    # `noise_sd = noise_sd`, which raises a "promise already under
    # evaluation" error whenever the argument is omitted; 0.03 is the
    # documented device default (see extrapolate()'s noise_level).

    # init the mark vector
    marker <- rep(0, length(t))

    # model of 3sd and shape para with confident probability at 0.95;
    # the epsilon keeps the shape search well-defined for zero noise
    noise_sd <- noise_sd + 1e-05
    shape <- .optimize_gamma(3 * noise_sd)
    # mark using gamma distribution, measured from a rail pulled in by
    # 5 noise standard deviations
    marker[value >= 0] <-
      stats::pgamma(value[value >= 0] - (range_high - 5 * noise_sd),
        shape = shape,
        scale = 1
      )
    marker[value < 0] <-
      -stats::pgamma(-value[value < 0] + (range_low + 5 * noise_sd),
        shape = shape,
        scale = 1
      )
    return(marker)
  }

.optimize_gamma <- function(value) {
  # Search for the gamma shape parameter (scale = 1) whose CDF evaluated
  # at `value` first reaches the 0.95 confidence level, scanning shapes
  # downward from 0.5 to 0.001 in steps of 0.001. Of the two shapes
  # straddling 0.95, return the one whose CDF is closer to 0.95;
  # return 0 when no crossing occurs.
  candidate_shapes <- seq(0.5, 0.001, by = -0.001)
  chosen <- 0
  prev_p <- 1
  prev_shape <- 0
  for (shape in candidate_shapes) {
    cur_p <- stats::pgamma(value, shape = shape, scale = 1)
    if (prev_p < 0.95 && cur_p >= 0.95) {
      # pick whichever straddling shape gives a CDF closer to 0.95
      chosen <-
        if (abs(0.95 - prev_p) > abs(cur_p - 0.95)) shape else prev_shape
      break
    }
    prev_p <- cur_p
    prev_shape <- shape
  }
  chosen
}

.mark_threshold <-
  function(t, value, range_low, range_high, noise_sd = 0.03) {
    # Hard-mark maxed-out samples: 1 where the value reaches the upper
    # rail, -1 where it reaches the lower rail, 0 elsewhere.
    # BUG FIX: the default used to be the self-referential
    # `noise_sd = noise_sd`, which errors whenever the argument is
    # omitted; 0.03 is the documented device default (consistent with
    # .mark_gamma and extrapolate()'s noise_level).

    # init the mark vector
    marker <- rep(0, length(t))

    # Pull the rails in by 5 noise standard deviations so samples
    # hovering just under the physical range still count as maxed out.
    u_bound <- range_high - 5 * noise_sd
    l_bound <- range_low + 5 * noise_sd

    marker[value >= u_bound] <- 1
    marker[value <= l_bound] <- -1
    return(marker)
  }

#' @importFrom magrittr %>%
.extrapolate_edges <- function(marker, confident, sr) {
  # Locate the boundaries of maxed-out regions from the soft marker
  # vector: "hills" (marker > 0, pinned at the positive rail) and
  # "valleys" (marker < 0, pinned at the negative rail). Returns a
  # data.frame with one row per region; `left_end` is the last confident
  # sample before the region and `right_start` the first one after it.
  # An entry of -1 marks a region truncated at the signal boundary.
  marker_diff_left <- c(0, diff(marker))
  marker_diff_right <- c(diff(marker), 0)

  # Unmatched edge regions longer than 5 seconds (sr * 5 samples) are
  # kept with a -1 sentinel; shorter ones are discarded.
  threshold_maxedout <- sr * 5

  # hills
  positive_left_end <-
    which(marker_diff_left > confident & marker > 0)
  positive_right_start <-
    which(marker_diff_right < -confident & marker > 0)

  if (length(positive_left_end) - length(positive_right_start) == 1) {
    # trailing edge: a > 5 second maxed-out region runs past the end
    if (length(marker) - positive_left_end[length(positive_left_end)] >
      threshold_maxedout) {
      positive_right_start <- c(positive_right_start, -1)
    } else {
      # short trailing edge region: drop the unmatched left boundary
      if (length(positive_left_end) == 1) {
        positive_left_end <- c()
      } else {
        positive_left_end <-
          positive_left_end[1:(length(positive_left_end) - 1)]
      }
    }
  } else if (length(positive_left_end) - length(positive_right_start) == -1) {
    # BUG FIX: this condition used to read
    #   length(positive_left_end) - length(positive_right_start == -1)
    # i.e. a plain length difference coerced to logical, so the branch
    # fired whenever the counts differed at all instead of only when the
    # right side has exactly one extra boundary.
    # leading edge: a > 5 second maxed-out region precedes the signal
    if (positive_right_start[1] > threshold_maxedout) {
      positive_left_end <- c(-1, positive_left_end)
    } else {
      # short leading edge region: drop the unmatched right boundary
      if (length(positive_right_start) == 1) {
        positive_right_start <- c()
      } else {
        positive_right_start <-
          positive_right_start[2:length(positive_right_start)]
      }
    }
  }

  positive_left_end <- positive_left_end %>% stats::na.omit()
  positive_right_start <-
    positive_right_start %>% stats::na.omit()

  if (abs(length(positive_left_end) - length(positive_right_start)) > 2) {
    # something is wrong
    stop("The side of maxed out hills are diff > 1, should never happen")
  }

  if (any(positive_right_start - positive_left_end < 0) &&
    length(positive_right_start) > 1) {
    # boundaries are interleaved off by one; realign the two vectors
    positive_left_end <- .shift(positive_left_end, 1)
    positive_right_start <- .shift(positive_right_start, -1)
  }

  positive_edges <-
    data.frame(
      left_end = positive_left_end,
      right_start = positive_right_start,
      stringsAsFactors = FALSE
    )


  # valleys (mirror image of the hill logic above)
  negative_left_end <-
    which(marker_diff_left < -confident & marker < 0)
  negative_right_start <-
    which(marker_diff_right > confident & marker < 0)

  if (length(negative_left_end) - length(negative_right_start) == 1) {
    # trailing edge: a > 5 second maxed-out region runs past the end
    if (length(marker) - negative_left_end[length(negative_left_end)] >
      threshold_maxedout) {
      negative_right_start <- c(negative_right_start, -1)
    } else {
      # short trailing edge region: drop the unmatched left boundary
      if (length(negative_left_end) == 1) {
        negative_left_end <- c()
      } else {
        negative_left_end <-
          negative_left_end[1:(length(negative_left_end) - 1)]
      }
    }
  } else if (length(negative_left_end) - length(negative_right_start) == -1) {
    # BUG FIX: same parenthesization error as in the hill branch above.
    # leading edge: a > 5 second maxed-out region precedes the signal
    if (negative_right_start[1] > threshold_maxedout) {
      negative_left_end <- c(-1, negative_left_end)
    } else {
      # short leading edge region: drop the unmatched right boundary
      if (length(negative_right_start) == 1) {
        negative_right_start <- c()
      } else {
        negative_right_start <-
          negative_right_start[2:length(negative_right_start)]
      }
    }
  }

  negative_left_end <- negative_left_end %>% stats::na.omit()
  negative_right_start <-
    negative_right_start %>% stats::na.omit()

  if (abs(length(negative_left_end) - length(negative_right_start)) > 2) {
    # something is wrong
    stop("The side of maxed out valleys are diff > 1, should never happen")
  }

  if (any(negative_right_start - negative_left_end < 0) &&
    length(negative_right_start) > 1) {
    negative_left_end <- .shift(negative_left_end, 1)
    negative_right_start <- .shift(negative_right_start, -1)
  }

  negative_edges <-
    data.frame(
      left_end = negative_left_end,
      right_start = negative_right_start,
      stringsAsFactors = FALSE
    )

  edges <- rbind(positive_edges, negative_edges)
  return(edges)
}

#' @importFrom magrittr %>%
.extrapolate_neighbor <- function(marker, sr, k, confident = 0.5) {
  # Expand each maxed-out region edge into a fitting neighborhood of
  # k * sr samples on each side, clamped to the valid index range.
  # Returns a data.frame with columns left_end, right_start, left_start,
  # right_end (same order the dplyr::mutate call used to produce).
  n_neighbor <- k * sr
  edges <- .extrapolate_edges(marker, confident, sr)
  if (nrow(edges) > 0) {
    # plain column assignment replaces the former tibble/dplyr round trip
    neighbors <- edges
    neighbors$left_start <- edges$left_end - n_neighbor + 1
    neighbors$right_end <- edges$right_start + n_neighbor - 1
    neighbors$left_start <- pmax(neighbors$left_start, 1)
    neighbors$right_end <-
      pmin(neighbors$right_end, length(marker))
    # regions truncated at the signal boundary keep the -1 sentinel
    neighbors$left_start[neighbors$left_end == -1] <- -1
    neighbors$right_end[neighbors$right_start == -1] <- -1
  } else {
    # FIX: keep a consistent 4-column, 0-row schema. The original used
    # `c()` (i.e. NULL) for each column, which silently produced a
    # zero-column data.frame instead.
    neighbors <-
      data.frame(
        left_start = numeric(0),
        right_end = numeric(0),
        left_end = numeric(0),
        right_start = numeric(0)
      )
  }
  return(neighbors)
}

.shift <- function(v, lag, truncate = TRUE) {
  # Drop `lag` elements from the tail (lag > 0) or the head (lag < 0)
  # of vector `v`, used to realign edge-boundary vectors.
  # NOTE: `truncate` is kept for backward compatibility but unused.
  # BUG FIX: lag == 0 used to evaluate v[-(1:0)] == v[c(-1, 0)], which
  # silently dropped the first element; it now returns `v` unchanged.
  n <- length(v)
  if (lag == 0 || n == 0) {
    return(v)
  }
  if (lag > 0) {
    v <- v[-((n - lag + 1):n)]
  } else {
    v <- v[-(1:(-lag))]
  }
  return(v)
}

#' @importFrom magrittr %>%
.extrapolate_fitline <-
  function(t,
           value,
           neighbors,
           marker,
           spar,
           sr,
           k,
           model = "spline") {
    # Build full extrapolated line segments for every maxed-out region:
    # the fitted left curve, the averaged mid point, and the fitted right
    # curve, labeled via `type_ex`. Unlike .extrapolate_fit (which returns
    # only mid points), this keeps both curves.
    # NOTE(review): only the "spline" branch of the switch() below is
    # implemented; for any other `model`, left_ex/right_ex/type_ex/index
    # stay undefined and the final data.frame() call would error —
    # presumably this helper is only called with the default model.
    # NOTE(review): NULL fits from .fit_weighted (the -1 boundary
    # sentinel) are not handled here, unlike in .extrapolate_fit.
    neighbors$index <- 1:nrow(neighbors)
    point_ex <- neighbors %>% plyr::adply(1, function(neighbor) {
      # validate neighboring
      # Fit weighted local models over the left and right neighborhoods
      # of this maxed-out region.
      fitted_left <-
        .fit_weighted(
          t,
          value,
          marker,
          neighbor$left_start,
          neighbor$left_end,
          spar,
          sr,
          k,
          model
        )
      fitted_right <-
        .fit_weighted(
          t,
          value,
          marker,
          neighbor$right_start,
          neighbor$right_end,
          spar,
          sr,
          k,
          model
        )

      # Work in seconds relative to the left boundary timestamp `st`.
      st <- t[neighbor$left_end]
      left_end <- 0
      right_start <-
        as.numeric(t[neighbor$right_start] - st, units = "secs")

      # Mid point of the gap: predictions from both sides are averaged
      # there.
      middle_t <- (left_end + right_start) / 2
      middle_t <- st + middle_t
      # 100Hz evaluation grids from the left boundary to the middle and
      # from the middle to the right boundary (the middle point is
      # appended/prepended so both curves include it).
      left_x_ex <- c(seq(st, middle_t, 1 / 100), middle_t)
      right_x_ex <-
        c(middle_t, seq(middle_t, st + right_start, 1 / 100))
      # if (left_end == right_start) {
      #   return(data.frame(
      #     t_ex = c(),
      #     value_ex = c(),
      #     type_ex = c(),
      #     index =
      #   ))
      # }
      switch(model,
      spline = {
        # Evaluate both fitted splines over their grids and at the mid
        # point; label each output sample so callers can tell the left
        # curve, the mid point, and the right curve apart.
        left_ex <- fitted_left %>% stats::predict(x = as.numeric(left_x_ex))
        right_ex <-
          fitted_right %>% stats::predict(x = as.numeric(right_x_ex))
        type_ex <- rep("left_line", length(left_ex$y))

        point_ex_left <-
          fitted_left %>% stats::predict(x = as.numeric(middle_t))
        point_ex_right <-
          fitted_right %>% stats::predict(x = as.numeric(middle_t))
        point_ex <- (point_ex_left$y + point_ex_right$y) / 2
        type_ex <- c(type_ex, "point")
        type_ex <-
          c(type_ex, rep("right_line", length(right_ex$y)))
        index <- rep(neighbor$index, length(type_ex))
      }
      )
      return(data.frame(
        t_ex = c(left_x_ex, middle_t, right_x_ex),
        value_ex = c(left_ex$y, point_ex, right_ex$y),
        type_ex = type_ex,
        index = index
      ))
    },
    .inform = TRUE, .id = NULL, .expand = FALSE
    )
    return(point_ex)
  }

#' @importFrom magrittr %>%
.extrapolate_fit <-
  function(t,
           value,
           neighbors,
           marker,
           spar,
           sr,
           k,
           model = "spline") {
    # For each maxed-out region in `neighbors`, fit weighted local
    # regressions over the left and right neighborhoods and extrapolate
    # the peak: both fits are evaluated at the temporal midpoint of the
    # gap and averaged. Returns a data.frame with columns t_ex and
    # value_ex (one row per region, or multiple rows for boundary cases).
    point_ex <- neighbors %>% plyr::adply(1, function(neighbor) {
      # validate neighboring

      # Either fit may be NULL when the region is truncated at the
      # signal boundary (-1 sentinel from .extrapolate_neighbor).
      fitted_left <-
        .fit_weighted(
          t,
          value,
          marker,
          neighbor$left_start,
          neighbor$left_end,
          spar,
          sr,
          k,
          model
        )
      fitted_right <-
        .fit_weighted(
          t,
          value,
          marker,
          neighbor$right_start,
          neighbor$right_end,
          spar,
          sr,
          k,
          model
        )

      if (is.null(fitted_left)) {
        # Region runs past the start of the signal: no left-side fit is
        # possible, so fill the truncated span with a -200 sentinel.
        # NOTE(review): presumably these sentinel values are discarded or
        # dominated downstream — confirm before relying on them.
        n_rep <- neighbor$right_start - 1
        point_ex <- rep(-200, n_rep) # give an extra huge value
        middle_t <- t[1:(neighbor$right_start - 1)]
      } else if (is.null(fitted_right)) {
        # Region runs past the end of the signal: mirror of the case above.
        n_rep <- length(t) - neighbor$left_end
        point_ex <- rep(-200, n_rep) # give an extra huge value
        middle_t <- t[(neighbor$left_end + 1):length(t)]
      } else {
        # Normal case: work in seconds relative to the left boundary
        # timestamp, find the gap midpoint, and average both predictions.
        st <- t[neighbor$left_end]
        left_end <- 0
        right_start <-
          as.numeric(t[neighbor$right_start] - st, units = "secs")

        middle_t <- (left_end + right_start) / 2
        middle_t <- st + middle_t
        switch(model, linear = {
          # linear fits were built with formula `over_value ~ over_t`,
          # so predict() needs an `over_t` column in newdata
          left_ex <-
            fitted_left %>%
            stats::predict(data.frame(over_t = as.numeric(middle_t)))
          right_ex <-
            fitted_right %>%
            stats::predict(data.frame(over_t = as.numeric(middle_t)))
          point_ex <- (left_ex + right_ex) / 2
        },
        spline = {
          left_ex <- fitted_left %>% stats::predict(x = as.numeric(middle_t))
          right_ex <-
            fitted_right %>% stats::predict(x = as.numeric(middle_t))
          point_ex <- (left_ex$y + right_ex$y) / 2
        }
        )
      }
      return(data.frame(t_ex = middle_t, value_ex = point_ex))
    },
    .inform = TRUE,
    .id = NULL,
    .expand = FALSE
    )
    return(point_ex)
  }

.fit <-
  function(t,
           value,
           marker,
           start,
           end,
           spar,
           sr,
           k,
           model = "spline") {
    # Fit an unweighted local regression (smoothing spline or linear
    # model) over the window t[start:end], for extrapolation into a
    # maxed-out region. `marker` is part of the shared signature but is
    # not used by this variant. Returns the fitted model object, or NULL
    # for an unrecognized `model`.

    # Clamp the window to the valid index range.
    if (start < 0) {
      start <- 1
    }
    if (end > length(t)) {
      end <- length(t)
    }

    # Resample the window so every fit sees k * sr evenly spaced points,
    # then discard the last resampled point (it sits on the maxed-out
    # boundary).
    n_over <- k * sr
    resampled <-
      stats::approx(x = t[start:end], y = value[start:end], n = n_over)
    over_t <- resampled$x[-n_over]
    over_value <- resampled$y[-n_over]

    # NOTE: the formula variable names `over_t`/`over_value` are part of
    # the contract — downstream predict() calls pass newdata$over_t.
    fitted <- if (model == "spline") {
      stats::smooth.spline(over_t, over_value, spar = spar)
    } else if (model == "linear") {
      stats::lm(over_value ~ over_t, na.action = stats::na.omit)
    } else {
      NULL
    }
    return(fitted)
  }

.fit_weighted <-
  function(t,
           value,
           marker,
           start,
           end,
           spar,
           sr,
           k,
           model = "spline") {
    # Weighted local fit over the window t[start:end]: each sample's
    # weight is 1 - marker, so samples strongly suspected of being maxed
    # out contribute little to the fit. Returns NULL for the -1/-1
    # sentinel window (a region truncated at the signal boundary), or for
    # an unrecognized `model`.
    # if(start < 0) start = 1 if(end > length(t)) end = length(t)

    if (start == -1 && end == -1) {
      # start/end edge case: nothing to fit on this side
      return(NULL)
    }

    # Resample both the signal and its weights onto n_over evenly spaced
    # points inside the window so the fit always sees k * sr samples.
    n_over <- k * sr
    window <- start:end
    sig <- stats::approx(x = t[window], y = value[window], n = n_over)
    wts <- stats::approx(x = t[window], y = 1 - marker[window], n = n_over)
    over_t <- sig$x
    over_value <- sig$y

    # NOTE: the formula variable names `over_t`/`over_value` are part of
    # the contract — downstream predict() calls pass newdata$over_t.
    if (model == "spline") {
      stats::smooth.spline(over_t, over_value, wts$y, spar = spar)
    } else if (model == "linear") {
      stats::lm(
        over_value ~ over_t,
        weights = wts$y,
        na.action = stats::na.omit
      )
    } else {
      NULL
    }
  }

#' @importFrom magrittr %>%
.extrapolate_interpolate <-
  function(t, value, marker, points_ex, sr, confident = 0.5) {
    # Merge the extrapolated points with the confidently-measured samples
    # and spline-resample the result onto a regular grid at `sr` Hz.
    # Samples with |marker| >= confident are considered maxed out and are
    # replaced by the extrapolated points. If fewer than 30% of samples
    # are confident, extrapolation is skipped and the original signal is
    # resampled as-is.
    # PERF FIX: removed the scattered rm()/gc() calls (explicit gc() on
    # every invocation adds large collector pauses) and the
    # dplyr/magrittr round trips for trivial operations (first/last/
    # arrange), which base R handles directly.
    first_t <- t[1]
    last_t <- t[length(t)]

    keep <- abs(marker) < confident

    if (sum(keep, na.rm = TRUE) / length(t) < 0.3) {
      # too few reliable samples: keep the original signal untouched
      dat <- data.frame(t = t, value = value)
    } else {
      # combine reliable samples with extrapolated points, time-ordered
      dat <-
        rbind(
          data.frame(t = t[keep], value = value[keep]),
          data.frame(t = points_ex$t_ex, value = points_ex$value_ex)
        )
      dat <- dat[order(dat$t), ]
    }

    # resample onto a regular sr-Hz grid
    t_interp <- seq(first_t, last_t, by = 1 / sr)
    out <-
      stats::na.omit(
        as.data.frame(
          stats::spline(dat$t, y = dat$value, xout = t_interp)
        )
      )
    names(out) <- c("t", "value")

    return(out)
  }

# Try the MIMSunit package in your browser.
#
# Any scripts or data that you put into this service are public.
#
# MIMSunit documentation built on June 21, 2022, 5:06 p.m.