R/funs_brokenstick.r

#' Fitter function for brokenstick models
#' 
#' Basic computing engines of the brokenstick algorithm called by 
#' \code{\link{brokenstick}}.
#' 
#' @param xy Data, as returned by \code{\link{xy.coords}}. The \code{x} values 
#' have to be sorted.
#' @param pts The points to use.
#' @param eco.mem An integer value between 0 and 7 to control the memory size of the 
#' output. \code{0} = no memory savings.
#' @seealso \code{\link{brokenstick}} and \code{\link{optBrokenstick}} which should 
#' be used to fit brokenstick models.
#' @keywords internal
#' @export
bsmfit <- function(xy, pts, eco.mem = 0L) { 
  if (eco.mem > 4) stop('"eco.mem" must be between 0 and 4.')
  
  # Compute stick slopes and coefficients
  n <- diff(pts)
  n[1L] <- n[1L] + 1 # First segment needs 1 more prediction for the first point
  dy <- diff(xy[pts, 2L]) ; dx <- diff(xy[pts, 1L])
  a <- dy / dx
  b <- xy[pts[-length(pts)], 2L] - (a * xy[pts[-length(pts)], 1L])
  
  # Compute fitted values and residuals
  fit <- rep(a, n) * xy[ , 1L] + rep(b, n)
  res <- xy[, 2] - fit
  
  # Format output
  out <- list(pts.x = xy[pts, 1L], pts.y = xy[pts, 2L], slope = a, intercept = b, 
              pts = pts, residuals = res, fitted.values = fit, data = xy)
  "if"(eco.mem != 0L, out[-unique(seq(8L - eco.mem + 1L, 8L))], out)
}

#' Fitting brokenstick models
#' 
#' \code{brokenstick} is used to fit brokenstick models on two-dimensional data 
#' such as Time-Depth, Depth-Temperature or Depth-Light profiles. Brokenstick models 
#' are useful for compressing or extracting the shape of high resolution profiles.
#' 
#' @param x The x data. Note that the \code{x} values have to be sorted. Can also 
#' be a list of \code{x} and \code{y} (processed by \code{\link{xy.coords}}). 
#' Alternatively, it can also be an object of class "\code{\link{formula}}".
#' @param y The y data. (processed by \code{\link{xy.coords}}). Alternatively, 
#' if \code{x} is a \code{'formula'}, \code{y} can also be an optional data frame, 
#' list or environment (or object coercible by \code{\link{as.data.frame}} to a 
#' data frame) containing the variables in the model. If not found in data, the 
#' variables are taken from environment (\code{formula}), typically the environment 
#' from which \code{brokenstick} is called.
#' @param npts The number of points for the brokenstick model to fit. See 
#' \code{\link{optBrokenstick}} for a version of \code{brokenstick} which figures 
#' out this number of points automatically.
#' @param start Some starting points to start the algorithm with.
#' @param na.action A function which indicates what should happen when the data 
#' contain \code{NAs}. The default is set by the \code{na.action} setting of 
#' \code{\link{options}}, and is \code{\link{na.fail}} if that is unset. The 
#' "factory-fresh" default is \code{\link{na.omit}}. Another possible value 
#' is \code{NULL}, no action. Value \code{\link{na.exclude}} can be useful.
#' @param ... Further arguments to be passed to \code{\link{bsmfit}} 
#' such as \code{eco.mem}.
#' @param allow.dup If \code{TRUE} the algorithm will not stop when duplicated 
#' breakpoints are found. The output will contain a slot called \code{dup}, a 
#' data.frame with the breakpoint number of the duplicates (\code{dup.no}) and the 
#' breakpoint number of the its clone among real breakpoints (\code{pts.no})
#' @param not.inj.action What should be done when \code{f: y -> x} is not injective.
#' @param sort.data Should the data be sorted according to \code{x} values (when they 
#' are not) before fitting the broken-stick model.
#' 
#' @return A \code{bsm} object with (depending on \code{eco.mem}):
#' \itemize{
#'   \item pts.x The x values of the brokenstick points (\code{eco.mem} inefficient).
#'   \item pts.y The y values of the brokenstick points (\code{eco.mem} inefficient).
#'   \item slope The slopes of each "stick" of the model (\code{eco.mem} inefficient).
#'   \item intercept The intercept of each "stick" of the model (\code{eco.mem} inefficient).
#'   \item pts (\code{eco.mem < 4}) The index of the brokenstick points.
#'   \item na.action Information from the action which was applied to object if \code{NAs} 
#'   were handled specially. (\code{eco.mem} inefficient)
#'   \item residuals (\code{eco.mem < 3}) The model's residuals.
#'   \item fitted.values (\code{eco.mem < 2}) The fitted values.
#'   \item data (\code{eco.mem < 1}) The input data used for fitting.
#'   \item pts.no The iteration number of points. (\code{eco.mem} inefficient)
#' }
#' @details See \code{\link{bsmfit}}, the function called by \code{brokenstick} 
#' on each iteration to fit the model with specified points.
#' @seealso \code{\link{optBrokenstick}} and \code{\link{predict.bsm}}, \code{\link{residuals.bsm}}, 
#' \code{\link{update.bsm}}, \code{\link{summary.bsm}}, 
#' \code{\link{coef.bsm}}, \code{\link{plot.bsm}}, \code{\link{as.data.frame.bsm}}
#' for other functions with a S3 method for \code{bsm} objects. 
#' @export
#' @keywords brokenstick
#' @examples
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 100, obj = exses)[[1]]
#' 
#' # Syntax
#' bsm <- brokenstick(dv$time, dv$depth)
#' bsm <- brokenstick(dv) # if two columns
#' bsm <- brokenstick(depth ~ time, dv) 
#' bsm <- with(dv, brokenstick(depth ~ time))
brokenstick <- function(x, y, npts = 6, start  = NULL, na.action, 
                        allow.dup = FALSE, 
                        not.inj.action = c("ignore", "null", "error"), 
                        sort.data = FALSE, ...) {
  UseMethod("brokenstick")
}

#' @inheritParams brokenstick
#' @export
brokenstick.default <- function(x, y = NULL, npts = 6, 
                                start = NULL, na.action, 
                                allow.dup = FALSE, 
                                not.inj.action = c("ignore", "null", "error"), 
                                sort.data = FALSE, ...) {
  # Format input data
  nms <- if (is.recursive(x)) {
    names(x)
  } else {
    c(deparse(substitute(x)), deparse(substitute(y)))
  }
  if (any(grepl('\\$', nms))) nms <- gsub('(.*\\$)(.*$)', '\\2', nms)
  else if (any(sapply(nms, nchar) > 10)) nms <- c('x', 'y')
  xy <- setNames(as.data.frame(xy.coords(x, y)[1:2]), nms) # formated data
  
  # Apply na.action
  if (missing(na.action)) na.action <- options("na.action")[[1]]
  xy <- do.call(na.action, list(xy))
  
  # Check data integrity
  # X data should be monotonous
  if (is.unsorted(xy[ ,1]) & is.unsorted(rev(xy[ ,1]))) {
    message("X values are not sorted.")
    if (sort.data) xy <- xy[order(xy[ ,1]), ]
  }
  # f: Y -> X should be injective
  if (!is.injective(xy[ ,2], xy[ ,1])) {
    not.inj.action <- match.arg(not.inj.action, not.inj.action)
    if (not.inj.action == "error") {
      stop("Data contains different y values with same x values.", 
              " This is not appropriate for brokenstick models and ", 
              "will result in segments with infinite coefficients.")
    } else {
    warning("Data contains different y values with same x values.", 
            " This is not appropriate for brokenstick models and ", 
            "will result in segments with infinite coefficients.")
      if (not.inj.action == "null") return(NULL)
    }
  }
  
  # Broken sticks algorithm
  pts <- "if"(is.null(start) || length(start) < 2, c(1, length(xy[ , 1L])), start)
  np <- length(pts)
  pts.no <- rep(1L, np)
  dup.pts <- data.frame() ; ndup <- 0
  niter <- 0
  while (np < npts) {
    niter <- niter + 1
    if (niter > npts) {
      warning("Infinite loop issue. brokenstick returns NULL.")
      return(NULL)
    }
    brkstk <- bsmfit(xy, pts)
    absRes <- abs(brkstk$residuals)
    pts <- c(pts, which.max(absRes))
    pts.no <- c(pts.no, max(pts.no) + ndup + 1L)
    dup <- duplicated(pts)
    dup.pts <- rbind(dup.pts, data.frame(
      dup.no = pts.no[dup], 
      pts.no = which(pts[!dup] == pts[dup])
    ))
    ndup <- nrow(dup.pts)
    pts <- pts[!dup] ; pts.no <- pts.no[!dup]
    ord <- order(pts) ; pts <- pts[ord] ; pts.no <- pts.no[ord]
    if (ndup > 0) {
      if (ndup == 1L) warning('Duplicated points found. "npts" may be too high.')
      if (!allow.dup) break
    }
    np <- length(pts) + ndup
  }
  out <- bsmfit(xy, pts, ...)
  out$pts.no <- pts.no
  out$dup    <- dup.pts
  
  # Format output
  out$na.action <- attr(xy, "na.action")
  class(out) <- c('bsm', 'list')
  out
}

#' @inheritParams brokenstick
#' @keywords internal
#' @export
brokenstick.formula <- function(x, y = NULL, npts = 6, start = NULL, 
                                na.action, ...) {
  # Format input data from "formula" ("x" arg) and "data" ("y" arg) syntax
  mf <- match.call(expand.dots = FALSE)
  m <- match(c("x", "y"), names(mf), 0L)
  mf <- mf[c(1L, m)]
  mf[[1L]] <- quote(stats::model.frame)
  names(mf) <- c('', 'formula', 'data')[seq_along(mf)]
  mf <- eval(mf, parent.frame())
  
  # Apply na.action
  if (missing(na.action)) na.action <- options("na.action")[[1]]
  
  # Launch algorithm as usual
  brokenstick.default(mf[2:1], NULL, npts = npts, start = start, na.action, ...)
}

#' Update and Re-fit a brokenstick model
#' 
#' @param object Object of class inheriting from "\code{bsm}".
#' @inheritParams brokenstick
#' @param allow.dup Should the duplicated points be added anyway.
#' @export
#' @keywords internal brokenstick
#' @examples
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 100, obj = exses)[[1]]
#' bsm <- brokenstick(dv) 
#' length(bsm$pts.x)
#' 
#' length(update(bsm, 5)$pts.x)
#' length(update(bsm, 7)$pts.x)
#' 
#' \dontrun{
#' # low resolution
#' lr_bsm <- eco.mem(bsm)
#' length(update(lr_bsm, 5)$pts.x)
#' length(update(lr_bsm, 7)$pts.x)
#' }
update.bsm <- function(object, npts, allow.dup = FALSE, ...) {
  # Check if an update is needed
  np <- length(object$pts.x)
  if (np == npts) return(object)
  
  # If Hi-Res data are not available update possible with smaller number of break points
  if (!"data" %in% names(object)) {
    if (npts > np) {
      stop('Cannot improve BSM resolution without data (set "eco.mem" to 0).')
    } else {
      # Just select the first npts breakpoints but cannot get fitted values and residuals
      return(brokenstick(object$pts.x, object$pts.y, npts, eco.mem = 4))
    }
  }
  
  # If Hi-Res data are available...
  if (np < npts) {
    # Resume process at last iteration and go on to requested number of break points
    pts <- object$pts ; pts.no <- object$pts.no
    while (np < npts) {
      brkstk <- bsmfit(object$data, pts)
      absRes <- abs(brkstk$residuals)
      pts <- c(pts, which.max(absRes))
      pts.no <- c(pts.no, max(pts.no) + 1L)
      pts <- pts[ord <- order(pts)] ; pts.no <- pts.no[ord]
      dup <- duplicated(pts)
      if (any(dup)) {
        warning('Duplicated points found. "npts" may be too high.')
        # Stop if duplicated break points not allowed ("allow.dup" arg)
        if (!allow.dup) {
          pts <- pts[!dup] ; pts.no <- pts.no[!dup]
          break
        }
      }
      np <- length(pts)
    }
  } else {
    # Just select the first npts breakpoints
    cnd <- object$pts.no <= (npts - 1)
    pts <- object$pts[cnd]
    pts.no <- object$pts.no[cnd]
  }
  
  # Refit BSM to get the correct coefficients, fitted values and residuals
  x <- bsmfit(object$data, pts, ...)
  x$pts.no <- pts.no
  
  # Format output
  x$na.action <- "if"("na.action" %in% names(object), object$na.action, NULL)
  class(x) <- c("bsm", "list")
  x
}

#' Brokenstick model predictions
#' 
#' @param object Object of class inheriting from "\code{bsm}".
#' @param newdata An optional data frame in which to look for variables with 
#' which to predict. If omitted, the fitted values are used.
#' @param ... other arguments.
#' @export
#' @details Require \code{eco.mem <= 4} (see \code{\link{bsmfit}}). When 
#' using \code{newdata} argument, the function returns \code{NAs} for values that 
#' do not belong to a stick.
#' @seealso \code{\link{brokenstick}}, \code{\link{optBrokenstick}}, 
#' \code{\link{which.stick}}
#' @keywords internal brokenstick
#' @examples
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 100, obj = exses)[[1]]
#' bsm <- brokenstick(dv) 
#' plot(bsm, data = TRUE, enumerate = TRUE)
#' 
#' ypred <- predict(bsm, newdata = xpred <- sample(dv$time, 10))
#' points(xpred, ypred, col = 3, pch = 19)
predict.bsm <- function(object, newdata, ...) {
  # If "newdata" not provided, return fitted values else make new prediction
  if (missing(newdata)) {
    if (is.null(fitted(object))) {
      if (!"data" %in% names(object)) {
        stop("Impossible without x values. ", 'Consider add theses or set "eco.mem" less than 2.')
      } else {
        # Preventive programming, this case should not occur:
        # "data" slot is missing before "fitted" slot when "eco.mem" argument is used
        n <- diff(object$pts)
        n[1] <- n[1] + 1 
        return(rep(object$slope, n) * object$data$x + rep(object$intercept, n))
      }
    } else {
      return(fitted(object))
    }
  } else {
    # Find out which coefficients must be applied
    stks <- which.stick(object, newdata)
    if (is.POSIXct(newdata)){newdata <- as.numeric(newdata)}
    return(object$slope[stks] * newdata + object$intercept[stks])
  }
  
  # Preventive programming, for bug reports
  stop('Unexpected case occured in "predict.bsm".')
}

#' To which brokenstick segment a point belongs to ?
#' 
#' Given a brokenstick model and a set of points, the function determines on which 
#' stick the points are located.
#' 
#' @param object Object of class inheriting from "\code{bsm}".
#' @param pts The set of the points to match aginst sticks.
#' @param type The type of values provided in \code{pts}. To choose in 
#' \code{c('x', 'i')} where \code{'x'} stands for x values and \code{'i'} stands 
#' for the index of values.
#' @export
#' @seealso \code{\link{predict.bsm}}
#' @keywords internal brokenstick
#' @examples
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 100, obj = exses)[[1]]
#' bsm <- brokenstick(dv) 
#' (pts <- sample(1:nrow(dv), 5))
#' which.stick(bsm, pts, type = 'i')
#' which.stick(bsm, dv[pts, 1], type = 'x')
#' 
#' \dontrun{
#' # For the actual points of the model the result does not matter so much
#' # since both of previous and next segment are valid for prediction.
#' which.stick(bsm, bsm$pts, 'i')
#' }
which.stick <- function(object, pts, type = c("x", "i")) {
  bsm.pts <- switch(match.arg(type), x = object$pts.x, i = object$pts)
  eql <- lapply(pts, function(x) x == bsm.pts)
  grt <- lapply(pts, function(x) max(which(x >= bsm.pts) %else% NA))
  lst <- lapply(pts, function(x) min(which(x <= bsm.pts) %else% NA))
  
  # Function returning scitck number given the location of data in comparison to 
  # break points
  .f <- function(eql, grt, lst) {
    if (any(eql)) {ifelse(which(eql) == length(eql), which(eql) - 1, which(eql))}
    else if (is.na(grt) || is.na(lst)) {NA}
    else {grt}
  }
  out <- mapply(.f, eql, grt, lst)
  if (any((len <- sapply(out, length)) != 1)) {
    warning("Some data points were matched by several segments.\n ", 
            "This issue can be related to non matching duplicates in x/y data.\n ",
            "Here, the last matching segment is chosen every time this issue show up.")
    out <- sapply(out, last)
  }
  as.integer(out)
}

#' Extract brokenstick models coefficients
#' 
#' @param object \code{bsm} object, typically result from \code{\link{brokenstick}} 
#' or \code{\link{optBrokenstick}}.
#' @param ... other arguments.
#' @return A data frame with slope and intercepts of each stick.
#' @seealso \code{\link{brokenstick}}, \code{\link{optBrokenstick}} for model fitting.
#' @export
#' @keywords internal brokenstick
#' @details Require \code{eco.mem <= 4} (see \code{\link{bsmfit}}).
#' @examples
#' data(ses)
#' dv <- tdrply(identity, 1:2, no = 100, obj= exses)[[1]]
#' coef(brokenstick(dv))
coef.bsm <- function(object, ...) {
  # Just format corresponding slots into a data.frame output
  out <- data.frame(intercept = object$intercept, slope = object$slope)
  row.names(out) <- paste0("seg", seq(nrow(out)))
  out
}

#' Plot brokenstick models
#' 
#' @param x \code{bsm} object to plot, typically result from \code{\link{brokenstick}} 
#' or \code{\link{optBrokenstick}}.
#' @param add If true add the plot to the already existing plot.
#' @param type Character indicating the type of plotting; actually any of the 
#' types as in \code{\link{plot.default}}.
#' @param lwd The line width, a positive number, defaulting to 2.
#' @param ylim the y limits (y1, y2) of the plot. 
#' Here y1 > y2 and leads to a "reversed axis".
#' @param col A specification for the default plotting color.
#' @param col.pts Color of the breakpoints (can be of length > 1). 
#' @param enumerate A switch to indicate if the iteration number of points should 
#' be added to the plot.
#' @param data Should the data used to fit the BSM be plotted as well ?
#' @param xlab A title for the x axis: see \code{\link{title}}.
#' @param ylab A title for the y axis: see \code{\link{title}}.
#' @param ... Further graphical parameters (see \code{\link{par}}) may also be 
#' supplied as arguments.
#' @details Require \code{eco.mem <= 4} (see \code{\link{bsmfit}}).
#' @seealso \code{\link{brokenstick}}, \code{\link{optBrokenstick}} 
#' @keywords internal brokenstick
#' @export
#' @examples
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 100, obj= exses)[[1]]
#' bsm <- brokenstick(dv) 
#' plot(bsm, data = TRUE, enumerate = TRUE)
#' 
#' # Similar (BUT plot.tdr draws a POSIXct x axis while plot.bsm draws a numeric axis)
#' plot(dv) 
#' plot(bsm, add = TRUE, enumerate = TRUE)
plot.bsm <- function(x, type = "b", lwd = 2, ylim = rev(range(xy$y)), 
                     add = FALSE, col = 1, col.pts = col, 
                     enumerate = FALSE, data = FALSE, 
                     xlab = NULL, ylab = NULL, ...) {
  # Argument checking in relation to plot type & colors
  type %in% c("b", "l", "p") || stop("Only types 'p', 'l', and 'b' are supported.")
  length(col) == 1 || stop("Only 'col.pts' can have a length > 1")
  
  # If xlab/ylab NULL, get the names of xy variables if available
  nms <- c(
    xlab %else% "if"("data" %in% names(x), names(x$data)[1], "bsm x"), 
    ylab %else% "if"("data" %in% names(x), names(x$data)[2], "bsm y")
    )
  
  # Generate BSM abstracted profile
  y <- predict(x, newdata = x$pts.x)
  valid_pred <- is.finite(y) # May occur because of infinite bsm coefficients.
  if (any(!valid_pred)) warning("Non-finite values were predicted. Omitting them.")
  xy <- xy.coords(x$pts.x[valid_pred], y[valid_pred], "x", "y")
  
  # Draw it
  if (length(col.pts) > 1) {
    type %in% c("b", "p") || stop("Length of 'col.pts' should be 1 when 'type' = 'l'.")
    if (!add) plot(xy, type = "n", ylim = ylim, xlab = nms[1], ylab = nms[2], ...)
    Map(points, x = xy$x, y = xy$y, lwd = lwd, col = col.pts, ...)
    if (type == "b") lines(xy, type = "b", pch = NA, lwd = lwd, col = col, ...)
  } else {
    if (add) {
      lines(xy, type = type, lwd = lwd, col = col, ...)
    } else {
      plot(xy, type = type, lwd = lwd, ylim = ylim, col = col, 
           xlab = nms[1], ylab = nms[2], ...)
    }
  }
  
  # If break points numbering is requested
  if (enumerate) text(xy, labels = x$pts.no, adj = c(1.5, 1.5), col = col.pts, cex = .8)
  
  # Hi-Res data can be added if requested and available
  if (data) {
    ("data" %in% names(x) && !is.null(x$data)) || stop('"data" slot is missing in "x".')
    lines(x$data)
  }
  
  invisible(NULL)
}

#' Extract brokenstick model residuals
#' 
#' @param object \code{bsm} object, typically result from \code{\link{brokenstick}} 
#' or \code{\link{optBrokenstick}}.
#' @param type To choose in \code{c('normal', 'absolute')}. The second choice 
#' returning the absolute value of the first.
#' @param newdata \code{x} and \code{y} data to used when interested in specific 
#' residual values (\code{x} in first column, \code{y} in second column).
#' @param ... Other arguments.
#' @return Returns model residuals if \code{newdata} is omited. Returns residuals 
#' computed from \code{newdata} otherwise. In this case, returns \code{NAs} for 
#' values that do not belong to a stick .
#' @details Require \code{eco.mem <= 4} if using \code{newdata} argument but requires 
#' \code{eco.mem <= 2} otherwise (see \code{\link{bsmfit}}).
#' @seealso \code{\link{brokenstick}}, \code{\link{optBrokenstick}}
#' @export
#' @keywords internal brokenstick
#' @examples
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 100, obj= exses)[[1]]
#' bsm <- brokenstick(dv)
#' plot(residuals(bsm)) ; abline(v = bsm$pts, h = 0, col = "grey")
residuals.bsm <- function(object, type = c("normal", "absolute"), newdata, ...) {
  # If "newdata" not provided, return residual slot else make new prediction
  if (!missing(newdata)) {
    if (length(newdata) < 2) {stop('Please provide "x" and "y" data in "newdata" argument.')}
    ypred <-  predict.bsm(object, newdata = newdata[ , 1])
    out <- newdata[ , 2] - ypred
  } else {
    if (all(!"residuals" %in% names(object))) {
      stop('Cant return residuals without "residuals" slot or "newdata" argument.')
    } else {
      out <- object$residuals
    }
  }
  
  # Format output to requested "type"
  switch(match.arg(type), normal = out, absolute = abs(out))
}

#' Fitting automatic brokenstick models
#' 
#' \code{optBrokenstick} is similar to \code{\link{brokenstick}} except that a 
#' cost function can be used to determine the optimal number of points.
#' 
#' @inheritParams brokenstick
#' @param x The x data. Note that the \code{x} values have to be sorted. Can also 
#' be a list of \code{x} and \code{y} data as returned by \code{\link{xy.coords}}.
#' @param y The y data.
#' @param threshold A threshold value for the cost function to be used instead of 
#' the minimum. If provided the search of a local minimum in the cost function is 
#' abandoned.
#' @param cost The cost function to use. Some are included in the package such as 
#' \code{\link{max_dist_cost}}, \code{\link{dist_per_pt_cost}}, 
#' \code{\link{rss_cost}}, \code{\link{dzi_cost}} etc. Feel free to use a custom one.
#' @param npmin Minimun number of points.
#' @param npmax Maximum number of points.
#' @return Same as \code{\link{brokenstick}} with the value of the cost function.
#' @seealso \code{\link{brokenstick}} and \code{\link{predict.bsm}}, \code{\link{residuals.bsm}}, 
#' \code{\link{update.bsm}}, \code{\link{summary.bsm}}, 
#' \code{\link{coef.bsm}}, \code{\link{plot.bsm}}, \code{\link{as.data.frame.bsm}}
#' for other functions with a S3 method for \code{bsm} objects. 
#' @export
#' @keywords brokenstick
#' @examples
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 90, obj = exses)[[1]]
#' 
#' bsm_6p <- brokenstick(dv, npts = 6)
#' plot(bsm_6p, data = TRUE)
#' 
#' bsm_30m <- optBrokenstick(dv, threshold = 30, cost = max_dist_cost) 
#' plot(bsm_30m, add = TRUE, col = 2, lty = 2, enumerate = TRUE, 
#'      col.pts = (bsm_30m$pts.no > 5) + 1)
#' 
#' bsm_5m <- optBrokenstick(dv, threshold = 5, cost = max_dist_cost) 
#' plot(bsm_5m, add = TRUE, col = 3, lty = 3, enumerate = TRUE, 
#'      col.pts = (bsm_5m$pts.no > max(bsm_30m$pts.no)) + (bsm_5m$pts.no > 5) + 1)
optBrokenstick <- function(x, y = NULL, threshold, cost = max_dist_cost, 
                           npmin = 2, npmax = Inf, start = NULL, na.action, ...) {
  # Check inputs consitency
  if (npmin < length(start)) 
    stop('"npmin" must be equal or greater than the length of "start".')
  if (npmax <= length(start)) 
    stop('"npmax" must be greater than the length of "start".')
  if (npmax <= npmin) 
    stop('"npmax" must be greater than "npmin".')
  
  # Set defaults for "na.action" and "threshold"
  if (missing(na.action)) na.action <- options("na.action")[[1]]
  if (missing(threshold)) threshold <- -Inf
  
  # Initiate algorithm
  bsm0 <- brokenstick(x, y, npmin, start, na.action, ...)
  S0 <- cost(bsm0)
  # Start iterations
  repeat {
    # Stop if max number of break point or if is the threshold cost is reached
    if (S0 <= threshold || length(bsm0$pts) == npmax) break
    bsm <- update(bsm0, npts = length(bsm0$pts) + 1)
    S <- cost(bsm)
    # Stop if threshold does not make sense or cost increases
    if (is.infinite(threshold) && S >= S0) break
    bsm0 <- bsm ; S0 <- S
  }
  bsm <- update(bsm0, npts = length(bsm0$pts), ...)
  
  # Format output
  bsm$cost <- cost(bsm)
  bsm
}

#' Get the maximun residual of a BSM at a given iteration 
#' 
#' @param x \code{bsm} object as returned by \code{\link{brokenstick}} or 
#' \code{\link{optBrokenstick}}.
#' @param iter the iteration number for which the maximum residual is to be 
#' returned (Ri). If \code{NULL} then \code{iter} is set to the last iteration 
#' in \code{x}
#' @inheritParams residuals.bsm
#' @export
#' @keywords internal brokenstick
#' @examples 
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 100, obj = exses)[[1]]
#' bsm <- brokenstick(dv) 
#' max_residual(bsm, iter = 5)
#' 
#' \dontrun{
#' max_residual(eco.mem(bsm), iter = 5) # error
#' max_residual(eco.mem(bsm), iter = 4)
#' }
max_residual <- function(x, iter = NULL, type = c("normal", "absolute")) {
  # Check inputs and set iter to default value when necessary
  is.bsm(x) || stop('x must be of class "bsm"')
  iter <- iter %else% (length(x$pts.x) - 1)
  
  # Update "x" to necessary number of break point given "iter"
  npts <- iter + 1
  bsm0 <- update(x, npts)
  
  # If data slot is available use it so that any iteration can be asked
  if ("data" %in% names(x) && !is.null(x$data)) {
    res <- resid(bsm0)
    out <- res[which.max(abs(res))]
  } else {
    bsm1 <- try(update(x, npts + 1), TRUE) %else% stop('Requested iteration ', 
                                                       'number "iter" is too high.')
    rk <- which.max(bsm1$pts.no)
    out <- predict(bsm0, newdata = bsm1$pts.x[rk]) - bsm1$pts.y[rk]
  }
  
  # Format output to requested "type"
  switch(match.arg(type), normal = out, absolute = abs(out))
}

#' Compute goodness of fit of brokenstick models: Dive Zone Index (DZI)
#' 
#' An index of the goodness of fit for brokenstick models. Value of the dive zone 
#' index ranges from 0 (perfect fit) to 1. See original article in references.
#' 
#' @param x a \code{bsm} object as returned by \code{\link{brokenstick}} or 
#' \code{\link{optBrokenstick}}.
#' @param iter the iteration number for which the DZI is to be computed. If 
#' \code{iter = NULL} then it is set to last BSM iteration.
#' @param n Optional. The number of points to use when calculating the dive zone limits. 
#' if \code{NULL} then \code{n} is set to the number of record in the dataset used to 
#' fit the BSM (when available) or 500 (when not available)
#' @export
#' @keywords brokenstick
#' @details If the original TDR data are not available to the function the DZI 
#' of the last BSM iteration will be computed using the maximum residual from the 
#' previous iteration.
#' @return A \code{dzi} object with:
#' \itemize{
#'   \item dzi The dive zone index obtained at each iteration.
#'   \item max_res The maximum residuals at each iteration.
#'   \item dz_Lbnd Dive zone lower bound.
#'   \item dz_Ubnd Dive zone upper bound.
#'   \item dz_width The difference between the two previous slots i.e. the vertical 
#'   width of the dive zone.
#'   \item dz_Xval A vector giving the x values corresponding to \code{dz_Lbnd}, 
#'   \code{dz_Ubnd} and \code{dz_width}.
#'   \item no_seg A vector giving the number of BSM segments to which the 
#'   \code{dz_Xval} belong.
#'   \item seg_width Vertical width covered by BSM segments.
#'   \item seq_length Duration of BSM segments.
#'   \item pts.x, pts.y, pts.no Breakpoints information inherited from \code{x}.
#'   \item data The raw data of input BSM when available.
#' }
#' @references Photopoulou, T., Lovell, P., Fedak, M. A., Thomas, L. and 
#' Matthiopoulos, J. (2015). Efficient abstracting of dive profiles using a 
#' broken-stick model. Methods Ecol Evol 6, 278-288. 
#' @examples 
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 100, obj = exses)[[1]]
#' bsm <- brokenstick(dv) 
#' dzi <- dive_zone_index(bsm)
#' 
#' dzi
#' str(dzi)
#' plot(dzi)
dive_zone_index <- function(x, iter = NULL, n = NULL) {
  is.bsm(x) || stop('x must be of class "bsm"')
  
  # Compute constants
  npts_ini <- max(x$pts.no)
  npts_ini > 1 || stop('"x" must have at least 3 break points.')
  iter <- iter %else% npts_ini
  rng_y <- range(x$pts.y)
  if (has_data <- ("data" %in% names(x) && !is.null(x$data))) {
    n <- nrow(x$data)
    newx <- x$data[ , 1]
  } else {
    if (iter >= npts_ini) warning('Maximum residual (Ri) is not computable for the requested iteration number "iter".', 
                                  'The last Ri available will be used instead.')
    n <- n %else% 500
    newx <- seq(min(x$pts.x), max(x$pts.x), length.out = n)
  }
  
  # Initiate self-dependent variables
  R <- dz_index <- NULL
  Ubnd <- -(Lbnd <- rep(Inf, n))
  
  for (i in seq(1, iter)) {
    bsm_i <- update(x, npts = i + 1) # BSM at iteration i
    bsm_i_pred <- predict(bsm_i, newx)
    R <- c(R , try(max_residual(x, i, type = "absolute"), TRUE) %else% NULL)
    Ri <- R[length(R)]
    
    # Compute Dive Zone limits and the corresponding index
    Ubnd <- pmax(bsm_i_pred - Ri, rng_y[1], Ubnd)
    Lbnd <- pmin(bsm_i_pred + Ri, rng_y[2], Lbnd)
    dz_width <- Lbnd - Ubnd
    dz_index <- c(dz_index, sum(dz_width) / (diff(rng_y) * n))
  }
  
  # Format output
  out <- list(
    dzi = setNames(dz_index, paste0("DZI", seq(1, iter))), 
    max_res = setNames(R, paste0("R", seq(1, length(R)))), 
    dz_Lbnd = Lbnd, dz_Ubnd = Ubnd, dz_width = dz_width, 
    dz_Xval = newx, no_seg = which.stick(bsm_i, newx), 
    seg_width = abs(diff(bsm_i$pts.y)), seg_length = abs(diff(bsm_i$pts.x)), 
    pts.x = x$pts.x, pts.y = x$pts.y, pts.no = x$pts.no, 
    data = "if"(has_data, x$data, NULL)
  )
  class(out) <- c("dzi", "list")
  out
}

#' Cost functions for automatic brokenstick models.
#' 
#' Given a model and data the function returns a single value which is used to 
#' determine if the number of points is optimized: the cost function value being 
#' minimized by the optimal set of parameters.
#'
#' @param object Object of class inheriting from "\code{bsm}".
#' @return A statistic to minimize. 
#' @details \code{max_dist_cost} In this function the statistic is the maximun 
#' distance between an 
#' observation and its fitted value. Hence the function constantly decrease with 
#' increasing number of point in the model and a threshold must be provided along 
#' with this function to avoid infinite looping. Yet, the pros of this cost function 
#' is a threshold that is simple to determine and to interpret.
#' @seealso \code{\link{optBrokenstick}}
#' @keywords internal brokenstick
#' @export
max_dist_cost <- function(object) {
  max_residual(object, type = 'absolute')
}

#' @rdname max_dist_cost
#' @inheritParams max_dist_cost
#' @details \code{dist_per_pt_cost} In this function the statistic is the 
#' average distance between observations and fitted values divided by the number of
#' points used by the model. This function has does rech a minimun value but 
#' generally for high numbers of points.
#' @keywords internal brokenstick
dist_per_pt_cost <- function(object) {
  res <- residuals.bsm(object, type = 'absolute')
  mean(res, na.rm = TRUE) / length(object$pts)
}

#' @rdname max_dist_cost
#' @inheritParams max_dist_cost
#' @details \code{rss_cost} In this function the statistic is the sum of 
#' squared residuals.
#' @keywords internal brokenstick
rss_cost <- function(object) {
  sum(residuals.bsm(object)^2)
}

#' @rdname max_dist_cost
#' @inheritParams max_dist_cost
#' @details \code{dzi_cost} In this function the statistic is the Dive Zone Index. 
#' See details in \code{\link{dive_zone_index}}
#' @export
#' @keywords internal brokenstick
dzi_cost <- function(object) {
  if (max(object$pts.no) == 1) stop('"object" must have at least 3 break points. ', 
                                    'Set "npmin" to 3 in "optBrokenstick"')
  last(dive_zone_index(object)$dzi)
}

#' Coerce brokenstick model to data.frame
#' 
#' @param x a brokenstick model.
#' @inheritParams base::as.data.frame
#' @export
#' @keywords internal brokenstick
#' @examples 
#' data(exses)
#' bsm <- tdrply(brokenstick, 1:2, no = 50:53, obj = exses)
#' lapply(bsm, as.data.frame)
as.data.frame.bsm <- function(x, row.names = NULL, ...) {
  n <- length(x$pts.x)
  df <- data.frame(st_tm = x$pts.x[-n], ed_tm = x$pts.x[-1], no_seg = seq(1, n-1),
                   bsm_slope = x$slope, intercept = x$intercept, duration = diff(x$pts.x))
  as.data.frame(df, row.names = row.names, ...)
}

#' Print summary of a brokenstick model
#' 
#' @param object a brokenstick model
#' @param ... for method consistency
#' @export
#' @keywords internal brokenstick
#' @examples 
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 53, obj = exses)[[1]]
#' summary(brokenstick(dv))
#' summary(brokenstick(dv, npts = 12))
summary.bsm <- function(object, ...) {
  df <- as.data.frame(object)
  print(df[ , -(1:2)])
  r2 <- 1 - (var(resid(object)) / var(object$data[ , 2]))
  cat("\nR-squared =", r2, "\nMax residual =", maxr <- max_residual(object), 
      "\nMean squared resiluals =", meanr <- mean(resid(object)^2), 
      "\nDive Zone Index =", last((dzi <- dive_zone_index(object))$dzi))
  invisible(list(df = df[ , -(1:2)], r2 = r2, max_res = maxr, 
                 mean_res = meanr, dzi = dzi))
}

#' General S3 utils for bsm objects
#' 
#' @param pts.x x coordinates of breakpoints. If \code{pts.x} is a list then 
#' it is interpreted as being a \code{"bsm"} object and is returned as is with 
#' updated class. Eventually \code{x} can be a table formated in the SMRU format 
#' (see details section).
#' @param pts.y y coordinates of breakpoints
#' @param ... other BSM slots such as \code{"data"}, \code{"fitted"} or 
#' \code{"residuals"}. See details of slots in \code{\link{brokenstick}}.
#' @details SMRU format refers to a table with 80 columns extracted from Access 
#' database on the SMRU Instrumenation website. See the link in references for 
#' the description of the object ("dive" table, page 4).
#' @references \url{http://www.smru.st-andrews.ac.uk/protected/specs/DatabaseFieldDescriptions.pdf}
#' @export
#' @keywords internal brokenstick
#' @examples
#' data(exses)
#' dv <- tdrply(identity, 1:2, no = 100, obj = exses)[[1]]
#' bsm <- brokenstick(dv) 
#' 
#' as.bsm(bsm$pts.x, bsm$pts.y)
#' as.bsm(bsm$pts.x, bsm$pts.y, residuals = "dummy")
#' as.bsm(bsm)
as.bsm <- function(pts.x, pts.y = NULL, ...) {
  # Names of SMRU "dive" tables. "RESIDUAL" column ???
  nms_smru_db <- c("ref", "PTT", "CNT", "DE_DATE", "SURF_DUR", "DIVE_DUR", "MAX_DEP", 
                   "D1", "D2", "D3", "D4", "V1", "V2", "V3", "V4", "V5", "TRAVEL_R", 
                   "HOMEDIST", "BOTTOM", "T1", "T2", "T3", "T4", "D_SPEED", "N_DEPTHS", 
                   "N_SPEEDS", "DEPTH_STR", "SPEED_STR", "PROPN_STR", "PERCENT_AREA", 
                   "RESIDUAL", "GRP_NUMBER", "D5", "T5", "qc", "D6", "D7", "D8", 
                   "D9", "D10", "D11", "D12", "D13", "D14", "D15", "D16", "D17", 
                   "D18", "D19", "D20", "D21", "D22", "D23", "D24", "D25", "T6", 
                   "T7", "T8", "T9", "T10", "T11", "T12", "T13", "T14", "T15", "T16", 
                   "T17", "T18", "T19", "T20", "T21", "T22", "T23", "T24", "T25", 
                   "ds_date", "start_lat", "start_lon", "lat", "lon")
  
  # Extract relevant info from SMRU "dive" table
  if (!is.null(names(pts.x)) && all(names(pts.x) %in% nms_smru_db)) {
    no_dive <- seq_along(pts.x[ , 1])
    dv_dur <- pts.x$DIVE_DUR
    dv_start <- as.POSIXct(pts.x$DE_DATE, format = "%d/%m/%Y", tz = "UTC")
    
    .f <- function(x, type) c(0, na.omit(unname(unlist(x))), "if"(type == "x", 100, 0))
    
    pts.y <- split(pts.x[ , grep("^D[0-9]+$", names(pts.x))], no_dive)
    pts.y <- lapply(pts.y, .f, type = "y")
    
    pts.x <- split(pts.x[ , grep("^T[0-9]+$", names(pts.x))], no_dive)
    pts.x <- lapply(pts.x, .f, type = "x")
    pts.x <- Map("+", Map("*", pts.x , dv_dur / 100), dv_start)
  }
  
  # Whatever it is convert it into a "bsm" object
  if (is.list(pts.x)) {
    if (is.bsm(pts.x)) {
      class(pts.x) <- c("bsm", "list")
      return(pts.x)
    } else {
      out <- Map(as.bsm, pts.x, pts.y)
    }
  } else {
    slts <- list(...)
    pts.x <- as.numeric(pts.x)
    out <- brokenstick(pts.x, pts.y, length(pts.x), eco.mem = 4, not.inj.action = "null", sort.data = TRUE)
    out[names(slts)] <- slts
  }
  out
}

#' @rdname as.bsm
#' @param x an object to test.
#' @export
#' @keywords internal brokenstick
#' @examples 
#' \dontrun{
#' is.bsm(bsm)
#' }
is.bsm <- function(x) is(x, "bsm")

#' Print method for bsm objects
#' @param x a bsm object
#' @param ... for generic compatibility
#' @export
#' @keywords internal brokenstick
print.bsm <- function(x, ...) print(as.data.frame(x))

#' Print method for dzi objects
#' @param x a dzi object
#' @param ... for generic compatibility
#' @export
#' @keywords internal brokenstick
print.dzi <- function(x, ...) print(x$dzi)

#' Plot method for "dzi" objects
#' @param x a \code{"dzi"} object
#' @param dz_col color of dive zone area
#' @param dz_border color of the border of the dive zone area
#' @param enumerate should the order of BSM break points be enumerated.
#' @param ... Arguments to be passed to methods, such as graphical 
#' parameters (see \code{\link{par}}).
#' @export
#' @keywords brokenstick
plot.dzi <- function(x, dz_col = "lightblue", dz_border = "blue", enumerate = TRUE, ...) {
  has_data <- ("data" %in% names(x) && !is.null(x$data))
  iter <- length(x$dzi)
  
  # Make plot drawing area
  if (has_data) 
    plot(x$data, type = "n", ylim = rev(range(x$data[ , 2])), ...)
  else 
    plot(x$pts.x, x$pts.y, type = "n", ylim = rev(range(x$pts.y)), ...)
  
  # Plot dive zone
  polygon(c(x$dz_Xval, rev(x$dz_Xval)), c(x$dz_Lbnd, rev(x$dz_Ubnd)), 
          col = dz_col, border = dz_border)
  
  # Add dive profiles (hi-res if available + abstracted profile)
  if (has_data) lines(x$data)
  plot(brokenstick(x$pts.x, x$pts.y, npts = iter + 1), enumerate = enumerate, add = TRUE)
  
  # Add title
  title(paste0("Iteration ", iter, " (", iter + 1, " brkpts): ", 
               "R", length(x$max_res), " = ", round(x$max_res[length(x$max_res)], digits = 2), ", ", 
               "DZI", iter, " = ", round(x$dzi[iter], digits = 2)))
  
  invisible(NULL)
}

#' Subset the slots of bsm objects
#' 
#' \code{bsm} objects, depending on the \code{eco.mem} argument used when they were 
#' fitted using the \code{\link{brokenstick}} function, can contain numerous slots 
#' (detailed in \code{\link{brokenstick}}) keeping information about the high 
#' sampling frequency data. 
#' These information are usefull to get accurate computations in numerous cases but 
#' need to be ignored in order to mimic abstracted dive profiles such as 
#' those obtained by CTD-SRDL tags.
#' This function is a utility that allows to ignore these high sampling fequency 
#' information.
#' 
#' @param x a \code{bsm} object or a list of \code{bsm} objects.
#' @param n set the number of slot to ignore as the \code{eco.mem} argument in the 
#' \code{\link{brokenstick}} function. The default \code{n = 4} returns 
#' abstracted dive profiles as if they were obtained from CTD-SRDL tags.
#' @param type A character indicating how the slots are to be handled. 
#' \code{"ignore"} puts aside the slots (renaming them) so they are ignored by other 
#' \code{bsm} processing functions but does not remove them. \code{"delete"} delete 
#' them so that less memory is used to store the object. 
#' \code{"reset"} reverse the \code{"ignore"} operation.
#' @export
#' @keywords brokenstick
#' @examples 
#' data(exses)
#' bsm <- tdrply(brokenstick, 1:2, no = 100, obj = exses)[[1]]
#' 
#' bsm <- eco.mem(bsm, type = "ignore")
#' try(predict(bsm)) # error: required slots treated as absent
#' bsm <- eco.mem(bsm, type = "reset")
#' predict(bsm) # slots were restored
#' identical(bsm, x)
#' 
#' # Use type = "delete" to save some memory
#' lr_bytes <- object.size(eco.mem(bsm, type = "delete")) 
#' hr_bytes <- object.size(bsm)
#' paste0(round(100 * as.numeric(lr_bytes / hr_bytes), digits = 2), "%")
#' 
#' # Works on lists
#' bsm <- eco.mem(tdrply(brokenstick, 1:2, no = 100:103, obj = exses), type = "delete")
#' # Same as
#' bsm <- tdrply(brokenstick, 1:2, no = 100:103, obj = exses, eco.mem = 4)
eco.mem <- function(x, n = 4, type = c("ignore", "delete", "reset")) {
  if (is.bsm(x)) {
    if (n > 4) stop('"n" must be between 0 and 4.')
    n_subset <- "if"(n != 0L, -unique(seq(8L - n + 1L, 8L)), seq_along(x))
    type <- match.arg(type)
    if (type == "ignore") {
      nms <- names(x)
      nms[-n_subset] <- paste0("ignored.", nms[-n_subset])
      out <- setNames(x, nms)
    } else if (type == "delete") {
      out <- do.call(as.bsm, x[n_subset])
    } else if (type == "reset") {
      nms <- names(x)
      nms[-n_subset] <- gsub("ignored\\.", "", nms[-n_subset])
      out <- setNames(x, nms)
    }
    return(out)
  } else {
    if (is.list(x)) return(lapply(x, eco.mem))
  }
}
SESman/rbl documentation built on May 9, 2019, 11:10 a.m.