R/dataFunctions.R
In gfpop: Graph-Constrained Functional Pruning Optimal Partitioning

Documented in dataGenerator sdDiff

#' Data Generator
#'
#' @description Generating data with a given model = changepoint relative positions + parameters + type of cost + (standard deviation + gamma decay)
#' @param n number of data points to generate
#' @param changepoints vector of positions of the changepoints in (0,1] (last element is always 1).
#' @param parameters vector of means for the consecutive segments (same length as changepoints)
#' @param type a string defining the cost model to use: \code{"mean"}, \code{"variance"}, \code{"poisson"}, \code{"exp"}, \code{"negbin"}
#' @param sigma a positive number = the standard deviation of the data
#' @param gamma a number between 0 and 1 : the coefficient of the exponential decay (by default = 1 for piecewise constant signals)
#' @param size parameter of the \code{rnbinom} function
#' @return a vector of size n generated by the chosen model
#' @examples
#' dataGenerator(500, c(0.3, 0.6, 1), c(1, 2, 3), type = "mean", sigma = 0.5)
#'
#' dataGenerator(1000, c(0.2, 0.33,0.5, 1), c(4, 0.2, 3,0.5), type = "variance")
#'
#' dataGenerator(800, c(0.4, 0.8, 1), c(15, 5, 8), type = "mean", gamma  = 0.95, sigma = 0.4)
#'
#' dataGenerator(400, c(0.4, 0.9, 1), c(2, 1.5, 3), type = "poisson")
#'
#' dataGenerator(1000, c(0.44, 0.86, 1), c(0.5, 0.2, 0.4), type = "negbin", size = 3)
dataGenerator <- function(n, changepoints, parameters, type = "mean", sigma = 1, gamma = 1, size = 10)
{
  ### STOP ###
  if(is.numeric(n) == FALSE || n%%1 != 0){stop('n is not an integer.')}
  if(n < 2){stop('n must be a >= 1')}

  if(is.numeric(changepoints) == FALSE){stop('changepoints is not a numeric vector')}
  if(!all(changepoints <= 1) || !all(changepoints > 0)){stop('changepoints has values outside (0,1]')}
  if(changepoints[length(changepoints)] != 1){stop('the last value in changepoints vector is not 1')}
  if(length(changepoints) > 1)
  {
    for(i in 1:(length(changepoints)-1)){if(changepoints[i+1] <= changepoints[i]){stop('changepoints not an increasing vector')}}
  }

  allowed.types <- c("mean", "variance", "poisson", "exp", "negbin")
  if(!type %in% allowed.types){stop('type must be one of: ', paste(allowed.types, collapse=", "))}

  if(sigma < 0){stop('sigma is a nonnegative value')}
  if(gamma > 1 || gamma <= 0){stop('gamma is not between 0 and 1 (0 excluded)')}

  ### ###

  SegLength <- diff(c(0, floor(changepoints*n)))
  vectData <- NULL

  if(type == "mean")
  {
    if(gamma != 1)
    {
      decay <- NULL
      for(i in 1:length(SegLength)){decay <- c(decay, cumprod(rep(gamma, SegLength[i])))}
      vectData <- rep(parameters, SegLength)*decay + rnorm(n, 0, sigma)
    }
    else
    {
      vectData <- rep(parameters, SegLength) + rnorm(n, 0, sigma)
    }
  }

  if(type == "variance"){vectData <- rnorm(n, 0, rep(sqrt(parameters), SegLength))}
  if(type == "poisson"){vectData <- rpois(n, rep(parameters, SegLength))}
  if(type == "exp"){vectData <- rexp(n, rep(parameters, SegLength))}
  if(type == "negbin")
  {
    if(any(parameters > 1) || any(parameters <= 0)){stop('parameters for negbin type should be positive probabilities')}
    vectData <- rnbinom(n, size = size, prob = rep(parameters, SegLength))
  }

  return(vectData)
}

#' sdDiff
#'
#' @description sdDiff is a function based on the difference operator (or difference order for HALL method) estimating the time-series standard deviation.
#' The estimation works for time-series generated by Gaussian
#' random variables with constant standard deviation and multiple changes in mean.
#' Three estimators are available:
#' \itemize{
#'   \item \code{HALL} : the so-called HALL-estimator of order 3. For more details see: \emph{(1990) Asymptotically optimal difference-based estimation of variance in nonparametric regression. Authors: Hall, Peter and Kay, JW and Titterinton, DM. Biometrika, pages 521--528}
#'   \item \code{MAD} : the median absolute deviation estimator computed on \code{diff(x)/sqrt(2)} with x the vector of datapoints
#'   \item \code{SD} : the standard deviation estimator (function sd) computed on \code{diff(x)/sqrt(2)} with x the vector of datapoints
#' }
#' @param x vector of datapoints
#' @param method Three available methods: \code{"HALL"}, \code{"MAD"} and \code{"SD"}
#' @return a value equal to the estimated standard deviation
#' @examples
#' data <- dataGenerator(300, seq(0.1,1,0.1), sample(10), sigma = 2)
#' sdDiff(data, method = "HALL")
#' sdDiff(data, method = "MAD")
#' sdDiff(data, method = "SD")
sdDiff <- function(x, method = "HALL")
{
  if(is.numeric(x) == FALSE || length(x) < 5){stop('x is not a numeric vector or length < 5 (the HALL method cannot be used)')}
  if(method == "HALL")
  {
    n = length(x)
    wei <- c(0.1942, 0.2809, 0.3832, -0.8582)
    mat <- wei %*% t(x)
    mat[2, -n] = mat[2, -1]
    mat[3, -c(n-1, n)] = mat[3, -c(1, 2)]
    mat[4, -c(n-2, n-1, n)] = mat[4, -c(1, 2, 3)]
    return(sqrt(sum(apply(mat[, -c(n-2, n-1, n)], 2, sum)^2) / (n-3)))
  }
  if(method == "MAD")
  {
    return(mad(diff(x)/sqrt(2)))
  }
  if(method == "SD")
  {
    return(sd(diff(x)/sqrt(2)))
  }
  return(NULL)
}


###############################################
# invisible function for the user
getDerivativePenalty <- function(D, n, c1 = 2, c2 = 5)
{
  return(c1*log(n) - c1*(log(D) + 1) + c2)
}