R/multivariate_tsout_ensemble.R

Defines functions compute_method_weights tsout_ensemble mv_tsout_ens

Documented in mv_tsout_ens

#' Performs multivariate time series outlier ensembling.
#'
#' @param x A  data frame or a matrix object containing a multivariate time series
#' @param m1 Variable indicating dimension reduction methods. Default is set to using all 4 methods: PCA, DOBIN, ICS and ICA.
#' @param ncomp The number of components for each dimension reduction method. Default is set to 2.
#' @param sds The random seed for generating a no-outlier time series.
#' @param rept The number of repetitions for generating a no-outlier time series.
#' @param compr To adjust for multiple testing, the results of the ensemble are compared with the results of a time series without outliers. If \code{compr =1}, a time series is simulated as in \code{simulate_comp_ts} without outliers. If \code{compr = 2}, the top outliers are removed from the outlier series and interpolated values are used for those time points. If \code{compr = 3} both methods of simulation are used for comparison.
#' @param rat A comparison is done with the outliers removed time series. The variable \code{rat} denotes the ratio of outliers to be removed as a proportion of the whole dataset for this comparison.
#' @param fast For faster computation skip ICS decomposition method.
#'
#' @return A list with the following components:
#' \item{\code{outliers}}{The outliers detected from the multivariate ensemble after comparing with the comparison time series without outliers. }
#' \item{\code{all}}{All the outliers detected from the multivariate ensemble. }
#' \item{\code{outmat}}{A matrix with outlier scores organised by outlier method.}
#' \item{\code{wts}}{The weights of the outlier detection methods.}
#' \item{\code{pca_loadings}}{The basis vectors from PCA.}
#' \item{\code{dobin_loadings}}{The basis vectors from DOBIN. See R package \code{dobin} for more details.}
#' \item{\code{ics_loadings}}{The basis vectors from ICS. See R package \code{ICS} for more details.}
#' \item{\code{ica_loadings}}{The basis vectors from Independent Component Analysis.}
#' \item{\code{decomp_wts}}{Each decomposition method has several components. For example if \code{ncomp}=2, then there are 2 PC components, 2 DOBIn components, etc ... The weight of each component is given different and depends on the decomposition method. These weights are given in \code{decomp_wts}.}
#' \item{\code{outmat4D}}{A 4D array with outlier scores organised by outlier method, decomposition method, components for each decomposition method and time.}
#'
#' @examples
#' \dontrun{
#' set.seed(100)
#' n <- 600
#' x <- sample(1:100, n, replace=TRUE)
#' x[25] <- 200
#' x[320] <- 300
#' x2 <- sample(1:100, n, replace=TRUE)
#' x3 <- sample(1:100, n, replace=TRUE)
#' x4 <- sample(1:100, n, replace=TRUE)
#' X <- cbind.data.frame(x, x2, x3, x4)
#' out <- mv_tsout_ens(X, m1=c(1,2,4), compr=2)
#' }
#'
#' @importFrom stats prcomp rnorm runif
#' @export
mv_tsout_ens <- function(x, m1 =NULL, ncomp=2,  sds=1, rept=1, compr=2, rat=0.05, fast=TRUE){ # wts = NULL,
  d <- dim(x)[2]
  n <- dim(x)[1]
  if(is.null(m1)){
    m1 <- 1:4
  }
  if(fast){
    if(is.null(m1)){
      m1 <- c(1,2,4)
    }else{
      m1 <- intersect(m1, c(1,2,4))
    }
  }
  out1 <- tsout_ensemble(x, m1, ncomp,  sds) # wts,
  res <- out1$all

  if((compr==1)|(compr==3)){
    # thresholds for actual outliers - from no outliers
    noout <- no_outlier_scores_1(d, n, ncomp, sd=123, rept = rept, m1)
    thres1 <- noout$thres1
    thres2 <- noout$thres2
    gap <- thres2 - thres1

    indthres1 <- which(res[ ,2] > thres1)
    outthres1 <- res[indthres1, ]
    outthres1$gapscore1 <- rep(0, length(indthres1))


    stars <-  (outthres1$Total_Score - thres2)/gap
    stars <- ceiling(stars)
    stars[stars < 0] <- 0
    outthres1$gapscore1 <- stars
  }
  if((compr==2)|(compr==3)){
    # Remove outliers over mean + sd from the list and get the outlier scores
    noout2 <- no_outlier_scores_2(x, ncomp, out1, rat=rat, m1)
    thres1 <- noout2$thres1
    thres2 <- noout2$thres2
    if(thres1!=0){
      gap <- thres2 - thres1

      indthres1 <- which(res[ ,2] > thres1)
      outthres2 <- res[indthres1, ]
      outthres2$gapscore2 <- rep(0, length(indthres1))


      stars <-  (outthres2[ ,2] - thres2)/gap
      stars <- ceiling(stars)
      stars[stars < 0] <- 0
      outthres2$gapscore2 <- stars
    }else{
      # thresholds are equal to zero
      # The second comparison method of removing outliers did not produce any outliers to compare with - so return the original data frame
      outthres2 <- res
    }
  }

  if(compr==1){
    outthres <- outthres1
  }else if(compr==2){
    outthres <- outthres2
  }else if(compr==3){
    outthres <- merge(outthres1, outthres2, all = TRUE)
  }

  outthres <- as.data.frame(outthres)
  gp1 <- which(colnames(outthres) %in% c("gapscore1"))
  if(length(gp1)>0){
    colnames(outthres)[gp1] <- "Gap_Score_1"
  }
  gp2 <- which(colnames(outthres) %in% c("gapscore2"))
  if(length(gp2)>0){
    colnames(outthres)[gp2] <- "Gap_Score_2"
  }

  out <- list()
  out$outliers <- as.data.frame(outthres)
  out$all<- res
  out$outmat <- out1$outmat
  out$wts <- out1$wts
  out$pca_loadings <- out1$pca_loadings
  out$dobin_loadings <- out1$dobin_loadings
  out$ics_loadings <- out1$ics_loadings
  out$ica_loadings <- out1$ica_loadings
  out$decomp_wts <- out1$decomp_wts
  out$outmat4D <- out1$outmat4D

  return(out)

}



tsout_ensemble <- function(x, m1 =NULL, ncomp=NULL,  sds=1){  # wts = NULL,
  x <- as.data.frame(x)
  d <- dim(x)[2]
  n <- dim(x)[1]

  if(is.null(m1)){
    m1 <- 1:4
  }
  if(is.null(ncomp)){
    ncomp = 2
  }
  # if(is.null(wts)){
  #   wts <- 1/(1:ncomp)
  # }

  pca_loadings <- dobin_loadings <- ics_loadings <- ica_loadings <- NA
  result <- matrix(0, nrow=n, ncol=length(m1))
  outmat <- matrix(0, nrow=n, ncol=4)
  colnames(outmat) <- c("forecast", "tsoutliers", "otsad", "anomalize" )
  wts_mat <- matrix(0, nrow=ncomp, ncol=4)
  colnames(wts_mat) <- c("dobin", "pca", "ics", "ica")
  outmat4D <- array(0, dim=c(n, 4, ncomp, 4))
  # second index for 4 outlier methods
  # fourth index for 4 coord transformation methods
  for(i in 1:length(m1)){
    j=m1[i]
    obj <- tsout_ensemble_1M(x, pre=m1[i], ncomp )  # , wts
    result[ ,i] <- obj$score
    outmat <- outmat +  obj$outmat
    if(j==1){
      dobin_loadings <- obj$load_vectors
      wts_mat[ ,1] <- obj$wts
      outmat4D[ , , ,1] <- obj$outmat3D
    }else if(j==2){
      pca_loadings <- obj$load_vectors
      wts_mat[ ,2] <- obj$wts
      outmat4D[ , , ,2] <- obj$outmat3D
    }else if(j==3){
      ics_loadings <- obj$load_vectors
      wts_mat[ ,3] <- obj$wts
      outmat4D[ , , ,3] <- obj$outmat3D
    }else if(j==4){
      ica_loadings <- obj$load_vectors
      wts_mat[ ,4] <- obj$wts
      outmat4D[ , , ,4] <- obj$outmat3D
    }
  }
  # Get the weights for the different outlier methods
  wts <- compute_method_weights(outmat)
  # Scale the results by these weights
  # result <- result %*% diag(wts) # I think this needs to be outmat not result

  # Output scaled according to outlier method weights
  outmat2 <- outmat %*% diag(wts)

  result_rowsum <- apply(result, 1, sum)
  outmat_rowsum <- apply(outmat2, 1, sum)

  avg <- apply(outmat2, 1, sum)
  inds <- which(avg > 0)
  # Number of coordinates which found that point as an outlier
  num_comps_all <- apply(result, 1, function(x) sum(x>0))
  num_comps <- num_comps_all[inds]

  result[inds, ] <- result[inds, ]*(outmat_rowsum[inds]/result_rowsum[inds])

  # outmatsum_inds <- which(apply(outmat2, 1, sum) > 0 )
  outmat_positive <- outmat2 > 0
  num_methods_all <- apply( outmat_positive, 1, sum )
  num_methods <- num_methods_all[inds]

  colnames(result) <- c("DOBIN", "PCA", "ICS", "ICA")[m1]

  if(length(inds)==1){
    res <- c(inds, avg[inds], num_comps,  num_methods,  result[inds, ], outmat2[inds, ])
    res <- t(data.frame(res))
  }else{
    res <- cbind.data.frame(inds, avg[inds], num_comps,  num_methods,  result[inds, ], outmat2[inds, ])
  }

  colnames(res) <- c("Indices", "Total_Score", "Num_Coords", "Num_Methods", colnames(result), colnames(outmat) )

  tot_vals <- avg[inds]
  # thres <- mean(tot_vals) + sd(tot_vals)
  # indsgrtr <- which(res[ ,2] > thres)
  #
  # outl <- res[indsgrtr, ]

  out <- list()
  #out$outliers <- outl
  out$all<- res
  out$outmat <- outmat
  out$wts <- wts
  out$pca_loadings <- pca_loadings
  out$dobin_loadings <- dobin_loadings
  out$ics_loadings <- ics_loadings
  out$ica_loadings <- ica_loadings
  out$decomp_wts <- wts_mat
  out$outmat4D <- outmat4D
  return(out)
}


compute_method_weights <- function(outmat){
  wts <- rep(0, 4)
  for(i in 1:4){
    non_zero_inds_i <- which(outmat[ ,i] >0)
    if(length(non_zero_inds_i) >0 ){
      outmat_i <- outmat[ ,-i]
      non_zero_inds_others <- which( apply(outmat_i, 1, sum) > 0 )
      wts[i] <- length(intersect(non_zero_inds_others, non_zero_inds_i))/length(non_zero_inds_i)
    }else{
      wts[i] <- 1
    }
  }
  wts <- wts/sum(wts)
  return(wts)
}

Try the composits package in your browser

Any scripts or data that you put into this service are public.

composits documentation built on May 25, 2022, 9:11 a.m.