R/carx_outlier.R
In carx: Censored Autoregressive Model with Exogenous Covariates

Documented in ot2.carx ot.carx outlier outlier.carx

#' Detecting outliers in a  \code{carx} model.
#'
#' This is an internal function. It tests for additional outlier one at a time, for each time point, as described in
#'  Wang and Chan (2015), adjusted for multiplicity of testing. If the test result is significant, the function 
#'  augments the location of the most significant outlier to the vector of outlier indices, 
#'  i.e, the \code{outlier.indices} in the object returned by the function.
#' @param object a \code{carx} object
#' @keywords internal
#' @return a possibly updated \code{object} which will have an attribute \code{outlier.indices} 
#' denoting the indices of outliers with the new index of outlier appended, if any outlier is detected.
ot.carx <- function(object)
{
  verbose= FALSE
	#message("detecting outliers")
	nSample <- 100000
	threshold <- 0.025/object$nObs
	eps <- stats::rnorm(nSample,0,object$sigma)
	trend <- object$x%*%object$prmtrX
	covEta <- computeCovAR(object$prmtrAR, object$sigma,lag=object$p)
	nObs <- object$nObs
	p <- object$p
	prmtrAR <- object$prmtrAR
	skipIndex <- object$skipIndex
	y <- object$y
	ci <- object$ci
	lcl <- object$lcl
	ucl <- object$ucl
  finiteRows <- object$finiteRows

	if(any(ci[finiteRows]>0))
    y[finiteRows][ci[finiteRows]>0] <- ucl[finiteRows][ci[finiteRows]>0]
  if(any(ci[finiteRows]<0))
	y[finiteRows][ci[finiteRows]<0] <- lcl[finiteRows][ci[finiteRows]<0]

	pValues <- numeric(nObs)
	pValues[skipIndex] <- 1

	for(idx in seq(1,nObs)[-skipIndex])
	{
		#message(sprintf("checking %i",idx))
		wkm <- y[(idx-1):(idx-p)]
		tmpCensorIndicator <- ci[(idx-1):(idx-p)]
		nCensored <- sum(tmpCensorIndicator!=0)
    censored <- tmpCensorIndicator[tmpCensorIndicator!=0]
		if(nCensored)   #at least one is censored
		{
			if( nCensored < p ) #not all are censored
			{
				conditionalIndex <- which(tmpCensorIndicator==0) #indices of known values
				tmpY <- y[(idx-1):(idx-p)][conditionalIndex] #known values
				tmpM <- trend[(idx-1):(idx-p)]
				cdist <- conditionalDistMvnorm(tmpY, conditionalIndex,tmpM,covEta)
				tmpMean <- cdist$'mean'
				tmpVar <- cdist$'var'
			}else{
				tmpMean <- trend[(idx-1):(idx-p)]
				tmpVar <- covEta
			}
      tmpLower <- rep(-Inf,length = nCensored)
      tmpUpper <- rep(Inf,length = nCensored)
      tmpLower[censored>0] <- object$ucl[(idx-1):(idx-p)][tmpCensorIndicator>0]
      tmpUpper[censored<0] <- object$lcl[(idx-1):(idx-p)][tmpCensorIndicator<0]
			smpl <- tmvtnorm::rtmvnorm(nSample,tmpMean,tmpVar,lower=tmpLower,upper=tmpUpper,algorithm="gibbs")
			smpl <- as.matrix(smpl)
			ySmpl <- numeric(nSample)
			for(i in 1:nSample){
				wkm[tmpCensorIndicator!=0] <- smpl[i,]
				ySmpl[i] <- trend[idx] + (wkm - trend[(idx-1):(idx-p)])%*%prmtrAR + eps[i]
			}
			pU <- sum(ySmpl > y[idx])/nSample
			pL <- sum(ySmpl < y[idx])/nSample
		}
		else{
			r <- y[idx]-trend[idx] - (wkm-trend[(idx-1):(idx-p)])%*%prmtrAR
			r <- r/object$sigma
			pU <- stats::pnorm(r,lower.tail=FALSE)
			pL <- stats::pnorm(r,lower.tail=TRUE)
		}
		pValues[idx] <- min(pU,pL)
	}
	minP <- min(pValues)
	if( minP <= threshold )
  {
		i <- which(pValues == minP)
    if(verbose) message("new outlier index: ",i, ", minimum p-value (",signif(minP,digits=6),") <= threshold (", signif(threshold,digits=6),")")
    if(is.null(object$outlier.indices))
       object$outlier.indices <- i
    else
      if(!(i %in% object$outlier.indices))
       object$outlier.indices <- c(object$outlier.indices, i)
  }else
  {
    if(verbose) message("No outlier is found: minimum p-value (",signif(minP,digits=6),") > threshold (", signif(threshold,digits=6),")")
  }
  invisible(object)
}

#' Detecting outliers in a  \code{carx} model.
#'
#' This is an internal function. It tests for additional outlier one at a time, for each time point, as described in
#'  Wang and Chan (2015), adjusted for multiplicity of testing. If the test result is significant, the function 
#'  augments the location of the most significant outlier to the vector of outlier indices, 
#'  i.e, the \code{outlier.indices} in the object returned by the function. The method used to compute the p-value is different from \code{ot.carx}. 
#' @param object a \code{carx} object
#' @keywords internal
#' @return a possibly updated \code{object} which will have an attribute \code{outlier.indices} 
#' denoting the indices of outliers with the new index of outlier appended, if any outlier is detected.

#' use the probability function in tmvtnorm to test the outlier
ot2.carx <- function(object)
{
	#message("detecting outliers")
  verbose=FALSE
	nObs <- object$nObs
	prmtrAR <- object$prmtrAR
	p <- object$p
	threshold <- 0.025/object$nObs
	trend <- object$x%*%object$prmtrX
	covEta <- computeCovAR(object$prmtrAR, object$sigma,lag=p+1)
	skipIndex <- object$skipIndex
	y <- object$y
	ci <- object$ci
	lcl <- object$lcl
	ucl <- object$ucl
  finiteRows <- object$finiteRows

	if(any(ci[finiteRows]>0))
    y[finiteRows][ci[finiteRows]>0] <- ucl[finiteRows][ci[finiteRows]>0]
  if(any(ci[finiteRows]<0))
	y[finiteRows][ci[finiteRows]<0] <- lcl[finiteRows][ci[finiteRows]<0]

	pValues <- numeric(nObs)
	pValues[skipIndex] <- 1

	for(idx in seq(1,nObs)[-skipIndex])
	{
		#message(sprintf("checking %i",idx))
		tmpCensorIndicator <- ci[(idx):(idx-p)]
		nCensored <- sum(tmpCensorIndicator[-1]!=0)
    censored <- tmpCensorIndicator[-1][tmpCensorIndicator[-1]!=0]
		if(nCensored)   #at least one is censored
		{
			if( nCensored < p ) #not all are censored
			{
				conditionalIndex <- which(tmpCensorIndicator==0) #indices of known values
        if(conditionalIndex[1]==1) #first value is to be predicted
          conditionalIndex = conditionalIndex[-1]
				tmpY <- y[(idx):(idx-p)][conditionalIndex] #known values
				tmpM <- trend[(idx):(idx-p)]
				cdist <- conditionalDistMvnorm(tmpY, conditionalIndex,tmpM,covEta)
				tmpMean <- cdist$'mean'
				tmpVar <- cdist$'var'
			}else{ #all previous p values are censored
				tmpMean <- trend[(idx):(idx-p)]
				tmpVar <- covEta
			}
      tmpLower <- rep(-Inf,length = nCensored)
      tmpUpper <- rep(Inf,length = nCensored)
      tmpLower[censored>0] <- ucl[(idx-1):(idx-p)][tmpCensorIndicator[-1]>0]
      tmpUpper[censored<0] <- lcl[(idx-1):(idx-p)][tmpCensorIndicator[-1]<0]
      tmpLower <- c(-Inf,tmpLower)
      tmpUpper <- c(Inf,tmpUpper)
      if(ci[idx]<0)
        qn = object$lcl[idx]
      else{ 
        if(ci[idx]>0) 
          qn = ucl[idx]
        else{
          qn = y[idx]
        }
      }
      pval = tmvtnorm::ptmvnorm.marginal(qn,n=1,mean=tmpMean,sigma=tmpVar,lower=tmpLower,upper=tmpUpper)
      if(ci[idx]==0) 
        pval = min(pval,1-pval)
      pValues[idx] = pval
	  }else{
			r <- y[idx]-trend[idx] - (y[(idx-1):(idx-p)]-trend[(idx-1):(idx-p)])%*%prmtrAR
			r <- r/object$sigma
			pU <- stats::pnorm(r,lower.tail=FALSE)
			pL <- stats::pnorm(r,lower.tail=TRUE)
		  pValues[idx] <- min(pU,pL)
		}
	}
	minP <- min(pValues)
	if( minP <= threshold )
  {
		i <- which(pValues == minP)
    if(verbose) message("new outlier index: ",i, ", minimum p-value (",signif(minP,digits=6),") <= threshold (", signif(threshold,digits=6),")")
    if(is.null(object$outlier.indices))
       object$outlier.indices <- i
    else
      if(!(i %in% object$outlier.indices))
       object$outlier.indices <- c(object$outlier.indices, i)
  }else
  {
    if(verbose) message("No outlier is found: minimum p-value (",signif(minP,digits=6),") > threshold (", signif(threshold,digits=6),")")
  }
  invisible(object)
}


#' S3 method to detect outlier of a \code{carx} object
#'
#' Detect all outliers of a \code{carx} object.
#' @inheritParams outlier.carx
#' @return an updated \code{carx} object. If any outlier is detected, its index will be stored in the \code{outlier.indices} attribute of the return object, and prefix for variable name is stored in the \code{outlier.prefix} attribute. Note that if the original object is fitted through a formula interface, the formula will also be updated.
#' @export
#' @seealso \code{\link{outlier.carx}}.
#'
outlier <- function(object,outlier.prefix="OI_",seed=NULL) UseMethod("outlier")

#' Detect all outliers of a \code{carx} object
#'
#' Detect all outliers of a \code{carx} object and update the model if any outlier is detected.
#' It tests for the presence of outliers one at a time, for each time point, adjusted for multiplicity of testing, as described in Wang and Chan (2017).
#' @param object a \code{carx} object.
#' @param outlier.prefix the prefix used to construct variable name for indicator variables representing the detected outliers, default = "OI_".
#' @param seed the seed for randon number generator, default=\code{NULL}.
#' @return an updated \code{carx} object. If any outlier is detected, its index will be stored in the \code{outlier.indices} attribute of the return object, and prefix for variable name is stored in the \code{outlier.prefix} attribute. Note that if the original object is fitted through a formula interface, the formula will also be updated.
#' @references Wang C, Chan KS (2017). "Quasi-likelihood estimation of a censored autoregressive model with exogenous variables."  Journal of the American Statistical Association. 2017 Mar 20(just-accepted).
#
#' @export
#' @examples
#' sigma = 0.6
#' nObs = 100
#' dat = carxSimCenTS(nObs=nObs,sigma=sigma,ucl=Inf)
#' dat$y[as.integer(nObs/2)] = dat$y[as.integer(nObs/2)] + 4*sd(dat$y)
#' mdl <- carx(y~X1+X2-1,data=dat, p=2, CI.compute = FALSE)
#' oc = outlier(mdl)
#' #note the outlier indices in the output:
#' print(oc)
#' #note the updated formula:
#' print(formula(oc))

outlier.carx <- function(object,outlier.prefix="OI_",seed=131)
{
  verbose = FALSE

  if(verbose) message("Detecting outliers.")
  
  oldSeed <- ifelse(exists(".Random.seed"),.Random.seed,NULL)
  set.seed(seed)
  #if(!is.null(seed)) set.seed(seed)
  newObj = object
  newFormula = NULL
  tryCatch({newFormula=formula(object)},error=function(e){newFormula=NULL})

  oiVec = NULL
  ot = -1
  while(TRUE)
  {
    preOt <- newObj$outlier.indices
    newObj = ot2.carx(newObj)
    newOt <- newObj$outlier.indices
    if(is.null(newOt))
      break
    if(!is.null(preOt) & length(preOt) == length(newOt))
      break
    oi = numeric(newObj$nObs)
    idx <- newOt[length(newOt)]
    oi[idx] = 1
    newVar = paste0(outlier.prefix,idx)
    if(is.null(newFormula))
    {
      newx=data.frame(newObj$x)
      newx[,newVar] <- oi
      #newObj = update(newObj,x=newx) #this doesn't work.
      newObj=carx(y=newObj$y,
                  x=newx,
                  ci=newObj$ci,
                  lcl=newObj$lcl,
                  ucl=newObj$ucl,
                  p=newObj$p,
                  prmtrX = c(newObj$prmtrX,0),
                  prmtrAR = newObj$prmtrAR,
                  sigma = newObj$sigma,
                  tol = newObj$tol,
                  max.iter = newObj$max.iter,
                  CI.compute=FALSE
                 )
    }
    else
    {
      #we have formula and data

      #update the data
      if(!is.null(newObj$cenTS))
      {
        newData <- newObj$cenTS
        nms <- names(newData)
        #unfortunately I haven't find a way to assign a column to a xts with arbitrary name, hope that
        newData$newVarByChao <- oi
        names(newData) <- c(nms,newVar)
      }
      #update the formula
      newFormula =stats::update(newFormula, stats::as.formula(paste("~.+",newVar)))
      newObj <- carx(newFormula, data = newData,
                   p=newObj$p,
                   prmtrX = c(newObj$prmtrX,0),
                   prmtrAR = newObj$prmtrAR,
                   sigma = newObj$sigma,
                   tol = newObj$tol,
                   max.iter = newObj$max.iter,
                   CI.compute=FALSE
                   )
    }
    newObj$outlier.indices <- newOt
    newObj$outlier.prefix <- outlier.prefix
  }
  if(verbose) message("Detecting outliers. Done.")
  if(!is.null(oldSeed)) .Random.seed <- oldSeed
  invisible(newObj)
}