R/impRZalr.R
In robCompositions: Compositional Data Analysis

Documented in impRZalr

#' alr EM-based imputation of rounded zeros
#' 
#' A modified EM alr-algorithm for replacing rounded zeros in compositional
#' data sets.
#' 
#' Statistical analysis of compositional data including zeros runs into
#' problems, because log-ratios cannot be applied.  Usually, rounded zeros are
#' considerer as missing not at random missing values. The algorithm first
#' applies an additive log-ratio transformation to the compositions. Then the
#' rounded zeros are imputed using a modified EM algorithm.
#' 
#' @param x compositional data
#' @param pos position of the rationing variable for alr transformation
#' @param dl detection limit for each part
#' @param eps convergence criteria
#' @param maxit maximum number of iterations
#' @param bruteforce if TRUE, imputations over dl are set to dl. If FALSE,
#' truncated (Tobit) regression is applied.
#' @param method either \dQuote{lm} (default) or \dQuote{MM}
#' @param step if TRUE, a stepwise (AIC) procedure is applied when fitting
#' models
#' @param nComp if determined, it fixes the number of pls components. If
#' \dQuote{boot}, the number of pls components are estimated using a
#' bootstraped cross validation approach.
#' @param R number of bootstrap samples for the determination of pls
#' components. Only important for method \dQuote{pls}.
#' @param verbose additional print output during calculations.
#' @return \item{xOrig }{Original data frame or matrix} \item{xImp }{Imputed
#' data} \item{wind }{Index of the missing values in the data} \item{iter
#' }{Number of iterations} \item{eps }{eps}
#' @author Matthias Templ and Karel Hron
#' @seealso \code{\link{impRZilr}}
#' @references Palarea-Albaladejo, J., Martin-Fernandez, J.A. Gomez-Garcia, J. (2007) 
#' A parametric approach for dealing with compositional rounded zeros. 
#' \emph{Mathematical Geology}, 39(7), 625-645.
#' @keywords manip multivariate
#' @export
#' @importFrom MASS stepAIC
#' @importFrom robustbase lmrob
#' @examples
#' 
#' data(arcticLake)
#' x <- arcticLake
#' ## generate rounded zeros artificially:
#' x[x[,1] < 5, 1] <- 0
#' x[x[,2] < 47, 2] <- 0
#' xia <- impRZalr(x, pos=3, dl=c(5,47), eps=0.05)
#' xia$xImp
#' 
impRZalr <- function(x, pos=ncol(x), dl=rep(0.05, ncol(x)-1), 
                     eps=0.0001, maxit=50, bruteforce=FALSE, 
                     method="lm", step=FALSE, nComp = "boot", R=10,
                     verbose=FALSE) {

#   x <- xorig <- genVarsSimple(n=50,p=30)
#   qu <- apply(x, 2, quantile, 0.05)
#   for(i in 1:(ncol(x)-2)){
#      x[x[,i] < qu[i], i] <- 0
#   }
#   pos <- ncol(x)-1
#   x <- data.frame(x)
#   dl <- qu
#   method="lm"
#   step=FALSE
#   bruteforce=FALSE
  
  ## some checks:	
  if(is.matrix(x)) stop("convert to data.frame first")
  stopifnot(all(x[,pos] != 0 & length(which(is.na(x[,pos]))) == 0))
  if(!any(is.na(x)) && !any(x==0) ) stop("No zeros or missing values in the data")
  if(method=="pls" & ncol(x)<5) stop("too less variables/parts for method pls")
  if(is.null(nComp)){
    pre <- FALSE
    nC <- NULL
  } else if(nComp=="boot"){
    nC <- integer(ncol(x))
    pre <- TRUE
  } else if(length(nComp) == ncol(x)){
    nC <- nComp
    pre <- FALSE
  } else  {
    pre <- FALSE	
  }
  
  ## zeros to NA:
  x[x==0] <- NA
  
  ## transformation:
  xa <- addLR(x, ivar=pos)
  xax <- xa$x
  w <- is.na(xa$x)
  
  ## dl --> phi
  m <- matrix(rep(dl, each=nrow(x)), ncol=length(dl))
  phi <- log(m/x[,pos]) 
  phi <- phi[,-pos]
  xOrig <- x

  it <- 0
  d <- 99999999
  
  ## initialisation:
  xax[w] <- phi[w] * 2 / 3
  
  ## start the EM:
  it <- 0
  amountMiss <- length(which(w))
  while( d > eps & it <= maxit ){
	it <- it + 1
	yold <- xax
  n2 <- nrow(xax)-ncol(xax)+1
  cn <- colnames(x)
  n <- nrow(x)
  for(i in 1:ncol(xax)){ 
    response <- xax[,i]
    predictors <- as.matrix(xax[,-i])
		if(method=="lm" && !step){
	    lm1 <- lm(response ~ predictors)
  			yhat <- predict(lm1, new.data=predictors)	  
  			s <- sd(lm1$res, na.rm=TRUE)
	#  		s <- sqrt(sum(lm1$res^2)) / n2	
	  }
    if(method=="pls" && !step){    
      if(it == 1 & pre){ ## evaluate ncomp.
        nC[i] <- bootnComp(predictors,
                           y=response, R, 
                           plotting=FALSE)$res #$res2
      }
      if(verbose) cat("\n ncomp for part",i,":",nC[i])
      reg1 <- mvr(as.matrix(response) ~ as.matrix(predictors), ncomp=nC[i],
                  method="simpls")
      yhat <- predict(reg1, new.data=data.frame(predictors), ncomp=nC[i])
      s <- sqrt(sum(reg1$res^2)/n) 
      
      lm1 <- lm(response ~ predictors)
      yhat <- predict(lm1, new.data=predictors)	  
    		s <- sd(lm1$res, na.rm=TRUE)
      # s <- sqrt(sum(lm1$res^2)) / n2	
    }
    if(method=="lm" && step){
	  	lm1 <- lm(response ~ predictors) 
	    lm1 <- MASS::stepAIC(lm1, trace=FALSE)
	  	yhat <- predict(lm1, new.data=predictors)	  
    	s <- sd(lm1$res, na.rm=TRUE)
  	#	s <- sqrt(sum(lm1$res^2)) / n2			
	 	}
    if(method=="MM" && !step) {
  		lm1 <- robustbase::lmrob(response ~ predictors)
  		yhat <- predict(lm1, new.data=predictors)	
  		if(any(is.na(yhat))) stop("NA in yhat")
  		if(any(yhat=="Inf")) stop("Inf in yhat")	
  		s <- lm1$s		
  	}
	  if(method=="MM" && step) {
	    stop("robust + stepwise is not implemented until now.")
	   }	
	  ex <- (phi[,i] - yhat)/s 
		tobit <- s*dnorm(ex)/pnorm(ex)
		tobit <- ifelse(tobit=="NaN", 0, tobit)
		tobit <- ifelse(is.na(tobit), 0, tobit)	
		tobit <- ifelse(tobit =="Inf", 0, tobit)	
		tobit <- ifelse(tobit =="-Inf", 0, tobit)	
		yhat2 <- yhat - tobit
	  # check if we are under the DL:
    if(any(yhat2[w[,i]] >= phi[w[,i],i])) stop(paste("values above the DL are imputed. \n column",i))
	   if(bruteforce){
          xax[w[,i],i] <- ifelse(yhat2[w[,i]] >= phi[w[,i],i], phi[w[,i],i], yhat2[w[,i]]) 
	   } else {
       xax[w[,i],i] <- yhat2[w[,i]] 
	   }
	}
	d <- sum(abs(xax - yold))/amountMiss
}

  ## backtransform:	
  xa$x.alr <- xax
  ximp <- suppressWarnings(addLRinv(xa)) 
  
  ## result:
  res <- list(x=ximp, wind=NULL, iter=it, eps=eps) 
  invisible(res)
}