# R/impRZalr.R In robCompositions: Compositional Data Analysis

#### Documented in impRZalr

#' alr EM-based imputation of rounded zeros
#'
#' A modified EM alr-algorithm for replacing rounded zeros in compositional
#' data sets.
#'
#' Statistical analysis of compositional data including zeros runs into
#' problems, because log-ratios cannot be applied.  Usually, rounded zeros are
#' considerer as missing not at random missing values. The algorithm first
#' applies an additive log-ratio transformation to the compositions. Then the
#' rounded zeros are imputed using a modified EM algorithm.
#'
#' @param x compositional data
#' @param pos position of the rationing variable for alr transformation
#' @param dl detection limit for each part
#' @param eps convergence criteria
#' @param maxit maximum number of iterations
#' @param bruteforce if TRUE, imputations over dl are set to dl. If FALSE,
#' truncated (Tobit) regression is applied.
#' @param method either \dQuote{lm} (default) or \dQuote{MM}
#' @param step if TRUE, a stepwise (AIC) procedure is applied when fitting
#' models
#' @param nComp if determined, it fixes the number of pls components. If
#' \dQuote{boot}, the number of pls components are estimated using a
#' bootstraped cross validation approach.
#' @param R number of bootstrap samples for the determination of pls
#' components. Only important for method \dQuote{pls}.
#' @param verbose additional print output during calculations.
#' @return \item{xOrig }{Original data frame or matrix} \item{xImp }{Imputed
#' data} \item{wind }{Index of the missing values in the data} \item{iter
#' }{Number of iterations} \item{eps }{eps}
#' @author Matthias Templ and Karel Hron
#' @references Palarea-Albaladejo, J., Martin-Fernandez, J.A. Gomez-Garcia, J. (2007)
#' A parametric approach for dealing with compositional rounded zeros.
#' \emph{Mathematical Geology}, 39(7), 625-645.
#' @keywords manip multivariate
#' @export
#' @importFrom MASS stepAIC
#' @importFrom robustbase lmrob
#' @examples
#'
#' data(arcticLake)
#' x <- arcticLake
#' ## generate rounded zeros artificially:
#' x[x[,1] < 5, 1] <- 0
#' x[x[,2] < 47, 2] <- 0
#' xia <- impRZalr(x, pos=3, dl=c(5,47), eps=0.05)
#' xia$xImp #' impRZalr <- function(x, pos=ncol(x), dl=rep(0.05, ncol(x)-1), eps=0.0001, maxit=50, bruteforce=FALSE, method="lm", step=FALSE, nComp = "boot", R=10, verbose=FALSE) { # x <- xorig <- genVarsSimple(n=50,p=30) # qu <- apply(x, 2, quantile, 0.05) # for(i in 1:(ncol(x)-2)){ # x[x[,i] < qu[i], i] <- 0 # } # pos <- ncol(x)-1 # x <- data.frame(x) # dl <- qu # method="lm" # step=FALSE # bruteforce=FALSE ## some checks: if(is.matrix(x)) stop("convert to data.frame first") stopifnot(all(x[,pos] != 0 & length(which(is.na(x[,pos]))) == 0)) if(!any(is.na(x)) && !any(x==0) ) stop("No zeros or missing values in the data") if(method=="pls" & ncol(x)<5) stop("too less variables/parts for method pls") if(is.null(nComp)){ pre <- FALSE nC <- NULL } else if(nComp=="boot"){ nC <- integer(ncol(x)) pre <- TRUE } else if(length(nComp) == ncol(x)){ nC <- nComp pre <- FALSE } else { pre <- FALSE } ## zeros to NA: x[x==0] <- NA ## transformation: xa <- addLR(x, ivar=pos) xax <- xa$x
w <- is.na(xa$x) ## dl --> phi m <- matrix(rep(dl, each=nrow(x)), ncol=length(dl)) phi <- log(m/x[,pos]) phi <- phi[,-pos] xOrig <- x it <- 0 d <- 99999999 ## initialisation: xax[w] <- phi[w] * 2 / 3 ## start the EM: it <- 0 amountMiss <- length(which(w)) while( d > eps & it <= maxit ){ it <- it + 1 yold <- xax n2 <- nrow(xax)-ncol(xax)+1 cn <- colnames(x) n <- nrow(x) for(i in 1:ncol(xax)){ response <- xax[,i] predictors <- as.matrix(xax[,-i]) if(method=="lm" && !step){ lm1 <- lm(response ~ predictors) yhat <- predict(lm1, new.data=predictors) s <- sd(lm1$res, na.rm=TRUE)
#  		s <- sqrt(sum(lm1$res^2)) / n2 } if(method=="pls" && !step){ if(it == 1 & pre){ ## evaluate ncomp. nC[i] <- bootnComp(predictors, y=response, R, plotting=FALSE)$res #$res2 } if(verbose) cat("\n ncomp for part",i,":",nC[i]) reg1 <- mvr(as.matrix(response) ~ as.matrix(predictors), ncomp=nC[i], method="simpls") yhat <- predict(reg1, new.data=data.frame(predictors), ncomp=nC[i]) s <- sqrt(sum(reg1$res^2)/n)

lm1 <- lm(response ~ predictors)
yhat <- predict(lm1, new.data=predictors)
s <- sd(lm1$res, na.rm=TRUE) # s <- sqrt(sum(lm1$res^2)) / n2
}
if(method=="lm" && step){
lm1 <- lm(response ~ predictors)
lm1 <- MASS::stepAIC(lm1, trace=FALSE)
yhat <- predict(lm1, new.data=predictors)
s <- sd(lm1$res, na.rm=TRUE) # s <- sqrt(sum(lm1$res^2)) / n2
}
if(method=="MM" && !step) {
lm1 <- robustbase::lmrob(response ~ predictors)
yhat <- predict(lm1, new.data=predictors)
if(any(is.na(yhat))) stop("NA in yhat")
if(any(yhat=="Inf")) stop("Inf in yhat")
s <- lm1$s } if(method=="MM" && step) { stop("robust + stepwise is not implemented until now.") } ex <- (phi[,i] - yhat)/s tobit <- s*dnorm(ex)/pnorm(ex) tobit <- ifelse(tobit=="NaN", 0, tobit) tobit <- ifelse(is.na(tobit), 0, tobit) tobit <- ifelse(tobit =="Inf", 0, tobit) tobit <- ifelse(tobit =="-Inf", 0, tobit) yhat2 <- yhat - tobit # check if we are under the DL: if(any(yhat2[w[,i]] >= phi[w[,i],i])) stop(paste("values above the DL are imputed. \n column",i)) if(bruteforce){ xax[w[,i],i] <- ifelse(yhat2[w[,i]] >= phi[w[,i],i], phi[w[,i],i], yhat2[w[,i]]) } else { xax[w[,i],i] <- yhat2[w[,i]] } } d <- sum(abs(xax - yold))/amountMiss } ## backtransform: xa$x.alr <- xax