simPop: Simulation of Complex Synthetic Data Information

Documented in simContinuous

generateValues_multinom <- function(dataSample, dataPop, params) {
  excludeLevels <- params$excludeLevels
  maxit <- params$maxit
  MaxNWts <- params$MaxNWts
  command <- params$command
  eps <- params$eps
  limit <- params$limit
  censor <- params$censor
  hasNewLevels  <- params$hasNewLevels
  newLevels <- params$newLevels
  name <- params$name
  response <- params$response

  # unique combinations in the stratum of the population need
  # to be computed for prediction
  indGrid <- split(1:nrow(dataPop), dataPop, drop=TRUE)
  grid <- dataPop[sapply(indGrid, function(i) i[1]), , drop=FALSE]
  grid <- as.data.frame(grid)

  # in sample, observations with NAs have been removed to fit the
  # model, hence population can have additional levels
  # these need to be removed since those probabilities cannot
  # be predicted from the model
  if ( excludeLevels ) {
    exclude <- mapply(function(pop, new) pop %in% new,
      pop=grid[, hasNewLevels, drop=FALSE], new=newLevels[hasNewLevels]
    )
    if ( is.null(dim(exclude)) ) {
      exclude <- which(any(exclude))
    } else {
      exclude <- which(apply(exclude, 1, any))
    }
  } else {
    exclude <- integer()
  }
  # fit multinomial model
  mod <- eval(parse(text=command))  # fitted model
  # predict probabilities
  if ( length(exclude) == 0 ) {
    probs <- predict(mod, newdata=grid, type="probs")
  } else {
    probs <- predict(mod, newdata=grid[-exclude, , drop=FALSE], type="probs")
  }
  # set too small probabilities to exactly 0
  if ( !is.null(eps) ) {
    probs[probs < eps] <- 0
  }
  # ensure it works for missing levels of response
  ind <- as.integer(which(table(dataSample[[name]]) > 0))
  if ( length(ind) > 2 && (nrow(grid)-length(exclude)) == 1 ) {
    probs <- t(probs)
  }
  # account for structural zeros
  if ( (!is.null(limit) || !is.null(censor)) && !is.null(dim(probs)) ) {
    if ( length(exclude) == 0 ) {
      probs <- adjustProbs(probs, grid, names(indGrid), limit, censor)
    } else {
      probs <- adjustProbs(probs, grid[-exclude, , drop=FALSE], names(indGrid)[-exclude], limit, censor)
    }
  }
  # local function for sampling from probabilities
  if ( length(ind) == 1 ) {
    resample <- function(k, n, p) rep.int(1, n[k])
  } else if ( length(ind) == 2 ) {
    resample <- function(k, n, p) spSample(n[k], c(1-p[k],p[k]))
  } else {
    resample <- function(k, n, p) spSample(n[k], p[k,])
  }
  # generate realizations for each combination
  sim <- as.list(rep.int(NA, length(indGrid)))
  if ( length(exclude) == 0 ) {
    ncomb <- as.integer(sapply(indGrid, length))
    sim <- lapply(1:length(ncomb), resample, ncomb, probs)
  } else {
    ncomb <- as.integer(sapply(indGrid[-exclude], length))
    sim[-exclude] <- lapply(1:length(ncomb), resample, ncomb, probs)
  }
  sim <- unsplit(sim, dataPop, drop=TRUE)
  # return realizations
  levels(response)[ind][sim]
}

generateValues_lm <- function(dataSample, dataPop, params) {
  if ( !nrow(dataSample) ) {
    return(numeric())
  }
  coef <- params$coef
  excludeLevels <- params$excludeLevels
  hasNewLevels <- params$hasNewLevels
  newLevels <- params$newLevels
  command <- params$command
  predNames <- params$predNames
  additional <- params$additional
  const <- params$const
  formula <- params$formula
  levels <- params$levels
  residuals <- params$residuals
  log <- params$log

  # fix: for each predictor, the level set must be equal in dataSample and dataPop
  for ( i in predNames ) {
	if(is.factor(dataSample[[i]])){	  
      both <- intersect(levels(dataSample[[i]]), levels(dataPop[[i]]))
      a <- as.character(dataSample[[i]])
      a[!a%in%both] <- NA
      b <- as.character(dataPop[[i]])
      b[!b %in%both] <- NA
      dataSample[[i]] <- factor(a, levels=both)
      dataPop[[i]] <- factor(b, levels=both)
    }
	if((is.factor(dataSample[[i]])&!is.factor(dataPop[[i]]))|(is.factor(dataPop[[i]])&!is.factor(dataSample[[i]]))){
		stop("Variable",i,"is a factor only in one of the sample and population data sets.")
	}
  }
  # unique combinations in the stratum of the population need to be computed for prediction
  indGrid <- split(1:nrow(dataPop), dataPop, drop=TRUE)
  grid <- dataPop[sapply(indGrid, function(i) i[1]), , drop=FALSE]
  grid <- as.data.frame(grid)

  # in sample, observations with NAs have been removed to fit the
  # model, hence population can have additional levels
  # these need to be removed since those probabilities cannot
  # be predicted from the model
  if ( excludeLevels ) {
    exclude <- mapply(function(pop, new) pop %in% new,
        pop=grid[, hasNewLevels, drop=FALSE], new=newLevels[hasNewLevels]
    )
    if ( is.null(dim(exclude)) ) {
      exclude <- which(any(exclude))
    } else {
      exclude <- which(apply(exclude, 1, any))
    }
    if ( length(exclude) > 0 ) {
      grid <- grid[-exclude, , drop=FALSE]
    }
    for ( j in predNames[hasNewLevels] ) {
      # drop new factor levels
      grid[, j] <- factor(as.character(grid[, j]), levels=levels(dataSample[[j]]))
    }
  } else {
    exclude <- integer()
  }
  # fit linear model
  mod <- eval(parse(text=command))
  # add coefficients from auxiliary model if necessary
  #tmp <- coef
  #coef[names(coef(mod))] <- coef(mod)
  #mod$coefficients <- coef
  # prediction
  # add 0 variable to combinations for use of 'model.matrix'
  newdata <- cbind(grid, 0)
  names(newdata) <- c(predNames, additional[1])
  newdata <- model.matrix(formula, data=newdata)

  if ( length(exclude) == 0 ) {
    pred <- spPredict(mod, newdata)
  } else {
    pred <- as.list(rep.int(NA, length(indGrid)))
    pred[-exclude] <- spPredict(mod, newdata)
  }
  pred <- unsplit(pred, dataPop, drop=TRUE)
  # add error terms
  if ( residuals ) {
    error <- sample(residuals(mod), size=nrow(dataPop), replace=TRUE)
  } else {
    mu <- median(residuals(mod))
    sigma <- mad(residuals(mod))
    error <- rnorm(nrow(dataPop), mean=mu, sd=sigma)
  }
  # return realizations
  sim <- pred + error
  if ( log ) {
    res <- exp(sim)  # transform back
    if ( !is.null(const) ) {
      res <- res - const  # subtract constant
    }
    return(res)
  } else {
    return(sim)
  }
}

generateValues_poisson <- function(dataSample, dataPop, params) {
  if ( !nrow(dataSample) ) {
    return(rep(0,nrow(dataPop)))
  }
  coef <- params$coef
  excludeLevels <- params$excludeLevels
  hasNewLevels <- params$hasNewLevels
  newLevels <- params$newLevels
  command <- params$command
  predNames <- params$predNames
  additional <- params$additional
  const <- params$const
  formula <- params$formula
  levels <- params$levels
  residuals <- params$residuals
  log <- params$log

  # fix: for each predictor, the level set must be equal in dataSample and dataPop
  for ( i in predNames ) {
	if(is.factor(dataSample[[i]])){
      both <- intersect(levels(dataSample[[i]]), levels(dataPop[[i]]))
      a <- as.character(dataSample[[i]])
      a[!a%in%both] <- NA
      b <- as.character(dataPop[[i]])
      b[!b %in%both] <- NA
      dataSample[[i]] <- factor(a, levels=both)
      dataPop[[i]] <- factor(b, levels=both)
	}
	if((is.factor(dataSample[[i]])&!is.factor(dataPop[[i]]))|(is.factor(dataPop[[i]])&!is.factor(dataSample[[i]]))){
		stop("Variable",i,"is a factor only in one of the sample and population data sets.")
	}
  }
  # unique combinations in the stratum of the population need to be computed for prediction
  indGrid <- split(1:nrow(dataPop), dataPop, drop=TRUE)
  grid <- dataPop[sapply(indGrid, function(i) i[1]), , drop=FALSE]
  grid <- as.data.frame(grid)

  # in sample, observations with NAs have been removed to fit the
  # model, hence population can have additional levels
  # these need to be removed since those probabilities cannot
  # be predicted from the model
  if ( excludeLevels ) {
    exclude <- mapply(function(pop, new) pop %in% new,
        pop=grid[, hasNewLevels, drop=FALSE], new=newLevels[hasNewLevels]
    )
    if ( is.null(dim(exclude)) ) {
      exclude <- which(any(exclude))
    } else {
      exclude <- which(apply(exclude, 1, any))
    }
    if ( length(exclude) > 0 ) {
      grid <- grid[-exclude, , drop=FALSE]
    }
    for ( j in predNames[hasNewLevels] ) {
      # drop new factor levels
      grid[, j] <- factor(as.character(grid[, j]), levels=levels(dataSample[[j]]))
    }
  } else {
    exclude <- integer()
  }
  # fit linear model
  mod <- try(eval(parse(text=command)))
  # add coefficients from auxiliary model if necessary
  #tmp <- coef
  #coef[names(coef(mod))] <- coef(mod)
  #mod$coefficients <- coef
  # prediction
  # add 0 variable to combinations for use of 'model.matrix'
  if(!"try-error"%in%class(mod)){
    newdata <- cbind(grid, 0)
    names(newdata) <- c(predNames, additional[1])
    
    if ( length(exclude) == 0 ) {
      pred <- round(predict(mod, newdata=newdata,type="response"))
    } else {
      pred <- as.list(rep.int(NA, length(indGrid)))
      pred[-exclude] <- round(predict(mod, newdata=newdata,type="response"))
    }
    pred <- unsplit(pred, dataPop, drop=TRUE)  
  }else{
    pred <- rep(0,nrow(dataPop))
  }
  
  # add error terms
# addition of an error term, not implemented for Poisson Regression yet
#  if ( residuals ) {
#    error <- sample(residuals(mod), size=nrow(dataPop), replace=TRUE)
#  } else {
#    mu <- median(residuals(mod))
#    sigma <- mad(residuals(mod))
#    error <- rnorm(nrow(dataPop), mean=mu, sd=sigma)
#  }
  # return realizations
  return(pred)
  sim <- pred #+ error
  #return(sim)
}

generateValues_binary <- function(dataSample, dataPop, params) {
  excludeLevels <- params$excludeLevels
  hasNewLevels <- params$hasNewLevels
  newLevels <- params$newLevels
  predNames <- params$predNames
  name <- params$name
  weight <- params$weight
  useAux <- params$useAux
  tol <- params$tol
  eps <- params$eps
  if ( !nrow(dataSample) ) {
    return(numeric())
  }
  # if all y values are the same return the same value for everybody
  if(length(unique(dataSample[[name]]))==1){
    if(params$verbose){
      cat("All values in the training data set are the same!\n")
    }
    return(rep(dataSample[[name]][1],nrow(dataPop)))
  }
  # unique combinations in the stratum of the population need to be computed for prediction
  indGrid <- split(1:nrow(dataPop), dataPop, drop=TRUE)
  grid <- dataPop[sapply(indGrid, function(i) i[1]), , drop=FALSE]
  grid <- as.data.frame(grid)

  # in sample, observations with NAs have been removed to fit the
  # model, hence population can have additional levels
  # these need to be removed since those probabilities cannot
  # be predicted from the model
  if ( excludeLevels ) {
    exclude <- mapply(function(pop, new) pop %in% new,
      pop=grid[, hasNewLevels, drop=FALSE],
      new=newLevels[hasNewLevels]
    )
    if ( is.null(dim(exclude)) ) {
      exclude <- which(any(exclude))
    } else {
      exclude <- which(apply(exclude, 1, any))
    }
    if ( length(exclude) > 0 ) {
      grid <- grid[exclude, , drop=FALSE]
    }
    for ( j in predNames[hasNewLevels] ) {
      # drop new factor levels
      grid[, j] <- factor(as.character(grid[, j]), levels=levels(dataSample[[j]]))
    }
  } else {
    exclude <- integer()
  }
  # add 0 variable to combinations for use of 'model.matrix'
  Xnew <- cbind(grid, 0)
  names(Xnew) <- c(predNames, name)
  Xnew <- model.matrix(params$command, data=Xnew)

  # fit logit model
  X <- model.matrix(params$command, data=dataSample)
  y <- dataSample[[name]]
  weights <- dataSample[[weight]]
  mod <- logitreg(X, y, weights=weights)
  # add parameters from auxiliary model if necessary
  if ( useAux ) {
    indPar <- abs(mod$par) < tol
    mod$par[indPar] <- params$par[indPar]

    # remove non-existing combinations from mod$par
    # reason: auxiliary model is estimated on total population
    ii <- setdiff(names(mod$par), colnames(Xnew))
    if ( length(ii) >0 ) {
      mod$par <- mod$par[!names(mod$par)%in%ii]
    }
  }

  # predict probabilities
  tmp <- exp(Xnew %*% mod$par)
  # avoid integer overflow
  p <- ifelse(is.infinite(tmp), 1, as.numeric(tmp / (1 + tmp)))
  # set too small probabilities to exactly 0
  if ( !is.null(eps) ) {
    p[p < eps] <- 0
  }
  # generate realizations for each combination
  if ( length(exclude) == 0 ) {
    ncomb <- as.integer(sapply(indGrid, length))
    sim <- lapply(1:length(ncomb), function(k) {
      spSample(ncomb[k], c(1-p[k], p[k])) - 1
    })
  } else {
    ncomb <- as.integer(sapply(indGrid[-exclude], length))
    sim <- as.list(rep.int(NA, length(indGrid)))
    sim[-exclude] <- lapply(1:length(ncomb), function(k) {
      spSample(ncomb[k], c(1-p[k], p[k])) - 1
    })
  }
  # return realizations
  if(params$verbose){
    cat("Summary of the predicted probabilites:")
    print(summary(unsplit(sim, dataPop, drop=TRUE)))
  }
  unsplit(sim, dataPop, drop=TRUE)
}

generateValues_xgboost <- function(dataSample, dataPop, params) {
  
  res <- NULL
  
  predNames <- params$predNames
  name <- params$name
  typ <- params$typ
  formula <- params$formula
  weight <- params$weight
  command <- params$command
  residual <- params$residuals
  strata <- params$strata
  
  
  # remove strata from prediction, because xgboost can not handle factors with only one level
  predNames <- predNames[predNames != strata]
  
  # Check if a factor has only one level
  check_levels <- sapply (predNames, function(nam) {
      length(levels(dataSample[[nam]])) == 1
  })
  
  if(any(check_levels)){
    stop(paste0(predNames[which(check_levels)]),
         " has only one level, XGBoost can't work with categorical variables with one level")
  }
  
  mod <- eval(parse(text=command))

  # set sample factor levels to population
  ind <- match(colnames(dataPop), colnames(dataSample))
  for ( i in 1:length(ind) ) {
    if (is.factor(unlist(dataPop[,i,with=FALSE]))) {
      dataPop[,colnames(dataPop)[i]:=factor(as.character(unlist(dataPop[,colnames(dataPop)[i],with=FALSE])),levels(dataSample[[ind[i]]]))]
    }
  }

  new_data <- xgb.DMatrix(data = model.matrix(~.+0, data = model.frame(dataPop[, predNames, with = FALSE],  na.action=na.pass)), missing = NA)
  pred <- predict(mod,
                  newdata=new_data)
  
  # if residuals is true, calculate in-sample residuals and add an error term to the predictions
  if(residual){
    predSample <- predict(mod,
                          newdata=xgb.DMatrix(data = model.matrix(~.+0, data = model.frame(dataSample[, predNames, with = FALSE],  na.action=na.pass)),
                                              missing = NA,
                                              info = list(weight = as.numeric(dataSample[, weight, with = FALSE]))))
    resSample <- cbind(dataSample, predSample)
    
    resSample[, res := predSample - .SD, .SDcols = name]
    target <- resSample[, name, with = FALSE][[1]]
    
    error <- sample(resSample$res, nrow(dataPop), replace = TRUE)
    pred <- pred + error
  }
  
  return(pred)
}

genVals <- function(dataSample, dataPop, params, typ, response) {
  
  # unify level-set of predictors
  for ( i in params$predNames ) {
    dataSample[[i]] <- cleanFactor(dataSample[[i]])
    dataPop[[i]] <- cleanFactor(dataPop[[i]])
  }


  if ( !typ %in% c("multinom","lm","binary","poisson","xgboost") ) {
    stop("unsupported value for argument 'type' in genVals()\n")
  }
  # Check wheter all response values are the same
  if(length(unique(response))==1){
	  res <- rep(unique(response),nrow(dataPop))
	  return(res)
  }
  if ( typ=="binary") {
    res <- generateValues_binary(dataSample, dataPop, params)
  }else if ( typ=="lm") {
    res <- generateValues_lm(dataSample, dataPop, params)
  }else if ( typ=="multinom" ) {
    res <- generateValues_multinom(dataSample, dataPop, params)
  }else if ( typ=="poisson") {
    res <- generateValues_poisson(dataSample, dataPop, params)
  }else if ( typ=="xgboost") {
    res <- generateValues_xgboost(dataSample, dataPop, params)
  }
  res
}

runModel <- function(dataS, dataP, params, typ) {
  x <- NULL
  strata <- params$strata
  pp <- parallelParameters(nr_cpus=params$nr_cpus, nr_strata=length(levels(dataS[[strata]])))
  indStrata <- params$indStrata
  predNames <- params$predNames
  additional <- unique(c(params$additional, params$name, params$weight))
  by <- params$by
  
  if(is.null(by) |  by != "none"){
    if ( pp$parallel ) {
      # windows
      if ( pp$have_win ) {
        cl <- makePSOCKcluster(pp$nr_cores)
        registerDoParallel(cl,cores=pp$nr_cores)
        valuesCat <- foreach(x=levels(dataS[[strata]]), .options.snow=list(preschedule=TRUE)) %dopar% {
          genVals(
            dataSample=dataS[dataS[[strata]] == x,],
            dataPop=dataP[indStrata[[x]], predNames, with=FALSE],
            params,response=dataS[dataS[[strata]] == x,eval(parse(text=params$name))],
            typ=typ)
        }
        stopCluster(cl)
      }
      # linux/mac
      if ( !pp$have_win ) {
        valuesCat <- mclapply(levels(dataS[[strata]]), function(x) {
          genVals(
            dataSample=dataS[dataS[[strata]] == x,],
            dataPop=dataP[indStrata[[x]], predNames, with=FALSE],
  		  params,response=dataS[dataS[[strata]] == x,eval(parse(text=params$name))],
            typ=typ)
        },mc.cores=pp$nr_cores)
      }
    } else {
      if(params$verbose){
        cat("All values of the by group:","\n")
        print(levels(dataS[[strata]]))
      }
      valuesCat <- lapply(levels(dataS[[strata]]), function(x) {
         if(params$verbose){
           cat("Current by group for the binary model:",x,"\n")
         }
        genVals(
          dataSample=dataS[dataS[[strata]] == x,c(predNames, additional), with=FALSE],
          dataPop=dataP[indStrata[[x]], predNames, with=FALSE],
  		params,response=dataS[dataS[[strata]] == x,eval(parse(text=params$name))],
          typ=typ)
      })
    }
    
    res <- sapply(valuesCat, class)
    if ( any(res=="try-error") ) {
      stop(paste0("Error in estimating the linear model. Try to specify a more simple model!\n"))
    }
    if ( typ=="multinom" ) {
      response <- dataS[[params$name]]
      valuesCat <- factor(unsplit(valuesCat, dataP[[strata]]), levels=levels(response))
    }
    if ( typ=="binary" ) {
      valuesCat <- unsplit(valuesCat, dataP[[strata]], drop=FALSE)
    }
    if ( typ%in%c("poisson","lm","xgboost") ) {
      valuesCat <- unsplit(valuesCat, dataP[[strata]], drop=FALSE)
    }
    
  }else{
    
    valuesCat <- genVals(dataSample=dataS,
                         dataPop=dataP,
                         params,
                         response=dataS[, eval(parse(text=params$name))],
                         typ=typ)
    
    res <- sapply(valuesCat, class)
    if ( any(res=="try-error") ) {
      stop(paste0("Error in estimating the linear model. Try to specify a more simple model!\n"))
    }
    if (!typ %in% "xgboost"){
      stop(paste0("Strata with type ", strata, "only implemented for typ xgboost, not",
                  typ))
    }
  }

  # check for errors

  
  return(valuesCat)
}


#' Simulate continuous variables of population data
#'
#' Simulate continuous variables of population data using multinomial
#' log-linear models combined with random draws from the resulting categories
#' or (two-step) regression models combined with random error terms. The
#' household structure of the population data and any other categorical
#' predictors need to be simulated beforehand.
#'
#' If \code{method} is \code{"lm"}, the behavior for two-step models is
#' described in the following.
#'
#' If \code{zeros} is \code{TRUE} and \code{log} is not \code{TRUE} or the
#' variable specified by \code{additional} does not contain negative values, a
#' log-linear model is used to predict whether an observation is zero or not.
#' Then a linear model is used to predict the non-zero values.
#'
#' If \code{zeros} is \code{TRUE}, \code{log} is \code{TRUE} and \code{const}
#' is specified, again a log-linear model is used to predict whether an
#' observation is zero or not. In the linear model to predict the non-zero
#' values, \code{const} is added to the variable specified by \code{additional}
#' before the logarithms are taken.
#'
#' If \code{zeros} is \code{TRUE}, \code{log} is \code{TRUE}, \code{const} is
#' \code{NULL} and there are negative values, a multinomial log-linear model is
#' used to predict negative, zero and positive observations. Categories for the
#' negative values are thereby defined by \code{breaks}. In the second step, a
#' linear model is used to predict the positive values and negative values are
#' drawn from uniform distributions in the respective classes.
#'
#' If \code{zeros} is \code{FALSE}, \code{log} is \code{TRUE} and \code{const}
#' is \code{NULL}, a two-step model is used if there are non-positive values in
#' the variable specified by \code{additional}. Whether a log-linear or a
#' multinomial log-linear model is used depends on the number of categories to
#' be used for the non-positive values, as defined by \code{breaks}. Again,
#' positive values are then predicted with a linear model and non-positive
#' values are drawn from uniform distributions.
#'
#' The number of cpus are selected automatically in the following manner. The
#' number of cpus is equal the number of strata. However, if the number of cpus
#' is less than the number of strata, the number of cpus - 1 is used by
#' default. This should be the best strategy, but the user can also overwrite
#' this decision.
#'
#' @name simContinuous
#' @param simPopObj a \code{\linkS4class{simPopObj}} holding household survey
#' data, population data and optionally some margins.
#' @param additional a character string specifying the additional continuous
#' variable of \code{dataS} that should be simulated for the population data.
#' Currently, only one additional variable can be simulated at a time.
#' @param method a character string specifying the method to be used for
#' simulating the continuous variable. Accepted values are \code{"multinom"},
#' for using multinomial log-linear models combined with random draws from the
#' resulting categories, \code{"lm"}, for using (two-step) regression
#' models combined with random error terms, \code{"poisson"} for using Poisson regression for count variables, and \code{"xgboost"} for using XGBoost.
#' @param zeros a logical indicating whether the variable specified by
#' \code{additional} is semi-continuous, i.e., contains a considerable amount
#' of zeros. If \code{TRUE} and \code{method} is \code{"multinom"}, a separate
#' factor level for zeros in the response is used. If \code{TRUE} and
#' \code{method} is \code{"lm"}, a two-step model is applied. The first step
#' thereby uses a log-linear or multinomial log-linear model (see
#' \dQuote{Details}).
#' @param breaks an optional numeric vector; if multinomial models are
#' computed, this can be used to supply two or more break points for
#' categorizing the variable specified by \code{additional}. If \code{NULL},
#' break points are computed using weighted quantiles.
#' @param lower,upper optional numeric values; if multinomial models are
#' computed and \code{breaks} is \code{NULL}, these can be used to specify
#' lower and upper bounds other than minimum and maximum, respectively. Note
#' that if \code{method} is \code{"multinom"} and \code{gpd} is \code{TRUE}
#' (see below), \code{upper} defaults to \code{Inf}.
#' @param equidist logical; if \code{method} is \code{"multinom"} and
#' \code{breaks} is \code{NULL}, this indicates whether the (positive) default
#' break points should be equidistant or whether there should be refinements in
#' the lower and upper tail (see \code{\link{getBreaks}}).
#' @param probs numeric vector with values in \eqn{[0, 1]}; if \code{method} is
#' \code{"multinom"} and \code{breaks} is \code{NULL}, this gives probabilities
#' for quantiles to be used as (positive) break points. If supplied, this is
#' preferred over \code{equidist}.
#' @param gpd logical; if \code{method} is \code{"multinom"}, this indicates
#' whether the upper tail of the variable specified by \code{additional} should
#' be simulated by random draws from a (truncated) generalized Pareto
#' distribution rather than a uniform distribution.
#' @param threshold a numeric value; if \code{method} is \code{"multinom"},
#' values for categories above \code{threshold} are drawn from a (truncated)
#' generalized Pareto distribution.
#' @param est a character string; if \code{method} is \code{"multinom"}, the
#' estimator to be used to fit the generalized Pareto distribution.
#' @param limit an optional named list of lists; if multinomial models are
#' computed, this can be used to account for structural zeros. The names of the
#' list components specify the predictor variables for which to limit the
#' possible outcomes of the response. For each predictor, a list containing the
#' possible outcomes of the response for each category of the predictor can be
#' supplied. The probabilities of other outcomes conditional on combinations
#' that contain the specified categories of the supplied predictors are set to
#' 0. Currently, this is only implemented for more than two categories in the
#' response.
#' @param censor an optional named list of lists or \code{data.frame}s; if
#' multinomial models are computed, this can be used to account for structural
#' zeros. The names of the list components specify the categories that should
#' be censored. For each of these categories, a list or \code{data.frame}
#' containing levels of the predictor variables can be supplied. The
#' probability of the specified categories is set to 0 for the respective
#' predictor levels. Currently, this is only implemented for more than two
#' categories in the response.
#' @param log logical; if \code{method} is \code{"lm"}, this indicates whether
#' the linear model should be fitted to the logarithms of the variable
#' specified by \code{additional}. The predicted values are then
#' back-transformed with the exponential function. See \dQuote{Details} for
#' more information.
#' @param const numeric; if \code{method} is \code{"lm"} and \code{log} is
#' \code{TRUE}, this gives a constant to be added before log transformation.
#' @param alpha numeric; if \code{method} is \code{"lm"}, this gives trimming
#' parameters for the sample data. Trimming is thereby done with respect to the
#' variable specified by \code{additional}. If a numeric vector of length two
#' is supplied, the first element gives the trimming proportion for the lower
#' part and the second element the trimming proportion for the upper part. If a
#' single numeric is supplied, it is used for both. With \code{NULL}, trimming
#' is suppressed.
#' @param residuals logical; if \code{method} is \code{"lm"}, this indicates
#' whether the random error terms should be obtained by draws from the
#' residuals. If \code{FALSE}, they are drawn from a normal distribution
#' (median and MAD of the residuals are used as parameters).
#' @param keep logical; if multinomial models are computed, this indicates
#' whether the simulated categories should be stored as a variable in the
#' resulting population data. If \code{TRUE}, the corresponding column name is
#' given by \code{additional} with postfix \code{"Cat"}.
#' @param maxit,MaxNWts control parameters to be passed to
#' \code{\link[nnet]{multinom}} and \code{\link[nnet]{nnet}}. See the help file
#' for \code{\link[nnet]{nnet}}.
#' @param tol if \code{method} is \code{"lm"} and \code{zeros} is \code{TRUE},
#' a small positive numeric value or \code{NULL}. When fitting a log-linear
#' model within a stratum, factor levels may not exist in the sample but are
#' likely to exist in the population. However, the coefficient for such factor
#' levels will be 0. Therefore, coefficients smaller than \code{tol} in
#' absolute value are replaced by coefficients from an auxiliary model that is
#' fit to the whole sample. If \code{NULL}, no auxiliary log-linear model is
#' computed and no coefficients are replaced.
#' @param nr_cpus if specified, an integer number defining the number of cpus
#' that should be used for parallel processing.
#' @param eps a small positive numeric value, or \code{NULL} (the default). In
#' the former case and if (multinomial) log-linear models are computed,
#' estimated probabilities smaller than this are assumed to result from
#' structural zeros and are set to exactly 0.
#' @param regModel allows to specify the model that should be for the
#' simulation of the additional continuous variable. The following choices are
#' possible: \itemize{ \item'basic'only the basic household-variables
#' (generated with \code{\link{simStructure}}) are used.  \item'available'all
#' available variables (that are common in the sample and the syntetic
#' population (e.g. previously generated variables) are used for the modeling.
#' Should be used with care because all variables are automatically used as
#' factors!  \item formula-object: Users may also specify a specific formula
#' (class 'formula') that will be used. Checks are performed that all required
#' variables are available.}
#' @param byHousehold if NULL, simulated values are used as is. If either \code{'sum'},
#' \code{'mean'} or \code{'random'} is specified, the values are aggregated and each member
#' of the household gets the same value (mean, sum or a random value) assigned.
#' @param imputeMissings if TRUE, missing values in variables that are used for
#' the underlying model are imputed using hock-deck.
#' @param seed optional; an integer value to be used as the seed of the random
#' number generator, or an integer vector containing the state of the random
#' number generator to be restored.
#' @param verbose (logical) if \code{TRUE}, additional output is written to the promt
#' @param by defining which variable to use as split up variable of the estimation. Defaults to the strata variable.
#' @param model_params adding optional parameter to the model, at the moment only implemented for xgboost hyperparameters
#' @return An object of class \code{\linkS4class{simPopObj}} containing survey
#' data as well as the simulated population data including the continuous
#' variable specified by \code{additional} and possibly simulated categories
#' for the desired continous variable.
#' @note The basic household structure and any other categorical predictors
#' need to be simulated beforehand with the functions
#' \code{\link{simStructure}} and \code{\link{simCategorical}}, respectively.
#' @author Bernhard Meindl, Andreas Alfons, Alexander Kowarik (based on code by Stefan Kraft), Siro Fritzmann
#' @references 
#' B. Meindl, M. Templ, A. Kowarik, O. Dupriez (2017) Simulation of Synthetic Populations for Survey Data Considering Auxiliary
#' Information. \emph{Journal of Statistical Survey}, \strong{79} (10), 1--38. \doi{10.18637/jss.v079.i10}
#' 
#' A. Alfons, M. Templ (2011) Simulation of close-to-reality population data for household surveys with application to EU-SILC. 
#' \emph{Statistical Methods & Applications}, \strong{20} (3), 383--407. \doi{10.1080/02664763.2013.859237}
#' @seealso \code{\link{simStructure}}, \code{\link{simCategorical}},
#' \code{\link{simComponents}}, \code{\link{simEUSILC}}
#' @keywords datagen
#' @export
#' @examples
#'
#' data(eusilcS)
#' \dontrun{
#' ## approx. 20 seconds computation time
#' inp <- specifyInput(data=eusilcS, hhid="db030", hhsize="hsize", strata="db040", weight="db090")
#' simPop <- simStructure(data=inp, method="direct",
#'   basicHHvars=c("age", "rb090", "hsize", "pl030", "pb220a"))
#'
#' regModel = ~rb090+hsize+pl030+pb220a
#'
#' # multinomial model with random draws
#' eusilcM <- simContinuous(simPop, additional="netIncome",
#'               regModel = regModel,
#'               upper=200000, equidist=FALSE, nr_cpus=1)
#' class(eusilcM)
#' 
#' # two-step regression
#' eusilcT <- simContinuous(simPop, additional="netIncome",
#'               regModel = "basic",
#'               method = "lm", nr_cpus=1)
#' class(eusilcT)
#' }
#'
simContinuous <- function(simPopObj, additional = "netIncome",
  method = c("multinom", "lm","poisson", "xgboost"), zeros = TRUE,
  breaks = NULL, lower = NULL, upper = NULL,
  equidist = TRUE, probs = NULL, gpd = TRUE,
  threshold = NULL, est = "moments", limit = NULL,
  censor = NULL, log = TRUE, const = NULL,
  alpha = 0.01, residuals = TRUE, keep = TRUE,
  maxit = 500, MaxNWts = 1500,
  tol = .Machine$double.eps^0.5,
  nr_cpus=NULL, eps = NULL, regModel="basic", byHousehold=NULL,
  imputeMissings=FALSE, seed, verbose=FALSE,by="strata", model_params=NULL) {

  x <- hhid <- vals <- id <- V1 <- randId <- optional_params <- NULL

  if ( !is.null(byHousehold) ) {
    if ( !byHousehold %in% c("mean","sum","random") ) {
      stop("invalid value for argument 'byHousehold'. Allowed values are 'mean', 'sum' or 'random'!\n")
    }
  }

  samp <- simPopObj@sample
  pop <- simPopObj@pop
  basic <- simPopObj@basicHHvars
  if(by %in% c("strata", "none")){
    if(by == "none" & method[1] != "xgboost"){
      stop(paste0("by \"none\" not implemented for method ", method[1]))
    }
    strata <- samp@strata
  }else if(!is.null(by)){
    strata <- by
  }
  weight <- samp@weight

  dataS <- samp@data
  dataP <- pop@data

  if ( additional %in% names(dataP)) {
    stop(paste0("Variable '",additional,"' already available in the synthetic population!\n"))
  }

  ## initializations
  if ( !missing(seed) ) {
    set.seed(seed,"L'Ecuyer")  # set seed of random number generator
  }

  if ( length(additional) != 1 ) {
    stop("currently only one additional variable can be generated at a time")
  }
  if ( !additional %in% colnames(samp@data) ) {
    stop("variable 'additional' must be included in the sample of input 'simPopObj'!\n")
  }

  regInput <- regressionInput(simPopObj, additional=additional, regModel=regModel)
  predNames <- regInput[[1]]$predNames
  estimationModel <- regInput[[1]]$formula

  varNames <- unique(c(predNames, weight, additional, strata))
  if(!strata%in%colnames(dataS)){
    stop(strata," is defined as by variable, but not in the sample data set.")
  }
  dataS <- dataS[,varNames, with=FALSE]

  method <- match.arg(method)
  zeros <- isTRUE(zeros)
  log <- isTRUE(log)
  if(log&&method %in% c("poisson", "xgboost")){
    log <- FALSE
    warning(paste0("For ", method," regression the log=TRUE parameter is ignored and the numeric variable is not transformed."))
  }
  if(!is.null(alpha) && method %in% c("poisson", "xgboost")){
    alpha <- NULL
    warning(paste0("For ", method ," regression the alpha!=NULL is not yet implemented and therefore set to NULL."))
  }
  if ( is.numeric(alpha) && length(alpha) > 0 ) {
    alpha <- rep(alpha, length.out=2)
    if ( !all(is.finite(alpha)) || any(alpha < 0) || sum(alpha) >= 1 ) {
      alpha <- NULL
      warning("invalid parameter 'alpha': trimming is not applied\n")
    }
  } else {
    alpha <- NULL
  }

  # observations with missings are excluded from simulation
  #exclude <- getExclude(dataS[,c(additional,predNames),with=F]) # fixes #31?
  #if ( length(exclude) ) {
  #  dataS <- dataS[-exclude,]
  #}

  # temporarily impute (using hotdeck) / or check (if imputeMissings=FALSE)
  # missing values in additional variables in the sample
  if ( is.null(strata) ) {
    modelVars <- setdiff(predNames, c(weight,basic,pop@hhsize))
  } else {
    modelVars <- setdiff(predNames, c(strata,weight,basic,pop@hhsize))
  }

  if ( length(modelVars) > 0 & imputeMissings ) {
    dataS_orig <- dataS[,modelVars,with=FALSE]
    dataS <- hotdeck(dataS, variable=modelVars, domain_var=strata, imp_var=FALSE)
  }

  # check for NAs and warn user
  if ( !imputeMissings) {
    naTab <- dataS[,lapply(.SD, is.na), .SDcols=c(additional,predNames)]
    perc.miss <- sum(rowSums(naTab)!=0) / nrow(dataS)
    if ( perc.miss > 0 ) {
      wm <- paste0("There are ~",formatC(100*perc.miss,format="f", digits=1),"% ")
      wm <- paste0(wm, "observations in the response/predictors with at least one missing variable.\n")
      wm <- paste0(wm, "If you get errors in the estimation procedure, consider to recode these missing ")
      wm <- paste0(wm, "values (e.g. by assigning an additional category) or try to specify a different model.\n\n")
      for ( z in 1:ncol(naTab) ) {
        vv <- colnames(naTab)[z]
        missv <- sum(naTab[[z]])
        missp <- formatC(100*missv/nrow(dataS),format="f", digits=1)
        if ( vv == additional ) {
          wm <- paste0(wm, "Variable '",vv,"' (response): ",missv," missing values (~",missp,"%).\n")
        } else {
          wm <- paste0(wm, "Variable '",vv,"' (predictor): ",missv," missing values (~",missp,"%).\n")
        }
      }
      if(verbose) warning(wm)
    }
  }

  # variables are coerced to factors
  select <- unique(c(predNames, strata)) # strata always included
#  dataS <- checkFactor(dataS, select)
  if(!strata%in%colnames(dataP)){
    stop(strata," is defined as by variable, but not in the population data set.")
  }
#  dataP <- checkFactor(dataP, select)

  # sample data of variable to be simulated
  additionalS <- dataS[[additional]]

  ## determine which models to fit and do further initialization
  haveBreaks <- !is.null(breaks)
  if ( method == "multinom" ) {
    useMultinom <- TRUE
    useLogit <- FALSE
    useLm <- FALSE
    usePoisson <- FALSE
    useXgboost <- FALSE
    # define break points (if missing)
    if ( haveBreaks ) {
      checkBreaks(breaks)
      breaks <- if(zeros) union(breaks, 0) else unique(breaks)
      breaks <- sort(breaks)
    } else {
      if ( is.null(upper) && gpd ) {
        upper <- Inf
      }
      weights <- NULL
      if(!is.null(weight)){
        weights <- dataS[[weight]]
      }
      breaks <- getBreaks(additionalS, weights=weights, zeros, lower, upper, equidist, probs)
    }
  } else {
    if(method=="lm"){
      useLm <- TRUE
      usePoisson <- FALSE
      useXgboost <- FALSE
    }else if(method=="poisson"){
      useLm <- FALSE
      usePoisson <- TRUE
      useXgboost <- FALSE
    }else if(method=="xgboost"){
      useMultinom <- FALSE
      useLm <- FALSE
      usePoisson <- FALSE
      useXgboost <- TRUE
    }

    
    if ( log ) {
      if ( is.null(const) ) {
        ## use log-transformation
        # check for negative values
        neg <- which(additionalS < 0)
        haveNeg <- length(neg) > 0
        if ( haveNeg ) {
          # define break points for negative values
          if ( haveBreaks ) {
            checkBreaks(breaks)
            breaks <- c(unique(breaks[breaks < 0]), 0)
          } else {
            breaks <- getBreaks(additionalS[neg], dataS[[weight]][neg], zeros=TRUE, lower, upper)
          }
          if ( zeros || length(breaks) > 2 ) {
            useMultinom <- TRUE
            breaks <- c(breaks, Inf)  # add Inf to breakpoints
          } else {
            useMultinom <- FALSE
          }
          useLogit <- !useMultinom
        } else {
          useLogit <- zeros || any(additionalS == 0)
          useMultinom <- FALSE
        }
      } else {
        # check constant
        if ( !is.numeric(const) || length(const) == 0 ) {
          stop("'const' must be numeric\n")
        } else {
          const <- const[1]
        }
        # set control parameters
        useLogit <- zeros || any(additionalS == 0)
        useMultinom <- FALSE
      } 
    } else if (method == "xgboost") {
      useLogit <- FALSE
      useMultinom <- FALSE
    } else {
      # logistic model is used in case of semi-continuous variable
      useLogit <- zeros
      # multinomial model is not needed
      useMultinom <- FALSE
    }
  }

  ## some general preparations for the simulation
  # list indStrata contains the indices of dataP split by strata
  N <- nrow(dataP)
  indP <- 1:N
  indStrata <- split(indP, dataP[[strata]])
  #fpred <- paste(predNames, collapse = " + ")  # for formula
  # check if population data contains factor levels that do not exist
  # in the sample
  newLevels <- lapply(predNames, function(nam) {
    levelsS <- levels(dataS[[nam]])
    levelsP <- levels(dataP[[nam]])
    levelsP[!(levelsP %in% levelsS)]
  })
  hasNewLevels <- sapply(newLevels, length) > 0
  excludeLevels <- any(hasNewLevels)

  ## preparations for multinomial or binomial logit model
  if ( useMultinom || useLogit ) {
    name <- getCatName(additional)
    estimationModel <- gsub(additional, name, estimationModel)
    # remove strata variable from estimation model if we are computing on multiple cores
    # else multinom fails because the variable has only one factor!
    if ( !is.null(dataS[[strata]]) ) {
#      if ( parallelParameters(nr_cpus, length(levels(dataS[[strata]])))$nr_cores > 1 ) {
        estimationModel <- gsub(paste0("[+]",strata),"",estimationModel)
#      }
    }
  }

  if ( useMultinom ) {
    ## some preparations
    dataS[[name]] <- getCat(additionalS, breaks, zeros, right=TRUE)
    response <- dataS[[name]]  # response variable
    # check threshold for GPD (if supplied)
    if ( !useLm && gpd && !is.null(threshold) && length(threshold) != 1 ) {
      stop("'threshold' must be a single numeric value")
    }

    ## simulate categories
    # TODO: share code with 'simCategorical'
    params <- list()
    params$excludeLevels <- excludeLevels
    # command needs to be constructed as string
    # this is actually a pretty ugly way of fitting the model
    params$command <- paste("suppressWarnings(multinom(", estimationModel,
      ", weights=", weight, ", data=dataSample, trace=FALSE",
      ", maxit=maxit, MaxNWts=MaxNWts))", sep="")
    params$maxit <- maxit
    params$MaxNWts <- MaxNWts
    params$eps <- eps
    params$limit <- limit
    params$censor <- censor
    params$hasNewLevels  <- hasNewLevels
    params$newLevels <- newLevels
    params$name <- name
    params$response <- response
    params$strata <- strata
    params$nr_cpus <- nr_cpus
    params$indStrata <- indStrata
    params$predNames <- predNames
    params$additional <- c(additional, weight)
    params$verbose <- verbose 
    params$by <- by
    if(verbose) cat("running multinom with the following model:\n")
    if(verbose) cat(gsub("))",")",gsub("suppressWarnings[(]","",params$command)),"\n")

    # run in parallel if possible
    valuesCat <- runModel(dataS, dataP, params, typ="multinom")

    ## simulate (semi-)continuous values
    tcat <- table(valuesCat)
    ncat <- length(tcat)

    icat <- 1:ncat
    values <- as.list(rep.int(NA, ncat))
    # zeros

    if ( zeros ) {
      # bug: missing 0 even though zeros is not null?
      izero <- which(breaks == 0)
      values[izero] <- 0
      tcat <- tcat[-izero]
      ncat <- length(tcat)
      icat <- icat[-izero]
    }

    # values to be simulated with linear model or draws from Pareto
    # distribution
    if ( useLm ) {
      # last breakpoint is Inf, the one before is 0
      nunif <- ncat - 1  # leave category of positive values out
    } else {
      nbreaks <- length(breaks)
      if ( gpd ) {
        if ( is.null(threshold) ) {
          if ( !haveBreaks && (!isTRUE(equidist) || !is.null(probs)) ) {
            ngpd <- nbreaks-2
          } else {
            ngpd <- nbreaks-1
          }
        } else if ( any(tmp <- breaks >= threshold) ) {
          ngpd <- min(which(tmp))
        } else {
          ngpd <- nbreaks
        }
      } else {
        ngpd <- nbreaks
      }
      if ( gpd && ngpd <= ncat ) {
        # adjust threshold and fit GPD
        threshold <- breaks[ngpd]  # adjust threshold
        estPar <- fitgpd(additionalS, threshold, est)  # fit GPD
        estPar <- estPar[["fitted.values"]]  # parameters of GPD
        # generalized pareto distribution
        igpd <- ngpd:ncat
        values[icat[igpd]] <- lapply(igpd, function(i) {
          truncPareto(tcat[i], loc=threshold, scale=estPar["scale"], shape=estPar["shape"], breaks[i], breaks[i+1])
        })
      }
      nunif <- ngpd - 1
    }
    # uniform distribution
    if ( nunif > 0 ) {
      iunif <- 1:nunif
      values[icat[iunif]] <- lapply(iunif, function(i) {
        runif(tcat[i], breaks[i], breaks[i+1])
      })
    }
    # turn list into vector of values
    values <- unsplit(values, valuesCat)
  }

  if ( useLogit ) {
    ## some preparations
    if ( log && is.null(const) && haveNeg ) {
      indS <- additionalS > 0
    } else {
      indS <- additionalS != 0
    }
    dataS[[name]] <- as.integer(indS)
    estimationModel <- as.formula(estimationModel)  # formula for model
    # auxiliary model for all strata (used in case of empty combinations)
    useAux <- !is.null(tol)
    if(method=="poisson"){
      useAux <- FALSE
    }
    if ( useAux ) {
      if ( length(tol) != 1 || tol <= 0 ) {
        stop("'tol' must be a single small positive value!\n")
      }
      #nas <- sum(!complete.cases(dataS))
      #if ( length(nas) > 0 ) {
      #  warning("\nwe Hotdeck-imputation of missing values sample data required!\n")
      #  dataS <- hotdeck(dataS, variable=predNames, domain_var=samp@strata, imp_var=FALSE)
      #}
      X <- model.matrix(estimationModel, data=dataS)
      y <- dataS[[name]]
      weights <- dataS[[weight]]
      mod <- logitreg(X, y, weights=weights)
      par <- mod$par
    } else {
      par <- NULL
    }

    ## simulate binary vector
    params <- list()
    params$excludeLevels <- excludeLevels
    params$hasNewLevels <- hasNewLevels
    params$newLevels <- newLevels
    params$predNames <- predNames
    params$tol <- tol
    params$eps <- eps
    params$weight <- weight
    params$useAux <- useAux
    params$name <- name
    params$strata <- strata
    params$nr_cpus <- nr_cpus
    params$indStrata <- indStrata
    params$predNames <- predNames
    params$additional <- additional
    params$par <- par
    params$command <- estimationModel
    params$verbose <- verbose
    params$by <- by
    # run in parallel if possible

    valuesCat <- runModel(dataS, dataP, params, typ="binary")
  }

  if ( useLm || usePoisson || useXgboost) {
    ## some preparations
    if ( useMultinom ) {
      catLm <- names(tcat)[ncat]  # category for positive values
      dataS <- dataS[response == catLm, , drop=FALSE]
      indP <- valuesCat == catLm
    } else if( useLogit ) {
      dataS <- dataS[indS]  # select only non-zeros
      indP <- valuesCat == 1  # indicates non-zeros in population
    }
    if ( useMultinom || useLogit ) {
      # adjust population data
      if ( any(indP) ) {
        dataPop <- dataP[indP]
      } else {
        dataPop <- dataP
      }
      # list indStrata is adjusted so that it only contains
      # indices of persons in population with non-zero value
      indStrata <- split(1:nrow(dataPop), dataPop[[strata]])
    } else {
      dataPop <- dataP
    }
    ## trim data (if specified)
    if ( !is.null(alpha) ) {
      additional <- additional[1]
      additionalS <- dataS[[additional]]
      p <- c(alpha[1], 1-alpha[2])
      bounds <- quantileWt(additionalS, dataS[[weight]], p)
      select <- additionalS > bounds[1] & additionalS < bounds[2]
      dataSample <- dataS[select, , drop=FALSE]
      # check if all relevant levels of predictor variables are still
      # contained in sample after trimming
      # if not, trimming is not applied and a warning message is generated
      check <- unlist(sapply(predNames, function(i) {
        table(dataS[[i]]) > 0 & table(dataS[[i]]) == 0
      }))
      if ( any(check) ) {
        dataSample <- dataS
        warning("trimming could not be applied\n")
      }
    } else {
      dataSample <- dataS
    }

    ## fit linear model
    # formula for linear model
    if ( log ) {
      fname <- paste("log(", additional, if(!is.null(const)) " + const", ")", sep = "")
    } else {
      fname <- additional
    }
    fstring <- paste0(fname, " ~ ", tail(unlist(strsplit(as.character(estimationModel),"~")),1))
    formula <- as.formula(fstring)
    # auxiliary model for all strata (used in case of empty combinations)
    weights <- dataSample[[weight]]
    if(useLm){
      # TODO{Sironimo}: fix Fehler in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : 
      # contrasts can be applied only to factors with 2 or more levels
      # Question{Sironimo}: use of mod and coef? -> Not used at the moment, in my opinion
      mod <- lm(formula, weights=weights, data=dataSample,x=FALSE,y=FALSE,model=FALSE)
      coef <- coef(mod)
    }else if(usePoisson){
      mod <- glm(formula, weights=weights, data=dataSample,family=poisson(),model=FALSE,x=FALSE,y=FALSE)
      coef <- coef(mod)
    }


    # simulate values
    params <- list()
    params$coef <- coef
    if(useLm){
      params$command <- paste("lm(", fstring,", weights=", weight, ", data=dataSample,x=FALSE,y=FALSE,model=FALSE)", sep="")
    }else if(usePoisson){
      params$command <- paste("glm(", fstring,", weights=", weight, ", data=dataSample,family=poisson(),model=FALSE,x=FALSE,y=FALSE)", sep="")
    }else if(useXgboost){
      
      # simulation via xgboost
      if(verbose) cat("we are running xgboost:\n")
      
      # set xgb verbose level
      if(verbose){
        xgb_verbose <- 1
      }else{
        xgb_verbose <- 0
      }
      
      # set nr_cpus=1 for xgboost
      if(verbose){
        cat("\n Setting nr_cpus=1 when using xgboost\n")
      }
      nr_cpus <- 1
      
      if(TRUE){
        xgb_weight <- paste0(", info = list(\"weight\" = as.numeric(dataSample$", weight, "))")
      }else{
        xgb_weight <- ""
      }
      
      if( log ){
        log_transform <- "log"
      }else{
        log_transform <- ""
      }

      pred_names <- paste(predNames[predNames != strata], collapse = "\",\"")
      train <- paste0("xgb.DMatrix(data = model.matrix(~.+0,data = model.frame(setDT(dataSample)[,c(\"",pred_names,"\"), with=F],  na.action=na.pass)),
                                         missing = NA, label = ", log_transform, "(dataSample$",additional,")
                                        ", xgb_weight,")")
      
      # Default values
      nrounds <- 100
      early_stopping_rounds <- 10
      xgb_hyper_params <- "list(nthread = 6,
                                eta = 0.1,
                                max_depth = 32,
                                min_child_weight = 0,
                                gamma = 0,
                                subsample = 1,
                                lambda = 0,
                                objective = \"reg:squarederror\",
                                eval_metric = \"rmse\")"
      
      if(!is.null(model_params)){
        
        xgb_hyper_params <- "params$model_params"
        
        if(!is.null(optional_params$nrounds)){
          nrounds <- optional_params$nrounds
        }
        
        if(!is.null(optional_params$early_stopping_rounds)){
          early_stopping_rounds <- optional_params$early_stopping_rounds
        }
      }
      
      xgb_params <- paste0("nrounds = ", nrounds,",
                            watchlist = list(train = ", train, ",
                                             test = ", train, "), 
                            early_stopping_rounds = ", early_stopping_rounds,",
                            print_every_n = 10,")
      
      command <- paste0("xgb.train(",train, ", ",
                                    xgb_params,
                                    "verbose = ", xgb_verbose, ", ",
                                    "params = ", xgb_hyper_params, ")")
      
      params$command <- command
    }
    
    #params$name <- fname
    params$name <- additional
    params$excludeLevels <- excludeLevels
    params$hasNewLevels <- hasNewLevels
    params$newLevels <- newLevels
    params$predNames <- predNames
    params$additional <- c(additional, weight)
    params$const <- const
    params$formula <- formula
    params$residuals <- residuals
    params$log <- log
    params$strata <- strata
    params$nr_cpus <- nr_cpus
    params$indStrata <- indStrata
    params$predNames <- predNames
    params$verbose <- verbose
    params$by <- by
    params$model_params <- model_params
    if(useLm){
      valuesTmp <- runModel(dataSample, dataPop, params, typ="lm")
    }else if(usePoisson){
      valuesTmp <- runModel(dataSample, dataPop, params, typ="poisson")
    }else if(useXgboost){
      valuesTmp <- runModel(dataSample, dataPop, params, typ="xgboost")
    }

    ## put simulated values together
    if ( useMultinom ) {
      values[which(indP == 1)] <- valuesTmp
    } else {
      if ( useLogit ) {
        if ( log && is.null(const) && haveNeg ) {
          # only one category for non-positive values (two breakpoints, one of them is 0)
          values <- rep.int(NA, N)
          values[indP] <- runif(sum(indP), breaks[1], breaks[2])
        } else {
          values <- ifelse(is.na(indP), NA, 0) # only zeros
        }
        values[indP] <- valuesTmp
      } else {
        values <- valuesTmp
      }
    }
  }

  # reset imputed variables in sample
  if ( imputeMissings ) {
    for ( i in 1:ncol(dataS_orig)) {
      cmd <- paste0("dataS[,",colnames(dataS_orig)[i],":=dataS_orig$",colnames(dataS_orig)[i],"]")
      eval(parse(text=cmd))
    }
  }

  # attach new variable(s) to population data
  if ( useMultinom && keep ) {
    dataP[[name]] <- valuesCat
  }

  # calculate mean of new variable by household
  if ( !is.null(byHousehold) ) {
    xx <- data.table(id=1:length(values), hhid=dataP[[pop@hhid]], vals=values)
    setkey(xx, hhid)

    if ( byHousehold=="mean" ) {
      yy <- xx[,mean(vals, na.rm=TRUE), by=key(xx)]
    }
    if ( byHousehold=="sum" ) {
      yy <- xx[,sum(vals, na.rm=TRUE), by=key(xx)]
    }
    if ( byHousehold=="random" ) {
      zz <- xx[,.N,by=key(xx)]
      ids1 <- zz[N==1]
      yy <- xx[hhid%in%ids1$hhid]
      yy[,id:=NULL]
      ids2 <- zz[N>1]
      if ( nrow(ids2) > 0 ) {
        xx2 <- xx[hhid %in% ids2$hhid]
        xx2[,randId:=sample(1:nrow(xx2))]
        setkey(xx2, hhid, randId)
        setkey(xx2, hhid)
        yy2 <- unique(xx2)
        yy2[,c("id","randId"):=NULL]
        yy <- rbind(yy, yy2)
      }
      setkey(yy, hhid)
      yy[,V1:=vals]
      yy[,vals:=NULL]
    }
    xx <- merge(xx, yy, all.x=TRUE)
    setkey(xx, id)
    xx[is.nan(V1), V1:=NA]
    values <- xx$V1
  }

  # return simulated data
  dataP[[additional]] <- values
  simPopObj@pop@data <- dataP
  invisible(simPopObj)
}
statistikat/simPop documentation built on March 24, 2024, 5:05 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
statistikat/simPop
Simulation of Complex Synthetic Data Information

R/simContinuous.R
In statistikat/simPop: Simulation of Complex Synthetic Data Information

Defines functions simContinuous runModel genVals generateValues_xgboost generateValues_binary generateValues_poisson generateValues_lm generateValues_multinom

Documented in simContinuous

R Package Documentation

Browse R Packages

We want your feedback!

statistikat/simPop Simulation of Complex Synthetic Data Information

R/simContinuous.R In statistikat/simPop: Simulation of Complex Synthetic Data Information

Defines functions simContinuous runModel genVals generateValues_xgboost generateValues_binary generateValues_poisson generateValues_lm generateValues_multinom

Documented in simContinuous

R Package Documentation

Browse R Packages

We want your feedback!

statistikat/simPop
Simulation of Complex Synthetic Data Information

R/simContinuous.R
In statistikat/simPop: Simulation of Complex Synthetic Data Information