mglmn: Model Averaging for Multivariate GLMM with Null Models

Documented in mamgllvm

#' Model averaging for multivariate generalized linear latent variable models
#'
#' Model averaging for multivariate GLLVM based on information theory.
#'
#' @name mamgllvm
#'
#' @export
#' @param data Data frame, typically of environmental variables. Rows for sites and colmuns for environmental variables.
#' @param y Name of 'mvabund' object (character)
#' @param family the 'family' object used.
#' @param scale Whether to scale independent variables (default = TRUE)
#' @param AIC.restricted Whether to use AICc (TRUE) or AIC (FALSE) (default = TRUE).
#' @return A list of results
#' @return \item{res.table }{data frame with "AIC", AIC of the model, "log.L", log-likelihood of the model, "delta.aic", AIC difference to the best model, "wAIC", weighted AIC to the model, "n.vars", number of variables in the model, and each term.}
#' @return \item{importance }{vector of relative importance value of each term, caluclated as as um of the weighted AIC over all of the model in whith the term aperars.}
#' @return \item{family }{the 'family' object used.}

#' @references 
#' Burnham, K.P. & Anderson, D.R. (2002) Model selection and multi-model inference: a practical information-theoretic approach. Springer Verlag, New York.
#'
#' Niku, J., Warton,  D. I., Hui, F. K. C., and Taskinen, S. (2017). Generalized linear latent variable models for multivariate count and biomass data in ecology. Journal of Agricultural, Biological, and Environmental Statistics, 22:498-522.
#'
#' Niku, J., Brooks, W., Herliansyah, R., Hui, F. K. C., Taskinen, S., and Warton,  D. I. (2018). Efficient estimation of generalized linear latent variable models. PLoS One, 14(5):1-20.
#'
#' Warton, D. I., Guillaume Blanchet, F., O'Hara, R. B., Ovaskainen, O., Taskinen, S., Walker, S. C. and Hui, F. K. C. (2015). So many variables: Joint modeling in community ecology. Trends in Ecology & Evolution, 30:766-779.
#'
#' @seealso\code{\link{maglm}}, \code{\link{ses.maglm}}, \code{\link{ses.mamglm}}
#' @examples
#' #load species composition and environmental data
#' library(mvabund)
#' data(capcay)
#' #use a subset of data in this example to reduce run time
#' env_assem <- capcay$env_assem[, 1:2]
#' freq.abs <- mvabund(log(capcay$abund + 1))
#'
#' #to fit a gaussian regression model to frequency data:
#' mamgllvm(data = env_assem, y = "freq.abs", family = "gaussian")
#'
#' #to fit a binomial regression model to presence/absence data"
#' pre.abs0 <- capcay$abund
#' pre.abs0[pre.abs0 > 0] = 1
#' pre.abs <- mvabund(pre.abs0)
#'
#' mamgllvm(data = env_assem, y = "pre.abs", family = "binomial")
mamgllvm <- function(data, y, family, scale = TRUE, AIC.restricted = FALSE){
  if (scale) data <- as.data.frame(scale(data))
    my.vars <- colnames(data)
    n.vars <- length(my.vars)
    vars.list <- list()
    for (i in 1:n.vars){
    vars.list[[i]] <- combn(my.vars, i)
  }

#numbers of combination for each sample size
  temp <- sapply(vars.list, ncol)

#AIC for each model
  model.aic <- NULL
  log.L <- NULL
  vars <- list()
  k <- 0
  vars2 <- matrix(numeric(n.vars * sum(temp)), nrow = sum(temp), ncol = n.vars)

#number of paremeter = i+1 (one means intercept)
#sample size = nrow(data)
#AICc = -2*logLike + 2*n.para*n.sample/(n.sample-n.para-1)
  for (i in 1:n.vars){
    for (j in 1:temp[i]){
      k <- k + 1
      vars.temp <- vars.list[[i]][, j]
      vars[[k]] <- vars.temp
      #f.str <- make.formula(get(y), vars.temp)
     # f.str <- make.formula(y, vars.temp)
    # data2 <- data[, vars.temp]
      data2 <- as.data.frame(data[, vars.temp])
      colnames(data2) <- vars.temp

    #  message("y")
    #  get(y) %>% print
    #  message("data2")
    #  data2 %>% print
    #  message("family")
    #  print(family)

      fit_fun <- function(){
        gllvm(
          y = get(y),
          X = data2, 
          family = family,
          starting.val = "res")
      }

      fit.temp <- NULL
      fit.temp <- try(fit_fun(), silent = FALSE)

      if (class(fit.temp) == "try-error") {
        message("Use starting.val = 'zero' instead of 'res'")
        fit.temp <- gllvm(
          y = get(y),
          X = data2, 
          family = family,
          starting.val = "zero")
      }

      fit.summary <- summary(fit.temp)

      log.L.temp <- fit.summary$`log-likelihood`
      log.L <- c(log.L, log.L.temp)

      if (AIC.restricted){
        aic.restricted <- fit.summary$AICc
        model.aic <- c(model.aic, aic.restricted)
      }
      else{
        aic.unrestricted <- fit.summary$AIC
        model.aic <- c(model.aic, aic.unrestricted)
      }
    }
  }

  min.aic <- min(model.aic)
  delta.aic <- model.aic - min.aic
  wAIC <- exp(-delta.aic / 2) / sum(exp(-delta.aic / 2))

  res <- data.frame(AIC = model.aic, log.L = log.L, delta.aic, wAIC, n.vars = rep(1:n.vars, temp))

##counting vars
#vars2 is matrix filled with 0 (row:sites,col:parameters)
  vars2 <- matrix(numeric(n.vars * sum(temp)), nrow = sum(temp), ncol = n.vars)
  colnames(vars2) <- my.vars
  n.size <- rep(1:n.vars, temp) # number of paremters for each row
  for (i in 1:nrow(vars2)){ # each row (sample)
    for (j in 1:n.vars){ # each column (paramter)
      for (k in 1:n.size[i]){ # upto number of paramters
        if (colnames(vars2)[j] == vars[[i]][k]) vars2[i,j] <- 1
      }
    }
  }

# for exmaple, i=100
# > vars[[100]]
# [1] "LOG..x.1..Exotic.plant.rich"
# [2] "P.mega.Is.p.a"
# [3] "Tot.rain.during.sampling.log..x.0.01."
# these three paramters were used in the 100th analysis
#
# if jth colnames of vars2, which is a parameter name, is identical to vars[[i]][k], which is a paramter name used in the analysis, vars2[i,j] is replaced by 1

  res <- cbind(res, vars2)
  res <- res[order(res$AIC), ]

  rownames(res) <- NULL

#calculating weighted result of explanable variables
  res.temp <- res[, -1:-5]
  res2 <- apply(apply(res.temp, 2,
                      function(x)res$wAIC * x), 2, sum)
  list(res.table = res, importance = res2, family = family)
}