R/hmi_smallfunctions_2016-07-14.R

Defines functions hmi_pool get_type sample_imp update.sigma.alpha.coef update.sigma.y.coef update.beta.coef update.alpha.coef calc

Documented in calc get_type hmi_pool sample_imp update.alpha.coef update.beta.coef update.sigma.alpha.coef update.sigma.y.coef

##############################
###Gibbs sampler functions####
##############################


####Gibbs sampler for general random coefficients models

#' helper function to be called in \code{update.alpha.coef}.
#'
#' It (basically) calculates (Z'Z + sigma.e * sigma.b^-1)^-1 * z' resid.
#' @param x A data.frame which is basically cbind(Z_obs, y_obs - X_obs * beta.new).
#' @param sigma.y.new The numeric residual variance.
#' @param sigma.alpha.new.inv The inverse random effects covariance matrix.
#' @return It returns (for this given cluster) a (cluster specific) random intercept and random slope.
calc <- function(x, sigma.y.new, sigma.alpha.new.inv){
  solve(t(x[, 1:(ncol(x) - 1)]) %*% as.matrix(x[, 1:(ncol(x) - 1)]) +
          sigma.y.new^2 * sigma.alpha.new.inv) %*% t(x[, 1:(ncol(x) - 1)]) %*% x[, ncol(x)]
}


#' update.alpha.coef
#'
#' The function updates the cluster specific random effects
#' by drawing from a multivariate normal distribution
#' (with a unique distribution for every cluster).
#' @param Z_obs The random effects data matrix of those observations with an observed value of y.
#' @param y_obs The target variable of those observations with an observed value of y.
#' @param X_obs The fixed effects data matrix of those observations with an observed value of y.
#' @param beta.new The vector of fixed effects parameters.
#' @param clID_obs The cluster ID vector of those observations with an observed value of y.
#' @param sigma.y.new The numeric residual variance.
#' @param sigma.alpha.new.inv The inverse random effects covariance matrix.
#' @return It returns cluster specific random effects.
update.alpha.coef <- function(Z_obs, y_obs, X_obs, beta.new, clID_obs, sigma.y.new, sigma.alpha.new.inv){

  # -- calculate cluster specific random effects
  tmp <- cbind(Z_obs, y_obs - X_obs %*% beta.new)
  alpha.hat <- by(tmp, clID_obs, calc,
                  sigma.y.new = sigma.y.new, sigma.alpha.new.inv = sigma.alpha.new.inv)

  rm(tmp)

  # -- calculate cluster specific random effects covariance matrices.
  var.alpha.hat <- by(Z_obs, clID_obs,
                      function(x) sigma.y.new^2 *
                        solve(t(x) %*% as.matrix(x) + sigma.y.new^2 * sigma.alpha.new.inv))

  # -- draw for each cluster new random effects.
  alpha.new <- t(mapply(MASS::mvrnorm, mu = alpha.hat, Sigma = var.alpha.hat, n = 1))
  return(alpha.new)
}



#' update.beta.coef
#'
#' The functions updates the fixed effects (imputation) parameters.
#' By drawing them from their normal distribution.
#' @param Z_obs A data.frame with the random effects variables.
#' @param alpha.new A matrix (with the dimension: number of clusters times number of random effect variables)
#' of cluster specific random effects.
#' @param b.hat.part1 The (X'X)^{-1} X' matrix.
#' @param clID_obs The cluster ID vector of those observations with an observed value of y.
#' @param y_obs The target variable of those observations with an observed value of y.
#' @param sigma.y.new The residual variance (positive numeric).
#' @param xtx The (X'X)^{-1} matrix.
#' @return It returns a vector of new fixed effects parameters.
update.beta.coef <- function(Z_obs, alpha.new, b.hat.part1, clID_obs, y_obs, sigma.y.new, xtx){
  Z.alpha <- rowSums(Z_obs * alpha.new[clID_obs,])
  beta.hat <- b.hat.part1 %*% (y_obs - Z.alpha)
  Sigma.hat <- sigma.y.new^2 * xtx
  newbeta <- MASS::mvrnorm(n = 1, mu = beta.hat, Sigma = Sigma.hat)
  return(newbeta)
}

#' update.sigma.y.coef
#'
#' The function updates the residual variance parameter by drawing from a chisquared distribution.
#' @param y_obs The target variable of those observations with an observed value of y.
#' @param X_obs The fixed effects data matrix of those observations with an observed value of y.
#' @param beta.new The vector of fixed effects parameters.
#' @param Z_obs The random effects data matrix of those observations with an observed value of y.
#' @param alpha.new The matrix of cluster specific random effects.
#' @param clID_obs The cluster ID vector of those observations with an observed value of y.
#' @param n.obs The number of individuals with an observed y.
#' @return The numeric residual variance.
update.sigma.y.coef <- function(y_obs, X_obs, beta.new, Z_obs, alpha.new, clID_obs, n.obs){
  resid <- y_obs - X_obs %*% beta.new - rowSums(Z_obs * alpha.new[clID_obs, ])
  sigma.y.new <- sqrt(sum(resid^2)/rchisq(1, n.obs - 1))
  return(sigma.y.new)
}

#' update.sigma.alpha.coef
#'
#' It updates the random effects covarariance matrix by drawing from a Wishart distribution.
#' @param hyper.df1 A vector of numeric hyper parameters
#' @param hyper.df2 A matrix of hyper parameters.
#' @param alpha.new A matrix of cluster specific random effects.
#' @return It returns a matrix with a new random effects covarariance matrix.
update.sigma.alpha.coef <- function(hyper.df1, hyper.df2, alpha.new){
  hyper.Sigma <- solve(t(alpha.new) %*% alpha.new + hyper.df2)
  new.sigma.alpha <- solve(rWishart(1, df = hyper.df1, Sigma = hyper.Sigma)[,,1])
  return(new.sigma.alpha)
}


#help function to do a sample imputation
#' Sample imputation.
#'
#' Function to sample values in a variable from other (observed) values in this variable.
#' So this imputation doesn't use further covariates.
#' @param variable A vector of size \code{n} with missing values.
#' @return A vector of size \code{n} without missing values.
#' @examples
#' sample_imp(c(1, NA, 3, NA, 5))
sample_imp <- function(variable){
  #!!!Es waere sinnvoll hier schon Nebenbedingungen zuzulassen!!!
  ret <- variable
  ret[is.na(ret)] <- sample(size = sum(is.na(variable)),
                             variable[!is.na(variable)], replace = TRUE)
  return(ret)
}


#TODO: Fallunterscheidungen um mit Vectoren, Matrizen oder data.frames umzugehen!!!
#Variable sollte ein Vektor oder factor sein

#' Get the type of variables.
#'
#' Function checks wether a variable is: ...
#' \itemize{
#'  \item continuous (just many numeric values),
#'  \item semicontinuous (many numeric values but more than 5% of the data are 0),
#'  \item a intercept (just the same value for all observations),
#'  \item binary (just two different values - like 0s and 1s or "m" and "f"),
#'  \item categorical (the variable is a factor or has more than 3 different values)
#'  \item orderd categorical (the categorical variable is ordered.)
#'}
#'
#' @param variable A variable from your dataset.
#' @return A character denoting the type of \code{variable}.
get_type <- function(variable){

  if(length(table(variable)) == 1){
    type <- "intercept"
    return(type)
  }

  if(length(table(variable)) == 2){
    type <- "binary"
    #!!!ggfs muessen binaere Variablen noch in 0 und 1 umgewandelt werden.
    #Bsp. wenn die Geschlechtervariable ist bis dahin "m" und "w" ist.
    return(type)
  }

  if(is.vector(variable) || is.factor(variable) || is.array(variable)){
    if(is.numeric(variable)){
      type <- "cont"

      #if the variable is numeric and more than 5% of the variable values are 0,
      #I count this variable as semi-continious.
      #???FROM WHICH RATE ON, ONE CAN SAY, THAT THE VARIABLE IS SEMI-CONT AND NOT
      #ONLY CONT? 1%, 10%, 50%???
      #!!!problematische Zahl frei w?hlbar!!!
      if(sum(variable == 0, na.rm = TRUE)/
         length(variable[!is.na(variable)]) > 0.05){
        type <- "semicont"
      }
      return(type)

    }

    #checking the length of the table to be larger than two is not sufficient
    #to get a real categoriacal variable, because for a (real) continious variable
    #the length of the table is equal to the length of the variable
    #(what will be in general larger than two...)
    #Therefore it will be checked if the table has more than two entries AND
    #wheter the varriable is a factor.
    if(is.factor(variable) || length(table(variable)) > 2){
      type <- "categorical"
      if(is.ordered(variable)){
        type <- "ordered_categorical"
      }

      return(type)
    }

  }else{
    #MS: Wenn es kein Vector oder kein Factor ist, gehe ich davon aus,
    #dass es eine Matrix oder ein data.frame ist.
    #print("R ist komisch!")
    #MS: d.h. ich schliesse aus, dass ein eine Liste mit Datensaetzen etc. ist.
    #MS: Fallunterscheidungen um mit Vectoren, Matrizen oder data.frames umzugehen:
    if(ncol(variable) == 1){
      ret <- get_type(variable[, 1])
      return(ret)
    }
    if(ncol(variable) > 1){
      if(is.matrix(variable)){
        variable <- as.data.frame(variable)
      }
      ret <- unlist(lapply(variable, get_type))
      #MS:?!? Fuer data.frames brauche ich lapply, fuer matrizen apply.
      return(ret)
    }
  }
}



#' Pool the results of the imputation function \code{wrapper}.
#'
#' This function applies the analysis the user is interested in, on all different imputed dataset.
#' Then the results are pooled by simply averaging the results. So the user has to make sure that
#' his analysis produces results with a meaningful average. And furthermore has to accept that no
#' variance is calculated for these parameters.
#' @param imputed_data A \code{mids} object from the \code{wrapper} imputation function.
#' @param analysis A user generated function that he is interested in. See examples.
#' @return A vector with all pooled results.
hmi_pool <- function(imputed_data, analysis){

  if (!is.mids(imputed_data)) stop("The data must have class mids.")
  analyses <- as.list(1:imputed_data$m)
  for (i in 1:imputed_data$m) {
    data.i <- complete(imputed_data, i)
    analyses[[i]] <- analysis(data.i)
  }

  tmp <- simplify2array(analyses)
  mode(tmp) <- "numeric"
  return(rowMeans(tmp))
}
matthiasspeidel/hmi documentation built on Aug. 18, 2020, 4:37 p.m.