R/utilities.R

Defines functions form_to_data data_filter REdat FEdat RHSrandom RHSfixed getLHS getRHS

# Formula Manipulation ----

# Extract RHS of formula, also use for RHS of bar (e.g. 1 + x | u:v returns u:v)
getRHS <- function(formula) {
  RHS <- formula[[3]]
  return(RHS)
}

# Extract LHS of formula, also use for LHS of bar (in ex above returns 1 + x)
getLHS <- function(formula) {
  LHS <- formula[[2]]
  return(LHS)
}

# Recursive function for extracting fixed portion of RHS/formula, based on lme4 source code
RHSfixed <- function(RHS) {

  if (!("|" %in% all.names(RHS))) return(RHS) #if no bars, return RHS as-is
  if (RHS[[1]] == as.name("|")) return(NULL) #if it's an RE term, return as NULL

  if (length(RHS) == 2) {
    temp <- RHSfixed(RHS[[2]])
    if (is.null(temp)) return(NULL)
    RHS[[2]] <- temp
    return(RHS)
  }

  rhs2 <- RHSfixed(RHS[[2]])
  rhs3 <- RHSfixed(RHS[[3]])
  if (is.null(rhs2)) return(rhs3)
  if (is.null(rhs3)) return(rhs2)
  RHS[[2]] <- rhs2
  RHS[[3]] <- rhs3

  return(RHS)

}

# Recursive function for extract RHS random intercept terms, based on lme4 source code. ONLY works on RHS
RHSrandom <- function(RHS) {

  if (is.name(RHS)) return(NULL)
  if (RHS[[1]] == as.name("(")) return(RHSrandom(RHS[[2]]))
  if (RHS[[1]] == as.name("|")) return(RHS)
  return(c(RHSrandom(RHS[[2]]), RHSrandom(RHS[[3]])))

}




# Data preparation ----

# Function for preparing FE component and DV
FEdat <- function(data, y=NULL, fe_formula) {

  mf <- stats::model.frame(fe_formula, data) #need to add na.action at some point - understand how it intersects with missingness in RE groups
  X <- stats::model.matrix(fe_formula, mf) #need to add contrasts at some point - also work out offsets and weights

  if (is.null(y)) {
    temp <- stats::model.response(mf)
    n <- length(temp)
    if (!is.factor(temp)) temp <- as.factor(temp)
    lev <- levels(temp)
    M <- length(lev)
    y <- matrix(integer(n*M), nrow=n, ncol=M)
    for (m in 1:M) {
      y[,m] <- ifelse(temp == lev[m], 1L, 0L)
    }
    colnames(y) <- lev
  } else {
    y <- y[attr(mf, "na.action"), ] #need to understand how this will change for different na.actions
    ymissing <- apply(y, 1, function(x) any(is.na(x)))
    y <- y[ymissing,]
    mf <- mf[ymissing,] # this will break the use of attr(mf, "na.action")
    X <- X[ymissing,]
  }

  out <- list(mf, X, y)
  names(out) <- c("mf", "X", "y")
  return(out)

}

# Function for generating data blocks for REs
# - for now random intercepts only
# - next will add cross-classified REs
# - random slopes (correlated and uncorrelated) a long way off
REdat <- function(data, re_terms) {

  # Currently assuming only random intercepts, so not processing LHS atm

  # Extract RHS, convert to formula
  groupNames <- as.character(sapply(re_terms, logitPlus:::getRHS))
  groups <- paste0(groupNames, collapse=" + " )

  # Convert desired variables to model frame
  ref <- stats::model.frame(as.formula(paste0("~ ", groups)), data)
  n <- nrow(ref)

  # Column by column convert to block matrix
  A <- ncol(ref) # will need to come back to this once nesting and slopes are added
  Z <- vector("list", A)
  for (i in 1:A) {
    vec <- ref[[i]]
    if (!is.factor(vec)) vec <- as.factor(vec)
    lev <- levels(vec)
    nlev <- length(lev)
    Z[[i]] <- matrix(integer(n * nlev), nrow=n, ncol=nlev)
    colnames(Z[[i]]) <- lev
    rownames(Z[[i]]) <- rownames(ref)
    for (j in 1:nlev) {
      Z[[i]][,j] <- ifelse(vec == lev[j], 1L, 0L)
    }
  }

  names(Z) <- groupNames
  out <- list(ref, Z)
  names(out) <- c("ref", "Z")
  return(out) # in the future this will need to return meta-info such as whether it's an intercept or slope, etc

}

# Function for finalising missingness relations - need to come back to this once using na.action
data_filter <- function(fe_data, re_data) {

  feindex <- rownames(fe_data$mf) %in% rownames(re_data$ref)
  reindex <- rownames(re_data$ref) %in% rownames(fe_data$mf)

  fe_data$mf <- fe_data$mf[feindex,]
  fe_data$X <- fe_data$X[feindex,]
  fe_data$y <- fe_data$y[feindex,]

  re_data$ref <- re_data$ref[reindex,]
  for (i in 1:length(re_data$Z)) re_data$Z[[i]] <- re_data$Z[[i]][reindex,]

  model_data <- cbind(fe_data$mf, re_data$ref)

  out <- list(fe_data, re_data, model_data)
  names(out) <- c("fe_data", "re_data", "model_data")
  return(out)

}




# Formula-data function ----

# Function that takes data, optional y matrix, formula, na.action, contrasts, weights, etc
# parses formula to return model frame(s) required for estimation using functions above
form_to_data <- function(data, y=NULL, formula) {

  fe_formula <- logitPlus:::RHSfixed(formula)
  re_terms <- logitPlus:::RHSrandom(logitPlus:::getRHS(formula))

  fe_data <- logitPlus:::FEdat(data, y, fe_formula)
  has_REs <- !is.null(re_terms)
  if (has_REs) re_data <- logitPlus:::REdat(data, re_terms)
  if (has_REs) {
    mdata <- logitPlus:::data_filter(fe_data, re_data)
    fe_data <- mdata$fe_data
    re_data <- mdata$re_data
    model_data <- mdata$model_data
    y <- fe_data$y
    X <- fe_data$X
    Z <- re_data$Z
  } else {
    model_data <- fe_data$mf
    y <- fe_data$y
    X <- fe_data$X
  }

  out <- list(model_data, y, X)
  if (has_REs) out <- append(out, list(Z))
  onames <- c("model_data", "y", "X")
  names(out) <- if (has_REs) c(onames, "Z") else onames
  return(out)

}
philswatton/logitPlus documentation built on March 19, 2022, 7:16 a.m.