causl: Methods for Specifying, Simulating from and Fitting Causal Models

Documented in check_rej

##' Process formulas, families and parameters
##'
##' @inheritParams rfrugalParam
##' @param kwd keyword for copula
## @param ordering logical: should an ordering of variables be computed?
## @param ... dots from rfrugalParam
##'
##' @details Function that processes and checks the validity of the main arguments
##' used for simulating data.  The `control` argument only requires values useful
##' for obtatining the empirical (conditional) quantiles of pre-specified data.
##'
process_inputs <- function (formulas, family, pars, link, dat, kwd, method="inversion", control=list()) {

  ## process univariate formulas and obtain dimensions of model
  formulas <- process_formulas(formulas)
  ## check ordering is sound

  ord <- var_order(formulas, method=method)
  dims <- lengths(formulas)

  ## obtain variable names
  LHS_Z <- lhs(formulas[[1]])
  LHS_X <- lhs(formulas[[2]])
  LHS_Y <- lhs(formulas[[3]])

  ## process family variable inputs
  family <- process_family(family=family, dims=dims)

  ### change to use the names supplied by each family

  ## useful variable summaries
  # output <- c(LHS_Z, LHS_Y)
  vars <- c(LHS_Z, LHS_X, LHS_Y)
  # if (!is.null(dat)) avars <- c(names(dat), vars)
  if ((is.null(dat) && any(duplicated(vars))) ||
      any(duplicated(c(names(dat), vars)))) stop("Repeated variable names not allowed")
  LHSs <- list(LHS_Z, LHS_X, LHS_Y)

  ## check univariate parameter values are appropriate
  if (!missing(pars)) {
    dummy_dat <- gen_dummy_dat(family=family, pars=pars, dat=dat, LHSs=LHSs, dims=dims)
    pars <- check_pars(formulas=formulas, family=family, pars=pars, dummy_dat=dummy_dat,
                       LHSs=LHSs, kwd=kwd, dims=dims)$pars
  }
  else {
    message("Parameters missing, treating as a model only for fitting")
    pars <- NULL
  }

  ## different approaches based on method selected
  if (method == "rejection") pars <- check_rej(formulas=formulas, family=family, pars=pars, dims=dims, kwd=kwd)$pars
  else {

    ## obtain empirical quantiles from any plasmode variables that will appear in copula
    if (!is.null(dat)) {
      n <- nrow(dat)

      wh_q <- setdiff(unlist(rhs_vars(formulas[[2]])),
                      c(unlist(rhs_vars(formulas[[3]])), vars))
      quantiles <- process_prespecified(dat, prespec = wh_q, cond = control$pm_cond,
                                        nlevs = control$pm_nlevs,
                                        cor_thresh = control$pm_cor_thresh,
                                        tol = control$quan_tol)
    }
    else wh_q <- character(0)

    # nC <- length(wh_q)

    ## put some validation code in here for inversion method

    if (method == "inversion") {
      tmp <- pair_copula_setup(formulas=formulas[[4]], family=family[[4]],
                               pars=pars[[kwd]], LHSs=LHSs, quans=wh_q, ord=ord)
      formulas[[4]] <- tmp$formulas
      family[[4]] <- tmp$family
      pars[[kwd]] <- tmp$pars
    }
    else if (method == "inversion_mv") {
      if (length(formulas[[4]]) != 1) stop("Should only be one formula for multivariate copula")
      if (length(family[[4]]) != 1) stop("Should only be one family for multivariate copula")
    }
    else stop("'method' should be \"inversion\", \"inversion_mv\" or \"rejection\"")
  }

  if (!exists("quantiles")) quantiles <- NULL

  ## set up link functions
  link <- link_setup(link, family[1:3], vars=LHSs)

  caus_mod <- list(formulas=formulas, family=family, pars=pars, link=link,
                   dat=dat, LHSs=list(LHS_Z=LHS_Z, LHS_X=LHS_X, LHS_Y=LHS_Y),
                   quantiles=quantiles, kwd=kwd, dim=dims[1:3], vars=vars, #output=output,
                   order=ord, method=method)
  class(caus_mod) <- "causl_model"

  return(caus_mod)
}

##' @inheritParams process_inputs
##' @param len number of formulas
##' @describeIn process_inputs Process input for family variables
process_formulas <- function (formulas, len=4) {
  ## check we have four groups of formulas
  if (length(formulas) != len) stop(paste0("Formulas must have length ", len))

  ## ensure all formulas are given as lists
  if (any(sapply(formulas, class) == "formula")) {
    wh <- which(sapply(formulas, class) == "formula")
    for (i in wh) formulas[[i]] <- list(formulas[[i]])
  }

  return(formulas)
}

##' @inheritParams gen_dummy_dat
##' @describeIn process_inputs Process input for family variables
##' @param func_return function to use to process character arguments
##'
##' @details
##' For `causl` we use the `get_family()` function to process character based
##' arguments, but we allow for other functions to be used in packages that
##' build on this one.
##'
##'
##' @export
process_family <- function (family, dims, func_return=get_family) {

  if (missing(family)) {
    ## assume everything is Gaussian
    family <- lapply(dims, rep.int, x=1)
    message("No families provided, so assuming all variables are Gaussian")
    return(family)
  }

  nU <- length(family) - 1
  if (length(family) != length(dims)) stop("Should be a family entry for each set of formulas")

  if (is.list(family)) {
    lens <- lengths(family)

    if (any(sapply(family, class) == "causl_family")) {
      wh <- which(sapply(family, class) == "causl_family")
      for (i in wh) {
        family[[i]] <- list(family[[i]])
        lens[i] <- 1
      }
    }
    if (!all(lens[seq_len(nU)] == dims[seq_len(nU)])) stop("Mismatch in family and formulae specifications")
  }
  else if (length(family) == nU+1) {
    if (sum(dims[seq_len(nU)]) > nU) stop("Mismatch in family and formulae specification")
    family <- as.list(family)
    lens <- rep(1, nU+1)
  }
  else stop(paste0("'family' should be a list, or vector of length ", nU+1))

  ## deal with family names and causl_fam functions
  for (i in seq_len(nU)) {
    if (lens[i] > 0) {
      if (is.character(family[[i]])) {
        fams <- unlist(family[[i]])
        family[[i]] <- lapply(fams, function(x) func_return(x)())
        # family[[i]] <- lapply(fams, function(x) get_family(x)())
      }
      else if (is.numeric(family[[i]])) {
        next
      }
      else if (is.list(family[[i]])) {
        if (all(rapply(family[[i]], is.character))) {
          family[[i]] <- unlist(family[[i]])
          if (length(family[[i]]) != lens[i]) stop("Incorrect number of families specified")
          family[[i]] <- lapply(family[[i]], function(x) func_return(x)())
        }
        else if (all(rapply(family[[i]], is.numeric))) {
          family[[i]] <- unlist(family[[i]])
          if (length(family[[i]]) != lens[i]) stop("Incorrect number of families specified")
        }
        else if (all(sapply(family[[i]], function(x) is(x, "causl_family")))) {
          next
        }
        else stop("Not a valid family specification")
      }
      else stop("Not a valid family specification")
    }
  }

  ## check copula families are valid
  # if (!all(unlist(family[1:3]) %in% family_vals$val)) stop("Invalid family specification")
  if (!all(unlist(family[[nU+1]]) %in% copula_vals$val)) stop("Invalid copula specification")

  return(family)
}

##' Obtain variable ordering from formulas
##'
##' @inheritParams gen_dummy_dat
##' @inheritParams process_inputs
##' @param inc_cop logical indicating whether to include copula in the ordering
##'
var_order <- function (formulas, dims, inc_cop=TRUE, method) {

  if (missing(dims)) dims <- lengths(formulas)
  nU <- length(dims) - 1

  ## get terms objects
  forms <- lapply(formulas[seq_len(nU)], function(x) lapply(x, terms))

  ord_mat <- matrix(0, nrow=sum(dims[seq_len(nU)]), ncol=sum(dims[seq_len(nU)]))

  LHSs <- lapply(formulas[seq_len(nU)], lhs)
  vars <- unlist(LHSs)

  ## get adjacency matrix of dependencies
  for (i in seq_along(forms)) if (dims[i] > 0) {
    cvs <- sum(dims[seq_len(i-1)]) + seq_len(dims[i])  # variables for this iteration
    ord_mat[cvs, ] <- 1*t(sapply(forms[[i]], function(x) vars %in% attr(x, "term.labels")))
  }

  if (inc_cop) {
    ## now include copula parameters
    LHS_Y <- LHSs[[3]]

    if (!is.null(formulas[[4]][LHS_Y])) {
      ## look for possible loops in copula formulae
      cop_frms <- formulas[[4]][LHS_Y]
      if (is.list(cop_frms) && length(cop_frms) > 0 && is.list(cop_frms[[1]])) {
        pred <- lapply(cop_frms, function (x) sapply(x, lhs))
        ord_mat[sum(dims[1:2]) + seq_len(dims[3]), ] <- ord_mat[sum(dims[1:2]) + seq_len(dims[3]), ] +
          1*t(sapply(pred, function(x) vars %in% x))
        ord_mat[] <- pmin(ord_mat, 1)
      }
    }

    ## add this information in
    ord_mat[sum(dims[1:2]) + seq_len(dims[3]), ] <- ord_mat[sum(dims[1:2]) + seq_len(dims[3]), ] +
      1*t(sapply(forms[[3]], function(x) vars %in% attr(x, "term.labels")))
  }

  ## get order for simulation, if one exists

  if (FALSE && method == "inversion_mv") {
    # break ties with Xs going first.
    # if there are uncoditional treatments put them first
    treats <- ord_mat[(dims[1] + 1) : (dims[1] + dims[2]), , drop = FALSE]
    nuisance <- ord_mat[1:dims[1], , drop = FALSE]
    marginal_treats <- apply(treats == 0, 1, all)
    if(any(marginal_treats)){
      which_treats <- which(marginal_treats) + dims[1]
      marginal_nuisance <- apply(nuisance == 0, 1, all)
      if(any(marginal_nuisance)){
        which_nuisance <- which(marginal_nuisance)
        ord_mat[which_nuisance, which_treats] <- 0.1
      }
    }
  }
  ord <- topOrd(ord_mat)

  if (any(is.na(ord))) stop("Formulae contain cyclic dependencies")

  return(ord)
}

##' @describeIn check_pars Checks for rejection sampling
##' @inheritParams process_inputs
##'
check_rej <- function(formulas, family, pars, dims, kwd) {
  if (any(unlist(lapply(family, is_categorical)))) stop("Categorical variables must be simulated using 'method=\"inversion\"'")

  LHS_Z <- lhs(formulas[[1]])
  LHS_X <- lhs(formulas[[2]])
  LHS_Y <- lhs(formulas[[3]])
  if (missing(dims)) dims <- lengths(formulas)

  ## more restrictive for the rejection sampling method
  if (any(unlist(lapply(rhs_vars(formulas[[3]]),
                        function(x) any(x %in% LHS_Z))))) stop("Covariates cannot be direct predictors for outcomes under rejection sampling")
  if (any(unlist(lapply(rhs_vars(formulas[[1]]),
                        function(x) any(x %in% LHS_Y))))) stop("Outcomes cannot be direct predictors for covariates under rejection sampling")
  if (any(unlist(lapply(rhs_vars(formulas[[3]]),  # could relax this
                        function(x) any(x %in% LHS_Y))))) stop("Outcomes cannot directly predict one another under rejection sampling")

  ## check if covariates predict each other
  if (dims[1] > 1) {
    idx <- lapply(rhs_vars(formulas[[1]]), function(x) na.omit(match(x,LHS_Z)))
    # idx <- lapply(formsZ,
    #               function(x) match(intersect(LHS_Z, attr(x, "term.labels")[attr(x, "order") == 1]), LHS_Z))
    ignr <- do.call(cbind, mapply(function(x,y) {
      if (length(x) > 0) rbind(x,y)
      else matrix(NA, 2, 0)
    }, idx, seq_along(LHS_Z)))
    if (length(ignr) > 0) {
      wh_swt <- ignr[1,] > ignr[2,]
      ignr[,wh_swt] <- ignr[2:1, wh_swt]
      in_form <- setmatch(apply(ignr, 2, c, simplify = FALSE),
                          combn(length(LHS_Z), 2, simplify = FALSE))
    }
    else in_form <- numeric(0)
  }
  else in_form <- numeric(0)

  order <- NULL

  ## ensure that copula parameters are in correct matrix format
  if (!is.matrix(pars[[kwd]]$beta)) {
    pars[[kwd]]$beta <- matrix(pars[[kwd]]$beta, ncol=choose(dims[1]+dims[3],2))
    form_cop <- terms(formulas[[4]][[1]])
    if (nrow(pars[[kwd]]$beta) != length(attr(form_cop, "term.labels"))+attr(form_cop, "intercept")) stop("Wrong number of regression parameters for copula")
  }
  if (length(in_form) > 0) {
    if (any(pars[[kwd]]$beta[,in_form] != 0)) {
      warning("Parameters in copula for objects already related by regression; setting copula parameters to zero")
      pars[[kwd]]$beta[,in_form] <- 0
    }
  }

  ## code to check if Y or Z is included in copula formula
  if (any(c(LHS_Z, LHS_Y) %in% unlist(rhs_vars(formulas[[4]])))) stop("copula cannot depend upon Z or Y variables")

  return(list(pars=pars))
}

##' Generate a dummy dataset
##'
##' Create a dummy dataset for the purpose of checking coefficient numbers
##'
##' @inheritParams process_inputs
##' @inheritParams rfrugalParam
##' @param LHSs left-hand sides from `formulas`
##' @param dims number of variables in each class
##'
##' @export
gen_dummy_dat <- function (family, pars, dat, LHSs, dims) {

  ## SHOULD MAKE THIS FUNCTION FOLLOW THE ORDER FOR SIMULATING VARIBLES

  if (missing(dims)) dims <- lengths(family)
  nU <- length(LHSs)

  ## produce dummy data.frame to check number of coefficients
  nv <- sum(dims[seq_len(nU)])
  if (!is.null(attr(LHSs, "T"))) {
    T <- attr(LHSs, "T")[1]
    strt <- attr(LHSs, "T")[2]
    if (is.na(strt)) strt <- 0

    nms <- LHSs[[1]]
    nms_t <- unlist(LHSs[2:4])
    nms <- c(nms, paste0(nms_t, "_", rep(seq_len(T)+strt-1, each=length(nms_t))))
    # nms <- setdiff(nms, names(dat))
  }
  else {
    nms <- unlist(LHSs)
  }
  nv <- length(nms)

  out <- as.data.frame(rep(list(NA), nv))
  names(out) <- nms

  if (!is.null(dat)) {
    out <- cbind(dat[1,,drop=FALSE], out)
  }

  ## generate dummy data for each variable to simulate
  for (i in seq_along(LHSs)) for (j in seq_len(dims[[i]])) {
    if (i > 1 && exists("strt")) {
      vnms <- paste0(LHSs[[i]][j], "_", seq_len(T)-1-strt)
      if (!is_categorical(family[[i]][j])) out[vnms] <- 0
      else out[vnms] <- factor(x=1L, levels=seq_len(pars[[LHSs[[i]][j]]]$nlevel))
    }
    else {
      if (!is_categorical(family[[i]][j])) out[[LHSs[[i]][j]]] <- 0
      else out[[LHSs[[i]][j]]] <- factor(x=1L, levels=seq_len(pars[[LHSs[[i]][j]]]$nlevel))
    }
  }

  return(out)
}

##' Check parameters for univariate families
##'
##' Checks existence of beta vectors and then assesses appropriate length
##'
##' @inheritParams process_inputs
##' @inheritParams gen_dummy_dat
##' @param dummy_dat a dummy dataset, as generated by `gen_dummy_dat()`
##'
##' @export
check_pars <- function (formulas, family, pars, dummy_dat, LHSs, kwd, dims) {

  if (missing(dims)) dims <- lengths(formulas)

  ## check that supplied regression parameters are sufficient
  nm_pars <- names(pars)

  wh_cop <- which(nm_pars == kwd)
  if (is.na(wh_cop)) stop("No parameters specified for copula")

  nms <- lapply(pars[-wh_cop], names)
  bpres <- sapply(nms, function(x) "beta" %in% x)

  if (!all(bpres)) {
    plur <- sum(!bpres) > 1
    stop(paste(ifelse(plur, "Variables", "Variable"),
               paste(names(pars)[!bpres], collapse=", "),
               ifelse(plur, "lack", "lacks"), "a beta parameter vector"))
  }

  ## check variable names
  vars <- unlist(LHSs)
  rep_pars <- match(vars, nm_pars, nomatch = 0L)
  if (any(rep_pars == 0)) {
    wh_nrep <- vars[rep_pars==0]
    stop(paste0("Variable ", paste(wh_nrep, collapse=", "), " not represented in the parameters list"))
  }


  for (j in seq_along(LHSs)) for (i in seq_along(formulas[[j]])) {
    mod_mat <- model.matrix(formulas[[j]][[i]], data=dummy_dat)
    # npar <- length(attr(formsZ[[i]], "term.labels")) + attr(formsZ[[i]], "intercept")
    npar <- ncol(mod_mat)
    if (is_categorical(family[[j]][i])) {
      npar <- npar*(pars[[LHSs[[j]][i]]]$nlevel - 1)
      if (!is.matrix(pars[[LHSs[[j]][i]]]$beta)) pars[[LHSs[[j]][i]]]$beta <- matrix(pars[[LHSs[[j]][i]]]$beta, nrow=ncol(mod_mat))
    }
    if (isTRUE(is.na(npar)) || length(npar) != 1) stop(paste0("Categorical variable '", LHSs[[j]][i], "' requires an 'nlevel' parameter"))
    if (length(pars[[LHSs[[j]][i]]]$beta) != npar) stop(paste0("dimension of model matrix for ", LHSs[[j]][i], " does not match number of coefficients provided"))
  }
  # for (i in seq_along(formulas[[2]])) {
  #   mod_mat <- model.matrix(formulas[[2]][[i]], data=dummy_dat)
  #   # npar <- length(attr(formsX[[i]], "term.labels")) + attr(formsX[[i]], "intercept")
  #   npar <- ncol(mod_mat)
  #   if (is_categorical(family[[2]][i])) {
  #     npar <- npar*(pars[[LHSs[[2]][i]]]$nlevel - 1)
  #     if (!is.matrix(pars[[LHSs[[2]][i]]]$beta)) pars[[LHSs[[2]][i]]]$beta <- matrix(pars[[LHSs[[2]][i]]]$beta, nrow=ncol(mod_mat))
  #   }
  #   if (isTRUE(is.na(npar)) || length(npar) != 1) stop(paste0("Categorical variable '", LHSs[[2]][i], "' requires an 'nlevel' parameter"))
  #   if (length(pars[[LHSs[[2]][i]]]$beta) != npar) stop(paste0("dimension of model matrix for '", LHSs[[2]][i], "' does not match number of coefficients provided"))
  # }
  # for (i in seq_along(formulas[[3]])) {
  #   mod_mat <- model.matrix(formulas[[3]][[i]], data=dummy_dat)
  #   # npar <- length(attr(formsY[[i]], "term.labels")) + attr(formsY[[i]], "intercept")
  #   npar <- ncol(mod_mat)
  #   if (is_categorical(family[[3]][i])) {
  #     npar <- npar*(pars[[LHSs[[3]][i]]]$nlevel - 1)
  #     if (!is.matrix(pars[[LHSs[[3]][i]]]$beta)) pars[[LHSs[[3]][i]]]$beta <- matrix(pars[[LHSs[[3]][i]]]$beta, nrow=ncol(mod_mat))
  #   }
  #   if (isTRUE(is.na(npar)) || length(npar) != 1) stop(paste0("Categorical variable '", LHSs[[3]][i], "' requires an 'nlevel' parameter"))
  #   if (length(pars[[LHSs[[3]][i]]]$beta) != npar) stop(paste0("dimension of model matrix for ", LHSs[[3]][i], " does not match number of coefficients provided"))
  # }


  ## check that families have necessary parameters
  for (j in seq_along(LHSs)) {
    if (is.numeric(family[[j]])) {
      for (i in seq_along(family[[j]])) {
        if (family[[j]][i] <= 4 && is.null(pars[[LHSs[[j]][i]]]$phi))
          stop(paste(LHSs[[j]][i], "needs a phi parameter"))
        if (family[[j]][i] == 2 && is.null(pars[[LHSs[[j]][i]]]$par2))
          stop(paste(LHSs[[j]][i], "needs a par2 parameter for degrees of freedom"))
        if (family[[j]][i] %in% 10:11 && is.null(pars[[LHSs[[j]][i]]]$nlevel))
          stop(paste(LHSs[[j]][i], "needs a parameter for number of categories"))
      }
    }
    else if (length(family[[j]]) > 0 && is(family[[j]][1], "causl_family")) {
      for (i in seq_along(family[[j]])) {
        if (!all(family[[j]][i]$pars %in% names(pars[[LHSs[[j]][i]]]))) {
          miss <- family[[j]][i]$pars[!family[[j]][i]$pars %in% names(pars[[LHSs[[j]][i]]])]
          stop(paste0("Parameters ", paste(miss, collapse=", "), " missing for variable ", LHSs[[j]][i]))
        }
      }
    }
  }

  ## check copula regression parameters are the correct length
  cop_pos <- length(LHSs) + 1   # should be 4 for causl
  for (i in seq_along(formulas[[cop_pos]])) {
    pars_cop <- pars[[kwd]]

    if (is(formulas[[cop_pos]][[i]], "formula")) {
      mod_mat <- model.matrix(formulas[[cop_pos]][[i]], data=dummy_dat)
      npar <- ncol(mod_mat)
      # if (length(pars_cop[[LHSs[[3]][i]]]$beta) != npar) stop(paste0("dimension of model matrix for ", LHSs[[3]][i], " does not match number of coefficients provided"))
    }
    else if (is.list(formulas[[cop_pos]][[i]])) {
      for (j in seq_along(formulas[[cop_pos]][[i]])) {
        mod_mat <- model.matrix(formulas[[cop_pos]][[i]][[j]], data=dummy_dat)
        npar <- ncol(mod_mat)
      }
    }

    # npar <- length(attr(formsY[[i]], "term.labels")) + attr(formsY[[i]], "intercept")
  }


  return(list(pars=pars))
}

##' Sets up copula quantities only
##'
## @inheritParams process_inputs
##' @param formulas list of formulas for copula only
##' @param family list of families for copula only
##' @param pars list of copula parameters
##' @param LHSs left-hand sides for all variables
##' @param quans character vector of already existing variables to include
##' @param ord topological ordering
##'
##' @export
pair_copula_setup <- function (formulas, family, pars, LHSs, quans, ord) {

  dims <- lengths(LHSs)
  dZ <- dims[length(dims)-2]
  dX <- dims[length(dims)-1]
  dY <- last(dims)
  LHS_Z <- LHSs[[length(LHSs)-2]]
  LHS_Y <- LHSs[[length(LHSs)]]

  nQ <- length(quans)

  if (is.list(formulas)) {
    ## put in code to check that copula formulas and parameters have matching lengths

  }

  ## get formulas in right format
  if (!is.list(formulas)) {
    formulas <- rep(list(formulas), dY)
  }
  else if (length(formulas) != dY) {
    formulas <- rep(formulas[], dY)
  }
  if (!all(sapply(formulas, is.list)) || any(lengths(formulas) < nQ+dZ+seq_len(dY)-1)) {
    for (i in seq_len(dY)) {
      if (is.list(formulas[[i]])) {
        formulas[[i]] <- rep(formulas[[i]], nQ+dZ+i-1)
      }
      else {
        formulas[[i]] <- rep(list(formulas[[i]]), nQ+dZ+i-1)
      }
      lhs(formulas[[i]]) <- c(quans, LHS_Z[rank(ord[seq_len(dZ)])],
                              LHS_Y[rank(ord[dZ+dX+seq_len(i-1)])])
      ## update to allow some Zs to come after Ys
    }
  }
  names(formulas) <- LHS_Y[rank(ord[dZ+dX+seq_len(dY)])]


  ## get copula families in right format
  if (!is.list(family) || length(family) != dY) {
    if (is.list(family)) {
      family <- rep_len(family, dY)
    }
    if (!is.list(family)) {
      if (length(family) == dY) {
        family <- as.list(family)
      }
      else  {
        family <- rep_len(list(family), dY)
      }
    }
  }
  ## now ensure each family value is specified for each pair-copula
  if (any(lengths(family) != lengths(formulas))) {

    dYcop <- lengths(formulas)
    fam_lns <- lengths(family)
    # if (length(fam_lns) < length(dYcop)) {
    #   if (length(fam_lns) != 1) warning("Insufficient family parameters for pair-copula. Copying earlier values")
    #   family <- family[rep_len(length(fam_lns), length(dYcop))]
    # }
    # else if (length(fam_lns) < length(dYcop)) {
    #   warning("Too many family parameters for pair-copula. Truncating to appropriate number")
    #   family <- family[seq_along(dYcop)]
    # }
    ## now that lengths of objects are the same, ensure correct number of family variables in each list
    if (any(fam_lns < dYcop)) family <- mapply(function(x, y) rep_len(x, y), family, dYcop, SIMPLIFY = FALSE)

  }
  # if (length(dims) == 4) return(list(formulas=formulas, family=family))

  if (!is.null(pars)) {
    ## get parameters in right format
    if (!setequal(names(pars), LHS_Y)) {
      if ("beta" %in% names(pars)) {
        pars <- rep(list(list(list(beta=pars$beta))), length(LHS_Y))
        names(pars) <- LHS_Y
        ordY <- rank(ord[dZ+dX+seq_len(dY)])
        if (nQ+dZ > 1 || dY > 1) {
          for (i in seq_len(dY)) {
            vnm <- LHS_Y[[ordY[i]]]
            pars[[vnm]] <- rep(pars[[vnm]], nQ+dZ+i-1)
            names(pars[[vnm]]) <- c(quans, LHS_Z, LHS_Y[ord[seq_len(i-1)]])
          }
        }
      }
      else {
        nrep <- setdiff(LHS_Y, names(pars))
        if (length(nrep) > 0) stop(paste0("Variable ", paste(nrep, collapse=", "),
                                          " not represented in the copula parameters list"))
        rep <- setdiff(names(pars), LHS_Y)
        if (length(rep) > 0) stop(paste0("Variable ", paste(rep, collapse=", "),
                                         " represented in copula parameters list but not a response variable"))
        stop("Shouldn't get here")
      }
    }
    else if (!all(sapply(pars, is.list))) {
      ## put some code in here to correct fact that some pairs aren't represented
    }
  }

  return(list(formulas=formulas, family=family, pars=pars))
}

##' Obtain quantiles for prespecified variables
##'
##' @param dat data frame containing variables
##' @param prespec character vector of prespecified variables in `dat`
##' @param cond logical: should conditional quantiles be estimated?
##' @param nlevs maximum number of levels for 'discrete' variable
##' @param cor_thresh threshold for when linear regression should be used to
##' model dependence
##' @param tol tolerance for similarity of ranks for applying uniform noise
##'
##' @details Currently takes the rank of each entry, and subtracts 1/2 and
##' normalizes by the number of entries.  If there are \eqn{k} ties they are
##' randomly sorted with a uniform random variable
##' in the symmetric interval around the rank of width \eqn{k/n}.
##'
##' `nlevs` is used to classify discrete vs continuous variables.  If a variable
##' has more than this number of distinct values, it is considered to be
##' continuous.  `cor_thresh` is used to determine when the correlation is small
##' enough that it need not be modelled.  `tol` is for classifying which ranks
##' are sufficiently close together to add noise as a group.
##'
##' @export
process_prespecified <- function (dat, prespec, cond=TRUE, nlevs=5, cor_thresh=0.25,
                                  tol=sqrt(.Machine$double.eps)) {
  n <- nrow(dat)

  if (cond) {
    ## make sure variables are all real-valued
    dat <- as.data.frame(lapply(dat, as.numeric))
    if (any(sapply(dat, function(x) all(is.na(x))))) stop("Problem with conversion to numeric data")
    vlev <- nrow(dat) - sapply(dat, function(x) sum(duplicated(x)))
    do_disc <- (vlev <= nlevs)
  }

  ## perform marginal empirical transformation
  quantiles <- data.frame(lapply(dat[prespec], rank))
  if (cond) cors <- cor(quantiles)

  ## go through each variable and put into [0,1] in approx uniform manner
  for (i in seq_along(prespec)) {
    if (cond && i > 1) {
      ## for conditional case, check which correlations are above the set threshold
      cors_i <- which(abs(cors[i,seq_len(i-1)]) > cor_thresh)
      if (length(cors_i) > 0) X <- as.matrix(dat[,prespec[cors_i]])
      else X <- rep(1, n)

      if (do_disc[i]) {
        ## if fewer than 'nlevs' levels then treat as discrete
        if (vlev[i] == 2) {
          if (any(dat[,prespec[i]] %in% c(0,1))) dat[,prespec[i]] <- (dat[,prespec[i]] - min(dat[,prespec[i]]))/(max(dat[,prespec[i]])-min(dat[,prespec[i]]))

          mod <- glm(dat[,prespec[i]] ~ X, family=binomial)
        }
        else {
          stop("Plasmode simulation not implemented for discrete variables with more than 2 levels")
        }
      }
      else {
        ## otherwise as continuous
        mod <- lm(dat[,prespec[i]] ~ X)
        preds <- mod$residuals
        quan <- (rank(preds)-0.5)/n
        if (any(duplicated(quan))) {
          ## separate out ties
          cts <- table(quan)
          vals <- as.numeric(names(cts))
          for (j in which(cts > 1)) {
            wh_j <- which(abs(quan - vals[j]) < tol)
            quan[wh_j] <- quan[wh_j] + (runif(cts[j])-1/2)*cts[j]/n
          }
        }
      }
    }
    else {
      ## if marginal (or for first variable in sequence)
      var_nm <- prespec[i]
      quan <- (rank(dat[[var_nm]])-1/2)/n
      if (any(duplicated(quan))) {
        cts <- table(quan)
        vals <- as.numeric(names(cts))
        for (j in which(cts > 1)) {
          wh_j <- which(abs(quan - vals[j]) < tol)
          quan[wh_j] <- quan[wh_j] + (runif(cts[j])-1/2)*cts[j]/n
        }
      }
    }
    quantiles[prespec[i]] <- quan
  }

  return(quantiles)
}