R/main.R

Defines functions full_likelihood fit_model fit_copula_interactions predict.logistic_copula init_logistic_copula

Documented in fit_copula_interactions fit_model predict.logistic_copula

init_logistic_copula <- function(y, x, xtype, which_include, reg.method="glm",
                                 set_nonsig_zero=FALSE) {

  if (reg.method == "glm") {
    ft <- glm(y~x, family=binomial())
    if (set_nonsig_zero) {
      nonsig <- which(summary(ft)$coef[-1, 4] > .05)
      sig <- which(summary(ft)$coef[-1, 4] <= .05)
      ft_n <- glm(y~x[, -nonsig], family = "binomial")
      ft$coef[nonsig + 1] <- 0
      ft$coeg[c(1, 1 + sig)] <- coef(ft_n)
    }
  } else if (reg.method == "brglm2") {
    ft <- glm(y~x, family="binomial", method = "brglmFit")
    if (set_nonsig_zero) {
      nonsig <- which(summary(ft)$coef[-1, 4] > .05)
      sig <- which(summary(ft)$coef[-1, 4] <= .05)
      ft_n <- glm(y~x[, -nonsig], family = "binomial", method = "brglmFit")
      ft$coef[nonsig + 1] <- 0
      ft$coeg[c(1, 1 + sig)] <- coef(ft_n)
    }
  } else {
    stop(paste0("reg.method = ", reg.method, " not implemented!"))
  }

  prs <- coefs_to_pars(
    y, x, xtype, coef(ft)[1], coef(ft)[-1]
  )

  trees <- TreeGraphList(length(which_include))
  u <- transform_margins_to_hash(x, xtype, prs, which_include)
  beta_0 <- parameters_to_intercept(xtype, prs)
  beta_x <- pars_to_linear_pred(x, xtype, prs) - beta_0
  copula_eff <- rep(0, length(beta_x))
  
  m_obj <- list(
    trees=trees, xtype=xtype, parameters=prs, beta_0=beta_0,
    beta_x_insample=beta_x, copula_eff_insample=copula_eff, u=u,
    which_include=which_include, beta_vec=coef(ft)
  )
  class(m_obj) <- "logistic_copula"
  attr(m_obj, "call") <- sys.call()
  
  m_obj
}


predict.logistic_copula <- function(object, new_x, ...) {
  ##' predict.logistic_copula
  ##' @name predict.logistic_copula
  ##' @aliases predict.logistic_copula
  ##' @description
  ##' Computes predicted probability of Y=1 for a logistic regression model with
  ##' a vine extension. 
  ##' @param object The model object as returned by fit_copula_interactions
  ##' @param new_x A matrix of covariate values to compute predictions for.
  ##' @param ... Not used.
  ##' @return A numeric vector of estimates of the conditional probability
  ##' of Y=1 | x, computed for each row of new_x.
  new_u <- transform_margins_to_hash(
    new_x, object$xtype, object$parameters, object$which_include
  )
  object_copy <- object
  object_copy <- set_transformed_vars(
    new_x, object_copy$which_include, object_copy
    )
  cop_eff <- compute_g(object_copy)

  plogis(cbind(1, new_x) %*% object$beta_vec + cop_eff)
}

fit_copula_interactions <- function(
    y, x, xtype, family_set=c("gaussian", "clayton", "gumbel"), 
    oos_validation=FALSE, tau=2, which_include=NULL, reg.method="glm",
    maxit_final=1000, maxit_intermediate=50, verbose=FALSE, 
    adjust_intercept=TRUE, max_t=Inf, test_x=NULL, test_y=NULL,
    set_nonsig_zero=FALSE, reltol=sqrt(.Machine$double.eps)
) {
  ##' fit_copula_interactions
  ##' @name fit_copula_interactions
  ##' @aliases fit_copula_interactions
  ##' @description This is the main function of the package, which
  ##' starting from an initial logistic regression model with only main effects
  ##' of each covariate, selects and fits interaction terms in the form of two
  ##' R-vine models with identical graphical structure, one for each class.
  ##' @param y A vector of n observations of the (univariate) binary outcome
  ##' variable y
  ##' @param x A (n x p) matrix of n observations of p covariates
  ##' @param xtype A vector of p characters that have to take the value
  ##' "c_a", "c_p", "d_b" or "d_b", to indicate whether each margin of the
  ##' is continuous with full support, continuous with support on the positive
  ##' real line, discrete (binary) or a counting variable.
  ##' @param family_set A vector of strings that specifies the set of
  ##' pair-copula families that the fitting algorithm chooses from. For an
  ##' overview of which values that can be specified, see the documentation for
  ##' \link[rvinecopulib]{bicop}.
  ##' @param oos_validation Whether to use an external sample for validation
  ##' instead of an in-sample likelihood based criteria. Would require that
  ##' both test_x and test_y are provided if set to TRUE. 
  ##' @param tau Parameter used when selecting the structure, where the 
  ##' the criteria is (new_likelihood - previous_likelihood - tau), 
  ##' so that an additional edge in the copulas is only accepted if it leads to
  ##' an increase in the likelihood that exceeds tau. Setting tau to NULL, has 
  ##' the same effect as -Inf. 
  ##' @param which_include The column indices of the covariates that could be
  ##' included in the copula effects.
  ##' @param reg.method The method by which the initial regression coefficients
  ##' are fitted.
  ##' @param maxit_final The maximum number of gradient optimisation iterations
  ##' to use when the full structure has been selected to refit all the
  ##' parameters. Defaults to 1000.
  ##' @param maxit_intermediate The maximum number of gradient optimisation
  ##' iterations to use when adding a newly selected component to refit the
  ##' parameters. Defaults to 10.
  ##' @param verbose Whether information about the progress should be printed 
  ##' to the console.
  ##' @param adjust_intercept Whether to intermediately refit the intercept
  ##' during the model/structure selection procedure. Defaults to true. 
  ##' @param max_t The maximum number of trees in the copula models. Defaults
  ##' to Inf, i.e., no maximum.
  ##' @param test_x Part of the optional validation set,
  ##' see @oos_validation.
  ##' @param test_y Part of the optional validation set,
  ##' see @oos_validation.
  ##' @param set_nonsig_zero If true, non-significant regression coefficients 
  ##' (in the initial glm model) will be set to zero 
  ##' @param reltol Relative convergence tolerance, see the documentation for 
  ##' \link[stats]{optim}.
  ##' @return A logistic_copula object, which contains the regression 
  ##' coefficients of the model, the parameters of the chosen conditional
  ##' covariate distribution that corresponds to the regression coefficients,
  ##' and the pair of vine-models that extend the logistic regression model.
  ##' @examples
  ##' data("Ionosphere")
  ##' 
  ##' dset <- Ionosphere[, -(1:2)] 
  ##' 
  ##' set.seed(20)
  ##' rowss <- sample(nrow(dset), round(nrow(dset) * 0.75))
  ##' colss <- sample(ncol(dset) - 1, 5)
  ##' x <- as.matrix(dset[rowss, colss])
  ##' xte <- as.matrix(dset[-rowss, colss])
  ##' y <- dset[rowss, ncol(dset)] == "bad"
  ##' yte <- dset[-rowss, ncol(dset)] == "bad"
  ##' 
  ##' xtype <- apply(x, 2, function(x) if(length(unique(x)) > 2) "c_a" else "d")
  ##' 
  ##' # Model with selection penalty tau=log(n)
  ##' md <- LogisticCopula::fit_copula_interactions(
  ##'   y, as.matrix(x), xtype, tau = log(nrow(x))
  ##' )
  ##' # Model with selection penalty tau=Inf, returns just the logistic
  ##' # regression model
  ##' mdglm <- LogisticCopula::fit_copula_interactions(
  ##'   y, as.matrix(x), xtype, tau = Inf
  ##' )
  ##'
  ##'plot(predict(mdglm, xte), predict(md, xte), col = 3 + yte)
  ##' @export
  
  if (is.null(which_include)) {
    which_include <- which((xtype == "c_a") | (xtype == "c_p"))
  } else {
    if(any((xtype[which_include] != "c_a") & (xtype[which_include] != "c_p"))) {
      ind <- which(
        (xtype[which_include] != "c_a") & (xtype[which_include] != "c_p")
      )
      which_include <- which_include[-ind] 
    }
  }
  
  if (length(family_set) > 1) {
    family_combs <- cbind(
      combn(family_set, 2),
      combn(family_set, 2)[rev(1:2), ],
      rbind(family_set,
            family_set)
    )
  } else{
    family_combs <- matrix(rep(family_set, 2), ncol=1)
  }
  
  if (oos_validation & (is.null(test_x) | is.null(test_y))) {
    stop("Must supply test_x and test_y for cdevriterion = 'OOS validation'!")
  }
  
  m_obj <- init_logistic_copula(y, x, xtype, which_include, 
                                reg.method=reg.method,
                                set_nonsig_zero=set_nonsig_zero)
  
  for (t in seq(min(length(which_include) - 1, max_t))) {
    # Find all valid edges
    valid_edges <- all_initially_valid_edges(
      m_obj$trees[[t]]$nodes, 
      if(!is.null(which_include)) which_include else seq(length(xtype))
    )
    
    if (length(valid_edges$edges) == 0) {
      break
    }
    
    # Fit all model for all valid edges, only gaussian
    copula_pairs <- new.env(hash=T)
    for (edge in valid_edges$edges) {
      assign(edge_key(edge), fit_pair(edge, m_obj$u, y, xtype))
    }
    
    improvement <- TRUE
    k <- 1
    while (improvement) {
      
      # Compute the copula - effects for each edge
      pair_effects <- sapply(
        1:length(valid_edges$edges),
        function(j) pair_effect(
          valid_edges$edges[[j]], m_obj$u,
          get(edge_key(valid_edges$edges[[j]]), envir=copula_pairs)
        )
      )
      
      # Find the best edge, whether it improves the model or not
      j_star <- choose_best_effect(
        y, m_obj$beta_0 + m_obj$beta_x + m_obj$copula_eff_insample,
        pair_effects,
        adjust_intercept=adjust_intercept
      )
      
      rm(pair_effects)
      
      best_edge <- valid_edges$edges[[j_star]]
      
      # Fit all combinations of pair - copula families to the best edge
      copula_pairs_family_comb <- lapply(
        1:ncol(family_combs),
        function(l) fit_pair(best_edge, m_obj$u, y, xtype, family_combs[, l])
      )
      
      # Compute the copula effects for each combination
      pair_effects_family_comb <- sapply(
        1:ncol(family_combs),
        function(l) pair_effect(
          best_edge, m_obj$u, copula_pairs_family_comb[[l]]
        )
      )
      
      # Find the best such combination, if there is one that improves the model
      l_star <- choose_best_effect(
        y, m_obj$beta_0 + m_obj$beta_x + m_obj$copula_eff_insample,
        pair_effects_family_comb, adjust_intercept=adjust_intercept
      )
      
      # Is there a combination that improves the model?¨
      m_cand <- m_obj
      
      # Add the edge to the tree
      m_cand$trees[[t]] <- add_edge_to_tree(
        m_cand$trees[[t]], valid_edges$edges[[j_star]]
      )
      
      m_cand$trees[[t]]$graph <- igraph::add_edges(
        m_cand$trees[[t]]$graph, valid_edges$edge_mat[, j_star]
      )
      m_cand$trees[[t]] <- add_pair_copulas_to_tree(
        m_cand$trees[[t]], copula_pairs_family_comb[[l_star]]
      )
      
      # Refit 
      m_cand_tmp <- try(
        fit_model(
          y, x, m_cand, maxit=min(maxit_final, maxit_intermediate),
          num_grad=FALSE, verbose=verbose, reltol=reltol
        ),
        silent=TRUE
      )
      
      if(length(m_cand_tmp) == 1) {
        stop("Error during fitting of model")
      } else {
        m_cand <- m_cand_tmp
        rm(m_cand_tmp)
      }
      
      cand_lik <- full_likelihood(
        y, x, m_cand, m_cand$beta_0, m_cand$beta_vec[-1],
        transformed_delta_vec(m_cand)
      )
      
      prev_lik <- full_likelihood(
        y, x, m_obj, m_obj$beta_0, m_obj$beta_vec[-1],
        transformed_delta_vec(m_obj)
      )
      
      if (oos_validation) {
        cand_lik <- sum(
          dbinom(test_y, 1, plogis(predict(m_cand, test_x)), TRUE)
        )
        prev_lik <- sum(
          dbinom(test_y, 1, plogis(predict(m_obj, test_x)), TRUE)
        )
        crit <- cand_lik - prev_lik
      } else {
        if (!is.null(tau)) {
          crit <- 2 * (cand_lik - prev_lik - tau)
        } else {
          crit <- Inf
        }
      }
      
      if (crit > 0) {
        m_obj <- m_cand
        valid_edges$edges <- valid_edges$edges[-j_star]
        valid_edges$edge_mat <- as.matrix(valid_edges$edge_mat[, -j_star])
        
        if(length(valid_edges$edges) > 0) {
          if (
            any(!sapply(seq(length(valid_edges$edges)),
                        function (j) valid_edge(m_obj$trees[[t]],
                                                valid_edges$edges[[j]],
                                                valid_edges$edge_mat[, j])))
          ) {
            rem_edg <- which(
              !sapply(seq(length(valid_edges$edges)),
                      function (j) valid_edge(
                        m_obj$trees[[t]], valid_edges$edges[[j]],
                        valid_edges$edge_mat[, j])
              )
            )
            valid_edges$edges <- valid_edges$edges[-rem_edg]
            valid_edges$edge_mat <- as.matrix(valid_edges$edge_mat[, -rem_edg])
          }
        }
      }
      
      if (length(valid_edges$edges) == 0 | crit <= 0){
        improvement <- FALSE
      }
      rm(m_cand)
      rm(pair_effects_family_comb)
      rm(copula_pairs_family_comb)
    }
    
    # ADD THE NEW TREE
    if (length(m_obj$trees[[t]]$edges) > 1){
      m_obj$trees <- append(
        m_obj$trees,
        list(TreeGraph(edges_to_next_trees_nodes(m_obj$trees[[t]]$edges)))
      )
    } else if(t < length(m_obj$trees[[1]]$nodes) - 1) {
      break
    }
    k <- k + 1
    rm(copula_pairs)
  }
  
  if (length(m_obj$trees[[length(m_obj$trees)]]$edges) == 0) {
    m_obj$trees <- m_obj$trees[1:(length(m_obj$trees) - 1)]
  }
  
  # Refit one last time
  if(maxit_final > 50) {
    if(length(m_obj$trees[[1]]$edges) > 0) {
      m_obj_tmp <- try(
        fit_model(
          y, x, m_obj, maxit=maxit_final, num_grad=FALSE, verbose=verbose,
          reltol=reltol
        ),
        silent=TRUE
      )
      if(length(m_obj_tmp) == 1) {
        stop("Error during fitting of model")
      } else {
        m_obj <- m_obj_tmp
        rm(m_obj_tmp)
      }
    }
  }
  
  final_lik <- full_likelihood(
    y, x, m_obj, m_obj$beta_0, m_obj$beta_vec[-1],
    transformed_delta_vec(m_obj)
  )
  
  rm(valid_edges)
  m_obj
}


fit_model <- function(y, x, m_obj, maxit=5, num_grad=FALSE, verbose=FALSE,
                      hessian=FALSE, reltol=sqrt(.Machine$double.eps)) {

  ##' fit_model
  ##' @name fit_model
  ##' @aliases fit_model
  ##' @description This function updates the parameters of a LogisticCopula 
  ##' model by maximum likelihood.
  ##' @param y A vector of n observations of the (univariate) binary outcome
  ##' variable y
  ##' @param x A (n x p) matrix of n observations of p covariates
  ##' @param m_obj The model object as returned from fit_copula_interactions
  ##' @param maxit The maximum number of gradient steps
  ##' @param num_grad Whether to compute gradients numerically.
  ##' @param verbose Whether information about the progress should be printed 
  ##' to the console.
  ##' @param hessian Whether to numerically compute the hessian matrix, see the
  ##' documentation for \link[stats]{optim}. 
  ##' @param reltol Relative convergence tolerance, see the documentation for 
  ##' \link[stats]{optim}.
  ##' @return A logistic_copula object, which contains the regression 
  ##' coefficients of the model, the parameters of the chosen conditional
  ##' covariate distribution that corresponds to the regression coefficients,
  ##' and the pair of vine-models that extend the logistic regression model.
  # Extract parameters :
  t_delta <- transformed_delta_vec(m_obj)

  optfit <- optim(
    fn=function(par) {
      lik <- full_likelihood(y, x, m_obj, beta_0=par[1], betas=par[2:(ncol(x) + 1)],
                             delta=par[(ncol(x) + 2):length(par)],
                             transform_delta=TRUE)
      -lik
    },
    gr=function(par) {
      if (num_grad) {
        numDeriv::grad(function(par) {
          - full_likelihood(y, x, m_obj, beta_0=par[1], betas=par[2:(ncol(x) + 1)],
                            delta=par[(ncol(x) + 2):length(par)],
                            transform_delta=TRUE)
        }, x=par)
      } else {
        -full_gradient(y, x, m_obj, beta_0=par[1], betas=par[2:(ncol(x) + 1)], 
                       par[(ncol(x) + 2):length(par)])
      }
    },
    par=c(m_obj$beta_vec, t_delta), method="BFGS",
    control=list(maxit=maxit, trace=ifelse(verbose, 100, 0), reltol=reltol),
    hessian=hessian
  )

  m_obj <- set_model(
    y, x, m_obj, optfit$par[seq(length(m_obj$beta_vec))], 
    transform_t_to_delta_vec(optfit$par[-seq(length(m_obj$beta_vec))], m_obj)
  )
  m_obj$fit_inf <- optfit
  m_obj

}


full_likelihood <- function(y, x, fit, beta_0, betas, delta, 
                            transform_delta=TRUE) {
  if (any(fit$xtype == "d_c") | any(fit$xtype == "d_b")) {
    ind_d <- which((fit$xtype == "d_c") | (fit$xtype == "d_b"))
    if (any(is.infinite(exp(betas[ind_d])))) {
      -1e100
    } else {
      if (transform_delta & !is.null(delta)) {
        t_delta <- delta
        delta <- transform_t_to_delta_vec(delta, fit)
      }
      
      fit <- set_model(y, x, fit, c(beta_0, betas), delta)
      eta <- fit$beta_0 + fit$beta_x_insample + fit$copula_eff_insample
      
      p <- plogis(eta)
      
      sum(dbinom(y, 1, p, TRUE))
    }
  }
  else {
    if (transform_delta & !is.null(delta)) {
      t_delta <- delta
      delta <- transform_t_to_delta_vec(delta, fit)
    }
    
    fit <- set_model(y, x, fit, c(beta_0, betas), delta)
    eta <- fit$beta_0 + fit$beta_x_insample + fit$copula_eff_insample
    
    p <- plogis(eta)
    
    sum(dbinom(y, 1, p, TRUE))
  }
}

Try the LogisticCopula package in your browser

Any scripts or data that you put into this service are public.

LogisticCopula documentation built on June 28, 2024, 5:09 p.m.