logratiolasso: Tools For Large Log Ratio Models

#' Two-stage log-ratio lasso
#'
#' Fits the two-stage log-ratio lasso.
#'
#' @param z The input matrix.  Assumed to be the logarithmic transform of positive raw inputs.
#' @param y The real-valued response.
#' @param k_max The largest number of log-ratios to consider.
#' @param lambda_1 Optional vector of lambda values.
#' @param second.stage Specifies whether to fit the second stage on the response y or the fitted values yhat.
#'  Fitting on yhat is known as "conservative two-stage" in the log-ratio lasso paper.
#' @param ... Additional arguments passed to "glmnet.constr".
#' @return beta A list of p by k_max matrices where column j gives the coefficient value after j steps.
#'   each list entry corresponds to a value of lambda.
#' @return coef. The theta coefficients selected for each model.  Recommended for internal use only.
#' @return lambda The grid of lambda values used for the fitting.
#' @return selected_vars A matrix showing which log-terms are active at each step of the second-stage
#' forward stepwise procedure.
two_stage <- function(z, y, k_max = 10, lambda_1 = NULL, second.stage = "y", family = "gaussian", nlambda = 20, ...) {
  # executes two-stage procedure.
  # assumes centered response variable.

  p <- ncol(z)

  constrained_fit <- glmnet.constr(z, y, family = family, lambda = lambda_1, nlambda = nlambda, ...)
  lambda_1 <- constrained_fit$lambda
  betas <- constrained_fit$beta
  #print(betas)
  #print(lambda_1)
  selected_vars <- apply(betas, 2, function(x){(abs(x) > 0)})

  output <- list()
  for (i in 1:length(lambda_1)) {
    if(sum(selected_vars[, i]) > 0) {
      #print(paste0("which: i",i))
      #print(selected_vars)
      expanded_set <- small_to_big_z(z, which(selected_vars[, i]))
      y_hat <- z %*% betas[, i]
      resp <- y
      if(second.stage == "yhat") {
        resp <- y_hat
      }

      d <- sum(selected_vars[, i]) - 1
      k <- min(d, k_max)

      suppressWarnings(output[[i]] <- custom_fs(expanded_set, resp, k, selected_vars[, i], p, family = family))
      #print(output[[i]])
    } else {
      output[[i]] <- NA
    }
  }

  betas <- lapply(output, function(x) {out_to_beta(x, k_max, p)})

  list(betas = betas, coef = output, lambda = lambda_1, selected_vars = selected_vars)
}


out_to_beta <- function(obj, k, p) {
  #takes output of "two_stage$coef[[i]]" and returns p by k matrix of beta coefficients
  #internal helper function

  betas <- matrix(0, p, k)

  if(sum(is.na(obj)) > 0) {
    return(betas)
  } else if (class(obj) != "list") {
    return(betas)
  }
  #print(paste0("obj", obj))
  d <- min(k, nrow(obj$theta_ind))
  #print(d)
  #print(k)
  if(d == 0) {
    return(betas)
  }

  # print(obj$subset)
  # print(obj$theta_ind)
  # print(paste0("d:", d))
  # print(dim(betas))
  # print(paste0("p:", p))

  for(i in 1:d) {
    beta <- rep(0, p)

    # print(i)
    # print(obj$subset[obj$theta_ind[i,1]])
    # print(obj$subset[obj$theta_ind[i,2]])
    # print(obj$theta_vals[[i]])

    for(j in 1:i){
      beta[obj$subset[obj$theta_ind[j,1]]] <- beta[obj$subset[obj$theta_ind[j,1]]] +
        obj$theta_vals[[i]][j]
      beta[obj$subset[obj$theta_ind[j,2]]] <- beta[obj$subset[obj$theta_ind[j,2]]] -
        obj$theta_vals[[i]][j]
    }

    # print(paste0("i completed"))
    betas[, i] <- beta
  }
  #fill in remaining entries with same beta
  #print(d)
  if(d < k) {
    for(i in (d+1):k) {
      betas[, i] <- betas[, d]
    }
  }

  #print(betas)
  betas
}


predict_two_stage <- function(z, y, new_z, family = "gaussian", lambda_1 = NULL, k_max = 5, nlambda = 20, ...) {
  # forms predictions on new data from two_stage model
  # output is list of size length(lambda_1)
  # each entry is matrix of length ncol(new_z) and width k_max
  # intended for internal use with "cv_two_stage" functions

  p <- ncol(z)
  fit <- two_stage(z, y, lambda_1 = lambda_1, k_max = k_max, nlambda = nlambda, family = family, ...)
  #print(fit$coef)
  betas <- lapply(fit$coef, function(obj){out_to_beta(obj, k_max, p)})

  predictor <- function(betas) {
    if(!is.null(ncol(betas))) {
      return(apply(betas,2,function(b){new_z %*% b}))
    }
    return(matrix(rep(new_z %*% betas, k_max), ncol = k_max, byrow = FALSE))
  }

  y_pred <- lapply(betas, predictor)
  #list of n_pred by k matrix of predictions

  out <- list(two_step_obj = fit, y_pred = y_pred)
  out
}


#' CV for two-stage
#'
#' Cross-validation for the lambda and k parameters in the two-stage log-ratio lasso.
#'
#' @param z The input matrix.  Assumed to be the logarithmic transform of positive raw inputs.
#' @param y The real-valued response.
#' @param k_max The largest number of log-ratios to consider.
#' @param lambda_1 Optional vector of lambda values.
#' @param n_folds Optional number of folds.  Default is 10.
#' @param ... Additional arguments passed to "two_stage"
#' @return mse A matrix of MSE values generated by CV.
#' @return best_params The best lambda and k parameter values chosen by CV.
#' @return lambda_min A best value of lambda.
#' @return k_min A best value of k.
#' @return two_step_obj The two_step object fit on the full data.
#' @return beta_min The parameter value selected by CV.
#' @return lambda The grid of lambda values used.
cv_two_stage <- function(z, y, family = "gaussian", lambda_1 = NULL, k_max = 10,
                         n_folds = 10, nlambda = 20, folds = NULL, ...) {
  p <- ncol(z)
  #returns list of length lambda of vectors of size k_max with the prediction error.
  two_step_obj <- two_stage(z, y, family = family, lambda_1 = lambda_1, k_max = k_max, nlambda = nlambda, ...)
  lambda_1 <- two_step_obj$lambda

  #Create equally size folds
  if(is.null(folds)) {
    folds <- sample(cut(seq(1,nrow(z)),breaks=n_folds,labels=FALSE))
  } else {
    n_folds = max(folds) - min(folds) + 1
  }

  #Perform k-fold cross validation
  mse <- list()
  for(i in 1:n_folds){
      print(paste0("Starting CV fold ", i))
      #Segement your data by fold using the which() function
      test_indices <- which(folds==i,arr.ind=TRUE)
      out <- predict_two_stage(z[-test_indices, ], y[-test_indices], new_z = z[test_indices, ],
                              lambda_1 = lambda_1, k_max = k_max, nlambda = nlambda, family = family, ...)

      if(family == "gaussian") {
        mse_fun <- function(y_pred) {
          if(!is.null(ncol(y_pred))) {
            if(ncol(y_pred > 1)) {
              return(apply(y_pred,2,function(yp){sum((yp - y[test_indices])**2)}))
            }
          }
          sum((yp - y[test_indices])**2)
        }
      } else if (family == "binomial") {
        mse_fun <- function(y_pred) {
          if(!is.null(ncol(y_pred))) {
            if(ncol(y_pred > 1)) {
              return(apply(y_pred,2,function(yp){sum(log(1 + exp(-(2*y[test_indices] - 1) * y_pred)))}))
            }
          }
          sum(log(1 + exp(-(2*y[test_indices] - 1) * y_pred)))
        }
      }
      mse[[i]] <- lapply(out$y_pred, mse_fun)

      #glmnet may return fewer than nlambda entries if convergence fails
      while(length(mse[[i]]) < nlambda) {
        mse[[i]][[length(mse[[i]]) + 1]] = rep(Inf, k)
      }
  }

  mse_full <- lapply(mse[[1]], function(x){x / n_folds})
  #print(length(mse_full))
  for(i in 2:n_folds) {
    for(j in 1:length(mse[[1]])) {
      mse_full[[j]] <- mse_full[[j]] + mse[[i]][[j]] / n_folds
    }
  }
  mse_full <- simplify2array(mse_full)
  best <- which(mse_full == min(mse_full), arr.ind = TRUE)
  lambda_min <- best[1,2]
  k_min <- best[1,1]
  beta_min <- out_to_beta(two_step_obj$coef[[lambda_min]], k_max, p)[, k_min]

  list(mse = mse_full, best_params = best, lambda_min = lambda_min, k_min = k_min,
       two_step_obj = two_step_obj, beta_min = beta_min, lambda = lambda_1)
}