# R/codacore.R

# Here we implement the codacore model

library(keras)
utils::globalVariables(c("self"))

# """Fits a single base learner"""
# Private class not to be called by user
.CoDaBaseLearner <- function(
  x,
  y,
  boostingOffset,
  logRatioType,
  objective,
  lambda,
  cvParams,
  optParams,
  verbose
){
  
  cdbl = list(
    intercept=NULL,
    slope=NULL,
    weights=NULL,
    softAssignment=NULL,
    hard=NULL,
    x=x,
    y=y,
    boostingOffset=boostingOffset,
    logRatioType=logRatioType,
    objective=objective,
    lambda=lambda,
    cvParams=cvParams,
    optParams=optParams,
    verbose=verbose
  )
  class(cdbl) = "CoDaBaseLearner"
  
  # Train the relaxation model
  cdbl = trainRelaxation.CoDaBaseLearner(cdbl)
  
  # Find optimal cutoff by CV
  cutoff = findBestCutoff.CoDaBaseLearner(cdbl)
  
  # Use cutoff to "harden" the log-ratio
  cdbl = harden.CoDaBaseLearner(cdbl, cutoff)
  
  # And recompute the linear coefficients
  cdbl = setInterceptAndSlope.CoDaBaseLearner(cdbl, cdbl$x, cdbl$y, cdbl$boostingOffset)
  
  # Add some metrics
  yHat = predict(cdbl, x) + boostingOffset
  if (cdbl$objective == 'binary classification') {
    cdbl$ROC = pROC::roc(y, yHat, quiet=TRUE)
    cdbl$AUC = pROC::auc(cdbl$ROC)
    cdbl$accuracy = mean(y == (yHat > 0))
  } else {
    cdbl$RMSE = sqrt(mean((y - yHat)^2))
    cdbl$Rsquared = 1 - cdbl$RMSE^2 / stats::var(y)
  }
  
  return(cdbl)
}


#' @import keras
trainRelaxation.CoDaBaseLearner = function(cdbl) {
  startTime = Sys.time()
  
  # Set up trainable variables
  inputDim = ncol(cdbl$x)
  numObs = nrow(cdbl$x)
  
  # Initializing the intercept at the average of the data
  # helps optimization greatly.
  # TODO: should experiment with slopeInit parameter for potential gains
  if (cdbl$objective == "binary classification") {
    loss_func = 'binary_crossentropy'
    if (abs(mean(1 / (1 + exp(-cdbl$boostingOffset))) - mean(cdbl$y)) < 0.001) {
      # Protect against numerical errors in glm() call
      interceptInit = 0.0
    } else {
      tempGLM = stats::glm(cdbl$y ~ 1, offset=cdbl$boostingOffset, family='binomial')
      interceptInit = tempGLM$coef[[1]]
    }
    slopeInit = 0.1
    metrics = c('accuracy')
  } else if (cdbl$objective == "regression") {
    loss_func = 'mean_squared_error'
    interceptInit = mean(cdbl$y - cdbl$boostingOffset)
    slopeInit = 0.1 # * stats::sd(cdbl$y - cdbl$boostingOffset)
    metrics = c('mean_squared_error')
  }
  
  # Define the forward pass for our relaxation,
  # which differs for balances and amalgamations
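  # For amalgamations ('A'), the relaxed log-ratio is
  #   log(x %*% relu(s) + eps) - log(x %*% relu(-s) + eps),
  # with soft assignment s = 2*sigmoid(w) - 1 in (-1, 1); for balances ('B'),
  # it is the difference between the averages of log(x), weighted by the
  # positive and negative parts of s respectively.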
  if (cdbl$logRatioType == 'A') {
    epsilon = cdbl$optParams$epsilonA
    forwardPass = function(x, mask = NULL) {
      softAssignment = 2 * keras::k_sigmoid(self$weights) - 1
      # Add the small value to ensure gradient flows at exact zeros (initial values)
      pvePart = keras::k_dot(x, keras::k_relu(softAssignment + 1e-20))
      nvePart = keras::k_dot(x, keras::k_relu(-softAssignment))
      logRatio = keras::k_log(pvePart + epsilon) - 
        keras::k_log(nvePart + epsilon)
      eta = self$slope * logRatio + self$intercept + self$boostingOffset
      # keras::k_sigmoid(eta)
      eta
    }
  } else if (cdbl$logRatioType == 'B') {
    epsilon = cdbl$optParams$epsilonB
    forwardPass = function(x, mask = NULL) {
      softAssignment = 2 * keras::k_sigmoid(self$weights) - 1
      # Add the small value to ensure gradient flows at exact zeros (initial values)
      pvePart = keras::k_relu(softAssignment + 1e-20)
      nvePart = keras::k_relu(-softAssignment)
      logRatio = keras::k_dot(keras::k_log(x), pvePart) / keras::k_maximum(keras::k_sum(pvePart), epsilon) -
        keras::k_dot(keras::k_log(x), nvePart) / keras::k_maximum(keras::k_sum(nvePart), epsilon)
      eta = self$slope * logRatio + self$intercept + self$boostingOffset
      # keras::k_sigmoid(eta)
      eta
    }
  }
  
  # Debugging toggle: flip to TRUE to fix the TensorFlow seed for reproducibility
  if (FALSE) {
    tensorflow::tf$random$set_seed(0)
  }
  
  # Set up custom layer
  CustomLayer <- R6::R6Class(
    "CustomLayer",
    
    inherit = keras::KerasLayer,
    
    public = list(
      output_dim = NULL,
      weights = NULL,
      intercept = NULL,
      slope = NULL,
      boostingOffset = NULL,
      # epsilon = NULL,
      
      initialize = function() {
        self$output_dim <- 1
      },
      
      build = function(input_shape) {
        self$weights <- self$add_weight(
          name = 'weights', 
          shape = list(as.integer(inputDim), as.integer(1)),
          initializer = keras::initializer_zeros(),
          trainable = TRUE
        )
        self$intercept <- self$add_weight(
          name = 'intercept', 
          shape = list(as.integer(1)),
          initializer = keras::initializer_constant(interceptInit),
          trainable = TRUE
        )
        self$slope <- self$add_weight(
          name = 'slope', 
          shape = list(as.integer(1)),
          initializer = keras::initializer_constant(slopeInit),
          trainable = TRUE
        )
        self$boostingOffset <- self$add_weight(
          name = 'boostingOffset',
          shape = list(as.integer(numObs), as.integer(1)),
          initializer = keras::initializer_constant(cdbl$boostingOffset),
          trainable = FALSE
        )
        # self$epsilon <- self$add_weight(
        #   name = 'epsilon', 
        #   shape = list(as.integer(1)),
        #   initializer = keras::initializer_constant(cdbl$epsilon),
        #   trainable = FALSE
        # )
      },
      
      call = forwardPass,
      
      compute_output_shape = function(input_shape) {
        list(input_shape[[1]], self$output_dim)
      }
    )
  )
  
  .trainKeras = function(lr, epochs) {
    # define layer wrapper function
    codacoreLayer <- function(object) {
      keras::create_layer(CustomLayer, object)
    }
    
    # use it in a model
    model <- keras::keras_model_sequential()
    model %>% codacoreLayer()
    if (cdbl$objective == "binary classification") {
      model %>% layer_activation('sigmoid')
    }
    
    # compile graph
    model %>% keras::compile(
      loss = loss_func,
      optimizer = keras::optimizer_sgd(lr, momentum=cdbl$optParams$momentum),
      # optimizer = keras::optimizer_adam(0.001),
      metrics = metrics
    )
    
    
    model %>% keras::fit(cdbl$x, cdbl$y, epochs=epochs, 
                         batch_size=cdbl$optParams$batchSize, 
                         verbose=FALSE) # set verbose=TRUE for debugging
    return(model)
  }
  
  runAdaptively = is.numeric(cdbl$optParams$adaptiveLR) && is.null(cdbl$optParams$vanillaLR)
  if (runAdaptively) {
    # Adaptive learning rate here means that we pick the lr s.t.
    # our first gradient step moves the amalWeights out by a specified amount
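    # For example, with adaptiveLR=0.5, if the probe step below moves the
    # largest weight to 0.05 in absolute value, we rescale to lr = 0.5/0.05 = 10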
    model = .trainKeras(1, 1)
    lr = cdbl$optParams$adaptiveLR
    epochs = cdbl$optParams$epochs
    lr = lr / max(abs(as.numeric(model$get_weights()[[1]])))
    model = .trainKeras(lr, epochs)
  } else {
    warning("Using non-adaptive learning rate may hinder optimization.")
    lr = cdbl$optParams$vanillaLR
    epochs = cdbl$optParams$epochs
    model = .trainKeras(lr, epochs)
  }
  
  
  # Save results:
  cdbl$weights = as.numeric(model$get_weights()[[1]])
  cdbl$softAssignment = 2 / (1 + exp(-cdbl$weights)) - 1
  cdbl$intercept = as.numeric(model$get_weights()[[2]])
  cdbl$slope = as.numeric(model$get_weights()[[3]])
  
  # Equalize the largest + and largest - assignment for more 'balanced' balances
  eqRatio = max(cdbl$softAssignment) / min(cdbl$softAssignment) * (-1)
  cdbl$softAssignment[cdbl$softAssignment < 0] = cdbl$softAssignment[cdbl$softAssignment < 0] * eqRatio
  
  endTime = Sys.time()
  if (cdbl$verbose) {
    print('GD time:')
    print(endTime - startTime)
  }
  # cdbl$runTimeGD = endTime - startTime
  
  return(cdbl)
}

# Given a trained softAssignment, which corresponds to running
# the weights through an activation, we find
# the cutoff at which we define our log-ratio
findBestCutoff.CoDaBaseLearner = function(cdbl) {
  if (any(abs(cdbl$softAssignment) > 0.999999)) {
    warning("Large weights encountered in gradient descent; ",
            "vanishing gradients likely. ",
            "Learning rates might need recalibrating - try adaptive rates?")
  }
  
  candidateCutoffs = sort(abs(cdbl$softAssignment), decreasing=TRUE)
  maxCutoffs = cdbl$cvParams$maxCutoffs
  # Start from 2nd since we equalized +ve and -ve; thus neither side will be empty
  candidateCutoffs = candidateCutoffs[2:min(maxCutoffs, length(candidateCutoffs))]
  
  # TODO: re-implement without passing cdbl to harden()
  # and setInterceptAndSlope() to avoid computational overhead
  # from copying data unnecessarily
  
  # Compute the CV scores:
  startTime = Sys.time()
  numFolds = cdbl$cvParams$numFolds
  # Naive way of splitting equally into folds:
  foldIdx = sample(cut(1:length(cdbl$y), breaks=numFolds, labels=FALSE))
  if (cdbl$objective == "binary classification") {
    # Instead we randomize with equal # of case/controls in each fold
    # See discussion on stratified CV in page 204 of He & Ma 2013
    if (sum(cdbl$y) < numFolds || sum(1 - cdbl$y) < numFolds) {
      stop("Insufficient samples from each class available for cross-validation.")
    }
    caseIdx = sample(cut(1:sum(cdbl$y), breaks=numFolds, labels=FALSE))
    controlIdx = sample(cut(1:sum(1 - cdbl$y), breaks=numFolds, labels=FALSE))
    foldIdx[cdbl$y == 1] = caseIdx
    foldIdx[cdbl$y == 0] = controlIdx
  } 
  scores = matrix(nrow=length(candidateCutoffs), ncol=numFolds)
  i = 0
  for (cutoff in candidateCutoffs) {
    i = i + 1
    cdbl = harden.CoDaBaseLearner(cdbl, cutoff)
    for (j in 1:numFolds) {
      cdbl = setInterceptAndSlope.CoDaBaseLearner(cdbl, cdbl$x[foldIdx != j,], cdbl$y[foldIdx != j], cdbl$boostingOffset[foldIdx != j])
      yHat = predict(cdbl, cdbl$x[foldIdx == j,]) + cdbl$boostingOffset[foldIdx == j]
      if (cdbl$objective == "binary classification") {
        ROC = pROC::roc(cdbl$y[foldIdx == j], yHat, quiet=TRUE)
        scores[i, j] = pROC::auc(ROC)
      } else if (cdbl$objective == "regression") {
        scores[i, j] = -sqrt(mean((cdbl$y[foldIdx == j] - yHat)^2))
      }
    }
  }
  # Now implement lambda-SE rule
  means = apply(scores, 1, mean)
  # see eqn 9.2 here https://www.cs.cmu.edu/~psarkar/sds383c_16/lecture9_scribe.pdf
  stds = apply(scores, 1, stats::sd) / sqrt(numFolds)
  lambdaSeRule = max(means) - stds[which.max(means)] * cdbl$lambda
  # oneSdRule = max(means - stds)
  bestCutoff = candidateCutoffs[means >= lambdaSeRule][1]
  # bestCutoff = candidateCutoffs[which.max(scores)]
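  # For example, with lambda=1 this is the classical one-standard-error rule:
  # among all cutoffs whose mean CV score is within one SE of the best score,
  # we keep the largest cutoff, i.e. the sparsest log-ratio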
  
  
  endTime = Sys.time()
  if (cdbl$verbose) {
    print('CV time:')
    print(endTime - startTime)
    xCoor = 2:(length(means) + 1)
    graphics::plot(xCoor, means, ylim=range(c(means-stds, means+stds)))
    graphics::arrows(xCoor, means-stds, xCoor, means+stds, length=0.05, angle=90, code=3)
    graphics::abline(lambdaSeRule, 0)
  }
  
  if (cdbl$objective == "binary classification") {
    baseLineScore = pROC::auc(pROC::roc(cdbl$y, cdbl$boostingOffset, quiet=TRUE))
  } else if (cdbl$objective == "regression") {
    baseLineScore = -sqrt(mean((cdbl$y - cdbl$boostingOffset)^2))
  }
  noImprovement = lambdaSeRule < baseLineScore
  if (noImprovement) {
    bestCutoff = 1.1 # larger than any soft assignment, so the log-ratio is empty
  }
  
  return(bestCutoff)
}


harden.CoDaBaseLearner = function(cdbl, cutoff) {
  numPart = cdbl$softAssignment >= cutoff
  denPart = cdbl$softAssignment <= -cutoff
  hard = list(numerator=numPart, denominator=denPart)
  cdbl$hard = hard
  return(cdbl)
}


setInterceptAndSlope.CoDaBaseLearner = function(cdbl, x, y, boostingOffset) {
  # If our base learner is empty (i.e., couldn't beat the lambda-SE rule),
  # we simply set the coefficients to 0:
  if (!any(cdbl$hard$numerator) && !any(cdbl$hard$denominator)) {
    cdbl$slope = 0.0
    cdbl$intercept = 0.0
    return(cdbl)
  }
  # Otherwise, we have a non-empty log-ratio, so we compute its regression coefficients
  logRatio = computeLogRatio.CoDaBaseLearner(cdbl, x)
  dat = data.frame(x=logRatio, y=y)
  if (cdbl$objective == "binary classification") {
    glm = stats::glm(y~x, family='binomial', data=dat, offset=boostingOffset)
    if (any(is.na(glm$coefficients))) {
      glm = list(coefficients=list(0, 0))
      warning("Numerical error during glm fit. Possible data issue.") 
    }
  } else if (cdbl$objective == "regression") {
    glm = stats::glm(y~x, family='gaussian', data=dat, offset=boostingOffset)
  } else {
    stop("Not implemented objective=", cdbl$objective)
  }
  cdbl$intercept = glm$coefficients[[1]]
  cdbl$slope = glm$coefficients[[2]]
  return(cdbl)
}


computeLogRatio.CoDaBaseLearner = function(cdbl, x) {
  
  if (!any(cdbl$hard$numerator) || !any(cdbl$hard$denominator)) {
    logRatio = rowSums(x * 0) # a zero vector with one entry per observation
  } else { # we have a bona fide log-ratio
    if (cdbl$logRatioType == 'A') {
      epsilon = cdbl$optParams$epsilonA
      pvePart = rowSums(x[, cdbl$hard$numerator, drop=FALSE]) # drop=FALSE to keep as matrix
      nvePart = rowSums(x[, cdbl$hard$denominator, drop=FALSE])
      logRatio = log(pvePart + epsilon) - log(nvePart + epsilon)
    } else if (cdbl$logRatioType == 'B') {
      pvePart = rowMeans(log(x[, cdbl$hard$numerator, drop=FALSE])) # drop=FALSE to keep as matrix
      nvePart = rowMeans(log(x[, cdbl$hard$denominator, drop=FALSE]))
      logRatio = pvePart - nvePart
    }
  }
  
  return(logRatio)
}


predict.CoDaBaseLearner = function(cdbl, x, asLogits=TRUE) {
  logRatio = computeLogRatio.CoDaBaseLearner(cdbl, x)
  eta = cdbl$slope * logRatio + cdbl$intercept
  if (asLogits) {
    return(eta)
  } else {
    if (cdbl$objective == 'regression') {
      stop("Logits argument should only be used for classification, not regression.")
    }
    return(1 / (1 + exp(-eta)))
  }
}


#' codacore
#' 
#' This function implements the codacore algorithm described by Gordon-Rodriguez et al. 2021 
#' (https://doi.org/10.1101/2021.02.11.430695).
#' 
#' @param x A data.frame or matrix of the compositional predictor variables.
#'  Rows represent observations and columns represent variables.
#' @param y A data.frame, matrix or vector of the response. In the case of a 
#'  data.frame or matrix, there should be one row for each observation, and
#'  just a single column.
#' @param logRatioType A string indicating whether to use "balances" or "amalgamations".
#'  Also accepts "balance", "B", "ILR", or "amalgam", "A", "SLR".
#'  Note that the current implementation for balances is not strictly an ILR,
#'  but rather just a collection of balances (which are possibly non-orthogonal
#'  in the Aitchison sense).
#' @param objective A string indicating "binary classification" or "regression". By default,
#'  it is NULL and gets inferred from the values in y.
#' @param lambda A numeric. Corresponds to the "lambda-SE" rule. Sets the "regularization strength"
#'  used by the algorithm to decide how to harden the ratio. 
#'  Larger numbers tend to yield fewer, more sparse ratios.
#' @param offset A numeric vector of the same length as y. Works similarly to the offset in a glm.
#' @param shrinkage A numeric. Shrinkage factor applied to each base learner.
#'  Defaults to 1.0, i.e., no shrinkage applied.
#' @param maxBaseLearners An integer. The maximum number of log-ratios that the model will
#'  learn before stopping. Automatic stopping based on \code{seRule} may occur sooner.
#' @param optParams A list of named parameters for the optimization of the
#'  continuous relaxation. Empty by default. User can override as few or as
#'  many of our defaults as desired. Includes adaptiveLR (learning rate under
#'  adaptive training scheme), momentum (in the gradient-descent sense), 
#'  epochs (number of gradient-descent epochs), batchSize (number of 
#'  observations per minibatch, by default the entire dataset),
#'  and vanillaLR (the learning rate to be used if the user does *not* want
#'  to use the 'adaptiveLR', to be used at the risk of optimization issues).
#' @param cvParams A list of named parameters for the "hardening" procedure
#'  using cross-validation. Includes numFolds (number of folds, default=5) and
#'  maxCutoffs (number of candidate cutoff values of 'c' to be tested out
#'  during CV process, default=20 meaning log-ratios with up to 21 components
#'  can be found by codacore).
#' @param verbose A boolean. Toggles whether to display intermediate steps.
#' @param overlap A boolean. Toggles whether successive log-ratios found by 
#'  CoDaCoRe may contain repeated input variables. TRUE by default.
#'  Changing to FALSE implies that the log-ratios obtained by CoDaCoRe
#'  will become orthogonal in the Aitchison sense, analogously to the
#'  isometric-log-ratio transformation, while losing a small amount of
#'  model flexibility.
#' @param fast A boolean. Whether to run in fast or slow mode. TRUE by
#'  default. Running in slow mode will take ~x5 the computation time,
#'  but may help identify slightly more accurate log-ratios.
#' 
#' @return A \code{codacore} object.
#' 
#' @examples
#' \dontrun{
#' data("Crohn")
#' x <- Crohn[, -ncol(Crohn)]
#' y <- Crohn[, ncol(Crohn)]
#' x <- x + 1
#' model = codacore(x, y)
#' print(model)
#' plot(model)
#' }
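#' 
#' # A hedged sketch of optional-argument overrides (hypothetical settings,
#' # reusing x and y from above):
#' \dontrun{
#' model = codacore(
#'   x, y,
#'   logRatioType = "amalgamations",
#'   lambda = 0.5,
#'   optParams = list(epochs = 200),
#'   cvParams = list(numFolds = 10)
#' )
#' }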
#' 
#' @importFrom stats predict
#' 
#' @export
codacore <- function(
  x,
  y,
  logRatioType='balances',
  objective=NULL,
  lambda=1.0,
  offset=NULL,
  shrinkage=1.0,
  maxBaseLearners=5,
  optParams=list(),
  cvParams=list(),
  verbose=FALSE,
  overlap=TRUE,
  fast=TRUE
){
  
  # Convert x and y to the appropriate objects
  x = .prepx(x)
  y = .prepy(y)
  
  # Check whether we are in regression or classification mode by inspecting y
  if (is.null(objective)) {
    distinct_values = length(unique(y))
    if (distinct_values == 2) {
      objective = 'binary classification'
    } else if (inherits(y, 'factor')) {
      stop("Multi-class classification note yet implemented.")
    } else if (inherits(y, 'numeric')) {
      objective = 'regression'
      if (distinct_values <= 10) {
        warning("Response only has ", distinct_values, " distinct values.")
        warning("Consider changing the objective function.")
      }
    }
  }
  
  # Make sure we recognize objective
  if (! objective %in% c('binary classification', 'regression')) {
    stop("Objective: ", objective, " not yet implemented.")
  }
  
  # Save names of labels if relevant
  if (objective == 'binary classification' && inherits(y, 'factor')) {
    yLevels = levels(y)
    y = as.numeric(y) - 1
  } else {
    yLevels = NULL
  }
  
  # In the regression case, standardize data and save scale
  if (objective == 'regression') {
    yMean = mean(y)
    yScale = stats::sd(y)
    y = (y - yMean) / yScale
  } else {
    yMean = NULL
    yScale = NULL
  }
  
  # Convert logRatioType to a unique label:
  if (logRatioType %in% c('amalgamations', 'amalgam', 'A', 'SLR')) {
    logRatioType='A'
  } else if (logRatioType %in% c('balances', 'balance', 'B', 'ILR')) {
    logRatioType='B'
  } else {
    stop('Invalid logRatioType argument given: ', logRatioType)
  }
  
  if (any(x == 0)) {
    if (logRatioType == 'A') {
      warning("The data contain zeros. An epsilon is used to prevent divide-by-zero errors.")
    } else if (logRatioType == 'B') {
      stop("The data contain zeros. Balances cannot be used in this case.")
    }
  }
  
  if (!overlap) {
    # We store away the original data, since we will override during
    # the stagewise-additive procedure, zeroing out the input variables
    # that get picked up by each log-ratio.
    xOriginal = x
  }
  
  if (nrow(x) > 10000) {
    warning("Large number of observations; codacore could benefit from minibatching.")
  }
    
  if (nrow(x) < 50) {
    warning("Small number of observations; proceed with care (the likelihood of unstable results may increase).")
  }
  
  # Set up optimization parameters
  optDefaults = list(
    epochs=100,
    batchSize=nrow(x),
    vanillaLR=NULL,
    adaptiveLR=0.5,
    momentum=0.9,
    epsilonA=1e-6,
    epsilonB=1e-2
    # initialization = 'zeros'
  )
  # Take the defaults and override with any user-specified params, if given
  for (param in names(optParams)) {
    if (param %in% names(optDefaults)) {
      optDefaults[param] = optParams[param]
    } else {
      stop('Unknown optimization parameter given: ', param)
    }
  }
  optParams = optDefaults
  
  # Check whether we are running in fast or slow mode
  if (!fast) {
    message("CoDaCoRe is running in slow mode. Switch to fast=TRUE for ~x5 speedup.")
    optParams$epochs = 1000
  }
  
  # Set up cross-validation parameters
  cvDefaults = list(
    maxCutoffs=20,
    numFolds=5
  )
  # Take the defaults and override with any user-specified params, if given
  for (param in names(cvParams)) {
    if (param %in% names(cvDefaults)) {
      cvDefaults[param] = cvParams[param]
    } else {
      stop('Unknown cross-validation parameter given: ', param)
    }
  }
  cvParams = cvDefaults
  
  
  ### Now we train codacore:
  # Initialize from an empty ensemble
  ensemble = list()
  if (is.null(offset)) {
    boostingOffset = y * 0.0
  } else {
    boostingOffset = offset
  }
  maxBaseLearners = maxBaseLearners / shrinkage
  for (i in 1:maxBaseLearners) {
    startTime = Sys.time()
    cdbl = .CoDaBaseLearner(
      x=x,
      y=y,
      boostingOffset=boostingOffset,
      logRatioType=logRatioType,
      objective=objective,
      lambda=lambda,
      optParams=optParams,
      cvParams=cvParams,
      verbose=verbose
    )
    endTime = Sys.time()
    
    if (verbose) {
      cat('\n\n\nBase Learner', i)
      cat('\nLog-ratio indexes:')
      cat('\nNumerator =', which(cdbl$hard$numerator))
      cat('\nDenominator =', which(cdbl$hard$denominator))
      if (objective == 'binary classification') {
        cat('\nAccuracy:', cdbl$accuracy)
        cat('\nAUC:', cdbl$AUC)
      } else if (objective == 'regression') {
        cat('\nRMSE', cdbl$RMSE)
      }
      cat('\nTime taken:', format(endTime - startTime))
    }
    
    # If base learner is empty, we stop (no further gain in CV AUC):
    if (!any(cdbl$hard$numerator) && !any(cdbl$hard$denominator)) {break}
    
    # Add the new base learner to ensemble
    boostingOffset = boostingOffset + shrinkage * predict(cdbl, x)
    ensemble[[i]] = cdbl
    
    # If AUC is ~1, we stop (we separated the training data):
    # Note this won't always be caught by the previous check, since separability
    # can cause numerical overflow, which throws an error rather than producing
    # an empty base learner
    if (cdbl$objective == 'binary classification' && cdbl$AUC > 0.999) {break}
    if (cdbl$objective == 'regression' && cdbl$Rsquared > 0.999) {break}
    
    # To avoid overlapping log-ratios, we "zero-out" the input variables that have 
    # already been used
    if (!overlap) {
      x[, cdbl$hard$numerator] = min(x)
      x[, cdbl$hard$denominator] = min(x)
    }
  }
  
  if (!overlap) {
    # Replace the original data frame for saving in the object
    x = xOriginal
  }
  
  cdcr = list(
    ensemble=ensemble,
    x = x,
    y = y,
    objective=objective,
    logRatioType=logRatioType,
    lambda=lambda,
    shrinkage=shrinkage,
    maxBaseLearners=maxBaseLearners,
    optParams=optParams,
    cvParams=cvParams,
    overlap=overlap,
    yLevels=yLevels,
    yMean=yMean,
    yScale=yScale
  )
  class(cdcr) = "codacore"
  
  # If no log-ratios were found, suggest reducing regularization strength
  if (length(ensemble) == 0) {
    warning("No predictive log-ratios were found. Consider using lower values of lambda.")
  }
  
  return(cdcr)
}


#' predict
#'
#' @param object A codacore object.
#' @param newx A set of inputs to our model.
#' @param asLogits Whether to return outputs in logit space
#'  (as opposed to probability space). Should always be set
#'  to TRUE for regression with continuous outputs, but can
#'  be toggled for classification problems.
#' @param numLogRatios How many predictive log-ratios to 
#'  include in the prediction. By default, includes the
#'  effects of all log-ratios that were obtained during
#'  training. Setting this parameter to an integer k will
#'  restrict to using only the top k log-ratios in the model.
#' @param ... Not used.
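#'
#' @examples
#' \dontrun{
#' # A minimal usage sketch, assuming a model fitted on compositional x, y
#' # as in the codacore() example:
#' model = codacore(x, y)
#' logits = predict(model, x)
#' probabilities = predict(model, x, asLogits=FALSE)
#' # Restrict the prediction to the top log-ratio only:
#' topOnly = predict(model, x, numLogRatios=1)
#' }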
#'
#' @export
predict.codacore = function(object, newx, asLogits=TRUE, numLogRatios=NA, ...) {
  # Check for zeros: warn for amalgamations, stop for balances
  if (any(newx == 0)) {
    if (object$logRatioType == 'A') {
      warning("The data contain zeros. An epsilon is used to prevent divide-by-zero errors.")
    } else if (object$logRatioType == 'B') {
      stop("The data contain zeros. Balances cannot be used in this case.")
    }
  }
  
  x = .prepx(newx)
  yHat = rep(0, nrow(x))
  
  if (is.na(numLogRatios)) {
    numLogRatios = length(object$ensemble)
  }
  
  for (i in 1:numLogRatios) {
    cdbl = object$ensemble[[i]]
    yHat = yHat + object$shrinkage * predict(cdbl, x)
  }
  
  if (object$objective == 'binary classification') {
    if (asLogits) {
      return(yHat)
    } else {
      return(1 / (1 + exp(-yHat)))
    }
  } else if (object$objective == 'regression') {
    return(yHat * object$yScale + object$yMean)
  }
}


#' print
#'
#' @param x A codacore object.
#' @param ... Not used.
#'
#' @export
print.codacore = function(x, ...) {
  # TODO: Make this into a table to print all at once
  cat("\nNumber of log-ratios found:", length(x$ensemble))
  if (length(x$ensemble) >= 1) {
    for (i in 1:length(x$ensemble)) {
      cat("\n***")
      cat("\nLog-ratio rank", i)
      cdbl = x$ensemble[[i]]
      hard = x$ensemble[[i]]$hard
      if (is.null(rownames(cdbl$x))) {
        cat("\nNumerator:", which(cdbl$hard$numerator))
        cat("\nDenominator:", which(cdbl$hard$denominator))
      } else {
        cat("\nNumerator:", colnames(cdbl$x)[which(cdbl$hard$numerator)])
        cat("\nDenominator:", colnames(cdbl$x)[which(cdbl$hard$denominator)])
      }
      # cat("\nIntercept:", cdbl$intercept)
      if (cdbl$objective == 'binary classification') {
        cat("\nAUC:", cdbl$AUC)
        cat("\nSlope:", cdbl$slope)
      } else if (cdbl$objective == 'regression') {
        cat("\nR squared:", cdbl$Rsquared)
        cat("\nSlope:", cdbl$slope * x$yScale)
      }
    }
  }
  cat("\n") # one final new line at end to finish print block
}


#' plot
#' 
#' Plots a summary of a fitted codacore model.
#' Credit to the authors of the selbal package (Rivera-Pinto et al., 2018),
#' whose plots inspired the ones implemented here.
#'
#' @param x A codacore object.
#' @param index The index of the log-ratio to plot.
#' @param ... Not used.
#'
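#' @examples
#' \dontrun{
#' # A minimal sketch, assuming a model fitted as in the codacore() example:
#' model = codacore(x, y)
#' plot(model, index=1)
#' }
#'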
#' @export
plot.codacore = function(x, index = 1, ...) {
  
  allRatios = getLogRatios(x)
  if (index > ncol(allRatios)) {
    stop("The selected log-ratio does not exist!")
  }
  
  if (x$objective == 'regression') {
    
    logRatio = allRatios[, index]
    graphics::plot(logRatio, x$y, xlab='Log-ratio score', ylab='Response')
    graphics::abline(x$ensemble[[1]]$intercept, x$ensemble[[1]]$slope, lwd=2)
    
  } else if (x$objective == 'binary classification') {
    
    logRatio = allRatios[, index]
    
    # Convert 0/1 binary output to the original labels, if any
    if (!is.null(x$yLevels)) {
      y = x$yLevels[x$y + 1]
    } else {
      y = x$y
    }
    
    graphics::boxplot(
      logRatio ~ y,
      col=c('orange','lightblue'),
      main=paste0('Distribution of log-ratio ', index),
      xlab='Log-ratio score',
      ylab='Outcome',
      horizontal=TRUE
    )
    
  }
}


#' plotROC
#'
#' @param cdcr A codacore object.
#'
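#' @examples
#' \dontrun{
#' # A minimal sketch, assuming a classification model fitted as in codacore():
#' model = codacore(x, y)
#' plotROC(model)
#' }
#'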
#' @export
plotROC = function(cdcr) {
  
  if (cdcr$objective != 'binary classification') {
    stop("ROC curves undefined for binary classification")
  }
  cols = c("black", "gray50", "gray70", "gray80", "gray90")
  lwds = c(2.0, 1.5, 1.2, 0.8, 0.6)
  oldPar <- graphics::par(no.readonly = TRUE)
  on.exit(graphics::par(oldPar)) # make sure to restore params even if there's an error
  graphics::par(pty = 's')
  graphics::plot(cdcr$ensemble[[1]]$ROC)
  legendCols = c()
  numBL = length(cdcr$ensemble)
  legendText = c()
  legendLwds = c()
  for (i in 1:min(5, numBL)) {
    cdbl = cdcr$ensemble[[i]]
    graphics::lines(cdbl$ROC$specificities, cdbl$ROC$sensitivities, col=cols[i], lwd=lwds[i])
    legendText = c(legendText, paste0("Log-ratio: ", i, ", AUC: ", round(cdbl$AUC, 2)))
    legendCols = c(legendCols, cols[i])
    legendLwds = c(legendLwds, lwds[i])
  }
  graphics::legend(
    "bottomright",
    rev(legendText),
    lty=1,
    col=rev(legendCols),
    lwd=rev(legendLwds) + 0.5
  )
}


# Helper functions below...


#' activeInputs
#'
#' @param cdcr A codacore object.
#'
#' @return The covariates included in the log-ratios
#' 
#' @export
activeInputs.codacore = function(cdcr) {
  
  vars = c()
  
  for (cdbl in cdcr$ensemble) {
    vars = c(vars, which(cdbl$hard$numerator))
    vars = c(vars, which(cdbl$hard$denominator))
  }
  
  return(sort(unique(vars)))
}


#' getNumeratorParts
#'
#' @param cdcr A codacore object.
#' @param baseLearnerIndex An integer indicating which of the 
#'     (possibly multiple) log-ratios learned by codacore to be used.
#' @param boolean Whether to return the parts in boolean form
#'     (a vector of TRUE/FALSE) or to return the column names of
#'     those parts directly.
#'
#' @return The covariates in the numerator of the selected log-ratio.
#' 
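#' @examples
#' \dontrun{
#' # Hedged sketch: inspect the parts selected by the first log-ratio,
#' # assuming a model fitted as in the codacore() example:
#' model = codacore(x, y)
#' getNumeratorParts(model, 1, boolean=FALSE)   # column names
#' getDenominatorParts(model, 1, boolean=FALSE)
#' }
#'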
#' @export
getNumeratorParts <- function(cdcr, baseLearnerIndex=1, boolean=TRUE){
  
  parts = cdcr$ensemble[[baseLearnerIndex]]$hard$numerator
  
  if (boolean) {
    return(parts)
  } else {
    return(colnames(cdcr$x)[parts])
  }
}

#' getDenominatorParts
#'
#' @param cdcr A codacore object.
#' @param baseLearnerIndex An integer indicating which of the 
#'     (possibly multiple) log-ratios learned by codacore to be used.
#' @param boolean Whether to return the parts in boolean form
#'     (a vector of TRUE/FALSE) or to return the column names of
#'     those parts directly.
#' 
#' @return The covariates in the denominator of the selected log-ratio.
#' 
#' @export
getDenominatorParts <- function(cdcr, baseLearnerIndex=1, boolean=TRUE){
  
  parts = cdcr$ensemble[[baseLearnerIndex]]$hard$denominator
  
  if (boolean) {
    return(parts)
  } else {
    return(colnames(cdcr$x)[parts])
  }
}

#' getLogRatios
#'
#' @param cdcr A codacore object
#' @param x A set of (possibly unseen) compositional data. 
#'     The covariates must be passed in the same order as 
#'     for the original codacore() call.
#'
#' @return The learned log-ratio features, computed on input x.
#' 
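#' @examples
#' \dontrun{
#' # Hedged sketch: recover the learned features, assuming a fitted model
#' # and new data xNew with the same columns (in the same order) as x:
#' model = codacore(x, y)
#' ratiosTrain = getLogRatios(model)      # computed on the stored training x
#' ratiosNew = getLogRatios(model, xNew)
#' }
#'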
#' @export
getLogRatios <- function(cdcr, x=NULL){
  
  if (is.null(x)) {
    x = cdcr$x
  }
  
  if (cdcr$logRatioType == 'A') {
    epsilonA = cdcr$optParams$epsilonA
    ratios <- lapply(cdcr$ensemble, function(a){
      num <- rowSums(x[, a$hard$numerator, drop=FALSE]) + epsilonA
      den <- rowSums(x[, a$hard$denominator, drop=FALSE]) + epsilonA
      log(num/den)
    })
  } else if (cdcr$logRatioType == 'B') {
    ratios <- lapply(cdcr$ensemble, function(a){
      num <- rowMeans(log(x[, a$hard$numerator, drop=FALSE]))
      den <- rowMeans(log(x[, a$hard$denominator, drop=FALSE]))
      num - den
    })
  }
  
  out <- do.call("cbind", ratios)
  colnames(out) <- paste0("log-ratio", 1:ncol(out))
  return(out)
}


#' getSlopes
#'
#' @param cdcr A codacore object
#'
#' @return The slopes (i.e., regression coefficients) for each log-ratio.
#' 
#' @export
getSlopes <- function(cdcr){
  
  out = c()
  
  for (cdbl in cdcr$ensemble) {
    out = c(out, cdbl$slope)
  }
  
  return(out)
}


#' getNumLogRatios
#'
#' @param cdcr A codacore object
#'
#' @return The number of log-ratios that codacore found.
#'     Typically a small integer. Can be zero if codacore
#'     found no predictive log-ratios in the data.
#' 
#' @export
getNumLogRatios <- function(cdcr){
  return(length(cdcr$ensemble))
}


#' getTidyTable
#'
#' @param cdcr A codacore object
#'
#' @return A table displaying the log-ratios found.
#' 
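#' @examples
#' \dontrun{
#' # Hedged sketch: one row per selected part, tagged by side and log-ratio,
#' # assuming a model fitted as in the codacore() example:
#' model = codacore(x, y)
#' getTidyTable(model)
#' }
#'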
#' @export
getTidyTable <- function(cdcr){
  
  tidyLogRatio = function(baseLearnerIndex, model){
    x = getNumeratorParts(model, baseLearnerIndex, FALSE)
    df = data.frame(Side = 'Numerator', Name = x)
    x = getDenominatorParts(model, baseLearnerIndex, FALSE)
    df = rbind(df, data.frame(Side = 'Denominator', Name = x))
    df$logRatioIndex = baseLearnerIndex
    return(df)
  }
  
  num = getNumLogRatios(cdcr)
  
  if (num == 0) {
    return()
  } else {
    do.call(rbind, lapply(1:num, tidyLogRatio, model=cdcr))
  }
}

#' getBinaryPartitions
#'
#' @param cdcr A codacore object
#'
#' @return A matrix describing whether each component (as rows) is found in the
#'  numerator (1) or denominator (-1) of each learned log-ratio (as columns).
#'  This format resembles a serial binary partition matrix frequently used
#'  in balance analysis.
#' 
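#' @examples
#' \dontrun{
#' # Hedged sketch: +1 marks numerator parts, -1 denominator parts, 0 unused,
#' # assuming a model fitted as in the codacore() example:
#' model = codacore(x, y)
#' getBinaryPartitions(model)
#' }
#'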
#' @export
getBinaryPartitions <- function(cdcr){
  
  numBaseLearners <- length(cdcr$ensemble)
  res <- vector("list", numBaseLearners)
  for(baseLearner in 1:numBaseLearners){
    thisNumerator <- getNumeratorParts(cdcr, baseLearner)
    thisDenominator <- getDenominatorParts(cdcr, baseLearner)
    res[[baseLearner]] <- thisNumerator*1 + thisDenominator*-1
  }
  do.call("cbind", res)
}

.prepx = function(x) {
  if (inherits(x, 'tbl_df')) {x = as.data.frame(x)}
  if (inherits(x, 'data.frame')) {x = as.matrix(x)}
  if (is.integer(x)) {x = x * 1.0}
  
  # If the data is un-normalized (e.g. raw counts),
  # we normalize it to ensure our learning rate is well calibrated
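  # e.g. a count row c(10, 30, 60) becomes the composition c(0.1, 0.3, 0.6)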
  x = x / rowSums(x)
  return(x)
}

.prepy = function(y) {
  if (inherits(y, 'tbl_df')) {
    y = as.data.frame(y)
  }
  if (inherits(y, 'data.frame')) {
    if (ncol(y) > 1) {
      stop("Response should be 1-dimensional (if given 
           as a data.frame or matrix, it should have a 
           row for each sample, and a single column).")
    }
    y = y[[1]]
  }
  if (inherits(y, 'matrix')) {
    if (ncol(y) > 1) {
      stop("Response should be 1-dimensional (if given 
           as a data.frame or matrix, it should have a 
           row for each sample, and a single column).")
    }
    # Use is.character/is.numeric rather than inherits: the class of a
    # matrix is 'matrix', not its underlying storage type
    if (is.character(y)) {
      y = as.character(y)
    }
    if (is.numeric(y)) {
      y = as.numeric(y)
    }
  }
  if (inherits(y, 'character')) {
    y = factor(y)
  }
  return(y)
}