R/RcppExports.R

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' @title
#' Expected rank under the Harville model.
#'
#' @description
#'
#' Compute the expected rank of a bunch of entries based on their probability
#' of winning under the Harville model.
#'
#' @details
#'
#' Given the vector \eqn{\mu}, we compute the expected rank of each
#' entry, under the Harville model, where tail probabilities of winning
#' remain proportional.  
#'
#' Under the Harville model, the probability that the \eqn{i}th element
#' is assigned value 1 is 
#' \deqn{\pi_{1,i} = \frac{\mu_i}{\sum_j \mu_j}.}
#' Once an element has been assigned a 1, the Harville procedure 
#' removes it from the set and iterates.
#' If there are \eqn{k} elements in \eqn{\mu}, then the \eqn{i}th element
#' can be assigned any place between \eqn{1} and \eqn{k}. This
#' function computes the expected value of that random variable.
#'
#' While a naive implementation of this function would take
#' time factorial in \eqn{k}, this implementation takes time quadratic
#' in \eqn{k}, since it can be shown that the expected rank of the \eqn{i}th
#' element takes value
#' \deqn{e_i = k + \frac{1}{2} - \sum_j \frac{\mu_i}{\mu_i + \mu_j}.}
#'
#' @param mu a vector giving the probabilities. Should sum to one.
#'
#' @note
#' The expected ranks should sum to the sum of \code{1:length(mu)}.
#'
#' @return The expected ranks, a vector.
#' @examples
#' # a garbage example
#' set.seed(12345)
#' mus <- runif(12)
#' mus <- mus / sum(mus)
#' erank(mus)
#'
#' # confirm the expected rank via simulation
#' set.seed(123)
#' mus <- runif(6,min=0,max=2)
#' mus <- mus / sum(mus)
#' set.seed(101)
#' emp <- rowMeans(replicate(200,rhenery(mu=mus,gamma=rep(1,length(mus)-1)))) 
#' (emp - erank(mus)) / emp
#'
#' \donttest{
#' if (require(microbenchmark)) {
#'   p10 <- 1:10 / sum(1:10)
#'   p16 <- 1:16 / sum(1:16)
#'   p24 <- 1:24 / sum(1:24)
#'   microbenchmark(erank(p10), erank(p16), erank(p24))
#' }
#' }
#' @template etc
#' @rdname erank
#' @export
erank <- function(mu) {
    .Call('_ohenery_erank', PACKAGE = 'ohenery', mu)
}
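
# A minimal pure-R sketch of the quadratic-time formula documented above,
# e_i = k + 1/2 - sum_j mu_i / (mu_i + mu_j), with the sum over all j
# (including j = i). The helper name `erank_formula` is hypothetical and not
# part of the package; it only illustrates the formula and is not a
# replacement for the compiled erank().
erank_formula <- function(mu) {
  k <- length(mu)
  k + 0.5 - sapply(mu, function(mui) sum(mui / (mui + mu)))
}
# One can check that erank_formula(mus) agrees with erank(mus) up to floating
# point error, and that both sum to sum(1:length(mus)).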

#' @title
#' Random generation under the Henery (or Harville) softmax model.
#'
#' @description
#'
#' Given base probabilities, and Henery gamma coefficients,
#' performs random generation, using R's built-in random number
#' generator, of the final outcome of a race for each participant.
#'
#' @details
#' 
#' Given vectors \eqn{\mu} and \eqn{\gamma}, first computes
#' \deqn{\pi_{1,i} = \frac{\mu_i^{\gamma_1}}{\sum_j \mu_j^{\gamma_1}},}
#' then assigns a \eqn{1} to participant \eqn{i} with probability
#' \eqn{\pi_{1,i}}. The \sQuote{winning} participant is then removed
#' from consideration, and the process is repeated using the remaining
#' \eqn{\mu} and \eqn{\gamma} vectors.
#'
#' Typically one has that \eqn{\mu_i = \exp(\eta_i)}, for some
#' \sQuote{odds}, \eqn{\eta_i}.
#'
#' When the \eqn{\gamma} are all one, you recover the Harville softmax
#' model.
#'
#' @param mu  a vector of the probabilities of taking first place.
#' @param gamma  a vector of the gamma coefficients. Should have length
#' one less than that of \code{mu}, but if longer the unused elements are ignored.
#' If shorter, we reserve the right to either throw an error or extend out
#' the last gamma element. If not given, the coefficients are assumed
#' to be all one, which is the Harville model.
#' @return A vector, of the same length as the probabilities, giving
#' the place taken by each entry (or \sQuote{horse}). Note that while the
#' expected value of this returned vector is meaningful (see
#' \code{\link{erank}}), the vector is \emph{not} the finish ordering
#' of a race.
#'
#' @seealso \code{\link{rsm}}
#' @template etc
#' @keywords probability
#' @rdname rhenery
#' @export
rhenery <- function(mu, gamma = NULL) {
    .Call('_ohenery_rhenery', PACKAGE = 'ohenery', mu, gamma)
}
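
# A plain-R sketch of the sequential draw documented above: at each stage the
# remaining entry i wins with probability proportional to mu_i^gamma_r, takes
# the next place, and is removed. The helper name `rhenery_sketch` is
# hypothetical; the compiled rhenery() may differ in details such as argument
# checking or how a short gamma vector is handled.
rhenery_sketch <- function(mu, gamma = rep(1, length(mu) - 1)) {
  k <- length(mu)
  place <- integer(k)
  remaining <- seq_len(k)
  for (r in seq_len(k - 1)) {
    # probability of taking place r among the remaining entries
    p <- mu[remaining]^gamma[r]
    winner <- sample(length(remaining), 1, prob = p / sum(p))
    place[remaining[winner]] <- r
    remaining <- remaining[-winner]
  }
  place[remaining] <- k  # the last remaining entry takes last place
  place
}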

#' @title
#' Softmax log likelihood under Harville and Henery Models.
#'
#' @description
#'
#' Compute the softmax log likelihood and gradient of the same.
#'
#' @details
#'
#' Given vectors \eqn{g}, \eqn{\eta} and optionally the gradient of \eqn{\eta}
#' with respect to some coefficients, computes the log likelihood under the
#' softmax. The user must provide a reverse ordering as well, which is sorted
#' first by the groups, \eqn{g}, and then, within a group, in increasing
#' quality order. For a race, this means that the indices run from the
#' last place finisher to the first place finisher within that race,
#' where each group corresponds to one race.
#'
#' Under the Harville model, the log likelihood on a given group, where we are indexing
#' in \emph{forward} order, is 
#' \deqn{\left(\eta_1 - \log \sum_{j \ge 1} \mu_j\right) + \left(\eta_2 - \log \sum_{j \ge 2} \mu_j\right) + ...}
#' where \eqn{\mu_i = \exp(\eta_i)}.
#' By \dQuote{forward order}, we mean that \eqn{\eta_1} corresponds to the
#' participant taking first place within that group, \eqn{\eta_2} took second
#' place, and so on.
#' 
#' The Henery model log likelihood takes the form
#' \deqn{\left(\eta_1 - \log \sum_{j \ge 1} \mu_j\right) + \left(\gamma_2 \eta_2 - \log \sum_{j \ge 2} \mu_j^{\gamma_2}\right) + ...}
#' for gamma parameters, \eqn{\gamma}.
#' The Henery model reduces to the Harville model when all the gammas equal 1.
#'
#' Weights in weighted estimation apply to each summand.
#' The weight for the last place participant in a group is irrelevant,
#' since the corresponding summand is identically zero.
#' The weighted log likelihood under the Harville model is
#' \deqn{w_1\left(\eta_1 - \log \sum_{j \ge 1} \mu_j\right) + w_2\left(\eta_2 - \log \sum_{j \ge 2} \mu_j\right) + ...}
#' One should think of the weights as applying to the outcome,
#' not the participant.
#' 
#' @param g a vector giving the group indices.
#' @param idx a vector of integers, the same length as \code{g}, which
#' describes the reverse sort order on the observations, first by group,
#' then by place within the group. That is, the first element of \code{idx} is the
#' (zero-based) index of the last place finisher in the first group; the next
#' element is the index of the next-to-last place finisher in that group
#' (assuming it belongs to the same group), and so on.
#' The \code{idx} shall be zero based.
#' @param eta  a vector of the odds.
#' Must be the same length as \code{g}.
#' @param wt   an optional vector of non-negative elements, the same length
#' as \code{g}, giving the observation level weights. We then compute a 
#' weighted log likelihood, where the weights are in each summand.
#' The weights should probably have mean 1, but that's just, like, my
#' opinion, man. Negative weights throw an error. Note that the weights
#' for the last place in each group have no effect on the computation.
#' @param deleta  an optional matrix whose row count equals the number of elements
#' of \code{g}, \code{idx}, and \code{eta}. The rows of \code{deleta} are the derivatives of
#' eta with respect to some \eqn{z}. This is used to then maximize
#' likelihood with respect to \eqn{z}.
#'
#' @note
#' The likelihood function does not yet support ties.
#'
#' @return The log likelihood. If \code{deleta} is given, we add an attribute
#' to the scalar number, called \code{gradient}, giving the derivative of the
#' log likelihood with respect to \eqn{z}.
#' For the Henery model we also include an attribute called \code{gradgamma}, which is
#' the gradient of the log likelihood with respect to the gamma vector.
#' @examples
#' # a garbage example
#' set.seed(12345)
#' g <- as.integer(sort(ceiling(20 * runif(100))))
#' idx <- as.integer(rev(1:length(g)) - 1L)
#' eta <- rnorm(length(g))
#' foo <- harsmlik(g=g,idx=idx,eta=eta,deleta=NULL)
#'
#' # an example with a real idx
#' nfeat <- 5
#' set.seed(1234)
#' g <- ceiling(seq(0.1,1000,by=0.1))
#' X <- matrix(rnorm(length(g) * nfeat),ncol=nfeat)
#' beta <- rnorm(nfeat)
#' eta <- X %*% beta
#' y <- rsm(eta,g)
#' 
#' idx <- order(g,y,decreasing=TRUE) - 1
#' foores <- harsmlik(g,idx,eta,deleta=X)
#'
#' # now reweight them
#' wt <- runif(length(g))
#' wt <- wt / mean(wt)   # mean 1 is recommended
#' foores <- harsmlik(g,idx,eta,wt=wt)
#' 
#' # try hensmlik 
#' foores <- hensmlik(g,idx,eta,gamma=c(0.9,0.8,1),wt=wt)
#'
#' # check the value of the gradient by numerical approximation
#' \donttest{
#'  nfeat <- 8
#'  set.seed(321)
#'  g <- ceiling(seq(0.1,1000,by=0.1))
#'  X <- matrix(rnorm(length(g) * nfeat),ncol=nfeat)
#'  beta <- rnorm(nfeat)
#'  eta <- X %*% beta
#'  y <- rsm(eta,g)
#'  
#'  idx <- order(g,y,decreasing=TRUE) - 1
#'  if (require(numDeriv)) {
#'  
#'  	fastval <- attr(harsmlik(g,idx,eta,deleta=X),'gradient')
#'  	numap <- grad(function(beta,g,idx,X) { 
#'  			 eta <- X %*% beta
#'  			 as.numeric(harsmlik(g,idx,eta))
#'  			 },
#'  			 x=beta,g=g,idx=idx,X=X)
#'  	rbind(fastval,numap)
#'  }
#' }
#' @template etc
#' @template ref-harville
#' @template note-weights
#' @rdname smlik 
#' @name smlik 
#' @export
harsmlik <- function(g, idx, eta, wt = NULL, deleta = NULL) {
    .Call('_ohenery_harsmlik', PACKAGE = 'ohenery', g, idx, eta, wt, deleta)
}
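
# A minimal sketch of the per-group Harville log likelihood displayed above,
# taking eta already sorted in *forward* order (first place first) within a
# single group, with optional weights on each summand. The helper name
# `harville_llik_group` is hypothetical and not part of the package API.
harville_llik_group <- function(eta, wt = rep(1, length(eta))) {
  k <- length(eta)
  mu <- exp(eta)
  ll <- 0
  for (i in seq_len(k - 1)) {
    # summand for the i-th place finisher; the last place summand is zero
    ll <- ll + wt[i] * (eta[i] - log(sum(mu[i:k])))
  }
  ll
}
# For a single group of, say, four entries with eta given in finish order,
# harville_llik_group(eta) should agree with
#   harsmlik(g = rep(1L, 4), idx = 3:0, eta = eta)
# since idx = 3:0 is the zero-based reverse (last place first) ordering.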

#' @param gamma a vector of the gamma parameters. It is assumed that the
#' first element is \eqn{\gamma_2}, while the last element is applied
#' to all higher order tie breaks.
#' @template ref-henery
#' @rdname smlik 
#' @export
hensmlik <- function(g, idx, eta, gamma, wt = NULL, deleta = NULL) {
    .Call('_ohenery_hensmlik', PACKAGE = 'ohenery', g, idx, eta, gamma, wt, deleta)
}
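
# A companion sketch for the Henery case, mirroring the displayed formula: the
# summand for place i >= 2 is gamma_i * eta_i - log sum_{j >= i} mu_j^{gamma_i},
# with gamma_1 taken to be 1. Following the gamma parameter description above,
# this sketch assumes the supplied vector starts at gamma_2 and that its last
# element is reused for all deeper places. The helper name `henery_llik_group`
# is hypothetical, not the package's compiled hensmlik().
henery_llik_group <- function(eta, gamma, wt = rep(1, length(eta))) {
  k <- length(eta)
  mu <- exp(eta)
  # gam[i] is the gamma applied at place i; gam[1] = 1, gam[2] = gamma[1], ...
  gam <- c(1, gamma[pmin(seq_len(k - 1), length(gamma))])
  ll <- 0
  for (i in seq_len(k - 1)) {
    ll <- ll + wt[i] * (gam[i] * eta[i] - log(sum(mu[i:k]^gam[i])))
  }
  ll
}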