R/model_mpin.R

Defines functions .estimate_mpin .mpin_posteriors get_posteriors mpin_ml initials_mpin

Documented in get_posteriors initials_mpin mpin_ml

## - | FILE  HEADER |
##
## Script name:
##    model_mpin.R
##
## Purpose of script:
##    Implements the algorithms for generation of initial parameter sets,
##    the standard estimation method of the Multilayer PIN model, as well
##    as the computation, and display of posterior probabilities for the
##    same model.
##
## Author:
##    Montasser Ghachem
##
## Last updated:
##    2022-11-17
##
## License:
##    GPL 3
##
## Email:
##    montasser.ghachem@pinstimation.com
##
##
##
## Public functions:
## ++++++++++++++++++
##
## initials_mpin():
##    Based on the algorithm in Ersan (2016) , generates initial
##    parameter sets for the maximum likelihood estimation of the
##    MPIN model.
##
## mpin_ml():
##    Estimates the multilayer probability of informed trading
##    MPIN using the standard Maximum likelihood method.
##
## get_posteriors():
##    Computes the posterior probability, for each day in the sample,
##    that the day belongs to a no-information day, good information
##    day and bad information day, respectively (Easley and Ohara (1992),
##    Easley et al. (1996), Ersan (2016)).
##
## ++++++++++++++++++
##
##
## --
## Package: PINstimation
## website: www.pinstimation.com
## Authors: Montasser Ghachem and Oguz Ersan


##       +++++++++++++++++++++++++
## ++++++| |  PUBLIC FUNCTIONS | |
##       +++++++++++++++++++++++++


#' @title MPIN initial parameter sets of Ersan (2016)
#'
#' @description Based on the algorithm in
#' \insertCite{Ersan2016;textual}{PINstimation}, generates
#' initial parameter sets for the maximum likelihood estimation of the `MPIN`
#' model.
#'
#' @usage
#' initials_mpin(data, layers = NULL, detectlayers = "EG",
#'  xtraclusters = 4, verbose = TRUE)
#'
#' @param data A dataframe with 2 variables: the first
#' corresponds to buyer-initiated trades (buys), and the second corresponds
#' to seller-initiated trades (sells).
#'
#' @param layers An integer referring to the assumed number of
#' information layers in the data. If the value of `layers` is `NULL`, then
#' the number of layers is automatically determined by one of the following
#' functions: `detectlayers_e()`, `detectlayers_eg()`, and `detectlayers_ecm()`.
#' The default value is `NULL`.
#'
#' @param detectlayers A character string referring to the layer
#' detection algorithm used to determine the number of layers in the data. It
#' takes one of three values: `"E"`, `"EG"`, and `"ECM"`. `"E"` refers to the
#' algorithm in \insertCite{Ersan2016;textual}{PINstimation}, `"EG"` refers to
#' the algorithm in \insertCite{Ersan2022a;textual}{PINstimation}; while
#' `"ECM"` refers to the algorithm in
#' \insertCite{Ghachem2022;textual}{PINstimation}. The default value is `"EG"`.
#' Comparative results between the layer detection
#' algorithms can be found in \insertCite{Ersan2022a;textual}{PINstimation}.
#'
#' @param xtraclusters An integer used to divide trading days into
#' \code{#(1 + layers + xtraclusters)} clusters, thereby resulting in
#' \code{#comb(layers + xtraclusters, layers)} initial parameter sets in
#' line with \insertCite{ErsanAlici2016;textual}{PINstimation}, and
#' \insertCite{Ersan2016;textual}{PINstimation}. The default value is `4`
#' as chosen in \insertCite{Ersan2016;textual}{PINstimation}.
#'
#' @param verbose a binary variable that determines whether information messages
#' about the initial parameter sets, including the number of the initial
#' parameter sets generated. No message is shown when \code{verbose} is set
#' to \code{FALSE}. The default value is \code{TRUE}.
#'
#' @details The argument 'data' should be a numeric dataframe, and contain
#' at least two variables. Only the first two variables will be considered:
#' The first variable is assumed to correspond to the total number of
#' buyer-initiated trades, while the second variable is assumed to
#' correspond to the total number of seller-initiated trades. Each row or
#' observation correspond to a trading day. `NA` values will be ignored.
#'
#' @return Returns a dataframe of initial parameter sets each consisting of
#' `3J + 2` variables \{\eqn{\alpha}, \eqn{\delta}, \eqn{\mu}, \eb, \es\}.
#' \eqn{\alpha}, \eqn{\delta}, and \eqn{\mu} are vectors of length `J` where
#' `J` is the number of layers in the `MPIN` model.
#'
#' @references
#'
#' \insertAllCited
#'
#' @examples
#' # There is a preloaded quarterly dataset called 'dailytrades' with 60
#' # observations. Each observation corresponds to a day and contains the
#' # total number of buyer-initiated trades ('B') and seller-initiated
#' # trades ('S') on that day. To know more, type ?dailytrades
#'
#' xdata <- dailytrades
#'
#' # Obtain a dataframe of initial parameter sets for estimation of the MPIN
#' # model using the algorithm of Ersan (2016) with 3 extra clusters.
#' # By default, the number of layers in the data is detected using the
#' # algorithm of Ersan and Ghachem (2022a).
#'
#' initparams <- initials_mpin(xdata, xtraclusters = 3, verbose = FALSE)
#'
#' # Show the six first initial parameter sets
#'
#' print(round(t(head(initparams)), 3))
#'
#' # Use 10 randomly selected initial parameter sets from initparams to
#' # estimate the probability of informed trading via mpin_ecm. The number
#' # of information layers will be detected from the initial parameter sets.
#'
#' numberofsets <- nrow(initparams)
#' selectedsets <- initparams[sample(numberofsets, 10),]
#' \donttest{
#' estimate <- mpin_ecm(xdata, initialsets = selectedsets, verbose = FALSE)
#'
#' # Display the estimated MPIN value
#'
#' show(estimate@mpin)
#'
#' # Display the estimated parameters as a numeric vector.
#'
#' show(unlist(estimate@parameters))
#'
#' # Store the posterior probabilities in a variable, and show the first 6 rows.
#'
#' modelposteriors <- get_posteriors(estimate)
#' show(round(head(modelposteriors), 3))
#' }
#' @export
initials_mpin <- function(data, layers = NULL, detectlayers = "EG",
                             xtraclusters = 4, verbose = TRUE) {

  # Check that all variables exist and do not refer to non-existent variables
  # --------------------------------------------------------------------------
  allvars <- names(formals())
  environment(.xcheck$existence) <- environment()
  .xcheck$existence(allvars, err = uierrors$arguments()$mpininitfn)

  # Check that all arguments are valid
  # -------------------------------------------------------------------------
  largs <- list(data, layers, detectlayers, xtraclusters, verbose)
  names(largs) <- names(formals())
  rst <- .xcheck$args(arglist = largs, fn = "mpin")
  ux$stopnow(rst$off, m = rst$error, s = uierrors$arguments()$mpininitfn)


  # Prepare 'data' and initialize variables
  # --------------------------------------------------------------------------
  data <- ux$prepare(data)
  data$oi <- data$b - data$s
  data$aoi <- abs(data$b - data$s)

  # Get the number of layers if not provided
  # --------------------------------------------------------------------------
  if (is.null(layers))
    layers <- .xmpin$detect_layers(detectlayers, data)

  # Cluster aoi observations using HAC algorithm
  clusters <- hclust(dist(data$aoi), method = "complete")
  data$cluster <- cutree(clusters, layers + xtraclusters + 1)

  # Find average aoi per cluster and rank clusters by mean(aoi)
  means <- aggregate(. ~ cluster, data, mean)
  means <- means[order(means$aoi), ]
  clrank <- means$cluster


  # Find all different ways to partition cluster among no-information
  # cluster and information layers and store it in 'partitions'
  # --------------------------------------------------------------------------
  bind_to_matrix <- function(x, mat) {
    return(cbind(rep(x, nrow(mat)), mat))
  }
  positions <- function(sizesofar, layer, extra = xtraclusters) {
    maxsize <- extra + layer - sizesofar
    if (layer >= layers) {
      return(matrix((sizesofar + 1):(sizesofar + maxsize), ncol = 1))
    }
    if (layer < layers) {
      sequence <- sizesofar + (1:maxsize)
      output <- Reduce(
        rbind, lapply(sequence, function(x)
          (bind_to_matrix(x, positions(x, layer + 1)))))
      return(output)
    }
  }
  partitions <- positions(0, 1)
  npartitions <- nrow(partitions)
  xpartitions <- ux$tolist(partitions)

  # --------------------------------------------------------------------------

  # Reorder the cluster so they follow the order of clrank
  # ----------------------------------------------------------------------------
  data$cluster <- match(data$cluster, clrank)

  # Algorithm overview
  # ----------------------------------------------------------------------------
  # Create noEventRange which are the indices of clusters with the lower
  # absolute order imbalance. noEventRange will start with 1 element to
  # reach at the end init.points elements. It consists of the kth first
  # elements of Clrank where k goes from 1 to init.points.
  # for the clusters in the noEventRange, a column called set will take
  # the value 'noEvent'. For those outside notEventRange, the column set
  # takes the value 'good' if the order imbalance (oi) is non-negative
  # and takes 'bad' otherwise.
  # Once the variable 'set' is constructed, we can just aggregate the
  # data by the variable set and take the average within each group.
  # Once we have obtained the means, we can use GAN (2015) algorithm to
  # calculate an initial parameter set.
  # Once we have a new initial parameter set, we append it to the list of
  # initial points called initials.
  # ----------------------------------------------------------------------------
  .generate_one_iniatialset <- function(combination) {

    combination <- xpartitions[[combination]]
    cutoffs <- unname(unlist(c(0, combination, layers + xtraclusters + 1)))
    data$layer <- with(data, findInterval(cluster, cutoffs, left.open = TRUE) -
                         1)

    # Get the information about the sign of the OI, and delete the variables
    # oi, aoi, and cluster, not needed anymore.
    data$info <- sign(data$oi) * sign(data$layer)
    data$cluster <- data$oi <- data$aoi <- NULL
    data$info <- apply(
      data, 1, function(x) c("bad", "none", "good")[x[4] + 2])

    # Collect information on the information days
    infodata <- data[data$layer != 0, ]
    means <- aggregate(. ~ layer + info, infodata, mean, drop = FALSE)
    means$days <- aggregate(b ~ layer + info, infodata, length, drop = FALSE)$b
    means[is.na(means)] <- 0

    headers <- c("layer", "info", "b", "s", "days")
    xcol <- length(headers)

    # Create a shadow dataframe to make sure that all information layers
    # (+ and -) are represented.
    means_bg <- as.data.frame(matrix(0, ncol = xcol, nrow = 2 * layers))
    colnames(means_bg) <- headers
    means_bg$layer <- rep(1:layers, 2)
    means_bg$info <- c(rep("good", layers), rep("bad", layers))

    means <- merge(
      means, means_bg, all.y = TRUE, by = c("layer", "info"), )[, 1:xcol]

    colnames(means) <- headers

    # Add the none statistics
    noinfodata <- data[data$layer == 0, ]

    means_ne <- aggregate(. ~ info, noinfodata, mean, drop = TRUE)
    means_ne$days <- nrow(noinfodata)
    means_ne <- means_ne[, headers]
    means <- rbind(means, means_ne)
    means[is.na(means)] <- 0


    # Compute estimates of the parameters as in Ersan (2016)
    # --------------------------------------------------------------------------

    .get_distribution <- function(df, var) {

      types <- c("bad", "none", "good")
      .get_meanvalue <- function(df, var, type) {
        val <- df[df$info == type, var]
        denom <- ifelse(var == "days", nrow(data), 1)
        val <- val / denom
      }

      result <- lapply(types, function(x) .get_meanvalue(df, var, x))
      names(result) <- types
      return(result)
    }

    xd <- .get_distribution(means, "days")
    xb <- .get_distribution(means, "b")
    xs <- .get_distribution(means, "s")

    a <- xd$bad + xd$good
    d <- xd$bad / a

    .w_mean <- function(var1, var2, info) {
      v1 <- unlist(var1[info])
      v2 <- unlist(var2[info])
      v2 <- v2 / sum(v2)
      return(weighted.mean(x = v1, w = v2))
    }

    eb <- .w_mean(xb, xd, info = c("bad", "none"))
    es <- .w_mean(xs, xd, info = c("good", "none"))
    ediff <- eb - es

    wmu <- xb
    wmu$bad <- xs$bad - xb$bad + ediff
    wmu$good <- xb$good - xs$good - ediff
    mu <- (xd$bad * wmu$bad + xd$good * wmu$good) / (xd$bad + xd$good)
    mu[mu < 0] <- 0

    # Order the vectors alpha, delta by increasing mu
    ordmu <- order(mu)
    a <- a[ordmu]
    d <- d[ordmu]
    mu <- mu[ordmu]

    return(c(a, d, mu, eb, es))

  }


  xinitials <- vapply(
    1:npartitions,
    .generate_one_iniatialset, numeric(3 * layers + 2)
    )
  xinitials <- as.data.frame(t(xinitials))
  colnames(xinitials) <- .xmpin$varnames(4, layers)
  rownames(xinitials) <- NULL

  pin_err <- uierrors$pin()
  ux$show(c = verbose, m = pin_err$displaysets(
    "initials_mpin(...)", nrow(xinitials)), warning = TRUE)

  return(invisible(xinitials))
}


#' @title MPIN model estimation via standard ML methods
#'
#' @description Estimates the multilayer probability of informed trading
#' (`MPIN`) using the standard Maximum Likelihood method.
#'
#' @usage mpin_ml(data, layers = NULL, xtraclusters = 4, initialsets = NULL,
#' detectlayers = "EG", ..., verbose = TRUE)
#'
#' @param data A dataframe with 2 variables: the first
#' corresponds to buyer-initiated trades (buys), and the second corresponds
#' to seller-initiated trades (sells).
#'
#' @param layers An integer referring to the assumed number of
#' information layers in the data. If the argument \code{layers} is given,
#' then the maximum likelihood estimation will use the number of layers
#' provided. If \code{layers} is omitted,
#' the function \code{mpin_ml()} will find the optimal number of layers using
#' the algorithm developed in \insertCite{Ersan2022a;textual}{PINstimation}
#' (as default).
#'
#' @param xtraclusters An integer used to divide trading days into
#' \code{(1 + layers + xtraclusters)} clusters, thereby resulting in
#' \code{#comb(layers + xtraclusters, layers)} initial parameter sets in line
#' with \insertCite{ErsanAlici2016;textual}{PINstimation}, and
#' \insertCite{Ersan2016;textual}{PINstimation}. The default value is `4` as
#'  chosen in \insertCite{Ersan2016;textual}{PINstimation}.
#'
#' @param initialsets A dataframe containing initial parameter
#' sets for the estimation of the `MPIN` model. The default value is `NULL`.
#' If `initialsets` is `NULL`, the initial parameter sets are determined by the
#' function `initials_mpin()`.
#'
#' @param detectlayers A character string referring to the layer
#' detection algorithm used to determine the number of  layer in the data. It
#' takes one of three values: `"E"`, `"EG"`, and `"ECM"`. `"E"` refers to the
#' algorithm in \insertCite{Ersan2016;textual}{PINstimation}, `"EG"` refers to
#' the algorithm in \insertCite{Ersan2022a;textual}{PINstimation};
#' while `"ECM"` refers to the algorithm in
#' \insertCite{Ghachem2022;textual}{PINstimation}.
#' The default value is `"EG"`. Comparative results between the layer detection
#' algorithms can be found in \insertCite{Ersan2022a;textual}{PINstimation}.
#'
#' @param ... Additional arguments passed on to the function `mpin_ml`. The
#' recognized argument is `is_parallel`. `is_parallel` is a logical variable
#' that specifies whether the computation is performed using parallel
#' processing. The default value is \code{FALSE}.
#'
#' @param verbose A binary variable that determines whether detailed
#' information about the steps of the estimation of the MPIN model is displayed.
#' No output is produced when \code{verbose} is set to \code{FALSE}. The default
#' value is \code{TRUE}.
#'
#' @details The argument 'data' should be a numeric dataframe, and contain
#' at least two variables. Only the first two variables will be considered:
#' The first variable is assumed to correspond to the total number of
#' buyer-initiated trades, while the second variable is assumed to
#' correspond to the total number of seller-initiated trades. Each row or
#' observation correspond to a trading day. `NA` values will be ignored. \cr
#'
#' @return Returns an object of class \code{estimate.mpin}
#'
#' @references
#'
#'  \insertAllCited
#'
#' @examples
#' # There is a preloaded quarterly dataset called 'dailytrades' with 60
#' # observations. Each observation corresponds to a day and contains the
#' # total number of buyer-initiated trades ('B') and seller-initiated
#' # trades ('S') on that day. To know more, type ?dailytrades
#'
#' xdata <- dailytrades
#'
#' # ------------------------------------------------------------------------ #
#' # Estimate MPIN model using the standard ML method                         #
#' # ------------------------------------------------------------------------ #
#'
#' # Estimate the MPIN model using mpin_ml() assuming that there is a single
#' # information layer in the data. The model is then equivalent to the PIN
#' # model. The argument 'layers' takes the value '1'.
#' # We use two extra clusters to generate the initial parameter sets.
#'
#' estimate <- mpin_ml(xdata, layers = 1, xtraclusters = 2, verbose = FALSE)
#'
#' # Show the estimation output
#'
#' show(estimate)
#'
#' # Estimate the MPIN model using the function mpin_ml(), without specifying
#' # the number of layers. The number of layers is then detected using Ersan and
#' # Ghachem (2022a).
#' # -------------------------------------------------------------
#' \donttest{
#' estimate <- mpin_ml(xdata, xtraclusters = 2, verbose = FALSE)
#' }
#' # Show the estimation output
#'
#' show(estimate)
#'
#' # Display the likelihood-maximizing parameters
#'
#' show(estimate@parameters)
#'
#' # Display the global multilayer probability of informed trading
#'
#' show(estimate@mpin)
#'
#' # Display the multilayer probabilities of informed trading per layer
#'
#' show(estimate@mpinJ)
#'
#' # Display the first five initial parameters sets used in the maximum
#' # likelihood estimation
#'
#' show(round(head(estimate@initialsets, 5), 4))
#'
#' @importFrom skellam qskellam
#' @export
mpin_ml <- function(data, layers = NULL, xtraclusters = 4, initialsets = NULL,
                    detectlayers = "EG", ..., verbose = TRUE) {

  # Check that all variables exist and do not refer to non-existent variables
  # --------------------------------------------------------------------------
  allvars <- names(formals())
  allvars <- allvars[-6]
  environment(.xcheck$existence) <- environment()
  .xcheck$existence(allvars, err = uierrors$arguments()$mpininitfn)

  # Check that all arguments are valid
  # -------------------------------------------------------------------------
  largs <- list(
    data, layers, xtraclusters, initialsets, detectlayers, 0, verbose
    )
  names(largs) <- names(formals())
  largs[["..."]] <- NULL
  is_parallel <- .default$mpin_parallel

  vargs <- list(...)
  # check for unknown keys in the argument "..."
  unknown <- setdiff(names(vargs), "is_parallel")
  ux$stopnow(length(unknown) > 0, s = uierrors$mpin()$fn,
             m = uierrors$arguments()$unknown(u = unknown))

  # Collect the arguments in the dot-dot arguments
  if (length(vargs) > 0 && "is_parallel" %in% names(vargs))
    is_parallel <- vargs$is_parallel
  largs$is_parallel <- is_parallel
  rst <- .xcheck$args(arglist = largs, fn = "mpin")
  ux$stopnow(rst$off, m = rst$error, s = uierrors$mpin()$fn)


  # Prepare 'data' and initialize variables
  # --------------------------------------------------------------------------
  data <- ux$prepare(data)
  data$oi <- data$b - data$s
  data$aoi <- abs(data$b - data$s)

  # If both layers and xtraclusters are given, check that they are compatible
  if (!is.null(layers)) {
    rst <- .xcheck$xclusters(n = nrow(data), lay = layers,
                             xtra = xtraclusters)
    ux$stopnow(rst$off, m = rst$error, s = uierrors$mpin()$fn)
  }

  if (is.null(initialsets) && is.null(layers)) {
    dlayers <- .xmpin$detect_layers(toupper(detectlayers), data)
    rst <- .xcheck$xclusters(
      n = nrow(data), lay = dlayers, xtra = xtraclusters)
    ux$stopnow(rst$off, m = rst$error, s = uierrors$mpin()$fn)
  }

  # If the user specifies a number (layers), use it. If the argument layers
  # remains NULL, then use the detectlayers algorithms
  # ------------------------------------------------------------------------
  if (!is.null(initialsets)) {

    nrows <- nrow(initialsets)
    xlayers <- (length(initialsets[1, ]) - 2) / 3

    mpin_ms <- uix$mpin(
      nrows = nrows, initlayers = xlayers, layers = layers)

    ux$show(verbose, m = mpin_ms$start)

    ux$show(verbose, m = mpin_ms$detectsets)

    ux$show(verbose && !is.null(layers) && (xlayers != layers),
            m = mpin_ms$differentlayers, warning = TRUE)

    ux$show(verbose, m = mpin_ms$loadinitials)

    layers <- xlayers

    detection <- "INITIALSETS"

  } else {

    mpin_ms <- uix$mpin()

    ux$show(verbose, m = mpin_ms$start)

    if (is.null(layers)) {

      layers <- dlayers

      mpin_ms <- uix$mpin(layers = layers)

      ux$show(verbose, m = paste(
        mpin_ms$detectdata, mpin_ms$algorithm[detectlayers], sep = ""))

      ux$show(verbose, m = mpin_ms$numlayers)

      detection <- detectlayers

    } else {

      mpin_ms <- uix$mpin(layers = layers)

      ux$show(verbose, m = mpin_ms$selectedlayers)

      detection <- "USER"
    }


    ux$show(verbose, m = mpin_ms$computinginitials)

    initialsets <- initials_mpin(
      data, layers, xtraclusters = xtraclusters, verbose = FALSE)

    mpin_ms <- uix$mpin(nrows = nrow(initialsets))

  }

  # Transform initialsets from a dataframe to a list
  initialsets <- ux$tolist(initialsets)

  # Optimize the log-likelihood factorization of Ersan (2016)
  # ------------------------------------------------------------------------
  results <- .estimate_mpin(data, initialsets, layers,
                            detection, is_parallel, verbose)
  results@parallel <- is_parallel

  ux$show(verbose, m = mpin_ms$complete)

  return(results)
}



#' @title Posterior probabilities for PIN and MPIN estimates
#'
#' @description Computes, for each day in the sample, the posterior probability
#' that the day is a no-information day, good-information day and
#' bad-information day, respectively
#' (\insertCite{Easley1992;textual}{PINstimation},
#' \insertCite{Easley1996;textual}{PINstimation},
#' \insertCite{Ersan2016;textual}{PINstimation}).
#'
#' @usage get_posteriors(object)
#'
#' @param object (S4 object) an object of type `estimate.pin`,
#' `estimate.mpin`, or `estimate.mpin.ecm`.
#'
#' @return
#' If the argument `object` is of type `estimate.pin`, returns a dataframe of
#' three variables `post.N`, `post.G` and `post.B` containing in each row the
#' posterior probability that a given day is a no-information day (`N`),
#' good-information day (`G`), or bad-information day (`B`) respectively.
#'
#' If the argument `object` is of type `estimate.mpin` or `estimate.mpin.ecm`,
#' with `J` layers, returns a dataframe of `2*J+1` variables `Post.N`, and
#' `Post.G[j]` and `Post.B[j]` for each layer `j` containing in each row the
#' posterior probability that a given day is a no-information day,
#' good-information day in layer `j` or bad-information day in layer `j`,
#' for each layer `j` respectively.
#'
#' If the argument `object` is of any other type, an error is returned.
#'
#' @references
#'
#' \insertAllCited
#'
#' @examples
#' # There is a preloaded quarterly dataset called 'dailytrades' with 60
#' # observations. Each observation corresponds to a day and contains the
#' # total number of buyer-initiated trades ('B') and seller-initiated
#' # trades ('S') on that day. To know more, type ?dailytrades
#'
#' xdata <- dailytrades
#'
#' # ------------------------------------------------------------------------ #
#' # Posterior probabilities for PIN estimates                                #
#' # ------------------------------------------------------------------------ #
#'
#' # Estimate PIN using the Ersan and Alici (2016) algorithm and the
#' # factorization Lin and Ke(2011).
#'
#' estimate <- pin_ea(xdata, "LK", verbose = FALSE)
#'
#' # Display the estimated PIN value
#'
#' estimate@pin
#'
#' # Store the posterior probabilities in a dataframe variable and display its
#' # first 6 rows.
#'
#' modelposteriors <- get_posteriors(estimate)
#' show(round(head(modelposteriors), 3))
#'
#' # ------------------------------------------------------------------------ #
#' # Posterior probabilities for MPIN estimates                               #
#' # ------------------------------------------------------------------------ #
#'
#' # Estimate MPIN via the ECM algorithm, assuming that the dataset has 2
#' # information layers
#'
#' estimate <- mpin_ecm(xdata, layers = 2, verbose = FALSE)
#'
#' # Display the estimated Multilayer PIN value
#'
#' show(estimate@mpin)
#'
#' # Store the posterior probabilities in a dataframe variable and display its
#' # first six rows. The posterior probabilities are contained in a dataframe
#' # with 7 variables: one for no-information days, and two variables for each
#' # layer, one for good-information days and one for bad-information days.
#'
#' modelposteriors <- get_posteriors(estimate)
#' show(round(head(modelposteriors), 3))
#'
#' @export
get_posteriors <- function(object) {

  # Check that all variables exist and do not refer to non-existent variables
  # --------------------------------------------------------------------------
  allvars <- names(formals())
  environment(.xcheck$existence) <- environment()
  .xcheck$existence(allvars, uierrors$mpin()$fn)

  if (is(object, "estimate.adjpin"))
    return(message("\rError: The posteriors can't be obtained for",
                   " 'estimate.adjpin' objects!"))

  if (is(object, "estimate.pin") | is(object, "estimate.mpin")
      | is(object, "estimate.mpin.ecm"))
    return(attr(object, "posteriors"))
  return(message("\rError: Wrong object type!"))
}


##       +++++++++++++++++++++++++
## ++++++| | PRIVATE FUNCTIONS | |
##       +++++++++++++++++++++++++


.mpin_posteriors <- function(data, params) {
#  Computes the posterior probability for each day in the sample that the day
#  belongs to a no-information day, good information day and bad information
#  day, respectively
#
# Args:
#   data    : a dataframe containing two variables 'b' and 's'
#   params  : the set of MPIN estimates used to evaluate the posterior
#             probabilities
#
# Returns:
#   a dataframe of three variables `PostN`, `PostG` and `PostB` containing in
#  each row the  posterior probability that a given day is a no-information day,
#  good-information day or bad-information day respectively.

  # Prepare 'data' and initialize variables
  # --------------------------------------------------------------------------
  dx <- ux$prepare(data)
  a <- d <- mu <- eb <- es <- NULL

  # Collect the model parameters
  # --------------------------------------------------------------------------
  layers <- (length(params) - 2) / 3
  variables <- c("a", "d", "mu", "eb", "es")
  values <- unname(split(params, rep(1:5, c(layers, layers, layers, 1, 1))))
  for (i in seq_len(length(values)))
    assign(variables[i], unname(unlist(values[[i]])))

  # Compute the two parts of the factorization
  # --------------------------------------------------------------------------
  # Construct the variables e1, e2, e3, emax = max(ei) as in Ersan (2016).

  e1 <- kronecker(dx$b, t(mu), function(x, y) log(1 + y / eb) * x - y)
  e2 <- kronecker(dx$s, t(mu), function(x, y) log(1 + y / es) * x - y)
  e3 <- 0
  de <- data.frame(e1, e2, e3)
  emax <- do.call(pmax, de)

  dx$p1 <- dx$b * log(eb) + dx$s * log(es)  + emax
  dp2 <- cbind(t(t(exp(e1 - emax)) * (a * (1 - d))),
                   t(t(exp(e2 - emax)) * (a * d)),
                   (1 - sum(a)) * exp(e3 - emax))

  dx$p2 <- log(rowSums(dp2))
  dx$denom <- dx$p1 + dx$p2

  # Compute and return the conditional probabilities
  # --------------------------------------------------------------------------
  # - pn: numerator of P(n|b, s) - pg: numerator of P(g|b, s)
  # - pb: numerator of P(b|b, s) - denom : denominator of all probabilities

  dx$pn <- log((1 - sum(a))) + dx$b * log(eb) + dx$s * log(es)
  dx$pg <- log(t(t(exp(e1 - emax)) * (a * (1 - d)))) +
    dx$b * log(eb) + dx$s * log(es)
  dx$pb <- log(t(t(exp(e2 - emax)) * (a * d))) + dx$b * log(eb)
                  + dx$s * log(es)

  posteriors <- data.frame(with(
    data, cbind(exp(dx$pn - dx$denom), exp(dx$pg - dx$denom + emax),
                exp(dx$pb - dx$denom + emax))
  ))

  q <- seq(1:layers)
  qnames <- c("post.N", sprintf("post.G[%s]", q), sprintf("Post.B[%s]", q))
  colnames(posteriors) <- qnames

  return(posteriors)
}

.estimate_mpin <- function(data, initialsets, layers, detection,
                           is_parallel, verbose) {
# find MPIN-likelihood maximizing parameters starting from initial sets
# provided by the user.
#
# Args:
#   data        : a dataframe containing two variables 'b', and 's'
#   initialsets : a list of parameter vectors for the MPIN model
#   detection   : a character specifying how the number of layers is obtained
#               : It takes one of five values "INITIALSETS", "USER", "EG", "E",
#                 and "ECM"
#
# Returns:
#   a 'estimate.mpin' object containing the result of the optimization


  # Prepare 'data' and initialize variables
  # --------------------------------------------------------------------------

  data <- ux$prepare(data)
  data$oi <- data$b - data$s
  data$aoi <- abs(data$b - data$s)
  a <- d <- mu <- eb <- es <- time_on <- time_off <- NULL
  mpin_ms <- uix$mpin()

  estimates <- NULL
  optimal <- list(value = -Inf)

  # Optimize the likelihood for each initial set and pick the one
  # corresponding to the highest likelihood
  # --------------------------------------------------------------------------
  convergent <- 0
  low <- c(rep(0, (3 * layers)), 0, 0)
  up <- c(rep(1, (2 * layers)), rep(Inf, layers), Inf, Inf)
  nrows <- length(initialsets)

  time_on <- Sys.time()


  ux$show(verbose, m = mpin_ms$mlemethod)

  if (verbose) {
    pb_mpin <- ux$progressbar(0, nrows)
    cat(mpin_ms$progressbar)
  }

  .get_mlrun <- function(current) {

    # Prepare the current parameters, and the details dataframe
    # -----------------------------------------------------------------------
    temp_run <- unlist(initialsets[[current]])
    full <- length(initialsets)
    thisrun <- c(temp_run, rep(NA, 3 * layers + 4))

    # Code for displaying the progress bar
    # --------------------------------------------------------------------------
    if (verbose)
      pb_mpin <- ux$progressbar(minvalue = current - 1, maxvalue = full)

    # Estimate the ECM model by calling the function neldermead().
    # If the output of neldermead() is valid, calculate its likelihood using
    # -factorizations$mpin().
    # --------------------------------------------------------------------------
    estimates <- tryCatch({suppressWarnings(neldermead(
          initialsets[[current]], factorizations$mpin(data),
          lower = low, upper = up)
      )}, error = function(error_condition) {
        return(NULL)
      })


    if (!is.null(estimates)) {

      # The vector thisrun contains all optimal parameters, alongside the
      # list 'estimates' containing the results of the ML estimation.

      thisrun <- c(list(c(temp_run, estimates[["par"]], -estimates$value,
                   .xmpin$compute_pin(estimates[["par"]])), I(list(estimates))))
    } else {
      # If a run has failed, namely for inappropriate initial sets or for any other
      # reason, we associate it with a likelihood of -Inf

      thisrun <- c(list(c(temp_run, rep(NA, 3 * layers + 2), -Inf, -Inf), I(list(estimates))))

    }



    # Update the progress bar in the parent environment
    pe <- parent.env(environment())
    if (verbose) setTxtProgressBar(pe$pb_mpin, current)

    return(thisrun)
  }

  # Loop over the initial sets to find the optimal estimates
  # ----------------------------------------------------------------------------
  xs <- seq_len(length(initialsets))

  ux <- ux
  uix <- uix
  .xmpin <- .xmpin

  mpin_ms <- uix$mpin(layers = layers)
  if (verbose) {
    pb_mpin <- ux$progressbar(minvalue = 0, maxvalue = length(initialsets))
    cat(mpin_ms$progressbar)
  }

  if (is_parallel & length(initialsets) >= .default$parallel_cap()) {

    oplan <- future::plan(multisession, gc = TRUE,
                          workers = .default$parallel_cores())

    on.exit(plan(oplan), add = TRUE)

    runs <- furrr::future_map(xs, function(x) .get_mlrun(x))

  } else {

    runs <- lapply(xs, .get_mlrun)

  }

  time_off <- Sys.time()

  if (length(runs) > 1) {

    # Receive the list of all runs, each list element contains two list elements
    # the first one is all parameters relative to the run, and the second is the
    # list object, outcome of the ML neldermead estimation.
    estimates <- lapply(runs, "[[", 2)

    # Get and format the dataframe 'runs' which will be contained in the slot
    # 'details' of the optimal estimate
    xruns <- lapply(runs, "[[", 1)
    runs <- data.frame(do.call(rbind, xruns))

    colnames(runs) <- .xmpin$varnames(5, layers)
    rownames(runs) <- paste("set.", seq_len(nrow(runs)), sep = "")

    # Get the list of all likelihood. The number 'convergent' is the number of
    # all runs, for which the likelihood value is finite. If convergent is
    # different from zero, then there is an optimal estimate corresponding to
    # the one with highest likelihood value.
    lkdruns <- unlist(runs$likelihood)
    convergent <- length(lkdruns[is.finite(lkdruns)])
    if (convergent > 0) {
      optimizer <- which.max(lkdruns)[1]
      optimal <- estimates[[optimizer]][[1]]
    }

  }

  time_off <- Sys.time()


  # Optimization ended! - Format and return the optimal results
  # --------------------------------------------------------------------------

  initialpoints <- ux$todframe(initialsets)
  names(initialpoints) <- .xmpin$varnames(4, layers)

  if (is.finite(optimal$value)) {

    variables <- .xmpin$varnames()

    estimation <- split(
      optimal[["par"]], rep(1:5, c(layers, layers, layers, 1, 1)))

    for (i in 1:5) assign(variables[i], estimation[[i]])

    mpin_j <- setNames(a * mu / (sum(a * mu) + eb + es),
                       paste("layer.", 1:layers, "", sep = ""))
    mpin <- sum(a * mu) / (sum(a * mu) + eb + es)

    aggregates <- c(sum(a), sum(a * d) / sum(a),
                    sum(a * mu) / sum(a), eb, es)

    names(aggregates) <- .xmpin$varnames(2)

    parameters <- c(
      list(setNames(a,  paste("layer.", 1:layers, "", sep = ""))),
      list(setNames(d,  paste("layer.", 1:layers, "", sep = ""))),
      list(setNames(mu,  paste("layer.", 1:layers, "", sep = ""))),
      list(eb), list(es))

    names(parameters) <- .xmpin$varnames(2)

    xlist <- .xmpin$get_goodbadmpin(mpin_j, parameters)

    mpin_optimal <- new("estimate.mpin",
                   success = TRUE,
                   convergent.sets = convergent,
                   method = "ML",
                   layers = layers,
                   detection = detection,
                   parameters = parameters,
                   aggregates = aggregates,
                   likelihood = -optimal$value,
                   mpin.goodbad = xlist,
                   mpinJ = mpin_j,
                   mpin = mpin,
                   initialsets = initialpoints,
                   dataset = data,
                   details = runs,
                   runningtime = ux$timediff(time_on, time_off)
    )

    attr(mpin_optimal, "posteriors") <- .mpin_posteriors(
      data, c(a, d, mu, eb, es))


  } else {

    errmsg <- uierrors$mpin()$failed

    mpin_optimal <-
      new(
        "estimate.mpin",
        success = FALSE,
        method = "ML",
        detection = detection,
        layers = layers,
        errorMessage = errmsg,
        initialsets = initialpoints,
        dataset = data,
        runningtime = ux$timediff(time_on, time_off),
        likelihood = NaN,
        mpinJ = NaN,
        mpin = NaN
      )
  }

  return(mpin_optimal)

}

Try the PINstimation package in your browser

Any scripts or data that you put into this service are public.

PINstimation documentation built on March 31, 2023, 6:32 p.m.