Nothing
## - | FILE HEADER |
##
## Script name:
## model_adjpin.R
##
## Purpose of script:
## Implement the algorithms for generation of initial parameter sets,
## as well as, the estimation methods of the Adjusted PIN model of
## Duarte and Young (2009)
##
## Author:
## Montasser Ghachem
##
## Last updated:
## 2023-03-17
##
## License:
## GPL 3
##
## Email:
## montasser.ghachem@pinstimation.com
##
##
##
## Public functions:
## ++++++++++++++++++
##
## adjpin():
## Estimates the adjusted PIN 'adjPIN' as well as the
## probability of Symmetric Order-flow Shock 'PSOS'
## from the AdjPIN model of Duarte and Young(2009).
##
## initials_adjpin():
## Based on the algorithm in Ersan and Ghachem (2022b),
## generates sets of initial parameters to be used in
## the maximum likelihood estimation of AdjPIN model.
##
## initials_adjpin_cl():
## Based on an extension of the algorithm in Cheng and
## Lai(2021), generates sets of initial parameters to be
## used in maximum likelihood estimation of AdjPIN model.
##
## initials_adjpin_rnd():
## Generates random initial parameter sets to be used in
## the estimation of the AdjPIN model.
##
## ++++++++++++++++++
##
##
## --
## Package: PINstimation
## website: www.pinstimation.com
## Authors: Montasser Ghachem and Oguz Ersan
## +++++++++++++++++++++++++
## ++++++| | PUBLIC FUNCTIONS | |
## +++++++++++++++++++++++++
#' @title Estimation of adjusted PIN model
#'
#' @description Estimates the Adjusted Probability of Informed Trading
#' (`adjPIN`) as well as the Probability of Symmetric Order-flow Shock
#' (`PSOS`) from the `AdjPIN` model of Duarte and Young(2009).
#'
#' @usage adjpin(data, method = "ECM", initialsets = "GE", num_init = 20,
#' restricted = list(), ..., verbose = TRUE)
#'
#' @param data A dataframe with 2 variables: the first
#' corresponds to buyer-initiated trades (buys), and the second corresponds
#' to seller-initiated trades (sells).
#'
#' @param method A character string referring to the method
#' used to estimate the model of \insertCite{Duarte09;textual}{PINstimation}.
#' It takes one of two values: `"ML"` refers to the standard maximum likelihood
#' estimation, and `"ECM"` refers to the expectation-conditional maximization
#' algorithm. The default value is `"ECM"`. Details of the ECM method,
#' and comparative results can be found in
#' \insertCite{Ghachem2022;textual}{PINstimation}, and in
#' \insertCite{Ghachem2022b;textual}{PINstimation}.
#'
#' @param initialsets It can either be a character string referring to
#' prebuilt algorithms generating initial parameter sets or a dataframe
#' containing custom initial parameter sets.
#' If `initialsets` is a character string, it refers to the method of generation
#' of the initial parameter sets, and takes one of three values: `"GE"`, `"CL"`,
#' or `"RANDOM"`. `"GE"` refers to initial parameter sets generated by the
#' algorithm of \insertCite{Ersan2022b;textual}{PINstimation}, and implemented
#' in `initials_adjpin()`, `"CL"` refers to initial parameter sets generated by
#' the algorithm of \insertCite{ChengLai2021;textual}{PINstimation}, and
#' implemented in `initials_adjpin_cl()`, while `"RANDOM"` generates random
#' initial parameter sets as implemented in `initials_adjpin_rnd()`.
#' The default value is `"GE"`. If `initialsets` is a dataframe, the function
#' `adjpin()` will estimate the AdjPIN model using the provided initial
#' parameter sets.
#'
#' @param num_init An integer specifying the maximum number of
#' initial parameter sets to be used in the estimation.
#' If `initialsets="GE"`, the generation of initial parameter sets will stop
#' when the number of initial parameter sets reaches `num_init`. It can stop
#' earlier if the number of all possible generated initial parameter sets is
#' lower than `num_init`. If `initialsets="RANDOM"`, exactly `num_init`
#' initial parameter sets are returned. If `initialsets="CL"`: then `num_init`
#' is ignored, and all `256` initial parameter sets are used. The default
#' value is `20`. `[i]` The argument `num_init` is ignored when the argument
#' `initialsets` is a dataframe.
#'
#' @param restricted A binary list that allows estimating restricted
#' AdjPIN models by specifying which model parameters are assumed to be equal.
#' It contains one or multiple of the following four elements
#' `{theta, mu, eps, d}`. For instance, If `theta` is set to `TRUE`,
#' then the probability of liquidity shock in no-information days, and in
#' information days is assumed to be the same (\thetaB`=`\thetaS). If any of
#' the remaining rate elements `{mu, eps, d}` is set to `TRUE`,
#' (say `mu=TRUE`), then the rate is assumed to be the same on the buy side,
#' and on the sell side (\mub`=`\mus). If more than one element is set to
#' `TRUE`, then the restrictions are combined. For instance, if the argument
#' `restricted` is set to `list(theta=TRUE, eps=TRUE, d=TRUE)`, then the
#' restricted AdjPIN model is estimated, where \thetaB`=`\thetaS, \eb`=`\es,
#' and \Db`=`\Ds. If the value of the argument `restricted` is the empty list
#' (`list()`), then all parameters of the model are assumed to be independent,
#' and the unrestricted model is estimated. The default value is the empty
#' list `list()`.
#'
#' @param ... Additional arguments passed on to the function `adjpin()`. The
#' recognized arguments are `hyperparams`, and `fact`. The argument
#' `hyperparams` consists of a list containing the hyperparameters of the `ECM`
#' algorithm. When not empty, it contains one or more of the following
#' elements: `maxeval`, and `tolerance`. It is used only when the `method`
#' argument is set to `"ECM"`. The argument `fact` is a binary value that
#' determines which likelihood functional form is used: A factorization of
#' the likelihood function by \insertCite{Ersan2022b;textual}{PINstimation}
#' when it is set to `TRUE`, otherwise, the original likelihood function of
#' \insertCite{Duarte09;textual}{PINstimation}. The default value is `TRUE`.
#' More about these arguments are in the Details section.
#'
#' @param verbose A binary variable that determines whether
#' detailed information about the steps of the estimation of the AdjPIN model
#' is displayed. No output is produced when \code{verbose} is set to
#' \code{FALSE}. The default value is \code{TRUE}.
#'
#' @details The argument 'data' should be a numeric dataframe, and contain
#' at least two variables. Only the first two variables will be considered:
#' The first variable is assumed to correspond to the total number of
#' buyer-initiated trades, while the second variable is assumed to
#' correspond to the total number of seller-initiated trades. Each row or
#' observation correspond to a trading day. `NA` values will be ignored.
#'
#' If `initialsets` is neither a dataframe, nor a character string from the
#' set `{"GE",` `"CL",` `"RANDOM"}`, the estimation of the `AdjPIN` model is
#' aborted. The default initial parameters (`"GE"`) for the estimation
#' method are generated using a modified hierarchical agglomerative
#' clustering. For more information, see \code{initials_adjpin()}.
#'
#' The argument `hyperparams` contains the hyperparameters of the `ECM`
#' algorithm. It is either empty or contains one or two of the following
#' elements:
#'
#' \itemize{
#' \item `maxeval`: (`integer`) It stands for maximum number of iterations of
#' the `ECM` algorithm for each initial parameter set. When missing, `maxeval`
#' takes the default value of `100`.
#'
#' \item `tolerance` (`numeric`) The `ECM` algorithm is stopped when the
#' (relative) change of log-likelihood is smaller than tolerance. When
#' missing, `tolerance` takes the default value of `0.001`.
#' }
#'
#' @return Returns an object of class \code{estimate.adjpin}.
#'
#' @references
#'
#' \insertAllCited
#'
#' @examples
#' # We use 'generatedata_adjpin()' to generate a S4 object of type 'dataset'
#' # with 60 observations.
#'
#' sim_data <- generatedata_adjpin(days = 60)
#'
#' # The actual dataset of 60 observations is stored in the slot 'data' of the
#' # S4 object 'sim_data'. Each observation corresponds to a day and contains
#' # the total number of buyer-initiated transactions ('B') and seller-
#' # initiated transactions ('S') on that day.
#'
#' xdata <- sim_data@data
#'
#' # ------------------------------------------------------------------------ #
#' # Compare the unrestricted AdjPIN model with various restricted models #
#' # ------------------------------------------------------------------------ #
#'
#' # Estimate the unrestricted AdjPIN model using the ECM algorithm (default),
#' # and show the estimation output
#'
#' estimate.adjpin.0 <- adjpin(xdata, verbose = FALSE)
#'
#' show(estimate.adjpin.0)
#'
#' # Estimate the restricted AdjPIN model where mub=mus
#' \donttest{
#' estimate.adjpin.1 <- adjpin(xdata, restricted = list(mu = TRUE),
#' verbose = FALSE)
#'
#' # Estimate the restricted AdjPIN model where eps.b=eps.s
#'
#' estimate.adjpin.2 <- adjpin(xdata, restricted = list(eps = TRUE),
#' verbose = FALSE)
#'
#' # Estimate the restricted AdjPIN model where d.b=d.s
#'
#' estimate.adjpin.3 <- adjpin(xdata, restricted = list(d = TRUE),
#' verbose = FALSE)
#'
#' # Compare the different values of adjusted PIN
#'
#' estimates <- list(estimate.adjpin.0, estimate.adjpin.1,
#' estimate.adjpin.2, estimate.adjpin.3)
#'
#' adjpins <- sapply(estimates, function(x) x@adjpin)
#'
#' psos <- sapply(estimates, function(x) x@psos)
#'
#' summary <- cbind(adjpins, psos)
#' rownames(summary) <- c("unrestricted", "same.mu", "same.eps", "same.d")
#'
#' show(round(summary, 5))
#' }
#' @export
adjpin <- function(data, method = "ECM", initialsets = "GE", num_init = 20,
restricted = list(), ..., verbose = TRUE) {
# Check that all variables exist and do not refer to non-existent variables
# --------------------------------------------------------------------------
allvars <- names(formals())
allvars <- allvars[-6]
environment(.xcheck$existence) <- environment()
.xcheck$existence(allvars, err = uierrors$adjpin()$fn)
# Check the additional dot-dot-dot arguments
# --------------------------------------------------------------------------
hyperparams <- list()
fact <- TRUE
vargs <- list(...)
# check for unknown keys in the argument "..."
unknown <- setdiff(names(vargs), c("hyperparams", "fact"))
ux$stopnow(length(unknown) > 0, s = uierrors$mpin()$fn,
m = uierrors$arguments()$unknown(u = unknown))
if ("hyperparams" %in% names(vargs)) hyperparams <- vargs$hyperparams
if ("fact" %in% names(vargs)) fact <- vargs$fact
vargs <- NULL
# Check that all arguments are valid
# Exceptionally, we check the value of restricted before initialsets, since
# checking the size of initialsets requires a valid value for restricted.
# -------------------------------------------------------------------------
largs <- list(data, method, restricted, initialsets, num_init, 0, verbose)
names(largs) <- c("data", "method", "restricted", "initialsets",
"num_init", "...", "verbose")
largs[["..."]] <- NULL
largs$fact <- fact
largs$hyperparams <- hyperparams
if (is.null(hyperparams)) largs["hyperparams"] <- list(NULL)
rst <- .xcheck$args(arglist = largs, fn = "adjpin")
ux$stopnow(rst$off, m = rst$error, s = uierrors$adjpin()$fn)
rst <- .xcheck$hyperparams(hyperparams, nrow(data), adj = TRUE)
ux$stopnow(rst$off, m = rst$error, s = uierrors$adjpin()$fn)
hyperparams <- rst$hyperparams
restricted <- .xadjpin$allrestrictions(restricted)
# Check, prepare and initialize variables
# --------------------------------------------------------------------------
data <- ux$prepare(data)
initialpoints <- data.frame()
xclusters <- 0
# Generate or load initial sets
#-----------------------------------------------------------------------------
if (is.data.frame(initialsets)) {
init_type <- "CUSTOM"
ux$show(verbose, m = uix$adjpin()$start)
ux$show(
verbose, m = uix$adjpin(nrows = nrow(initialsets))$loadinitials)
initialpoints <- initialsets
} else {
if (!is.character(initialsets)) initialsets <- "GE"
if (is.character(initialsets)) {
initialsets <- toupper(initialsets)
}
get_cl_initialsets <- function(num_init) {
initialpoints <- suppressWarnings(
initials_adjpin_cl(data, restricted = restricted, verbose = FALSE))
return(initialpoints)
}
get_rnd_initialsets <- function(num_init) {
return(suppressWarnings(initials_adjpin_rnd(
data, restricted = restricted, num_init = num_init, verbose = FALSE)))
}
get_eg_initialsets <- function(num_init, xclusters) {
initialpoints <- data.frame()
xclusters <- 0
while ((nrow(initialpoints) < num_init) &
(xclusters < 0.5 * nrow(data))) {
new_initialsets <- suppressWarnings(initials_adjpin(
data, xtraclusters = xclusters, restricted = restricted,
verbose = FALSE))
temp_initialpoints <- rbind(initialpoints, new_initialsets)
# Remove duplicates among the probability variables
if (nrow(temp_initialpoints) > 0)
temp_initialpoints <- unique(temp_initialpoints)
# If the combination of both sets exceeds num_init initial sets, then
# pick randomly initial parameter sets, in order to have exactly
# num_init initial sets.
if (nrow(temp_initialpoints) > num_init) {
add_initialpoints <- new_initialsets[
sample(seq_len(nrow(new_initialsets)),
num_init - nrow(initialpoints)), ]
initialpoints <- rbind(initialpoints, add_initialpoints)
} else {
initialpoints <- temp_initialpoints
}
rownames(initialpoints) <- NULL
ux$show(
verbose,
uix$adjpin(
init = "GE", nrows = nrow(initialpoints))$computinginitials,
skip = FALSE)
xclusters <- xclusters + 1
}
return(initialpoints)
}
ux$show(verbose, uix$adjpin()$start)
init_type <- toupper(initialsets)
.unknowntype <- (!(init_type %in% c("GE", "CL", "RANDOM")))
if (.unknowntype) init_type <- "GE"
ux$show(verbose && .unknowntype, uix$adjpin()$unknowntype)
ux$show(verbose, uix$adjpin(init = init_type)$computinginitials,
skip = FALSE)
initialpoints <- switch(toupper(initialsets),
"CL" = get_cl_initialsets(num_init),
"RANDOM" = get_rnd_initialsets(num_init),
get_eg_initialsets(num_init, xclusters))
ux$show(verbose, uix$adjpin(init = init_type,
nrows = nrow(initialpoints))$computinginitials)
}
# Reorganize the initial sets and call the appropriate function
#-----------------------------------------------------------------------------
initialpoints <- as.data.frame(initialpoints)
colnames(initialpoints) <- .xadjpin$varnames(restricted)
# Transform initialpoints into a list
#-----------------------------------------------------------------------------
initialpoints <- ux$tolist(initialpoints)
# Perturb a bit the probabilities when they are zero or one, since ECM can't
# increase the probability of a cluster than has initially a probability zero
#-----------------------------------------------------------------------------
nu <- 10^-4
initialpoints <- lapply(
initialpoints, function(x) x + (x == 0) * nu - (x == 1) * nu)
if (method == "ML" | restricted$theta == TRUE) {
return(.adjpin_ml(data, initialsets = initialpoints, init_type = init_type,
restricted = restricted, fact = fact, verbose = verbose))
}
if (method == "ECM") {
return(.adjpin_ecm(data, initialsets = initialpoints, init_type = init_type,
restricted = restricted, hyperparams = hyperparams,
verbose = verbose))
}
}
#' @title AdjPIN initial parameter sets of Ersan & Ghachem (2022b)
#'
#' @description
#' Based on the algorithm in \insertCite{Ersan2022b;textual}{PINstimation},
#' generates sets of initial parameters to be used in the maximum likelihood
#' estimation of `AdjPIN` model.
#'
#' @usage initials_adjpin(data, xtraclusters = 4, restricted = list(),
#' verbose = TRUE)
#'
#' @param data A dataframe with 2 variables: the first
#' corresponds to buyer-initiated trades (buys), and the second corresponds
#' to seller-initiated trades (sells).
#'
#' @param xtraclusters An integer used to divide trading days into
#' #\code{(4 + xtraclusters)} clusters, thereby resulting in
#' #\code{comb(4 + xtraclusters - 1, 4 - 1)} initial parameter sets in line
#' with \insertCite{ErsanAlici2016;textual}{PINstimation}, and
#' \insertCite{Ersan2022b;textual}{PINstimation}.The default value is `4` as
#' chosen in \insertCite{Ersan2016;textual}{PINstimation}.
#'
#' @param restricted A binary list that allows estimating restricted
#' AdjPIN models by specifying which model parameters are assumed to be equal.
#' It contains one or multiple of the following four elements
#' `{theta, mu, eps, d}`. For instance, If `theta` is set to `TRUE`,
#' then the probability of liquidity shock in no-information days, and in
#' information days is assumed to be the same (\thetaB`=`\thetaS). If any of
#' the remaining rate elements `{mu, eps, d}` is set to `TRUE`,
#' (say `mu=TRUE`), then the rate is assumed to be the same on the buy side,
#' and on the sell side (\mub`=`\mus). If more than one element is set to
#' `TRUE`, then the restrictions are combined. For instance, if the argument
#' `restricted` is set to `list(theta=TRUE, eps=TRUE, d=TRUE)`, then the
#' restricted AdjPIN model is estimated, where \thetaB`=`\thetaS, \eb`=`\es,
#' and \Db`=`\Ds. If the value of the argument `restricted` is the empty list,
#' then all parameters of the model are assumed to be independent,
#' and the unrestricted model is estimated. The default value is the empty
#' list `list()`.
#'
#' @param verbose a binary variable that determines whether information messages
#' about the initial parameter sets, including the number of the initial
#' parameter sets generated. No message is shown when \code{verbose} is set
#' to \code{FALSE}. The default value is \code{TRUE}.
#'
#' @details The argument 'data' should be a numeric dataframe, and contain
#' at least two variables. Only the first two variables will be considered:
#' The first variable is assumed to correspond to the total number of
#' buyer-initiated trades, while the second variable is assumed to
#' correspond to the total number of seller-initiated trades. Each row or
#' observation correspond to a trading day. `NA` values will be ignored.
#'
#' The function \code{initials_adjpin()} implements the algorithm suggested in
#' \insertCite{Ersan2022b;textual}{PINstimation}, and uses a hierarchical
#' agglomerative clustering (HAC) to find initial parameter sets for
#' the maximum likelihood estimation.
#'
#' @return Returns a dataframe of numerical vectors of ten elements
#' \{\eqn{\alpha}, \eqn{\delta}, \eqn{\theta}, \eqn{\theta'},
#' \eb, \es, \mub, \mus, \Db, \Ds\}.
#'
#' @references
#'
#' \insertAllCited
#'
#' @examples
#' # There is a preloaded quarterly dataset called 'dailytrades' with 60
#' # observations. Each observation corresponds to a day and contains the
#' # total number of buyer-initiated trades ('B') and seller-initiated
#' # trades ('S') on that day. To know more, type ?dailytrades
#'
#' xdata <- dailytrades
#'
#' # Obtain a dataframe of initial parameter sets for the maximum likelihood
#' # estimation using the algorithm of Ersan and Ghachem (2022b).
#'
#' init.sets <- initials_adjpin(xdata)
#'
#' # Use the list to estimate adjpin using the adjpin() method
#' # Show the value of adjusted PIN
#'
#' estimate <- adjpin(xdata, initialsets = init.sets, verbose = FALSE)
#' show(estimate@adjpin)
#'
#' @export
initials_adjpin <- function(data, xtraclusters = 4, restricted = list(),
verbose = TRUE) {
# Check that all variables exist and do not refer to non-existent variables
# --------------------------------------------------------------------------
allvars <- names(formals())
environment(.xcheck$existence) <- environment()
.xcheck$existence(allvars, err = uierrors$adjpin()$fn)
# Check that all arguments are valid
# -------------------------------------------------------------------------
largs <- list(data, xtraclusters, restricted, verbose)
names(largs) <- names(formals())
rst <- .xcheck$args(arglist = largs, fn = "adjpin")
ux$stopnow(rst$off, m = rst$error, s = uierrors$adjpin()$fn)
# Check, prepare and initialize variables
# --------------------------------------------------------------------------
data <- ux$prepare(data)
restricted <- .xadjpin$allrestrictions(restricted)
initials <- NULL
bxdata <- sxdata <- data
cls <- 3
# Create data clustering into 4 clusters:
# by buys (bxmeans), and by sells (sxmeans)
# --------------------------------------------------------------------------
bxclusters <- hclust(dist(bxdata$b), method = "complete")
bxdata$cluster <- cutree(bxclusters, cls + 1 + xtraclusters)
bxmeans <- aggregate(. ~ cluster, bxdata, mean, drop = FALSE)
bxmeans <- bxmeans[order(bxmeans$b), ]
bxclrank <- bxmeans$cluster
bxdata$cluster <- match(bxdata$cluster, bxclrank)
sxclusters <- hclust(dist(sxdata$s), method = "complete")
sxdata$cluster <- cutree(sxclusters, cls + 1 + xtraclusters)
sxmeans <- aggregate(. ~ cluster, sxdata, mean, drop = FALSE)
sxmeans <- sxmeans[order(sxmeans$s), ]
sxclrank <- sxmeans$cluster
sxdata$cluster <- match(sxdata$cluster, sxclrank)
# Find all different ways to partition cluster among no-information
# cluster and information layers and store it in 'partitions'
# --------------------------------------------------------------------------
bind_to_matrix <- function(x, mat) {
return(cbind(rep(x, nrow(mat)), mat))
}
positions <- function(sizesofar, cluster, extra = xtraclusters) {
maxsize <- extra + cluster - sizesofar
if (cluster >= cls) {
return(matrix((sizesofar + 1):(sizesofar + maxsize), ncol = 1))
}
if (cluster < cls) {
sequence <- sizesofar + (1:maxsize)
output <- Reduce(
rbind, lapply(sequence, function(x)
(bind_to_matrix(x, positions(x, cluster + 1)))))
return(output)
}
}
partitions <- positions(0, 1)
# --------------------------------------------------------------------------
# [+] Functions to be used in the function initials_adjpin()
# --------------------------------------------------------------------------
# A function creates a summary of average rates per cluster
# --------------------------------------------------------------------------
create_datasummary <- function(thisdf, bylayer = TRUE) {
thisdf$key <- if (bylayer) thisdf$layer else thisdf$cluster
overview <- aggregate(. ~ key, thisdf, mean, drop = FALSE)
overview <- cbind(overview, aggregate(. ~ key, thisdf, min)[, c("b", "s")])
overview <- cbind(overview, aggregate(. ~ key, thisdf, max)[, c("b", "s")])
colnames(overview) <- c("key", "b", "s", "cluster", "layer", "minb",
"mins", "maxb", "maxs")
overview$days <- aggregate(b ~ key, thisdf, length)$b
overview$key <- NULL
overview[is.na(overview)] <- 0
return(overview)
}
# A function divides a cluster into sub-clusters - based on order imbalance
# --------------------------------------------------------------------------
into2clusters <- function(thiscluster) {
# Initialize the return value 'xoverview' to the cluster to split.
# The value 'xoverview' if the number of days in the cluster is larger
# than 1, and therefore can be split into 2 clusters.
xoverview <- thiscluster
if (thiscluster$days > 1) {
medlayers <- thiscluster[1, ]$layer
if (is.list(thiscluster[1, ]$layer)) medlayers <- unlist(medlayers)
xdata <- data[data$layer %in% medlayers, ]
xdata$oi <- xdata$b - xdata$s
clusters <- hclust(dist(xdata$oi), method = "complete")
xdata$cluster <- cutree(clusters, 2)
xdata$oi <- NULL
xoverview <- create_datasummary(xdata, bylayer = FALSE)
}
return(xoverview)
}
# ----------------------------------------------------------------------------
# Run the process of producing initial sets for all configurations
# ----------------------------------------------------------------------------
for (k in seq_len(nrow(partitions))) {
cutoffs <- unname(unlist(c(0, partitions[k, ], cls + xtraclusters + 1)))
bxdata$layer <- findInterval(bxdata$cluster, cutoffs, left.open = T)
sxdata$layer <- findInterval(sxdata$cluster, cutoffs, left.open = T)
# Create a summary of the data clustered by the variable 'layer'
# -------------------------------------------------------------------------
bbxmeans <- create_datasummary(bxdata)
bbxmeans <- bbxmeans[order(bbxmeans$b), ]
ssxmeans <- create_datasummary(sxdata)
ssxmeans <- ssxmeans[order(ssxmeans$s), ]
# +++ +++ #
# ++++++ ++++++ #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# I. BUILD A HYPOTHETICAL DISTRIBUTION IN THE DATAFRAME 'PARAMBOX' + #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# ++++++ ++++++ #
# +++ +++ #
# A box of parameters of mean buys, means sells and number of days
# It contains six rows: one for each cluster in the AdjPIN model
parambox <- data.frame(matrix(0, nrow = 6, ncol = 3))
colnames(parambox) <- c("b", "s", "days")
# Create a vector ([eb], [eb+db], [eb+mub], [eb+mub+db]) from the dataframe
# bbxmeans, and call it buyrates.
# --------------------------------------------------------------------------
int_buys <- bbxmeans[2:3, ]
b2 <- int_buys[which.max(int_buys$s), ]$b
b3 <- int_buys[which.min(int_buys$s), ]$b
buyrates <- c(min(bbxmeans$b), b2, b3, max(bbxmeans$b))
# Create a vector ([es], [es+ds], [es+mus], [es+mus+ds]) from the dataframe
# ssxmeans, and call it sellrates.
# -------------------------------------------------------------------------
int_sells <- ssxmeans[2:3, ]
s2 <- int_sells[which.max(int_sells$b), ]$s
s3 <- int_sells[which.min(int_sells$b), ]$s
sellrates <- c(min(ssxmeans$s), s2, s3, max(ssxmeans$s))
# Build the hypothetical distribution parambox
# --------------------------------------------------------------------------
# Use the values of buyrates ([eb], [eb+db], [eb+mub], [eb+mub+db]), and the
# values of sellrates ([es], [es+ds], [es+mus], [es+mus+ds]), to create a
# hypothetical distribution and store in the dataframe 'parambox'
#
#----------------------------------------
# |[buys] |[sells] |
#----------------------------------------
# c1 |[eb] |[es] |
# c2 |[eb+db] |[es+ds] |
# c3 |[eb+mub] |[es] |
# c4 |[eb+mub+db] |[es+ds] |
# c5 |[eb] |[es+mus] |
# c6 |[eb+db] |[es+mus+ds] |
#----------------------------------------
# Gather all elements relative to buyfirst = T (= F) in a list bflist
# (sflist). It will be easier to call all these elements, once the value
# of buyfirst is determined
bflist <- list(xmeans = bbxmeans, data = bxdata,
indxmax = which.max(bbxmeans$b),
indxliq = 1 + which.min(bbxmeans[2:3, ]$s))
sflist <- list(xmeans = ssxmeans, data = sxdata,
indxmax = which.max(ssxmeans$s),
indxliq = 1 + which.min(ssxmeans[2:3, ]$b))
xlists <- list(bflist, sflist)
for (buyfirst in c(TRUE, FALSE)) {
# Pick the active list based on the value of buyfirst
xlist <- xlists[[2 - buyfirst]]
# xmeans = bbxmeans when buyfirst = T, otherwise xmeans = ssxmeans
# ------------------------------------------------------------------------
xmeans <- xlist$xmeans
data <- xlist$data
# Initialize parambox at its theoretical values
# ------------------------------------------------------------------------
parambox[, 1] <- c(buyrates, buyrates[1:2])
parambox[, 2] <- c(sellrates[1:2], sellrates)
parambox[, 3] <- rep(0, 6)
# +++ +++ #
# ++++++ ++++++ #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# II. PARTITION THE DATA IN SIX CLUSTERS IN A DARATFRAME 'SIXCLUSTERS' + #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# ++++++ ++++++ #
# +++ +++ #
# indxmax: index of the cluster of maximum trades (eb+mub+db, es+mus+ds)
# indxliq: index of the cluster with liquidity shocks (eb+db, es+ds)
# ------------------------------------------------------------------------
indxmax <- xlist$indxmax
indxliq <- xlist$indxliq
# Identify the two clusters to be clustered further, different from
# indxmax and indxliq, and gather them into a dataframe 'clusterfurther'
# Gather all clusters in one dataframe called 'sixclusters'
# ------------------------------------------------------------------------
sixclusters <- xmeans[c(indxmax, indxliq), ]
clusterfurther <- xmeans[-c(indxmax, indxliq), ]
if (nrow(clusterfurther) > 0) {
for (rw in seq_len(nrow(clusterfurther))) {
sixclusters <- rbind(sixclusters, into2clusters(clusterfurther[rw, ]))
}
}
sixclusters$layer <- sixclusters$cluster <- NULL
# +++ +++ #
# ++++++ ++++++ #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# III.ATTACH EACH CLUSTER TO HYPOTHETICAL CLUSTERS IN 'PARAMBOX' + #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# ++++++ ++++++ #
# +++ +++ #
mergexrows <- function(rows) {
newrow <- rows[1, ]
newrow$days <- sum(rows$days)
newrow$b <- sum(rows$days * rows$b) / newrow$days
newrow$s <- sum(rows$days * rows$s) / newrow$days
return(newrow)
}
# Connect clusters to hypothetical clusters, using the vector 'xpositions'
# where the nth entry contains the index of hypothetical cluster.
# Compute a similarity score 'similarity$score', and pick the hypothetical
# cluster as the cluster with the highest similarity score.
# ------------------------------------------------------------------------
xpositions <- NULL
for (row in seq_len(nrow(sixclusters))) {
brow <- sixclusters[row, ]$b
srow <- sixclusters[row, ]$s
similarity <- parambox[, c("b", "s")]
similarity$dpb <- apply(parambox, 1, function(x)
abs(ppois(brow, x[1], log.p = TRUE, lower.tail = (x[1] < brow))))
similarity$dps <- apply(parambox, 1, function(x)
abs(ppois(srow, x[2], log.p = TRUE, lower.tail = (x[2] < srow))))
similarity$score <- similarity$dpb * similarity$dps
if (all(similarity$score == 0)) {
similarity$dpb <- (parambox$b - brow)^2
similarity$dps <- (parambox$s - srow)^2
similarity$score <- - sqrt(similarity$dpb + similarity$dps)
}
xposition <- tail(order(similarity$score), 1)
xposition <- head(order(similarity$score, decreasing = TRUE), 1)
xpositions <- c(xpositions, xposition)
}
# Attach the current cluster 'xcluster' into the hypothetical cluster that
# has the index 'hypo' in the hypothetical distribution 'parambox'.
# If the cluster 'hypo' in 'parambox' already contain a cluster, merge
# both clusters, using the function 'mergexrows()'.
# ------------------------------------------------------------------------
for (i in seq_len(length(xpositions))) {
hypo <- xpositions[i]
xcluster <- sixclusters[i, c("b", "s", "days")]
if (parambox[hypo, 3] == 0) {
parambox[hypo, ] <- xcluster
} else {
xrows <- rbind(xcluster, parambox[hypo, ])
parambox[hypo, ] <- mergexrows(xrows)
}
}
parambox[parambox$days == 0, ] <- 0
# +++ +++ #
# ++++++ ++++++ #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# IV. COMPUTE INITIAL PARAMETER SETS FOLLOWING ERSAN & GHACHEM (2022) + #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# ++++++ ++++++ #
# +++ +++ #
# Distribute the parambox into three variables avb, avs and days.
# avb: average buys, avs: average sells, and days: number of days.
# -----------------------------------------------------------------------
avb <- unlist(parambox[, 1])
avs <- unlist(parambox[, 2])
days <- unlist(parambox[, 3])
# Compute 'empirical' values for alpha (a), delta (d) theta (t)
# and theta' (tp) and min_avb (min_avs) the minimum average buys (sells)
# ------------------------------------------------------------------------
a <- sum(days[3:6]) / sum(days)
d <- sum(days[c(5, 6)]) / sum(days[3:6])
t <- days[2] / sum(days[1:2])
tp <- sum(days[c(4, 6)]) / sum(days[3:6])
params <- c(a, d, t, tp)
params[is.na(params)] <- 0
# Generation of parameters - See Ersan and Ghachem (2022)
# ------------------------------------------------------------------------
eb <- c(avb[1], avb[5])
eb <- max(eb[eb > 0], 0)
if (eb == 0) eb <- buyrates[1]
es <- c(avs[1], avs[3])
es <- max(es[es > 0], 0)
if (es == 0) es <- sellrates[1]
db <- c(avb[2] - eb, avb[6] - eb,
ifelse(avb[4] * avb[3] > 0, avb[4] - avb[3], 0))
db <- max(db[db > 0], 0)
ds <- c(avs[2] - es, avs[4] - es,
ifelse(avb[6] * avb[5] > 0, avb[6] - avb[5], 0))
ds <- max(ds[ds >= 0], 0)
mub <- c(avb[4] - eb - db, avb[3] - eb)
mub <- max(mub[mub > 0], 0)
mus <- c(avs[6] - es - ds, avs[5] - es)
mus <- max(mus[mus > 0], 0)
xparams <- c(params, eb, es, mub, mus, db, ds)
xparams[is.nan(xparams)] <- 0
# Exclude initial parameter sets where:
# [1] one or more probability parameters are negative
# [2] one or more rate parameters are non-positive
# [3] mub = 0, and delta != 1. If delta != 1, then there are
# positive information days, so we can estimate a positive mub.
# [4] mus = 0, and delta != 0. If delta != 0, then there are
# negative information days, so we can estimate a positive mus.
# [5] db = ds = 0, while either theta or thetap is different from zero
# ------------------------------------------------------------------------
invalid <- (any(xparams[1:4] < 0)) |
(floor(xparams[7]) == 0 & xparams[2] != 1) |
(floor(xparams[8]) == 0 & xparams[2] != 0) |
(min(floor(xparams[9:10])) == 0 & sum(xparams[3:4]) != 0)
if (!invalid) {
if (xparams[1] == 1) xparams[3] <- 0.5
initials <- rbind(initials, xparams)
}
} # for (buyfirst in c(TRUE, FALSE))
} # for (k in seq_len(nrow(partitions)))
# +++ +++ #
# ++++++ ++++++ #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# V. EVENTUALLY ADJUST INITIAL PARAMETER SETS USING 'RESTRICTED' + #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# ++++++ ++++++ #
# +++ +++ #
if (!is.null(initials)) {
initials <- unique(initials)
initials <- as.data.frame(initials)
colnames(initials) <- .xadjpin$varnames()
temp <- initials[, 1:2]
iv <- initials
xtheta <- initials[, 3:4]
if (restricted$theta)
xtheta <- 0.5 * ((1 - iv$alpha) * iv$theta + iv$alpha * iv$thetap)
temp <- cbind(temp, xtheta)
xeps <- initials[, 5:6]
if (restricted$eps)
xeps <- 0.5 * (iv$eps.b + iv$eps.s)
temp <- cbind(temp, xeps)
xmu <- initials[, 7:8]
if (restricted$mu) {
wb <- iv$alpha * (1 - iv$delta)
ws <- iv$alpha * iv$delta
wmu <- wb + ws
xmu <- (wb / wmu) * iv$mu.b + (ws / wmu) * iv$mu.s
}
temp <- cbind(temp, xmu)
xds <- initials[, 9:10]
if (restricted$d)
xds <- 0.5 * (iv$d.b + iv$d.s)
temp <- cbind(temp, xds)
initials <- temp
rownames(initials) <- NULL
colnames(initials) <- .xadjpin$varnames(restricted)
}
pin_err <- uierrors$pin()
ux$show(c = verbose, m = pin_err$displaysets(
"initials_adjpin(...)", nrow(initials)), warning = TRUE)
return(invisible(initials))
}
#' @title AdjPIN random initial sets
#'
#' @description
#' Generates random initial parameter sets to be used in the estimation of the
#' `AdjPIN` model of \insertCite{Duarte09;textual}{PINstimation}.
#'
#' @usage initials_adjpin_rnd(data, restricted = list(), num_init = 20,
#' verbose = TRUE)
#'
#' @param data A dataframe with 2 variables: the first
#' corresponds to buyer-initiated trades (buys), and the second corresponds
#' to seller-initiated trades (sells).
#'
#' @param restricted A binary list that allows estimating restricted
#' AdjPIN models by specifying which model parameters are assumed to be equal.
#' It contains one or multiple of the following four elements
#' `{theta, mu, eps, d}`. For instance, If `theta` is set to `TRUE`,
#' then the probability of liquidity shock in no-information days, and in
#' information days is assumed to be the same (\thetaB`=`\thetaS). If any of
#' the remaining rate elements `{mu, eps, d}` is set to `TRUE`,
#' (say `mu=TRUE`), then the rate is assumed to be the same on the buy side,
#' and on the sell side (\mub`=`\mus). If more than one element is set to
#' `TRUE`, then the restrictions are combined. For instance, if the argument
#' `restricted` is set to `list(theta=TRUE, eps=TRUE, d=TRUE)`, then the
#' restricted AdjPIN model is estimated, where \thetaB`=`\thetaS, \eb`=`\es,
#' and \Db`=`\Ds. If the value of the argument `restricted` is the empty list
#' (`list()`), then all parameters of the model are assumed to be independent,
#' and the unrestricted model is estimated. The default value is the empty
#' list `list()`.
#'
#' @param num_init An integer corresponds to the number of initial
#' parameter sets to be generated. The default value is `20`.
#'
#' @param verbose a binary variable that determines whether information messages
#' about the initial parameter sets, including the number of the initial
#' parameter sets generated. No message is shown when \code{verbose} is set to
#' \code{FALSE}. The default value is \code{TRUE}.
#'
#' @details The argument 'data' should be a numeric dataframe, and contain
#' at least two variables. Only the first two variables will be considered:
#' The first variable is assumed to correspond to the total number of
#' buyer-initiated trades, while the second variable is assumed to
#' correspond to the total number of seller-initiated trades. Each row or
#' observation correspond to a trading day. `NA` values will be ignored.
#' \cr\cr The buy rate parameters \{\eb, \mub, \Db\} are randomly generated
#' from the interval (`minB`, `maxB`), where `minB` (`maxB`) is the smallest
#' (largest) value of buys in the dataset, under the condition that
#' \eb`+`\mub`+`\Db< `maxB`. Analogously, the sell rate parameters
#' \{\es, \mus, \Ds\} are randomly generated from the interval (`minS`, `maxS`),
#' where `minS` (`maxS`) is the smallest(largest) value of sells in the
#' dataset, under the condition that \es`+`\mus`+`\Ds < `maxS`.
#'
#' @return Returns a dataframe of numerical vectors of ten elements
#' \{\eqn{\alpha}, \eqn{\delta}, \eqn{\theta}, \eqn{\theta'},
#' \eb, \es, \mub, \mus, \Db, \Ds\}.
#'
#' @references
#'
#' \insertAllCited
#'
#' @examples
#' # There is a preloaded quarterly dataset called 'dailytrades' with 60
#' # observations. Each observation corresponds to a day and contains the
#' # total number of buyer-initiated trades ('B') and seller-initiated
#' # trades ('S') on that day. To know more, type ?dailytrades
#'
#' xdata <- dailytrades
#'
#' # Obtain a dataframe of 20 random initial parameters for the MLE of
#' # the AdjPIN model using the initials_adjpin_rnd().
#'
#' initial.sets <- initials_adjpin_rnd(xdata, num_init = 20)
#'
#' # Use the dataframe to estimate the AdjPIN model using the adjpin()
#' # function.
#'
#' estimate <- adjpin(xdata, initialsets = initial.sets, verbose = FALSE)
#'
#' # Show the value of adjusted PIN
#'
#' show(estimate@adjpin)
#'
#' @export
initials_adjpin_rnd <- function(data, restricted = list(),
num_init = 20, verbose = TRUE) {
# Check that all variables exist and do not refer to non-existent variables
# --------------------------------------------------------------------------
allvars <- names(formals())
environment(.xcheck$existence) <- environment()
.xcheck$existence(allvars, err = uierrors$adjpin()$fn)
# Check that all arguments are valid
# -------------------------------------------------------------------------
largs <- list(data, restricted, num_init, verbose)
names(largs) <- names(formals())
rst <- .xcheck$args(arglist = largs, fn = "adjpin")
ux$stopnow(rst$off, m = rst$error, s = uierrors$adjpin()$fn)
restricted <- .xadjpin$allrestrictions(restricted)
# Check, prepare and initialize variables
# --------------------------------------------------------------------------
data <- ux$prepare(data)
initials <- NULL
min_tb <- min(data$b)
max_tb <- max(data$b)
min_ts <- min(data$s)
max_ts <- max(data$s)
# Collect random initial sets from generatedata_adjpin output
# --------------------------------------------------------------------------
for (s in 1:num_init) {
rb <- sample(1:100, 3, replace = TRUE)
rs <- sample(1:100, 3, replace = TRUE)
rb <- ceiling(min_tb + (rb / sum(rb)) * (max_tb - min_tb))
rs <- ceiling(min_ts + (rs / sum(rs)) * (max_ts - min_ts))
sdata <- generatedata_adjpin(series = 1, restricted = restricted,
ranges = list(mu.b = c(1, rb[1]),
mu.s = c(1, rs[1]),
d.b = c(1, rb[2]),
d.s = c(1, rs[2]),
eps.b = c(max(min_tb, 1), rb[3]),
eps.s = c(max(min_ts, 1), rs[3]))
)
initials <- rbind(initials, unlist(sdata@empiricals[1:10]))
}
initials <- as.data.frame(unname(initials))
colnames(initials) <- .xadjpin$varnames()
# Create custom initial sets given the vector 'restricted' if needed
# --------------------------------------------------------------------------
if (restricted$theta)
initials$theta <- with(initials, ((1 - alpha) * theta + alpha * thetap))
if (restricted$eps)
initials$eps <- with(initials, (eps.b + eps.s) / 2)
if (restricted$mu)
initials$mu <- with(initials, (mu.b + mu.s) / 2)
if (restricted$d)
initials$d <- with(initials, (d.b + d.s) / 2)
rownames(initials) <- NULL
initials <- initials[, .xadjpin$varnames(restricted)]
pin_err <- uierrors$pin()
ux$show(c = verbose, m = pin_err$displaysets(
"initials_adjpin_rnd(...)", nrow(initials)), warning = TRUE)
return(invisible(initials))
}
#' @title AdjPIN initial parameter sets of Cheng and Lai (2021)
#'
#' @description
#' Based on an extension of the algorithm in
#' \insertCite{ChengLai2021;textual}{PINstimation}, generates sets of initial
#' parameters to be used in the maximum likelihood
#' estimation of `AdjPIN` model.
#'
#' @usage initials_adjpin_cl(data, restricted = list(), verbose = TRUE)
#'
#' @param data A dataframe with 2 variables: the first
#' corresponds to buyer-initiated trades (buys), and the second corresponds
#' to seller-initiated trades (sells).
#'
#' @param restricted A binary list that allows estimating restricted
#' AdjPIN models by specifying which model parameters are assumed to be equal.
#' It contains one or multiple of the following four elements
#' `{theta, mu, eps, d}`. For instance, If `theta` is set to `TRUE`,
#' then the probability of liquidity shock in no-information days, and in
#' information days is assumed to be the same (\thetaB`=`\thetaS). If any of
#' the remaining rate elements `{mu, eps, d}` is set to `TRUE`,
#' (say `mu=TRUE`), then the rate is assumed to be the same on the buy side,
#' and on the sell side (\mub`=`\mus). If more than one element is set to
#' `TRUE`, then the restrictions are combined. For instance, if the argument
#' `restricted` is set to `list(theta=TRUE, eps=TRUE, d=TRUE)`, then the
#' restricted AdjPIN model is estimated, where \thetaB`=`\thetaS, \eb`=`\es,
#' and \Db`=`\Ds. If the value of the argument `restricted` is the empty list,
#' then all parameters of the model are assumed to be independent,
#' and the unrestricted model is estimated. The default value is the empty
#' list `list()`.
#'
#' @param verbose a binary variable that determines whether information messages
#' about the initial parameter sets, including the number of the initial
#' parameter sets generated. No message is shown when \code{verbose} is set
#' to \code{FALSE}. The default value is \code{TRUE}.
#'
#' @details The argument 'data' should be a numeric dataframe, and contain
#' at least two variables. Only the first two variables will be considered:
#' The first variable is assumed to correspond to the total number of
#' buyer-initiated trades, while the second variable is assumed to
#' correspond to the total number of seller-initiated trades. Each row or
#' observation correspond to a trading day. `NA` values will be ignored.
#' \cr\cr The function implements an extension of the algorithm of
#' \insertCite{ChengLai2021;textual}{PINstimation}. In their paper, the authors
#' assume that the probability of liquidity shock is the same in no-information,
#' and information days, i.e., \thetaB`=`\thetaS, and use a procedure similar to
#' that of \insertCite{Yan2012;textual}{PINstimation} to generate 64 initial
#' parameter sets. The function implements an extension of their algorithm,
#' by relaxing the assumption of equality of liquidity shock probabilities,
#' and generates thereby `256` initial parameter sets for the unrestricted
#' `AdjPIN` model.
#'
#' @return Returns a dataframe of numerical vectors of ten elements
#' \{\eqn{\alpha}, \eqn{\delta}, \eqn{\theta}, \eqn{\theta'},
#' \eb, \es, \mub, \mus, \Db, \Ds\}.
#'
#' @references
#'
#' \insertAllCited
#'
#' @examples
#' # There is a preloaded quarterly dataset called 'dailytrades' with 60
#' # observations. Each observation corresponds to a day and contains the
#' # total number of buyer-initiated trades ('B') and seller-initiated
#' # trades ('S') on that day. To know more, type ?dailytrades
#'
#' xdata <- dailytrades
#'
#' # The function adjpin(xdata, initialsets="CL") allows the user to directly
#' # estimate the AdjPIN model using the full set of initial parameter sets
#' # generated using the algorithm Cheng and Lai (2021)
#' \donttest{
#' estimate.1 <- adjpin(xdata, initialsets="CL", verbose = FALSE)
#' }
#'
#' # Obtaining the set of initial parameter sets using initials_adjpin_cl
#' # allows us to estimate the PIN model using a subset of these initial sets.
#'
#' # Use initials_adjpin_cl() to generate 256 initial parameter sets using the
#' # algorithm of Cheng and Lai (2021).
#'
#' initials_cl <- initials_adjpin_cl(xdata, verbose = FALSE)
#'
#' # Use 20 randonly chosen initial sets from the dataframe 'initials_cl' in
#' # order to estimate the AdjPIN model using the function adjpin() with custom
#' # initial parameter sets
#'
#' numberofsets <- nrow(initials_cl)
#' selectedsets <- initials_cl[sample(numberofsets, 20),]
#'
#' estimate.2 <- adjpin(xdata, initialsets = selectedsets, verbose = FALSE)
#'
#' # Compare the parameters and the pin values of both specifications
#' \donttest{
#' comparison <- rbind(
#' c(estimate.1@parameters, adjpin = estimate.1@adjpin, psos = estimate.1@psos),
#' c(estimate.2@parameters, estimate.2@adjpin, estimate.2@psos))
#'
#' rownames(comparison) <- c("all", "50")
#'
#' show(comparison)
#' }
#'
#' @export
initials_adjpin_cl <- function(data, restricted = list(), verbose = TRUE) {
# Check that all variables exist and do not refer to non-existent variables
# --------------------------------------------------------------------------
allvars <- names(formals())
environment(.xcheck$existence) <- environment()
.xcheck$existence(allvars, err = uierrors$adjpin()$fn)
# Check that all arguments are valid
# -------------------------------------------------------------------------
largs <- list(data, restricted, verbose)
names(largs) <- names(formals())
rst <- .xcheck$args(arglist = largs, fn = "adjpin")
ux$stopnow(rst$off, m = rst$error, s = uierrors$adjpin()$fn)
restricted <- .xadjpin$allrestrictions(restricted)
# -------------------------------------------------------------------------
# Prepare 'data' and initialize variables
# --------------------------------------------------------------------------
data <- ux$prepare(data)
restricted <- .xadjpin$allrestrictions(restricted)
days <- nrow(data)
# Generate the set of values of a, d, theta (t), and theta' (tp)
# ----------------------------------------------------------
a_values <- seq(0.2, 0.8, 0.2)
d_values <- seq(0.2, 0.8, 0.2)
t_values <- seq(0.2, 0.8, 0.2)
# Take the cartesian product of the values of a, and d; and
# store them in dataframe called initials.
# ----------------------------------------------------------
if (restricted$theta) {
initials <- expand.grid(a_values, d_values, t_values)
colnames(initials) <- c("a", "d", "t")
initials$tp <- initials$t
} else {
initials <- expand.grid(a_values, d_values, t_values,
t_values)
colnames(initials) <- c("a", "d", "t", "tp")
}
initials <- as.data.frame(initials)
# Different prob (day share) for the six groups
# ----------------------------------------------------------
xtemp <- lapply(ux$tolist(initials), .xadjpin$distribution)
xtemp <- days * ux$todframe(xtemp)
# Use the functions getbparams(), and getsparams()
# ----------------------------------------------------------
getbparams <- function(cl) {
vecb <- data[order(data$b), ]
vecs <- data[order(data$s), ]
# Calculate eb from the first cl[1] + cl[3] rows, then delete them
# along cluster cl[6] with buy rate (eb + mub + db)
# --------------------------------------------------------
eb <- mean(vecb[1:ceiling(cl[1] + cl[3]), ]$b)
remaining <- as.numeric(
rownames(vecb[(1 + ceiling(cl[1] + cl[3])):(days - ceiling(cl[6])), ]))
vecb <- vecb[row.names(vecb) %in% remaining, ]
vecs <- vecs[row.names(vecs) %in% remaining, ]
# Calculate mub from cl[5] (the one with lowest sell rate), then
# delete it.
# --------------------------------------------------------
mub <- mean(vecs[1:cl[5], ]$b) - eb
w5 <- as.numeric(rownames(vecs[1:cl[5], ]))
vecb <- vecb[!(rownames(vecb) %in% w5), ]
vecs <- vecs[!(rownames(vecs) %in% w5), ]
# Clusters cl[2] and cl[4] remains with trade intensity (eb + db)
# Use the average buy to find db
# --------------------------------------------------------
mb2 <- mean(vecb$b)
db <- mb2 - eb
list(eb = eb, mub = mub, db = db)
}
getsparams <- function(cl) {
vecb <- data[order(data$b), ]
vecs <- data[order(data$s), ]
# Calculate es from the first cl[1] + cl[5] rows, then delete them
# along cluster cl[4] with buy rate (es + mus + ds)
# --------------------------------------------------------
es <- mean(vecs[1:ceiling(cl[1] + cl[5]), ]$s)
remaining <- as.numeric(
rownames(
vecs[(1 + ceiling(cl[1] + cl[5])):(days - ceiling(cl[4])), ]))
vecb <- vecb[row.names(vecb) %in% remaining, ]
vecs <- vecs[row.names(vecs) %in% remaining, ]
# Calculate mus from cl[3] (the one with lowest buy rate), then
# delete it.
# --------------------------------------------------------
mus <- mean(vecb[1:cl[3], ]$s) - es
w3 <- as.numeric(rownames(vecb[1:cl[3], ]))
vecb <- vecb[!(rownames(vecb) %in% w3), ]
vecs <- vecs[!(rownames(vecs) %in% w3), ]
# Clusters cl[2] and cl[6] remains with trade intensity (es + ds)
# Use the average buy to find ds
# --------------------------------------------------------
ds <- mean(vecs$s) - es
list(es = es, mus = mus, ds = ds)
}
# Apply the function getbparams() and getsparams() to get the
# buy and sell parameters
# ----------------------------------------------------------
btemp <- ux$todframe(apply(xtemp, 1, getbparams))
stemp <- ux$todframe(apply(xtemp, 1, getsparams))
tempx <- cbind(btemp, stemp)
colnames(tempx) <- c("eb", "mub", "db", "es", "mus", "ds")
tempx <- tempx[, .xadjpin$vars()]
colnames(initials) <- c("alpha", "delta", "theta", "thetap")
initials <- cbind(initials, tempx)
colnames(initials) <- .xadjpin$varnames()
# Apply the list 'restricted' to get the average value when the
# value theta = TRUE, eps = TRUE, mu = TRUE, d = TRUE
# ----------------------------------------------------------
if (restricted$eps)
initials$eps <- (initials$eps.b + initials$eps.s) / 2
if (restricted$mu)
initials$mu <- (initials$mu.b + initials$mu.s) / 2
if (restricted$d)
initials$d <- (initials$d.b + initials$d.s) / 2
# Reorder the columns of the dataframe in the following order
# ----------------------------------------------------------
initials <- initials[, .xadjpin$varnames(restricted)]
# Return the variables initialsets as a dataframe
# ----------------------------------------------------------
initialsets <- initials
rownames(initialsets) <- NULL
pin_err <- uierrors$pin()
ux$show(c = verbose, m = pin_err$displaysets(
"initials_adjpin_cl(...)", nrow(initials)), warning = TRUE)
return(invisible(initialsets))
}
## +++++++++++++++++++++++++
## ++++++| | PRIVATE FUNCTIONS | |
## +++++++++++++++++++++++++
# -----------------------------------------------------------------------------#
# Main function implementing the ECM algorithm for one initial set #
# -----------------------------------------------------------------------------#
.adjpin_ecm_oneset <- function(distribution, params, restricted = list(), data,
hyperparams) {
# Implements the expectation-conditional maximization algorithm for one set
# of AdjPIN model parameters
#
# Args:
# distribution: the distribution of cluster probabilities
# params : a vector of (eps.b, eps.s, mu.b, mu.s, d.b, d.s)
# es : the value of uninformed trading (sells)
# data : the dataset of buys and sells
# hyperparams : the set of hyperparameters
#
# Returns:
# returns a list of optimal parameters output of ECM algorithm
# -------------------------------------------------------------------- #
# Log-likelihood of the AdjPIN model #
# -------------------------------------------------------------------- #
adjpin_loglkhd <- function(j, eb, es, mub, mus, db, ds, buys, sells) {
shock <- (j + 1) %% 2
cluster <- floor((j - 0.5) / 2)
is_good <- ifelse(cluster == 1, 1, 0)
is_bad <- ifelse(cluster == 2, 1, 0)
logprob <- dpois(buys, eb + is_good * mub + shock * db, log = TRUE) +
dpois(sells, es + is_bad * mus + shock * ds, log = TRUE)
logprob[is.na(logprob)] <- 0
return(logprob)
}
# -------------------------------------------------------------------- #
# Posterior probabilities for the AdjPIN model #
# -------------------------------------------------------------------- #
adjpin_posterior <- function(j, p, eb, es, mub, mus, db, ds, buys, sells) {
shock <- (j + 1) %% 2
cluster <- floor((j - 0.5) / 2)
is_good <- ifelse(cluster == 1, 1, 0)
is_bad <- ifelse(cluster == 2, 1, 0)
posprob <- p[j] * dpois(buys, eb + is_good * mub + shock * db) *
dpois(sells, es + is_bad * mus + shock * ds)
return(posprob)
}
# Check, prepare and initialize variables
# --------------------------------------------------------------------------
data <- ux$prepare(data)
clusters <- 6
e <- mu <- adjpin_loglikd <- dx <- maxeval <- tolerance <- 0
# Update 'hyperparams' by filling missing hyperparameters, and distribute the
# new list to seven different variables: maxeval'
# ----------------------------------------------------------------------------
rst <- .xcheck$hyperparams(hyperparams, nrow(data), adj = TRUE)
ux$stopnow(rst$off, m = rst$error, s = uierrors$adjpin()$fn)
hps <- rst$hyperparams
hpn <- names(hps)
for (i in seq_len(length(hpn))) assign(hpn[i], unname(unlist(hps[[i]])))
# Assign values to variables
# --------------------------------------------------------------------------
variables <- .xadjpin$vars(restricted)
values <- suppressWarnings(split(params, seq_len(length(variables))))
for (i in seq_len(length(values))) assign(variables[i],
unname(unlist(values[[i]])))
# Adjust these values according to the restrictions on parameters
# --------------------------------------------------------------------------
if (restricted$eps) eb <- es <- e
if (restricted$mu) mub <- mus <- mu
if (restricted$d) db <- ds <- dx
# Define the function, create and initialize the log-likelihood vector
# --------------------------------------------------------------------------
adjpin_loglikd <- function(z, p, eb, es, mub, mus, db, ds, buys, sells) {
logdensity <- vapply(1:clusters, adjpin_loglkhd, eb, es, mub, mus, db,
ds, buys, sells, FUN.VALUE = double(length(buys)))
logdensity[is.na(logdensity)] <- 0
return(ux$finite_sum(z * log(p)) + ux$finite_sum(z * logdensity))
}
loglik <- vector()
loglik[1] <- 0
loglik[2] <- adjpin_loglikd(z = distribution, p = distribution,
eb, es, mub, mus, db, ds, data$b, data$s)
iter <- 2
initialfail <- FALSE
#-----------------------------------------------------------------------------
while (iter <= maxeval && abs(diff(tail(loglik,2))) > tolerance) {
# ------------------------------------------------------------------------ #
# ------------------------ EXPECTATION STEP ------------------------------ #
# ------------------------------------------------------------------------ #
estimateQ <- function(eb, es, mub, mus, db, ds, distribution) {
# Compute the posterior probability matrix
# ------------------------------------------------------------------------
posterior_mx <- vapply(1:clusters, adjpin_posterior, p = distribution,
eb, es, mub, mus, db, ds, data$b, data$s,
FUN.VALUE = double(length(data$b)))
# if the matrix of posterior probabilities is zero or contain NA values
# then skip the estimation process.
if (any(is.na(posterior_mx)) || sum(posterior_mx) == 0)
return(list(interrupted = iter))
# Replace the content of a row whose sum is zero by equiprobable
# assignment to clusters. The observations has equal probability to
# belong to any of the six clusters.
daily_posterior <- rowSums(posterior_mx)
zerorows <- which(daily_posterior == 0)
if (length(zerorows) > 0)
posterior_mx[zerorows,] <- rep(1/6,6)
# Compute estimates of the latent variable yn
# ------------------------------------------------------------------------
yn <- sweep(posterior_mx, 1, rowSums(posterior_mx, na.rm = TRUE), `/`)
yn[is.na(yn)] <- 0
# Compute the optimal distribution Pi*
# ------------------------------------------------------------------------
new_distrib <- colMeans(yn, na.rm = TRUE)
# Compute the parameters of the optimization system (S)
# ------------------------------------------------------------------------
# (1) ypqr is the sum of yi for p in {(b)uy,(s)ell}; good information q in
# {(y)es, (n)o} and liquidity shock is r in {(y)es,(n)o}
#-------------------------------------------------------------------------
n <- length(data$b)
yl <- yn[, 3:length(yn[1, ])]
yig <- yl[, 1:2]
yib <- yl[, 3:4]
yis <- yn[, c(FALSE, TRUE)]
ybnn <- yn[, c(1, 5)]
ybyn <- yn[, 3]
ybny <- yn[, c(2, 6)]
ybyy <- yn[, 4]
ysnn <- yn[, c(1, 3)]
ysyn <- yn[, 5]
ysny <- yn[, c(2, 4)]
ysyy <- yn[, 6]
# (2) Compute R.* and Q.* as in Ersan and Ghachem (2022)
# ------------------------------------------------------------------------
r0 <- sum(ybnn * data$b, na.rm = TRUE)
r1 <- sum(ybyn * data$b, na.rm = TRUE)
r2 <- sum(ybny * data$b, na.rm = TRUE)
r3 <- sum(ybyy * data$b, na.rm = TRUE)
q0 <- sum(ysnn * data$s, na.rm = TRUE)
q1 <- sum(ysyn * data$s, na.rm = TRUE)
q2 <- sum(ysny * data$s, na.rm = TRUE)
q3 <- sum(ysyy * data$s, na.rm = TRUE)
# (3) Compute yg and yb and ys as in Ersan and Ghachem (2022)
# ------------------------------------------------------------------------
yg <- sum(yig, na.rm = TRUE)
yb <- sum(yib, na.rm = TRUE)
ys <- sum(yis)
LQ <- list(r0 = r0, r1 = r1, r2 = r2, r3 = r3,
q0 = q0, q1 = q1, q2 = q2, q3 = q3,
yg = yg, yb = yb, ys = ys, yn = yn,
distribution = new_distrib, n = n,
interrupted = 0)
return(LQ)
}
# Use the function estimateQ() to estimate the complete data log-likelihood
# function. If the expectation step failed at the first iteration, then
# we have an initial fail of the estimation (initialfail = TRUE).
LQ <- estimateQ(eb, es, mub, mus, db, ds, distribution)
initialfail <- (LQ$interrupted == 2)
if (LQ$interrupted) break
# ------------------------------------------------------------------------ #
# ---------------------ECM MAXIMIZATION STEP ----------------------------- #
# ------------------------------------------------------------------------ #
# Use the ECM method to optimize the model parameters
oldparams <- c(eb, mub, db, es, mus, ds)
# Optimize mub conditional on the existing eb, and the new odb
# Optimize mus conditional on the existing es, and the new ods
if (restricted$mu) {
omub <- omus <- .xadjpin$solve_eqx(
c(LQ$r1, LQ$r3, LQ$q1, LQ$q3),
c(eb, eb + db, es, es + ds), LQ$yg + LQ$yb, 0.5 * (mub + mus))
} else {
omub <- .xadjpin$solve_eqx(c(LQ$r1, LQ$r3), c(eb, eb + db), LQ$yg, mub)
omus <- .xadjpin$solve_eqx(c(LQ$q1, LQ$q3), c(es, es + ds), LQ$yb, mus)
}
# Optimize db conditional on the existing eb, and mub
# Optimize ds conditional on the existing es, and mus
if (restricted$d) {
odb <- ods <- .xadjpin$solve_eqx(
c(LQ$r2, LQ$r3, LQ$q2, LQ$q3),
c(eb, eb + omub, es, es + omus), 2 * LQ$ys, 0.5 * (db + ds))
} else {
odb <- .xadjpin$solve_eqx(
c(LQ$r2, LQ$r3), c(eb, eb + omub), LQ$ys, db)
ods <- .xadjpin$solve_eqx(
c(LQ$q2, LQ$q3), c(es, es + omus), LQ$ys, ds)
}
# Optimize eb conditional on the newly optimized odb, omub
# Optimize es conditional on the newly optimized ods, omus
if (restricted$eps) {
oeb <- oes <- .xadjpin$solve_eqx(
c(LQ$r0 + LQ$q0, LQ$r1, LQ$r2, LQ$r3, LQ$q1, LQ$q2, LQ$q3),
c(0, omub, odb, omub + odb,
omus, ods, omus + ods), 2 * LQ$n, 0.5 * (eb + es))
} else {
oeb <- .xadjpin$solve_eqx(
c(LQ$r0, LQ$r1, LQ$r2, LQ$r3),
c(0, omub, odb, omub + odb), LQ$n, eb)
oes <- .xadjpin$solve_eqx(
c(LQ$q0, LQ$q1, LQ$q2, LQ$q3),
c(0, omus, ods, omus + ods), LQ$n, es)
}
# Update the trading intensity rates
# --------------------------------------------------------------------------
# Round the value of the optimal rate parameters to a reasonable level of
# precision. This will help with the convergence of the ECM algorithm,
# without compromising the economic significance of the trading rates.
# We settle for the precision level of 2 decimal digits:
# Rates' digit precision: rdp = 3
rdp <- 3
db <- round(odb, rdp)
eb <- round(oeb, rdp)
mub <- round(omub, rdp)
ds <- round(ods, rdp)
es <- round(oes, rdp)
mus <- round(omus, rdp)
# Update the probability distribution over clusters
# --------------------------------------------------------------------------
# Round the optimal probability parameters to a reasonable level of
# precision. This will help with the convergence of the ECM algorithm,
# without compromising the economic significance of the trading rates.
# We settle for the precision level for probabilities of 5 decimal digits:
# Probabilities' digit precision: pdp = 5
pdp <- 5
distribution <- round(LQ$distribution, pdp)
# Append the new log-likelihood value and update the iterations' counter
# --------------------------------------------------------------------------
cloglik <- adjpin_loglikd(
z = LQ$yn, p = LQ$distribution, eb, es, mub, mus, db, ds, data$b, data$s)
loglik[iter + 1] <- cloglik
iter <- iter + 1
}
# Compute and return the parameters of the AdjPIN model, start with alpha
# in order to exclude invalid estimates occuring in the following cases:
# [+] No optimization has occured, i.e., the initial parameter set did not
# lead to a viable posterior distribution (initialfail == TRUE)
# [+] optimal estimates do not contain any information days (a == 0), or do
# not contain any non-information days (a == 1)
# --------------------------------------------------------------------------
failure_output <- list(likelihood = -Inf, adjpin = NaN, psos = NaN,
parameters = NULL)
a <- sum(distribution[3:6])
if (initialfail == TRUE | a == 0 | floor(a) == 1 | !is.numeric(distribution))
return(failure_output)
# if alpha != c(0,1), then calculate the remaining parameters.
d <- sum(distribution[5:6]) / sum(distribution[3:6])
theta <- distribution[2] / sum(distribution[1:2])
thetap <- sum(distribution[c(4, 6)]) / sum(distribution[3:6])
adjpin <- (a * ((1 - d) * mub + d * mus)) *
(1 / ((a * ((1 - d) * mub + d * mus)) +
(db + ds) * (a * thetap + (1 - a) * theta) + eb + es))
psos <- ((db + ds) * (a * thetap + (1 - a) * theta)) *
(1 / ((a * ((1 - d) * mub + d * mus)) + (db + ds) *
(a * thetap + (1 - a) * theta) + eb + es))
lkd <- loglik[iter]
# We futher exclude these instances
# [+] mub = 0 when delta != 1, i.e. delta < 0.999
# [+] mus = 0 when delta != 0 i.e. delta > 0.001
if ((floor(mub) == 0 & round(d, 3) != 1) |
(floor(mus) == 0 & round(d, 3) != 0))
return(failure_output)
optparams <- list( alpha = a, delta = d, theta = theta, thetap = thetap,
eps.b = eb, eps.s = es, mu.b = mub, mu.s = mus, d.b = db, d.s = ds)
output <- list(likelihood = -lkd, adjpin = adjpin, psos = psos,
parameters = optparams
)
return(output)
}
# -----------------------------------------------------------------------------#
# Main function to estimate AdjPIN model using the ECM algorithm #
# -----------------------------------------------------------------------------#
.adjpin_ecm <- function(data, initialsets, init_type = "GE", restricted = NULL,
hyperparams = list(), verbose = TRUE) {
# Estimates the Adjusted Probability of Informed Trading (`AdjPIN`) as well as
# the Probability of Symmetric Order-flow Shock (psos) from Duarte and Young
# (2009) using the ECM algorithm.
#
# Args:
# data : a dataframe containing two variables 'b' and 's'
# initialsets: a dataframe containing custom initial parameter sets.
# init_type: the algorithm generating the initial sets, can take the values
# "GE", "RANDOM" or "CUSTOM".
# restricted : a list specifying whether the model parameters are equal.
# It can take the value `NULL`, or contain one or more keys from
# the following keys (theta, mu, eps, d)
# hyperparams: a list containing the hyperparameters of the ECM
# algorithm. When not empty, it contains one or more of the
# following elements: minalpha, maxeval, tolerance, maxinit
# verbose : Extended information about the steps of ECM estimation will
# be displayed.
#
# Returns:
# an object of class 'estimate.adjpin'
# Check, prepare and initialize variables
# --------------------------------------------------------------------------
tdata <- ux$prepare(data)
restricted <- .xadjpin$allrestrictions(restricted)
init_type <- toupper(init_type)
optimal <- list(likelihood = -Inf)
time_on <- Sys.time()
# Prepare the initial sets to be used by the function ECM algorithm
# --------------------------------------------------------------------------
initialpoints <- initialsets
init_prob <- lapply(initialpoints,
function(x) .xadjpin$distribution(x, restricted))
lng <- length(initialpoints[[1]])
params <- lapply(initialpoints,
function(x) c(x[(5 - restricted$theta):lng]))
# Initialize some other variables
# --------------------------------------------------------------------------
runs <- data.frame(matrix(NA, ncol = 2 * lng + 3, nrow = 0))
ux$show(verbose, uix$adjpin()$emmethod)
convergent <- 0
if (verbose) {
pb_adjpin <- ux$progressbar(0, maxvalue = length(initialpoints))
cat(uix$adjpin()$progressbar)
}
for (u in seq_len(length(initialpoints))) {
# Prepare the initial sets
#-----------------------------------------------------------------------
cparam <- params[[u]]
pp <- init_prob[[u]]
temp_run <- initialpoints[[u]]
# Estimate the model by ECM algorithm and pick the model with lowest BIC
#-----------------------------------------------------------------------
estimates <- .adjpin_ecm_oneset(pp, cparam, restricted, tdata,
hyperparams)
thisrun <- c(temp_run, rep(NA, lng + 3))
if (verbose) setTxtProgressBar(pb_adjpin, u)
if (length(estimates$parameters) > 0) {
# The likelihood factorization of Ersan and Ghachem (2022), could have
# infinite value, in case some parameters are zero, these parameters are
# replaced by very low values. Remember that the ECM algorithm maximizes
# a different version of the likelihood function (See ECM paper by Ersan
# and Ghachem (2022))
#-----------------------------------------------------------------------
estimates$likelihood <- .xadjpin$likelihood(tdata, estimates$parameters)
convergent <- convergent + is.finite(estimates$likelihood)
# If the new 'estimates' has higher likelihood that the 'optimal', it
# becomes the new optimal.
optimal <- ux$update_optimal(estimates, optimal)
# adjust the parameters sent using the variable 'restricted'
xparams <- unlist(estimates$parameters)
if (restricted$d) xparams <- xparams[-10]
if (restricted$mu) xparams <- xparams[-8]
if (restricted$eps) xparams <- xparams[-6]
if (restricted$theta) xparams <- xparams[-4]
thisrun <- c(temp_run, xparams, estimates$likelihood,
estimates$adjpin, estimates$psos)
}
runs <- rbind(runs, thisrun)
if (verbose) setTxtProgressBar(pb_adjpin, u)
}
ux$show(verbose, uix$adjpin()$complete)
time_off <- Sys.time()
# Make the initial parameter sets into a dataframe again
# ------------------------------------------------------------------------
initialpoints <- ux$todframe(initialpoints)
if (is.infinite(optimal$likelihood)) {
adjpin_estimate <- new("estimate.adjpin", success = FALSE, method = "ECM",
convergent.sets = convergent, factorization = "GE",
restrictions = restricted, dataset = data,
algorithm = init_type, initialsets = initialpoints)
adjpin_estimate@runningtime <- ux$timediff(time_on, time_off)
adjpin_estimate@hyperparams <- hyperparams
return(adjpin_estimate)
}
# Return the optimal results as an 'estimate.adjpin' S4 object
# ------------------------------------------------------------------------
names(initialpoints) <- .xadjpin$varnames(restricted = restricted)
colnames(runs) <- .xadjpin$varnames(restricted, details = TRUE)
rownames(runs) <- paste("set.", seq_len(nrow(initialpoints)), sep = "")
adjpin_estimate <- new("estimate.adjpin", success = TRUE, method = "ECM",
convergent.sets = convergent, factorization = "GE",
parameters = unlist(optimal$parameters),
likelihood = optimal$likelihood,
restrictions = restricted,
algorithm = init_type, adjpin = optimal$adjpin, psos =
optimal$psos,
initialsets = initialpoints, details = runs,
hyperparams = hyperparams, dataset = data,
runningtime = ux$timediff(time_on, time_off))
return(adjpin_estimate)
}
# ---------------------------------------------------------------------------- #
# Main function to estimate AdjPIN model using the standard MLE #
# ---------------------------------------------------------------------------- #
.adjpin_ml <- function(data, initialsets, init_type = "GE",
restricted = NULL, fact = TRUE, verbose = TRUE) {
# Estimates the Adjusted Probability of Informed Trading (`AdjPIN`) as well as
# the Probability of Symmetric Order-flow Shock (psos) from Duarte and Young
# (2009) using the standard Maximum likelihood maximization.
#
# Args:
# data : a dataframe containing two variables 'b' and 's'
# initialsets: a dataframe containing custom initial parameter sets.
# init_type: the algorithm generating the initial sets, can take the values
# "GE", "RANDOM" or "CUSTOM".
# restricted : a list specifying whether the model parameters are equal.
# It can take the value `NULL`, or contain one or more keys from
# the following keys (theta, mu, eps, d)
# fact : a binary value that determines which likelihood functional form is
# used: A factorization of the likelihood function by Ersan and
# Ghachem(2022) when it is 'TRUE', otherwise, the original likelihood
# function of Duarte and Young (2009).
# verbose : Extended information about the steps of ML estimation will
# be displayed.
#
# Returns:
# an object of class 'estimate.adjpin'
# Check, prepare and initialize variables
# --------------------------------------------------------------------------
data <- ux$prepare(data)
restricted <- .xadjpin$allrestrictions(restricted)
init_type <- toupper(init_type)
ux$show(verbose, uix$adjpin()$mlemethod)
time_on <- Sys.time()
# 'fact': Choose the original or the factorized likelihood function
# --------------------------------------------------------------------------
mlefn <- factorizations$adjpin(data, restricted = restricted)
if (fact == FALSE)
mlefn <- factorizations$adjpin_none(data, restricted = restricted)
# Prepare initial sets
# --------------------------------------------------------------------------
if (is.data.frame(initialsets))
initialsets <- as.list(as.data.frame(t(initialsets)))
if (!is.list(initialsets))
initialsets <- list(initialsets)
runs <- data.frame(matrix(NA, ncol = 0,
nrow = 2 * (10 - sum(unlist(restricted))) + 3))
optimal <- list(likelihood = -Inf)
# convergent initial sets
convergent <- 0
# Estimate the AdjPIN model for each initial parameter set, and pick
# the one with highest likelihood.
#--------------------------------------------------------------------
if (verbose) {
pb_adjpin <- ux$progressbar(maxvalue = length(initialsets))
cat(uix$adjpin()$progressbar)
}
for (i in seq_len(length(initialsets))) {
temp_run <- unlist(initialsets[[i]])
len <- length(initialsets[[i]])
thisrun <- c(temp_run, rep(0, len + 3))
estimates <- NULL
tryCatch({
len <- length(initialsets[[i]])
low <- rep(0, len)
up <- rep(+Inf, len - (4 - restricted$theta))
up <- c(rep(1, 4 - restricted$theta), up)
estimates <- suppressWarnings(nloptr::neldermead(
initialsets[[i]], mlefn, lower = low, upper = up,
control = list(maxeval = 1000)))
})
if (!is.null(estimates)) {
# The likelihood is the additive inverse of the value in estimates
estimates$likelihood <- - estimates$value
convergent <- convergent + is.finite(estimates$likelihood)
optimal <- ux$update_optimal(estimates, optimal)
pin_values <- .xadjpin$compute_pin(estimates[["par"]], restricted)
thisrun <- c(temp_run, estimates[["par"]], estimates$likelihood,
pin_values$adjpin, pin_values$psos)
}
runs <- rbind(runs, thisrun)
if (verbose) setTxtProgressBar(pb_adjpin, i)
}
time_off <- Sys.time()
ux$show(verbose, uix$adjpin()$complete)
initialsets <- ux$todframe(initialsets)
names(initialsets) <- .xadjpin$varnames(restricted)
colnames(runs) <- .xadjpin$varnames(restricted, details = TRUE)
rownames(runs) <- paste("set.", seq_len(nrow(initialsets)), sep = "")
runs <- as.data.frame(runs)
if (is.finite(optimal$likelihood)) {
optpin <- .xadjpin$compute_pin(optimal[["par"]], restricted)
xparams <- setNames(optpin$params, .xadjpin$varnames(restricted))
optimal_adjpin <- new("estimate.adjpin", success = TRUE, method = "ML",
convergent.sets = convergent,
factorization = ifelse(fact, "GE", "NONE"),
parameters = xparams,
likelihood = optimal$likelihood,
restrictions = restricted, algorithm = init_type,
adjpin = optpin$adjpin, psos = optpin$psos,
initialsets = initialsets, details = runs)
optimal_adjpin@runningtime <- ux$timediff(time_on, time_off)
return(optimal_adjpin)
} else {
optimal_adjpin <- new("estimate.adjpin", success = FALSE, method = "ML",
convergent.sets = convergent,
factorization = ifelse(fact, "GE", "NONE"),
restrictions = restricted,
algorithm = init_type, initialsets = initialsets,
parameters = NaN, adjpin = NaN,
psos = NaN, likelihood = NaN)
optimal_adjpin@runningtime <- ux$timediff(time_on, time_off)
return(optimal_adjpin)
}
}
# ---------------------------------------------------------------------------- #
# Various supporting functions in the list of function .xadjpin #
# ---------------------------------------------------------------------------- #
.xadjpin <- list(
vars = function(restricted = NULL, all = FALSE) {
# returns the list of variable names given the list 'restricted' to be used in
# computation
#
# Args:
# restricted : a list specifying whether the model parameters are equal.
# It can take the value `NULL`, or contain one or more keys from
# the following keys (theta, mu, eps, d)
#
# Returns:
# a vector of variable names.
restricted <- .xadjpin$allrestrictions(restricted)
vars <- list(list(c("eb", "es"), "e"), list(c("mub", "mus"), "mu"),
list(c("db", "ds"), "dx"))
variables <- unlist(Map(function(x, y) x[[1 + y]], vars, restricted[2:4]))
xtheta <- c("t", "tp")
if (restricted$theta) xtheta <- "t"
if (all) variables <- c("a", "d", xtheta, variables)
return(variables)
},
varnames = function(restricted = NULL, details = FALSE) {
# returns the list of variable names given the list 'restricted' to be used in
# naming dataframes and lists
#
# Args:
# restricted : a list specifying whether the model parameters are equal.
# It can take the value `NULL`, or contain one or more keys from
# the following keys (theta, mu, eps, d)
#
# Returns:
# a vector of variable names.
restricted <- .xadjpin$allrestrictions(restricted)
vars <- list(list(c("theta", "thetap"), "theta"),
list(c("eps.b", "eps.s"), "eps"),
list(c("mu.b", "mu.s"), "mu"),
list(c("d.b", "d.s"), "d"))
variables <- unlist(Map(function(x, y) x[[1 + y]], vars, restricted))
variables <- c("alpha", "delta", variables)
if (details == TRUE)
variables <- c(paste("in.", variables, sep = ""),
paste("op.", variables, sep = ""),
"likelihood", "adjpin", "psos")
return(variables)
},
allrestrictions = function(restricted = NULL) {
# fills empty keys in the argument list 'restricted' and return the full list
#
# Args:
# restricted : a list specifying whether the model parameters are equal.
# It can take the value `NULL`, or contain one or more keys from
# the following keys (theta, mu, eps, d)
#
# Returns:
# a list with four keys (theta, eps, mu, d)
allnames <- c("theta", "eps", "mu", "d")
restricted <- lapply(allnames, function(x) ifelse(
is.null(restricted[[x]]), FALSE, restricted[[x]]))
names(restricted) <- allnames
return(restricted)
},
distribution = function(z, restricted = NULL) {
# finds the distribution of days among different clusters given the values
# of the model parameters (alpha, delta, theta and thetap)
#
# Args:
# z : a vector of model parameters consisting of (alpha, delta, theta) if
# restricted$theta = TRUE, otherwise, (alpha, delta, theta, thetap)
# restricted : a list specifying whether the model parameters are equal.
# It can take the value `NULL`, or contain one or more keys from
# the following keys (theta, mu, eps, d)
#
# Returns:
# a list of probabilities distributed a six DY clusters
a <- d <- theta <- thetap <- 0
values <- z
restricted <- .xadjpin$allrestrictions(restricted)
if (restricted$theta) values <- c(z[1:3], z[3])
variables <- c("a", "d", "theta", "thetap")
for (i in 1:4) assign(variables[i], values[i])
distrib <- c(
(1 - a) * (1 - theta), # C1 : no info. no shock
(1 - a) * theta, # C2 : no info. shock
a * (1 - d) * (1 - thetap), # C3 : good info. no shock
a * (1 - d) * thetap, # C4 : good info. shock
a * d * (1 - thetap), # C5 : bad info. no shock
a * d * thetap # C6 : bad info. shock
)
return(distrib)
},
compute_pin = function(params, restricted) {
# computes the values of adjpin and psos and returns them along
# the vector of parameters
#
# Args:
# params: an eventually restricted vector of model parameters
# restricted : a list specifying whether the model parameters are equal.
# It can take the value `NULL`, or contain one or more keys from
# the following keys (theta, mu, eps, d)
#
# Returns:
# an list with three keys: + params containing a vector of 10 parameters
# + adjpin containing the adjust pin
# + psos containing the psos value
# initialize the local variables
# --------------------------------------------------------------------------
a <- dx <- d <- mu <- e <- t <- tp <- NULL
# Recover the names of the variables, and assign them to their values
variables <- .xadjpin$vars(restricted, all = TRUE)
for (i in seq_len(length(params))) assign(variables[i], params[i])
# Recover the ten parameters by assigning the common value to the values of
# both sides buy and sell, e.g., if restricted$theta = TRUE, then set the
# value of thetap = theta
if (restricted$theta) tp <- t
if (restricted$mu) mub <- mus <- mu
if (restricted$eps) eb <- es <- e
if (restricted$d) db <- ds <- dx
# Compute adjpin, and psos
adjpin <- (a * ((1 - d) * mub + d * mus)) /
((a * ((1 - d) * mub + d * mus)) +
(db + ds) * (a * tp + (1 - a) * t) + eb + es)
psos <- ((db + ds) * (a * tp + (1 - a) * t)) /
((a * ((1 - d) * mub + d * mus)) + (db + ds) *
(a * tp + (1 - a) * t) + eb + es)
response <- list(params = c(a, d, t, tp, eb, es, mub, mus, db, ds),
adjpin = adjpin, psos = psos)
return(response)
},
likelihood = function(data, params) {
# returns the (adjpin) likelihood value given trade dataset 'data', and
# a set of parameters 'params'
#
# Args:
# data : a dataframe of daily buys and sells
# params: a set of adjpin model parameters
#
# Returns:
# a numeric value of the adjpin likelihood
lkd <- -factorizations$adjpin(data)(unlist(params))
if (is.na(lkd)) lkd <- +Inf
return(lkd)
},
solve_eqx = function(a, b, c, d) {
# The rational equation to be solved is
# :: a0 / (x + b0) + a1 / (x + b1) + ... = c
# d is the default value, if solving equations fails.
# Check if the b's are equal, and sum the a's corresponding
# to these values
ub <- unique(b)
if (length(ub) < length(b)) {
xb <- xa <- NULL
for (i in seq_len(length(ub))) {
xb <- c(xb, ub[i])
xa <- c(xa, sum(a[b == ub[i]]))
}
a <- xa
b <- xb
}
# Find the coefficients from the roots of the polynomials
allcoeffs <- c * .xadjpin$getcoeff(b) -
as.numeric(colSums(ux$todframe((Map(function(x) c(
a[x] * .xadjpin$getcoeff(b[-x]), 0), seq_len(length(b))))
)))
fsol <- tryCatch(
polyroot(allcoeffs), error = function(e) {
NULL
}
)
if (length(fsol) > 0) {
fsol <- fsol[Im(zapsmall(fsol)) == 0]
fsol <- Re(fsol)
fsol <- fsol[fsol > 0]
}
if (length(fsol) == 0) fsol <- d
return(fsol)
},
getcoeff = function(roots) {
if (length(roots) == 1) return(c(roots, 1))
# The coefficients are generated from the roots
coeffs <- Map(
function(x) sum(exp(colSums(log(combn(roots, x))))),
length(roots):0
)
coeffs <- unlist(coeffs)
}
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.