Nothing
#' Calculate Maximum Number of Shards
#'
#' A function to calculate the maximum number of shards to be used in distributed hierarchical Bayesian algorithm for scalable target marketing.
#'
#' @param R (integer) - Number of MCMC draws.
#' @param N (integer) - The number of observational units in the full dataset.
#' @param Data (list) - A list of lists where each sublist contains either 'regdata' or 'lgtdata'.
#' @param s (integer) - A small number of shards used to evaluate the distributed algorithm.
#' @param ep_squaremax (numeric) - A value indicating the user's maximum expected error tolerance.
#' @param ncomp (integer) - The number of components in the mixture.
#' @param Bpercent (numeric) - A decimal value representing the proportion of draws to burn-in
#' @param iterations (numeric) - The number of times to estimate the maximum number of shards
#' @param keep (numeric) - MCMC thinning parameter -- keep every \code{keep}th draw (default: 1)
#' @param npoints (integer) - The number of points at which to evaluate the difference in posterior distributions
#'
#' @return The function returns a list of: (1) A vector of s_max estimated for each iteration, (2) s_max_min calculated using C_0_min,
#' (3) epsilon_square, (4) ep_squaremax, (5) R, (6) N, (7) Np, (8) C_0, and (9) C_0_min
#'
#' @author Federico Bumbaca, Leeds School of Business, University of Colorado Boulder, \email{federico.bumbaca@colorado.edu}
#' @references Bumbaca, F. (Rico), Misra, S., & Rossi, P. E. (2020). Scalable Target Marketing: Distributed Markov Chain Monte Carlo for Bayesian Hierarchical Models. Journal of Marketing Research, 57(6), 999-1018.
#'
#'
#' @examples
#'
#' \donttest{
#' # Generate hierarchical linear data
#'R = 1000
#'N = 2000
#'nobs = 5 #number of observations
#'nvar = 3 #columns
#'nz = 2
#'
#'Z = matrix(runif(N*nz),ncol=nz)
#'Z = t(t(Z)-apply(Z,2,mean))
#'Delta = matrix(c(1,-1,2,0,1,0), ncol = nz)
#'tau0 = 0.1
#'iota = c(rep(1,nobs))
## create arguments for rmixture
#'tcomps=NULL
#'a = diag(1, nrow=3)
#'tcomps[[1]] = list(mu=c(-5,0,0),rooti=a)
#'tcomps[[2]] = list(mu=c(5, -5, 2),rooti=a)
#'tcomps[[3]] = list(mu=c(5,5,-2),rooti=a)
#'tpvec = c(.33,.33,.34)
#'ncomp=length(tcomps)
#'regdata=NULL
#'betas=matrix(double(N*nvar),ncol=nvar)
#'tind=double(N)
#'for (reg in 1:N) {
#' tempout=bayesm::rmixture(1,tpvec,tcomps)
#' if (is.null(Z)){
#' betas[reg,]= as.vector(tempout$x)
#' }else{
#' betas[reg,]=Delta%*%Z[reg,]+as.vector(tempout$x)}
#' tind[reg]=tempout$z
#' X=cbind(iota,matrix(runif(nobs*(nvar-1)),ncol=(nvar-1)))
#' tau=tau0*runif(1,min=0.5,max=1)
#' y=X%*%betas[reg,]+sqrt(tau)*rnorm(nobs)
#' regdata[[reg]]=list(y=y,X=X,beta=betas[reg,],tau=tau)
#'}
#'
#'Prior1=list(ncomp=ncomp)
#'keep=1
#'Mcmc1=list(R=R,keep=keep)
#'Data1=list(list(regdata=regdata,Z=Z))
#'returns = s_max(R = R, N = N, Data = Data1, s = 1, iterations = 2)
#'returns
#' }
#' @export
s_max <- function(R, N, Data, s = 3, ep_squaremax = 0.001,
ncomp = 1, Bpercent = 0.5, iterations = 10, keep = 1, npoints = 1000){
s_max = s_max_min = c()
if (!is.null(Data[[1]]$regdata)){
K = ncol(Data[[1]]$regdata[[1]]$X)
Np = length(Data[[1]]$regdata)
post1=post2=array(0, dim = c(iterations, npoints, K))
for(it in 1:iterations){
#Run the benchmark
Data = partition_data(Data, 1) #combine data if in multiple shards
out1 = list(rhierLinearMixtureParallel(Data=Data[[1]], Prior=list(ncomp=ncomp), Mcmc=list(R=R, keep=keep)))
#MCMC Draws
compdraw1 = out1[[1]]$compdraw[floor((Bpercent*R)+1):R]
probdraw1 = out1[[1]]$probdraw[floor((Bpercent*R)+1):R,]
deltadraw1 = out1[[1]]$Deltadraw[floor((Bpercent*R)+1):R]
x=list()
for(k in 1:K){
mu_list = rooti_list = NULL
for(r in 1:length(compdraw1)){
for(nc in 1:ncomp){
mu_list = c(mu_list, compdraw1[[r]][[nc]]$mu[k])
rooti_list[[ncomp*(r-1)+nc]] = compdraw1[[r]][[nc]]$rooti
}
}
sigma_max = backsolve(r=rooti_list[[which.max(mu_list)]], x=diag(K), k=K)
sigma_min = backsolve(r=rooti_list[[which.min(mu_list)]], x=diag(K), k=K)
x[[k]] = c(min(mu_list) - 3*sqrt(matrix((t(sigma_min) %*% sigma_min), nrow = K)[k,k]),
max(mu_list) + 3*sqrt(matrix((t(sigma_max) %*% sigma_max), nrow = K)[k,k]))
}
#Estimate predictive distribution for benchmark
post1[it,,]=predictive_distribution(x=x, out=out1, Data=Data)
#Run the distributed version
Data2 = partition_data(Data=Data, s=s)
out2 = parallel::mclapply(Data2, FUN = rhierLinearMixtureParallel, Prior = Prior1, Mcmc = Mcmc1, mc.cores = s, mc.set.seed = FALSE)
# Estimate predictive distribution for distributed version
post2[it,,] = predictive_distribution(x=x, out=out2, Data=Data2)}
#Obtain max error dimension-wise, then choose max of those as epsilon_square
difference = apply((post2-post1)^2, 3, function(x) colMeans(x))
difference_max = apply((post2-post1)^2, 3, function(x) apply(x, 2, max))
epsilon_square = max(difference)
epsilon_square_max = max(difference_max)
# Calculate maximum number of shards
C_0=(s^2+1)/(s*Np*R*epsilon_square)
C_0_min = (s^2+1)/(s*Np*R*epsilon_square_max)
if((N*R*ep_squaremax)^2-4*C_0^(-2) > 0){
s_max = floor((C_0/2)*(N*R*ep_squaremax+sqrt((N*R*ep_squaremax)^2-4*C_0^(-2))))
} else {
s_max = 1
}
if((N*R*ep_squaremax)^2-4*C_0_min^(-2) > 0){
s_max_min = floor((C_0_min/2)*(N*R*ep_squaremax+sqrt((N*R*ep_squaremax)^2-4*C_0_min^(-2))))
} else {
s_max_min = 1
}
return(list("s_max" = s_max,
"epsilon_square" = epsilon_square,
"ep_squaremax" = ep_squaremax,
"s_max_min" = s_max_min,
"R" = R,
"N" = N,
"Np"= Np,
"C_0" = C_0,
"C_0_min" = C_0_min))
}
if (!is.null(Data[[1]]$lgtdata)){
K = ncol(Data[[1]]$lgtdata[[1]]$X)
Np = length(Data[[1]]$lgtdata)
post1=post2=array(0, dim = c(iterations, npoints, K))
for(it in 1:iterations){
#Run the benchmark
Data = partition_data(Data, 1) #combine data if in multiple shards
out1 = list(rhierMnlRwMixtureParallel(Data=Data[[1]], Prior=list(ncomp=ncomp), Mcmc=list(R=R, keep=keep)))
compdraw1 = out1[[1]]$compdraw[floor((Bpercent*R)+1):R]
probdraw1 = out1[[1]]$probdraw[floor((Bpercent*R)+1):R,]
deltadraw1 = out1[[1]]$Deltadraw[floor((Bpercent*R)+1):R]
x=list()
for(k in 1:K){
mu_list = rooti_list = NULL
for(r in 1:length(compdraw1)){
for(nc in 1:ncomp){
mu_list = c(mu_list, compdraw1[[r]][[nc]]$mu[k])
rooti_list[[ncomp*(r-1)+nc]] = compdraw1[[r]][[nc]]$rooti
}
}
sigma_max = backsolve(r=rooti_list[[which.max(mu_list)]], x=diag(K), k=K)
sigma_min = backsolve(r=rooti_list[[which.min(mu_list)]], x=diag(K), k=K)
x[[k]] = c(min(mu_list) - 3*sqrt(matrix((t(sigma_min) %*% sigma_min), nrow = K)[k,k]),
max(mu_list) + 3*sqrt(matrix((t(sigma_max) %*% sigma_max), nrow = K)[k,k]))
}
#Estimate predictive distribution for benchmark
post1[it,,]=predictive_distribution(x=x, out=out1, Data=Data)
#Run the distributed version
Data2 = partition_data(Data=Data, s=s)
out2 = parallel::mclapply(Data2, FUN = rhierMnlRwMixtureParallel, Prior = Prior1, Mcmc = Mcmc1, mc.cores = s, mc.set.seed = FALSE)
# Estimate predictive distribution for distributed version
post2[it,,] = predictive_distribution(x=x, out=out2, Data=Data2)}
#Obtain max error dimension-wise, then choose max of those as epsilon_square
difference = apply((post2-post1)^2, 3, function(x) colMeans(x))
difference_max = apply((post2-post1)^2, 3, function(x) apply(x, 2, max))
epsilon_square = max(difference)
epsilon_square_max = max(difference_max)
# Calculate maximum number of shards
C_0=(s^2+1)/(s*Np*R*epsilon_square)
C_0_min = (s^2+1)/(s*Np*R*epsilon_square_max)
if((N*R*ep_squaremax)^2-4*C_0^(-2) > 0){
s_max = floor((C_0/2)*(N*R*ep_squaremax+sqrt((N*R*ep_squaremax)^2-4*C_0^(-2))))
} else {
s_max = 1
}
if((N*R*ep_squaremax)^2-4*C_0_min^(-2) > 0){
s_max_min = floor((C_0_min/2)*(N*R*ep_squaremax+sqrt((N*R*ep_squaremax)^2-4*C_0_min^(-2))))
} else {
s_max_min = 1
}
return(list("s_max" = s_max,
"epsilon_square" = epsilon_square,
"ep_squaremax" = ep_squaremax,
"s_max_min" = s_max_min,
"R" = R,
"N" = N,
"Np"= Np,
"C_0" = C_0,
"C_0_min" = C_0_min))
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.