R/s_max.R
In scalablebayesm: Distributed Markov Chain Monte Carlo for Bayesian Inference in Marketing

Documented in s_max

#' Calculate Maximum Number of Shards
#'
#' A function to calculate the maximum number of shards to be used in distributed hierarchical Bayesian algorithm for scalable target marketing.
#'
#' @param R (integer) - Number of MCMC draws.
#' @param N (integer) - The number of observational units in the full dataset.
#' @param Data (list) - A list of lists where each sublist contains either 'regdata' or 'lgtdata'.
#' @param s  (integer) - A small number of shards used to evaluate the distributed algorithm.
#' @param ep_squaremax (numeric) - A value indicating the user's maximum expected error tolerance.
#' @param ncomp (integer) - The number of components in the mixture.
#' @param Bpercent (numeric) - A decimal value representing the proportion of draws to burn-in
#' @param iterations (numeric) - The number of times to estimate the maximum number of shards
#' @param keep (numeric) - MCMC thinning parameter -- keep every \code{keep}th draw (default: 1) 
#' @param npoints (integer) - The number of points at which to evaluate the difference in posterior distributions
#' 
#' @return The function returns a list of: (1) A vector of s_max estimated for each iteration, (2) s_max_min calculated using C_0_min,
#' (3) epsilon_square, (4) ep_squaremax, (5) R, (6) N, (7) Np, (8) C_0, and (9) C_0_min
#' 
#' @author Federico Bumbaca, Leeds School of Business, University of Colorado Boulder, \email{federico.bumbaca@colorado.edu}
#' @references Bumbaca, F. (Rico), Misra, S., & Rossi, P. E. (2020). Scalable Target Marketing: Distributed Markov Chain Monte Carlo for Bayesian Hierarchical Models. Journal of Marketing Research, 57(6), 999-1018.
#' 
#' 
#' @examples
#' 
#' \donttest{
#' # Generate hierarchical linear data
#'R = 1000
#'N = 2000
#'nobs = 5 #number of observations
#'nvar = 3 #columns
#'nz = 2
#'
#'Z = matrix(runif(N*nz),ncol=nz) 
#'Z = t(t(Z)-apply(Z,2,mean))
#'Delta = matrix(c(1,-1,2,0,1,0), ncol = nz) 
#'tau0 = 0.1
#'iota = c(rep(1,nobs)) 

## create arguments for rmixture
#'tcomps=NULL
#'a = diag(1, nrow=3)
#'tcomps[[1]] = list(mu=c(-5,0,0),rooti=a) 
#'tcomps[[2]] = list(mu=c(5, -5, 2),rooti=a)
#'tcomps[[3]] = list(mu=c(5,5,-2),rooti=a)
#'tpvec = c(.33,.33,.34)                               
#'ncomp=length(tcomps)
#'regdata=NULL
#'betas=matrix(double(N*nvar),ncol=nvar) 
#'tind=double(N) 
#'for (reg in 1:N) { 
#'  tempout=bayesm::rmixture(1,tpvec,tcomps)
#'  if (is.null(Z)){
#'    betas[reg,]= as.vector(tempout$x)  
#'  }else{
#'    betas[reg,]=Delta%*%Z[reg,]+as.vector(tempout$x)} 
#'  tind[reg]=tempout$z
#'  X=cbind(iota,matrix(runif(nobs*(nvar-1)),ncol=(nvar-1))) 
#'  tau=tau0*runif(1,min=0.5,max=1) 
#'  y=X%*%betas[reg,]+sqrt(tau)*rnorm(nobs)
#'  regdata[[reg]]=list(y=y,X=X,beta=betas[reg,],tau=tau) 
#'}
#'
#'Prior1=list(ncomp=ncomp) 
#'keep=1
#'Mcmc1=list(R=R,keep=keep)
#'Data1=list(list(regdata=regdata,Z=Z))

#'returns = s_max(R = R, N = N, Data = Data1, s = 1, iterations = 2)
#'returns
#' }
#' @export

s_max <- function(R, N, Data, s = 3, ep_squaremax = 0.001,
                  ncomp = 1, Bpercent = 0.5, iterations = 10, keep = 1, npoints = 1000){
  s_max = s_max_min = c()
  
  if (!is.null(Data[[1]]$regdata)){ 
    K = ncol(Data[[1]]$regdata[[1]]$X)
    Np = length(Data[[1]]$regdata)
    post1=post2=array(0, dim = c(iterations, npoints, K))
    
    for(it in 1:iterations){
      #Run the benchmark
      Data = partition_data(Data, 1) #combine data if in multiple shards
      out1 = list(rhierLinearMixtureParallel(Data=Data[[1]], Prior=list(ncomp=ncomp), Mcmc=list(R=R, keep=keep)))
      
      #MCMC Draws
      compdraw1 = out1[[1]]$compdraw[floor((Bpercent*R)+1):R] 
      probdraw1 = out1[[1]]$probdraw[floor((Bpercent*R)+1):R,]
      deltadraw1 = out1[[1]]$Deltadraw[floor((Bpercent*R)+1):R]
      
      x=list()
      for(k in 1:K){
        mu_list = rooti_list = NULL
        for(r in 1:length(compdraw1)){
          for(nc in 1:ncomp){
            mu_list = c(mu_list, compdraw1[[r]][[nc]]$mu[k])
            rooti_list[[ncomp*(r-1)+nc]] = compdraw1[[r]][[nc]]$rooti
          }
        }
        
        sigma_max = backsolve(r=rooti_list[[which.max(mu_list)]], x=diag(K), k=K)
        sigma_min = backsolve(r=rooti_list[[which.min(mu_list)]], x=diag(K), k=K)
        
        x[[k]] = c(min(mu_list) - 3*sqrt(matrix((t(sigma_min) %*% sigma_min), nrow = K)[k,k]),
                   max(mu_list) + 3*sqrt(matrix((t(sigma_max) %*% sigma_max), nrow = K)[k,k]))
      }
      
      #Estimate predictive distribution for benchmark
      post1[it,,]=predictive_distribution(x=x, out=out1, Data=Data)
      
      #Run the distributed version
      Data2 = partition_data(Data=Data, s=s)
      out2 = parallel::mclapply(Data2, FUN = rhierLinearMixtureParallel, Prior = Prior1, Mcmc = Mcmc1, mc.cores = s, mc.set.seed = FALSE)
      
      # Estimate predictive distribution for distributed version
      post2[it,,] = predictive_distribution(x=x, out=out2, Data=Data2)}
      
      #Obtain max error dimension-wise, then choose max of those as epsilon_square
      difference = apply((post2-post1)^2, 3, function(x) colMeans(x))
      difference_max = apply((post2-post1)^2, 3, function(x) apply(x, 2, max))
      
      epsilon_square = max(difference)
      epsilon_square_max = max(difference_max)
      
      # Calculate maximum number of shards
      C_0=(s^2+1)/(s*Np*R*epsilon_square)
      C_0_min = (s^2+1)/(s*Np*R*epsilon_square_max)
      
      if((N*R*ep_squaremax)^2-4*C_0^(-2) > 0){
        s_max = floor((C_0/2)*(N*R*ep_squaremax+sqrt((N*R*ep_squaremax)^2-4*C_0^(-2))))
      } else {
        s_max = 1
      }
      
      if((N*R*ep_squaremax)^2-4*C_0_min^(-2) > 0){
        s_max_min = floor((C_0_min/2)*(N*R*ep_squaremax+sqrt((N*R*ep_squaremax)^2-4*C_0_min^(-2))))
      } else {
        s_max_min = 1
      }
      
    return(list("s_max" = s_max,
                "epsilon_square" = epsilon_square,
                "ep_squaremax" = ep_squaremax, 
                "s_max_min" = s_max_min,
                "R" = R, 
                "N" = N,
                "Np"= Np,
                "C_0" = C_0,
                "C_0_min" = C_0_min))
  }
  if (!is.null(Data[[1]]$lgtdata)){ 
    K = ncol(Data[[1]]$lgtdata[[1]]$X) 
    Np = length(Data[[1]]$lgtdata)
    post1=post2=array(0, dim = c(iterations, npoints, K))
    for(it in 1:iterations){
      
      #Run the benchmark
      Data = partition_data(Data, 1) #combine data if in multiple shards
      out1 = list(rhierMnlRwMixtureParallel(Data=Data[[1]], Prior=list(ncomp=ncomp), Mcmc=list(R=R, keep=keep)))
      
      compdraw1 = out1[[1]]$compdraw[floor((Bpercent*R)+1):R] 
      probdraw1 = out1[[1]]$probdraw[floor((Bpercent*R)+1):R,]
      deltadraw1 = out1[[1]]$Deltadraw[floor((Bpercent*R)+1):R]
      
      x=list()
      for(k in 1:K){
        mu_list = rooti_list = NULL
        for(r in 1:length(compdraw1)){
          for(nc in 1:ncomp){
            mu_list = c(mu_list, compdraw1[[r]][[nc]]$mu[k])
            rooti_list[[ncomp*(r-1)+nc]] = compdraw1[[r]][[nc]]$rooti
          }
        }
        
        sigma_max = backsolve(r=rooti_list[[which.max(mu_list)]], x=diag(K), k=K)
        sigma_min = backsolve(r=rooti_list[[which.min(mu_list)]], x=diag(K), k=K)
        
        x[[k]] = c(min(mu_list) - 3*sqrt(matrix((t(sigma_min) %*% sigma_min), nrow = K)[k,k]),
                   max(mu_list) + 3*sqrt(matrix((t(sigma_max) %*% sigma_max), nrow = K)[k,k]))
      }
      
      #Estimate predictive distribution for benchmark
      post1[it,,]=predictive_distribution(x=x, out=out1, Data=Data)
      
      #Run the distributed version
      Data2 = partition_data(Data=Data, s=s)
      out2 = parallel::mclapply(Data2, FUN = rhierMnlRwMixtureParallel, Prior = Prior1, Mcmc = Mcmc1, mc.cores = s, mc.set.seed = FALSE)
      
      # Estimate predictive distribution for distributed version
      post2[it,,] = predictive_distribution(x=x, out=out2, Data=Data2)}
      
      #Obtain max error dimension-wise, then choose max of those as epsilon_square
      difference = apply((post2-post1)^2, 3, function(x) colMeans(x))
      difference_max = apply((post2-post1)^2, 3, function(x) apply(x, 2, max))
      
      epsilon_square = max(difference)
      epsilon_square_max = max(difference_max)
      
      # Calculate maximum number of shards
      C_0=(s^2+1)/(s*Np*R*epsilon_square)
      C_0_min = (s^2+1)/(s*Np*R*epsilon_square_max)
      
      if((N*R*ep_squaremax)^2-4*C_0^(-2) > 0){
        s_max = floor((C_0/2)*(N*R*ep_squaremax+sqrt((N*R*ep_squaremax)^2-4*C_0^(-2))))
      } else {
        s_max = 1
      }
      
      if((N*R*ep_squaremax)^2-4*C_0_min^(-2) > 0){
        s_max_min = floor((C_0_min/2)*(N*R*ep_squaremax+sqrt((N*R*ep_squaremax)^2-4*C_0_min^(-2))))
      } else {
        s_max_min = 1
      }
      

      return(list("s_max" = s_max,
                  "epsilon_square" = epsilon_square,
                  "ep_squaremax" = ep_squaremax, 
                  "s_max_min" = s_max_min,
                  "R" = R, 
                  "N" = N,
                  "Np"= Np,
                  "C_0" = C_0,
                  "C_0_min" = C_0_min))
  }
}

Any scripts or data that you put into this service are public.

scalablebayesm documentation built on April 3, 2025, 7:55 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

scalablebayesm
Distributed Markov Chain Monte Carlo for Bayesian Inference in Marketing

R/s_max.R
In scalablebayesm: Distributed Markov Chain Monte Carlo for Bayesian Inference in Marketing

Defines functions s_max

Documented in s_max

Try the scalablebayesm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

scalablebayesm Distributed Markov Chain Monte Carlo for Bayesian Inference in Marketing

R/s_max.R In scalablebayesm: Distributed Markov Chain Monte Carlo for Bayesian Inference in Marketing

Defines functions s_max

Documented in s_max

Try the scalablebayesm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

scalablebayesm
Distributed Markov Chain Monte Carlo for Bayesian Inference in Marketing

R/s_max.R
In scalablebayesm: Distributed Markov Chain Monte Carlo for Bayesian Inference in Marketing