R/sample_data.R

Defines functions sample_data

Documented in sample_data

#' Sample Data
#'
#' @description
#' A function to subset data for use in the distributed hierarchical Bayesian algorithm for scalable target marketing.
#' 
#' @param Data (list) - A list of lists where each sublist contains either 'regdata' or 'lgtdata'.
#' @param Rate (numeric) - Proportion of the data to be sampled
#' 
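#' @return Returns a list of the same structure as \code{Data}, in which each sublist's 'regdata' or 'lgtdata' (and the matching rows of \code{Z}) is randomly sampled down to a proportion \code{Rate} of its units.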
#'
#' 
#' @examples
#' 
#' # Generate hierarchical linear data
#' R=1000
#' nreg=10000
#' nobs=5 #number of observations
#' nvar=3 #columns
#' nz=2
#' 
#' Z=matrix(runif(nreg*nz),ncol=nz) 
#' Z=t(t(Z)-apply(Z,2,mean))
#' Delta=matrix(c(1,-1,2,0,1,0), ncol = nz) 
#' tau0=.1
#' iota=c(rep(1,nobs)) 
#' 
#' ## create arguments for rmixture
#' tcomps=NULL
#' a = diag(1, nrow=3)
#' tcomps[[1]] = list(mu=c(-5,0,0),rooti=a) 
#' tcomps[[2]] = list(mu=c(5, -5, 2),rooti=a)
#' tcomps[[3]] = list(mu=c(5,5,-2),rooti=a)
#' tpvec = c(.33,.33,.34)                               
#' ncomp=length(tcomps)
#' regdata=NULL
#' betas=matrix(double(nreg*nvar),ncol=nvar) 
#' tind=double(nreg) 
#' for (reg in 1:nreg) { 
#'   tempout=bayesm::rmixture(1,tpvec,tcomps)
#'   if (is.null(Z)){
#'     betas[reg,]= as.vector(tempout$x)  
#'   }else{
#'     betas[reg,]=Delta%*%Z[reg,]+as.vector(tempout$x)} 
#'   tind[reg]=tempout$z
#'   X=cbind(iota,matrix(runif(nobs*(nvar-1)),ncol=(nvar-1))) 
#'   tau=tau0*runif(1,min=0.5,max=1) 
#'   y=X%*%betas[reg,]+sqrt(tau)*rnorm(nobs)
#'   regdata[[reg]]=list(y=y,X=X,beta=betas[reg,],tau=tau) 
#' }
#' 
#' Prior1=list(ncomp=ncomp) 
#' keep=1
#' Mcmc1=list(R=R,keep=keep)
#' Data1=list(list(regdata=regdata,Z=Z))
#' 
#' length(Data1[[1]]$regdata)
#' 
#' data_s = sample_data(Data = Data1, Rate = 0.1)
#' length(data_s[[1]]$regdata)
#'
#' @author Federico Bumbaca, \email{federico.bumbaca@@colorado.edu}
#' @rdname sample_data
#' @export
#' 
sample_data = function(Data, Rate=1) {
  # Randomly subsample the units of every shard in `Data`.
  #
  # Data: list of shards; each shard is a list holding 'regdata' or 'lgtdata'
  #       (a list with one entry per unit) and, optionally, a matrix 'Z' with
  #       one row per unit. 'lgtdata' shards also carry 'p', copied unchanged.
  # Rate: single number in [0, 1], the proportion of units kept per shard.
  #
  # Returns a list with one entry per shard containing the sampled units and
  # the corresponding rows of 'Z'. With Rate == 1 `Data` is returned untouched.
  # Stops with an error if `Rate` is invalid or a shard has no unit list.
  if (!is.numeric(Rate) || length(Rate) != 1L || is.na(Rate) ||
      Rate < 0 || Rate > 1) {
    stop("'Rate' must be a single number between 0 and 1.", call. = FALSE)
  }
  if (Rate == 1) {
    return(Data)
  }

  DataSample = vector(mode = "list", length = length(Data))
  # seq_along() keeps the loop empty when `Data` has no shards.
  for (i in seq_along(Data)) {
    shard = Data[[i]]
    # 'lgtdata' takes precedence when both are present, as before.
    is_lgt = !is.null(shard[["lgtdata"]])
    units = if (is_lgt) shard[["lgtdata"]] else shard[["regdata"]]
    if (is.null(units)) {
      stop("Data[[", i, "]] contains neither 'regdata' nor 'lgtdata'.",
           call. = FALSE)
    }

    # Size the draw from this shard's own unit count: shards need not be
    # equally large, and the unit list need not be the first element.
    Ns = length(units)
    sampind = sample.int(n = Ns, size = floor(Ns * Rate))

    # drop = FALSE keeps Z a matrix even with one column or one sampled row.
    Z = shard[["Z"]]
    if (!is.null(Z)) {
      Z = Z[sampind, , drop = FALSE]
    }

    if (is_lgt) {
      DataSample[[i]] = list(lgtdata = units[sampind],
                             p = shard[["p"]],
                             Z = Z)
    } else {
      DataSample[[i]] = list(regdata = units[sampind],
                             Z = Z)
    }
  }
  DataSample
}

Try the scalablebayesm package in your browser

Any scripts or data that you put into this service are public.

scalablebayesm documentation built on April 3, 2025, 7:55 p.m.