R/LDA_gibbs_sampler.R

#' Cluster time segments into behavioral states
#'
#' This function performs a Gibbs sampler within the Latent Dirichlet Allocation
#' (LDA) model to estimate proportions of each behavioral state for all time
#' segments generated by \code{\link{segment_behavior}}. This is the second
#' stage of the two-stage Bayesian model that estimates proportions of
#' behavioral states by first segmenting individual tracks into relatively
#' homogeneous segments of movement.
#'
#' The LDA model analyzes all animal IDs pooled together, thereby providing
#' population-level estimates of behavioral states.
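#'
#' As a rough sketch of the assumed generative model (inferred from the
#' likelihood evaluated by this sampler, so the notation is illustrative rather
#' than definitive): for time segment \eqn{i} and data stream \eqn{j}, the
#' vector of binned counts is modeled as
#' \deqn{y_{ij} \sim \textrm{Multinomial}(n_{ij}, \theta_i \phi_j),}
#' where the row vector \eqn{\theta_i} gives segment \eqn{i}'s proportions of
#' the candidate behavioral states (truncated stick-breaking prior controlled
#' by \code{gamma1}) and \eqn{\phi_j} gives each state's bin probabilities for
#' stream \eqn{j} (Dirichlet prior controlled by \code{alpha}).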
#'
#' @param dat A data frame returned by \code{\link{summarize_tsegs}} that
#'   summarizes the counts of observations per bin and movement variable for all
#'   animal IDs.
#' @param gamma1 numeric. A hyperparameter for the truncated stick-breaking
#'   prior for estimating the \code{theta} matrix.
#' @param alpha numeric. A hyperparameter for the Dirichlet distribution when
#'   estimating the \code{phi} matrix.
#' @param ngibbs numeric. The total number of iterations of the MCMC chain.
#' @param nmaxclust numeric. A single number indicating the maximum number of
#'   clusters to test.
#' @param nburn numeric. The length of the burn-in phase.
#' @param ndata.types numeric. A single number indicating the number of data
#'   streams (i.e., movement variables) summarized within \code{dat}.
#'
#' @return A list of model results containing the \code{phi} matrix for each
#'   data stream, the \code{theta} matrix, the log likelihood of each iteration
#'   of the MCMC chain (\code{loglikel}), and the latent cluster estimates for
#'   each data stream (\code{z.agg}). The \code{theta} and \code{phi} elements
#'   store one row of flattened (column-major) values per iteration.
#'
#' @examples
#' \donttest{
#' #load data
#' data(tracks.seg)
#'
#' #select only id, tseg, SL, and TA columns
#' tracks.seg2<- tracks.seg[,c("id","tseg","SL","TA")]
#'
#' #summarize data by track segment
#' obs<- summarize_tsegs(dat = tracks.seg2, nbins = c(5,8))
#'
#' #cluster data with LDA
#' res<- cluster_segments(dat = obs, gamma1 = 0.1, alpha = 0.1, ngibbs = 1000,
#'                        nburn = 500, nmaxclust = 7, ndata.types = 2)
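#'
#' #hypothetical follow-up (not from the original example): drop the burn-in
#' #draws and reshape the flattened theta samples into posterior mean
#' #proportions per time segment; assumes the column-major storage used by
#' #this function and the settings above (ngibbs = 1000, nburn = 500,
#' #nmaxclust = 7)
#' theta.post<- matrix(colMeans(res$theta[501:1000,]),
#'                     nrow = nrow(obs), ncol = 7)
#' round(head(theta.post), 3)
#'
#' #a quick convergence check on the log-likelihood trace
#' plot(res$loglikel, type = "l")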
#' }
#'
#' @importFrom stats "rmultinom"
#' @export
cluster_segments=function(dat, gamma1, alpha, ngibbs, nmaxclust, nburn, ndata.types){
  ntsegm=nrow(dat)


  #separate the binned counts for each movement variable (data stream)
  y=list()
  nbins=rep(NA, ndata.types)
  for (i in 1:ndata.types){
    nome=paste0('y', i)
    ind=grep(nome, colnames(dat))
    y[[i]]=data.matrix(dat[,ind])
    nbins[i]=length(ind)
  }

  #initial values
  phi=z.agg=list()
  for (i in 1:ndata.types){
    phi[[i]]=matrix(1/nbins[i], nmaxclust, nbins[i])
    z.agg[[i]]=array(NA, dim=c(ntsegm, nbins[i], nmaxclust))
  }
  theta=matrix(1/nmaxclust, ntsegm, nmaxclust)

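  #initialize the latent cluster allocations by randomly splitting each segment's
  #observed bin counts across the candidate clusters with equal probability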
  for (j in 1:ndata.types){
    for (i in 1:ntsegm){
      for (k in 1:nbins[j]){
        z.agg[[j]][i,k,]=rmultinom(1, size=y[[j]][i,k], prob=rep(1/nmaxclust, nmaxclust))
      }
    }
  }

  #prepare for gibbs
  store.phi=zeroes=list()
  for (i in 1:ndata.types){
    store.phi[[i]]=matrix(NA, ngibbs, nmaxclust*nbins[i])
    zeroes[[i]]=array(0, c(ntsegm, nbins[i], nmaxclust))
  }
  store.theta=matrix(NA, ngibbs, ntsegm*nmaxclust)
  store.loglikel=rep(NA, ngibbs)

  #progress bar
  pb <- progress::progress_bar$new(
    format = " iteration (:current/:total) [:bar] :percent [Elapsed: :elapsed, Remaining: :eta]",
    total = ngibbs, clear = FALSE, width = 100)

  #run gibbs sampler
  for (i in 1:ngibbs){
    pb$tick()  #update progress bar

    #during burn-in, re-order clusters every 50 iterations by decreasing mean theta
    #(helps mixing under the truncated stick-breaking prior)
    if (i < nburn & i%%50==0){
      med=apply(theta, 2, mean)
      ordem=order(med, decreasing=T)
      theta=theta[,ordem]

      for (j in 1:ndata.types){
        z.agg[[j]]=z.agg[[j]][,,ordem]
        phi[[j]]=phi[[j]][ordem,]
      }
    }

    #sample from the full conditional distributions (FCDs)
    #latent cluster allocations z for each segment's binned counts
    z.agg=sample.z(ntsegm=ntsegm, nbins=nbins, y=y, nmaxclust=nmaxclust,
                   phi=phi, ltheta=log(theta), zeroes=zeroes, ndata.types=ndata.types)

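    #stick-breaking weights v, converted to the segment-level proportions theta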
    v=sample.v(z.agg=z.agg, gamma1=gamma1,
               ntsegm=ntsegm, ndata.types=ndata.types, nmaxclust=nmaxclust)
    theta=get.theta(v=v, nmaxclust=nmaxclust, ntsegm=ntsegm)

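    #cluster-specific bin probabilities for each data stream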
    phi=sample.phi(z.agg=z.agg, alpha=alpha, nmaxclust=nmaxclust,
                   nbins=nbins, ndata.types=ndata.types)

    #calculate the log-likelihood of the binned counts given theta %*% phi
    p1=0
    for (j in 1:ndata.types){
      prob1=theta %*% phi[[j]]
      p1=p1+sum(y[[j]]*log(prob1))
    }

    #store results
    for (j in 1:ndata.types){
      store.phi[[j]][i,]=phi[[j]]
    }
    store.theta[i,]=theta
    store.loglikel[i]=p1
  }

  list(phi=store.phi, theta=store.theta, loglikel=store.loglikel, z.agg=z.agg)
}
