#' Estimating a block model for sequential data
#'
#' @description \code{block} is used to estimate a 'block model' (REFERENCE)
#' based on the dominant level of a variable in a given ordered sequence.
#' @param formula an object of class "formula" (or one that can be coerced to that class):
#' a symbolic description of the model to be fitted.
#' The details of model specification are given under 'Details'.
#' @param data an optional data frame, list or environment (or object coercible by
#' as.data.frame to a data frame) containing the variables in the model.
#' If not found in data, the variables are taken from environment(formula),
#' typically the environment from which glm is called.
#' @param thr Threshold for the length of a block in order to be identified.
#' @param verbose a logical indicating if some "progress report" should be given.
#' @param na.action a function which indicates what should happen when the data contain NAs.
#' The default is set by to na.pass.
#' @param method The method used for reduction of groups. See 'Details'.
#' @param ... Not currently used, just in case.
#' @details We define an "unloading block" as a group of fish that were
#' predominantly of the same type (e.g. species, quality, size class) within
#' each individual unloading. Unloading blocks are allowed to contain some minor
#' amount of fish with different characteristics for practical reasons.
#' To estimate the unloading blocks in each super-sample, the n-running
#' proportion for each type of fish in the unloading is calculated and the
#' dominant type (more than 50%) identified for each group of n consecutive
#' fishes. An unloading block is then computed as the union of contiguous
#' groups with the same dominant type. Small blocks (less than n fish unloaded)
#' dividing two blocks of the same dominant type are absorved to generate
#' uninterrupted unloading blocks of the same type. Since the expected length
#' of a unloading block is expected to change as function of the total
#' unloading length (i.e. the longer the unloading the longer the unloading
#' blocks), the value n to compute the running proportions should be taken as
#' a fraction (e.g. 3\%, thr=0.03) of the total number fish unloaded N.
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#'
#' data_file = system.file("data/supersamples_demo.csv", package="sharkbox")
#' dat = read.csv(data_file)
#' mod_block0 = blocks(order|trip ~ species, data=dat)
#' mod_block1 = blocks(order|trip ~ group, data=dat)
#' }
blocks = function(formula, data, thr=0.03, verbose=TRUE, na.action,
method="default", ...) {
if(missing(na.action)) na.action = na.pass
gp = interpret.smc(formula)
# check only one block term is included
cl = match.call()
# full model frame
mf = match.call(expand.dots = FALSE)
mf$formula = gp$fake.formula
mf[[1]] = quote(stats::model.frame)
mf$na.action = na.pass # check for NAs manually
mf$thr = mf$verbose = NULL
mf = eval(mf, data, parent.frame())
ind = .sort(mf, gp)
sf = if(gp$doSplit) mf[ind , gp$split.names, drop=FALSE] else rep(1, length=nrow(mf))
# check each split has complete data
fac = .getFactorLevels(mf=mf, vars=all.vars(gp$pred.formula), ind=ind)
dat = split(fac, f = sf)
thisBlocks = levels(fac)[unsplit(lapply(dat, .findBlocks, thr=thr), f=sf)]
# tb = fac$levels[unlist(tapply(fac$f, INDEX = sf, FUN=.findBlocks, thr=thr))]
check = .checkBlocks(thisBlocks, fac, thr, method)
while(!attr(check, "ok")) {
thisBlocks[thisBlocks==attr(check, "old")] = attr(check, "new")
if(attr(check, "old")=="NA") thisBlocks[is.na(thisBlocks)] = attr(check, "new")
if(isTRUE(verbose) & attr(check, "old")!="NA")
message(sprintf("Merging '%s' with '%s'", attr(check, "old"), attr(check, "new")))
check = .checkBlocks(thisBlocks, fac, thr, method)
}
thisBlocks = thisBlocks[order(ind)] # back to original order
thisBlocks = factor(thisBlocks)
bb = table(thisBlocks)
output = list(coefficients = names(bb), residuals = NULL,
fitted.values=thisBlocks, model=NULL, na.action=na.action,
call=call, formula=formula, terms=terms, data=mf, control=NULL,
method=method)
class(output) = "block"
return(output)
}
# Internals ---------------------------------------------------------------
.sort = function(xf, gp) {
if(!is.null(gp$split.names))
return(order(xf[, gp$split.names], xf[, gp$response]))
return(order(xf[, gp$response]))
}
#' Internal for identify blocks in a sequence
#'
#' @param x A vector of integers
#' @param thr The threshold to identify blocks, as fraction of total length
#'
#' @return A vector with the blocks labeled by the dominant integer in the block
#'
#' @examples
.findBlocks = function(x, thr=0.03) {
S = max(x)
spp = seq_len(S)
n = max(10, thr*length(x))
out = matrix(0, ncol=S, nrow=length(x))
for(iSp in spp) {
out[, iSp] = 0 + (x == iSp)
}
colnames(out) = seq_len(S)
out0 = apply(out, 2, .getRates, n=n)
out1 = apply(out, 2, .getBlocks, n=n)
out2 = rle(rowSums(out1))
ones = out2$values == 1
out2$values[ones] = seq_len(sum(ones))
out2$values[!ones] = NA
out2 = inverse.rle(out2)
if(sum(!is.na(out2))<n) {
# warning("No blocks identified, returning NA")
return(out2)
}
out3 = round(approx(x=out2, xout = seq_along(out2), rule=2)$y, 0)
spx = table(out3, x)
sp0 = setNames(apply(spx, 1, which.max), nm=NULL)
gg0 = setNames(seq_along(unique(sp0)), nm = unique(sp0))
out4 = setNames(gg0[as.character(sp0[out3])], nm=NULL)
out4 = .removeBumps(out4, thr=thr)
mainSp = apply(table(out4, factor(x, levels=spp)), 1, which.max)
out = setNames(mainSp[as.character(out4)], nm=NULL)
return(out)
}
# check for small blocks and suggest merging
.checkBlocks = function(thisBlocks, fac, thr, method) {
thisBlocks[is.na(thisBlocks)] = "NA"
bb = rle(thisBlocks)
class(bb) = "list"
bb = data.frame(bb)
bx = rowsum(bb$lengths, bb$values)
bx_names = rownames(bx)
bx = data.frame(count=as.numeric(bx), blocks=as.numeric(table(bb$values)[rownames(bx)]))
bx$keep = bx$count > thr*sum(bx$count)
ind = order(bx$keep, bx$blocks, bx$count)
bx = bx[ind, ]
bx$name = bx_names[ind]
rownames(bx) = NULL
check = bx
check$keep[is.na(check$blocks)] = FALSE # always remove NA in blocks!
check$keep[which(check$name == "NA")] = FALSE # always remove NA in blocks!
attr(check, "ok") = TRUE
# new -> old
if(all(check$keep)) return(check)
x = as.matrix(format(unclass(table(fac, thisBlocks))))
class(x) = "numeric"
rownames(x) = levels(fac)
# Method especific --------------------------------------------------------
x3 = t(t(x)/colSums(x))
x4 = x3
old = check[1, "name"]
for(j in seq_len(nrow(x3))) x4[j, ] = pmin(x3[j,], x3[j, old])
check$x4 = colSums(x4)[check$name]
ind = order(check$x4, check$blocks, decreasing = TRUE)
new = check[ind[2], "name"]
# end of method
attr(check, "old") = old
attr(check, "new") = new
attr(check, "ok") = FALSE
return(check)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.