R/categorize.R

Defines functions gom.Mpadj gom.Madj gom.log.Mpadj gom.log.Madj top_two_categories get_mix_parameters.BAMBI get_mix_parameters.Mclust get_mix_parameters get_goms.mclust get_goms.EMCluster get_goms_salop get_goms_linear get_goms categorize_positions categorize.EMCluster categorize.BAMBI categorize.mclust make_categories make_categories_runner make_report_function process_reports agglom.reduc agglom

Documented in agglom agglom.reduc categorize_positions get_goms get_goms_linear get_goms_salop get_mix_parameters make_categories make_categories_runner make_report_function process_reports top_two_categories

#' Condenses list of numbers to convenient printable form, e.g. c(1,2,3,5) -> '1-3,5'
agglom <- function(xs) {
    res <- Reduce(agglom.reduc, xs, list())
    res <- lapply(res, function(x) { if(x[1]==x[2]) { x[1] } else { paste0(x[1],'-',x[2]) }})
    paste0(res, collapse=',')
}

#' Reduction function for turning list of integers into lists of consecutive sets
agglom.reduc <- function(xs,x) {
    idx <- length(xs)
    if(idx == 0) {
        xs <- list(c(x,x))
    } else {
        if((x - xs[[idx]][2]) == 1) {
            xs[[idx]] = c(xs[[idx]][1],x)
        } else {
            xs[[idx+1]] = c(x,x)
        }
    }
    xs
}

#' Reporting function for parallel categorizations
process_reports <- function(f, sims, max.iter) {
    nsim <- length(sims)
    sim.map <- seq_along(sims)
    names(sim.map) <- sims
    done <- integer(length=nsim)
    ks <- vector(mode='list', length=nsim)
    length.wipe <- min(6, getOption('mc.cores', default=2L)) * (11 + log(max.iter, 10))
    wipe.str <- paste0(rep(' ', length.wipe), collapse='')
    while(!isIncomplete(f)) {
        msg <- readBin(f, 'character')
        msgs <- as.numeric(strsplit(msg, ':', fixed=TRUE)[[1]])
        # extract simulation id, and find its runner index
        simi <- msgs[1]
        simidx <- sim.map[as.character(simi)]
        # extract last completed iteration
        done[simidx] <- msgs[2]
        ks[[simidx]] <- c(ks[[simidx]], msgs[3])
        if(msgs[2] == max.iter) {
            #cat('\r', rep(' ', getOption('width')), sep='', collapse='')
            #cat('\r', rep(' ', length.wipe), sep='', collapse='')
            cat('\r', wipe.str, '\r')
            cat(sprintf('\r(%5.1f%%) %d %s\n',
                        100*sum(done==max.iter) / nsim,
                        simi,
                        agglom(sort(unique(ks[[simidx]])))))
            #cat('\r', simi, ' ', agglom(sort(unique(ks[[simi]]))), '\n',
            #    sep='', collapse='')
        }
        curr <- which((done>0) & (done<max.iter))
        fl <- order(done[curr], decreasing=TRUE)[c(1, length(curr))]
        pcts <- 100*done[curr]/max.iter
        if(length(curr) > 6) {
            cat(sprintf('\r%s\r%d %6.2f%% ...[+%d]... %d %6.2f%%\r',
                        wipe.str, sims[curr[fl[1]]], pcts[fl[1]], length(curr) - 2, sims[curr[fl[2]]], pcts[fl[2]]))
        } else {
            cat('\r', paste0(sprintf('%d %6.2f%%', sims[curr], 100*done[curr]/max.iter),
                             collapse=', '),
                sep='')
        }
    }
    cat('\n')
    parallel:::mcexit()
}

#' Factory for passing data to reporting function
make_report_function <- function(f, simi) {
    rf <- function(i, k) {
        writeBin(sprintf('%d:%d:%d', simi, i, k), f)
    }
    rf
}

#' Runner method for generating categories
#'
#' @param positions positions to categorize, as generated by simulation code
#' @param nsim number of simulation runs to categorize
#' @param max.iter number of iterations per market to categorize
#' @export
make_categories_runner <- function(positions, nsim=NULL, max.iter=NULL, ...) {
    #nsim <- if(is.null(nsim)) max(positions$sim) else nsim
    sim_seq <- unique(positions$sim)
    #cat('Make categories runner, max.iter', max.iter, '\n')
    #print(sim_seq)
    #nsim <- length(sim_seq)
    niter <- if(is.null(max.iter)) max(positions$id) else max.iter

    f <- fifo(tempfile(), open="w+b", blocking=T)
    if (inherits(parallel:::mcfork(), "masterProcess")) {
        process_reports(f, sim_seq, max.iter=niter)
    }
    result <- parallel::mclapply(sim_seq, function(simi) {
            reportf <- make_report_function(f, simi)
            make_categories(positions=positions[sim==simi],
                            reportf=reportf, 
                            max.iter=niter,
                            ...)
        })
    close(f)
    result
}

#' Make 'optimal' categories for a set of positions
#'
#' Finds the optimal number and position of categories for the evolution of a market
#' Optimal in the sense of information criterion
#' @param positions set of market positions
#' @param init.iter initial iteration to categorize (usually after insertions)
#' @param max.iter maximum iteration to categorize
#' @param package which clustering package to use
#' @param reportf verbosity function (NULL to be quiet)
#' @param ... capture additional arguments
make_categories <- function(positions,
                            min.iter=positions[agent.id>1, min(id)],
                            max.iter=positions[, max(id)],
                            package='mclust',
                            reportf=NULL,
                            ...) {
    # assign NULL mixture to all uncategorized iterations
    #ems <- rep(list(NULL), min.iter-1)
    #ems <- rep(list(NULL), max.iter)
    ems <- vector('list', max.iter)

    xcat.f <- switch(package,
                     'mclust'=categorize.mclust,
                     'BAMBI'=categorize.BAMBI,
                     'EMCluster'=categorize.EMCluster)
    max.errors <- 3
    #max.errors <- sample(0:1, 1)
    errorf <- function(x, e, remain=max.errors) {
        #if(remain < max.errors) cat('DEBUG errorf', remain, ', simi', positions[, unique(sim)], '\n')
        if(remain > 0) {
            xcat <- tryCatch({
                xcat.f(x, ...)
            }, error=function(e) {
                cat(sprintf('\nERROR (-%d) on %d: %s\n', remain, positions[, unique(sim)], e$message))
                errorf(x, e, remain-1)
            })
        } else {
            cat(sprintf('\nPERSISTENT ERROR on %d: %s\n', positions[, unique(sim)], e$message))
            stop(e)
        }   
        xcat
    }

    for(i in min.iter:max.iter) {
        x <- positions[id <= i, x]
        #cat('DEBUG initial, iter', i, ', simi', positions[, unique(sim)], '\n')
        xcat <- errorf(x, NULL)
        #tryCatch({
        #    xcat <- xcat.f(x, ...)
        #}, error=function(e) cat('\nERROR:', e$message, '\n'))
        #xcat <- switch(package,
        #               'mclust'=categorize.mclust(x, ...),
        #               'BAMBI'=categorize.BAMBI(x, ...),
        #               'EMCluster'=categorize.EMCluster(x, ...))

        ems[[i]] <- xcat$mix

        if(!is.null(reportf)) reportf(i, xcat$k)
    }
    ems
}

#' @import mclust
categorize.mclust <- function(x, min.k=1, max.k=9, ...) {
    .min.k <- min.k
    .max.k <- max.k
    res <- mclust::Mclust(x, G=.min.k:.max.k, ...)
    while(res$G == .max.k) {
        .min.k <- .max.k
        .max.k <- 2*.max.k
        res <- mclust::Mclust(x, G=.min.k:.max.k, ...)
    }
    list(mix=res, k=res$G)
}

#' @import BAMBI
categorize.BAMBI <- function(x, min.k=1, max.k=10, ic='BIC', ...) {
    subcommand <- function(x, ..min.k, ..max.k) {
        capture.output({
            res <- BAMBI::fit_incremental_angmix(model='wnorm',
                                                 data=x,
                                                 show.progress=FALSE,
                                                 silent=TRUE,
                                                 crit=ic,
                                                 start_ncomp=..min.k,
                                                 max_ncomp=..max.k)
            res <- BAMBI::bestmodel(res)
        })
        res
    }
    .min.k <- min.k
    .max.k <- max.k
    xang <- scales::rescale(x, to=c(0, 2*pi))
    res <- subcommand(xang, .min.k, .max.k)
    while(res$ncomp == .max.k) {
        .min.k <- .max.k
        .max.k <- 2*.max.k
        res <- subcommand(xang, .min.k, .max.k)
    }
    if (res$ncomp > 1) capture.output(res <- BAMBI::fix_label(res))
    mix <- BAMBI::pointest(res)
    list(mix=mix, k=res$ncomp)
}



categorize.EMCluster <- function(x, min.k=1, ic='BIC', ...) {
    nx <- length(x)
    mx <- as.matrix(x)

    catf <- function(k) {
        mix <- EMCluster::init.EM(mx, nclass=k, EMC=EMCluster::.EMC)
        ic <- EMCluster::em.ic(mx, mix)
        list(mix=mix, ic=ic)
    }

    # search space
    k <- min.k
    cata <- catf(k)

    # do while we keep seeing IC improvements
    repeat {
        # make sure we have enough observations to estimate k means and variances (2*k dofs)
        # plus 1 dof for optimization i think? otherwise we have collinearity
        if(2*(k+1) > nx - 1) break
        #if(k+1 > i/2) break

        # search at higher k
        catb <- catf(k+1)

        # get IC differences
        ics <- names(cata$ic)
        ic.diffs <- sapply(ics, function(ic) catb$ic[[ic]] - cata$ic[[ic]])
        names(ic.diffs) <- ics

        # if ic improvement, accept bump in k
        if(ic.diffs[ic] < 0) {
            k <- k+1
            cata <- catb
        } else {
            break
        }
    }
    list(mix=cata$mix, k=k)
}

#' Assigns category grade of membership to positions
#'
#' @param x positions
#' @param mix category mixture distribution
#' @param peak.difference grade of membership = difference from peak GOM within category
#' @importFrom EMCluster dmixmvn
#' @export
categorize_positions <- function(x, mix, peak.difference=TRUE) {
    res <- sapply(1:mix$nclass, function(ci) {
            cat.ps <- dmixmvn(as.matrix(x), emobj=NULL, log=TRUE,
                              pi=mix$pi[ci],
                              Mu=mix$Mu[ci,,drop=F],
                              LTSigma=mix$LTSigma[ci,,drop=FALSE])
            if(peak.difference) {
                cat.ps - max(cat.ps)
            } else {
                cat.ps
            }
        })
    res
}

#' Get grades of membership for an object according to a mixture
#' 
#' @param x object to categorize
#' @param mix mixtures distributions
#' @param logp return grades-of-membership in log form
#' @export
get_goms <- function(x, mix, logp=TRUE) {
    mixps <- get_mix_parameters(mix)
    sapply(1:mixps$k, function(i) {
            ps <- dnorm(x, mean=mixps$mean[i], sd=mixps$sd[i], log=logp)
            peak <- dnorm(mixps$mean[i], mean=mixps$mean[i], sd=mixps$sd[i], log=logp)
            if(logp) {
                ps - peak
            } else {
                ps/peak
            }
        })
}

#' Get grades of membership for an object according to a mixture (linear landscape)
#' 
#' @param x object to categorize
#' @param mix mixtures distributions
#' @param logp return grades-of-membership in log form
#' @export
get_goms_linear <- function(x, mix, logp = TRUE) {
    mixps <- get_mix_parameters(mix)
    sapply(1:mixps$k, function(i) {
            ps <- dnorm(x, mean=mixps$mean[i], sd=mixps$sd[i], log=logp)
            peak <- dnorm(mixps$mean[i], mean=mixps$mean[i], sd=mixps$sd[i], log=logp)
            if(logp) {
                ps - peak
            } else {
                ps/peak
            }
        })
}

#' Get grades of membership for an object according to a mixture (Salop landscape)
#' 
#' Currently assuming BAMBI
#' @param x object to categorize
#' @param mix mixtures distributions
#' @param logp return grades-of-membership in log form
#' @export
get_goms_salop <- function(x, mix, logp = TRUE) {
    mixps <- get_mix_parameters.BAMBI(mix)
    xang <- scales::rescale(x, to=c(0, 2*pi))
    sapply(1:mixps$k, function(i) {
            ps <- dwnorm(xang, mu=mixps$mean[i], kappa=mixps$kappa[i], log=logp)
            peak <- dwnorm(mixps$mean[i], mu=mixps$mean[i], kappa=mixps$kappa[i], log=logp)
            if(logp) {
                ps - peak
            } else {
                ps/peak
            }
        })
}

get_goms.EMCluster <- function(x, mix, logp=TRUE) {
    res <- sapply(1:mix$nclass, function(ci) {
            cat.ps <- dmixmvn(as.matrix(x), emobj=NULL, log=logp,
                              pi=mix$pi[ci],
                              Mu=mix$Mu[ci,,drop=F],
                              LTSigma=mix$LTSigma[ci,,drop=FALSE])
            #if(peak.difference) {
            #    cat.ps - max(cat.ps)
            #} else {
            #    cat.ps
            #}
            cat.ps
        })
    res
}

get_goms.mclust <- function(x, mix, logp=TRUE) {
    sapply(1:mix$G, function(i) {
            mean <- mix$parameters$mean[i]
            sigma <- sqrt(mix$parameters$variance$sigmasq[i])
            ps <- dnorm(x, mean=mean, sd=sigma, log=logp)
            peak <- dnorm(mean, mean=mean, sd=sigma, log=logp)

        })
    if(logp) {
        log(mix$z)
    } else {
        mix$z
    }
}

#' Extract mixture parameters from a mixture object
#'
#' Generalizes over the various ways of calculating mixtures that we have
get_mix_parameters <- function(mix) UseMethod('get_mix_parameters')

get_mix_parameters.Mclust <- function(mix) {
    list(k=mix$G,
         prob=mix$parameters$pro,
         mean=mix$parameters$mean,
         sd=rep(sqrt(mix$parameters$variance$sigmasq), length.out=mix$G))
}

get_mix_parameters.BAMBI <- function(mix) {
    list(k=ncol(mix),
         prob=mix['pmix',],
         mean=mix['mu',],
         kappa=mix['kappa',])
}

# TODO: for completeness, implement:
# get_mix_parameters.EMCluster

#' Find GOM for top two categories
#'
#' @param goms grades of membership for all categories
#' @return list(goms=GOM for top, second categories; ords=category ids)
#' @export
top_two_categories <- function(goms) {
    # find distance to second category
    # within each row, sort goms from highest to lowest
    goms_ords <- sapply(1:nrow(goms), function(ri) {
            ord <- order(goms[ri,], decreasing=TRUE)
            c(goms[ri,ord], ord) }
        )
    goms_ords <- t(goms_ords)

    #if(iter.cats$nclass > 1) {
    if(ncol(goms) > 1) {
        #res <- goms_ords[,c(1:2, iter.cats$nclass + 1:2)]
        #res <- goms_ords[,c(1:2, ncol(goms) + 1:2)]
        res <- list(goms=goms_ords[,1:2], ords=goms_ords[,1:2 + ncol(goms)])
    } else {
        # if we only have one category, second category has -Inf GOM, and no order
        #res <- cbind(goms[,1], -Inf, 1, NA)
        res <- list(goms=cbind(goms[,1], -Inf), ords=cbind(goms_ords[,2], NA))
    }
    res
}

############################################################
# GOM agent M(ean) adjustment

gom.log.Madj <- function(xdelta, gom.mean, gom.sd, gom.peak, verbose=.verbose$NONE, ...) {
    res <- dnorm(xdelta, mean=gom.mean, sd=gom.sd, log=TRUE) - gom.peak
    if(verbose >= .verbose$DEBUG2)
        cat(sprintf('Madj log, xdelta %s, mu %s, sd %s, res %s\n',
                    xdelta, gom.mean, gom.sd, res))
    res
}

gom.log.Mpadj <- function(xdelta, gom.mean, gom.sd, gom.peak, verbose=.verbose$NONE, ...) {
    res <- -2*(xdelta-gom.mean)/(gom.sd^2)
    if(verbose >= .verbose$DEBUG2)
        cat(sprintf('Mpadj log, xdelta %s, mu %s, sd %s, res %s\n',
                    xdelta, gom.mean, gom.sd, res))
    res
}

gom.Madj <- function(xdelta, gom.mean, gom.sd, gom.peak, verbose=.verbose$NONE, ...) {
    res <- exp(dnorm(xdelta, mean=gom.mean, sd=gom.sd, log=TRUE) - gom.peak)
    if(verbose >= .verbose$DEBUG2)
        cat(sprintf('Madj, xdelta %s, mu %s, sd %s, res %s\n',
                    xdelta, gom.mean, gom.sd, res))
    res
}

gom.Mpadj <- function(xdelta, gom.mean, gom.sd, gom.peak, verbose=.verbose$NONE, ...) {
    res <- -2*((xdelta-gom.mean)/(gom.sd^2))*gom.Madj(xdelta, gom.mean, gom.sd, gom.peak, verbose)
    if(verbose >= .verbose$DEBUG2)
        cat(sprintf('Mpadj, xdelta %s, mu %s, sd %s, res %s\n',
                    xdelta, gom.mean, gom.sd, res))
    res
}
balachia/pcPack documentation built on Dec. 19, 2021, 6:40 a.m.