#' @importFrom purrr map map2
#' @import dplyr
#' @import tidyr
#' @import assertthat
#' @importFrom mvtnorm dmvnorm
NULL
#' Likelihood of data under one category's model
#'
#' Thin wrapper around \code{mvtnorm::dmvnorm}: evaluates the (log) density
#' of each observation under a single category's multivariate Gaussian.
#'
#' @param mod multivariate normal model (list with mean vector \code{mu} and
#'   covariance matrix \code{Sigma})
#' @param dat matrix with observations in rows and dimensions in columns,
#'   passed to \code{mvtnorm::dmvnorm}
#' @param ... additional arguments passed on to \code{mvtnorm::dmvnorm}
#'   (e.g. \code{log=TRUE})
#' @return numeric vector with one (log) likelihood per row of \code{dat}
#'
#' @export
model_lhood <- function(mod, dat, ...) {
  mvtnorm::dmvnorm(dat, mean = mod$mu, sigma = mod$Sigma, ...)
}
#' Marginal likelihood of data under mixture model
#'
#' Calls \code{\link{model_lhood}} on each model and takes the average (assumes
#' equal prior/mixing weights)
#'
#' @param mods list of models in mixture.
#' @param data matrix with observations in rows and dimensions in columns
#' @param log =TRUE returns log likelihood (default)
#' @param ... additional arguments passed to model_lhood (mvtnorm::dmvnorm)
#' @return vector with marginal likelihood for each row in data.
#'
#' @export
marginal_model_lhood <- function(mods, data, log=TRUE, ...) {
  ## averaging must happen on the log scale when log-lhoods are requested,
  ## hence log_mean_exp instead of a plain mean
  if (log) {
    agg_fun <- log_mean_exp
  } else {
    agg_fun <- mean
  }
  mods %>%
    map(model_lhood, data, log=log, ...) %>%
    ## list of categories' (log)lhood vectors -> category x token matrix.
    ## do.call(rbind, .) replaces purrr::lift(rbind)(), which was deprecated
    ## and removed in purrr 1.0.0
    do.call(rbind, .) %>%
    apply(2, agg_fun)  # column-wise aggregate = marginal lhood per token
}
#' Create data matrix for a model
#'
#' Extracts columns from data frame with names from model's \code{mu}
#' field. Operates recursively, so it will work on a list of models
#'
#' @param model a model (with field \code{mu}, a named vector) or a list of
#' models that will be flattened until there's a field named \code{mu}.
#' @param data a data frame with columns of the same names as the entries in
#' model's \code{mu}
#' @return a matrix with observations in rows and dimensions in columns.
#' @export
#' Create data matrix for a model
#'
#' Extracts columns from data frame with names from model's \code{mu}
#' field. Operates recursively, so it will work on a list of models
#'
#' @param model a model (with field \code{mu}, a named vector) or a list of
#'   models that will be flattened until there's a field named \code{mu}.
#' @param data a data frame with columns of the same names as the entries in
#'   model's \code{mu}
#' @return a matrix with observations in rows and dimensions in columns.
#' @export
model_matrix <- function(model, data) {
  ## traverse list of models depth first to find basic model with 'mu' field
  ## to get colnames
  while (! 'mu' %in% names(model)) {
    ## an atomic leaf means we descended past all list structure without
    ## finding a fitted model: stop rather than loop/descend forever
    if (is.atomic(model)) {
      stop("Can't find fitted means named 'mu' to determine cue columns in new data")
    }
    model <- model[[1]]
  }
  cue_columns <- names(model$mu)
  data %>%
    ungroup() %>%
    ## all_of() replaces the deprecated select_(.dots = ...) SE interface;
    ## it errors informatively if any cue column is missing from `data`
    select(all_of(cue_columns)) %>%
    as.matrix()
}
#' Compute likelihood of many models on the same data
#'
#' @param data data.frame with the cue columns named in the models' \code{mu}
#'   (passed through \code{\link{model_matrix}})
#' @param model_list list of models to calculate likelihood
#' @param lhood_fun likelihood function (e.g. \code{model_lhood} or
#'   \code{marginal_model_lhood})
#' @return data.frame of likelihoods, with one column per model, one row per row
#'   in data
#' @export
apply_model_list <- function(data, model_list, lhood_fun) {
  ## build the data matrix once so every model's likelihood call is
  ## vectorized over the same observations
  data_mat <- model_matrix(model_list, data)
  model_list %>%
    map(~ lhood_fun(., data_mat)) %>%
    ## as_tibble() replaces the deprecated as_data_frame() alias
    as_tibble()  # data frame with one column per model
}
#' Convert data frame of models to named list
#'
#' @param d data.frame of models
#' @param names_col (optional) name of column to be used for names of models
#' @param model_field ='model' name of column with models
#' @return a named list of models
#' @export
list_models <- function(d, names_col=NULL, model_field='model') {
  ## pull the list-column of models out of the data frame
  model_list <- as.list(d[[model_field]])
  ## optionally label each model with the value from the names column
  if (!is.null(names_col)) {
    names(model_list) <- d[[names_col]]
  }
  model_list
}
#' Convert named list of models to data frame
#'
#' Undoes \code{\link{list_models}}
#'
#' @param l named list of models
#' @param names_col name of column generated for names
#' @param model_col ='model' name of column generated for models
#' @return a data frame with names in names_col and models in model_col
#' @export
unlist_models <- function(l, names_col, model_col='model') {
  ## tibble() replaces the deprecated data_frame() constructor; the auto
  ## column names are immediately overwritten by set_names()
  tibble(names(l), l) %>% set_names(c(names_col, model_col))
}
## Append a row-number column `id_` to a data frame; used as a temporary
## per-token key to join tokens back onto their likelihoods/posteriors
## after reshaping (internal helper, not exported)
add_id_col <- function(x) mutate(x, id_=row_number())
#' Use trained models to classify observed cue values
#'
#' Names of columns are taken from the names of the first \code{mu} in models.
#'
#' @param data Data frame with grouping columns used to generate models
#'   and F1,F2
#' @param models Data frame with column \code{category} and list column `model`
#'   (as generated by `train_models`).
#' @param category Name of the column in \code{models} that has category labels
#' @return Data frame with one row per data x model combination, with likelihood
#'   in lhood, posterior probability in posterior. The model with the highest
#'   posterior for each token has TRUE in posterior_choice.
#'
#' @export
classify <- function(data, models, category) {
  assert_that(is.data.frame(models))
  assert_that(category %in% names(models))
  assert_that(is.data.frame(data))
  ## mirror the grouping of `models` onto `data`, so each group of tokens is
  ## scored against the models that were trained for that group
  model_groups <- groups(models)
  if (is.null(model_groups)) {
    data <- data %>% ungroup()
  } else {
    data <- data %>% group_by_(.dots = model_groups)
  }
  ## make a named list of category models for each group
  model_lists <-
    models %>%
    do(models = list_models(., category))
  ## A more efficient implementation than rowwise: get a matrix of all the data in
  ## a group that draws on the same models, to take advantage of vectorization
  ## in `mvnorm`
  if (is.null(model_groups)) {
    ## no grouping variables: simulate effects of nest + left_bind
    data_and_models <- bind_cols(data_frame(data = list(data)),
                                 model_lists)
  } else {
    ## one row per group: tokens nested in `data`, matching models joined on
    ## via the group columns (left_join's default by= is the shared columns)
    data_and_models <-
      data %>%
      nest() %>%
      left_join(model_lists)
  }
  data_and_models %>%
    ## per group: likelihood of every token under every category model
    ## (raw, non-log likelihoods, since model_lhood's default is log=FALSE)...
    mutate(lhoods = map2(data, models,
                         ~ apply_model_list(.x, .y, model_lhood)),
           ## ...then reshape to long and normalize within each token (id_)
           ## to get posteriors under equal priors; the MAP category gets
           ## posterior_choice == TRUE
           posteriors = map(lhoods,
                            . %>%
                              add_id_col() %>%
                              gather(model, lhood, -id_) %>%
                              group_by(id_) %>%
                              mutate(posterior = lhood / sum(lhood),
                                     posterior_choice = posterior==max(posterior)))
           ) %>%
    ## re-attach the original token columns to their posteriors via the
    ## temporary id_ key, then drop it and restore the models' grouping
    unnest(map2(data, posteriors,
                ~ inner_join(.x %>% add_id_col(), .y, by='id_') %>%
                  select(-id_))) %>%
    group_by_(.dots=model_groups)
}
#' @describeIn classify Special case where \code{category = "Vowel"}
#' @export
classify_vowels <- function(data, models) {
  ## convenience wrapper: classification by vowel category
  classify(data, models, category='Vowel')
}
#' Classify test data with cross-validated models
#'
#' See \code{\link{train_models_indexical_with_holdout}}.
#'
#' @param data_and_models output of train_models_indexical_with_holdout, with
#'   one row per talker (or holdout grouping level), and list columns for
#'   \code{data_test} and \code{models}.
#' @return Data frame with one row per combination of data row and model, with
#'   columns corresponding to the held-out and grouping variables, plus model,
#'   lhood (total log-likelihood of data under model), log_posterior, posterior,
#'   and posterior_choice (TRUE for the MAP model and FALSE otherwise)
#'
#' @export
classify_indexical_with_holdout <- function(data_and_models) {
  ## collapse per-token log-likelihoods into one posterior per model
  lhoods_to_posterior <- function(lhoods) {
    lhoods %>%
      ## NSE group_by() replaces the deprecated SE verb group_by_('model')
      group_by(model) %>%
      summarise(lhood = sum(lhood)) %>% # aggregate log-lhood within talkers
      # normalize to get posterior (subtraction on the log scale)
      mutate(log_posterior = lhood - log_sum_exp(lhood),
             posterior = exp(log_posterior),
             posterior_choice = posterior == max(posterior))
  }
  data_and_models %>%
    ## marginal (mixture) log-lhood of each held-out token under each
    ## talker's models...
    mutate(lhoods = map2(data_test, models,
                         ~ apply_model_list(.x, .y, marginal_model_lhood)),
           ## ...reshaped to long and collapsed to per-model posteriors
           posteriors = map(lhoods,
                            . %>%
                              gather(model, lhood, everything()) %>%
                              lhoods_to_posterior())) %>%
    unnest(posteriors)
}
#' Two methods to compute accuracy from classifier
#'
#' @param tbl classifier output
#' @param category_col name of column specifying true category
#' @param method if 'choice' (default), accuracy is 1 if choice is correct,
#'   0 if not. if 'posterior', accuracy is posterior probability of true
#'   category
#' @return tbl with accuracy in column \code{accuracy}
#'
#' @export
get_accuracy <- function(tbl, category_col, method='choice') {
  assert_that(method %in% c('choice', 'posterior'))
  assert_that(has_name(tbl, category_col))
  ## lazily-evaluated expression `<category_col> == category_model`, built
  ## with lazyeval for the old dplyr SE verbs (filter_/mutate_).
  ## NOTE(review): this assumes `tbl` has a column named `category_model`
  ## holding each model's category label -- confirm against the classifier
  ## output upstream, since classify() above produces a column named `model`.
  category_eq_mod <- lazyeval::interp(~var==category_model,
                                      var=as.name(category_col))
  if (method == 'choice') {
    ## keep only the MAP choice per token; accuracy is TRUE/FALSE for
    ## whether the chosen model's category matches the true one
    tbl %>%
      filter(posterior_choice) %>%
      mutate_(accuracy = category_eq_mod)
  } else if (method == 'posterior') {
    ## keep only the row where the model matches the true category;
    ## accuracy is the posterior probability assigned to that category
    tbl %>%
      filter_(category_eq_mod) %>%
      mutate_(accuracy = ~ posterior)
  }
}
## NOTE(review): removed stray web-page footer text ("Add the following code
## to your website..." / "...read Embedding Snippets.") accidentally pasted
## into this source file; it was not valid R code.