## R/classification.R
##
## Defines functions: model_lhood, marginal_model_lhood, model_matrix,
## apply_model_list, list_models, unlist_models, add_id_col, classify,
## classify_vowels, classify_indexical_with_holdout, get_accuracy

#' @importFrom purrr map map2
#' @import dplyr
#' @import tidyr
#' @import assertthat
#' @importFrom mvtnorm dmvnorm
NULL

#' Likelihood of data under one category's model
#'
#' @param mod multivariate normal model (list with mean vector \code{mu} and
#'   covariance matrix \code{Sigma})
#' @param dat matrix with observations in rows and dimensions in columns,
#'   passed to \code{mvtnorm::dmvnorm}.
#' @param ... additional arguments are passed to dmvnorm (e.g. \code{log=TRUE})
#' @return numeric vector of (log-)densities, one per row of \code{dat}
#'
#' @export
model_lhood <- function(mod, dat, ...) mvtnorm::dmvnorm(dat, mod$mu, mod$Sigma, ...)

#' Marginal likelihood of data under mixture model
#'
#' Calls \code{\link{model_lhood}} on each model and takes the average (assumes
#' equal prior/mixing weights). In log space the average is taken with
#' log-mean-exp for numerical stability.
#'
#' @param mods list of models in mixture.
#' @param data matrix with observations in rows and dimensions in columns
#' @param log =TRUE returns log likelihood (default)
#' @param ... additional arguments passed to model_lhood (mvtnorm::dmvnorm)
#' @return vector with marginal likelihood for each row in data.
#'
#' @export
marginal_model_lhood <- function(mods, data, log=TRUE, ...) {
  ## averaging log-likelihoods requires log-mean-exp, not a plain mean
  if (log) {
    agg_fun <- log_mean_exp
  } else {
    agg_fun <- mean
  }

  mods %>%
    map(model_lhood, data, log=log, ...) %>% # list of categories' (log)lhood vectors
    do.call(rbind, .) %>%                    # category x token matrix
    apply(2, agg_fun)                        # marginal token lhoods
}

#' Create data matrix for a model
#'
#' Extracts columns from data frame with names from model's \code{mu}
#' field. Operates recursively, so it will work on a (nested) list of models.
#'
#' @param model a model (with field \code{mu}, a named vector) or a list of
#'   models that will be flattened until there's a field named \code{mu}.
#' @param data a data frame with columns of the same names as the entries in
#'   model's \code{mu}
#' @return a matrix with observations in rows and dimensions in columns.
#' @export
model_matrix <- function(model, data) {

  ## traverse list of models depth first to find basic model with 'mu' field
  ## to get colnames; bail out explicitly instead of looping forever or
  ## failing with an opaque subscript error
  while (! 'mu' %in% names(model)) {
    if (is.atomic(model) || length(model) == 0) {
      stop("Can't find fitted means named 'mu' to determine cue columns in new data")
    }
    model <- model[[1]]
  }

  cue_columns <- names(model$mu)

  data %>%
    ungroup() %>%
    select(all_of(cue_columns)) %>%
    as.matrix()
}

#' Compute likelihood of many models on the same data
#'
#' The cue columns are determined from the models' \code{mu} names (see
#' \code{\link{model_matrix}}), extracted once as a matrix so that
#' \code{lhood_fun} can be vectorized over all rows at once.
#'
#' @param data data.frame with the cue columns named in the models' \code{mu}
#' @param model_list list of models to calculate likelihood
#' @param lhood_fun likelihood function (e.g. \code{\link{model_lhood}} or
#'   \code{\link{marginal_model_lhood}})
#' @return data.frame of likelihoods, with one column per model, one row per
#'   row in data
#' @export
apply_model_list <- function(data, model_list, lhood_fun) {
  data_mat <- model_matrix(model_list, data)
  model_list %>%
    map(~ lhood_fun(., data_mat)) %>%
    as_tibble()                           # data frame with one column per model
}


#' Convert data frame of models to named list
#'
#' Pulls the list column \code{model_field} out of \code{d}, optionally naming
#' its elements with the values of \code{names_col}.
#'
#' @param d data.frame of models
#' @param names_col (optional) name of column to be used for names of models
#' @param model_field ='model' name of column with models
#' @return a named list of models
#' @export
list_models <- function(d, names_col=NULL, model_field='model') {
  out <- as.list(d[[model_field]])
  if (!is.null(names_col)) {
    names(out) <- d[[names_col]]
  }
  out
}

#' Convert named list of models to data frame
#'
#' Undoes \code{\link{list_models}}
#'
#' @param l named list of models
#' @param names_col name of column generated for names
#' @param model_col ='model' name of column generated for models
#' @return a data frame with names in names_col and models in model_col
#' @export
unlist_models <- function(l, names_col, model_col='model') {
  ## tibble() replaces the defunct data_frame(); base setNames() replaces
  ## set_names(), which was never imported (only map/map2 come from purrr)
  setNames(tibble(name = names(l), model = l),
           c(names_col, model_col))
}

## Add a temporary row-identifier column `id_`, used to join per-token
## posteriors back onto the original data rows in classify().
add_id_col <- function(x) mutate(x, id_ = row_number())

#' Use trained models to classify observed cue values
#'
#' Names of the cue columns are taken from the names of the first \code{mu}
#' found in \code{models} (see \code{\link{model_matrix}}).
#'
#' @param data Data frame with grouping columns used to generate models
#'   and F1,F2
#' @param models Data frame with column \code{category} and list column `model`
#'   (as generated by `train_models`). Any grouping on this data frame is
#'   mirrored onto \code{data}, so each group is classified with its own
#'   set of models.
#' @param category Name of the column in \code{models} that has category labels
#' @return Data frame with one row per data x model combination, with likelihood
#'   in lhood, posterior probability in posterior. The model with the highest
#'   posterior for each token has TRUE in posterior_choice.
#'
#' @export
classify <- function(data, models, category) {

  assert_that(is.data.frame(models))
  assert_that(category %in% names(models))
  assert_that(is.data.frame(data))

  ## Mirror the grouping structure of `models` onto `data`, so each group of
  ## data rows is paired with the models fit for that group.
  ## NOTE(review): group_by_(.dots=) is the old dplyr SE interface, kept here
  ## because this pipeline depends on old dplyr/tidyr semantics throughout.
  model_groups <- groups(models)
  if (is.null(model_groups)) {
    data <- data %>% ungroup()
  } else {
    data <- data %>% group_by_(.dots = model_groups)
  }

  ## make a named list of category models for each group: one row per group,
  ## list column `models` holding a name -> model list
  model_lists <-
    models %>%
    do(models = list_models(., category))

  ## A more efficient implementation than rowwise: get a matrix of all the data in
  ## a group that draws on the same models, to take advantage of vectorization
  ## in `mvnorm`
  if (is.null(model_groups)) {
    ## no grouping variables: simulate effects of nest + left_bind
    data_and_models <- bind_cols(data_frame(data = list(data)),
                                            model_lists)
  } else {
    ## one nested row per group; left_join's key is implicit (the grouping
    ## columns), so dplyr will message which columns it joined on
    data_and_models <- 
      data %>%
      nest() %>%
      left_join(model_lists)
  }

  ## For each group: compute per-token likelihoods under each category model,
  ## normalize within token to get posteriors, and flag the MAP category.
  ## The temporary id_ column links posterior rows back to their data rows
  ## and is dropped after the join.
  data_and_models %>%
    mutate(lhoods = map2(data, models,
                         ~ apply_model_list(.x, .y, model_lhood)),
           posteriors = map(lhoods,
                            . %>%
                              add_id_col() %>%
                              gather(model, lhood, -id_) %>%
                              group_by(id_) %>%
                              mutate(posterior = lhood / sum(lhood),
                                     posterior_choice = posterior==max(posterior)))
           ) %>%
    unnest(map2(data, posteriors,
                ~ inner_join(.x %>% add_id_col(), .y, by='id_') %>%
                  select(-id_))) %>%
    group_by_(.dots=model_groups)
  
}

#' @describeIn classify Special case where \code{category = "Vowel"}
#' @export
classify_vowels <- function(data, models) {
  classify(data, models, category = 'Vowel')
}




#' Classify test data with cross-validated models
#'
#' See \code{\link{train_models_indexical_with_holdout}}.
#'
#' @param data_and_models output of train_models_indexical_with_holdout, with
#'   one row per talker (or holdout grouping level), and list columns for
#'   \code{data_test} and \code{models}.
#' @return Data frame with one row per combination of data row and model, with
#'   columns corresponding to the held-out and grouping variables, plus model,
#'   lhood (total log-likelihood of data under model), log_posterior, posterior,
#'   and posterior_choice (TRUE for the MAP model and FALSE otherwise)
#'
#' @export
classify_indexical_with_holdout <- function(data_and_models) {

  ## Collapse per-token log-likelihoods into one posterior per model:
  ## sum log-lhoods over tokens, then normalize with log-sum-exp so the
  ## exponentiation is numerically stable.
  lhoods_to_posterior <- function(lhoods) {
    lhoods %>%
      group_by(model) %>%
      summarise(lhood = sum(lhood)) %>%     # aggregate log-lhood within talkers
      # normalize to get posterior
      mutate(log_posterior = lhood - log_sum_exp(lhood),
             posterior = exp(log_posterior),
             posterior_choice = posterior == max(posterior))
  }

  data_and_models %>%
    mutate(lhoods = map2(data_test, models,
                         ~ apply_model_list(.x, .y, marginal_model_lhood)),
           posteriors = map(lhoods,
                            . %>%
                              gather(model, lhood, everything()) %>%
                              lhoods_to_posterior())) %>%
    unnest(posteriors)

}


#' Two methods to compute accuracy from classifier
#'
#' @param tbl classifier output (must have columns \code{category_model},
#'   \code{posterior}, and \code{posterior_choice}, plus the true-category
#'   column named by \code{category_col})
#' @param category_col name of column specifying true category
#' @param method if 'choice' (default), accuracy is 1 if choice is correct,
#'   0 if not. if 'posterior', accuracy is posterior probability of true
#'   category
#' @return tbl with accuracy in column \code{accuracy}
#'
#' @export
get_accuracy <- function(tbl, category_col, method='choice') {
  assert_that(method %in% c('choice', 'posterior'))
  assert_that(has_name(tbl, category_col))

  if (method == 'choice') {
    ## accuracy = was the MAP choice (posterior_choice) the true category?
    tbl %>%
      filter(posterior_choice) %>%
      mutate(accuracy = .data[[category_col]] == .data$category_model)
  } else {
    ## accuracy = posterior probability assigned to the true category
    tbl %>%
      filter(.data[[category_col]] == .data$category_model) %>%
      mutate(accuracy = posterior)
  }
}
## kleinschmidt/phondisttools documentation built on May 20, 2019, 5:57 p.m.