R/reshape-result.r
In emil: Evaluation of Modeling without Information Leakage

Documented in get_importance get_performance get_prediction get_tuning select_.list subtree

#' @importFrom tidyr extract_ gather_ spread_
NULL

#' Pipe operator
#'
#' See \code{\link[magrittr]{\%>\%}} for details.
#'
#' @name %>%
#' @rdname pipe
#' @keywords internal
#' @export
#' @importFrom magrittr %>%
#' @usage lhs \%>\% rhs
NULL

#' Extractor function for modeling result
#' 
#' As opposed to the standard extractor function, this will keep the class.
#' 
#' @param x modeling result object, as produced by \code{\link{evaluate}}.
#' @param ... Sent to \link{Extract}.
#' @noRd
#' @author Christofer \enc{Bäcklin}{Backlin}
#' @export
`[.modeling_result` <- function(x, ...){
    structure(unclass(x)[...], class="modeling_result")
}

#' Extract a subset of a tree of nested lists
#' 
#' Modeling results produced by \code{\link{evaluate}} comes in the
#' form of nested lists. This function can be used to subset or rearrange parts
#' of the results into vectors, matrices or data frames.
#' Also note the \code{\link[emil]{select}} function that provides an extension
#' to the \pkg{dplyr} package for data manipulation.
#' 
#' This function can only be used to extract data, not to assign.
#' 
#' @param x List of lists.
#' @param i Indexes to extract on the first level of the tree. Can also be a
#'   function that will be applied to the downstream result of the function.
#' @param ... Indexes to extract on subsequent levels.
#' @param error_value A template for the return value in case it is missing or
#'   invalid. Note that \code{NA} is a \code{\link{logical}} by default,
#'   causing \code{subtree} to also convert existing results to logicals.
#'   To get around this, please specify it as \code{as.numeric(NA)},
#'   \code{as.character(NA)}, or similar (see the example below).
#' @param warn Specifies whether warnings should be displayed (\code{0}),
#'   ignored (\code{-1}), or break execution (\code{1}). Works like the
#'   \code{\link{options}} parameter \code{warn}.
#' @param simplify Whether to collapse results into vectors or matrices when
#'   possible (\code{TRUE}) or to preserve the original tree structure as a
#'   list (\code{FALSE}).
#' @return A subset of the list tree.
#' @example examples/subtree.r
#' @seealso \code{\link{select}}, \code{\link{get_prediction}},
#'   \code{\link{get_importance}}, \code{\link{get_tuning}}.
#' @author Christofer \enc{Bäcklin}{Backlin}
#' @importFrom methods as
#' @export
subtree <- function(x, i, ..., error_value, warn, simplify=TRUE){
    if(missing(error_value)) error_value <- NULL
    if(missing(warn)) warn <- is.null(error_value)
    if(is.null(error_value) && warn < 1){
        warning("With no `error_value` `warn` is ignored and all errors break the execution")
        warn <- 1
    }
    ret <- if(is.function(i)){
        if(missing(...)){
            i(x)
        } else {
            i(subtree(x, ..., error_value=error_value, warn=warn, simplify=simplify))
        }
    } else if(missing(...)){
        if(is.null(error_value)){
            x[i]
        } else {
            coerce_class <- function(x){
                x <- as(x, class(error_value))
                if(length(x) != length(error_value))
                    stop(sprintf("values must be length %i, but result is length %i",
                                 length(error_value), length(x)))
                x
            }
            lapply(x[i], function(xi){
                if(warn < 1) tryCatch({
                    coerce_class(xi)
                }, error = function(err){
                    if(warn == 0)
                        warning(err$message)
                    error_value
                }) else {
                    coerce_class(xi)
                }
            })
        }
    } else {
        lapply(x[i], subtree, ..., error_value=error_value, warn=warn, simplify=simplify)
    }
    if(simplify){
        if(length(ret) == 1){
            ret <- ret[[1]]
        } else if(all(sapply(ret, length) == 1)){
            ret <- unlist(ret, recursive=FALSE)
        } else if(all(sapply(lapply(ret, dim), is.null)) &&
                  all(sapply(ret, length) == length(ret[[1]]))){
            ret.class <- sapply(ret, class)
            ret.na <- sapply(ret, function(x) all(is.na(x)))
            i <- head(which(!ret.na), 1)
            if((is.numeric(ret[[i]]) || is.character(ret[[i]]) || is.logical(ret[[i]])) &&
                    length(unique(ret.class[!ret.na])) == 1){
                ret <- do.call(cbind, ret)
            }
        }
        if(is.null(dim(ret)) && !is.null(dim(x)) && length(ret) == length(x)){
            ret <- array(ret, dim=dim(x), dimnames=dimnames(x))
        }
    }
    ret
}

#' \pkg{emil} and \pkg{dplyr} integration
#' 
#' Modeling results can be converted to tabular format and manipulated using
#' \pkg{dplyr} and other Hadleyverse packages. This is accomplished by a class
#' specific \code{\link[dplyr]{select_}} function that differs somewhat in syntax
#' from the default \code{\link[dplyr]{select_}}.
#'
#' @param .data Modeling results, as returned by \code{\link{evaluate}}.
#' @param ... Not used, kept for consistency with \code{dplyr}.
#' @param .dots Indices to select on each level of \code{.data}, i.e.
#'   the first index specifies which top level elements of \code{.data} to
#'   select, the second specifies second-level-elements etc.
#'   The last index must select elements that can be converted to a data frame.
#'   In case the desired bottom-level element is related to the observations of
#'   a modeling task, e.g. the predctions of a test set, you must supply the
#'   resampling scheme used to produce \code{.data} at the appropriate level
#'   (see the examples).
#'
#'   The names of the \code{...} arguments specifies the names of the resulting
#'   data frame. Non-named arguments will be used to traverse the data but not
#'   returned.
#'
#'   In summary the \code{...} indices can be on the following forms:
#'   \describe{
#'     \item{Simple indices}{Anything that can be used to subset objects,
#'       e.g. integers, logicals, or characters.}
#'     \item{Functions}{A function that produces a data frame, vector or
#'       factor.}
#'     \item{Resampling schemes}{The same resampling scheme that was used to
#'       produce the modeling results.}
#'   }
#' @return A \code{\link{data.frame}} in long format.
#' @examples
#' # Produce some results
#' x <- iris[-5]
#' y <- iris$Species
#' names(y) <- sprintf("orchid%03i", seq_along(y))
#' cv <- resample("crossvalidation", y, nfold=3, nrepeat=2)
#' procedures <- list(nsc = modeling_procedure("pamr"),
#'                    rf = modeling_procedure("randomForest"))
#' result <- evaluate(procedures, x, y, resample=cv)
#' 
#' # Get the foldwise error for the NSC method
#' result %>% select(fold = TRUE, "nsc", error = "error")
#'
#' # Compare both methods 
#' require(tidyr)
#' result %>%
#'     select(fold = TRUE, method = TRUE, error = "error") %>%
#'     spread(method, error)
#' require(dplyr)
#' result %>%
#'     select(fold = TRUE, method = TRUE, error = "error") %>%
#'     group_by(method) %>% summarize(mean_error = mean(error))
#'
#' # Investigate the variability in estimated class 2 probability across folds
#' result %>%
#'     select(fold = cv, "nsc", "prediction", probability = function(x) x$probability[,2]) %>%
#'     spread(fold, probability)
#' @seealso subtree
#' @author Christofer \enc{Bäcklin}{Backlin}
#' @name select
#' @importFrom dplyr select
#' @export
NULL

#' @rdname select
#' @method select_ list
#' @importFrom dplyr select_
#' @importFrom lazyeval lazy_eval
#' @export
select_.list <- function(.data, ..., .dots){
    select_list(.data, lazy_eval(.dots))
}
#' @method select_ modeling_result
#' @noRd
#' @export
select_.modeling_result <- function(.data, ..., .dots){
    select_list(.data, lazy_eval(.dots))
}
#' @method select_ learning_curve
#' @noRd
#' @export
select_.learning_curve <- function(.data, ..., .dots){
    select_list(.data, lazy_eval(.dots))
}
#' @importFrom data.table rbindlist
select_list <- function(.data, .dots, id=NULL){
    named <- !is.null(names(.dots)) && names(.dots)[1] != ""
    subsetting <- inherits(.dots[[1]], c("numeric", "integer", "logical", "character"))

    if(length(.dots) == 1){
        if(subsetting){
            if(is.data.frame(.data) && (length(.dots[[1]]) > 1 || !named)){
                d <- .data[.dots[[1]]]
            } else {
                d <- data.frame(Value = .data[[.dots[[1]]]])
                if(named && length(d) > 0)
                    names(d) <- names(.dots)
            }
        } else if(is.function(.dots[[1]])){
            d <- .dots[[1]](.data)
            if(!inherits(d, c("data.frame", "NULL"))){
                if(named){
                    d <- data.frame(d)
                    names(d) <- names(.dots)
                } else {
                    stop("Functions must either be named or return a data frame or NULL.")
                }
            }
        } else {
            stop("Invalid argument.")
        }
        if(!is.null(id)){
            if(nrow(d) != length(id)) stop("Fold did not match data.")
            d$id <- id
        }
    } else {
        if(subsetting){
            d <- lapply(.data[.dots[[1]]], select_list, .dots[-1], id=id)
            if(named && length(d) > 0){
                if(is.null(names(d))){
                    d <- data.frame(..tmp = rep(seq_along(d), sapply(d, nrow)),
                                    rbindlist(d))
                } else {
                    d <- data.frame(..tmp = factor(rep(seq_along(d), sapply(d, nrow)),
                                              seq_along(d), names(d)),
                               rbindlist(d))
                }
                names(d)[1] <- names(.dots)[1]
            } else {
                d <- rbindlist(d)
            }
        } else if(is.function(.dots[[1]]) && length(.dots) > 1){
            stop("Not implemented.")
        } else if(inherits(.dots[[1]], "resample")){
            if(!identical(names(.data), names(.dots[[1]])))
                .data <- .data[names(.dots[[1]])]
            r <- if(identical(rownames(.dots[[1]]), as.character(1:nrow(.dots[[1]])))){
                seq_len(nrow(.dots[[1]]))
            } else {
                rownames(.dots[[1]])
            }
            d <- Map(function(x, i) select_list(x, .dots[-1], id=r[index_test(i)]),
                     .data, .dots[[1]])
            d <- data.frame(fold = factor(rep(names(.dots[[1]]), sapply(d, nrow)),
                                          names(.dots[[1]])),
                            rbindlist(d))
        } else {
            stop("Invalid argument.")
        }
    }
    d
}

#' Extract predictions from modeling results
#'
#' @param result Modeling result, as returned by \code{\link{evaluate}} and
#'   \code{\link{evaluate}}.
#' @param resample Resampling scheme used to create the results.
#' @param type The type of prediction to return. The possible types vary between
#'   modeling procedure.
#' @param format Table format of the output. See
#'   \url{http://en.wikipedia.org/wiki/Wide_and_narrow_data} for more info.
#' @return A data frame where the id column refers to the observations.
#' @author Christofer \enc{Bäcklin}{Backlin}
#' @export
get_prediction <- function(result, resample, type="prediction", format=c("long", "wide")){
    stopifnot(inherits(result, "modeling_result"))
    format <- match.arg(format)
    if(is_multi_procedure(result)){
        prediction <- select(result, fold=resample, method=TRUE, type,
                             prediction="prediction")
    } else {
        prediction <- select(result, fold=resample, type,
                             prediction=type)
        names(prediction)[names(prediction) == "prediction"] <- type
    }
    if(format == "wide"){
        spread_(prediction, "fold", type)
    } else {
        prediction
    }
}


#' Feature (variable) importance of a fitted model
#'
#' Note that different methods calculates feature importance in different
#' ways and that they are not directly comparable.
#'
#' When extending the \pkg{emil} framework with your own method, the importance
#' function should return a data frame where one column is called "feature" and
#' the remaining columns are named after the classes.
#'
#' @param object Fitted model.
#' @param format Table format of the output. See
#'   \url{http://en.wikipedia.org/wiki/Wide_and_narrow_data} for more info.
#' @param ... Sent on to the procedure's feature importance scoring function.
#' @return A vector of length p or an p-x-c matrix of feature importance
#'   scores where p is the number of descriptors and c is the number of classes.
#' @examples
#' procedure <- modeling_procedure("pamr")
#' model <- fit("pamr", x=iris[-5], y=iris$Species)
#' get_importance(model)
#' 
#' cv <- resample("crossvalidation", iris$Species, nrepeat=2, nfold=3)
#' result <- evaluate("pamr", iris[-5], iris$Species, resample=cv,
#'                    .save=c(importance=TRUE))
#' get_importance(result)
#' @author Christofer \enc{Bäcklin}{Backlin}
#' @seealso \code{\link{emil}}
#' @export
get_importance <- function(object, format, ...){
    UseMethod("get_importance")
}
#' @method get_importance model
#' @export
get_importance.model <- function(object, format=c("wide", "long"), ...){
    format <- match.arg(format)
    imp <- object$procedure$importance_fun(object$model, ...)
    if(format == "long"){
        gather_(imp, "class", "importance", setdiff(colnames(imp), "feature"))
    } else {
        imp
    }
}
#' @method get_importance modeling_result
#' @export
get_importance.modeling_result <- function(object, format=c("wide", "long"), ...){
    format <- match.arg(format)
    reset_notification(id = "importance_missing")
    gatherer <- function(x){
        if(is.null(x$importance)){
            if(is.null(x$model)){
                stop("Could not extract importance since neither models nor importance was saved during evaluation.")
            }
            notify_once("importance_missing", 
                "Feature importance was not calculated during the evaluation. Calculating now.",
                fun=message)
            get_importance(x$model, "wide")
        } else {
            x$importance
        }
    }
    if(is_multi_procedure(object)){
        imp <- select(object, fold=TRUE, method=TRUE, gatherer)
    } else {
        imp <- select(object, fold=TRUE, gatherer)
    }
    if(format == "long"){
        gather_(imp, "class", "importance")
    } else {
        imp
    }
}
#' @method get_importance list
#' @export
get_importance.list <- function(object, format=c("wide", "long"), ...){
    format <- match.arg(format)
    lapply(object, get_importance, format=format, ...)
    # FIXME to one table
}

#' Extract parameter tuning statistics
#' 
#' @param object Fitted model or modeling procedure
#' @return A data frame of tuning statistics in long format.
#' @examples
#' procedure <- modeling_procedure("randomForest",
#'     parameter = list(mtry = c(1, 3),
#'                      nodesize = c(4, 10)))
#' model <- fit(procedure, x=iris[-5], y=iris$Species)
#' get_tuning(model)
#' 
#' options(emil_max_indent=4)
#' ho <- resample("holdout", iris$Species, nfold=5)
#' result <- evaluate(procedure, iris[-5], iris$Species, resample=ho,
#'                    .save=c(model=TRUE))
#' get_tuning(result)
#' @author Christofer \enc{Bäcklin}{Backlin}
#' @export
get_tuning <- function(object){
    UseMethod("get_tuning")
}
#' @method get_tuning model
#' @export
get_tuning.model <- function(object){
    get_tuning(object$procedure)
}
#' @method get_tuning modeling_procedure
#' @export
get_tuning.modeling_procedure <- function(object){
    stopifnot(is_tunable(object) && is_tuned(object))
    parameter_frame <- tryCatch({
        x <- Map(function(i, p) data.frame(parameter_set = i, as.data.frame(p)),
            seq_along(object$tuning$parameter),
            object$tuning$parameter)
        if(any(sapply(x, nrow) > 1))
            stop("Only scalar parameter values can be put in a tuning data frame.")
        do.call(rbind, x)
    }, error = function(...){
        warning("Could not convert parameter values to a data frame, using a numeric index instead.")
        data.frame(parameter_set = seq_along(object$tuning$parameter))
    })
    error_frame <- object$tuning$error %>%
            spread_("fold", "error")
    merge(parameter_frame, error_frame, by="parameter_set")
}
#' @method get_tuning modeling_result
#' @export
get_tuning.modeling_result <- function(object){
    tuning.folds <- names(object)
    if(is_multi_procedure(object)){
        method <- if(is.null(names(object[[1]]))){
            seq_along(object[[1]])
        } else {
            names(object[[1]])
        }
        names(method) <- method
        tuning <- lapply(method, function(m){
            parameter <- names(object[[1]][[m]]$model$procedure$tuning$parameter[[1]])
            tuning <- select(object, fold=TRUE, method=m, "model", get_tuning.model) %>%
                gather_("parameter", "value", parameter)
        })
        tuning <- do.call(rbind, tuning)
    } else {
        parameter <- names(object[[1]]$model$procedure$tuning$parameter[[1]])
        tuning <- select(object, fold=TRUE, "model", get_tuning.model) %>%
            gather_("parameter", "value", parameter)
    }
    tuning %>% gather_("tuning_fold", "error", tuning.folds)
}

#' Extract prediction performance
#' 
#' @param result Modeling result, as returned by \code{\link{evaluate}}.
#' @param format Table format of the output. See
#'   \url{http://en.wikipedia.org/wiki/Wide_and_narrow_data} for more info.
#' @return Data frame.
#' @author Christofer \enc{Bäcklin}{Backlin}
#' @export
get_performance <- function(result, format=c("wide", "long")){
    format <- match.arg(format)
    if(is_multi_procedure(result)){
        p <- select(result, fold=TRUE, method=TRUE, error="error")
        if(format == "wide"){
            spread_(p, "method", "error")
        } else {
            p
        }
    } else {
        select(result, fold=TRUE, error="error")
    }
}