# R/textmodel_cnnlstmemb.R

#' \[Experimental\] Convolutional NN + LSTM model fitted to word embeddings
#'
#' A function that combines a convolutional neural network layer with a long
#' short-term memory layer. It is designed to incorporate word sequences,
#' represented as sequentially ordered word embeddings, into text
#' classification. The model takes as an input a \pkg{quanteda} tokens object.
#'
#' @param x tokens object
#' @inheritParams quanteda.textmodels::textmodel_svm
#' @param dropout1 A numeric value between 0 and 1; the rate at which units
#'   are dropped from the output of the embedding layer.
#' @param dropout2 A numeric value between 0 and 1; the rate at which units
#'   are dropped from the output of the convolutional layer.
#' @param dropout3 A numeric value between 0 and 1; the dropout rate for the
#'   linear transformation of the inputs to the LSTM layer.
#' @param dropout4 A numeric value between 0 and 1; the dropout rate for the
#'   linear transformation of the recurrent state of the LSTM layer.
#' @param wordembeddim The number of word embedding dimensions to be fit
#' @param cnnlayer A logical value indicating whether to include a
#'   convolutional layer in the neural network model
#' @param filter The number of output filters in the convolution
#' @param kernel_size An integer or list of a single integer, specifying the
#'   length of the 1D convolution window
#' @param pool_size Size of the max pooling windows; see
#'   [keras::layer_max_pooling_1d()]
#' @param units_lstm The number of units in the LSTM layer
#' @param words The maximum number of words used to train the model. Defaults
#'   to the number of features in `x`
#' @param maxsenlen The maximum sentence length of training data
#' @param optimizer optimizer used to fit the model to the training data, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param loss objective loss function, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param metrics metric(s) used to evaluate the model during training, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param ... additional options passed to
#'   [keras::fit.keras.engine.training.Model()]
#' @keywords textmodel
#' @importFrom keras keras_model_sequential to_categorical
#' @importFrom keras layer_dense layer_activation layer_dropout compile fit
#' @importFrom keras layer_embedding layer_conv_1d layer_max_pooling_1d
#'   layer_lstm bidirectional
#' @seealso [save.textmodel_cnnlstmemb()], [load.textmodel_cnnlstmemb()]
#' @export
#' @examples
#' \dontrun{
#' # create dataset with evenly balanced coded & uncoded immigration sentences
#' corpcoded <- corpus_subset(data_corpus_manifestosentsUK,
#'                            !is.na(crowd_immigration_label))
#' corpuncoded <- data_corpus_manifestosentsUK %>%
#'     corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>%
#'     corpus_sample(size = ndoc(corpcoded))
#' corp <- corpcoded + corpuncoded
#'
#' tok <- tokens(corp)
#'
#' tmod <- textmodel_cnnlstmemb(tok,
#'                              y = docvars(tok, "crowd_immigration_label"),
#'                              epochs = 5, verbose = 1)
#'
#' newdata <- tokens_subset(tok, subset = is.na(crowd_immigration_label))
#' pred <- predict(tmod, newdata = newdata)
#' table(pred)
#' tail(texts(corpuncoded)[pred == "Immigration"], 10)
#'
#' }
textmodel_cnnlstmemb <- function(x, y, dropout1 = 0.2, dropout2 = 0.2,
                                 dropout3 = 0.2, dropout4 = 0.2,
                                 wordembeddim = 30, cnnlayer = TRUE,
                                 filter = 48, kernel_size = 5, pool_size = 4,
                                 units_lstm = 128, words = NULL,
                                 maxsenlen = 100, optimizer = "adam",
                                 loss = "categorical_crossentropy",
                                 metrics = "categorical_accuracy", ...) {
    UseMethod("textmodel_cnnlstmemb")
}

#' @export
textmodel_cnnlstmemb.tokens <- function(x, y, dropout1 = 0.2, dropout2 = 0.2, dropout3 = 0.2,
                                        dropout4 = 0.2, wordembeddim = 30, cnnlayer = TRUE, filter = 48,
                                        kernel_size = 5, pool_size = 4, units_lstm = 128, words = NULL,
                                        maxsenlen = 100, optimizer = "adam",
                                        loss = "categorical_crossentropy",
                                        metrics = "categorical_accuracy", ...) {
    stopifnot(ndoc(x) == length(y))
    stopifnot(is.tokens(x))
    y <- as.factor(y)
    result <- list(x = x, y = y, call = match.call(), classnames = levels(y))
    # trim missings for fitting model
    na_ind <- which(is.na(y))
    if (length(na_ind) > 0) {
        # build the logical index from the original length of y, before the
        # missing labels are dropped, so that it matches ndoc(x)
        # (logical indexing is used because negative indexing of tokens is
        # broken in quanteda v2 for now)
        keep <- rep(TRUE, length(y))
        keep[na_ind] <- FALSE
        y <- y[-na_ind]
        x <- x[keep]
    }
    
    x <- tokens2sequences(x, maxsenlen = maxsenlen, keepn = words)
    
    if (is.null(words))
        words <- x$nfeatures
    # "one-hot" encode y
    y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))
    
    # use keras to fit the model
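    # architecture: embedding -> dropout -> [optional conv1d -> max pooling ->
    # dropout] -> bidirectional LSTM -> dense softmax over the outcome classes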
    model <- keras_model_sequential()
    
    model %>%
        layer_embedding(input_dim = words + 1, output_dim = wordembeddim,
                        input_length = maxsenlen) %>%
        layer_dropout(rate = dropout1)
    
    if (cnnlayer) {
        model %>%
            layer_conv_1d(filters = filter, kernel_size = kernel_size,
                          activation = "relu") %>%
            layer_max_pooling_1d(pool_size = pool_size) %>%
            layer_dropout(rate = dropout2)
    }
    
    model %>%
        bidirectional(layer_lstm(units = units_lstm, dropout = dropout3,
                                 recurrent_dropout = dropout4)) %>%
        layer_dense(units = nlevels(y), activation = "softmax")
    
    compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
    history <- fit(model, x$matrix, y2, ...)
    
    # compile, class, and return the result
    result <- c(result,
                nfeatures = x$nfeatures,
                maxsenlen = maxsenlen,
                list(clefitted = model))
    class(result) <- c("textmodel_cnnlstmemb", "textmodel", "list")
    return(result)
}

#' @export
textmodel_cnnlstmemb.tokens2sequences <- function(x, y, dropout1 = 0.2, dropout2 = 0.2, dropout3 = 0.2,
                                                  dropout4 = 0.2, wordembeddim = 30, cnnlayer = TRUE, filter = 48,
                                                  kernel_size = 5, pool_size = 4, units_lstm = 128, words = NULL,
                                                  maxsenlen = 100,
                                                  optimizer = "adam",
                                                  loss = "categorical_crossentropy",
                                                  metrics = "categorical_accuracy", ...) {
    stopifnot(nrow(x$matrix) == length(y))
    stopifnot(is.tokens2sequences(x))
    x <- tokens2sequences(x, maxsenlen = maxsenlen, keepn = words)
    y <- as.factor(y)
    result <- list(x = x, y = y, call = match.call(), classnames = levels(y))
    
    # trim missings for fitting model
    na_ind <- which(is.na(y))
    if (length(na_ind) > 0) {
        # build the logical index before dropping the missing labels so that
        # it matches the number of rows in x$matrix
        keep <- rep(TRUE, length(y))
        keep[na_ind] <- FALSE
        y <- y[-na_ind]
        x$matrix <- x$matrix[keep, , drop = FALSE]
    }
    
    words <- x$nfeatures
    maxsenlen <- ncol(x$matrix)
    # "one-hot" encode y
    y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))
    
    # use keras to fit the model
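    # architecture: embedding -> dropout -> [optional conv1d -> max pooling ->
    # dropout] -> LSTM -> dense softmax over the outcome classes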
    model <- keras_model_sequential()
    
    model %>%
        layer_embedding(input_dim = words + 1, output_dim = wordembeddim,
                        input_length = maxsenlen) %>%
        layer_dropout(rate = dropout1)
    
    if (cnnlayer) {
        model %>%
            layer_conv_1d(filters = filter, kernel_size = kernel_size,
                          activation = "relu") %>%
            layer_max_pooling_1d(pool_size = pool_size) %>%
            layer_dropout(rate = dropout2)
    }
    
    model %>%
        layer_lstm(units = units_lstm, dropout = dropout3,
                   recurrent_dropout = dropout4) %>%
        layer_dense(units = nlevels(y), activation = "softmax")
    
    compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
    history <- fit(model, x$matrix, y2, ...)
    
    # compile, class, and return the result
    result <- c(result,
                nfeatures = x$nfeatures,
                maxsenlen = maxsenlen,
                list(clefitted = model))
    class(result) <- c("textmodel_cnnlstmemb", "textmodel", "list")
    return(result)
}

#' Prediction from a fitted textmodel_cnnlstmemb object
#'
#' `predict.textmodel_cnnlstmemb()` implements class predictions from a
#' fitted convolutional neural network + LSTM model.
#' @param object a fitted [textmodel_cnnlstmemb] model
#' @param newdata a [quanteda::tokens()] or [tokens2sequences()] object on
#'   which prediction should be made
#' @param type the type of predicted values to be returned; see Value
#' @param force make `newdata`'s feature set conformant to the model terms
#' @param ... not used
#' @return `predict.textmodel_cnnlstmemb` returns either a vector of class
#'   predictions for each row of `newdata` (when `type = "class"`), or
#'   a document-by-class matrix of class probabilities (when `type =
#'   "probability"`).
#' @seealso [textmodel_cnnlstmemb()]
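#' @examples
#' \dontrun{
#' # minimal sketch, assuming `tmod` and `tok` from the
#' # textmodel_cnnlstmemb() example
#' predict(tmod, newdata = tok, type = "class")
#' head(predict(tmod, newdata = tok, type = "probability"))
#' }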
#' @keywords textmodel internal
#' @importFrom keras predict_classes predict_proba
#' @importFrom stats predict
#' @export
predict.textmodel_cnnlstmemb <- function(object, newdata = NULL,
                                  type = c("class", "probability"),
                                  force = TRUE,
                                  ...) {
    quanteda:::unused_dots(...)

    type <- match.arg(type)

    if (!is.null(newdata)) {
        if (is.tokens(newdata)) {
            data <- tokens2sequences(newdata, maxsenlen = object$maxsenlen,
                                     keepn = object$nfeatures)
        } else {
            data <- newdata
        }
        # align the feature indices of newdata with those used in training
        t2s_object <- tokens2sequences(object$x, maxsenlen = object$maxsenlen,
                                       keepn = object$nfeatures)
        data <- tokens2sequences_conform(data, t2s_object)
    } else {
        data <- tokens2sequences(object$x, maxsenlen = object$maxsenlen,
                                 keepn = object$nfeatures)
    }

    if (type == "class") {
        pred_y <- predict_classes(object$clefitted, x = data$matrix)
        pred_y <- factor(pred_y, labels = object$classnames,
                         levels = (seq_along(object$classnames) - 1))
        names(pred_y) <- rownames(data$matrix)
    } else if (type == "probability") {
        pred_y <- predict_proba(object$clefitted, x = data$matrix)
        colnames(pred_y) <- object$classnames
        rownames(pred_y) <- rownames(data$matrix)
    }

    pred_y
}

#' @export
#' @importFrom stats na.omit
#' @method print textmodel_cnnlstmemb
print.textmodel_cnnlstmemb <- function(x, ...) {
    layer_names <- gsub(pattern = "_\\d*", "",
                        lapply(x$clefitted$layers, function(z) z$name))
    cat("\nCall:\n")
    print(x$call)
    cat("\n",
        format(length(na.omit(x$y)), big.mark = ","), " training documents; ",
        format(x$nfeatures, big.mark = ","), " fitted features",
        ".\n",
        "Structure: ", paste(layer_names, collapse = " -> "), "\n",
        sep = "")
}

#' summary method for textmodel_cnnlstmemb objects
#' @param object output from [textmodel_cnnlstmemb()]
#' @param ... additional arguments not used
#' @keywords textmodel internal
#' @method summary textmodel_cnnlstmemb
#' @export
summary.textmodel_cnnlstmemb <- function(object, ...) {
    layer_names <- gsub(pattern = "_\\d*", "",
                        lapply(object$clefitted$layers, function(x) x$name))

    result <- list(
        "call" = object$call,
        "model structure" = paste(layer_names, collapse = " -> ")
    )
    as.summary.textmodel(result)
}

#' @export
#' @method print predict.textmodel_cnnlstmemb
print.predict.textmodel_cnnlstmemb <- function(x, ...) {
    print(unclass(x))
}


#' @rdname save.textmodel_mlp
#' @importFrom keras serialize_model
#' @method save textmodel_cnnlstmemb
#' @export
save.textmodel_cnnlstmemb <- function(x, ...) {
    x$clefitted <- serialize_model(x$clefitted)
    save(x, ...)
}

#' @rdname save.textmodel_mlp
#' @importFrom keras unserialize_model
#' @method load textmodel_cnnlstmemb
#' @export
load.textmodel_cnnlstmemb <- function(x, ...) {
    # load() restores the saved object (named "x") into this frame,
    # replacing the file path argument
    load(x, ...)
    x$clefitted <- unserialize_model(x$clefitted)
    invisible(x)
}
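
# usage sketch (assumes a fitted model `tmod` and a writable path; the methods
# are called directly here, since base save()/load() are not S3 generics):
#   save.textmodel_cnnlstmemb(tmod, file = "tmod.rda")
#   tmod <- load.textmodel_cnnlstmemb("tmod.rda")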