R/attribution.R

attributions_docs <- function(sento_measures, s, sentDates, seqDates, W, cols, refDates, coeffs, tNames) {
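  # internal helper: for each reference date t, weight the individual document
  # sentiment scores by the document weights (W), the lag weights (B) and the
  # model coefficients, to obtain per-document attributions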
  B <- sento_measures$attribWeights[["B"]]
  nLags <- nrow(B)
  attribsDocs <- lapply(refDates, function(t) {
    datesIn <- seqDates[(which(seqDates == t) - nLags + 1):which(seqDates == t)] # the nLags dates up to and including t
    docWeights <- W[date %in% datesIn, -1:-2]
    if (nrow(docWeights) == 0) return(NULL) # no documents on this date
    B$date <- datesIn
    sents <- s[date %in% datesIn, ]
    sentWeighted <- sents[, -1:-3] * docWeights # drop id, date and word_count columns
    n <- sents[, list(count = .N), by = list(date)] # number of documents per date
    sentFull <- lapply(tNames, function(b) {
      coeffsIn <- coeffs[stringi::stri_detect(cols, regex = paste0("\\b", b, "\\b"))] # coefficients for weighting scheme
      sel <- sapply(stringi::stri_split(names(coeffsIn), regex = "--"), function(x) paste0(x[1:2], collapse = "--"))
      timeWeights <- rep(B[B$date %in% n$date, b], n$count) # respective lag weights per document
      colAttr <- matrix(rep(coeffsIn, sum(n$count)), nrow = sum(n$count), byrow = TRUE) *
        timeWeights * sentWeighted[, sel, with = FALSE]
      return(colAttr)
    })
    attribs <- Reduce(`+`, lapply(sentFull, rowSums, na.rm = TRUE)) # sum document values over time weighting schemes
    out <- data.table::data.table(sents[, c("id")], date = sents$date, attrib = attribs)
    return(out)
  })
  names(attribsDocs) <- refDates
  return(attribsDocs)
}

attributions_lags <- function(s, sentDates, seqDates, W, cols, sento_measures, measures, coeffs,
                              attribsDocs, tNames, do.normalize) {
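  # internal helper: aggregate the document-level attributions into an attribution
  # per lag for each reference date; lags without documents are filled in according
  # to the 'fill' option of the time aggregation (latest available sentiment, or zero)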
  B <- sento_measures$attribWeights[["B"]]
  nLags <- nrow(B)
  namesLags <- get_names_lags(nLags)
  attribsLag <- lapply(names(attribsDocs), function(d) {
    if (is.null(attribsDocs[[d]])) {
      doc <- data.table::data.table(date = "xxxx-yy-zz", attrib = 0) # throw-away template
    } else {
      doc <- attribsDocs[[d]][, list(attrib = sum(attrib)), by = date]
    }
    datesLags <- seqDates[(which(seqDates == as.Date(d)) - nLags + 1):which(seqDates == as.Date(d))]
    lagsMissing <- which(!(datesLags %in% doc[["date"]]))
    setnames(doc, "date", "lag")
    if (length(lagsMissing) == 0) {
      doc[, "lag" := namesLags]
      return(doc)
    }
    if (length(lagsMissing) != nLags) doc[, "lag" := namesLags[-lagsMissing]]
    datesMissing <- datesLags[lagsMissing]
    if (sento_measures$ctr$time$weightingParam$fill == "latest") {
      attribFills <- lapply(seq_along(datesMissing), function(j) {
        diffs <- s$date - datesMissing[j]
        diffsNeg <- diffs[diffs < 0]
        dateLatest <- s$date[1:length(diffsNeg)][which.max(diffsNeg)]
        docWeights <- W[date %in% dateLatest, -1:-2]
        sents <- s[date %in% dateLatest, ]
        sentWeighted <- colSums(sents[, -1:-3] * docWeights) # recreate sentiment measures as before time smoothing
        n <- sents[, list(count = .N), by = list(date)]
        sentFull <- lapply(tNames, function(b) {
          coeffsIn <- coeffs[stringi::stri_detect(cols, regex = paste0("\\b", b, "\\b"))]
          sel <- sapply(stringi::stri_split(names(coeffsIn), regex = "--"), function(x) paste0(x[1:2], collapse = "--"))
          attr <- coeffsIn * B[lagsMissing[j], b] * sentWeighted[sel]
          return(attr)
        })
        attribFill <- sum(unlist(sentFull), na.rm = TRUE)
        return(attribFill)
      })
      doc <- rbind(doc, data.table::data.table(lag = namesLags[lagsMissing], attrib = attribFills))
      doc <- doc[order(match(lag, namesLags))][lag %in% namesLags]
    } else {
      doc <- rbind(doc, data.table::data.table(lag = namesLags[lagsMissing], attrib = 0))
      doc <- doc[order(match(lag, namesLags))][lag %in% namesLags]
    }
    return(doc)
  })
  names(attribsLag) <- names(attribsDocs)
  attribsLag <- rbindlist(attribsLag, idcol = "date")
  attribsLag[, attrib := unlist(attrib)]
  attribsLag <- dcast(attribsLag, date ~ lag, value.var = "attrib")
  attribsLag[, date := as.Date(date)]
  if (do.normalize) {
    attribsLag[, colnames(attribsLag)[-1] := attribsLag[, -1] / sqrt(rowSums(attribsLag[, -1]^2, na.rm = TRUE))][]
  }
  for (i in seq_along(attribsLag)[-1])
    set(attribsLag, i = which(is.na(attribsLag[[i]])), j = i, value = 0)
  return(attribsLag)
}

attributions_dims <- function(sento_measures, measures, cols, refDates, loc, coeffs,
                              do.normalize, dimNames, missingNames, type) {
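  # internal helper: attribution per component of a given dimension (lexicons,
  # features or time weighting schemes), obtained by summing the coefficient-weighted
  # sentiment measures that involve that component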
  attribsDim <- lapply(dimNames, function(x) {
    sel <- cols[stringi::stri_detect(cols, regex = paste0("\\b", x, "\\b"))]
    coeffsIn <- data.table::data.table(matrix(coeffs[sel], nrow = length(loc), ncol = length(coeffs[sel]), byrow = TRUE))
    attribs <- rowSums(coeffsIn * measures[loc, sel, with = FALSE, drop = FALSE], na.rm = TRUE)
    attr <- data.table::data.table(date = refDates, attrib = attribs)
    return(attr)
  })
  names(attribsDim) <- dimNames
  attribsDim <- dcast(rbindlist(attribsDim, idcol = "name"), date ~ name, value.var = "attrib")
  if (length(missingNames) > 0) attribsDim[, (missingNames) := 0]
  attribsDim <- attribsDim[, c("date", sento_measures[[type]]), with = FALSE]
  if (do.normalize) {
    attribsDim[, colnames(attribsDim)[-1] := attribsDim[, -1] / sqrt(rowSums(attribsDim[, -1]^2, na.rm = TRUE))][]
  }
  for (i in seq_along(attribsDim)[-1]) # set NaNs to zero (e.g. due to zero norm division)
    set(attribsDim, i = which(is.na(attribsDim[[i]])), j = i, value = 0)
  return(attribsDim)
}

.attributions.sento_model <- function(model, sento_measures, do.lags = TRUE, do.normalize = FALSE,
                                     refDates = NULL, factor = NULL) {
  check_class(sento_measures, "sento_measures")
  stopifnot(is.logical(do.normalize))

  sento_model <- model

  # get appropriate sentiment measures from sento_measures input object
  discarded <- sento_model$discarded
  measures <- data.table::as.data.table(sento_measures)[, c(TRUE, !discarded), with = FALSE]

  # set dates at which to do attribution
  sampleDates <- sento_model$dates
  if (is.null(refDates)) {
    refDates <- measures[date >= sampleDates[1] & date <= sampleDates[2], ][["date"]] # take in-sample dates
  } else {
    refDates <- sort(as.Date(refDates))
    if (min(refDates) < sampleDates[1])
      stop("The earliest date in 'refDates' is earlier than the first estimation sample date.")
    if (!all(refDates %in% measures[["date"]]))
      stop("Not all 'refDates' are available in the textual sentiment time series.")
  }

  # retrieve remaining required information from input objects
  s <- sento_measures$sentiment
  W <- sento_measures$attribWeights[["W"]]
  loc <- which(measures$date %in% refDates)
  measures[, "date" := NULL]
  cols <- colnames(measures)
  colsSplit <- stringi::stri_split(cols, regex = "--")
  lNames <- unique(sapply(colsSplit, "[", 1))
  lDel <- sento_measures$lexicons[which(!(sento_measures$lexicons %in% lNames))]
  fNames <- unique(sapply(colsSplit, "[", 2))
  fDel <- sento_measures$features[which(!(sento_measures$features %in% fNames))]
  tNames <- unique(sapply(colsSplit, "[", 3))
  tDel <- sento_measures$time[which(!(sento_measures$time %in% tNames))]

  # ignore fully dropped components in sentiment data.table (only composed of lexicon--feature combinations)
  if (length(fDel) > 0) {
    regex <- paste0(paste0("\\b", fDel, "\\b"), collapse = "|")
    s <- s[, !stringi::stri_detect(colnames(s), regex = regex), with = FALSE]
    W <- W[, !stringi::stri_detect(colnames(W), regex = regex), with = FALSE]
  }
  if (length(lDel) > 0) {
    regex <- paste0(paste0("\\b", lDel, "\\b"), collapse = "|")
    s <- s[, !stringi::stri_detect(colnames(s), regex = regex), with = FALSE]
    W <- W[, !stringi::stri_detect(colnames(W), regex = regex), with = FALSE]
  }

  # set a sequence of all possible dates
  sentDates <- unique(s$date)
  if (!(sento_measures$ctr$time$weightingParam$fill == "none"))
    seqDates <- seq(sentDates[1], sentDates[length(sentDates)], by = sento_measures$ctr$time$weightingParam$by)
  else seqDates <- sentDates

  # extract sentiment coefficients
  if (is.null(factor))
    coeffs <- stats::coef(sento_model$reg)[cols, ]
  else
    coeffs <- stats::coef(sento_model$reg)[[factor]][cols, ]

  # calculate and assemble attributions
  attribsAll <- list(documents = NULL, lags = NULL, lexicons = NULL, features = NULL, time = NULL)
  attribsDocs <- attributions_docs(sento_measures, s, sentDates, seqDates, W, cols, refDates, coeffs, tNames)
  attribsAll[["documents"]] <- attribsDocs
  if (do.lags == TRUE) {
    attribsAll[["lags"]] <- attributions_lags(s, sentDates, seqDates, W, cols, sento_measures, measures, coeffs,
                                              attribsDocs, tNames, do.normalize)
  }
  attribsAll[["lexicons"]] <- attributions_dims(sento_measures, measures, cols, refDates, loc, coeffs, do.normalize,
                                                lNames, lDel, "lexicons")
  attribsAll[["features"]] <- attributions_dims(sento_measures, measures, cols, refDates, loc, coeffs, do.normalize,
                                                fNames, fDel, "features")
  attribsAll[["time"]] <- attributions_dims(sento_measures, measures, cols, refDates, loc, coeffs, do.normalize,
                                            tNames, tDel, "time")

  class(attribsAll) <- c("attributions", class(attribsAll))

  return(attribsAll)
}

.attributions.sento_modelIter <- function(model, sento_measures, do.lags = TRUE, do.normalize = FALSE,
                                         refDates = NULL, factor = NULL) {
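  # attribution for a sento_modelIter object: apply attributions() to each underlying
  # sento_model at its out-of-sample prediction date, then bind the results per dimension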
  stopifnot(is.logical(do.normalize))

  sento_modelIter <- model

  if (is.null(refDates)) refDates <- as.Date(names(sento_modelIter$models))
  attribsFull <- lapply(1:length(refDates), function(i) {
    date <- refDates[i]
    model <- sento_modelIter$models[[i]]
    attribs <- attributions(model, sento_measures, do.lags = do.lags, do.normalize = do.normalize,
                            refDates = date, factor = factor)
    return(attribs)
  })

  attribsAll <- list(documents = NULL, lags = NULL, lexicons = NULL, features = NULL, time = NULL)
  attribsAll[["documents"]] <- unlist(lapply(attribsFull, function(x) return(x[["documents"]])), recursive = FALSE)
  if (do.lags == TRUE) attribsAll[["lags"]] <- rbindlist(lapply(attribsFull, function(x) return(x[["lags"]])))
  attribsAll[["lexicons"]] <- rbindlist(lapply(attribsFull, function(x) return(x[["lexicons"]])))
  attribsAll[["features"]] <- rbindlist(lapply(attribsFull, function(x) return(x[["features"]])))
  attribsAll[["time"]] <- rbindlist(lapply(attribsFull, function(x) return(x[["time"]])))

  class(attribsAll) <- c("attributions", class(attribsAll))

  return(attribsAll)
}

#' @importFrom compiler cmpfun
#' @export
attributions.sento_model <- compiler::cmpfun(.attributions.sento_model)

#' @importFrom compiler cmpfun
#' @export
attributions.sento_modelIter <- compiler::cmpfun(.attributions.sento_modelIter)

#' Retrieve top-down model sentiment attributions
#'
#' @author Samuel Borms, Keven Bluteau
#'
#' @description Computes the attributions to predictions for a (given) number of dates across all possible sentiment
#' dimensions, based on the coefficients associated with each sentiment measure, as estimated in the provided model object.
#'
#' @details See \code{\link{sento_model}} for an elaborate modeling example including the calculation and plotting of
#' attributions. The attribution for logistic models is represented in terms of log odds. For binomial models, it is
#' calculated with respect to the last factor level or factor column. A \code{NULL} value for document-level attribution
#' on a given date means no documents are directly implicated in the associated prediction.
#'
#' @param model a \code{sento_model} or a \code{sento_modelIter} object created with \code{\link{sento_model}}.
#' @param sento_measures the \code{sento_measures} object, as created with \code{\link{sento_measures}}, used to estimate
#' the model from the first argument (make sure this is the case!).
#' @param do.lags a \code{logical}, \code{TRUE} also computes the attribution to each time lag. For a large
#' number of lags, this can be time-consuming.
#' @param do.normalize a \code{logical}, \code{TRUE} divides each element of every attribution vector at a given date by its
#' L2-norm at that date, so that the values lie between -1 and 1. The document attributions are not normalized.
#' @param refDates the dates (as \code{"yyyy-mm-dd"}) at which attribution is to be performed. These should lie between
#' the first estimation sample date (that is, \code{model$dates[1]} if \code{model} is a \code{sento_model} object) and the
#' latest date available in the input \code{sento_measures} object. All dates should also be in \code{get_dates(sento_measures)}.
#' If \code{NULL} (default), attribution is calculated for all in-sample dates. Ignored if \code{model} is a
#' \code{sento_modelIter} object, for which attribution is calculated for all out-of-sample prediction dates.
#' @param factor the factor level, as a single \code{character} value, for which to calculate attribution
#' in the case of (a) multinomial model(s). Ignored for linear and binomial models.
#'
#' @return A \code{list} of class \code{attributions}, with \code{"documents"}, \code{"lags"}, \code{"lexicons"},
#' \code{"features"} and \code{"time"} as attribution dimensions. The last four elements are
#' \code{data.table}s with a \code{"date"} column and one column per component of the dimension, holding
#' the attributions as values. Document-level attribution is further decomposed into a \code{data.table} per date, with
#' \code{"id"}, \code{"date"} and \code{"attrib"} columns. If \code{do.lags = FALSE}, the \code{"lags"} element is set
#' to \code{NULL}.
#'
#' @seealso \code{\link{sento_model}}
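#'
#' @examples
#' \dontrun{
#' # minimal usage sketch: assumes 'sm' is the sento_measures object used to
#' # estimate 'mod', a sento_model object (see ?sento_model for a full worked example)
#' attribs <- attributions(mod, sm)
#' head(attribs[["features"]])
#' plot(attribs, group = "features")
#' }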
#'
#' @export
attributions <- function(model, sento_measures, do.lags = TRUE, do.normalize = FALSE, refDates = NULL, factor = NULL) {
  UseMethod("attributions", model)
}

#' Plot prediction attributions at specified level
#'
#' @author Samuel Borms, Keven Bluteau
#'
#' @description Shows a plot of the attributions along the dimension provided, stacked per date.
#'
#' @details See \code{\link{sento_model}} for an elaborate modeling example including the calculation and plotting of
#' attributions. This function does not plot the attributions of individual documents, since many documents are often
#' involved and each appears only once at a single date (even though a document may contribute to predictions at several
#' dates, depending on the number of lags in the time aggregation).
#'
#' @param x an \code{attributions} object created with \code{\link{attributions}}.
#' @param group a value from \code{c("lags", "lexicons", "features", "time")}.
#' @param ... not used.
#'
#' @return Returns a simple \code{\link{ggplot}} object, which can be added onto (or have its default elements altered)
#' by using the \code{+} operator. By default, a legend is positioned at the top if the dimension has at most
#' twelve components.
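#'
#' @examples
#' \dontrun{
#' # minimal sketch: 'attribs' is assumed to be an attributions object created
#' # with attributions() from a fitted sento_model and its sento_measures input
#' plot(attribs, group = "lexicons")
#' }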
#'
#' @import ggplot2
#' @export
plot.attributions <- function(x, group = "features", ...) {
  if (!(group %in% c("lags", "lexicons", "features", "time")))
    stop("The 'group' argument should be either 'lags', 'lexicons', 'features' or 'time'.")
  attributions <- x[[group]]
  if (group == "lags" && is.null(attributions))
    stop("No 'lags' attribution is calculated. Set the 'do.lags' argument in the attributions() function to TRUE.")
  attributionsMelt <- data.table::melt(attributions, id.vars = "date", variable.factor = FALSE)
  attributionsMelt[, "pos" := ifelse(value >= 0, value, 0)][, "neg" := ifelse(value < 0, value, -1e-36)]
  attributionsMelt <- attributionsMelt[order(rank(as.character(variable)))]
  legendPos <- ifelse(length(unique(attributionsMelt[["variable"]])) <= 12, "top", "none")
  p <- ggplot(data = attributionsMelt, aes(x = date, fill = variable, color = variable)) +
    # geom_area(aes(fill = variable), alpha = 1) +
    geom_area(aes(y = pos), alpha = 1) +
    geom_area(aes(y = neg), alpha = 1) +
    # geom_ribbon(aes(ymin = 0, ymax = value), alpha = 0.5) +
    geom_hline(yintercept = 0, size = 0.50, linetype = "dotted") +
    scale_fill_grey(start = 0, end = 1) +
    scale_x_date(name = "Date", date_labels = "%m-%Y") +
    scale_y_continuous(name = "Attribution") +
    theme_bw() +
    plot_theme(legendPos)
  p
}