R/sentomeasures_methods.R

Defines functions agg_global aggregate.sento_measures subset.sento_measures as.data.frame.sento_measures as.data.table.sento_measures get_dimensions get_dates print.sento_measures summary.sento_measures scale.sento_measures nobs.sento_measures nmeasures nmeasures.sento_measures diff.sento_measures plot.sento_measures

Documented in aggregate.sento_measures as.data.table.sento_measures diff.sento_measures get_dates get_dimensions nmeasures nobs.sento_measures plot.sento_measures scale.sento_measures subset.sento_measures

#' Plot sentiment measures
#'
#' @author Samuel Borms
#'
#' @method plot sento_measures
#'
#' @description Plotting method that shows all sentiment measures from the provided \code{sento_measures}
#' object in one plot, or the average along one of the lexicons, features and time weighting dimensions.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param group a value from \code{c("lexicons", "features", "time", "all")}. The first three choices display the average of
#' all measures from the same group, in a different color. The choice \code{"all"} displays every single sentiment measure
#' in a separate color, but this may look visually overwhelming very fast, and can be quite slow.
#' @param ... not used.
#'
#' @return Returns a simple \code{\link{ggplot}} object, which can be added onto (or to alter its default elements) by using
#' the \code{+} operator (see example). By default, a legend is positioned at the top if there are at maximum twelve line
#' graphs plotted and \code{group} is different from \code{"all"}.
#'
#' @examples
#' # construct a sento_measures object to start with
#' corpus <- sento_corpus(corpusdf = sentometrics::usnews)
#' corpusSample <- quanteda::corpus_sample(corpus, size = 500)
#' l <- sento_lexicons(sentometrics::list_lexicons[c("LM_en")],
#'                     sentometrics::list_valence_shifters[["en"]])
#' ctr <- ctr_agg(howTime = c("equal_weight", "linear"), by = "month", lag = 3)
#' sm <- sento_measures(corpusSample, l, ctr)
#'
#' # plot sentiment measures
#' plot(sm, "features")
#'
#' \dontrun{
#' # adjust appearance of plot
#' library("ggplot2")
#' p <- plot(sm)
#' p <- p +
#'   scale_x_date(name = "year", date_labels = "%Y") +
#'   scale_y_continuous(name = "newName")
#' p}
#'
#' @import ggplot2
#' @export
plot.sento_measures <- function(x, group = "all", ...) {
  # Plots all sentiment measures ("all") or the per-component average along
  # one dimension ("lexicons", "features" or "time") as ggplot2 line graphs.
  if (!(group %in% c("lexicons", "features", "time", "all")))
    stop("The 'group' argument should be either 'lexicons', 'features', 'time' or 'all'.")
  measures <- data.table::as.data.table(x)
  if (group == "all") {
    # one line per individual measure; no legend, as there are typically too
    # many series to label usefully
    measuresMelt <- data.table::melt(measures, id.vars = "date", variable.factor = FALSE)
    legendPos <- "none"
  } else {
    # average all measures sharing the same component of the chosen dimension;
    # get(group) resolves the dimension column within the data.table scope
    # (avoids the eval(parse(text = ...)) anti-pattern)
    measuresMelt <- measures_to_long(measures)[, c("date", group, "value"), with = FALSE]
    measuresMelt <- measuresMelt[, list(value = mean(value)), by = list(date, variable = get(group))]
    legendPos <- ifelse(length(unique(measuresMelt[["variable"]])) <= 12, "top", "none")
  }
  # order series alphabetically so colors/legend entries are deterministic
  measuresMelt <- measuresMelt[order(rank(as.character(variable)))]
  p <- ggplot(data = measuresMelt, aes(x = date, y = value, color = variable)) +
    geom_line() +
    scale_x_date(name = "Date", date_labels = "%m-%Y") +
    scale_y_continuous(name = "Sentiment") +
    theme_bw() +
    plot_theme(legendPos)
  p
}

#' Differencing of sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Differences the sentiment measures from a \code{sento_measures} object.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param lag a \code{numeric}, see documentation for the generic \code{\link{diff}}.
#' @param differences a \code{numeric}, see documentation for the generic \code{\link{diff}}.
#' @param ... not used.
#'
#' @return A modified \code{sento_measures} object, with the measures replaced by the differenced measures as well as updated
#' statistics.
#'
#' @examples
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#'
#' # construct a sento_measures object to start with
#' corpus <- sento_corpus(corpusdf = usnews)
#' corpusSample <- quanteda::corpus_sample(corpus, size = 500)
#' l <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")], list_valence_shifters[["en"]])
#' ctr <- ctr_agg(howTime = c("equal_weight", "linear"), by = "year", lag = 3)
#' sento_measures <- sento_measures(corpusSample, l, ctr)
#'
#' # first-order difference sentiment measures with a lag of two
#' diffed <- diff(sento_measures, lag = 2, differences = 1)
#'
#' @export
diff.sento_measures <- function(x, lag = 1, differences = 1, ...) {
  # Difference every sentiment measure, dropping the first lag * differences
  # dates that are lost by the differencing, and refresh the statistics.
  nLost <- lag * differences
  keptDates <- get_dates(x)[-seq_len(nLost)]
  sentWide <- data.table::as.data.table(x)[, -1] # drop the date column
  diffed <- diff(as.matrix(sentWide), lag = lag, differences = differences)
  x$measures <- data.table::data.table(date = keptDates, diffed)
  x$stats <- compute_stats(x)
  x
}

#' @export
nmeasures.sento_measures <- function(sento_measures) {
  # Number of sentiment measure columns, i.e. all columns except "date".
  measuresTable <- sento_measures[["measures"]]
  NCOL(measuresTable) - 1 # omit date column
}

#' Get number of sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Returns the number of sentiment measures.
#'
#' @param sento_measures a \code{sento_measures} object created using \code{\link{sento_measures}}.
#'
#' @return The number of sentiment measures in the input \code{sento_measures} object.
#'
#' @export
nmeasures <- function(sento_measures) {
  # S3 generic; dispatches to the method matching the class of 'sento_measures'
  UseMethod("nmeasures", sento_measures)
}

#' Get number of observations in the sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Returns the number of data points available in the sentiment measures.
#'
#' @param object a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param ... not used.
#'
#' @return The number of rows (observations/data points) in \code{object[["measures"]]}.
# #'
# #' @keywords internal
#'
#' @importFrom stats nobs
#' @export
nobs.sento_measures <- function(object, ...) {
  # Number of dates (rows) in the sentiment measures panel.
  panel <- object[["measures"]]
  NROW(panel)
}

#' Scaling and centering of sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Scales and centers the sentiment measures from a \code{sento_measures} object, column-per-column. By default,
#' the measures are normalized. \code{NA}s are removed first.
#'
#' @details If one of the arguments \code{center} or \code{scale} is a \code{matrix}, this operation will be applied first,
#' and any further centering or scaling is computed on that data.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param center a \code{logical} or a \code{numeric} vector, see documentation for the generic \code{\link{scale}}.
#' Alternatively, one can provide a \code{matrix} of dimensions \code{nobs(sento_measures)} times \code{1} or
#' \code{nmeasures(sento_measures)} with values to subtract from each individual observation.
#' @param scale a \code{logical} or a \code{numeric} vector, see documentation for the generic \code{\link{scale}}.
#' Alternatively, one can provide a \code{matrix} of dimensions \code{nobs(sento_measures)} times \code{1} or
#' \code{nmeasures(sento_measures)} with values to divide each individual observation by.
#'
#' @return A modified \code{sento_measures} object, with the measures replaced by the scaled measures as well as updated
#' statistics.
#'
#' @examples
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#'
#' set.seed(505)
#'
#' # construct a sento_measures object to start with
#' corpus <- sento_corpus(corpusdf = usnews)
#' corpusSample <- quanteda::corpus_sample(corpus, size = 500)
#' l <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")])
#' ctr <- ctr_agg(howTime = c("equal_weight", "linear"), by = "year", lag = 3)
#' sento_measures <- sento_measures(corpusSample, l, ctr)
#'
#' # scale sentiment measures to zero mean and unit standard deviation
#' sc1 <- scale(sento_measures)
#'
#' n <- nobs(sento_measures)
#' m <- nmeasures(sento_measures)
#'
#' # subtract a matrix
#' sc2 <- scale(sento_measures, center = matrix(runif(n * m), n, m), scale = FALSE)
#'
#' # divide every row observation based on a one-column matrix, then center
#' sc3 <- scale(sento_measures, center = TRUE, scale = matrix(runif(n)))
#'
#' @export
scale.sento_measures <- function(x, center = TRUE, scale = TRUE) {
  # Scales/centers each sentiment measure. Matrix-valued 'center'/'scale'
  # arguments are applied element-wise first; any remaining logical/numeric
  # centering or scaling is then delegated to the default scale() method.
  dates <- get_dates(x)
  measures <- data.table::as.data.table(x)[, -1] # drop dates
  if (is.matrix(center)) {
    if (nrow(center) != nobs(x) || !(ncol(center) %in% c(1, nmeasures(x))))
      stop("The matrix dimensions of the 'center' argument are not correct.")
    # per the documented contract, 'center' holds values to SUBTRACT from each
    # observation; the previous code added them ('measures + center'), which
    # contradicted the documentation
    measures <- measures - center
    center <- FALSE # matrix centering done; skip it in scale() below
  }
  if (is.matrix(scale)) {
    if (nrow(scale) != nobs(x) || !(ncol(scale) %in% c(1, nmeasures(x))))
      stop("The matrix dimensions of the 'scale' argument are not correct.")
    measures <- measures / scale # divide each observation by the given values
    scale <- FALSE # matrix scaling done; skip it in scale() below
  }
  measuresNorm <- scale(measures, center = center, scale = scale)
  x$measures <- data.table::data.table(date = dates, measuresNorm)
  x$stats <- compute_stats(x)
  x
}

#' @export
summary.sento_measures <- function(object, ...) {
  # Prints a textual overview of the sento_measures object: its dimensions,
  # the aggregation schemes used, and the average of each statistic.
  sento_measures <- object
  # translate the aggregation frequency stored in the control ("day", "week",
  # "month" or "year") into a human-readable label for the printout
  freq <- c("daily", "weekly", "monthly", "yearly")[c("day", "week", "month", "year") %in% sento_measures$ctr$time$weightingParam$by]
  cat("This sento_measures object contains ", nmeasures(sento_measures), " textual sentiment time series with ",
      nobs(sento_measures), " observations each ", "(", freq, ").", "\n", sep = "")
  cat("\n")
  cat("Following features are present:", sento_measures$features, "\n")
  cat("Following lexicons are used to calculate sentiment:", sento_measures$lexicons, "\n")
  cat("Following scheme is applied for aggregation within documents:", sento_measures$ctr$within$howWithin, "\n")
  cat("Following scheme is applied for aggregation across documents:", sento_measures$ctr$docs$howDocs, "\n")
  cat("Following schemes are applied for aggregation across time:", sento_measures$time, "\n")
  cat("\n")
  cat("Aggregate average statistics:", "\n")
  # average each statistic (rows of the stats table) across all measures
  print(round(rowMeans(sento_measures$stats, na.rm = TRUE), 5))
  cat() # no-op: cat() with no arguments prints nothing
}

#' @export
print.sento_measures <- function(x, ...) {
  # Compact one-line description: number of series and observations.
  info <- paste0("A sento_measures object (", nmeasures(x),
                 " textual sentiment time series, ", nobs(x),
                 " observations).", "\n")
  cat(info)
}

#' Get the dates of the sentiment measures/time series
#'
#' @author Samuel Borms
#'
#' @description Returns the dates of the sentiment time series.
#'
#' @param sento_measures a \code{sento_measures} object created using \code{\link{sento_measures}}.
#'
#' @return The \code{"date"} column in \code{sento_measures[["measures"]]}, i.e., the dates of the sentiment time series.
#'
#' @export
get_dates <- function(sento_measures) {
  # Extract the "date" column of the sentiment measures panel.
  check_class(sento_measures, "sento_measures")
  sento_measures[["measures"]][["date"]]
}

#' Get the dimensions of the sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Returns the components across all three dimensions of the sentiment measures.
#'
#' @param sento_measures a \code{sento_measures} object created using \code{\link{sento_measures}}.
#'
#' @return The \code{"features"}, \code{"lexicons"} and \code{"time"} elements in \code{sento_measures}.
#'
#' @export
get_dimensions <- function(sento_measures) {
  # Return the components along the three dimensions of the measures.
  check_class(sento_measures, "sento_measures")
  dims <- c("features", "lexicons", "time")
  sento_measures[dims]
}

#' Get the sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Extracts the sentiment measures \code{data.table} in either wide (by default)
#' or long format.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param keep.rownames see \code{\link{as.data.table}}.
#' @param format a single \code{character} vector, one of \code{c("wide", "long")}.
#' @param ... not used.
#'
#' @return The panel of sentiment measures under \code{sento_measures[["measures"]]},
#' in wide or long format.
#'
#' @examples
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#'
#' sm <- sento_measures(sento_corpus(corpusdf = usnews[1:200, ]),
#'                      sento_lexicons(list_lexicons["LM_en"]),
#'                      ctr_agg(lag = 3))
#'
#' data.table::as.data.table(sm)
#' data.table::as.data.table(sm, format = "long")
#'
#' @export
as.data.table.sento_measures <- function(x, keep.rownames = FALSE, format = "wide", ...) {
  # Return the sentiment measures panel in wide (default) or long format;
  # any other value for 'format' is an error.
  switch(format,
         wide = x[["measures"]],
         long = measures_to_long(x[["measures"]]),
         stop("The 'format' argument should be 'wide' or 'long'."))
}

#' @export
as.data.frame.sento_measures <- function(x, ...) {
  # Coerce the wide sentiment measures panel to a plain data.frame.
  wide <- x[["measures"]]
  as.data.frame(wide)
}

#' Subset sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Subsets rows of the sentiment measures based on its columns.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param subset a logical (non-\code{character}) expression indicating the rows to keep. If a
#' \code{numeric} input is given, it is used for row index subsetting.
#' @param select a \code{character} vector of the lexicon, feature and time weighting scheme names, to indicate which
#' measures need to be selected, or as a \code{list} of \code{character} vectors, possibly with separately specified
#' combinations (consisting of one unique lexicon, one unique feature, and one unique time weighting scheme at maximum).
#' @param delete see the \code{select} argument, but to delete measures.
#' @param ... not used.
#'
#' @return A modified \code{sento_measures} object, with only the remaining rows and sentiment measures,
#' including updated information and statistics, but the original sentiment scores \code{data.table} untouched.
#'
#' @examples
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#'
#' # construct a sento_measures object to start with
#' corpus <- sento_corpus(corpusdf = usnews)
#' corpusSample <- quanteda::corpus_sample(corpus, size = 500)
#' l <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")])
#' ctr <- ctr_agg(howTime = c("equal_weight", "linear"), by = "year", lag = 3)
#' sm <- sento_measures(corpusSample, l, ctr)
#'
#' # three specified indices in required list format
#' three <- as.list(
#'   stringi::stri_split(c("LM_en--economy--linear",
#'                         "HENRY_en--wsj--equal_weight",
#'                         "HENRY_en--wapo--equal_weight"),
#'                       regex = "--")
#' )
#'
#' # different subsets
#' sub1 <- subset(sm, HENRY_en--economy--equal_weight >= 0.01)
#' sub2 <- subset(sm, date %in% get_dates(sm)[3:12])
#' sub3 <- subset(sm, 3:12)
#' sub4 <- subset(sm, 1:100) # warning
#'
#' # different selections
#' sel1 <- subset(sm, select = "equal_weight")
#' sel2 <- subset(sm, select = c("equal_weight", "linear"))
#' sel3 <- subset(sm, select = c("linear", "LM_en"))
#' sel4 <- subset(sm, select = list(c("linear", "wsj"), c("linear", "economy")))
#' sel5 <- subset(sm, select = three)
#'
#' # different deletions
#' del1 <- subset(sm, delete = "equal_weight")
#' del2 <- subset(sm, delete = c("linear", "LM_en"))
#' del3 <- subset(sm, delete = list(c("linear", "wsj"), c("linear", "economy")))
#' del4 <- subset(sm, delete = c("equal_weight", "linear")) # warning
#' del5 <- subset(sm, delete = three)
#'
#' @export
subset.sento_measures <- function(x, subset = NULL, select = NULL, delete = NULL, ...) {
  check_class(x, "sento_measures")

  # subset: either a numeric vector of row indices, or an unevaluated
  # (logical) expression on the measures' columns, handled via NSE below
  isNumericSubset <- tryCatch(is.numeric(subset), error = function(e) FALSE)
  if (isNumericSubset) {
    if (max(subset) > nobs(x)) {
      warning("At least one row index is greater than nobs(x). Input sento_measures object is returned.")
      return(x)
    }
    measuresNew <- data.table::as.data.table(x)[subset, ]
    if (nrow(measuresNew) == 0) {
      warning("No rows retained. Input sento_measures object is returned.")
      return(x)
    } else {
      x <- update_info(x, measuresNew) # subset update
    }
  } else {
    # capture the expression as text; spaces are stripped and "--" (the
    # separator inside measure names) is masked as "____" so the measure
    # names parse as single symbols instead of double subtractions
    sub <- as.character(substitute(list(subset))[-1L])
    if (length(sub) > 0 && sub != "NULL") {
      sub <- stringi::stri_replace_all(sub, "", regex = " ")
      sub <- stringi::stri_replace_all(sub, "____", regex = "--")
      measures <- data.table::as.data.table(x)
      colnames(measures) <- stringi::stri_replace_all(colnames(measures), "____", regex = "--") # -- is problematic here
      # NOTE(review): eval() in parent.frame(2) assumes a fixed call depth;
      # calling subset() through another wrapper may resolve names in the
      # wrong frame -- confirm
      measuresNew <- tryCatch(measures[eval(parse(text = sub), parent.frame(2))], error = function(e) NULL)
      if (is.null(measuresNew)) stop("The 'subset' argument must evaluate to logical.")
      colnames(measuresNew) <- stringi::stri_replace_all(colnames(measuresNew), "--", regex = "____")
      if (dim(measuresNew)[1] == 0) {
        warning("No rows selected in subset. Input sento_measures object is returned.")
        return(x)
      }
      x <- update_info(x, measuresNew) # subset update
    }
  }

  # select: keep only measures whose name contains any of the given
  # components (or, for a list, all components of at least one combination)
  if (!is.null(select)) {
    allOpts <- unlist(get_dimensions(x))
    valid <- unlist(select) %in% allOpts
    if (any(!valid)) {
      stop(paste0("Following components make up none of the sentiment measures: ",
                  paste0(unique(unlist(select)[!valid]), collapse = ', '), "."))
    }

    measures <- data.table::as.data.table(x)
    namesList <- stringi::stri_split(colnames(measures), regex = "--") # lexicon/feature/time parts per column
    if (is.list(select)) {
      ind <- rep(FALSE, length(namesList))
      for (com in select) {
        inds <- sapply(namesList, function(x) return(all(com %in% x)))
        ind[inds == TRUE] <- TRUE
      }
    } else ind <- sapply(namesList, function(x) return(any(select %in% x)))
    if (!any(ind[-1])) { # ind[1] corresponds to the date column
      warning("No appropriate combination for selection found. Input sento_measures object is returned.")
      return(x)
    }
    measuresNew <- measures[, c(TRUE, ind[-1]), with = FALSE] # always keep date
    x <- update_info(x, measuresNew) # select update
  }

  # delete: mirror image of select; drops the matched measures instead
  if (!is.null(delete)) {
    allOpts <- unlist(get_dimensions(x))
    valid <- unlist(delete) %in% allOpts
    if (any(!valid)) {
      stop(paste0("Following components make up none of the sentiment measures: ",
                  paste0(unique(unlist(delete)[!valid]), collapse = ', '), "."))
    }

    measures <- data.table::as.data.table(x)
    namesList <- stringi::stri_split(colnames(measures), regex = "--")
    if (is.list(delete)) {
      ind <- rep(FALSE, length(namesList))
      for (com in delete) {
        inds <- sapply(namesList, function(x) return(all(com %in% x)))
        ind[inds == TRUE] <- TRUE
      }
    } else ind <- sapply(namesList, function(x) return(any(delete %in% x)))
    if (all(ind[-1]) || all(!ind[-1])) { # refuse to delete everything or nothing
      warning("No appropriate combination found or all measures selected for deletion. Input sento_measures object is returned.")
      return(x)
    }
    measuresNew <- measures[, c(TRUE, !ind[-1]), with = FALSE]
    x <- update_info(x, measuresNew) # delete update
  }

  return(x)
}

#' Aggregate sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Aggregates sentiment measures by combining across provided lexicons, features, and time weighting
#' schemes dimensions. For \code{do.global = FALSE}, the combination occurs by taking the mean of the relevant
#' measures. For \code{do.global = TRUE}, this function aggregates all sentiment measures into a weighted global textual
#' sentiment measure for each of the dimensions.
#'
#' @details If \code{do.global = TRUE}, the measures are constructed from weights that indicate the importance (and sign)
#' along each component from the \code{lexicons}, \code{features}, and \code{time} dimensions. There is no restriction in
#' terms of allowed weights. For example, the global index based on the supplied lexicon weights (\code{"globLex"}) is obtained
#' first by multiplying every sentiment measure with its corresponding weight (meaning, the weight given to the lexicon the
#' sentiment is computed with), then by taking the average per date.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param lexicons a \code{list} with unique lexicons to aggregate at given name, e.g., \cr
#' \code{list(lex12 = c("lex1", "lex2"))}. See \code{x$lexicons} for the exact names to use. Use \code{NULL}
#' (default) to apply no merging across this dimension. If \code{do.global = TRUE}, should be a \code{numeric} vector of
#' weights, of size \code{length(x$lexicons)}, in the same order. A value of \code{NULL} means equally weighted.
#' @param features a \code{list} with unique features to aggregate at given name, e.g., \cr
#' \code{list(feat12 = c("feat1", "feat2"))}. See \code{x$features} for the exact names to use. Use \code{NULL}
#' (default) to apply no merging across this dimension. If \code{do.global = TRUE}, should be a \code{numeric} vector of
#' weights, of size \code{length(x$features)}, in the same order. A value of \code{NULL} means equally weighted.
#' @param time a \code{list} with unique time weighting schemes to aggregate at given name, e.g., \cr
#' \code{list(tw12 = c("tw1", "tw2"))}. See \code{x$time} for the exact names to use. Use \code{NULL} (default)
#' to apply no merging across this dimension. If \code{do.global = TRUE}, should be a \code{numeric} vector of
#' weights, of size \code{length(x$time)}, in the same order. A value of \code{NULL} means equally weighted.
#' @param do.global a \code{logical} indicating if the sentiment measures should be aggregated into weighted
#' global sentiment indices.
#' @param do.keep a \code{logical} indicating if the original sentiment measures should be kept (i.e., the aggregated
#' sentiment measures will be added to the current sentiment measures as additional indices if \code{do.keep = TRUE}).
#' @param ... not used.
#'
#' @return If \code{do.global = FALSE}, a modified \code{sento_measures} object, with the aggregated sentiment
#' measures, including updated information and statistics, but the original sentiment scores \code{data.table}
#' untouched.
#'
#' If \code{do.global = TRUE}, a \code{data.table} with the different types of weighted global sentiment measures,
#' named \code{"globLex"}, \code{"globFeat"}, \code{"globTime"} and \code{"global"}, with \code{"date"} as the first
#' column. The last measure is an average of the three other measures.
#'
#' @examples
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#'
#' # construct a sento_measures object to start with
#' corpus <- sento_corpus(corpusdf = usnews)
#' corpusSample <- quanteda::corpus_sample(corpus, size = 500)
#' l <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")],
#'                     list_valence_shifters[["en"]])
#' ctr <- ctr_agg(howTime = c("equal_weight", "linear"),
#'                by = "year", lag = 3)
#' sento_measures <- sento_measures(corpusSample, l, ctr)
#'
#' # aggregation across specified components
#' smAgg <- aggregate(sento_measures,
#'                    time = list(W = c("equal_weight", "linear")),
#'                    features = list(journals = c("wsj", "wapo")),
#'                    do.keep = TRUE)
#'
#' # aggregation in full
#' dims <- get_dimensions(sento_measures)
#' smFull <- aggregate(sento_measures,
#'                     lexicons = list(L = dims[["lexicons"]]),
#'                     time = list(T = dims[["time"]]),
#'                     features = list(F = dims[["features"]]))
#'
#' # "global" aggregation
#' smGlobal <- aggregate(sento_measures, do.global = TRUE,
#'                       lexicons = c(0.3, 0.1),
#'                       features = c(1, -0.5, 0.3, 1.2),
#'                       time = NULL)
#'
#' \dontrun{
#' # aggregation won't work, but produces informative error message
#' aggregate(sento_measures,
#'           time = list(W = c("equal_weight", "almon1")),
#'           lexicons = list(LEX = c("LM_en")),
#'           features = list(journals = c("notInHere", "wapo")))}
#'
#' @export
aggregate.sento_measures <- function(x, features = NULL, lexicons = NULL, time = NULL,
                                     do.global = FALSE, do.keep = FALSE, ...) {
  stopifnot(is.logical(do.global))

  # global mode: the dimension arguments are numeric weight vectors and the
  # computation is delegated to agg_global()
  if (do.global == TRUE) {
    stopifnot(is.null(features) || is.numeric(features))
    stopifnot(is.null(lexicons) || is.numeric(lexicons))
    stopifnot(is.null(time) || is.numeric(time))
    measures <- agg_global(x, lexicons, features, time)
    if (do.keep == TRUE) measures <- cbind(measures, data.table::as.data.table(x)[, -1])
    return(measures)
  }

  # merging mode: the dimension arguments are lists mapping a new component
  # name to the existing component names whose measures should be averaged
  stopifnot(is.logical(do.keep))
  stopifnot(is.null(features) || is.list(features))
  stopifnot(is.null(lexicons) || is.list(lexicons))
  stopifnot(is.null(time) || is.list(time))

  check <- check_agg_dimensions(x, features = features, lexicons = lexicons, time = time) # check inputs
  if (check$stop == TRUE)
    stop(paste0(c("Wrong inputs.", check$msg1, check$msg2), collapse = " "))

  measures <- data.table::as.data.table(x)
  toAgg <- list(lexicons = lexicons, features = features, time = time)

  if (do.keep == TRUE) {
    measuresOld <- measures
    namesOld <- colnames(measures)
  }
  # loop over lexicons, features and time lists
  for (across in toAgg) {
    # loop over set of aggregation levels to combine into given name (e.g., lex12 = c("lex1", "lex2"))
    for (i in seq_along(across)) {
      name <- names(across)[i] # e.g. "lex12"
      cols <- across[[i]] # e.g. c("lex1", "lex2")
      # find all sentiment columns aggregated at one of the 'cols' aggregation levels and stack them into ls
      ls <- sels <- as.list(1:length(cols))
      names(ls) <- names(sels) <- cols
      for (elem in cols) {
        sel <- colnames(measures)[stringi::stri_detect(colnames(measures), regex = paste0("\\b", elem, "\\b"))] # exact match
        selMeas <- measures[, sel, with = FALSE, drop = FALSE]
        nms <- stringi::stri_split(colnames(selMeas), regex = "--")
        loc <- which(stringi::stri_detect(nms[[1]], regex = elem))[1]
        # rename the matched component to the new aggregate name so that
        # columns to merge share identical names across the ls elements
        nmsNew <- sapply(nms, function(x) {
          x[loc] <- name
          paste0(x, collapse = "--")
        })
        colnames(selMeas) <- nmsNew
        ls[[elem]] <- selMeas
        sels[[elem]] <- sel
      }
      # only columns whose renamed form appears under every aggregation
      # level can be averaged together
      common <- Reduce(intersect, lapply(ls, colnames))
      ls <- lapply(1:length(ls), function(k) {
        m <- ls[[k]]
        ind <- which(colnames(m) %in% common)
        measures <<- measures[, !sels[[k]][ind], with = FALSE, drop = FALSE] # drop columns to aggregate
        m[, ind, with = FALSE, drop = FALSE]
      })
      # take element-wise average for every row/column combination across columns to aggregate
      if (ncol(ls[[1]]) >= 2) { # ncol across elements of ls is the same
        # NOTE(review): ls[[2]] presumes at least two aggregation levels per
        # new name; a single-element 'cols' entry reaching this branch would
        # fail with a subscript error -- confirm
        all <- array(NA, dim = c(nrow(ls[[1]]), ncol(ls[[2]]), length(ls)))
        for (k in 1:length(ls)) all[, , k] <- as.matrix(ls[[k]])
        aggr <- apply(all, c(1, 2), mean, na.rm = TRUE)
        colnames(aggr) <- colnames(ls[[length(ls)]])
      } else {
        aggr <- as.matrix(rowMeans(do.call(cbind, ls)))
        colnames(aggr) <- colnames(ls[[length(ls)]])
      }
      measures <- cbind(measures, aggr) # add back aggregated columns
    }
  }
  # add old measures to aggregated measures (if do.keep is TRUE)
  if (do.keep == TRUE)
    measures <- cbind(measures, measuresOld[, !(namesOld %in% colnames(measures)), with = FALSE])

  sento_measures <- update_info(x, measures, aggs = toAgg) # update information in sento_measures object

  return(sento_measures)
}

agg_global <- function(sento_measures, lexicons = NULL, features = NULL, time = NULL) {
  # Computes weighted global sentiment indices per dimension (globLex,
  # globFeat, globTime) plus their average (global), one row per date.
  check_class(sento_measures, "sento_measures")

  dims <- get_dimensions(sento_measures)
  n <- lengths(dims) # number of components per dimension
  weightsInp <- list(features, lexicons, time)
  # lapply() (not sapply()) so the result is always a plain list of three
  # named weight lists; sapply() would simplify to a list matrix whenever the
  # three dimensions happen to have the same number of components, silently
  # breaking the weights[[i]][...] lookups below
  weights <- lapply(1:3, function(i) {
    if (is.null(weightsInp[[i]]))
      w <- as.list(rep(1/n[i], n[i])) # default of NULL means equal weighting
    else {
      w <- as.list(weightsInp[[i]])
      if (length(w) != n[i])
        stop("All weights must be equal in length to the respective number of components.")
    }
    names(w) <- dims[[i]] # named weight lists
    return(w)
  })

  # attach the component weights to every (date, measure) row, then average
  # the weighted values per date
  measuresLong <- data.table::as.data.table(sento_measures, format = "long")
  measuresLong[, "wFeat" := unlist(weights[[1]][measuresLong[["features"]]])] # weights features
  measuresLong[, "wLex" := unlist(weights[[2]][measuresLong[["lexicons"]]])] # weights lexicons
  measuresLong[, "wTime" := unlist(weights[[3]][measuresLong[["time"]]])] # weights time
  globs <- measuresLong[, list(globLex = mean(value * wLex),
                               globFeat = mean(value * wFeat),
                               globTime = mean(value * wTime)), by = date]
  globs[["global"]] <- rowMeans(globs[, -1])

  return(globs)
}

Try the sentometrics package in your browser

Any scripts or data that you put into this service are public.

sentometrics documentation built on Aug. 18, 2021, 9:06 a.m.