R/sentomeasures_methods.R

Defines functions agg_global aggregate.sento_measures subset.sento_measures as.data.frame.sento_measures as.data.table.sento_measures get_dimensions get_dates print.sento_measures summary.sento_measures scale.sento_measures nobs.sento_measures nmeasures nmeasures.sento_measures diff.sento_measures plot.sento_measures

Documented in aggregate.sento_measures as.data.table.sento_measures diff.sento_measures get_dates get_dimensions nmeasures nobs.sento_measures plot.sento_measures scale.sento_measures subset.sento_measures

#' Plot sentiment measures
#'
#' @author Samuel Borms
#'
#' @method plot sento_measures
#'
#' @description Plotting method that shows all sentiment measures from the provided \code{sento_measures}
#' object in one plot, or the average along one of the lexicons, features and time weighting dimensions.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param group a value from \code{c("lexicons", "features", "time", "all")}. The first three choices display the average of
#' all measures from the same group, in a different color. The choice \code{"all"} displays every single sentiment measure
#' in a separate color, but this may look visually overwhelming very fast, and can be quite slow.
#' @param ... not used.
#'
#' @return Returns a simple \code{\link{ggplot}} object, which can be added onto (or to alter its default elements) by using
#' the \code{+} operator (see example). By default, a legend is positioned at the top if there are at maximum twelve line
#' graphs plotted and \code{group} is different from \code{"all"}.
#'
#' @examples
#' # construct a sento_measures object to start with
#' corpus <- sento_corpus(corpusdf = sentometrics::usnews)
#' corpusSample <- quanteda::corpus_sample(corpus, size = 500)
#' l <- sento_lexicons(sentometrics::list_lexicons[c("LM_en")],
#'                     sentometrics::list_valence_shifters[["en"]])
#' ctr <- ctr_agg(howTime = c("equal_weight", "linear"), by = "month", lag = 3)
#' sm <- sento_measures(corpusSample, l, ctr)
#'
#' # plot sentiment measures
#' plot(sm, "features")
#'
#' \dontrun{
#' # adjust appearance of plot
#' library("ggplot2")
#' p <- plot(sm)
#' p <- p +
#'   scale_x_date(name = "year", date_labels = "%Y") +
#'   scale_y_continuous(name = "newName")
#' p}
#'
#' @import ggplot2
#' @export
plot.sento_measures <- function(x, group = "all", ...) {
  # Plots all sentiment measures ("all") or the per-component average along
  # one dimension ("lexicons", "features" or "time") as ggplot2 line graphs.
  if (!(group %in% c("lexicons", "features", "time", "all")))
    stop("The 'group' argument should be either 'lexicons', 'features', 'time' or 'all'.")
  measures <- data.table::as.data.table(x)
  if (group == "all") {
    # one line per individual measure; no legend, as there are typically too
    # many series to label usefully
    measuresMelt <- data.table::melt(measures, id.vars = "date", variable.factor = FALSE)
    legendPos <- "none"
  } else {
    # average all measures sharing the same component of the chosen dimension;
    # get(group) resolves the dimension column within the data.table scope
    # (avoids the eval(parse(text = ...)) anti-pattern)
    measuresMelt <- measures_to_long(measures)[, c("date", group, "value"), with = FALSE]
    measuresMelt <- measuresMelt[, list(value = mean(value)), by = list(date, variable = get(group))]
    legendPos <- ifelse(length(unique(measuresMelt[["variable"]])) <= 12, "top", "none")
  }
  # order series alphabetically so colors/legend entries are deterministic
  measuresMelt <- measuresMelt[order(rank(as.character(variable)))]
  p <- ggplot(data = measuresMelt, aes(x = date, y = value, color = variable)) +
    geom_line() +
    scale_x_date(name = "Date", date_labels = "%m-%Y") +
    scale_y_continuous(name = "Sentiment") +
    theme_bw() +
    plot_theme(legendPos)
  p
}

#' Differencing of sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Differences the sentiment measures from a \code{sento_measures} object.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param lag a \code{numeric}, see documentation for the generic \code{\link{diff}}.
#' @param differences a \code{numeric}, see documentation for the generic \code{\link{diff}}.
#' @param ... not used.
#'
#' @return A modified \code{sento_measures} object, with the measures replaced by the differenced measures as well as updated
#' statistics.
#'
#' @examples
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#'
#' # construct a sento_measures object to start with
#' corpus <- sento_corpus(corpusdf = usnews)
#' corpusSample <- quanteda::corpus_sample(corpus, size = 500)
#' l <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")], list_valence_shifters[["en"]])
#' ctr <- ctr_agg(howTime = c("equal_weight", "linear"), by = "year", lag = 3)
#' sento_measures <- sento_measures(corpusSample, l, ctr)
#'
#' # first-order difference sentiment measures with a lag of two
#' diffed <- diff(sento_measures, lag = 2, differences = 1)
#'
#' @export
diff.sento_measures <- function(x, lag = 1, differences = 1, ...) {
  # Difference every sentiment measure, dropping the first lag * differences
  # dates that are lost by the differencing, and refresh the statistics.
  nLost <- lag * differences
  keptDates <- get_dates(x)[-seq_len(nLost)]
  sentWide <- data.table::as.data.table(x)[, -1] # drop the date column
  diffed <- diff(as.matrix(sentWide), lag = lag, differences = differences)
  x$measures <- data.table::data.table(date = keptDates, diffed)
  x$stats <- compute_stats(x)
  x
}

#' @export
nmeasures.sento_measures <- function(sento_measures) {
  # Number of sentiment measure columns, i.e. all columns except "date".
  measuresTable <- sento_measures[["measures"]]
  NCOL(measuresTable) - 1 # omit date column
}

#' Get number of sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Returns the number of sentiment measures.
#'
#' @param sento_measures a \code{sento_measures} object created using \code{\link{sento_measures}}.
#'
#' @return The number of sentiment measures in the input \code{sento_measures} object.
#'
#' @export
nmeasures <- function(sento_measures) {
  # S3 generic; dispatches to the method matching the class of 'sento_measures'
  UseMethod("nmeasures", sento_measures)
}

#' Get number of observations in the sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Returns the number of data points available in the sentiment measures.
#'
#' @param object a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param ... not used.
#'
#' @return The number of rows (observations/data points) in \code{object[["measures"]]}.
# #'
# #' @keywords internal
#'
#' @importFrom stats nobs
#' @export
nobs.sento_measures <- function(object, ...) {
  # Number of dates (rows) in the sentiment measures panel.
  panel <- object[["measures"]]
  NROW(panel)
}

#' Scaling and centering of sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Scales and centers the sentiment measures from a \code{sento_measures} object, column-per-column. By default,
#' the measures are normalized. \code{NA}s are removed first.
#'
#' @details If one of the arguments \code{center} or \code{scale} is a \code{matrix}, this operation will be applied first,
#' and any further centering or scaling is computed on that data.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param center a \code{logical} or a \code{numeric} vector, see documentation for the generic \code{\link{scale}}.
#' Alternatively, one can provide a \code{matrix} of dimensions \code{nobs(sento_measures)} times \code{1} or
#' \code{nmeasures(sento_measures)} with values to subtract from each individual observation.
#' @param scale a \code{logical} or a \code{numeric} vector, see documentation for the generic \code{\link{scale}}.
#' Alternatively, one can provide a \code{matrix} of dimensions \code{nobs(sento_measures)} times \code{1} or
#' \code{nmeasures(sento_measures)} with values to divide each individual observation by.
#'
#' @return A modified \code{sento_measures} object, with the measures replaced by the scaled measures as well as updated
#' statistics.
#'
#' @examples
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#'
#' set.seed(505)
#'
#' # construct a sento_measures object to start with
#' corpus <- sento_corpus(corpusdf = usnews)
#' corpusSample <- quanteda::corpus_sample(corpus, size = 500)
#' l <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")])
#' ctr <- ctr_agg(howTime = c("equal_weight", "linear"), by = "year", lag = 3)
#' sento_measures <- sento_measures(corpusSample, l, ctr)
#'
#' # scale sentiment measures to zero mean and unit standard deviation
#' sc1 <- scale(sento_measures)
#'
#' n <- nobs(sento_measures)
#' m <- nmeasures(sento_measures)
#'
#' # subtract a matrix
#' sc2 <- scale(sento_measures, center = matrix(runif(n * m), n, m), scale = FALSE)
#'
#' # divide every row observation based on a one-column matrix, then center
#' sc3 <- scale(sento_measures, center = TRUE, scale = matrix(runif(n)))
#'
#' @export
scale.sento_measures <- function(x, center = TRUE, scale = TRUE) {
  # Scales/centers each sentiment measure. Matrix-valued 'center'/'scale'
  # arguments are applied element-wise first; any remaining logical/numeric
  # centering or scaling is then delegated to the default scale() method.
  dates <- get_dates(x)
  measures <- data.table::as.data.table(x)[, -1] # drop dates
  if (is.matrix(center)) {
    if (nrow(center) != nobs(x) || !(ncol(center) %in% c(1, nmeasures(x))))
      stop("The matrix dimensions of the 'center' argument are not correct.")
    # per the documented contract, 'center' holds values to SUBTRACT from each
    # observation; the previous code added them ('measures + center'), which
    # contradicted the documentation
    measures <- measures - center
    center <- FALSE # matrix centering done; skip it in scale() below
  }
  if (is.matrix(scale)) {
    if (nrow(scale) != nobs(x) || !(ncol(scale) %in% c(1, nmeasures(x))))
      stop("The matrix dimensions of the 'scale' argument are not correct.")
    measures <- measures / scale # divide each observation by the given values
    scale <- FALSE # matrix scaling done; skip it in scale() below
  }
  measuresNorm <- scale(measures, center = center, scale = scale)
  x$measures <- data.table::data.table(date = dates, measuresNorm)
  x$stats <- compute_stats(x)
  x
}

#' @export
summary.sento_measures <- function(object, ...) {
  # Prints a textual overview of the sento_measures object: its dimensions,
  # the aggregation schemes used, and the average of each statistic.
  sento_measures <- object
  # translate the aggregation frequency stored in the control ("day", "week",
  # "month" or "year") into a human-readable label for the printout
  freq <- c("daily", "weekly", "monthly", "yearly")[c("day", "week", "month", "year") %in% sento_measures$ctr$time$weightingParam$by]
  cat("This sento_measures object contains ", nmeasures(sento_measures), " textual sentiment time series with ",
      nobs(sento_measures), " observations each ", "(", freq, ").", "\n", sep = "")
  cat("\n")
  cat("Following features are present:", sento_measures$features, "\n")
  cat("Following lexicons are used to calculate sentiment:", sento_measures$lexicons, "\n")
  cat("Following scheme is applied for aggregation within documents:", sento_measures$ctr$within$howWithin, "\n")
  cat("Following scheme is applied for aggregation across documents:", sento_measures$ctr$docs$howDocs, "\n")
  cat("Following schemes are applied for aggregation across time:", sento_measures$time, "\n")
  cat("\n")
  cat("Aggregate average statistics:", "\n")
  # average each statistic (rows of the stats table) across all measures
  print(round(rowMeans(sento_measures$stats, na.rm = TRUE), 5))
  cat() # no-op: cat() with no arguments prints nothing
}

#' @export
print.sento_measures <- function(x, ...) {
  # Compact one-line description: number of series and observations.
  info <- paste0("A sento_measures object (", nmeasures(x),
                 " textual sentiment time series, ", nobs(x),
                 " observations).", "\n")
  cat(info)
}

#' Get the dates of the sentiment measures/time series
#'
#' @author Samuel Borms
#'
#' @description Returns the dates of the sentiment time series.
#'
#' @param sento_measures a \code{sento_measures} object created using \code{\link{sento_measures}}.
#'
#' @return The \code{"date"} column in \code{sento_measures[["measures"]]}, i.e., the dates of the sentiment time series.
#'
#' @export
get_dates <- function(sento_measures) {
  # Extract the "date" column of the sentiment measures panel.
  check_class(sento_measures, "sento_measures")
  sento_measures[["measures"]][["date"]]
}

#' Get the dimensions of the sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Returns the components across all three dimensions of the sentiment measures.
#'
#' @param sento_measures a \code{sento_measures} object created using \code{\link{sento_measures}}.
#'
#' @return The \code{"features"}, \code{"lexicons"} and \code{"time"} elements in \code{sento_measures}.
#'
#' @export
get_dimensions <- function(sento_measures) {
  # Return the components along the three dimensions of the measures.
  check_class(sento_measures, "sento_measures")
  dims <- c("features", "lexicons", "time")
  sento_measures[dims]
}

#' Get the sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Extracts the sentiment measures \code{data.table} in either wide (by default)
#' or long format.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param keep.rownames see \code{\link{as.data.table}}.
#' @param format a single \code{character} vector, one of \code{c("wide", "long")}.
#' @param ... not used.
#'
#' @return The panel of sentiment measures under \code{sento_measures[["measures"]]},
#' in wide or long format.
#'
#' @examples
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#'
#' sm <- sento_measures(sento_corpus(corpusdf = usnews[1:200, ]),
#'                      sento_lexicons(list_lexicons["LM_en"]),
#'                      ctr_agg(lag = 3))
#'
#' data.table::as.data.table(sm)
#' data.table::as.data.table(sm, format = "long")
#'
#' @export
as.data.table.sento_measures <- function(x, keep.rownames = FALSE, format = "wide", ...) {
  # Return the sentiment measures panel in wide (default) or long format;
  # any other value for 'format' is an error.
  switch(format,
         wide = x[["measures"]],
         long = measures_to_long(x[["measures"]]),
         stop("The 'format' argument should be 'wide' or 'long'."))
}

#' @export
as.data.frame.sento_measures <- function(x, ...) {
  # Coerce the wide sentiment measures panel to a plain data.frame.
  wide <- x[["measures"]]
  as.data.frame(wide)
}

#' Subset sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Subsets rows of the sentiment measures based on its columns.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param subset a logical (non-\code{character}) expression indicating the rows to keep. If a
#' \code{numeric} input is given, it is used for row index subsetting.
#' @param select a \code{character} vector of the lexicon, feature and time weighting scheme names, to indicate which
#' measures need to be selected, or as a \code{list} of \code{character} vectors, possibly with separately specified
#' combinations (consisting of one unique lexicon, one unique feature, and one unique time weighting scheme at maximum).
#' @param delete see the \code{select} argument, but to delete measures.
#' @param ... not used.
#'
#' @return A modified \code{sento_measures} object, with only the remaining rows and sentiment measures,
#' including updated information and statistics, but the original sentiment scores \code{data.table} untouched.
#'
#' @examples
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#'
#' # construct a sento_measures object to start with
#' corpus <- sento_corpus(corpusdf = usnews)
#' corpusSample <- quanteda::corpus_sample(corpus, size = 500)
#' l <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")])
#' ctr <- ctr_agg(howTime = c("equal_weight", "linear"), by = "year", lag = 3)
#' sm <- sento_measures(corpusSample, l, ctr)
#'
#' # three specified indices in required list format
#' three <- as.list(
#'   stringi::stri_split(c("LM_en--economy--linear",
#'                         "HENRY_en--wsj--equal_weight",
#'                         "HENRY_en--wapo--equal_weight"),
#'                       regex = "--")
#' )
#'
#' # different subsets
#' sub1 <- subset(sm, HENRY_en--economy--equal_weight >= 0.01)
#' sub2 <- subset(sm, date %in% get_dates(sm)[3:12])
#' sub3 <- subset(sm, 3:12)
#' sub4 <- subset(sm, 1:100) # warning
#'
#' # different selections
#' sel1 <- subset(sm, select = "equal_weight")
#' sel2 <- subset(sm, select = c("equal_weight", "linear"))
#' sel3 <- subset(sm, select = c("linear", "LM_en"))
#' sel4 <- subset(sm, select = list(c("linear", "wsj"), c("linear", "economy")))
#' sel5 <- subset(sm, select = three)
#'
#' # different deletions
#' del1 <- subset(sm, delete = "equal_weight")
#' del2 <- subset(sm, delete = c("linear", "LM_en"))
#' del3 <- subset(sm, delete = list(c("linear", "wsj"), c("linear", "economy")))
#' del4 <- subset(sm, delete = c("equal_weight", "linear")) # warning
#' del5 <- subset(sm, delete = three)
#'
#' @export
subset.sento_measures <- function(x, subset = NULL, select = NULL, delete = NULL, ...) {
  check_class(x, "sento_measures")

  # subset: either a numeric vector of row indices, or an unevaluated
  # (logical) expression on the measures' columns, handled via NSE below
  isNumericSubset <- tryCatch(is.numeric(subset), error = function(e) FALSE)
  if (isNumericSubset) {
    if (max(subset) > nobs(x)) {
      warning("At least one row index is greater than nobs(x). Input sento_measures object is returned.")
      return(x)
    }
    measuresNew <- data.table::as.data.table(x)[subset, ]
    if (nrow(measuresNew) == 0) {
      warning("No rows retained. Input sento_measures object is returned.")
      return(x)
    } else {
      x <- update_info(x, measuresNew) # subset update
    }
  } else {
    # capture the expression as text; spaces are stripped and "--" (the
    # separator inside measure names) is masked as "____" so the measure
    # names parse as single symbols instead of double subtractions
    sub <- as.character(substitute(list(subset))[-1L])
    if (length(sub) > 0 && sub != "NULL") {
      sub <- stringi::stri_replace_all(sub, "", regex = " ")
      sub <- stringi::stri_replace_all(sub, "____", regex = "--")
      measures <- data.table::as.data.table(x)
      colnames(measures) <- stringi::stri_replace_all(colnames(measures), "____", regex = "--") # -- is problematic here
      # NOTE(review): eval() in parent.frame(2) assumes a fixed call depth;
      # calling subset() through another wrapper may resolve names in the
      # wrong frame -- confirm
      measuresNew <- tryCatch(measures[eval(parse(text = sub), parent.frame(2))], error = function(e) NULL)
      if (is.null(measuresNew)) stop("The 'subset' argument must evaluate to logical.")
      colnames(measuresNew) <- stringi::stri_replace_all(colnames(measuresNew), "--", regex = "____")
      if (dim(measuresNew)[1] == 0) {
        warning("No rows selected in subset. Input sento_measures object is returned.")
        return(x)
      }
      x <- update_info(x, measuresNew) # subset update
    }
  }

  # select: keep only measures whose name contains any of the given
  # components (or, for a list, all components of at least one combination)
  if (!is.null(select)) {
    allOpts <- unlist(get_dimensions(x))
    valid <- unlist(select) %in% allOpts
    if (any(!valid)) {
      stop(paste0("Following components make up none of the sentiment measures: ",
                  paste0(unique(unlist(select)[!valid]), collapse = ', '), "."))
    }

    measures <- data.table::as.data.table(x)
    namesList <- stringi::stri_split(colnames(measures), regex = "--") # lexicon/feature/time parts per column
    if (is.list(select)) {
      ind <- rep(FALSE, length(namesList))
      for (com in select) {
        inds <- sapply(namesList, function(x) return(all(com %in% x)))
        ind[inds == TRUE] <- TRUE
      }
    } else ind <- sapply(namesList, function(x) return(any(select %in% x)))
    if (!any(ind[-1])) { # ind[1] corresponds to the date column
      warning("No appropriate combination for selection found. Input sento_measures object is returned.")
      return(x)
    }
    measuresNew <- measures[, c(TRUE, ind[-1]), with = FALSE] # always keep date
    x <- update_info(x, measuresNew) # select update
  }

  # delete: mirror image of select; drops the matched measures instead
  if (!is.null(delete)) {
    allOpts <- unlist(get_dimensions(x))
    valid <- unlist(delete) %in% allOpts
    if (any(!valid)) {
      stop(paste0("Following components make up none of the sentiment measures: ",
                  paste0(unique(unlist(delete)[!valid]), collapse = ', '), "."))
    }

    measures <- data.table::as.data.table(x)
    namesList <- stringi::stri_split(colnames(measures), regex = "--")
    if (is.list(delete)) {
      ind <- rep(FALSE, length(namesList))
      for (com in delete) {
        inds <- sapply(namesList, function(x) return(all(com %in% x)))
        ind[inds == TRUE] <- TRUE
      }
    } else ind <- sapply(namesList, function(x) return(any(delete %in% x)))
    if (all(ind[-1]) || all(!ind[-1])) { # refuse to delete everything or nothing
      warning("No appropriate combination found or all measures selected for deletion. Input sento_measures object is returned.")
      return(x)
    }
    measuresNew <- measures[, c(TRUE, !ind[-1]), with = FALSE]
    x <- update_info(x, measuresNew) # delete update
  }

  return(x)
}

#' Aggregate sentiment measures
#'
#' @author Samuel Borms
#'
#' @description Aggregates sentiment measures by combining across provided lexicons, features, and time weighting
#' schemes dimensions. For \code{do.global = FALSE}, the combination occurs by taking the mean of the relevant
#' measures. For \code{do.global = TRUE}, this function aggregates all sentiment measures into a weighted global textual
#' sentiment measure for each of the dimensions.
#'
#' @details If \code{do.global = TRUE}, the measures are constructed from weights that indicate the importance (and sign)
#' along each component from the \code{lexicons}, \code{features}, and \code{time} dimensions. There is no restriction in
#' terms of allowed weights. For example, the global index based on the supplied lexicon weights (\code{"globLex"}) is obtained
#' first by multiplying every sentiment measure with its corresponding weight (meaning, the weight given to the lexicon the
#' sentiment is computed with), then by taking the average per date.
#'
#' @param x a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param lexicons a \code{list} with unique lexicons to aggregate at given name, e.g., \cr
#' \code{list(lex12 = c("lex1", "lex2"))}. See \code{x$lexicons} for the exact names to use. Use \code{NULL}
#' (default) to apply no merging across this dimension. If \code{do.global = TRUE}, should be a \code{numeric} vector of
#' weights, of size \code{length(x$lexicons)}, in the same order. A value of \code{NULL} means equally weighted.
#' @param features a \code{list} with unique features to aggregate at given name, e.g., \cr
#' \code{list(feat12 = c("feat1", "feat2"))}. See \code{x$features} for the exact names to use. Use \code{NULL}
#' (default) to apply no merging across this dimension. If \code{do.global = TRUE}, should be a \code{numeric} vector of
#' weights, of size \code{length(x$features)}, in the same order. A value of \code{NULL} means equally weighted.
#' @param time a \code{list} with unique time weighting schemes to aggregate at given name, e.g., \cr
#' \code{list(tw12 = c("tw1", "tw2"))}. See \code{x$time} for the exact names to use. Use \code{NULL} (default)
#' to apply no merging across this dimension. If \code{do.global = TRUE}, should be a \code{numeric} vector of
#' weights, of size \code{length(x$time)}, in the same order. A value of \code{NULL} means equally weighted.
#' @param do.global a \code{logical} indicating if the sentiment measures should be aggregated into weighted
#' global sentiment indices.
#' @param do.keep a \code{logical} indicating if the original sentiment measures should be kept (i.e., the aggregated
#' sentiment measures will be added to the current sentiment measures as additional indices if \code{do.keep = TRUE}).
#' @param ... not used.
#'
#' @return If \code{do.global = FALSE}, a modified \code{sento_measures} object, with the aggregated sentiment
#' measures, including updated information and statistics, but the original sentiment scores \code{data.table}
#' untouched.
#'
#' If \code{do.global = TRUE}, a \code{data.table} with the different types of weighted global sentiment measures,
#' named \code{"globLex"}, \code{"globFeat"}, \code{"globTime"} and \code{"global"}, with \code{"date"} as the first
#' column. The last measure is an average of the three other measures.
#'
#' @examples
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#'
#' # construct a sento_measures object to start with
#' corpus <- sento_corpus(corpusdf = usnews)
#' corpusSample <- quanteda::corpus_sample(corpus, size = 500)
#' l <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")],
#'                     list_valence_shifters[["en"]])
#' ctr <- ctr_agg(howTime = c("equal_weight", "linear"),
#'                by = "year", lag = 3)
#' sento_measures <- sento_measures(corpusSample, l, ctr)
#'
#' # aggregation across specified components
#' smAgg <- aggregate(sento_measures,
#'                    time = list(W = c("equal_weight", "linear")),
#'                    features = list(journals = c("wsj", "wapo")),
#'                    do.keep = TRUE)
#'
#' # aggregation in full
#' dims <- get_dimensions(sento_measures)
#' smFull <- aggregate(sento_measures,
#'                     lexicons = list(L = dims[["lexicons"]]),
#'                     time = list(T = dims[["time"]]),
#'                     features = list(F = dims[["features"]]))
#'
#' # "global" aggregation
#' smGlobal <- aggregate(sento_measures, do.global = TRUE,
#'                       lexicons = c(0.3, 0.1),
#'                       features = c(1, -0.5, 0.3, 1.2),
#'                       time = NULL)
#'
#' \dontrun{
#' # aggregation won't work, but produces informative error message
#' aggregate(sento_measures,
#'           time = list(W = c("equal_weight", "almon1")),
#'           lexicons = list(LEX = c("LM_en")),
#'           features = list(journals = c("notInHere", "wapo")))}
#'
#' @export
aggregate.sento_measures <- function(x, features = NULL, lexicons = NULL, time = NULL,
                                     do.global = FALSE, do.keep = FALSE, ...) {
  stopifnot(is.logical(do.global))

  # global mode: the dimension arguments are numeric weight vectors and the
  # computation is delegated to agg_global()
  if (do.global == TRUE) {
    stopifnot(is.null(features) || is.numeric(features))
    stopifnot(is.null(lexicons) || is.numeric(lexicons))
    stopifnot(is.null(time) || is.numeric(time))
    measures <- agg_global(x, lexicons, features, time)
    if (do.keep == TRUE) measures <- cbind(measures, data.table::as.data.table(x)[, -1])
    return(measures)
  }

  # merging mode: the dimension arguments are lists mapping a new component
  # name to the existing component names whose measures should be averaged
  stopifnot(is.logical(do.keep))
  stopifnot(is.null(features) || is.list(features))
  stopifnot(is.null(lexicons) || is.list(lexicons))
  stopifnot(is.null(time) || is.list(time))

  check <- check_agg_dimensions(x, features = features, lexicons = lexicons, time = time) # check inputs
  if (check$stop == TRUE)
    stop(paste0(c("Wrong inputs.", check$msg1, check$msg2), collapse = " "))

  measures <- data.table::as.data.table(x)
  toAgg <- list(lexicons = lexicons, features = features, time = time)

  if (do.keep == TRUE) {
    measuresOld <- measures
    namesOld <- colnames(measures)
  }
  # loop over lexicons, features and time lists
  for (across in toAgg) {
    # loop over set of aggregation levels to combine into given name (e.g., lex12 = c("lex1", "lex2"))
    for (i in seq_along(across)) {
      name <- names(across)[i] # e.g. "lex12"
      cols <- across[[i]] # e.g. c("lex1", "lex2")
      # find all sentiment columns aggregated at one of the 'cols' aggregation levels and stack them into ls
      ls <- sels <- as.list(1:length(cols))
      names(ls) <- names(sels) <- cols
      for (elem in cols) {
        sel <- colnames(measures)[stringi::stri_detect(colnames(measures), regex = paste0("\\b", elem, "\\b"))] # exact match
        selMeas <- measures[, sel, with = FALSE, drop = FALSE]
        nms <- stringi::stri_split(colnames(selMeas), regex = "--")
        loc <- which(stringi::stri_detect(nms[[1]], regex = elem))[1]
        # rename the matched component to the new aggregate name so that
        # columns to merge share identical names across the ls elements
        nmsNew <- sapply(nms, function(x) {
          x[loc] <- name
          paste0(x, collapse = "--")
        })
        colnames(selMeas) <- nmsNew
        ls[[elem]] <- selMeas
        sels[[elem]] <- sel
      }
      # only columns whose renamed form appears under every aggregation
      # level can be averaged together
      common <- Reduce(intersect, lapply(ls, colnames))
      ls <- lapply(1:length(ls), function(k) {
        m <- ls[[k]]
        ind <- which(colnames(m) %in% common)
        measures <<- measures[, !sels[[k]][ind], with = FALSE, drop = FALSE] # drop columns to aggregate
        m[, ind, with = FALSE, drop = FALSE]
      })
      # take element-wise average for every row/column combination across columns to aggregate
      if (ncol(ls[[1]]) >= 2) { # ncol across elements of ls is the same
        # NOTE(review): ls[[2]] presumes at least two aggregation levels per
        # new name; a single-element 'cols' entry reaching this branch would
        # fail with a subscript error -- confirm
        all <- array(NA, dim = c(nrow(ls[[1]]), ncol(ls[[2]]), length(ls)))
        for (k in 1:length(ls)) all[, , k] <- as.matrix(ls[[k]])
        aggr <- apply(all, c(1, 2), mean, na.rm = TRUE)
        colnames(aggr) <- colnames(ls[[length(ls)]])
      } else {
        aggr <- as.matrix(rowMeans(do.call(cbind, ls)))
        colnames(aggr) <- colnames(ls[[length(ls)]])
      }
      measures <- cbind(measures, aggr) # add back aggregated columns
    }
  }
  # add old measures to aggregated measures (if do.keep is TRUE)
  if (do.keep == TRUE)
    measures <- cbind(measures, measuresOld[, !(namesOld %in% colnames(measures)), with = FALSE])

  sento_measures <- update_info(x, measures, aggs = toAgg) # update information in sento_measures object

  return(sento_measures)
}

agg_global <- function(sento_measures, lexicons = NULL, features = NULL, time = NULL) {
  # Computes weighted global sentiment indices per dimension (globLex,
  # globFeat, globTime) plus their average (global), one row per date.
  check_class(sento_measures, "sento_measures")

  dims <- get_dimensions(sento_measures)
  n <- lengths(dims) # number of components per dimension
  weightsInp <- list(features, lexicons, time)
  # lapply() (not sapply()) so the result is always a plain list of three
  # named weight lists; sapply() would simplify to a list matrix whenever the
  # three dimensions happen to have the same number of components, silently
  # breaking the weights[[i]][...] lookups below
  weights <- lapply(1:3, function(i) {
    if (is.null(weightsInp[[i]]))
      w <- as.list(rep(1/n[i], n[i])) # default of NULL means equal weighting
    else {
      w <- as.list(weightsInp[[i]])
      if (length(w) != n[i])
        stop("All weights must be equal in length to the respective number of components.")
    }
    names(w) <- dims[[i]] # named weight lists
    return(w)
  })

  # attach the component weights to every (date, measure) row, then average
  # the weighted values per date
  measuresLong <- data.table::as.data.table(sento_measures, format = "long")
  measuresLong[, "wFeat" := unlist(weights[[1]][measuresLong[["features"]]])] # weights features
  measuresLong[, "wLex" := unlist(weights[[2]][measuresLong[["lexicons"]]])] # weights lexicons
  measuresLong[, "wTime" := unlist(weights[[3]][measuresLong[["time"]]])] # weights time
  globs <- measuresLong[, list(globLex = mean(value * wLex),
                               globFeat = mean(value * wFeat),
                               globTime = mean(value * wTime)), by = date]
  globs[["global"]] <- rowMeans(globs[, -1])

  return(globs)
}

Try the sentometrics package in your browser

Any scripts or data that you put into this service are public.

sentometrics documentation built on Aug. 18, 2021, 9:06 a.m.