R/keyword.R

Defines functions KeywordBasedFeatures

Documented in KeywordBasedFeatures

#' Keyword based features
#'
#' Computes keyword based features for each document: counts of uppercase
#' words (all and distinct), repeated marks, laughter, elongated words,
#' negation words and user mentions, plus a flag computed from the last
#' token of the document.
#'
#' @param tokens Token data.table containing ids for document (id),
#'   sentence (sid) and token (tid), token word and original word. The
#'   columns read directly by this function are \code{id}, \code{token}
#'   and \code{word}.
#' @param negations Negation word list; rows whose \code{word} appears in
#'   this list are counted as negation words.
#' @return A data.table keyed by \code{id} with one row per document id
#'   and the columns \code{uppercase.words},
#'   \code{uppercase.words.unique}, \code{repeated.marks},
#'   \code{laughter}, \code{elongated}, \code{negation.words},
#'   \code{user.mentions} and \code{end.exclamation}.
#' @seealso CleanNLPTokens
#' @export
KeywordBasedFeatures <- function(tokens, negations) {
  ids <- sort(unique(tokens$id))

  # Per-document counts. duplicated() is evaluated within each id group, so
  # uppercase.words.unique counts distinct uppercase tokens per document.
  res <- tokens[, list(uppercase.words=sum(IsUppercaseWord(token)),
                       uppercase.words.unique=sum(IsUppercaseWord(token) &
                                                  !duplicated(token)),
                       repeated.marks=sum(IsRepeatedMarks(word)),
                       laughter=sum(IsLaughter(word)),
                       elongated=sum(IsElongated(word)),
                       negation.words=sum(word %in% negations),
                       user.mentions=sum(IsUserMention(word))), by=id]
  # Join on the key explicitly with J(): a bare numeric vector in `i` is
  # interpreted by data.table as row numbers rather than key values, which
  # would select the wrong rows whenever document ids are numeric.
  res <- setkey(res, id)[J(ids)]

  # Put the last-token table in the same id order as `res` before binding
  # the columns side by side.
  last.token <- LastToken(tokens)[, list(id, word)]
  setkey(last.token, id)
  last.token <- last.token[J(ids)]
  res <- cbind(res, end.exclamation=IsExclamation(last.token$word))
  setkey(res, id)
  # setkey() returns its value invisibly; return the table explicitly so
  # the result prints when the function is called interactively.
  res
}

## TODO
## uppercase word ratios
## uppercase char ratios
## other ratios?

## EndExclamation <- function(tokens, col.name=NULL) {
##   res <- LastToken(tokens)[, list(id, N=IsExclamation(word))]
##   RenameColumn(res, col.name)
## }

## CountUppercaseWords <- function(tokens, col.name=NULL) {
##   res <- tokens[, list(N=sum(IsUppercaseWord(token))), by=id]
##   RenameColumn(res, col.name)
## }

## CountUserMentions <- function(tokens, col.name=NULL) {
##   res <- tokens[, list(N=sum(IsUserMention(word))), by=id]
##   RenameColumn(res, col.name)
## }

## CountRepeatedMarks <- function(tokens, col.name=NULL) {
##   res <- tokens[, list(N=sum(IsRepeatedMarks(word))), by=id]
##   RenameColumn(res, col.name)
## }
M3SOulu/TextFeatures documentation built on June 24, 2022, 4:56 p.m.