R/dictionaryStatistics.R

Defines functions dictionaryStatisticsSingle dictionaryStatistics loadFields

Documented in dictionaryStatistics dictionaryStatisticsSingle loadFields

#' @title Dictionary Handling
#' @description \code{loadFields()} loads dictionaries that are available on the web as plain text files.
#' @param fieldnames A list of names for the dictionaries. It is expected that files with that name can be found below the URL.
#' @param baseurl The base path delivering the dictionaries. Should end in a /, field names will be appended and fed into read.csv().
#' @param fileSuffix The suffix for the dictionary files
#' @param directory The last component of the base url. 
#' Useful to retrieve enriched word fields from metadata repo.
#' @param fileSep The file separator used to construct the URL
#' Can be overwritten to load local dictionaries.
#' @importFrom readr read_csv locale col_character
#' @section File Format:
#' Dictionary files should contain one word per line, with no comments 
#' or any other meta information. 
#' The entry name for the dictionary is given as the file name. 
#' It's therefore best if it does not contain
#' special characters. The dictionary must be in UTF-8 encoding, and the 
#' file needs to end on .txt.
#' @return A named list that holds the loaded dictionaries as character vectors.
#' @rdname dictionaryHandling
#' @export
#' @examples 
#' # retrieves word fields from github
#' fields <- loadFields(fieldnames=c("Liebe", "Familie", "Krieg"))
loadFields <- function(fieldnames=c("Liebe","Familie"),
                       baseurl=paste("https://raw.githubusercontent.com/quadrama/metadata/master",
                                     ensureSuffix(directory,fileSep), sep=fileSep),
                       directory="fields/",
                       fileSuffix=".txt",
                       fileSep = "/") {
  r <- list()
  for (field in fieldnames) {
    url <- paste(baseurl, field, fileSuffix, sep="")
    r[[field]] <- as.character((readr::read_csv(url, 
                                                col_names = FALSE, 
                                                locale = readr::locale(),
                                                col_types = c(readr::col_character())))$X1)
  }
  r
}

#' @name dictionaryStatistics
#' @title Dictionary Use
#' @description These methods retrieve 
#' count the number of occurrences of the words in the dictionaries, 
#' across different speakers and/or segments.
#' The function \code{dictionaryStatistics()} calculates statistics for 
#' dictionaries with multiple entries, \code{dictionaryStatisticsSingle()} only
#' for a single word list. 
#' @param drama A QDDrama object.
#' @param fieldnames A list of names for the dictionaries. 
#' @param fields A list of lists that contains the actual field names. 
#' By default, we load the \code{base_dictionary}.
#' @param normalizeByCharacter Logical. Whether to normalize by character 
#' speech length.
#' @param normalizeByField Logical. Whether to normalize by dictionary 
#' size. You usually want this.
#' @param column The table column we apply the dictionary on. 
#' Should be either "Token.surface" or "Token.lemma", the latter is the default.
#' @param ci Whether to ignore case. Defaults to TRUE, i.e., case is ignored.
#' @importFrom stats aggregate ave
#' @importFrom utils as.roman
#' @importFrom data.table as.data.table setcolorder
#' @seealso \code{\link{loadFields}} \code{\link{characterNames}}
#' @rdname dictionaryStatistics
#' @examples
#' # Check multiple dictionary entries
#' data(rksp.0)
#' dstat <- dictionaryStatistics(rksp.0, fieldnames=c("Krieg","Familie"))
#' @export
dictionaryStatistics <- function(drama, fields=DramaAnalysis::base_dictionary[fieldnames],
                                 fieldnames=c("Liebe"),
                                 segment=c("Drama","Act","Scene"),
                                 normalizeByCharacter = FALSE, 
                                 normalizeByField = FALSE, 
                                 byCharacter = TRUE,
                                 column="Token.lemma", 
                                 ci = TRUE) {
  stopifnot(inherits(drama, "QDDrama"))
  
  
  # we need this to prevent notes in R CMD check
  .N <- NULL
  . <- NULL
  corpus <- NULL
  Speaker.figure_surface <- NULL
  Speaker.figure_id <- NULL
  
  
  segment <- match.arg(segment)
  
  text <- switch(segment,
                 Drama=drama$text,
                 Act=segment(drama$text, drama$segments),
                 Scene=segment(drama$text, drama$segments))
  
  
  bylist <- list(text$corpus, text$drama, text$Speaker.figure_id)
  r <- aggregate(text, by=bylist, length)[,1:3]

  first <- TRUE
  singles <- lapply(names(fields),function(x) {
    dss <- as.data.table(dictionaryStatisticsSingle(drama, fields[[x]], ci=ci,
                                        segment = segment,
                                        byCharacter = byCharacter,
                                        normalizeByCharacter = normalizeByCharacter, 
                                        normalizeByField = normalizeByField, 
                                        column=column))
    colnames(dss)[ncol(dss)] <- x
    if (x == names(fields)[[1]]) {
      if (segment=="Scene") {
        u <- unique(text[,c("begin.Scene","Number.Act", "Number.Scene")])
        
        dss <- merge(dss, u, 
                     by.x=c("Number.Act", "Number.Scene"),
                     by.y=c("Number.Act", "Number.Scene"))
        dss$begin.Scene <- NULL
        data.table::setcolorder(dss, c("corpus","drama","Number.Act","Number.Scene","character",x))
      }
      dss
    } else {
      dss[,x,with=FALSE]
    }
  })
  r <- Reduce(cbind,singles)
  class(r) <- c("QDDictionaryStatistics", "QDHasCharacter", switch(segment, 
                                                Drama = "QDByDrama",
                                                Act   = "QDByAct",
                                                Scene ="QDByScene"), "data.frame")
  if (byCharacter) 
    class(r) <- append(class(r), "QDByCharacter")
  
  r
}


#' @param wordfield A character vector containing the words or lemmas 
#' to be counted (only for \code{*Single}-functions)
#' @param fieldNormalizer Defaults to the length of the wordfield. 
#' If normalizeByField is given, the absolute numbers are divided 
#' by this number.
#' @param segment The segment level that should be used. By default, 
#' the entire play will be used. Possible values are "Drama" (default), 
#' "Act" or "Scene".
#' @param colnames The column names to be used in the output table.
#' @param byCharacter Logical, defaults to TRUE. If false, values will be calculated
#' for the entire segment (play, act, or scene), and not for individual characters.
#' @examples
#' # Check a single dictionary entries
#' data(rksp.0)
#' fstat <- dictionaryStatisticsSingle(rksp.0, wordfield=c("der"))
#' @importFrom stats aggregate
#' @importFrom stats na.omit
#' @importFrom reshape2 melt
#' @importFrom stats as.formula
#' @rdname dictionaryStatistics
#' @export
dictionaryStatisticsSingle <- function(drama, wordfield=c(), 
                                       segment=c("Drama","Act","Scene"),
                                       normalizeByCharacter = FALSE, 
                                       normalizeByField = FALSE, 
                                       byCharacter = TRUE,
                                       fieldNormalizer = length(wordfield), 
                                       column="Token.lemma", 
                                       ci=TRUE,
                                       colnames=NULL)
  {
  stopifnot(inherits(drama, "QDDrama"))
  
  # we need this to prevent notes in R CMD check
  .N <- NULL
  . <- NULL
  .SD <- NULL
  `:=` <- NULL
  N <- NULL
  value <- NULL

  segment <- match.arg(segment)
  
  text <- switch(segment,
                 Drama=drama$text,
                 Act=segment(drama$text, drama$segments),
                 Scene=segment(drama$text, drama$segments))
  
  bycolumns <- c("corpus",
                 switch(segment,
                        Drama=c("drama"),
                        Act=c("drama","Number.Act"),
                        Scene=c("drama","Number.Act","Number.Scene"))
                 )
  #if (byFigure == TRUE) {
  #  bycolumns <- c(bycolumns, "Speaker.figure_id")
  #}
  bylist <- paste(bycolumns,collapse=",")
  if (ci) {
    wordfield <- tolower(wordfield)
    casing <- tolower
  } else {
    casing <- identity
  }
  if (normalizeByField == FALSE) {
    fieldNormalizer <- 1
  }
  
  dt <- data.table(text)
  dt$match <- casing(dt[[column]]) %in% wordfield
  
  # counting
  # xt <- dt[,.(x=sum(match)),keyby=bylist]
  if (byCharacter == TRUE) {
    xt <- dt[,melt(xtabs(~ Speaker.figure_id, data=.SD[match])), 
             keyby=bylist]
  } else {
    xt <- dt[,.(value=sum(match)), keyby=bylist]
  }
  
  if(normalizeByField || normalizeByCharacter) {
    xt$value <- as.double(xt$value)
  }
  
  # remove combinations of character and play that don't exist
  if (byCharacter == TRUE) {
    xt <- unique(merge(xt, drama$characters, 
                by.x = c("corpus","drama","Speaker.figure_id"), 
                by.y = c("corpus","drama","figure_id"))[,names(xt), with=F])
  }
  # xt$match <- NULL
  
  if (normalizeByCharacter == TRUE) {
    if (byCharacter == TRUE) {
      bycolumns <- append(bycolumns,"Speaker.figure_id")
    }
    bylist <- paste(bycolumns, collapse=",")
    xt <- merge(xt, dt[,.N,keyby=bylist], 
          by.x = bycolumns,
          by.y = bycolumns)
    xt[,value:=((value/fieldNormalizer)/N), keyby=bylist]
    xt <- xt[,-"N"]
  } else {
    xt$value <- as.double(xt$value) / fieldNormalizer
  }
  
  r <- xt
  colnames(r)[ncol(r)] <- "x"
  if (byCharacter) {
    colnames(r)[ncol(r)-1] <- "character"
  }
  if (! is.null(colnames)) {
    colnames(r) <- colnames
  }
  
  r[is.nan(r$x)]$x <- 0
  class(r) <- c("QDDictionaryStatistics", "QDHasCharacter", 
                switch(segment, 
                       Drama = "QDByDrama",
                       Act   = "QDByAct",
                       Scene ="QDByScene"), "data.frame", class(r))
  if (byCharacter) {
    class(r) <- append(class(r), "QDByCharacter")
    r$character <- as.factor(r$character)
  }
  r
}




#' @description The function \code{filterByDictionary()} can be used to filter a matrix as produced by 
#' \code{frequencytable()} by the words in the given dictionary(/-ies).
#' @param ft A matrix as produced by \code{frequencytable()}.
#' @param fieldnames A list of names for the dictionaries.
#' @param fields A list of lists that contains the actual field names. 
#' By default, we load the base_dictionary (as in \code{dictionaryStatistics()}).
#' @export
#' @rdname frequencyTable
#' @examples
#' data(rksp.0)
#' ftable <- frequencytable(rksp.0, 
#'                          byCharacter = TRUE)
#'                                               
#' filtered <- filterByDictionary(ftable, 
#'                                fieldnames=c("Krieg", "Familie"))
#' 

filterByDictionary <- function(ft, 
                           fields=DramaAnalysis::base_dictionary[fieldnames],
                           fieldnames=c("Liebe")) {
  as.matrix(ft[,which(colnames(ft) %in% unlist(fields))])
}

#' @export
#' @rdname dictionaryStatistics
#' @description Extract the number part from a 
#' \code{QDDictionaryStatistics} table as a matrix 
#' @param x An object of the type \code{QDDictionaryStatistics}, 
#' e.g., the output of \code{dictionaryStatistics}.
#' @param ... All other parameters are passed to \code{as.matrix.data.frame()}.
#' @return A numeric matrix that contains the frequency with which 
#' a dictionary is present in a subset of tokens
#' @examples
#' mat <- as.matrix(dictionaryStatistics(rksp.0, fieldnames=c("Krieg","Familie")))
as.matrix.QDDictionaryStatistics <- function (x, ...) {
  stopifnot(inherits(x, "QDDictionaryStatistics"))
  
  # check if there is a column for the character
  if (inherits(x, "QDByCharacter")) {
    byCharacter <- TRUE
  } else {
    byCharacter <- FALSE
  }
  
  # check how many segment columns there are
  if (inherits(x, "QDByDrama")) {
    segment <- "Drama"
    metaCols <- 1:(3+byCharacter)
  } else if (inherits(x, "QDByAct")) {
    segment <- "Act"
    metaCols <- 1:(4+byCharacter)
  } else if (inherits(x, "QDByScene")) {
    segment <- "Scene"
    metaCols <- 1:(5+byCharacter)
  }
  
  as.matrix.data.frame(x[,max(metaCols):ncol(x)])
}
quadrama/DramaAnalysis documentation built on Sept. 28, 2020, 8:42 p.m.