R/text.R
In DramaAnalysis: Analysis of Dramatic Texts

Documented in characterNames combine dramaNames filterCharacters numberOfPlays split.QDDrama tfidf

#' @title QuaDramA colors
#' @description color scheme to be used for QuaDramA plots
#' Taken from http://google.github.io/palette.js/, tol-rainbow, 10 colors
#' @export
qd.colors <- c(rgb(120,28,129, maxColorValue = 255),
               rgb(67, 50, 141, maxColorValue = 255),
               rgb(65, 111, 184, maxColorValue = 255),
               rgb(81, 156, 184, maxColorValue = 255),
               rgb(112, 180, 132, maxColorValue = 255),
               rgb(153, 189, 92, maxColorValue = 255),
               rgb(195, 186, 69, maxColorValue = 255),
               rgb(224, 162, 57, maxColorValue = 255),
               rgb(230, 107, 45, maxColorValue = 255),
               rgb(217, 33, 32, maxColorValue = 255)
              );


#' @export
#' @title Format Names
#' @description The function \code{characterNames()} is applicable on 
#' all tables with a character table 
#' (that are of the class \code{QDHasCharacter}). It can be used to reformat the character 
#' names. The function \code{FUN} is applied to the character \emph{name} entries within 
#' the \code{QDDrama} object. The factor levels in the character column of \code{x} are replaced by 
#' the result values of \code{FUN}.
#' @param x The object in which we want to transform names, needs to inherit the type \code{QDHasCharacter}.
#' @param drama The QDDrama object with all the information.
#' @param sort Numeric. If set to a non-zero value, the resulting data.frame will be sorted 
#' alphabetically
#' according to the drama and character name. If the value is above 0, the 
#' sorting will be ascending, if set to a negative value, the sorting is 
#' descending. If sort is set to 0 (the default), the order is unchanged. 
#' The ordering can also be specified explicitly, by passing an integer vector 
#' with as many elements as \code{x} has rows.
#' @param FUN A function applied to the strings. 
#' Defaults to \code{stringr::str_to_title}, which
#' converts the strings to title case.
#' @param ... All other arguments are ignored.
#' @seealso \code{str_to_title}
#' @importFrom stringr str_to_title
#' @return The function returns \code{x}, but with modified character
#' names.
#' @examples 
#' data(rksp.0)
#' ustat <- utteranceStatistics(rksp.0)
#' ustat <- characterNames(ustat, rksp.0)
characterNames <- function(x, 
                                  drama, 
                                  FUN=stringr::str_to_title, 
                                  sort=0, 
                                  ...) {
  requireNamespace("stringr")
  stopifnot(inherits(x, "QDHasCharacter"))
  stopifnot(inherits(drama, "QDDrama"))
  
  positions <- match(levels(x$character), drama$characters$figure_id)
  levels(x$character) <- FUN(drama$characters$figure_surface[positions])

  if (length(sort) == nrow(x)) {
    x <- x[sort,]
  } else if (length(sort) == 1) {
    if (length(sort) == 1 && sort > 0) {
      x <- x[order(x$drama, x$character),]
    } else if (length(sort) == 1 && sort < 0) {
      x <- x[order(x$drama, x$character, decreasing=TRUE),]
    }
  }
    
  x

}

#' @title Format drama titles
#' @description Given a QDDrama object, this function generates a list of nicely 
#' formatted names, following the format string.
#' @param x The QDDrama object
#' @param ids If specified, should be a character vector of play ids (prefixed with corpus). 
#' Then the return value only contains the plays in the vector and in the order specified.
#' @param orderBy The meta data key that the final list will be ordered by
#' @param formatString A character vector. Contains special symbols that
#' are replaced by meta data entries about the plays. The following symbols can
#' be used:
#' - \%T: title of the play
#' - \%A: Author name 
#' - \%P: GND entry of the author (if known)
#' - \%DR, %DP, %DW: Date of premiere, print or written
#' - \%DM: The minimal date 
#' - \%L: The language
#' - \%I: The id
#' - \%C: The corpus prefix
#' @return Character vector of formatted drama names
#' @export
#' @importFrom stringr str_replace
dramaNames <- function(x, 
                       ids = NULL,
                       formatString = "%A: %T (%DM)", 
                       orderBy = "drama") {
  stopifnot(inherits(x, "QDDrama"))
  
  keys = list("%T"="documentTitle", 
              "%A"="Name",
              "%DR" = "Date.Premiere",
              "%DP" = "Date.Printed",
              "%DW" = "Date.Written",
              "%P" = "Pnd",
              "%I" = "drama",
              "%C" = "corpus",
              "%L" = "language")
  t <- rep(formatString, times=nrow(x$meta))
  for (key in names(keys)) {
    column <- x$meta[[keys[[key]]]]
    t <- stringr::str_replace(t, key, ifelse(is.na(column), 
                                             "NA",
                                             as.character(column)))
  }
  names(t) <- x$meta$drama
  
  suppressWarnings(column <- apply(x$meta[,c("Date.Printed", "Date.Written", "Date.Premiere")], 
                                   1, min, na.rm=TRUE))
  
  t <- stringr::str_replace(t, "%DM", ifelse(is.finite(column),
                                             as.character(column),
                                             "NA"))
  
  names(t) <- paste(x$meta$corpus, x$meta$drama, sep=":")
  
  if (is.null(ids)) {
    t[order(x$meta[[orderBy]])]
  } else {
    t[ids]
  }
}

#' @title Filter characters
#' @description This function can be used to filter characters from all tables 
#' that contain  a character column (and are of the class QDHasCharacter). 
#' @param hasCharacter The object we want to filter.
#' @param drama The QDDrama object.
#' @param by Character vector. Specifies the filter mechanism. 
#' @param n The threshold or a list of character names/ids to keep.
#' @details The function supports three filter mechanisms: The filter by 
#' \code{rank} sorts the characters according to the number of tokens they speak
#' and \emph{keeps} the top $n$ characters. The filter called \code{tokens} keeps 
#' all characters that speak $n$ or more tokens. The filter called \code{name} 
#' keeps the characters that are provided by name as a vector as \code{n}.
#' @return The filtered QDHasCharacter object
#' @export
#' @examples 
#' data(rjmw.0)
#' dstat <- dictionaryStatistics(rjmw.0)
#' filterCharacters(dstat, rjmw.0, by="tokens", n=1000)
#' 
#' 
filterCharacters <- function(hasCharacter, 
                   drama, 
                   by=c("rank", "tokens", "name"), 
                   n=ifelse(by=="tokens", 500, ifelse(by=="rank", 10, c()))) {
  
  # verify that objects have the correct types
  stopifnot(inherits(hasCharacter, "QDHasCharacter"))
  stopifnot(inherits(drama, "QDDrama"))
  
  # match argument
  by <- match.arg(by) 
  
  # retrieve character statistics
  charStat <- characterStatistics(drama)
  
  # by default, we keep everyone
  keep <- charStat$character
  if (by == "tokens") {
    keep <- as.character(charStat[charStat$tokens >= n,]$character)
  } else if (by == "rank") {
    keep <- as.character(charStat[order(charStat$tokens, decreasing = TRUE),]$character[1:n])
  } else if (by == "name") {
    keep <- n
  }

  # filter based on keep and return
  hasCharacter <- hasCharacter[hasCharacter$character %in% keep, ]
  
  # remove factor levels
  if (is.factor(hasCharacter$character)) {
    hasCharacter$character <- as.factor(as.character(hasCharacter$character))
  }
  hasCharacter
}



tfidf1 <- function(word) {
  docfreq <- sum(word>0)
  docfreq <- log((length(word)+1) / (sum(word>0)))
  if (docfreq==0)
    rep(0,length(word))
  else
    word*docfreq
}

#' @title TF-IDF
#' @description This function calculates a variant of TF-IDF. 
#' The input is assumed to contain relative frequencies.
#' IDF is calculated as follows: \eqn{idf_t = \log\frac{N+1}{n_t}}, with \eqn{N} being 
#' the total number of documents (i.e., rows) and \eqn{n_t} the number of documents
#' containing term \eqn{t}. We add one to the denominator to prevent terms that appear
#' in every document to become 0.
#' 
#' @param ftable A matrix, containing "documents" as rows and "terms" as columns. 
#' Values are assumed to be normalized by document, i.e., contain relative frequencies.
#' @export
#' @return A matrix containing TF*IDF values instead of relative frequencies.
#' @examples
#' data(rksp.0)
#' ftable <- frequencytable(rksp.0, byCharacter=TRUE, normalize=TRUE)
#' rksp.0.tfidf <- tfidf(ftable)
#' @examples
#' mat <- matrix(c(0.10,0.2, 0,
#'                 0,   0.2, 0,
#'                 0.1, 0.2, 0.1,
#'                 0.8, 0.4, 0.9),
#'               nrow=3,ncol=4)
#' mat2 <- tfidf(mat)
#' print(mat2)
tfidf <- function(ftable) {
  if (mean(apply(ftable,1,sum)) - 1 > 1e-5)
    stop("Matrix should contain relative frequencies")
  r <- apply(ftable, 2, tfidf1)
  rownames(r) <- rownames(ftable)
  r[is.na(r)] <- 0
  r
}


extractTopTerms <- function(mat, top=10) {
  r <- apply(mat, 1, function(x) { list(head(x[order(x, decreasing=TRUE)],n=top)) })
  lapply(r, unlist)
}







#' @export
#' @title Combine multiple plays
#' @description The function \code{combine(x, y)} can be used to merge 
#' multiple objects of the type \code{QDDrama} into one.
#' @param x A \code{QDDrama}
#' @param y A \code{QDDrama}
#' @return A single QDDrama object that represents both plays.
#' @examples 
#' 
#' data(rksp.0)
#' data(rjmw.0)
#' d <- combine(rjmw.0, rksp.0)
combine <- function(x, y) {
  stopifnot(inherits(x, "QDDrama"))
  stopifnot(inherits(y, "QDDrama"))

  r <- lapply(names(x), function(z) {
    t <- rbind(x[[z]], y[[z]])
    class(t) <- class(x[[z]])
    t
  })
  names(r) <- names(x)
  class(r) <- append("QDDrama", class(r))

  r
}

#' @export
#' @title Split multiple plays
#' @description The function \code{split(x)} expects an object of type \code{QDDrama} and can 
#' be used to split a \code{QDDrama} object that consists of multiple dramas 
#' into a list thereof. It is the counterpart to \code{combine(x, y)}.
#' @param x The object of class \code{QDDrama} (consisting of multiple dramas). 
#' For \code{split()} it should consist of multiple plays. For \code{combine()} it 
#' can but doesn't have to.
#' @param ... All other arguments are ignored.
#' @return Returns a list of individual QDDrama objects, each containing one text.
#' @examples 
#' data(rksp.0)
#' data(rjmw.0)
#' d <- combine(rjmw.0, rksp.0)
#' dlist <- split(d)
split.QDDrama <- function(x, ...) {
  stopifnot(inherits(x, "QDDrama"))
  r <- lapply(unique(x$characters$drama), function(y) {
    d <- lapply(x, function(z) {
      t <- z[z$drama == y]
      class(t) <- class(z)
      t
    })
    class(d) <- append("QDDrama", class(d))
    d
  })
  names(r) <- unique(x$characters$drama)

  r
}

#' @title Number of plays
#' @description The function \code{numberOfPlays()} determines how many
#' different plays are contained in a single QDDrama object.
#' @return An integer. The number of plays contained in the QDDrama object.
#' @param x The QDDrama object
#' @export
#' @examples 
#' # returns 1
#' numberOfPlays(rksp.0)
#' 
#' # returns 2
#' numberOfPlays(combine(rksp.0, rjmw.0))
numberOfPlays <- function(x) {
  stopifnot(inherits(x, "QDDrama"))
  
  if ("meta" %in% names(x) && "drama" %in% names(x$meta)) {
    length(unique(x$meta$drama))
  } else if ("text" %in% names(x) && "drama" %in% names(x$text)) {
    length(unique(x$text$drama))
  }
}



first <- function(x,n=0) {
  sort(x)[n]
}

last <- function(x, n=0) {
  len <- length(x)
  if (n > len) {
    return(NA)
  }
  sort(x,partial=len-(n-1))[len-(n-1)]
}

maxN <- function(x, N=2){
  len <- length(x)
  if(N>len){
    warning('N greater than length(x).  Setting N=length(x)')
    N <- length(x)
  }
  sort(x,partial=len-N+1)[(len-N+1):(len)]
}

Any scripts or data that you put into this service are public.

DramaAnalysis documentation built on Jan. 13, 2021, 10:02 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

DramaAnalysis
Analysis of Dramatic Texts

R/text.R
In DramaAnalysis: Analysis of Dramatic Texts

Defines functions maxN last first numberOfPlays split.QDDrama combine extractTopTerms tfidf tfidf1 filterCharacters dramaNames characterNames

Documented in characterNames combine dramaNames filterCharacters numberOfPlays split.QDDrama tfidf

Try the DramaAnalysis package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DramaAnalysis Analysis of Dramatic Texts

R/text.R In DramaAnalysis: Analysis of Dramatic Texts

Defines functions maxN last first numberOfPlays split.QDDrama combine extractTopTerms tfidf tfidf1 filterCharacters dramaNames characterNames

Documented in characterNames combine dramaNames filterCharacters numberOfPlays split.QDDrama tfidf

Try the DramaAnalysis package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DramaAnalysis
Analysis of Dramatic Texts

R/text.R
In DramaAnalysis: Analysis of Dramatic Texts