#' @title Load drama
#' @description This function parses and loads one or more dramas in raw TEI format.
#' @param filename The filename of the drama to load (or a list thereof).
#' @import xml2
#' @importFrom data.table data.table
#' @importFrom tokenizers tokenize_words
#' @exportClass QDDrama
#' @return The function returns an object of class \code{QDDrama}.
#' @export
loadDramaTEI <- function(filename) {
# recursive loading for multiple dramas
if (is.list(filename)) {
drama_list <- lapply(filename, loadDramaTEI)
drama <- drama_list[[1]]
for (i in 2:length(drama_list)) {
drama <- combine(drama, drama_list[[i]])
}
} else {
raw_tei <- xml2::read_xml(filename) # read xml file
nsp <- xml2::xml_ns_rename(xml2::xml_ns(raw_tei), d1="tei") # set tei-namespace
id <- gsub("([^.]+).xml", "\\1", filename)
corpus <- "tei"
# create QDDrama object and its elements
drama <- list()
text_segments_stage <- parseTEI(raw_tei, nsp, id, corpus)
drama$text <- text_segments_stage$text
drama$segments <- text_segments_stage$segments
drama$characters <- loadCharactersTEI(raw_tei, nsp, corpus, id)
drama$meta <- loadMetaTEI(raw_tei, nsp, corpus, id)
drama$mentions <- loadMentionsTEI()
drama$stageDirections <- text_segments_stage$stage
# add classes to list objects
class(drama$text) <- append("QDHasUtteranceBE", class(drama$text))
class(drama$segments) <- append("QDHasSegments", class(drama$segments))
class(drama$mentions) <- append("QDHasUtteranceBE", class(drama$mentions))
class(drama$stageDirections) <- append("QDHasUtteranceBE", class(drama$stageDirections))
class(drama) <- append("QDDrama", class(drama))
}
drama
}
# This function parses the xml object iteratively, constructs the text,
# segments and stageDirections data.table objects row by row as a list
# and then converts them into the right format.
# internal
parseTEI <- function(raw_tei, nsp, id, corpus) {
# initialize lists of text, segments and stageDirections
text_l <- list()
segments_l <- list()
stage_l <- list()
# intialize count variables
d_length <- 0
act_counter <- 0
scene_counter <- 0
position <- 1
no_scenes <- FALSE
# subset of raw_tei that is relevant: act-divs, scene-divs, sp-elements, stage-elements outside sp
relevant_tei <- xml2::xml_find_all(raw_tei,
"//tei:div[@type='act']|
//tei:div[@type='scene']|
//tei:sp|
//tei:div/tei:stage",
ns=nsp)
# tei-parsing
for (elem in relevant_tei) {
# new act
if (identical(xml2::xml_attr(elem, "type", ns = nsp), "act")) {
if (act_counter > 0 && scene_counter == 0) { # if there are no scenes
segments_l <- writeSegmentRow(corpus, id, act_begin,
act_counter, 0, 0, scene_counter, segments_l)
no_scenes <- TRUE
} else if (scene_counter > 0) { # write last scene of last act
scene_end <- position - 1
segments_l <- writeSegmentRow(corpus, id, act_begin, act_counter,
scene_begin, scene_end, scene_counter, segments_l)
}
# update count variables for act and scene
act_begin <- position
act_counter <- act_counter + 1
scene_counter <- 0
# new scene
} else if (identical(xml2::xml_attr(elem, "type", ns = nsp), "scene")) {
scene_end <- position - 1
if (act_counter == 0) { # if there are no acts
act_begin <- 0
}
if (scene_counter > 0) { # if there are scenes
segments_l <- writeSegmentRow(corpus, id, act_begin, act_counter,
scene_begin, scene_end, scene_counter, segments_l)
}
# update count variables for scenes
scene_counter <- scene_counter + 1
scene_begin <- position
# new stage direction without speaker
} else if (identical(xml2::xml_name(elem, ns = nsp), "tei:stage")) {
tokenized_st <- tokenizers::tokenize_words(xml2::xml_text(elem), lowercase = FALSE, strip_punct = FALSE)[[1]]
stage_l <- writeTextRow(corpus, id, tokenized_st, c("_Stage"), stage_l, position, elem, "_Stage")
d_length <- d_length + length(tokenized_st)
position <- position + nchar(xml2::xml_text(elem))
# speaker & speech
} else if (identical(xml2::xml_name(elem, ns = nsp), "tei:sp")) {
# get speaker ids
raw_ids <- strsplit(xml2::xml_attr(elem, "who", ns = nsp), "#")[[1]]
speaker_ids <- sapply(raw_ids, trimws)[2:length(raw_ids)]
# parse subelements of sp
for (sp_elem in xml2::xml_children(elem)) {
# parse speaker_surface
if (identical(xml2::xml_name(sp_elem, ns = nsp), "tei:speaker")) {
speaker_surface <- gsub("\\.", "", xml2::xml_text(sp_elem))
d_length <- d_length + length(tokenizers::tokenize_words(speaker_surface, lowercase = FALSE, strip_punct = FALSE)[[1]])
position <- position + nchar(xml2::xml_text(sp_elem))
# parse p-elements inside sp
} else if (identical(xml2::xml_name(sp_elem, ns = nsp), "tei:p")) {
# iterate over contents in p
for (p_elem in xml2::xml_contents(sp_elem)) {
# parse stage-element inside p
if (identical(xml2::xml_name(p_elem, ns = nsp), "tei:stage")) {
tokenized_st <- tokenizers::tokenize_words(xml2::xml_text(p_elem), lowercase = FALSE, strip_punct = FALSE)[[1]]
stage_l <- writeTextRow(corpus, id, tokenized_st, speaker_ids, stage_l, position, p_elem, speaker_surface)
d_length <- d_length + length(tokenized_st)
# parse text inside speech
} else {
tokenized_p <- tokenizers::tokenize_words(xml2::xml_text(p_elem), lowercase = FALSE, strip_punct = FALSE)[[1]]
text_l <- writeTextRow(corpus, id, tokenized_p, speaker_ids, text_l, position, sp_elem, speaker_surface)
d_length <- d_length + length(tokenized_p)
}
position <- position + nchar(xml2::xml_text(p_elem))
}
# parse lg-element inside sp
} else if (identical(xml2::xml_name(sp_elem, ns = nsp), "tei:lg")) {
temp_text <- paste(lapply(xml2::xml_children(sp_elem), xml2::xml_text), collapse=" ")
tokenized_lg <- tokenizers::tokenize_words(temp_text, lowercase = FALSE, strip_punct = FALSE)[[1]]
text_l <- writeTextRow(corpus, id, tokenized_lg, speaker_ids, text_l, position, sp_elem, speaker_surface)
d_length <- d_length + length(tokenized_lg)
position <- position + nchar(xml2::xml_text(sp_elem))
# parse stage-element inside sp
} else if (identical(xml2::xml_name(sp_elem, ns = nsp), "tei:stage")) { # stage-element inside sp-element
tokenized_st <- tokenizers::tokenize_words(xml2::xml_text(sp_elem), lowercase = FALSE, strip_punct = FALSE)[[1]]
stage_l <- writeTextRow(corpus, id, tokenized_st, speaker_ids, stage_l, position, sp_elem, speaker_surface)
d_length <- d_length + length(tokenized_st)
position <- position + nchar(xml2::xml_text(sp_elem))
# parse l-element inside sp
} else if (identical(xml2::xml_name(sp_elem, ns = nsp), "tei:l")) { # l-element inside sp-element
tokenized_l <- tokenizers::tokenize_words(xml2::xml_text(sp_elem), lowercase = FALSE, strip_punct = FALSE)[[1]]
text_l <- writeTextRow(corpus, id, tokenized_l, speaker_ids, text_l, position, sp_elem, speaker_surface)
d_length <- d_length + length(tokenized_l)
position <- position + nchar(xml2::xml_text(sp_elem))
}
}
}
}
# write last segments row after parsing
if (no_scenes) { # if there are no scenes
segments_l <- writeSegmentRow(corpus, id, act_begin,
act_counter, 0, 0, scene_counter, segments_l)
} else {
scene_end <- position - 1
segments_l <- writeSegmentRow(corpus, id, act_begin, act_counter,
scene_begin, scene_end, scene_counter, segments_l)
}
# convert text_l into data.table
dt_text <- data.table::data.table(as.data.frame(do.call(rbind, text_l)))
names(dt_text) <- c("corpus", "drama", "utteranceBegin", "utteranceEnd",
"Speaker.figure_surface", "Speaker.figure_id", "Token.surface")
dt_text$Token.pos <- NA
dt_text$Token.lemma <- NA
dt_text$length <- d_length
dt_text <- fixColumnType(dt_text)
dt_text$Speaker.figure_surface <- as.factor(dt_text$Speaker.figure_surface)
dt_text$Speaker.figure_id <- as.factor(dt_text$Speaker.figure_id)
# convert segments_l into data.table
dt_segments <- data.table::data.table(as.data.frame(do.call(rbind, segments_l)))
names(dt_segments) <- c("corpus", "drama", "begin.Act", "end.Act", "Number.Act",
"begin.Scene", "end.Scene", "Number.Scene")
act_begins <- unique(dt_segments$begin.Act)[-1]
# fill end.Act-columns of segments-object
if (act_counter == 1) { # if there is only one act
dt_segments$end.Act[] <- position - 1
} else if (act_counter > 1) { # if there is more than one act
for (i in 1:length(act_begins)) {
dt_segments$end.Act[dt_segments$Number.Act == i] <- act_begins[[i]] - 1
}
dt_segments$end.Act[is.na(dt_segments$end.Act)] <- position - 1
} else { # if there are no acts
dt_segments$end.Act[] <- position - 1
}
dt_segments <- fixColumnType(dt_segments)
# convert stage_l into data.table
dt_stage <- data.table::data.table(as.data.frame(do.call(rbind, stage_l)))
names(dt_stage) <- c("corpus", "drama", "utteranceBegin", "utteranceEnd",
"Speaker.figure_surface", "Speaker.figure_id", "Token.surface")
dt_stage$Token.pos <- NA
dt_stage$Token.lemma <- NA
dt_stage$length <- d_length
dt_stage <- fixColumnType(dt_stage)
dt_stage$Speaker.figure_surface <- as.factor(dt_stage$Speaker.figure_surface)
dt_stage$Speaker.figure_id <- as.factor(dt_stage$Speaker.figure_id)
# output named list of data.table objects
list("text" = dt_text, "segments" = dt_segments, "stage" = dt_stage)
}
# This helper function for \code{parseTEI()} adds a row to the input text list.
# internal
writeTextRow <- function(corpus, id, tokenized, speakers, text_l, position, elem, speaker_surface) {
for (token in tokenized) {
for (speaker in speakers) {
text_l <- append(text_l, list(list(corpus,
id,
position,
position + nchar(xml2::xml_text(elem)) - 1,
speaker_surface,
speaker,
token)))
}
}
text_l
}
# This helper function for \code{parseTEI()} adds a row to the input segments list.
# internal
writeSegmentRow <- function(corpus, id, act_begin, act_counter,
scene_begin, scene_end, scene_counter, segments_l) {
segments_l <- append(segments_l, list(list(corpus,
id,
act_begin,
NA, # act end
act_counter,
scene_begin,
scene_end,
scene_counter)))
}
# This helper function for \code{parseTEI()} fixes column types of the input data.table object.
# internal
fixColumnType <- function(dt) {
dt[] <- lapply(dt, function(column) {
if (is.character(column[[1]])) {
as.character(column)
} else if (is.numeric(column[[1]])){
as.numeric(column)
} else {
column
}
})
dt
}
# This function parses the raw input tei and creates the characters data.table
# from the listPerson-element.
# internal
loadCharactersTEI <- function(raw_tei, nsp, corpus, drama) {
# initalize data.table
dt_characters <- data.table::data.table(corpus=character(), drama=character(), figure_surface=character(),
figure_id=character(), Gender=character(), Age=numeric())
# find listPerson-element
listPerson <- xml2::xml_find_all(raw_tei, "//tei:listPerson/tei:person", ns = nsp)
# parse all characters in listPerson
for (elem in listPerson) {
dt_characters <- rbind(dt_characters, list(corpus, drama, xml2::xml_text(xml2::xml_children(elem)[[1]]), xml2::xml_attr(elem, "id"), xml2::xml_attr(elem, "sex"), xml2::xml_attr(elem, "age")))
}
# output data.table object
dt_characters
}
# This function parses the raw input tei and creates the meta data.table from
# the titleStmt-element and the bibl-element of type originalSource.
# internal
loadMetaTEI <- function (raw_tei, nsp, corpus, id) {
# find title, language, author and pnd in titleStmt
title <- xml2::xml_text(xml2::xml_find_first(raw_tei, "//tei:titleStmt/tei:title[@type='main']", ns = nsp))
language <- xml2::xml_attr(raw_tei, "lang")
author <- xml2::xml_text(xml2::xml_find_first(raw_tei, "//tei:titleStmt/tei:author", ns = nsp))
pnd <- gsub("pnd:", "", xml2::xml_attr(xml2::xml_find_first(raw_tei, "//tei:titleStmt/tei:author", ns = nsp), "key"))
# find dates from bibl-element of type originalSource
date_written <- fixDate(xml2::xml_attr(xml2::xml_find_first(raw_tei, "//tei:bibl[@type='originalSource']/tei:date[@type='written']", ns = nsp), "when"))
date_printed <- fixDate(xml2::xml_attr(xml2::xml_find_first(raw_tei, "//tei:bibl[@type='originalSource']/tei:date[@type='print']", ns = nsp), "when"))
date_premiere <- fixDate(xml2::xml_attr(xml2::xml_find_first(raw_tei, "//tei:bibl[@type='originalSource']/tei:date[@type='premiere']", ns = nsp), "when"))
date_translated <- NA # not applicable so far
# construct data.table
dt_meta <- data.table::data.table(corpus = corpus, drama = id, documentTitle = title,
language = language, Name = author, Pnd = pnd, Translator.Name = NA,
Translator.Pnd = NA, Date.Written = date_written, Date.Printed = date_printed,
Date.Premiere = date_premiere, Date.Translation = NA)
# output data.table object
dt_meta
}
# This helper function for \code{loadMetaTEI()} fixes the type of dates if available.
# internal
fixDate <- function(date) {
if (is.na(date)) {
NA
} else {
as.numeric(date)
}
}
# This function creates the mentions data.table. Since there are no linguistic
# annotations for the tei files (yet), the mentions table is left empty.
# internal
loadMentionsTEI <- function() {
dt_mentions <- data.table::data.table(corpus=character(), drama=character(), utteranceBegin=numeric(),
utteranceEnd=numeric(), utteranceSpeakerId=character(),
mentionBegin=numeric(), mentionEnd=numeric(),
mentionSurface=character(), entityId=character())
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.