# utterances_to_nsdf <-
# function(utterances,
# caseAttributes = NULL,
# codeRegex = "\\[\\[([a-zA-Z0-9._-]+)\\]\\]",
# sidRegex = "\\[\\[sid=([a-zA-Z0-9._-]+)\\]\\]",
# cidRegex = "\\[\\[cid=([a-zA-Z0-9._-]+)\\]\\]",
# paragraphBreakRegEx = "---paragraph-break---",
# mergeStanzas = TRUE) {
#
# ### Identify paragraph breaks
# pBreaks <- grepl(paragraphBreakRegEx,
# utterances);
#
# ### Convert to paragraph numbers
# paragraphs <- purrr::accumulate(pBreaks, `+`);
#
# ### Remove paragraph breaks from the list of
# ### utterances and from the list of paragraph
# ### numbers
# utterances <- utterances[!pBreaks];
# paragraphs <- paragraphs[!pBreaks];
#
# ### Add index for original utterance order
# originalOrder <- seq_along(utterances);
#
# ### Process stanza identifiers
#
# sids <-
# regmatches(utterances,
# gregexpr(sidRegex, utterances));
#
# multipleSids <-
# which(unlist(lapply(sids, length))>1);
# if (length(multipleSids) > 0) {
# warning(paste0("Multiple stanza identifiers found in the following utterances:\n",
# paste0(utterances[multipleSids],
# collapse="\n"),
# "\n\nOnly using the first stanza identifier for each stanza, removing and ignoring the rest!"));
# sids <-
# lapply(sids, head, 1);
# }
#
# ### Clean stanza identifiers (i.e. only retain identifier)
# sids <-
# lapply(sids, gsub, pattern=sidRegex, replacement="\\1");
#
# ### Set NA as identifier for stanzas without identifier
# sids <-
# ifelse(unlist(lapply(sids,
# length)),
# sids,
# NA);
#
# ### Convert from a list to a vector
# sids <- unlist(sids);
#
# ### Generate identifiers for stanzas without identifier
# sids[is.na(sids)] <-
# paste0("autogenerated_sid_",
# 1:(sum(is.na(sids))));
#
# ### Process case identifiers
#
# cids <-
# regmatches(utterances,
# gregexpr(cidRegex, utterances));
#
# multipleCids <-
# which(unlist(lapply(cids, length))>1);
# if (length(multipleCids) > 0) {
# warning(paste0("Multiple case identifiers found in the following utterances:\n",
# paste0(utterances[multipleCids],
# collapse="\n"),
# "\n\nOnly using the first case identifier for each stanza, ignoring the rest!"));
# cids <-
# lapply(cids, head, 1);
# }
#
# ### Clean case identifiers (i.e. only retain identifier)
# cids <-
# lapply(cids, gsub, pattern=cidRegex, replacement="\\1");
#
# ### Set "no_cid" as case for stanzas without case
# cids <-
# ifelse(unlist(lapply(cids,
# length)),
# cids,
# "no_cid");
#
# ### Convert from a list to a vector
# cids <- unlist(cids);
#
# ### Delete stanza and case identifiers
# utterances <-
# gsub(paste0(sidRegex, "|", cidRegex),
# "",
# utterances);
#
# ### Bind cids and sids into a dataframe
# res1 <- data.frame(order = originalOrder,
# cid = as.character(cids),
# sid = as.character(sids),
# paragraph = paragraphs,
# stringsAsFactors=FALSE);
#
# ### Process codes
#
# ### Find matches
# matches <-
# regmatches(utterances,
# gregexpr(codeRegex, utterances));
# ### Retain only the parenthesized expression
# cleanedMatches <-
# lapply(matches, gsub, pattern=codeRegex, replacement="\\1");
# ### Get a complete list of all used codes
# uniqueCodes <-
# sort(unique(unlist(cleanedMatches)));
# ### Get presence of codes in utterances
# occurrences <-
# lapply(cleanedMatches,
# `%in%`,
# x=uniqueCodes);
# ### Convert from logical to numeric
# occurrenceCounts <-
# lapply(occurrences, as.numeric);
# ### Add the codes as names
# namedOccurrences <-
# lapply(occurrenceCounts,
# `names<-`,
# value <- uniqueCodes);
# ### Convert from a vector to a list
# namedOccurrences <-
# lapply(namedOccurrences,
# as.list);
# ### Convert the lists to dataframes
# occurrenceDfs <-
# lapply(namedOccurrences,
# as.data.frame,
# stringsAsFactors=FALSE);
# ### Bind the dataframes together as rows
# res2 <-
# do.call(rbind,
# occurrenceDfs);
#
# ### Delete codes from utterances
# utterances <-
# gsub(paste0(codeRegex),
# "",
# utterances);
#
# ### Trim spaces from front and back
# utterances <-
# trimws(utterances);
#
# if (!is.null(caseAttributes)) {
# tryCatch(parsedAttributes <- yaml::yaml.load(caseAttributes),
# error = function(e) {
# warning("Encountered error when trying to parse the case attributes!\n\n",
# "They are probably not specified in valid YAML. Please see the help ",
# "file using `?utterance_to_df` (sans backticks). The error generated ",
# "by the `yaml::load.yaml` function was: '",
# e$message,
# "'.\n");
# });
#
# names(parsedAttributes) <-
# purrr::map(parsedAttributes,
# 'cid');
#
# unspecifiedCases <-
# setdiff(cids, names(parsedAttributes));
#
# emptyAttributes <-
# as.list(paste0("no_", names(parsedAttributes[[1]])));
# names(emptyAttributes) <- names(parsedAttributes[[1]]);
#
# for (i in unspecifiedCases) {
# parsedAttributes[[i]] <-
# emptyAttributes;
# parsedAttributes[[i]]$cid <- i;
# }
#
# parsedAttributes <-
# lapply(parsedAttributes,
# function(x) { x$cid <- as.character(x$cid);
# return(x);});
#
# parsedAttributes <-
# dplyr::bind_rows(purrr::map(parsedAttributes,
# as.data.frame,
# stringsAsFactors=FALSE));
#
# res1 <-
# dplyr::left_join(res1,
# parsedAttributes,
# by='cid');
#
# }
#
# ### Add stanza identifiers and case identifiers
# res <-
# cbind(res1,
# data.frame(utterance = utterances),
# res2);
#
# if (!is.null(parsedAttributes)) {
# attr(res, 'attributeColumns') <-
# setdiff(names(parsedAttributes), 'cid');
# }
#
# ### Return the results
# return(res);
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.