delete/CAN_BE_DELETED_utterances_to_nsdf.R

# utterances_to_nsdf <-
#   function(utterances,
#            caseAttributes = NULL,
#            codeRegex = "\\[\\[([a-zA-Z0-9._-]+)\\]\\]",
#            sidRegex = "\\[\\[sid=([a-zA-Z0-9._-]+)\\]\\]",
#            cidRegex = "\\[\\[cid=([a-zA-Z0-9._-]+)\\]\\]",
#            paragraphBreakRegEx = "---paragraph-break---",
#            mergeStanzas = TRUE) {
#
#     ### Identify paragraph breaks
#     pBreaks <- grepl(paragraphBreakRegEx,
#                      utterances);
#
#     ### Convert to paragraph numbers
#     paragraphs <- purrr::accumulate(pBreaks, `+`);
#
#     ### Remove paragraph breaks from the list of
#     ### utterances and from the list of paragraph
#     ### numbers
#     utterances <- utterances[!pBreaks];
#     paragraphs <- paragraphs[!pBreaks];
#
#     ### Add index for original utterance order
#     originalOrder <- seq_along(utterances);
#
#     ### Process stanza identifiers
#
#     sids <-
#       regmatches(utterances,
#                  gregexpr(sidRegex, utterances));
#
#     multipleSids <-
#       which(unlist(lapply(sids, length))>1);
#     if (length(multipleSids) > 0) {
#       warning(paste0("Multiple stanza identifiers found in the following utterances:\n",
#                      paste0(utterances[multipleSids],
#                             collapse="\n"),
#                      "\n\nOnly using the first stanza identifier for each stanza, removing and ignoring the rest!"));
#       sids <-
#         lapply(sids, head, 1);
#     }
#
#     ### Clean stanza identifiers (i.e. only retain identifier)
#     sids <-
#       lapply(sids, gsub, pattern=sidRegex, replacement="\\1");
#
#     ### Set NA as identifier for stanzas without identifier
#     sids <-
#       ifelse(unlist(lapply(sids,
#                            length)),
#              sids,
#              NA);
#
#     ### Convert from a list to a vector
#     sids <- unlist(sids);
#
#     ### Generate identifiers for stanzas without identifier
#     sids[is.na(sids)] <-
#       paste0("autogenerated_sid_",
#              1:(sum(is.na(sids))));
#
#     ### Process case identifiers
#
#     cids <-
#       regmatches(utterances,
#                  gregexpr(cidRegex, utterances));
#
#     multipleCids <-
#       which(unlist(lapply(cids, length))>1);
#     if (length(multipleCids) > 0) {
#       warning(paste0("Multiple case identifiers found in the following utterances:\n",
#                      paste0(utterances[multipleCids],
#                             collapse="\n"),
#                      "\n\nOnly using the first case identifier for each stanza, ignoring the rest!"));
#       cids <-
#         lapply(cids, head, 1);
#     }
#
#     ### Clean case identifiers (i.e. only retain identifier)
#     cids <-
#       lapply(cids, gsub, pattern=cidRegex, replacement="\\1");
#
#     ### Set "no_cid" as case for stanzas without case
#     cids <-
#       ifelse(unlist(lapply(cids,
#                            length)),
#              cids,
#              "no_cid");
#
#     ### Convert from a list to a vector
#     cids <- unlist(cids);
#
#     ### Delete stanza and case identifiers
#     utterances <-
#       gsub(paste0(sidRegex, "|", cidRegex),
#            "",
#            utterances);
#
#     ### Bind cids and sids into a dataframe
#     res1 <- data.frame(order = originalOrder,
#                        cid = as.character(cids),
#                        sid = as.character(sids),
#                        paragraph = paragraphs,
#                        stringsAsFactors=FALSE);
#
#     ### Process codes
#
#     ### Find matches
#     matches <-
#       regmatches(utterances,
#                  gregexpr(codeRegex, utterances));
#     ### Retain only the parenthesized expression
#     cleanedMatches <-
#       lapply(matches, gsub, pattern=codeRegex, replacement="\\1");
#     ### Get a complete list of all used codes
#     uniqueCodes <-
#       sort(unique(unlist(cleanedMatches)));
#     ### Get presence of codes in utterances
#     occurrences <-
#       lapply(cleanedMatches,
#              `%in%`,
#              x=uniqueCodes);
#     ### Convert from logical to numeric
#     occurrenceCounts <-
#       lapply(occurrences, as.numeric);
#     ### Add the codes as names
#     namedOccurrences <-
#       lapply(occurrenceCounts,
#              `names<-`,
#              value <- uniqueCodes);
#     ### Convert from a vector to a list
#     namedOccurrences <-
#       lapply(namedOccurrences,
#              as.list);
#     ### Convert the lists to dataframes
#     occurrenceDfs <-
#       lapply(namedOccurrences,
#              as.data.frame,
#              stringsAsFactors=FALSE);
#     ### Bind the dataframes together as rows
#     res2 <-
#       do.call(rbind,
#               occurrenceDfs);
#
#     ### Delete codes from utterances
#     utterances <-
#       gsub(paste0(codeRegex),
#            "",
#            utterances);
#
#     ### Trim spaces from front and back
#     utterances <-
#       trimws(utterances);
#
#     if (!is.null(caseAttributes)) {
#       tryCatch(parsedAttributes <- yaml::yaml.load(caseAttributes),
#                error = function(e) {
#                  warning("Encountered error when trying to parse the case attributes!\n\n",
#                          "They are probably not specified in valid YAML. Please see the help ",
#                          "file using `?utterance_to_df` (sans backticks). The error generated ",
#                          "by the `yaml::load.yaml` function was: '",
#                          e$message,
#                          "'.\n");
#                });
#
#       names(parsedAttributes) <-
#         purrr::map(parsedAttributes,
#                    'cid');
#
#       unspecifiedCases <-
#         setdiff(cids, names(parsedAttributes));
#
#       emptyAttributes <-
#         as.list(paste0("no_", names(parsedAttributes[[1]])));
#       names(emptyAttributes) <- names(parsedAttributes[[1]]);
#
#       for (i in unspecifiedCases) {
#         parsedAttributes[[i]] <-
#           emptyAttributes;
#         parsedAttributes[[i]]$cid <- i;
#       }
#
#       parsedAttributes <-
#         lapply(parsedAttributes,
#                function(x) { x$cid <- as.character(x$cid);
#                return(x);});
#
#       parsedAttributes <-
#         dplyr::bind_rows(purrr::map(parsedAttributes,
#                                     as.data.frame,
#                                     stringsAsFactors=FALSE));
#
#       res1 <-
#         dplyr::left_join(res1,
#                          parsedAttributes,
#                          by='cid');
#
#     }
#
#     ### Add stanza identifiers and case identifiers
#     res <-
#       cbind(res1,
#             data.frame(utterance = utterances),
#             res2);
#
#     if (!is.null(parsedAttributes)) {
#       attr(res, 'attributeColumns') <-
#         setdiff(names(parsedAttributes), 'cid');
#     }
#
#     ### Return the results
#     return(res);
#   }
gitlab-r-packages-mirror/rock documentation built on Dec. 3, 2024, 5:40 p.m.