#' Parsing sources
#'
#' These function parse one (`parse_source`) or more (`parse_sources`) sources and the
#' contained identifiers, sections, and codes.
#'
#' @param text,file As `text` or `file`, you can specify a `file` to read with
#' encoding `encoding`, which will then be read using [base::readLines()]. If the
#' argument is named `text`, whether it is the path to an existing file is checked
#' first, and if it is, that file is read. If the argument is named `file`, and it
#' does not point to an existing file, an error is produced (useful if calling
#' from other functions). A `text` should be a character vector where every
#' element is a line of the original source (like provided by [base::readLines()]);
#' although if a character vector of one element *and* including at least one
#' newline character (`\\n`) is provided as `text`, it is split at the newline
#' characters using [base::strsplit()]. Basically, this behavior means that the
#' first argument can be either a character vector or the path to a file; and if
#' you're specifying a file and you want to be certain that an error is thrown if
#' it doesn't exist, make sure to name it `file`.
#' @param path The path containing the files to read.
#' @param extension The extension of the files to read; files with other extensions will
#' be ignored. Multiple extensions can be separated by a pipe (`|`).
#' @param regex Instead of specifing an extension, it's also possible to specify a regular
#' expression; only files matching this regular expression are read. If specified, `regex`
#' takes precedece over `extension`,
#' @param recursive Whether to also process subdirectories (`TRUE`)
#' or not (`FALSE`).
#' @param codeRegexes,idRegexes,sectionRegexes These are named character vectors with one
#' or more regular expressions. For `codeRegexes`, these specify how to extract the codes
#' (that were used to code the sources). For `idRegexes`, these specify how to extract the
#' different types of identifiers. For `sectionRegexes`, these specify how to extract the
#' different types of sections. The `codeRegexes` and `idRegexes` must each contain one
#' capturing group to capture the codes and identifiers, respectively.
#' @param autoGenerateIds The names of the `idRegexes` that, if missing, should receive
#' autogenerated identifiers (which consist of 'autogenerated_' followed by an incrementing
#' number).
#' @param persistentIds The names of the `idRegexes` for the identifiers which, once
#' attached to an utterance, should be attached to all following utterances as well (until
#' a new identifier with the same name is encountered, after which that identifier will be
#' attached to all following utterances, etc).
#' @param inductiveCodingHierarchyMarker For inductive coding, this marker is used to indicate
#' hierarchical relationships between codes. The code at the left hand side of this marker will
#' be considered the parent code of the code on the right hand side. More than two levels
#' can be specified in one code (for example, if the `inductiveCodingHierarchyMarker` is '>',
#' the code `grandparent>child>grandchild` would indicate codes at three levels.
#' @param metadataContainers The name of YAML fragments containing metadata (i.e. attributes
#' about cases).
#' @param codesContainers The name of YAML fragments containing (parts of) deductive coding
#' trees.
#' @param delimiterRegEx The regular expression that is used to extract the YAML fragments.
#' @param ignoreRegex The regular expression that is used to delete lines before any other
#' processing. This can be used to enable adding comments to sources, which are then ignored
#' during analysis.
#' @param ignoreOddDelimiters If an odd number of YAML delimiters is encountered, whether this
#' should result in an error (`FALSE`) or just be silently ignored (`TRUE`).
#' @param encoding The encoding of the file to read (in `file`).
#' @param silent Whether to provide (`FALSE`) or suppress (`TRUE`) more detailed progress updates.
#' @param x The object to print.
#' @param prefix The prefix to use before the 'headings' of the printed result.
#' @param ... Any additional arguments are passed on to the default print method.
#'
#' @aliases parse_source parse_sources print.rockParsedSource
#' @rdname parsing_sources
#' @export
parse_source <- function(text,
file,
codeRegexes = c(code = "\\[\\[([a-zA-Z0-9._>-]+)\\]\\]"),
idRegexes = c(caseId = "\\[\\[cid=([a-zA-Z0-9._-]+)\\]\\]",
stanzaId = "\\[\\[sid=([a-zA-Z0-9._-]+)\\]\\]"),
sectionRegexes = c(paragraphs = "---paragraph-break---",
secondary = "---<[a-zA-Z0-9]?>---"),
autoGenerateIds = c('stanzaId'),
persistentIds = c('caseId'),
inductiveCodingHierarchyMarker = ">",
metadataContainers = c("metadata"),
codesContainers = c("codes", "dct"),
delimiterRegEx = "^---$",
ignoreRegex = "^#",
ignoreOddDelimiters=FALSE,
encoding="UTF-8",
silent=FALSE) {
if (missing(file)) {
if (missing(text)) {
stop("Provide either a `file` or a `text` to scan!");
} else {
if ((length(text) == 1) && file.exists(text)) {
x <- readLines(text,
encoding=encoding,
warn=FALSE);
} else {
x <- text;
if ((length(x) == 1) && grepl('\n', x)) {
x <-
strsplit(x,
"\n")[[1]];
}
}
}
} else {
if (file.exists(file)) {
x <- readLines(file,
encoding=encoding,
warn=FALSE);
} else {
stop("The file you specified in argument `file` ('",
paste0(file, collapse=" "),
"') does not exist. If you meant to provide a text ",
"to process, please use argument `text`");
}
}
arguments <- as.list(environment());
### First process YAML fragments and remove them
yamlFragments <-
yum::extract_yaml_fragments(text=x,
delimiterRegEx=delimiterRegEx,
ignoreOddDelimiters=ignoreOddDelimiters);
x <-
yum::delete_yaml_fragments(text=x,
delimiterRegEx=delimiterRegEx,
ignoreOddDelimiters=ignoreOddDelimiters);
### Then remove lines to ignore
linesToIgnore <- grepl(ignoreRegex,
x);
ignoredLines <- x[linesToIgnore];
x <- x[!linesToIgnore];
### Create dataframe for parsing
sourceDf <- data.frame(utterances_raw = x,
stringsAsFactors=FALSE);
### Identify sections
if (!is.null(sectionRegexes) && length(sectionRegexes) > 0) {
for (sectionRegex in names(sectionRegexes)) {
### Store whether each utterance matches
sourceDf[, glue::glue("{sectionRegex}_match")] <-
grepl(sectionRegexes[sectionRegex], x);
### Set incremental counter for each match
sourceDf[, glue::glue("{sectionRegex}_counter")] <-
purrr::accumulate(sourceDf[, glue::glue("{sectionRegex}_match")],
`+`);
}
}
### Process identifiers
if (!is.null(idRegexes) && length(idRegexes) > 0) {
for (idRegex in names(idRegexes)) {
### Get a list of matches
ids <-
regmatches(x,
gregexpr(idRegexes[idRegex], x));
### Check whether there are multiple matches
multipleIds <-
which(unlist(lapply(ids, length))>1);
if (length(multipleIds) > 0) {
warning(glue::glue("Multiple identifiers matching '{idRegex}' found in the following utterances:\n",
paste0(x[multipleIds],
collapse="\n"),
"\n\nOnly using the first identifier for each utterance, removing and ignoring the rest!"));
ids <-
lapply(ids, utils::head, 1);
}
### Clean identifiers (i.e. only retain identifier content itself)
ids <-
lapply(ids, gsub, pattern=idRegexes[idRegex], replacement="\\1");
### Set "no_id" for utterances without id
ids <-
ifelse(unlist(lapply(ids,
length)),
ids,
"no_id");
### Convert from a list to a vector
ids <- unlist(ids);
if (length(ids) > 1) {
### Implement 'identifier persistence' by copying the
### identifier of the previous utterance if the identifier
### is not set - can't be done using vectorization as identifiers
### have to carry over sequentially.
if (idRegex %in% persistentIds) {
rawIds <- ids;
for (i in 2:length(ids)) {
if ((ids[i] == "no_id")) {
ids[i] <- ids[i-1];
}
}
}
} else {
ids = "no_id";
}
### Check whether any matches were found
if (!(all(ids=="no_id"))) {
### Generate identifiers for ids without identifier
if (idRegex %in% autoGenerateIds) {
ids[ids=="no_id"] <-
paste0("autogenerated_id_",
1:(sum(ids=="no_id")));
}
### Store identifiers in sourceDf
sourceDf[, idRegex] <-
ids;
if (idRegex %in% persistentIds) {
sourceDf[, paste0(idRegex, "_raw")] <-
rawIds;
}
}
}
}
### Delete identifiers and store clean version in sourceDf
x <-
gsub(paste0(idRegexes, collapse="|"),
"",
x);
sourceDf$utterances_without_identifiers <- x;
codings <- list();
codingLeaves <- list();
inductiveCodeProcessing <- list();
inductiveCodeTrees <- list();
inductiveDiagrammeR <- list();
### Process codes
if (!is.null(codeRegexes) && length(codeRegexes) > 0) {
for (codeRegex in names(codeRegexes)) {
### Find matches
matches <-
regmatches(x,
gregexpr(codeRegexes[codeRegex], x));
### Retain only the parenthesized expression
cleanedMatches <-
lapply(matches, gsub, pattern=codeRegexes[codeRegex], replacement="\\1");
### Get a complete list of all used codes
codings[[codeRegex]] <-
sort(unique(unlist(cleanedMatches)));
### Split these unique codes into levels
if ((nchar(inductiveCodingHierarchyMarker) > 0) &&
(!is.null(codings[[codeRegex]])) &&
(length(codings[[codeRegex]]) > 0)) {
inductiveCodeProcessing[[codeRegex]] <- list();
inductiveCodeProcessing[[codeRegex]]$splitCodings <-
strsplit(codings[[codeRegex]],
inductiveCodingHierarchyMarker);
inductiveCodeProcessing[[codeRegex]]$inductiveLeaves <-
unlist(lapply(inductiveCodeProcessing[[codeRegex]]$splitCodings,
utils::tail,
1));
} else {
inductiveCodeProcessing[[codeRegex]]$inductiveLeaves <-
codings[[codeRegex]];
}
codingLeaves[[codeRegex]] <-
inductiveCodeProcessing[[codeRegex]]$inductiveLeaves;
### Get presence of codes in utterances
occurrences <-
lapply(cleanedMatches,
`%in%`,
x=codings[[codeRegex]]);
### Convert from logical to numeric
occurrenceCounts <-
lapply(occurrences, as.numeric);
### Add the codes as names
namedOccurrences <-
lapply(occurrenceCounts,
`names<-`,
value <- inductiveCodeProcessing[[codeRegex]]$inductiveLeaves);
### Removed this at 2019-02-18 after meeting with Szilvia - no idea why
### I'd put it in, but it caused these columns to be *lists* in the data.frame
### for some weird reason.
###
### Convert from a vector to a list
# namedOccurrences <-
# lapply(namedOccurrences,
# as.list);
### Convert the lists to dataframes
sourceDf <-
cbind(sourceDf,
as.data.frame(do.call(rbind,
namedOccurrences)));
### Delete codes from utterances
x <-
gsub(codeRegexes[codeRegex],
"",
x);
if ((nchar(inductiveCodingHierarchyMarker) > 0) &&
(!is.null(inductiveCodeProcessing[[codeRegex]]$inductiveLeaves)) &&
(length(inductiveCodeProcessing[[codeRegex]]$inductiveLeaves) > 0)) {
### Build tree for this code regex. First some preparation.
inductiveCodeProcessing[[codeRegex]]$localRoots <-
unlist(lapply(inductiveCodeProcessing[[codeRegex]]$splitCodings,
utils::head, 1));
inductiveCodeProcessing[[codeRegex]]$localBranches <-
unlist(lapply(inductiveCodeProcessing[[codeRegex]]$splitCodings,
utils::tail, -1));
inductiveCodeProcessing[[codeRegex]]$localRootsThatAreBranches <-
unlist(lapply(inductiveCodeProcessing[[codeRegex]]$localRoots,
`%in%`,
inductiveCodeProcessing[[codeRegex]]$localBranches));
### Convert split codings into node-ready lists
inductiveCodeProcessing[[codeRegex]]$subTrees <-
lapply(inductiveCodeProcessing[[codeRegex]]$splitCodings,
function(subTree) {
return(lapply(subTree,
function(x) {
stats::setNames(list(x,x,x),
c('idName',
'labelName',
'codeName'));
}));
});
### Local roots that are not branches should be attached to the root of
### the inductive code tree for this code set, along with their children.
inductiveCodeTrees[[codeRegex]] <-
data.tree::Node$new('codes');
### First add only the local roots that have no parents
for (currentLocalRoot in unique(inductiveCodeProcessing[[codeRegex]]$localRoots[
!inductiveCodeProcessing[[codeRegex]]$localRootsThatAreBranches
])) {
### Add first node to the root
inductiveCodeTrees[[codeRegex]]$AddChild(currentLocalRoot);
inductiveCodeTrees[[codeRegex]][[currentLocalRoot]]$label <-
currentLocalRoot;
inductiveCodeTrees[[codeRegex]][[currentLocalRoot]]$code <-
currentLocalRoot;
}
### Then process their branches/children
for (currentSubtree in inductiveCodeProcessing[[codeRegex]]$splitCodings[
!inductiveCodeProcessing[[codeRegex]]$localRootsThatAreBranches
]) {
if (length(currentSubtree) > 1) {
### Add children; first save reference to this node
currentNode <-
inductiveCodeTrees[[codeRegex]][[currentSubtree[1]]];
if (!silent) {
ufs::cat0("\nThis node has children: storing node to add children to ('",
currentNode$name,
"').");
}
### Then loop through children and progressively add them
for (currentBranch in currentSubtree[2:length(currentSubtree)]) {
if (is.null(currentNode[[currentBranch]])) {
if (!silent) {
ufs::cat0("\nThis parent node does not yet have a child with the name '",
currentBranch,
"', so adding it to that parent node.");
}
currentNode <-
currentNode$AddChild(currentBranch);
currentNode$label <-
currentBranch;
currentNode$code <-
currentBranch;
} else {
if (!silent) {
ufs::cat0("\nThis parent node already has a child with the name '",
currentBranch,
"', so not adding anything at this point.");
}
currentNode <-
currentNode[[currentBranch]];
}
}
}
}
### Then start working on the subtrees that should be attached to
### a parent
for (i in seq_along(inductiveCodeProcessing[[codeRegex]]$splitCodings[
inductiveCodeProcessing[[codeRegex]]$localRootsThatAreBranches
])) {
currentSubtree <-
inductiveCodeProcessing[[codeRegex]]$splitCodings[
inductiveCodeProcessing[[codeRegex]]$localRootsThatAreBranches
][[i]];
currentNode <-
data.tree::FindNode(inductiveCodeTrees[[codeRegex]],
currentSubtree[1]);
if (is.null(currentNode)) {
warning(paste0("Code '", codings[[codeRegex]][i], "' does not ",
"have a parent I can find!"));
} else {
### If it's found, loop through the children and progressively add them
for (currentBranch in currentSubtree[2:length(currentSubtree)]) {
currentNode <-
currentNode$AddChild(currentBranch);
currentNode$label <-
currentBranch;
currentNode$code <-
currentBranch;
}
}
}
data.tree::SetGraphStyle(inductiveCodeProcessing[[codeRegex]],
directed="false");
# inductiveDiagrammeR[[codeRegex]] <-
# data.tree::ToDiagrammeRGraph(inductiveCodeProcessing[[codeRegex]]);
} else {
inductiveCodeTrees[[codeRegex]] <- NULL;
}
}
}
### Trim spaces from front and back and store clean utterances
sourceDf$utterances_clean <-
trimws(x);
if (nrow(sourceDf) > 0) {
sourceDf$originalSequenceNr <- 1:nrow(sourceDf);
cleanSourceDf <-
sourceDf[!grepl(paste0(sectionRegexes, collapse="|"),
x), ];
cleanSourceDf <-
cleanSourceDf[nchar(cleanSourceDf$utterances_clean)>0, ];
} else {
cleanSourceDf <- data.frame();
}
if (nrow(cleanSourceDf) > 0) {
cleanSourceDf$sequenceNr <- 1:nrow(cleanSourceDf);
}
### Store results in the object to return
res <-
structure(list(arguments = arguments,
sourceDf = cleanSourceDf,
rawSourceDf = sourceDf,
codings = codingLeaves,
rawCodings = codings,
inductiveCodeProcessing = inductiveCodeProcessing,
inductiveCodeTrees = inductiveCodeTrees,
inductiveGraphs = inductiveDiagrammeR),
class="rockParsedSource");
### Process metadata and deductive code trees
if (!is.null(yamlFragments)) {
res$metadata <-
yum::load_yaml_fragments(yamlFragments=yamlFragments,
select=metadataContainers);
res$deductiveCodes <-
yum::load_yaml_fragments(yamlFragments=yamlFragments,
select=codesContainers);
res$deductiveCodeTrees <-
yum::build_tree(res$deductiveCodes);
}
### Add raw yamlFragments
res$yamlFragments <- yamlFragments;
### Return result
return(res);
}
#' @rdname parsing_sources
#' @method print rockParsedSource
#' @export
print.rockParsedSource <- function(x, prefix="### ", ...) {
totalSectionMatches <-
sum(unlist(lapply(x$rawSourceDf[, grep('_match',
names(x$rawSourceDf))],
as.numeric)));
appliedCodes <-
sort(unique(unlist(x$codings)));
totalCodingMatches <-
sum(unlist(x$sourceDf[, appliedCodes]));
if (totalCodingMatches > 0) {
codingInfo <-
glue::glue("These {nrow(x$sourceDf)} utterances were coded ",
"{totalCodingMatches} times in total using these codes: ",
"{ufs::vecTxtQ(appliedCodes)}.");
} else {
codingInfo <-
glue::glue("These {nrow(x$sourceDf)} utterances were not coded at all.");
}
if (length(x$inductiveCodeTrees) > 0) {
inductiveTreesInfo <-
glue::glue("This source contained inductive coding trees. ",
"These are shown in R Studio's viewer.\n\n")
} else {
inductiveTreesInfo <-
glue::glue("This source contained no inductive coding trees.\n\n")
}
if (length(x$deductiveCodeTrees) > 0) {
deductiveTreesInfo <-
glue::glue("This source contained deductive coding trees. ",
"These are also shown in R Studio's viewer.\n\n")
} else {
deductiveTreesInfo <-
glue::glue("This source contained no deductive coding trees.\n\n")
}
identifiers <-
names(x$arguments$idRegexes);
occurringIdentifiers <-
identifiers[identifiers %in% names(x$sourceDf)];
if (length(occurringIdentifiers) > 0) {
actualIdentifiers <-
lapply(x$sourceDf[, occurringIdentifiers, drop=FALSE],
unique);
actualIdentifiers <-
lapply(actualIdentifiers,
sort);
actualIdentifiers <-
lapply(actualIdentifiers,
function(x) return(x[!(x=="no_id")]));
identifierInfo <-
glue::glue("This source contained matches with identifier regular expressions. Specifically, ",
glue::glue_collapse(lapply(names(actualIdentifiers),
function(x) return(glue::glue("identifier regular expression '{x}' matched ",
"with identifiers {ufs::vecTxtQ(actualIdentifiers[[x]])}"))),
", "),
".");
} else {
identifierInfo <-
glue::glue("This source contained no matches with identifier regular expressions.")
}
print(glue::glue("\n\n",
"{prefix}Preprocessing\n\n",
"The parsed source contained {length(x$arguments$x)} lines. ",
"After removing lines that matched '{x$arguments$ignoreRegex}', ",
"the regular expression specifying which lines to ignore, and did not ",
"make up the {length(x$yamlFragments)} YAML fragments with metadata or ",
"deductive coding tree specifications, {nrow(x$rawSourceDf)} lines remained.",
" {totalSectionMatches} of these matched one of the section regular ",
"expressions ({ufs::vecTxtQ(x$arguments$sectionRegexes)}), and after ",
" removing these lines and all lines that were empty after removing ",
" characters that matched one or more identifier ",
"({ufs::vecTxtQ(x$arguments$idRegexes)}) and coding ",
"({ufs::vecTxtQ(x$arguments$codeRegexes)}) regular expressions, ",
"{nrow(x$sourceDf)} utterances remained.",
"\n\n",
"{prefix}Identifiers\n\n",
identifierInfo,
"\n\n",
"{prefix}Utterances and coding\n\n",
codingInfo,
"\n\n",
"{prefix}Inductive coding trees\n\n",
inductiveTreesInfo,
"\n",
"{prefix}Deductive coding trees\n\n",
deductiveTreesInfo));
if (length(x$inductiveCodeTrees) > 0) {
for (i in names(x$inductiveCodeTrees)) {
print(graphics::plot(x$inductiveCodeTrees[[i]]));
#DiagrammeR::render_graph(x$inductiveGraphs[[i]]);
}
}
if (length(x$deductiveCodeTrees) > 0) {
print(graphics::plot(x$deductiveCodeTrees));
}
invisible(x);
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.