R/parse_source.R

#' Parsing sources
#'
#' These functions parse one (`parse_source`) or more (`parse_sources`) sources and
#' extract the identifiers, sections, and codes they contain.
#'
#' @param text,file As `text` or `file`, you can specify a `file` to read with
#' encoding `encoding`, which will then be read using [base::readLines()]. If the
#' argument is named `text`, whether it is the path to an existing file is checked
#' first, and if it is, that file is read. If the argument is named `file`, and it
#' does not point to an existing file, an error is produced (useful if calling
#' from other functions). A `text` should be a character vector where every
#' element is a line of the original source (as provided by [base::readLines()]);
#' however, if a character vector of one element that contains at least one
#' newline character (`\\n`) is provided as `text`, it is split at the newline
#' characters using [base::strsplit()]. Basically, this behavior means that the
#' first argument can be either a character vector or the path to a file; and if
#' you're specifying a file and you want to be certain that an error is thrown if
#' it doesn't exist, make sure to name it `file`.
#' @param path The path containing the files to read.
#' @param extension The extension of the files to read; files with other extensions will
#' be ignored. Multiple extensions can be separated by a pipe (`|`).
#' @param regex Instead of specifying an extension, it's also possible to specify a regular
#' expression; only files matching this regular expression are read. If specified, `regex`
#' takes precedence over `extension`.
#' @param recursive Whether to also process subdirectories (`TRUE`)
#' or not (`FALSE`).
#' @param codeRegexes,idRegexes,sectionRegexes These are named character vectors with one
#' or more regular expressions. For `codeRegexes`, these specify how to extract the codes
#' (that were used to code the sources). For `idRegexes`, these specify how to extract the
#' different types of identifiers. For `sectionRegexes`, these specify how to extract the
#' different types of sections. The `codeRegexes` and `idRegexes` must each contain one
#' capturing group to capture the codes and identifiers, respectively.
#' @param autoGenerateIds The names of the `idRegexes` that, if missing, should receive
#' autogenerated identifiers (which consist of 'autogenerated_id_' followed by an
#' incrementing number).
#' @param persistentIds The names of the `idRegexes` for the identifiers which, once
#' attached to an utterance, should be attached to all following utterances as well (until
#' a new identifier with the same name is encountered, after which that identifier will be
#' attached to all following utterances, and so on).
#' @param inductiveCodingHierarchyMarker For inductive coding, this marker is used to indicate
#' hierarchical relationships between codes. The code at the left hand side of this marker will
#' be considered the parent code of the code on the right hand side. More than two levels
#' can be specified in one code (for example, if the `inductiveCodingHierarchyMarker` is '>',
#' the code `grandparent>child>grandchild` would indicate codes at three levels).
#' @param metadataContainers The names of the YAML fragments containing metadata (i.e. attributes
#' about cases).
#' @param codesContainers The names of the YAML fragments containing (parts of) deductive coding
#' trees.
#' @param delimiterRegEx The regular expression that is used to extract the YAML fragments.
#' @param ignoreRegex The regular expression that is used to match lines that are deleted
#' (after the YAML fragments have been extracted) before any further processing. This can
#' be used to add comments to sources, which are then ignored during analysis.
#' @param ignoreOddDelimiters If an odd number of YAML delimiters is encountered, whether this
#' should result in an error (`FALSE`) or just be silently ignored (`TRUE`).
#' @param encoding The encoding of the file to read (if `file` or `text` specifies a file).
#' @param silent Whether to provide (`FALSE`) or suppress (`TRUE`) more detailed progress updates.
#' @param x The object to print.
#' @param prefix The prefix to use before the 'headings' of the printed result.
#' @param ... Any additional arguments are passed on to the default print method.
#'
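#' @examples
#' ### A minimal sketch of parsing a source provided as a character vector;
#' ### the utterances, codes, and identifiers below are made up purely for
#' ### illustration and use the default regular expressions for codes
#' ### ('[[...]]'), case identifiers ('[[cid=...]]'), and section breaks
#' ### ('---paragraph-break---').
#' \dontrun{
#' exampleSource <- c(
#'   "[[cid=exampleCase1]]",
#'   "A first utterance, coded with one code. [[exampleCode]]",
#'   "---paragraph-break---",
#'   "A second utterance with a hierarchical code. [[parentCode>childCode]]"
#' );
#' parsedExample <- parse_source(text = exampleSource,
#'                               silent = TRUE);
#' ### The cleaned utterances and the code occurrences end up in `sourceDf`:
#' parsedExample$sourceDf;
#' }
#'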
#' @aliases parse_source parse_sources print.rockParsedSource
#' @rdname parsing_sources
#' @export
parse_source <- function(text,
                         file,
                         codeRegexes = c(code = "\\[\\[([a-zA-Z0-9._>-]+)\\]\\]"),
                         idRegexes = c(caseId = "\\[\\[cid=([a-zA-Z0-9._-]+)\\]\\]",
                                       stanzaId = "\\[\\[sid=([a-zA-Z0-9._-]+)\\]\\]"),
                         sectionRegexes = c(paragraphs = "---paragraph-break---",
                                            secondary = "---<[a-zA-Z0-9]?>---"),
                         autoGenerateIds = c('stanzaId'),
                         persistentIds = c('caseId'),
                         inductiveCodingHierarchyMarker = ">",
                         metadataContainers = c("metadata"),
                         codesContainers = c("codes", "dct"),
                         delimiterRegEx = "^---$",
                         ignoreRegex = "^#",
                         ignoreOddDelimiters=FALSE,
                         encoding="UTF-8",
                         silent=FALSE) {


  if (missing(file)) {
    if (missing(text)) {
      stop("Provide either a `file` or a `text` to scan!");
    } else {
      if ((length(text) == 1) && file.exists(text)) {
        x <- readLines(text,
                       encoding=encoding,
                       warn=FALSE);
      } else {
        x <- text;
        if ((length(x) == 1) && grepl('\n', x)) {
          x <-
            strsplit(x,
                     "\n")[[1]];
        }
      }
    }
  } else {
    if (file.exists(file)) {
      x <- readLines(file,
                     encoding=encoding,
                     warn=FALSE);
    } else {
      stop("The file you specified in argument `file` ('",
           paste0(file, collapse=" "),
           "') does not exist. If you meant to provide a text ",
           "to process, please use argument `text`");
    }
  }

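  ### Store all arguments (and the read source lines in `x`) so that, for
  ### example, the print method can report on them later.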
  arguments <- as.list(environment());

  ### First process YAML fragments and remove them
  yamlFragments <-
    yum::extract_yaml_fragments(text=x,
                                delimiterRegEx=delimiterRegEx,
                                ignoreOddDelimiters=ignoreOddDelimiters);
  x <-
    yum::delete_yaml_fragments(text=x,
                               delimiterRegEx=delimiterRegEx,
                               ignoreOddDelimiters=ignoreOddDelimiters);

  ### Then remove lines to ignore
  linesToIgnore <- grepl(ignoreRegex,
                         x);
  ignoredLines <- x[linesToIgnore];
  x <- x[!linesToIgnore];

  ### Create dataframe for parsing
  sourceDf <- data.frame(utterances_raw = x,
                         stringsAsFactors=FALSE);

  ### Identify sections
  if (!is.null(sectionRegexes) && length(sectionRegexes) > 0) {
    for (sectionRegex in names(sectionRegexes)) {
      ### Store whether each utterance matches
      sourceDf[, glue::glue("{sectionRegex}_match")] <-
        grepl(sectionRegexes[sectionRegex], x);
      ### Set incremental counter for each match
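      ### (purrr::accumulate with `+` yields a cumulative sum of the logical
      ### match vector, so each utterance gets the number of section breaks
      ### encountered up to and including that line)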
      sourceDf[, glue::glue("{sectionRegex}_counter")] <-
        purrr::accumulate(sourceDf[, glue::glue("{sectionRegex}_match")],
                          `+`);
    }
  }

  ### Process identifiers
  if (!is.null(idRegexes) && length(idRegexes) > 0) {
    for (idRegex in names(idRegexes)) {

      ### Get a list of matches
      ids <-
        regmatches(x,
                   gregexpr(idRegexes[idRegex], x));

      ### Check whether there are multiple matches
      multipleIds <-
        which(unlist(lapply(ids, length))>1);
      if (length(multipleIds) > 0) {
        warning(glue::glue("Multiple identifiers matching '{idRegex}' found in the following utterances:\n",
                       paste0(x[multipleIds],
                              collapse="\n"),
                       "\n\nOnly using the first identifier for each utterance, removing and ignoring the rest!"));
        ids <-
          lapply(ids, utils::head, 1);
      }

      ### Clean identifiers (i.e. only retain identifier content itself)
      ids <-
        lapply(ids, gsub, pattern=idRegexes[idRegex], replacement="\\1");

      ### Set "no_id" for utterances without id
      ids <-
        ifelse(unlist(lapply(ids,
                             length)),
               ids,
               "no_id");

      ### Convert from a list to a vector
      ids <- unlist(ids);

      if (length(ids) > 1) {
        ### Implement 'identifier persistence' by copying the
        ### identifier of the previous utterance if the identifier
        ### is not set - can't be done using vectorization as identifiers
        ### have to carry over sequentially.
        if (idRegex %in% persistentIds) {
          rawIds <- ids;
          for (i in 2:length(ids)) {
            if ((ids[i] == "no_id")) {
              ids[i] <- ids[i-1];
            }
          }
        }
      } else {
        ids <- "no_id";
      }

      ### Check whether any matches were found
      if (!(all(ids=="no_id"))) {
        ### Generate identifiers for ids without identifier
        if (idRegex %in% autoGenerateIds) {
          ids[ids=="no_id"] <-
            paste0("autogenerated_id_",
                   1:(sum(ids=="no_id")));
        }
        ### Store identifiers in sourceDf
        sourceDf[, idRegex] <-
          ids;
        if (idRegex %in% persistentIds) {
          sourceDf[, paste0(idRegex, "_raw")] <-
            rawIds;
        }
      }
    }
  }

  ### Delete identifiers and store clean version in sourceDf
  x <-
    gsub(paste0(idRegexes, collapse="|"),
         "",
         x);
  sourceDf$utterances_without_identifiers <- x;

  codings <- list();
  codingLeaves <- list();
  inductiveCodeProcessing <- list();
  inductiveCodeTrees <- list();
  inductiveDiagrammeR <- list();
  ### Process codes
  if (!is.null(codeRegexes) && length(codeRegexes) > 0) {

    for (codeRegex in names(codeRegexes)) {

      ### Find matches
      matches <-
        regmatches(x,
                   gregexpr(codeRegexes[codeRegex], x));

      ### Retain only the parenthesized expression
      cleanedMatches <-
        lapply(matches, gsub, pattern=codeRegexes[codeRegex], replacement="\\1");

      ### Get a complete list of all used codes
      codings[[codeRegex]] <-
        sort(unique(unlist(cleanedMatches)));

      ### Split these unique codes into levels
      if ((nchar(inductiveCodingHierarchyMarker) > 0) &&
          (!is.null(codings[[codeRegex]])) &&
          (length(codings[[codeRegex]]) > 0)) {
        inductiveCodeProcessing[[codeRegex]] <- list();

        inductiveCodeProcessing[[codeRegex]]$splitCodings <-
          strsplit(codings[[codeRegex]],
                   inductiveCodingHierarchyMarker);

        inductiveCodeProcessing[[codeRegex]]$inductiveLeaves <-
          unlist(lapply(inductiveCodeProcessing[[codeRegex]]$splitCodings,
                        utils::tail,
                        1));
      } else {
        inductiveCodeProcessing[[codeRegex]]$inductiveLeaves <-
          codings[[codeRegex]];
      }

      codingLeaves[[codeRegex]] <-
        inductiveCodeProcessing[[codeRegex]]$inductiveLeaves;

      ### Get presence of codes in utterances
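      ### (Because the `x` argument of `%in%` is supplied by name, each element
      ### of cleanedMatches is matched to the `table` argument; so for every
      ### known code, this checks whether it occurs in the given utterance.)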
      occurrences <-
        lapply(cleanedMatches,
               `%in%`,
               x=codings[[codeRegex]]);

      ### Convert from logical to numeric
      occurrenceCounts <-
        lapply(occurrences, as.numeric);

      ### Add the codes as names
      namedOccurrences <-
        lapply(occurrenceCounts,
               `names<-`,
               value = inductiveCodeProcessing[[codeRegex]]$inductiveLeaves);

      ### Removed this at 2019-02-18 after meeting with Szilvia - no idea why
      ### I'd put it in, but it caused these columns to be *lists* in the data.frame
      ### for some weird reason.
      ###
      ### Convert from a vector to a list
      # namedOccurrences <-
      #   lapply(namedOccurrences,
      #          as.list);

      ### Convert the lists to dataframes
      sourceDf <-
        cbind(sourceDf,
              as.data.frame(do.call(rbind,
                                    namedOccurrences)));

      ### Delete codes from utterances
      x <-
        gsub(codeRegexes[codeRegex],
             "",
             x);

      if ((nchar(inductiveCodingHierarchyMarker) > 0) &&
          (!is.null(inductiveCodeProcessing[[codeRegex]]$inductiveLeaves)) &&
          (length(inductiveCodeProcessing[[codeRegex]]$inductiveLeaves) > 0)) {

        ### Build tree for this code regex. First some preparation.
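        ### (localRoots are the first elements of each split coding and
        ### localBranches are all remaining elements; a local root that also
        ### occurs as a branch elsewhere is attached under its parent further
        ### below instead of directly under the tree root.)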
        inductiveCodeProcessing[[codeRegex]]$localRoots <-
          unlist(lapply(inductiveCodeProcessing[[codeRegex]]$splitCodings,
                        utils::head, 1));
        inductiveCodeProcessing[[codeRegex]]$localBranches <-
          unlist(lapply(inductiveCodeProcessing[[codeRegex]]$splitCodings,
                        utils::tail, -1));
        inductiveCodeProcessing[[codeRegex]]$localRootsThatAreBranches <-
          unlist(lapply(inductiveCodeProcessing[[codeRegex]]$localRoots,
                        `%in%`,
                        inductiveCodeProcessing[[codeRegex]]$localBranches));

        ### Convert split codings into node-ready lists
        inductiveCodeProcessing[[codeRegex]]$subTrees <-
          lapply(inductiveCodeProcessing[[codeRegex]]$splitCodings,
                 function(subTree) {
                   return(lapply(subTree,
                                 function(x) {
                                   stats::setNames(list(x,x,x),
                                                   c('idName',
                                                     'labelName',
                                                     'codeName'));
                                 }));

                 });

        ### Local roots that are not branches should be attached to the root of
        ### the inductive code tree for this code set, along with their children.
        inductiveCodeTrees[[codeRegex]] <-
          data.tree::Node$new('codes');

        ### First add only the local roots that have no parents
        for (currentLocalRoot in unique(inductiveCodeProcessing[[codeRegex]]$localRoots[
                                          !inductiveCodeProcessing[[codeRegex]]$localRootsThatAreBranches
                                        ])) {
          ### Add first node to the root
          inductiveCodeTrees[[codeRegex]]$AddChild(currentLocalRoot);
          inductiveCodeTrees[[codeRegex]][[currentLocalRoot]]$label <-
            currentLocalRoot;
          inductiveCodeTrees[[codeRegex]][[currentLocalRoot]]$code <-
            currentLocalRoot;
        }

        ### Then process their branches/children
        for (currentSubtree in inductiveCodeProcessing[[codeRegex]]$splitCodings[
                                 !inductiveCodeProcessing[[codeRegex]]$localRootsThatAreBranches
                               ]) {
          if (length(currentSubtree) > 1) {
            ### Add children; first save reference to this node
            currentNode <-
              inductiveCodeTrees[[codeRegex]][[currentSubtree[1]]];
            if (!silent) {
              ufs::cat0("\nThis node has children: storing node to add children to ('",
                        currentNode$name,
                        "').");
            }
            ### Then loop through children and progressively add them
            for (currentBranch in currentSubtree[2:length(currentSubtree)]) {
              if (is.null(currentNode[[currentBranch]])) {
                if (!silent) {
                  ufs::cat0("\nThis parent node does not yet have a child with the name '",
                            currentBranch,
                            "', so adding it to that parent node.");
                }
                currentNode <-
                  currentNode$AddChild(currentBranch);
                currentNode$label <-
                  currentBranch;
                currentNode$code <-
                  currentBranch;
              } else {
                if (!silent) {
                  ufs::cat0("\nThis parent node already has a child with the name '",
                            currentBranch,
                            "', so not adding anything at this point.");
                }
                currentNode <-
                  currentNode[[currentBranch]];
              }
            }
          }
        }

        ### Then start working on the subtrees that should be attached to
        ### a parent
        for (i in seq_along(inductiveCodeProcessing[[codeRegex]]$splitCodings[
                                 inductiveCodeProcessing[[codeRegex]]$localRootsThatAreBranches
                               ])) {
          currentSubtree <-
            inductiveCodeProcessing[[codeRegex]]$splitCodings[
              inductiveCodeProcessing[[codeRegex]]$localRootsThatAreBranches
            ][[i]];
          currentNode <-
            data.tree::FindNode(inductiveCodeTrees[[codeRegex]],
                                currentSubtree[1]);
          if (is.null(currentNode)) {
            warning(paste0("Code '",
                           paste0(currentSubtree,
                                  collapse=inductiveCodingHierarchyMarker),
                           "' does not have a parent I can find!"));
          } else {
            ### If it's found, loop through the children and progressively add them
            for (currentBranch in currentSubtree[2:length(currentSubtree)]) {
              currentNode <-
                currentNode$AddChild(currentBranch);
              currentNode$label <-
                currentBranch;
              currentNode$code <-
                currentBranch;
            }
          }
        }

        data.tree::SetGraphStyle(inductiveCodeTrees[[codeRegex]],
                                 directed="false");

        # inductiveDiagrammeR[[codeRegex]] <-
        #   data.tree::ToDiagrammeRGraph(inductiveCodeProcessing[[codeRegex]]);


      } else {
        inductiveCodeTrees[[codeRegex]] <- NULL;
      }

    }
  }

  ### Trim spaces from front and back and store clean utterances
  sourceDf$utterances_clean <-
    trimws(x);

  if (nrow(sourceDf) > 0) {
    sourceDf$originalSequenceNr <- 1:nrow(sourceDf);

    cleanSourceDf <-
      sourceDf[!grepl(paste0(sectionRegexes, collapse="|"),
                      x), ];

    cleanSourceDf <-
      cleanSourceDf[nchar(cleanSourceDf$utterances_clean)>0, ];
  } else {
    cleanSourceDf <- data.frame();
  }

  if (nrow(cleanSourceDf) > 0) {
    cleanSourceDf$sequenceNr <- 1:nrow(cleanSourceDf);
  }

  ### Store results in the object to return
  res <-
    structure(list(arguments = arguments,
                   sourceDf = cleanSourceDf,
                   rawSourceDf = sourceDf,
                   codings = codingLeaves,
                   rawCodings = codings,
                   inductiveCodeProcessing = inductiveCodeProcessing,
                   inductiveCodeTrees = inductiveCodeTrees,
                   inductiveGraphs = inductiveDiagrammeR),
              class="rockParsedSource");

  ### Process metadata and deductive code trees
  if (!is.null(yamlFragments)) {
    res$metadata <-
      yum::load_yaml_fragments(yamlFragments=yamlFragments,
                               select=metadataContainers);
    res$deductiveCodes <-
      yum::load_yaml_fragments(yamlFragments=yamlFragments,
                               select=codesContainers);
    res$deductiveCodeTrees <-
      yum::build_tree(res$deductiveCodes);
  }

  ### Add raw yamlFragments
  res$yamlFragments <- yamlFragments;

  ### Return result
  return(res);

}

#' @rdname parsing_sources
#' @method print rockParsedSource
#' @export
print.rockParsedSource <- function(x, prefix="### ",  ...) {
  totalSectionMatches <-
    sum(unlist(lapply(x$rawSourceDf[, grep('_match',
                                           names(x$rawSourceDf))],
                      as.numeric)));

  appliedCodes <-
    sort(unique(unlist(x$codings)));

  totalCodingMatches <-
    sum(unlist(x$sourceDf[, appliedCodes]));

  if (totalCodingMatches > 0) {
    codingInfo <-
      glue::glue("These {nrow(x$sourceDf)} utterances were coded ",
                 "{totalCodingMatches} times in total using these codes: ",
                 "{ufs::vecTxtQ(appliedCodes)}.");
  } else {
    codingInfo <-
      glue::glue("These {nrow(x$sourceDf)} utterances were not coded at all.");
  }

  if (length(x$inductiveCodeTrees) > 0) {
    inductiveTreesInfo <-
      glue::glue("This source contained inductive coding trees. ",
                 "These are shown in RStudio's viewer.\n\n");
  } else {
    inductiveTreesInfo <-
      glue::glue("This source contained no inductive coding trees.\n\n");
  }

  if (length(x$deductiveCodeTrees) > 0) {
    deductiveTreesInfo <-
      glue::glue("This source contained deductive coding trees. ",
                 "These are also shown in RStudio's viewer.\n\n");
  } else {
    deductiveTreesInfo <-
      glue::glue("This source contained no deductive coding trees.\n\n");
  }

  identifiers <-
    names(x$arguments$idRegexes);
  occurringIdentifiers <-
    identifiers[identifiers %in% names(x$sourceDf)];

  if (length(occurringIdentifiers) > 0) {
    actualIdentifiers <-
      lapply(x$sourceDf[, occurringIdentifiers, drop=FALSE],
             unique);
    actualIdentifiers <-
      lapply(actualIdentifiers,
             sort);
    actualIdentifiers <-
      lapply(actualIdentifiers,
             function(x) return(x[!(x=="no_id")]));
    identifierInfo <-
      glue::glue("This source contained matches with identifier regular expressions. Specifically, ",
                 glue::glue_collapse(lapply(names(actualIdentifiers),
                                            function(x) return(glue::glue("identifier regular expression '{x}' matched ",
                                                                          "with identifiers {ufs::vecTxtQ(actualIdentifiers[[x]])}"))),
                                     ", "),
                 ".");
  } else {
    identifierInfo <-
      glue::glue("This source contained no matches with identifier regular expressions.");
  }

  print(glue::glue("\n\n",
                   "{prefix}Preprocessing\n\n",
                   "The parsed source contained {length(x$arguments$x)} lines. ",
                   "After removing lines that matched '{x$arguments$ignoreRegex}', ",
                   "the regular expression specifying which lines to ignore, and did not ",
                   "make up the {length(x$yamlFragments)} YAML fragments with metadata or ",
                   "deductive coding tree specifications, {nrow(x$rawSourceDf)} lines remained.",
                   " {totalSectionMatches} of these matched one of the section regular ",
                   "expressions ({ufs::vecTxtQ(x$arguments$sectionRegexes)}), and after ",
                   " removing these lines and all lines that were empty after removing ",
                   " characters that matched one or more identifier ",
                   "({ufs::vecTxtQ(x$arguments$idRegexes)}) and coding ",
                   "({ufs::vecTxtQ(x$arguments$codeRegexes)}) regular expressions, ",
                   "{nrow(x$sourceDf)} utterances remained.",
                   "\n\n",
                   "{prefix}Identifiers\n\n",
                   identifierInfo,
                   "\n\n",
                   "{prefix}Utterances and coding\n\n",
                   codingInfo,
                   "\n\n",
                   "{prefix}Inductive coding trees\n\n",
                   inductiveTreesInfo,
                   "\n",
                   "{prefix}Deductive coding trees\n\n",
                   deductiveTreesInfo));
  if (length(x$inductiveCodeTrees) > 0) {
    for (i in names(x$inductiveCodeTrees)) {
      print(graphics::plot(x$inductiveCodeTrees[[i]]));
      #DiagrammeR::render_graph(x$inductiveGraphs[[i]]);
    }
  }
  if (length(x$deductiveCodeTrees) > 0) {
    print(graphics::plot(x$deductiveCodeTrees));
  }
  invisible(x);
}