############################################################################## ---
############## Finding files ----
### These functions identify files on the local disc that we want to read.
### As such, they are helpers for readFiles and readHumdrum.
### There is one exported (public-facing) function: findHumdrum.
findFiles <- function(..., recursive = FALSE) {
# This function is the workhorse behind the readFiles function
# (which itself supports readHumdrum and findHumdrum).
# The function is used to identify files (excluding directories) on the local system,
# based on their filenames/paths, using a unique type of regex-based path search pattern.
# Details of how these search patterns work can be found in the findHumdrum user documentation.
# As a part of this process, the recursive argument is (eventually) passed to the base::list.files function.
#
# findFiles returns a data.table with one row per pattern, holding matched filepaths,
# directories, patterns, and labels.
#### Parse ... arguments into a cartesian collection of patterns, ordered left to right
patterns <- assemblePatterns(list(...))
patterns[patterns == ""] <- ".*"
##### split paths into directories (all but rightmost) and files (rightmost)
dirpaths <- dirname(patterns)
filepaths <- basename(patterns)
#### find matches
## dirs
matchingdirs <- lapply(dirpaths, matchDirs)
## full paths (incl file)
matchingpaths <- if (sum(lengths(matchingdirs)) == 0L) {
replicate(length(matchingdirs), character(0), simplify = FALSE)
} else {
Map(matchFiles, filepaths, matchingdirs,
recursive = recursive)
}
data.table(Filepath = matchingpaths,
Directories = matchingdirs,
Pattern = patterns,
Label = names(patterns))
}
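# For example, a hypothetical call (assuming matching files exist on disc):
#   findFiles(c(Beethoven = '^Be', Mozart = '^Mo'), '.*krn$')
#   # returns a two-row data.table (one row per expanded pattern),
#   # with list-columns Filepath and Directories, plus Pattern and Label columns.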
assemblePatterns <- function(patternargs) {
# This function does work for findFiles
# It translates one or more regular expression "patternargs" into
# a cartesian product of specific file-path regular expressions to look for.
# See the findHumdrum documentation for more explanation.
if (length(patternargs) == 0L) return(c(`_1` = ""))
# Cartesian product patterns
patgrid <- do.call('expand.grid', c(patternargs, stringsAsFactors = FALSE))
patterns <- apply(patgrid, 1, paste, collapse = .Platform$file.sep)
patterns <- gsub(paste0(strrep(.Platform$file.sep, 3), '*'), .Platform$file.sep, patterns)
## pattern labels
patlabs <- if (length(unlist(lapply(patternargs, names))) > 0L) {
patgridlabs <- patgrid
patgridlabs[] <- sapply(1:ncol(patgrid),
function(i) names(unlist(patternargs))[match(patgrid[,i], unlist(patternargs))] )
patlabs <- apply(patgridlabs, 1, paste, collapse = '')
patlabs
} else {
rep("", length(patterns))
}
patlabs[patlabs == ""] <- paste0("_", seq_len(sum(patlabs == "")))
names(patterns) <- patlabs
#
patterns
}
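# For example (an illustrative call):
#   assemblePatterns(list('..', c(Beethoven = '^Be', Mozart = '^Mo'), '.*krn$'))
#   # returns c(Beethoven = '../^Be/.*krn$', Mozart = '../^Mo/.*krn$')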
matchDirs <- function(dirpattern) {
# This function does work for findFiles.
# Given directory-path regular expressions (dirpattern) it finds matching directories on the system.
# See the findHumdrum documentation for more explanation.
dirpattern <- strsplit(dirpattern, split = .Platform$file.sep)[[1]]
dirpattern <- dirpattern[dirpattern != "."]
initial <- if (length(dirpattern) && (dirpattern[1] == "" || (.Platform$OS.type == 'windows' && dirpattern[1] == 'C:'))) {
dirpattern <- dirpattern[-1]
if (.Platform$OS.type == 'windows') 'C:' else '/'
} else {
"."
}
Reduce( function(cur, nex) {
unlist( lapply(cur,
function(curdir) {
hits <- if (nex %in% c('..', '.')) {
return(paste(curdir, nex, sep = .Platform$file.sep))
} else {
grep(nex, useBytes = TRUE,
list.dirs(path = curdir, recursive = FALSE, full.names = FALSE),
value = TRUE)
}
if (length(hits) > 0L) {
paste(curdir, hits, sep = .Platform$file.sep)
} else {
character(0)
}
}
)
)
},
x = dirpattern,
init = initial
) -> matchingdirs
## remove paths containing //
matchingdirs <- gsub(paste0(strrep(.Platform$file.sep, 3), '*'), .Platform$file.sep, matchingdirs)
##
matchingdirs
}
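# For example (hypothetical, assuming a ./Composers/Beethoven directory exists):
#   matchDirs('Composers/^Be')
#   # returns "./Composers/Beethoven"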
matchFiles <- function(filepattern, matchingdirs, recursive) {
# This function does work for findFiles.
# Given file-path regular expressions (filepattern), AND the matchingdirs produced by matchDirs,
# it finds matching files in all the matched directories.
# See the findHumdrum documentation for more explanation.
matchingfiles <-
lapply(matchingdirs,
function(curdir) {
paths <- list.files(path = curdir, pattern = filepattern, recursive = recursive)
files <- Filter(function(x) !x %in% list.dirs(curdir, recursive = FALSE, full.names = FALSE),
paths)
if (length(files) > 0) paste(curdir, files, sep = .Platform$file.sep) else NULL
})
if (length(matchingfiles) == 0L) return(character(0))
matchingfiles <- unlist(matchingfiles, use.names = FALSE)
matchingfiles <- stringr::str_replace(matchingfiles, pattern = paste0("^\\.", .Platform$file.sep), replacement = '')
Filter(Negate(is.na), matchingfiles)
}
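# For example (hypothetical, assuming kern files exist in the matched directory):
#   matchFiles('.*krn$', './Composers/Beethoven', recursive = FALSE)
#   # returns paths like "Composers/Beethoven/sonata01.krn" (the leading "./" is stripped)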
filterFilesByContent <- function(fileFrame, patterns, combine = `&`) {
# takes a data.table with filenames
# and removes those that don't contain matches to (regex) patterns args.
# By default, files must match ALL patterns.
# THIS COULD BE UPDATED to allow ALL (&) or ANY (|).
matches <- lapply(fileFrame$Files,
function(files) {
if (length(files) == 0L) return(files)
matches <- lapply(patterns, stringr::str_detect, str = files)
if (length(matches) > 1L) Reduce(combine, matches) else matches[[1]]
})
fileFrame[ , Filepath := list(Map(`[`, Filepath, matches))]
fileFrame[ , Files := list(Map(`[`, Files , matches))]
fileFrame
}
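# For example (given a hypothetical fileFrame from readFiles):
#   filterFilesByContent(fileFrame, patterns = c('EE', '\\*clefF4'))
#   # keeps only files whose text matches BOTH regexes (since combine = `&`)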
#' @name readHumdrum
#' @export
findHumdrum <- function(..., contains = NULL, recursive = FALSE, allowDuplicates = FALSE, verbose = FALSE) {
## First call readFiles with exactly the same arguments
fileFrame <- readFiles(..., contains = contains, recursive = recursive,
allowDuplicates = allowDuplicates, verbose = verbose)
fileFrame <- isValidHumdrum(fileFrame, errorReport.path = NULL)
fileFrame <- fileFrame[Valid == TRUE]
fileFrame[ , Valid := NULL]
fileFrame
}
###################################################################---
############## Reading files ----
# These are helper functions used by readHumdrum
# They do the actual work of reading in text from files.
# There is no exported (public-facing) function for users.
readFiles <- function(..., contains = NULL, recursive = FALSE, allowDuplicates = FALSE, validate = TRUE, verbose = FALSE) {
# readFiles is a major workhorse for readHumdrum and findHumdrum.
# It uses findFiles (and its associated regex-based path search patterns) to identify files.
# It then reads any text files, filtering for files that contain matches to the regexes in the "contains" argument.
# Finally, it checks for duplicate matches.
# Details of how these search patterns work can be found in the findHumdrum user documentation.
# As a part of this process, the recursive argument is (eventually) passed to the base::list.files function.
#
message("Finding and reading files...")
### Filenames
fileFrame <- findFiles(..., recursive = recursive)
# Duplicate matches?
matchTable <- fileFrame[ , table(unlist(Filepath), rep(Pattern, lengths(Filepath)))] # this is used later, for verbose printing
if (!allowDuplicates) fileFrame[ , Filepath := .(remove.duplicates(Filepath))]
### Files
fileFrame[ , Files := .(lapply(Filepath, readTextFiles))]
fileFrame <- as.list(fileFrame) # this stupidity (translating to list and back)
# is because of annoying data.table assignment behavior, which acts differently
# when assigning columns of lists if nrow == 1.
fileFrame$Filepath <- lapply(fileFrame$Files, \(files) if (length(files)) names(files) else files)
fileFrame <- as.data.table(fileFrame)
if (!is.null(contains)) fileFrame <- filterFilesByContent(fileFrame, contains)
### print warnings if there are duplicates
# this is delayed to here, because earlier, we don't know how many files will be removed by contains
duplicateWarnings(matchTable[rownames(matchTable) %in% unlist(fileFrame$Filepath), , drop = FALSE],
allowDuplicates, verbose)
#### print matches per pattern
with(fileFrame,
{
Label <- ifelse(grepl('^_[1-9]', Label), "", paste0(Label, ' = '))
# prepare string to print if contains
contains <- if(!is.null(contains)) {
glue::glue(" (containing ",
harvard(contains, 'and', quote = TRUE),
")")
} else ""
# prepare string to print if verbose
filepaths <- if (verbose) {
sapply(Filepath,
function(fps) {
paste(paste0('\t\t', fps), collapse = '\n')
})
} else ""
message(glue::glue("\tREpath-pattern {Label}'{Pattern}'",
" matches {lengths(Filepath)} text files",
contains,
" in {lengths(Directories)} {plural(lengths(Directories), 'directories', 'directory')}",
if (verbose) ":\n" else ".\n",
"{filepaths}",
.trim = FALSE ))
})
# Unwrap nested lists
fileFrame[ , Directories := NULL]
fileFrame <- as.data.table(lapply(fileFrame,
\(col) {
if (is.list(col)) {
unlist(col)
} else {
rep(col, lengths(fileFrame$Filepath))
}
}))
# Split lines
fileFrame[ , FileLines := list(splitFiles2Lines(Files))]
# Number files
fileFrame[ , File := match(Filepath, unique(Filepath))]
# return
message(glue::glue("{num2print(sum(lengths(fileFrame$Filepath)), capitalize = TRUE, label = 'file')} read from disk."))
fileFrame
}
# readLinesFast <- function(fpath, lines = TRUE) {
# # This function is awesomely fast, but doesn't check if the files are proper text files
#
# s <- file.info(fpath)$size
#
# buf <- readChar(fpath, s, useBytes = TRUE)
# buf <- stringi::stri_enc_tonative(buf)
# if (lines) stringi::stri_split_fixed(buf, "\n", omit_empty = TRUE)[[1]] else buf
# }
#
readTextFiles <- function(fpaths) {
# This function reads files,
# skipping any non-text files.
# returns a vector of strings (one per file)
if (length(fpaths) == 0L) return(character(0))
raw <- lapply(fpaths, stringi::stri_read_raw)
enc <- data.table::rbindlist(lapply(stringi::stri_enc_detect(raw), head, n = 1))
# text <- !is.na(enc$Encoding) & enc$Confidence >= 0.75
text <- stringi::stri_enc_isascii(raw) |
stringi::stri_enc_isutf16be(raw) |
stringi::stri_enc_isutf16le(raw) |
stringi::stri_enc_isutf32be(raw) |
stringi::stri_enc_isutf32le(raw) |
stringi::stri_enc_isutf8(raw)
files <- unlist(Map(stringi::stri_encode, raw[text], enc$Encoding[text]))
names(files) <- fpaths[text]
files
}
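# For example (hypothetical files):
#   readTextFiles(c('chorale1.krn', 'cover.png'))
#   # returns a named character vector containing only the text of 'chorale1.krn';
#   # the PNG is silently dropped because it isn't encoded as text.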
splitFiles2Lines <- function(filestrs) {
lines <- stringi::stri_split_lines(filestrs)
lapply(lines, \(ls) if (ls[length(ls)] == "") ls[-length(ls)] else ls)
}
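# For example:
#   splitFiles2Lines("**kern\n4c\n*-\n")
#   # returns list(c("**kern", "4c", "*-")) (the trailing empty line is dropped)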
duplicateWarnings <- function(matchTable, allowDuplicates, verbose) {
duplicates <- rowSums(matchTable) > 1L
if (!any(duplicates)) return(NULL)
duptable <- matchTable[duplicates, , drop = FALSE]
warning <- c('Regarding findHumdrum/readHumdrum file-search patterns:\n\t\t',
glue::glue("{num2print(sum(duplicates), capitalize = TRUE)} ",
"{plural(sum(duplicates), 'files match', 'file matches')} ",
"match more than one search-pattern. \n"))
warning(warning, call. = FALSE, immediate. = FALSE, noBreaks. = FALSE)
message(strrep('-', options('width')$width),
'\nhumdrumR WARNING\n\t',
warning,
sep = '')
if (verbose) {
uniqcombos <- unique.matrix(duptable)
for (i in 1:nrow(uniqcombos)) {
curdup <- rownames(duptable)[apply(duptable, 1, \(row) all(row == uniqcombos[i,]))]
message('\n',
glue::glue("The following files match patterns ",
paste0('\t', glue::glue_collapse(colnames(duptable)[uniqcombos[i, ] > 0L],
sep = ', ', last = ' AND ')),
'\t:\n\t',
paste0('\t', glue::glue_collapse(curdup)),
'\n'))
# cat('\n')
}
}
message('\n\t', appendLF = FALSE,
if (allowDuplicates) {
"If this is a problem, be more specific in your searches or specify allowDuplicates = FALSE
to force humdrumR to only match each file once.\n"
} else {
"Since allowDuplicates = FALSE, these files are only being read once and
associated with the first pattern they match.\n"
},
strrep('-', options('width')$width),
'\n')
return(NULL)
}
shortFilenames <- function(fileFrame) {
# This function takes a list of full directory paths
# and finds the shortest unique version of each one.
# In most cases, just the file name is returned,
# but if two (or more) directories contain files
# with the same names, their directory names are retained
# to distinguish them.
runs <- rle(fileFrame$Filepath)
shorten <- function(fns) {
stepin <- stringr::str_replace(fns, '[^/]*/', '')
duples <- duplicated(stepin) | rev(duplicated(rev(stepin))) |
stepin == fns | stepin == '' | is.na(stepin)
ifelse(duples, fns, Recall(stepin))
}
runs$values <- shorten(runs$values)
fileFrame[ , Filename := inverse.rle(runs)]
fileFrame
}
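# For example, given hypothetical Filepath values
#   c('Mozart/sonata.krn', 'Haydn/sonata.krn', 'Mozart/k330.krn'),
# the resulting Filename field would be
#   c('Mozart/sonata.krn', 'Haydn/sonata.krn', 'k330.krn'):
# the duplicated 'sonata.krn' names keep their directories, while the unique file is shortened.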
#############################################################################################---
########################### readHumdrum ----
################################################################--
## This is the main event: the function which parses text files (found and read already by findHumdrum and readFiles)
## readHumdrum itself is the only exported (public-facing) function.
#' Find and read humdrum files into R
#'
#' These functions find valid humdrum files on your local machine and read them into `humdrumR`.
#'
#' `findHumdrum` does the work of finding and reading the text files into `R`.
#' `readHumdrum` utilizes `findHumdrum` to read files, then parses them to
#' create a [humdrum table][humTable] and build
#' a [humdrumR data object][humdrumRclass] around the table.
#'
#'
#' @param ... ***One or more patterns used to identify files to read.***
#'
#' Must be `character` strings.
#'
#' For details: see the "REpath-patterns" section below.
#'
#' @param recursive ***Should files be found recursively through sub directories?***
#'
#' Defaults to `FALSE`.
#'
#' Must be a singleton `logical` value: an on/off switch.
#'
#' If `TRUE`, the final part of the search pattern (i.e., the file search) is searched for recursively through all sub directories.
#'
#' @param allowDuplicates ***Indicating what should happen if multiple search patterns match the same files.***
#'
#' Defaults to `FALSE`.
#'
#' Must be a singleton `logical` value: an on/off switch.
#'
#' If `allowDuplicates = TRUE`,
#' any such files are read multiple times, grouped into their respective corpora by the `Label` field.
#' If `allowDuplicates = FALSE`, any redundant files are only read into the corpus of the first pattern they
#' match.
#'
#' @param contains ***REGEX for filtering files.***
#'
#' Defaults to `NULL`.
#'
#' Must be `character`.
#'
#' If `!is.null(contains)`, the `contains` argument is
#' is treated as regular expressions: only files which contain matches to
#' *all* of these regular expressions are read.
#' Thus, `readHumdrum('.*krn$', contains = "EE")` will only read kern files which contain matches
#' to `"EE"`---which is kern for the E two octaves below middle C (or lower).
#'
#'
#' @param verbose ***Whether to print filename while reading or not.***
#'
#' Defaults to `FALSE`.
#'
#' Must be a singleton `logical` value: an on/off switch.
#'
#' If `TRUE`, the names of matching files are printed before parsing begins. This is very
#' useful as a check to make sure you aren't reading the wrong files!
#'
#' @param tandems ***Controls which, if any, tandem interpretations are parsed into their own fields.***
#'
#' Defaults to `"known"`.
#'
#' Must be `character`.
#'
#' @param reference ***Which reference records should be parsed into fields.***
#'
#' Defaults to `"all"`.
#'
#' Must be `character`.
#'
#' @section REpath-patterns:
#'
#' "REpath-patterns" are specified using `...` arguments.
#' In combination, all the `...` arguments are used to search for file paths.
#' Each part of the search path you specify (`"dirpart/dirpart/filepart"`, etc.) is matched as a regular expression
#' against directories/files on your disc.
#' Thus, we can say things like `findHumdrum("../^A.*/.*krn$")`, which would
#' match any kern files in any directory beginning with a capital `"A"` in the
#' directory above the current working directory.
#' For convenience, you can break the path across multiple arguments instead of using delimited strings: For example, the code
#' `findHumdrum("..", "^A.*", ".*krn$")` will give an identical result to the previous example
#' (`findHumdrum("../^A.*/.*krn$")`).
#' This is useful when searching for more than one pattern (see next paragraph) in the same directory.
#'
#' If you want to search for *more than one* pattern, you can input them as a character vector:
#' For instance, `readHumdrum(c("mozart", "beethoven"))`---this command will search for
#' filenames containing "mozart" OR "beethoven."
#' This works for directories too: `readHumdrum(c("Mozart", "Beethoven"), ".*krn$")` will
#' look for any kern files in directories containing "Mozart" OR "Beethoven."
#' If patterns are named, these names will show up as identifying patterns in the [humdrumR object's][humdrumRclass]
#' `Label` field. Unnamed patterns are simply labeled with numbers.
#' We refer to the files matched by each pattern as "subcorpora" of the total corpus.
#'
#' Normal (system appropriate) conventions (i.e., directories separated by `"/"`,
#' `'~'` at beginning to indicate home, `".."` to indicate directory above working directory, etc.)
#' are followed.
#' If a pattern contains a solo dot followed by a file sep---e.g., `"./"`, `"x/./y"`---this is
#' treated as the current directory, not a regular expression.
#' If a pattern contains two dots---e.g., `"../"`---this is treated as the directory above, not a regular expression.
#' If you want to create a regular expression to match any directory, use `".*/"`.
#'
#' The regex pattern `""` matches any file (it is changed to `".*"`). If you don't specify any `...` argument,
#' `findHumdrum` (or `readHumdrum`) will default to `".*"` as well.
#' Thus, `readHumdrum()` will read any humdrum files in the working directory.
#'
#' (If two or more files in different directories share the same name, a unique name is created for
#' each file by appending the names of the directories they occupy, recursively
#' until the names are unique.)
#'
#' If a single humdrum file has multiple pieces in it---meaning that all spine paths close with `*-`, then
#' open again with `**`---then they are parsed separately.
#' They are distinguished in the `Piece` field.
#' If there are no multi-piece files, `Piece` and `File` will be identical.
#'
#' @section Validity:
#'
#' `findHumdrum` and `readHumdrum` automatically ignore non-text files.
#' What's more, any files which contain humdrum syntax errors (checked by [validateHumdrum()]) are automatically
#' skipped. If you want to see specifically what errors occurred, call [validateHumdrum()]
#' directly and use its `errorReport.path` argument.
#'
#' @section Tandem Interpretations:
#'
#' All tandem interpretations in a humdrum dataset are summarized in the [humdrum table's][humTable]
#' `Tandem` field, which is described in detail [here][extractTandem()].
#' In addition, certain "known" tandem interpretations are parsed into their *own* fields automatically.
#' For example, `*clefG4` and `*clefF2` are parsed as `Clef` data, while `*k[b-]` is parsed as a `KeySignature`.
#' The "known" tandem interpretations that `humdrumR` recognizes are encoded in a built-in
#' table called `knownInterpretations`.
#' Each interpretation has a humdrumR name (`"Clef"`, `"TimeSignature"`, etc.) as well as a regular expression
#' associated with it.
#'
#' The `tandems` argument to `readHumdrum` controls which tandem interpretations are
#' parsed into their own fields. This can be helpful to either save processing time and memory
#' by *not* parsing interpretations you won't need, or to parse interpretations that
#' humdrumR doesn't recognize.
#' The default value for the `tandems` argument is `"known"`. If the `tandems` argument
#' contains `"known"` *all* tandem interpretations in the built-in `knownInterpretations`
#' table are parsed.
#' Users may specify different interpretations to parse in two ways:
#'
#' 1) character strings
#' matching one of the name values from the `Name` column of `knownInterpretations`.
#' For instance, if you specify `tandems = c('Clef', 'TimeSignature')`, only clef (e.g., `"*clefG2"`),
#' and time signature (e.g., `"*M3/4"`) intepretations will be parsed.
#' 2) if the character string(s) in `tandems` do not exactly match one of the names in
#' `knownInterpretations$Name`, they are treated as regular expressions and used to match
#' tandem interpretations in the data. This allows users to parse non-standard tandem interpretations
#' that humdrumR doesn't already know about.
#'
#' If any values in `tandems` are named, these names will be used for resulting fields.
#' If no matches to a given interpretation are found, no field is created for that interpretation.
#' If `tandems = NULL`, then no tandem interpretations are parsed.
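#'
#' For example, an illustrative call (here `'myinterp'` stands in for a hypothetical,
#' non-standard tandem interpretation):
#'
#' ```
#' readHumdrum(".*krn$", tandems = c('Clef', MyField = 'myinterp'))
#' ```
#'
#' This would parse clef interpretations into a `Clef` field, and any tandem
#' interpretations matching the regex `"myinterp"` into a field named `MyField`.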
#'
#' @section Reference Records:
#'
#' By default (`reference = "all"`), humdrumR reads all reference records in the data.
#' The reference code for each record (e.g, the "OTL", in "!!!OTL: xxx") is used as the name of
#' an associated field.
#' (If a reference record has no reference code (i.e., it lacks a colon), the field is called "Unkeyed.")
#' In large datasets with many reference records, the reference data can actually make up a large portion
#' of the humdrum table, and eat up a lot of memory. In these cases, we might not want to read
#' all (or any) reference records---we can instead read only the reference records that we are planning to use
#' in our analyses (if any).
#' If `reference = NULL`, no reference records are parsed.
#' Otherwise, the character values of `reference` are treated as reference codes and only
#' matching reference records are parsed.
#' For instance, `readHumdrum(_, reference = "OTL")` will *only* parse OTL reference records.
#' If the values of `reference` are named, these names are used to name associated fields.
#' Thus, by specifying `reference = c(Title = 'OTL')`, you can use "OTL" reference records to populate
#' a field called "Title".
#'
#' If there is more than one reference record with the same reference code,
#' whether explicitly numbered (e.g., "!!!COM1:", "!!!COM2:") or not, all are read; but rather than making two
#' or more fields, a single field is created ("COM" in this example) with the multiple values separated by ";".
#'
#' If your humdrum data includes files containing multiple pieces, special consideration is
#' needed to determine (or guess) which reference records (or other global comments) "go with" which piece.
#' Obviously, reference records at the beginning and end of each file are grouped with the first
#' and last pieces respectively.
#' However, reference records that are between pieces in any multi-piece file require some guess work.
#' `readHumdrum()` will look at reference codes and attempt to group in-between reference records
#' into pieces in a logical way by avoiding duplicated reference codes.
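#'
#' For example, an illustrative call (assuming kern files in the working directory):
#'
#' ```
#' readHumdrum(".*krn$", reference = c(Title = 'OTL', Composer = 'COM'))
#' ```
#'
#' This would parse only OTL and COM reference records, populating fields named
#' `Title` and `Composer` respectively.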
#'
#'
#' @section Spines and Paths:
#'
#'
#' In the [humdrum syntax](http://www.humdrum.org/guide/ch05/), data is placed in "spines,"
#' which are not the same as "columns" in a spreadsheet. A "column" refers to a
#' tab-delimited group of values.
#' "Spines" can be a single column, or they may (at any time) split into multiple columns,
#' which can in turn split again, using the `"*^"` interpretation token. The reverse can happen as well,
#' with two or more columns merging into a single column, using the `"*v"` token.
#' This means that, while humdrum data at first glance looks like a simple two-dimensional table,
#' it is actually a flexible tree structure. As spines split and merge, the total number of columns
#' can change during a piece, creating a "ragged" edge.
#' Another similar issue is that a corpus of humdrum files may have varying numbers of spines/columns, between pieces.
#' ("Global" comment/reference records are also a special case, as that are always a single value, even if interspersed with
#' multi-column local records.)
#'
#' `readHumdrum` assumes a slightly more strict version of the humdrum syntax:
#' that all the spines which appear at the beginning of a file (headed with exclusive interpretations
#' like `"**kern"`) can never merge into each other. Thus, a humdrum file read into `humdrumR`
#' must not end with fewer columns than it starts.
#' Spine merges (`"*v"`) can only happen within spine paths that originally split off the same spine.
#' This extra-strict specification of spine paths in the humdrum syntax is, fortunately, something that has been
#' informally followed in most humdrum datasets.
#'
#' Our strict spine-path definition makes everything work fairly simply:
#' Within a piece, the spines which appear at the beginning of the piece are the "true" spines throughout the piece, numbered
#' from left to right, starting from `1L`.
#' For each local token, the value in the `Spine` field is an integer indicating which of these
#' "true" spines it belongs to---global tokens have a `NA` value in their `Spine` field, because they do not belong to any spine.
#' Any spine path splits (`"*^"`) from the main spines form **spine paths**.
#' Every spine's paths are numbered in the `Path` field, from right to left, starting from `0L`.
#' A spine with no splits will have all `0L`s in its `Path` field.
#'
#'
#' @section Result:
#'
#' `findHumdrum` returns a "fileFrame" (`data.table`), listing all file names,
#' the patterns they match, the directories they were found in, *and* the raw text content of these files.
#'
#' `readHumdrum` returns a fully parsed [humdrumR object][humdrumRclass].
#'
#' @examples
#'
#' readHumdrum() # loads all valid humdrum files in the current directory.
#'
#' readHumdrum(".*krn$") # loads all files ending with "krn" in the currect directory
#'
#' readHumdrum("^Composers$/^Be|^Mo/.*/^Joined$/.*krn$")
#' # Goes inside the directory "Composers".
#' # Inside "Composers" looks for directories that start with "Be" or "Mo".
#' # If there are any "Be|Mo" matching directories within "Composers", matches all directories within them.
#' # Within these directories, looks for directories called "Joined".
#' # If there are any directories called "Joined", loads all files (if any) that end with "krn".
#'
#' readHumdrum("^Composers$", "^Be|^Mo", ".*", "^Joined$", ".*krn$")
#' # exactly the same as the previous!
#'
#' readHumdrum("^Composers$", c(Beethoven = "^Be", Mozart = "^Mo"), ".*", "^Joined$", ".*krn$")
#' # exactly the same as the previous, except now the two matching patterns ("^Be", or "^Mo") will be grouped
#' # in the Label field as "Beethoven" and "Mozart" respectively.
#'
#' @name readHumdrum
#' @export
readHumdrum <- function(..., recursive = FALSE, contains = NULL, allowDuplicates = FALSE, verbose = FALSE,
tandems = 'known', reference = 'all') {
fileFrame <- findHumdrum(..., contains = contains, recursive = recursive,
allowDuplicates = allowDuplicates, verbose = verbose)
#
if (nrow(fileFrame) == 0L) return(NULL)
#
message(glue::glue("Parsing {num2print(nrow(fileFrame), label = 'file')}..."), appendLF = FALSE)
if (verbose) message()
## Divide out any pieces within files
fileFrame <- separatePieces(fileFrame)
## Parse records
humtabs <- Map(
function(file, filename, piece) {
humtab <- parseRecords(file, piece, reference)
if (verbose) message(strrep(' ', 17L), '...', filename)
humtab
},
fileFrame$FileLines, fileFrame$Filepath, fileFrame$Piece)
##########-
### Assembling local record information with corpus metadata (filenames, reference records, etc.)
##########-
message("Assembling corpus...", appendLF = FALSE)
###### consolidate files into one data.table
humtab <- data.table::rbindlist(humtabs, fill = TRUE) # fill = TRUE because some pieces have different reference records
# file/piece info
fileFrame <- shortFilenames(fileFrame)
fileFrame[ , FileLines := NULL]
# combine with parsed data
humtab <- humtab[fileFrame, on = "Piece"]
## Other general information about tokens
humtab[ , Type := parseTokenType(Token, E = TRUE, S = TRUE)]
humtab <- humtab[Type != 'P']
humtab[ , Global := is.na(Spine)]
humtab[ , Pattern := NULL]
#
tandemTab <- parseTandem(humtab$Tandem, tandems)
if (!is.null(tandemTab)) humtab <- cbind(humtab, tandemTab)
#
message('Done!\n')
makeHumdrumR(humtab,
pattern = setNames(unique(fileFrame$Pattern), unique(fileFrame$Label)),
tandemFields = colnames(tandemTab))
}
parseRecords <- function(records, piece, reference) {
# This function is the biggest part of readHumdrum
# It takes a character vector representing the records
# in a single humdrum file, and outputs a data.table (an incomplete humdrum table).
global <- parseGlobal(records, reference)
local <- parseLocal(records)
humtab <- if (length(global) == 1 && is.na(global)) local else rbind(global$Data, local, fill = TRUE)
# Data record numbers
humtab$DataRecord <- rep(NA, nrow(humtab))
D <- !grepl('^[!=*]', humtab$Token)
humtab$DataRecord[D] <- match(humtab$Record[D], sort(unique(humtab$Record[D])))
#
humtab <- if (!(length(global) == 1 && is.na(global))) cbind(humtab, global$Table) else humtab
humtab[ , Piece := piece]
humtab
}
parseGlobal <- function(records, reference) {
# This function works for parseRecords
# It takes a character vector of records
# from a single humdrum file and outputs
# two things:
# Data, which is a data.table (the beginnings of a humdrum table), representing the global tokens.
# Table (also a data.table, which will fold into the humdrum table), representing the reference records.
globalr <- grep('^!!', records, useBytes = TRUE)
if (length(globalr) == 0) return(NA)
globalrecords <- records[globalr]
types <- parseTokenType(globalrecords)
refind <- grepl('^!!!', globalrecords)
list(Data = data.table(Token = globalrecords, Record = globalr, Type = types),
Table = parseReference(globalrecords[refind], reference))
}
parseReference <- function(refTable, reference) {
# This parses a character vector of reference records into
# a data.table with columns indicating reference keys
# and rows holding their values.
if (length(reference) == 0L) return(NULL)
refrecords <- stringr::str_sub(refTable, start = 4L) # rid of !!!
refTable <- as.data.table(stringi::stri_split_regex(refrecords, ': *', n = 2, simplify = TRUE))
colnames(refTable) <- c('refKeys', 'refVals')
# if there is no colon, there is no key so need to shift the first column over and insert "Unkeyed"
refTable <- refTable[refKeys != '' | refVals != '']
refTable[refKeys == '', refKeys := 'Unkeyed']
#multiple keys
refTable[ , refKeys := stringi::stri_replace_all_regex(refKeys, '[0-9]+$', '')]
if (any(duplicated(refTable$refKeys))) {
refTable <- refTable[ , .(refVals = paste(refVals, collapse = '; ')), by = refKeys]
}
# filter by reference argument
if (!(length(reference) == 1L && reference == 'all')) {
refTable <- refTable[refKeys %in% reference]
names <- .names(reference)
names[names == ""] <- reference[names == ""]
refTable[ , refKeys := names[match(refKeys, reference)]]
}
if (nrow(refTable) == 0L) {
NULL
} else {
as.data.table(as.list(setNames(refTable$refVals, refTable$refKeys)))
}
}
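# For example:
#   parseReference(c('!!!OTL: My Title', '!!!COM: A. Composer'), reference = 'all')
#   # returns a one-row data.table with columns OTL = 'My Title' and COM = 'A. Composer'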
parseLocal <- function(records) {
# This is the most substantive chunk of
# parseRecords (and thus, readHumdrum).
# Takes a character vector of local records
# returns a data.table (the makings of a humdrum table)
recordn <- grep('^!!', records, invert = TRUE, useBytes = TRUE)
localrecords <- records[recordn]
###local is list of vectors of strings
###(each string = one record)
local <- stringi::stri_split_fixed(localrecords, pattern = '\t')
#spine paths
if (any(grepl('*^', localrecords, fixed = TRUE),
grepl('*v', localrecords, fixed = TRUE),
grepl('*+', localrecords, fixed = TRUE),
grepl('*-', head(localrecords, -1), fixed = TRUE))) {
# big if statement decides whether padSpinePaths needs to be called.
# It's worthwhile to waste a little time to do it here.
local <- padSpinePaths(local)
}
###mat is character matrix
###(row = record, col = spine/subspine)
mat <- stringi::stri_list2matrix(local, byrow = TRUE, fill = '_P')
#flatten spines and get recordns for each.
rownames(mat) <- recordn
tokens <- setNames(as.vector(mat), rep(recordn, ncol(mat)))
## get spine and path numbers of appropriate length
SpineNumbers <- cumsum(apply(mat, 2, function(spine) any(spine != '_P') && grepl('^\\*\\*', spine[spine != '_P'][1])))
SpinePaths <- unlist(use.names = FALSE, tapply(SpineNumbers, SpineNumbers, seq_along, simplify = TRUE)) - 1L
Columns <- seq_along(SpineNumbers)
ParentPaths <- integer(length(SpineNumbers))
if (any(SpinePaths > 0L)) {
ind <- which(mat != '_P', arr.ind = TRUE)
ind <- ind[!duplicated(ind[ , 2]), ]
# targets <- unique(which(mat[ind[SpinePaths > 0L, 'row'] - 1L, , drop = FALSE] == '*^', arr.ind = TRUE))[ , 'col']
splitrows <- mat[ind[SpinePaths > 0L, 'row'] - 1L, , drop = FALSE]
splitrows <- which(splitrows == '*^' | splitrows == '*+', arr.ind = TRUE)
targets <- tapply(splitrows[ , 'col'], splitrows[ , 'row'], max)
ParentPaths[SpinePaths > 0L] <- SpinePaths[targets]
}
#sections
sections <- parseSections(mat[ , 1])
barlines <- parseBarlines(mat[ , 1])
#interpretations
exclusivestandems <- parseInterpretations(mat)
exclusives <- exclusivestandems$Exclusive
tandems <- unlist(exclusivestandems$Tandems)
##################### tokens
tokens <- parseMultiStops(tokens)
#If there are multistops, the spines are no longer the same lengths, and have different recordns.
stopNs <- as.integer(gsub('^.*\\.', '', names(tokens)))
# get rid of stuff before period, leaving only stop number
recordns <- gsub('\\..*$', '', names(tokens))
# get rid of stuff after period, leaving just record number (don't make numeric yet)
# expand objects to match recordn, which may have changed when multistops were introduced
# recordns are still characters. This is necessary, because we use them as names to index other objects.
sections <- sections[recordns, , drop = FALSE] # these two should always be the same in each spine,
barlines <- barlines[recordns, , drop = FALSE] # so I'm really just copying the first spine...maybe change this to work like tandems (below)?
spineLengths <- nrow(mat) + apply(mat[!grepl('^[*!=]', mat[ , 1]), , drop = FALSE], 2, \(col) sum(stringi::stri_count_fixed(col, ' ')))
Columns <- rep(Columns, spineLengths)
tandems <- tandems[paste0(SpineNumbers[Columns], '_', recordns)] # These are different in each spine
# Don't need recordns to be characters anymore.
recordns <- as.integer(recordns)
humtab <- data.table::data.table(Token = tokens,
Spine = SpineNumbers[Columns],
Path = SpinePaths[Columns],
ParentPath = ParentPaths[Columns],
Record = recordns,
Stop = stopNs,
Exclusive = exclusives[Columns],
Tandem = tandems,
barlines)
if (ncol(sections) > 0L) humtab <- cbind(humtab, sections)
# humtab <- humtab[Token != '_P'] # "_P" tokens were inserted as padding by padSpinePaths
humtab
}
separatePieces <- function(fileFrame) {
# Takes a file frame.
# It looks at each file and breaks it into subfiles if necessary
# it returns a new file frame, expanded to include newly split files
# and with new "Piece" column
filelines <- fileFrame$FileLines
containmultiple <- sapply(filelines,
\(lines) {
sum(stringi::stri_detect_regex(lines, '^\\*\\*')) > 1L
})
filelines[containmultiple] <-
lapply(filelines[containmultiple],
\(lines) {
starts <- which(stringi::stri_detect_regex(lines, '^\\*\\*'))
# find global comments which are "OUTSIDE" of pieces.
nopen <- stringi::stri_count_regex(lines,'^\\*\\*|\t\\*\\*')
nclose <- stringi::stri_count_regex(lines,'^\\*-|\t\\*-')
nspines <- cumsum(nopen - c(0L, head(nclose, -1L)))
global <- which(nspines == 0L)
# we can't just grep for ^!! because that would also capture global comments
# which are mid-piece
#
codes <- stringi::stri_extract_first_regex(lines[global], '!!![^:]+:')
groups <- findInterval(global, starts)
# Look at blocks of global records between pieces and see if we can improve alignment
# by shifting some reference records between adjacent blocks.
blocks <- table(groups = factor(groups, levels = 0L:length(starts)), codes)
blocks[2L, ] <- blocks[2L, ] + blocks[1L, ]
blocks[length(starts), ] <- blocks[length(starts), ] + blocks[nrow(blocks), ]
blocks <- head(tail(blocks, -1L), -1L)
shift <- which(blocks > 1L & rbind(blocks[-1 , , drop = FALSE], FALSE) == 0L, arr.ind = TRUE)
if (nrow(shift)) {
shift <- Map(shift[ , 'groups'], shift[ , 'codes'],
f = \(group, code) groups == rownames(blocks)[group] & codes == colnames(blocks)[code])
shift <- Reduce('|', shift)
shift <- tapply_inplace(shift,
groups,
\(s) {
not <- which(!s)
if (any(not)) seq_along(s) > max(which(!s)) else !logical(length(s))
}) # makes sure only block of TRUE on right edge are shifted
groups <- ifelse(shift, groups + 1L, groups)
}
# get records
groups[groups == 0L] <- 1L
records <- tapply(global[groups <= length(starts)],
factor(groups[groups <= length(starts)], levels = 1L:(length(starts))),
min)
starts <- pmin(records,starts, na.rm = TRUE)
# divide into pieces
pieces <- cumsum(seq_along(lines) %in% starts)
split(lines, f = pieces)
})
filelines[!containmultiple] <- lapply(filelines[!containmultiple], list)
## spread out
newFrame <- data.table(FileLines = unlist(filelines, recursive = FALSE),
Pattern = rep(fileFrame$Pattern, lengths(filelines)),
Filepath = rep(fileFrame$Filepath, lengths(filelines)))
newFrame[ , Piece := 1:nrow(newFrame)]
# newFrame[ , Piece := unlist(lapply(lengths(filelines), seq_len))]
fileFrame <- fileFrame[ , c('Filepath', 'File', 'Label')]
newFrame[fileFrame, on = 'Filepath']
}
#
parseTokenType <- function(spine, E = FALSE, S = FALSE) {
# This is called by parseRecords.
# It simply categorizes tokens by type,
# to create the humdrum table's Type field.
out <- rep('D', length(spine))
out[spine == '_P'] <- 'P'
out[spine == '.'] <- 'd'
out[stringi::stri_detect_regex(spine, '^\\*')] <- 'I'
out[stringi::stri_detect_regex(spine, '^=')] <- 'M'
out[stringi::stri_detect_regex(spine, '^!')] <- 'L'
out[stringi::stri_detect_regex(spine, '^!!')] <- 'G'
if (E) out[stringi::stri_detect_regex(spine, '^\\*\\*')] <- 'E'
if (S) out[stringi::stri_detect_regex(spine, '^\\*[-v^>]|\\*tb')] <- 'S'
out
}
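# For example:
#   parseTokenType(c('**kern', '*clefG2', '!! comment', '4c', '.', '=1'), E = TRUE)
#   # returns c('E', 'I', 'G', 'D', 'd', 'M')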
padSpinePaths <- function(local) {
# Used by parseLocal to make sense of spine paths.
# This function takes a list of character vectors (each representing a list of tokens from one record)
# identifies spine paths in them, and pads the records with "_P" such that
# subspines (spine paths) are grouped within their spine, and all rows are the same length.
####NEED TO ADD OPTIONS FOR ** AND *-
minpath <- min(sapply(local, Position, f = \(x) x %in% c('*^', '*v', '*+', '*-')), na.rm = TRUE)
lapply(minpath:max(lengths(local)),
\(j) {
open <- sapply(local, \(row) length(row) >= j && (row[j] == '*^' || row[j] == '*+'))
close <- sapply(local, \(row) length(row) > j && all(row[j:(j + 1)] == '*v'))
if (any(open) | any(close)) {
open <- c(FALSE, head(open, -1))
close <- c(FALSE, head(close, -1))
pad <- cumsum(as.numeric(open) + -as.numeric(close))
pad <- abs(pad - max(pad))
local[pad > 0] <<- Map(append,
x = local[pad > 0],
values = lapply(pad[pad > 0], \(n) rep('_P', n)),
after = j)
}
close <- sapply(local, \(row) length(row) > j && all(row[j] == '*-'))
if (any(head(close, -1))) {
pad <- cumsum(c(FALSE, head(close, -1)))
local[pad > 0] <<- Map(append, x = local[pad > 0], values = '_P', after = j - 1)
}
}
)
local
}
parseInterpretations <- function(spinemat) {
# This function is called by parseLocal to create the basic
# Exclusive and Tandem interpretation fields for the humdrum table.
# It takes a spinematrix (a character matrix representing
# humdrum records and padded columns)
# It returns 1) a character vector of exclusive interpretations (one per column)
# and 2) a list of character vectors representing comma-delimited, backwards-cumulative tandem interpretations,
# i.e. c("*clefF4",
# "*F:,*clefF4",
# "*k[b-],*F:,*clefF4")
spinemat[is.na(spinemat)] <- '_P' # Not sure why this is necessary (Nat, December 2018)
lapply(1:ncol(spinemat),
\(j) {
if (j > 1 && any(spinemat[ , j] == '_P')) {
spinemat[spinemat[ , j] == '_P', j] <<- spinemat[spinemat[ , j] == '_P', j - 1]
}
spine <- spinemat[ , j]
interpind <- rev(grep('^\\*[^*->^v]', spine))
interpind <- interpind[interpind > tail(which(stringi::stri_startswith_fixed(spine, '**')), 1)]
interps <- stringr::str_sub(spine[interpind], start = 2L)
setNames(sapply(seq_along(spine), \(i) paste(interps[i >= interpind], collapse = ',')),
paste0(j, '_', rownames(spinemat)))
}
) -> tandemIs
# exclusive
exclusiveI <- apply(spinemat, 2, \(spine) tail(spine[stringi::stri_startswith_fixed(spine, '**')], 1))
exclusiveI <- stringr::str_sub(exclusiveI, start = 3) # strip away **
list(Exclusive = exclusiveI,
Tandems = tandemIs)
}
parseTandem <- function(tandems, known) {
# This function takes the parsed, cumulative
# Tandem fields produced by parseInterpretations
# and, using functions from the file humInterpretations.R
# identifies any "known" (i.e., standard) interpretations
# and parses these into a table of new Interpretation fields
# for the humdrum table.
# The known argument indicates which tandem interpretations we want to recognize.
if (length(known) == 0L || all(tandems == "", na.rm = TRUE)) return(NULL)
REs <- getRE(known)
names(REs)[names(known) != ''] <- names(known)[names(known) != '']
if (any(REs == 'known')) {
REs <- c(REs[REs != 'known'],
knownInterpretations[Type == 'Tandem', getRE(Name, types = 'Tandem')])
}
# parse the stacked tandem vector
uniqueTandem <- unique(unlist(stringr::str_split(unique(tandems), ',')))
uniqueTandem <- uniqueTandem[!is.na(uniqueTandem) & uniqueTandem != '']
# Reduce REs to only those that appear in data
REs <- REs[apply(do.call('cbind', lapply(REs, stringr::str_detect, string = uniqueTandem)), 2, any)]
tandemMat <- lapply(REs,
\(re) stringr::str_match(tandems, re)[ ,1]) #finds first instance
names(tandemMat) <- names(REs)
as.data.table(tandemMat)
}
#' Get tandem interpretation information from humdrum data
#'
#' `extractTandem` extracts tandem interpretations from the raw `Tandem`
#' field in a [humdrumR object][humdrumRclass].
#'
#' @details
#'
#' Every [humdrumR][humdrumRclass] object has a field called
#' `Tandem`, which is a vector of strings which accumulates
#' tandem interpretations in each Spine.
#' At each record, all the previous tandems that occurred in each spine
#' are listed (comma separated), with the most recent appearing first.
#'
#' For example, consider this file:
#'
#' ```
#' **kern **kern
#' *C: *C:
#' *Ibass *Isoprn
#' 2G 4g
#' . 4f
#' 2C 2e
#' *G: *G:
#' 2D 2f#
#' 2G 2g
#' *- *-
#' ```
#'
#' The `Tandem` field for these two spines would look like this:
#'
#' ```
#' ""
#' "C:" "C:"
#' "Ibass,C:" "Isoprn,C:"
#' "Ibass,C:" "Isoprn,C:"
#' "Ibass,C:" "Isoprn,C:"
#' "Ibass,C:" "Isoprn,C:"
#' "G:,Ibass,C:" "G:,Isoprn,C:"
#' "G:,Ibass,C:" "G:,Isoprn,C:"
#' "G:,Ibass,C:" "G:,Isoprn,C:"
#' "G:,Ibass,C:" "G:,Isoprn,C:"
#' ```
#'
#' Notice that the `"C:"` isn't erased by the appearance of `"G:"`---this is a naive parser
#' that doesn't "know" that `"C:"` and `"G:"` are related.
#' The earlier tandem (`"C:"`) is just pushed further back onto the stack.
#'
#'
#' Don't worry, the [humdrumR data parser][readHumdrum()] *does* recognize many
#' common tandem interpretations (like `*C:` and `*G:`) and will automatically parse
#' them if they are present---in this case, they are put into the `Key` field automatically.
#' However, the `Tandem` field is retained in case your data contains any novel tandem intepretations
#' that `humdrumR` does not recognize.
#'
#' @section extractTandem:
#'
#' If your data *does* contain novel/unknown tandem interpretations, you can use the
#' `extractTandem` function to pull them out of the `Tandem` field.
#' The first argument to `extractTandem` must be the `Tandem` field from a
#' [humdrumR object][humdrumRclass].
#' The second argument (`regex`) is a regular expression which is matched against
#' the the tandem interpretations.
#' For each token in `Tandem`, the most recent match (if any) is retained.
#'
#' For example, if we wanted to manually extract the key information from the `Tandem` field
#' (which `humdrumR` automatically does for you), we could call `extractTandem(Tandem, "[A-Ga-g][#-]*:")`.
#'
#' @param Tandem ***Parsed tandem interpretation data.***
#'
#' Must be `atomic`.
#'
#' Should always be the `Tandem` field from a [humdrumR object][humdrumRclass].
#'
#' @param regex ***A regular expression to match tandem interpretations.***
#'
#' Must be a single `character` string.
#'
#' You should not include a `*` at the beginning---the `*` markers for tandem interpretations are already removed from the `Tandem` field.
#'
#' @seealso {Read more about tandem interpretations in `humdrumR` in the
#' [humdrum table docs][humTable]. [readHumdrum()] also preprocesses
#' some "known" tandem interpretations. }
#' @export
tandem <- function(regex, Tandem) {
checks(regex, xcharacter & xlen1)
Tandem <- paste0(',', Tandem, ',')
regex <- gsub('\\^', '', regex)
regex <- gsub('\\$', '', regex)
regex <- paste0(',', regex, ',')
matches <- stringr::str_extract(Tandem, pattern = regex)
stringr::str_sub(matches, 2L, -2L) # get rid of commas
}
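# For example (hypothetical Tandem values):
#   tandem('[A-Ga-g][#-]*:', c('G:,Ibass,C:', 'Isoprn,C:'))
#   # returns c('G:', 'C:') (the most recent key interpretation in each token)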
parseMultiStops <- function(spine) {
# This function is used by parseLocal to stretch out spines
# containing multi stops (i.e., tokens separated by spaces), so each stop is represented by its own token.
# This takes a character vector representing a single
# spine of tokens, with the names of the vector representing
# record numbers.
# It returns a new character vector of tokens, with multi stops
# split into their own character strings, and with new record numbers enumerating
# any multi stops: "X.01", "X.02", "X.03", etc.
# Single-stop and non-data tokens get labelled "X.01".
nspaces <- stringi::stri_count_fixed(spine, ' ') #count multistops
nspaces[grepl('^[!*=]', spine)] <- 0 #except not in non-data tokens
if (!any(nspaces > 0)) return(setNames(spine, paste0(names(spine), '.01'))) # if there are no multistops, just add ".01" to names
Map(function(recn, n) gsub('0\\.', '.', paste0(recn, .01 * seq_len(n))),
names(spine),
nspaces + 1) -> multiRecordn
spine[!grepl('^[!=*]', spine)] <- strsplit(spine[!grepl('^[!=*]', spine)], split = ' ')
setNames(unlist(spine), unlist(multiRecordn))
}
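# For example:
#   parseMultiStops(setNames(c('4c 4e 4g', '4d'), c('5', '6')))
#   # returns c('5.01' = '4c', '5.02' = '4e', '5.03' = '4g', '6.01' = '4d')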
parseSections <- function(spine) {
# This function is called by parseLocal in order
# to parse section information in the humdrum data
# and make Section fields (for the humdrum table) as appropriate.
# It takes a character vector representing a spine
# and returns a data.frame of section fields (or NULL if none)
if (!(any(grepl('^\\*>', spine)))) return(data.frame(row.names = names(spine)))
sectionTypes <- unique(stringi::stri_extract_all_regex(grep('\\*>[^>]*>[^>]', spine, value = TRUE), '\\*>[^>]*>'))
sectionTypes <- paste0('^\\', c('*>[^>]*$', sectionTypes))
nesting <- max(stringi::stri_count_fixed(stringr::str_sub(grep('^\\*>', spine, value = TRUE), 3L), '>'))
matrices <- lapply(sectionTypes,
\(type) {
hits <- grepl(type, spine)
if (!any(hits)) return(data.frame(row.names = names(spine)))
depth <- rep(NA_integer_, length(spine))
depth[hits] <- stringi::stri_count_fixed(stringr::str_sub(spine[hits], 3L), '>')
typemat <- if (any(depth != 0, na.rm = TRUE)) {
sapply(sort(unique(depth[depth != 0])),
\(N) {
cats <- c(NA_character_, spine[hits & depth <= N])
cats[which((hits & depth < N)[hits & depth <= N]) + 1] <- "" # NA_character_
cats[cumsum(hits & depth <= N) + 1]
} )
} else {
matrix(c(NA_character_, spine[hits])[cumsum(hits) + 1], ncol = 1)
}
name <- paste0('Formal', if (type == '^\\*>[^>]*$') '' else gsub('[\\^>*]' , '', type))
colnames(typemat) <- rep(name, ncol(typemat))
if (ncol(typemat) > 1L) colnames(typemat) <- paste0(colnames(typemat), c("", paste0("_", 1:(ncol(typemat) - 1))))
rownames(typemat) <- names(spine)
if (type == '^\\*>[^>]*$') gsub('^\\*>', '', typemat) else gsub(paste0(type, '>*'), '', typemat)
} )
as.data.frame(matrices, stringsAsFactors = FALSE)
}
parseBarlines <- function(spine) {
# This function is used by parseLocal
# to extract the three barline related fields from
# a humdrum spine.
# It takes a character vector representing a humdrum spine
# and returns a data.frame with three columns representing the three barline fields (Bar, DoubleBar, BarLabel)
Singles <- cumsum(grepl('^=', spine))
Doubles <- cumsum(grepl('^==', spine))
BarLabels <- stringr::str_sub(grep('^=', spine, value = TRUE), 2L)[ifelse(Singles == 0L, NA_integer_, Singles)]
data.frame(Bar = Singles,
DoubleBar = Doubles,
BarLabel = BarLabels,
row.names = names(spine), stringsAsFactors = FALSE)
}
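# For example:
#   parseBarlines(setNames(c('**kern', '4c', '=1', '4d', '=='), 1:5))
#   # returns a data.frame with Bar = c(0, 0, 1, 1, 2), DoubleBar = c(0, 0, 0, 0, 1),
#   # and BarLabel = c(NA, NA, '1', '1', '=')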
#############################################################################################---
########################### readTSV ----
################################################################--
readTSV <- function(paths) {
if (dir.exists('.humdrumR_tmp')) {
content <- dir('.humdrumR_tmp', full.names = TRUE)
for (file in content) file.remove(file)
} else {
dir.create('.humdrumR_tmp')
}
message("Naively converting ", num2str(length(paths)), ' tsv file',
plural(length(paths), 's', ''), ' to basic humdrum files...',
appendLF = FALSE)
newfilepaths <- lapply(paths,
\(path) {
tsv <- fread(file = path)
colnames(tsv) <- paste0('**', colnames(tsv))
tsv[] <- lapply(tsv, as.character)
tsv[] <- lapply(tsv, \(col) {col[col == '' | is.na(col)] <- '.'; col})
ender <- replicate(ncol(tsv), '*-')
ender <- rlang::eval_tidy(rlang::expr(data.table(!!!ender)))
colnames(ender) <- colnames(tsv)
tsv <- rbind(tsv, ender)
newpath <- paste0('.humdrumR_tmp', .Platform$file.sep, basename(path), '.humdrumR')
fwrite(tsv, sep = '\t', file = newpath)
newpath
})
message('done!')
corpus <- readHumdrum('.humdrumR_tmp/.*')
for (file in newfilepaths) file.remove(file)
unlink('.humdrumR_tmp', recursive = TRUE)
corpus
}