# columnDescriptor -------------------------------------------------------------
#' Column Descriptor
#'
#' @param match pattern or fixed text to match in header line
#' @param fixed if TRUE, \emph{match} is taken as a fixed string to be looked
#' for in the header line, otherwise it is interpreded as a regular expression
#' @export
#'
columnDescriptor <- function(match = ".*", fixed = FALSE)
{
list(match = match, fixed = fixed)
}
# readCsvInputFile -------------------------------------------------------------
#' Read CSV File Giving Column Descriptions
#'
#' @param csv full path to CSV file
#' @param sep column separator
#' @param dec decimal character
#' @param headerRow number row in which the header (containing column captions)
#' is found
#' @param headerPattern pattern matching the header row. If \emph{headerPattern}
#' is given \emph{headerRow} is not considered
#' @param columnDescription list of column descriptors. The list elements are
#' named with the name of the list elements being the names that shall be used
#' in the returned data frame. Each list element is a list with elements
#' \emph{match} (pattern to be looked for in the header fields), ...
#' @param maxRowToLookForHeader maximum number of rows to be considered when
#' looking for the header row
#' @param stopOnMissingColumns if TRUE (default) the program stops if not all
#' columns defined in \emph{columnDescription} are found
#' @param fileEncoding encoding of the input file
#' @param encoding passed to readLines, "Latin-1" or "UTF-8"
#' @param \dots further arguments passed to read.table
#' @export
readCsvInputFile <- function(
csv, sep, dec, headerRow = 1, headerPattern = "", columnDescription = NULL,
maxRowToLookForHeader = 10, stopOnMissingColumns = TRUE,
fileEncoding = "UTF-8", encoding = "unknown", ...
)
{
if (! file.exists(csv)) {
stop("No such file: ", csv)
}
if (headerPattern != "") {
fileLines <- catchWarning(readLinesWithEncoding(
file = csv,
fileEncoding = fileEncoding,
n = maxRowToLookForHeader,
warn = FALSE,
encoding = encoding
))
stopOnWarning(fileLines, where = "readLinesWithEncoding()")
headerRow <- catchWarning(grep(headerPattern, fileLines))
stopOnWarning(headerRow, where = "grep()")
}
if (isNullOrEmpty(headerRow)) {
stopFormatted(
paste0(
"I could not find the header row within the first %d lines!\n",
"I was looking for: '%s'"
),
maxRowToLookForHeader,
headerPattern
)
}
headerFields <- readAndSplitRowInFile(
csv, headerRow, sep, encoding = encoding
)
if (is.null(columnDescription)) {
columnDescription <- defaultColumnDescription(headerFields)
}
if (stopOnMissingColumns) {
stopIfNotEnoughColumns(headerFields, columnDescription, sep)
}
newColumnDescription <- findColumnNumbersByMatchingPatterns(
headerFields, columnDescription
)
if (stopOnMissingColumns) {
stopIfNotAllColumnsFound(newColumnDescription, headerFields)
}
warnOnMultipleMatches(newColumnDescription, headerFields)
# if there is a duplicate caption, take only the first!
colNumbers <- as.integer(
sapply(newColumnDescription, function(x){x[["colNumber"]][1]})
)
# remove indices of described columns that were not found
colNames <- names(columnDescription)[!is.na(colNumbers)]
colNumbers <- colNumbers[!is.na(colNumbers)]
if (length(colNumbers) == 0) {
warning("Not at least one of the described columns found.\n",
msgAvailableFields(headerFields))
return()
}
data <- utils::read.table(
csv, sep = sep, dec = dec, header = FALSE, skip = headerRow, ...
)
stats::setNames(data[, colNumbers, drop = FALSE], colNames)
}
# stopOnWarning ----------------------------------------------------------------
stopOnWarning <- function(x, where = "where?")
{
msg <- attr(x, "warningMessage")
if (!is.null(msg)) {
stopFormatted(
"There was a warning in '%s'. Please have a look at it:\n%s\n",
where,
collapsed(msg, "\n")
)
}
}
# msgAvailableFields -----------------------------------------------------------
#' Message Listing Available Fields
#'
#' Message to be shown if fields/columns are missing
#'
#' @param x vector of character
#' @export
#'
msgAvailableFields <- function(x)
{
sprintf("\nAvailable columns:\n %s", numberedEnumeration(x))
}
# readAndSplitRowInFile --------------------------------------------------------
readAndSplitRowInFile <- function(csv, rowNumber, sep, encoding, version = 2)
{
if (version == 1 ) {
fields <- utils::read.table(
file = csv,
sep = sep,
nrows = rowNumber,
fill = TRUE,
header = FALSE,
encoding = encoding
)
fields <- utils::tail(fields, 1)
} else {
fields <- utils::read.table(
file = csv,
sep = sep,
skip = rowNumber - 1,
nrows = 1,
header = FALSE,
encoding = encoding
)
}
as.character(as.matrix(fields))
}
# defaultColumnDescription -----------------------------------------------------
defaultColumnDescription <- function(headerFields)
{
columnDescription <- list()
for (headerField in headerFields) {
# ignore NA header fields
if (!is.na(headerField)) {
columnDescription[[toColumnName(headerField)]] <- columnDescriptor(
match = headerField,
fixed = TRUE
)
}
}
columnDescription
}
# toColumnName -----------------------------------------------------------------
toColumnName <- function(x)
{
substSpecialChars(x)
}
# stopIfNotEnoughColumns -------------------------------------------------------
stopIfNotEnoughColumns <- function(headerFields, columnDescription, sep)
{
ncol <- length(headerFields)
ncolRequired <- length(columnDescription)
if (ncol < ncolRequired) {
stop(sprintf(
paste("I found only %d of %d required column(s).",
"Is '%s' the correct column separator?",
"I read the following header fields:\n %s"),
ncol, ncolRequired, sep, numberedEnumeration(headerFields)
))
}
}
# numberedEnumeration ----------------------------------------------------------
numberedEnumeration <- function(x)
{
paste0(seq_along(x), ". ", hsQuoteChr(x), collapse = "\n ")
}
# findColumnNumbersByMatchingPatterns ------------------------------------------
findColumnNumbersByMatchingPatterns <- function(
headerFields, columnDescription
)
{
for (colName in names(columnDescription)) {
pattern <- columnDescription[[colName]]$match
fixed <- columnDescription[[colName]]$fixed
columnDescription[[colName]]$colNumber <- grep(
pattern, headerFields, fixed = ifelse(is.null(fixed), FALSE, fixed)
)
}
columnDescription
}
# stopIfNotAllColumnsFound -----------------------------------------------------
stopIfNotAllColumnsFound <- function(columnDescription, headerFields)
{
notFound <- sapply(
columnDescription, FUN = function(x) { length(x$colNumber) == 0 }
)
if (any(notFound)) {
msg <- "The following columns could not be found with the given patterns:\n "
stop(
msg,
paste(collapse = "\n ", sprintf(
"%s: '%s'", names(columnDescription)[notFound],
sapply(columnDescription[notFound], "[[", "match")
)),
msgAvailableFields(headerFields)
)
}
}
# warnOnMultipleMatches --------------------------------------------------------
warnOnMultipleMatches <- function(columnDescription, headerFields)
{
ambiguous <- sapply(
columnDescription, FUN = function(x) { length(x$colNumber) > 1 }
)
if (any(ambiguous)) {
msg <- paste(
"For the following patterns more than one columns were found.",
"The first match is used in each case:\n ",
paste(
sprintf(
"'%s' matches %s",
sapply(columnDescription[ambiguous], "[[", "match"),
sapply(columnDescription[ambiguous], function(x) {
commaCollapsed(hsQuoteChr(headerFields[x$colNumber]))
})
)
),
collapse = "\n "
)
warning(msg)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.