# function grepcolumns correctly processes regex patterns starting with '^' or ending with '$'
grepcolumns <- function(pattern, d) {
# indicate columns with matches
m <- mapply(function(x) length(grep(pattern, x)) > 0, d)
# pattern found?
grep(TRUE, m)
}
#' @rdname df_pattern_subset
#'
#' @param d a \code{data.frame}
#' @param pattern a \code{regex} pattern
#' @param strategy character specifying if upmost left ("ul") or lowermost right ("lr") occurrence
#' is to be sought.
#'
#' @return \code{df_pattern_locate} returns an integer vector of length 2 (i.e. a tuple)
#' specifying row and column number, or \code{c(NA, NA)} if no occurrence of \code{pattern} found
#' @export
#'
df_pattern_locate <- function(d, pattern, strategy = c("ul", "lr")) {
# is the upper left or lower right corner to be located?
strategy = match.arg(strategy)
# grep applied to a data.frame checks if the pattern is present in a column!
columnswithpattern = grepcolumns(pattern, d)
# if length == 0, the pattern is not to be found within d
if (length(columnswithpattern) == 0) {
message(paste0("Pattern `", pattern, "` not found!"))
return(c(NA_integer_, NA_integer_))
}
# colwithp is a scalar containing the column number corresponding to the pattern and strategy
colwithp <- ifelse(strategy == "ul", min(columnswithpattern), max(columnswithpattern))
# rowswithpattern is generated from column colwithp (as this one contains the appropriate pattern)
rowswithpattern <- grep(pattern, pull(d, colwithp))
# rowwithp is a scalar containing the row number corresponding to the pattern and strategy
rowwithp <- ifelse(strategy == "ul", min(rowswithpattern), max(rowswithpattern))
# return the result as a tuple
c(rowwithp, colwithp)
}
#' @name df_pattern_subset
#' @title Subset a data frame given two patterns
#'
#' @description \code{df_pattern_locate} locates the up- and leftmost or lower- and rightmost occurrence of a
#' given pattern within a given \code{data.frame}.
#'
#' \code{df_pattern_subset} returns a subset of a \code{data.frame} given two patterns, which correspond to
#' the upper left and lower right corner of the subset.
#'
#' @param d a \code{data.frame}
#' @param pattern_ul \code{regex} pattern specifying upper left corner
#' @param pattern_lr \code{regex} pattern specifying lower left corner
#' @param ignore_columns logical; if \code{TRUE}, filtering of columns is omitted, irrespective of columns where patterns were found
#' @param ignore_rows logical; if \code{TRUE}, filtering of rows is omitted, irrespective of rows where patterns were found
#' @param which_header character, specifying if a new header based on the row above ("row above", default) the upper left corner
#' or if old table headers ("table header") be used
#' @param adjust_ul integer vector of length 2 specifying addends to row and column to determine upper left corner
#' @param adjust_lr integer vector of length 2 specifying addends to row and column to determine lower right corner
#'
#' @return \code{df_pattern_subset} returns a new \code{data.frame} extending from the upper left to the
#' lower right corner previously located (using \code{pattern_ul} and \code{pattern_lr}, respectively).
#' @seealso [rbinder()] for combining multiple csv-like files with identical headers; \code{df_pattern_subset} can be used to create
#' custom readers for \code{rbinder}.
#' @export
#'
#' @examples
#' # # load fantasy data from an excel file (generation of data see below):
#' # # fantasy <- read_xlsx(paste0(system.file("extdata", package = "kungfu"), "/fnts.xlsx"))
#' # fantasy_subset <- df_pattern_subset(fnts, "43", "^dun.*dun$")
#' #
#' # # locate pattern
#' # df_pattern_locate(fnts, "dunbumm", "ul")
#' # # returns c(3, 2)
#' #
#' # # generate fantasy excel file fnts.xlxs
#' # library(openxlsx)
#' # syllables <- c("arg", "ton", "hun", "kon", "dun", "bumm", "samm", "kur", "dem", "schim", "to")
#' # word <- function(s = syllables, range = c(1,3), clps = "") {
#' # paste(sample(s, round(runif(1, min(range), max(range))), replace = TRUE), collapse = clps)
#' # }
#' # words <- function(n, ...) mapply(function(x) word(...), 1:n)
#' # set.seed(1)
#' # fnts <- data.frame(id = sample(1:200, 100),
#' # bumka = words(100),
#' # humka = words(100, range = c(3,6), clps = " "),
#' # group = sample(LETTERS, 100, replace = TRUE),
#' # groko = words(100, range = c(2,4), clps = ", "))
#' #
#' # write.xlsx(fnts, "../inst/extdata/fnts.xlsx", overwrite = TRUE)
df_pattern_subset <- function(d,
pattern_ul = NULL,
pattern_lr = NULL,
ignore_columns = FALSE,
ignore_rows = FALSE,
which_header = c("row above", "table header"),
adjust_ul = c(0, 0),
adjust_lr = c(0, 0)) {
# validate input
if (is.null(pattern_ul) & is.null(pattern_lr)) stop("Please provide either 'pattern_ul' or 'pattern_lr'!")
which_header <- match.arg(which_header)
# initialize locations of upper left and lower rigth corners
location_ul <- c(NA, NA)
location_lr <- c(NA, NA)
# locate new corners according to pattern_ul and pattern_lr
if (!is.null(pattern_ul)) location_ul = df_pattern_locate(d, pattern_ul, strategy = "ul")
if (!is.null(pattern_lr)) location_lr = df_pattern_locate(d, pattern_lr, strategy = "lr")
# "ignore columns" means: erase information pertaining to column selection, i.e. have all columns returned
if (ignore_columns) {
location_ul[2] <- NA # column no. upper left corner
location_lr[2] <- NA # column no. lower right corner
}
# "ignore rows" means: erase information pertaining to row selection, i.e. have all rows returned
if (ignore_rows) {
location_ul[1] <- NA # row no. upper left corner
location_lr[1] <- NA # row no. lower right corner
}
# adjust, if so specified
location_ul <- location_ul + adjust_ul
location_lr <- location_lr + adjust_lr
# create indices
index_rows <- rep(TRUE, nrow(d))
index_cols <- rep(TRUE, ncol(d))
# trim data.frame row- and columnwise
if (!is.na(location_ul[1])) { # row no. of upper left corner
if (location_ul[1] > 1 & location_ul[1] <= nrow(d))
index_rows[1:(location_ul[1] - 1)] <- FALSE
}
if (!is.na(location_ul[2])) { # col no. of upper left corner
if (location_ul[2] > 1 & location_ul[2] <= ncol(d))
index_cols[1:(location_ul[2] - 1)] <- FALSE
}
if (!is.na(location_lr[1])) { # row no. of lower left cornder
if (location_lr[1] > 0 & location_lr[1] < nrow(d))
index_rows[(location_lr[1] + 1):nrow(d)] <- FALSE
}
if (!is.na(location_lr[2])) { # col no. of lower left corner
if (location_lr[2] > 0 & location_lr[2] < ncol(d))
index_cols[(location_lr[2] + 1):ncol(d)] <- FALSE
}
# generate first subset ss1
ss1 <- d[index_rows, index_cols] # ss1 lacks header adjustment
ss <- ss1
# if no row above available, or rows are ignored (i.e. all rows selected), take table header
if (!is.na(location_ul[1])) {
if (location_ul[1] < 2) which_header = "table header" # no rows above available!
} else {
which_header = "table header" # location_ul[1] NA!
}
# adjust header if specified (and if possible)
if (which_header == "row above") {
colnames(ss) <- d[location_ul[1] - 1, index_cols]
}
# return final data.frame
ss
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.