#' Produce a concordance.
#'
#' Produces a concordance of a regular expression and returns
#' a data frame.
#'
#' @param text The text to search in, as either a character vector or a list
#' of character vectors.
#' @param pattern The Perl-compartible regular expression to search for.
#' @param ignore_case If \code{ignore_case = TRUE} (the default),
#' a case-insensitive search is performed.
#' @param locations If \code{locations = TRUE}, the element indexes with
#' matches are returned.
#' @param extra_context If positive, specifies the number of additional \strong{lines}
#' around the lines with matches to return. Useful if not enough context is
#' given in the lines with matches.
#' @param shorten If positive, truncates the preceding and following contexts
#' to the number of \strong{characters} specified. Useful if too much context is
#' given in the lines with matches, for example, in texts with (very)
#' long paragraphs.
#'
#' @return A data frame.
#'
#' @seealso Stefan Th. Gries' function \href{http://www.linguistics.ucsb.edu/faculty/stgries/exact_matches.r}{exact.matches()}.
#'
#' @examples
#' concord("hello", "e")
#' concord("some sentence here", "sentence")
#' concord("some sentence here", "s[eo]")
#'
#' text <- c("first sentence here", "short paragraph here", "third line here")
#' concord(text, "e[nr]", locations = TRUE)
#' concord("some sentence here", "e(?=r)")
#' concord("some sentence here", "e(?!n)")
#' @export
concord <- function(text, pattern, ignore_case = TRUE, locations = FALSE, extra_context = 0, shorten = 0) {
UseMethod("concord")
}
#' @export
concord.list <- function(text, pattern, ignore_case = TRUE, locations = FALSE, extra_context = 0, shorten = 0) {
output <- lapply(text, function(x) concord.default(x, pattern, ignore_case, locations, extra_context, shorten))
rep_times <- sapply(output, nrow)
rep_times <- sapply(rep_times, function(x) if (is.null(x)) {0} else {x})
if (is.null(names(output))) {
names(output) <- seq_along(output)
}
SOURCE <- rep(names(output), rep_times)
output <- data.frame(do.call("rbind", output))
output <- cbind(SOURCE, output)
rownames(output) <- seq(1, nrow(output))
return(output)
}
#' @export
concord.default <- function(text, pattern, ignore_case = TRUE, locations = FALSE, extra_context = 0, shorten = 0) {
# replaces tabs with spaces in the input, so that the output doesn't have extra columns
text <- stringr::str_replace_all(text, "\t", " ")
# finds indexes of pattern within text
locs <- stringr::str_locate_all(text, stringr::regex(pattern, ignore_case = ignore_case))
# gets number of matches per element of text
num_hits <- sapply(locs, nrow)
# continues if any matches
if (any(num_hits)) {
# gets element number of elements with matches
elements_with_hits <- seq_along(num_hits)[num_hits > 0]
# eliminates elements w/o matches
text_min <- text[elements_with_hits]
num_hits <- num_hits[num_hits > 0]
# duplicates the elements with matches, the number of times that there are matches
text_min <- rep(text_min, num_hits)
# gets the starting and ending positions of matches
starts <- unlist(sapply(locs, function(x) x[, 'start']))
ends <- unlist(sapply(locs, function(x) x[, 'end']))
# gets the matches and preceding and following contexts, and pastes them together
pre <- stringr::str_trim(stringr::str_sub(text_min, end = starts - 1))
fol <- stringr::str_trim(stringr::str_sub(text_min, start = ends + 1))
boundaries <- data.matrix(data.frame(start = starts, end = ends))
matches <- stringr::str_trim(stringr::str_sub(text_min, start = boundaries))
# gets index numbers with matches, as many times as there are matches
if (locations | extra_context != 0) {
element_num <- rep(elements_with_hits, num_hits)
}
# optionally, adds extra context
if (extra_context > 0) {
# extra_context <- ceiling(extra_context)
all_pre <- c()
all_fol <- c()
counter <- extra_context
while (counter > 0) {
# gets extra preceding context
cur_pre_num <- element_num - counter
cur_pre_num <- sapply(cur_pre_num, function(x) if (x < 0) {0} else {x})
temp <- sapply(cur_pre_num, function(x) text[x])
cur_pre <- sapply(temp, function(x) if (length(x) > 0) {x} else {""})
all_pre <- stringr::str_c(all_pre, cur_pre, sep = " ")
# gets extra following context
cur_fol_num <- element_num + counter
cur_fol_num <- sapply(cur_fol_num, function(x) if (x > length(text)) {0} else {x})
temp <- sapply(cur_fol_num, function(x) text[x])
cur_fol <- sapply(temp, function(x) if (length(x) > 0) {x} else {""})
all_fol <- stringr::str_c(cur_fol, all_fol, sep = " ")
counter <- counter - 1
} # end while loop
pre <- stringr::str_c(all_pre, pre, sep = " ")
fol <- stringr::str_c(fol, all_fol, sep = " ")
} # end extra context
pre <- stringr::str_trim(pre)
fol <- stringr::str_trim(fol)
# optionally, truncates the pre and post contexts
if (shorten > 0) {
pre_len <- stringr::str_length(pre)
pre <- stringr::str_sub(pre, pre_len - shorten + 1, pre_len)
fol <- stringr::str_sub(fol, 1, shorten)
}
# pastes output together
output <- stringr::str_c(pre, matches, fol, sep = "\t")
# optionally, adds the element number
if (locations) {
output <- stringr::str_c(element_num, output, sep = "\t")
}
# returns the output to the user
df <- data.frame(do.call("rbind", stringr::str_split(output, "\t")))
if (locations) {
names(df) <- c("INDEX", "PRE_CONTEXT", "MATCH", "POST_CONTEXT")
} else {
names(df) <- c("PRE_CONTEXT", "MATCH", "POST_CONTEXT")
}
return(df)
} # end if any matches
} # end function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.