Nothing
.onLoad <- function(lib, pkg) {
if (Sys.getenv("RECEPTIVITI_URL") == "")
Sys.setenv(RECEPTIVITI_URL = "https://api.receptiviti.com/")
}
#' Receptiviti API
#'
#' The main function to access the \href{https://www.receptiviti.com}{Receptiviti} API.
#'
#' @param text A character vector with text to be processed, path to a directory containing files, or a vector of file paths.
#' If a single path to a directory, each file is collapsed to a single text. If a path to a file or files,
#' each line or row is treated as a separate text, unless \code{collapse_lines} is \code{TRUE} (in which case,
#' files will be read in as part of bundles at processing time, as is always the case when a directory).
#' Use \code{files} to more reliably enter files, or \code{dir} to more reliably specify a directory.
#' @param output Path to a \code{.csv} file to write results to. If this already exists, set \code{overwrite} to \code{TRUE}
#' to overwrite it.
#' @param id Vector of unique IDs the same length as \code{text}, to be included in the results.
#' @param text_column,id_column Column name of text/id, if \code{text} is a matrix-like object, or a path to a csv file.
#' @param files A list of file paths, as alternate entry to \code{text}.
#' @param dir A directory to search for files in, as alternate entry to \code{text}.
#' @param file_type File extension to search for, if \code{text} is the path to a directory containing files to be read in.
#' @param encoding Encoding of file(s) to be read in. If not specified, this will be detected, which can fail,
#' resulting in mis-encoded characters; for best (and fasted) results, specify encoding.
#' @param return_text Logical; if \code{TRUE}, \code{text} is included as the first column of the result.
#' @param context Name of the analysis context.
#' @param custom_context Name of a custom context (as listed by \code{\link{receptiviti_norming}}),
#' or \code{TRUE} if \code{context} is the name of a custom context.
#' @param api_args A list of additional arguments to pass to the API (e.g., \code{list(sallee_mode = "sparse")}). Defaults to the
#' \code{receptiviti.api_args} option. Custom norming contexts can be established with the \code{\link{receptiviti_norming}}
#' function, then referred to here with the \code{custom_context} argument (only available in API V2).
#' @param frameworks A vector of frameworks to include results from. Texts are always scored with all available framework --
#' this just specifies what to return. Defaults to \code{all}, to return all scored frameworks. Can be set by the
#' \code{receptiviti.frameworks} option (e.g., \code{options(receptiviti.frameworks = c("liwc", "sallee"))}).
#' @param framework_prefix Logical; if \code{FALSE}, will remove the framework prefix from column names, which may result in duplicates.
#' If this is not specified, and 1 framework is selected, or \code{as_list} is \code{TRUE}, will default to remove prefixes.
#' @param as_list Logical; if \code{TRUE}, returns a list with frameworks in separate entries.
#' @param bundle_size Number of texts to include in each request; between 1 and 1,000.
#' @param bundle_byte_limit Memory limit (in bytes) of each bundle, under \code{1e7} (10 MB, which is the API's limit).
#' May need to be lower than the API's limit, depending on the system's requesting library.
#' @param collapse_lines Logical; if \code{TRUE}, and \code{text} contains paths to files, each file is treated as a single text.
#' @param retry_limit Maximum number of times each request can be retried after hitting a rate limit.
#' @param overwrite Logical; if \code{TRUE}, will overwrite an existing \code{output} file.
#' @param compress Logical; if \code{TRUE}, will save as an \code{xz}-compressed file.
#' @param make_request Logical; if \code{FALSE}, a request is not made. This could be useful if you want to be sure and
#' load from one of the caches, but aren't sure that all results exist there; it will error out if it encounters
#' texts it has no other source for.
#' @param text_as_paths Logical; if \code{TRUE}, ensures \code{text} is treated as a vector of file paths. Otherwise, this will be
#' determined if there are no \code{NA}s in \code{text} and every entry is under 500 characters long.
#' @param cache Path to a directory in which to save unique results for reuse; defaults to
#' \code{Sys.getenv(}\code{"RECEPTIVITI_CACHE")}. See the Cache section for details.
#' @param cache_overwrite Logical; if \code{TRUE}, will write results to the cache without reading from it. This could be used
#' if you want fresh results to be cached without clearing the cache.
#' @param cache_format Format of the cache database; see \code{\link[arrow]{FileFormat}}.
#' Defaults to \code{Sys.getenv(}\code{"RECEPTIVITI_CACHE_FORMAT")}.
#' @param clear_cache Logical; if \code{TRUE}, will clear any existing files in the cache. Use \code{cache_overwrite} if
#' you want fresh results without clearing or disabling the cache. Use \code{cache = FALSE} to disable the cache.
#' @param request_cache Logical; if \code{FALSE}, will always make a fresh request, rather than using the response
#' from a previous identical request.
#' @param cores Number of CPU cores to split bundles across, if there are multiple bundles. See the Parallelization section.
#' @param collect_results Logical; if \code{FALSE}, will not retain bundle results in memory for return.
#' @param use_future Logical; if \code{TRUE}, uses a \code{future} back-end to process bundles, in which case,
#' parallelization can be controlled with the \code{\link[future]{plan}} function (e.g., \code{plan("multisession")}
#' to use multiple cores); this is required to see progress bars when using multiple cores. See the Parallelization section.
#' @param in_memory Logical; if \code{FALSE}, will write bundles to temporary files, and only load them as they are being requested.
#' @param clear_scratch_cache Logical; if \code{FALSE}, will preserve the bundles written when \code{in_memory} is \code{TRUE}, after
#' the request has been made.
#' @param verbose Logical; if \code{TRUE}, will show status messages.
#' @param key API Key; defaults to \code{Sys.getenv("RECEPTIVITI_KEY")}.
#' @param secret API Secret; defaults to \code{Sys.getenv("RECEPTIVITI_SECRET")}.
#' @param url API URL; defaults to \code{Sys.getenv("RECEPTIVITI_URL")}, which defaults to
#' \code{"https://api.receptiviti.com/"}.
#' @param version API version; defaults to \code{Sys.getenv("RECEPTIVITI_VERSION")}, which defaults to
#' \code{"v1"}.
#' @param endpoint API endpoint (path name after the version); defaults to \code{Sys.getenv("RECEPTIVITI_ENDPOINT")},
#' which defaults to \code{"framework"}.
#' @param include_headers Logical; if \code{TRUE}, \code{receptiviti_status}'s verbose message will include
#' the HTTP headers.
#'
#' @returns Nothing if \code{collect_results} is \code{FALSE}.
#' Otherwise, a \code{data.frame} with columns for \code{text} (if \code{return_text} is \code{TRUE}; the originally entered text),
#' \code{id} (if one was provided), \code{text_hash} (the MD5 hash of the text), a column each for relevant entries in \code{api_args},
#' and scores from each included framework (e.g., \code{summary.word_count} and \code{liwc.i}). If \code{as_list} is \code{TRUE},
#' returns a list with a named entry containing such a \code{data.frame} for each framework.
#'
#' @section Request Process:
#' This function (along with the internal \code{manage_request} function) handles texts and results in several steps:
#' \enumerate{
#' \item Prepare bundles (split \code{text} into <= \code{bundle_size} and <= \code{bundle_byte_limit} bundles).
#' \enumerate{
#' \item If \code{text} points to a directory or list of files, these will be read in later.
#' \item If \code{in_memory} is \code{FALSE}, bundles are written to a temporary location,
#' and read back in when the request is made.
#' }
#' \item Get scores for texts within each bundle.
#' \enumerate{
#' \item If texts are paths, or \code{in_memory} is \code{FALSE}, will load texts.
#' \item If \code{cache} is set, will skip any texts with cached scores.
#' \item If \code{request_cache} is \code{TRUE}, will check for a cached request.
#' \item If any texts need scoring and \code{make_request} is \code{TRUE}, will send unscored texts to the API.
#' }
#' \item If a request was made and \code{request_cache} is set, will cache the response.
#' \item If \code{cache} is set, will write bundle scores to the cache.
#' \item After requests are made, if \code{cache} is set, will defragment the cache
#' (combine bundle results within partitions).
#' \item If \code{collect_results} is \code{TRUE}, will prepare results:
#' \enumerate{
#' \item Will realign results with \code{text} (and \code{id} if provided).
#' \item If \code{output} is specified, will write realigned results to it.
#' \item Will drop additional columns (such as \code{custom} and \code{id} if not provided).
#' \item If \code{framework} is specified, will use it to select columns of the results.
#' \item Returns results.
#' }
#' }
#'
#' @section Cache:
#' If the \code{cache} argument is specified, results for unique texts are saved in an
#' \href{https://arrow.apache.org}{Arrow} database in the cache location
#' (\code{Sys.getenv(}\code{"RECEPTIVITI_CACHE")}), and are retrieved with subsequent requests.
#' This ensures that the exact same texts are not re-sent to the API.
#' This does, however, add some processing time and disc space usage.
#'
#' If \code{cache} is \code{TRUE}, a default directory (\code{receptiviti_cache}) will be looked for
#' in the system's temporary directory (which is usually the parent of \code{tempdir()}).
#' If this does not exist, you will be asked if it should be created.
#'
#' The primary cache is checked when each bundle is processed, and existing results are loaded at
#' that time. When processing many bundles in parallel, and many results have been cached,
#' this can cause the system to freeze and potentially crash.
#' To avoid this, limit the number of cores, or disable parallel processing.
#'
#' The \code{cache_format} arguments (or the \code{RECEPTIVITI_CACHE_FORMAT} environment variable) can be used to adjust the format of the cache.
#'
#' You can use the cache independently with \code{open_database(Sys.getenv("RECEPTIVITI_CACHE"))}.
#'
#' You can also set the \code{clear_cache} argument to \code{TRUE} to clear the cache before it is used again, which may be useful
#' if the cache has gotten big, or you know new results will be returned. Even if a cached result exists, it will be
#' reprocessed if it does not have all of the variables of new results, but this depends on there being at least 1 uncached
#' result. If, for instance, you add a framework to your account and want to reprocess a previously processed set of texts,
#' you would need to first clear the cache.
#'
#' Either way, duplicated texts within the same call will only be sent once.
#'
#' The \code{request_cache} argument controls a more temporary cache of each bundle request. This is cleared when the
#' R session ends. You might want to set this to \code{FALSE} if a new framework becomes available on your account
#' and you want to process a set of text you already processed in the current R session without restarting.
#'
#' Another temporary cache is made when \code{in_memory} is \code{FALSE}, which is the default when processing
#' in parallel (when \code{cores} is over \code{1} or \code{use_future} is \code{TRUE}). This contains
#' a file for each unique bundle, which is read in as needed by the parallel workers.
#'
#' @section Parallelization:
#' \code{text}s are split into bundles based on the \code{bundle_size} argument. Each bundle represents
#' a single request to the API, which is why they are limited to 1000 texts and a total size of 10 MB.
#' When there is more than one bundle and either \code{cores} is greater than 1 or \code{use_future} is \code{TRUE} (and you've
#' externally specified a \code{\link[future]{plan}}), bundles are processed by multiple cores.
#'
#' If you have texts spread across multiple files, they can be most efficiently processed in parallel
#' if each file contains a single text (potentially collapsed from multiple lines). If files contain
#' multiple texts (i.e., \code{collapse_lines = FALSE}), then texts need to be read in before bundling
#' in order to ensure bundles are under the length limit.
#'
#' Whether processing in serial or parallel, progress bars can be specified externally with
#' \code{\link[progressr]{handlers}}; see examples.
#' @examples
#' \dontrun{
#'
#' # check that the API is available, and your credentials work
#' receptiviti_status()
#'
#' # score a single text
#' single <- receptiviti("a text to score")
#'
#' # score multiple texts, and write results to a file
#' multi <- receptiviti(c("first text to score", "second text"), "filename.csv")
#'
#' # score many texts in separate files
#' ## defaults to look for .txt files
#' file_results <- receptiviti(dir = "./path/to/txt_folder")
#'
#' ## could be .csv
#' file_results <- receptiviti(
#' dir = "./path/to/csv_folder",
#' text_column = "text", file_type = "csv"
#' )
#'
#' # score many texts from a file, with a progress bar
#' ## set up cores and progress bar
#' ## (only necessary if you want the progress bar)
#' future::plan("multisession")
#' progressr::handlers(global = TRUE)
#' progressr::handlers("progress")
#'
#' ## make request
#' results <- receptiviti(
#' "./path/to/largefile.csv",
#' text_column = "text", use_future = TRUE
#' )
#' }
#' @importFrom curl new_handle curl_fetch_memory curl_fetch_disk handle_setopt
#' @importFrom jsonlite toJSON fromJSON read_json
#' @importFrom utils object.size
#' @importFrom digest digest
#' @importFrom parallel detectCores makeCluster clusterExport parLapplyLB parLapply stopCluster
#' @importFrom progressr progressor
#' @importFrom stringi stri_enc_detect
#' @export
receptiviti <- function(
text = NULL,
output = NULL,
id = NULL,
text_column = NULL,
id_column = NULL,
files = NULL,
dir = NULL,
file_type = "txt",
encoding = NULL,
return_text = FALSE,
context = "written",
custom_context = FALSE,
api_args = getOption("receptiviti.api_args", list()),
frameworks = getOption("receptiviti.frameworks", "all"),
framework_prefix = TRUE,
as_list = FALSE,
bundle_size = 1000,
bundle_byte_limit = 75e5,
collapse_lines = FALSE,
retry_limit = 50,
clear_cache = FALSE,
clear_scratch_cache = TRUE,
request_cache = TRUE,
cores = detectCores() - 1,
collect_results = TRUE,
use_future = FALSE,
in_memory = TRUE,
verbose = FALSE,
overwrite = FALSE,
compress = FALSE,
make_request = TRUE,
text_as_paths = FALSE,
cache = Sys.getenv("RECEPTIVITI_CACHE"),
cache_overwrite = FALSE,
cache_format = Sys.getenv("RECEPTIVITI_CACHE_FORMAT", "parquet"),
key = Sys.getenv("RECEPTIVITI_KEY"),
secret = Sys.getenv("RECEPTIVITI_SECRET"),
url = Sys.getenv("RECEPTIVITI_URL"),
version = Sys.getenv("RECEPTIVITI_VERSION"),
endpoint = Sys.getenv("RECEPTIVITI_ENDPOINT")
) {
# check input
if (!is.null(output)) {
if (!file.exists(output) && file.exists(paste0(output, ".xz")))
output <- paste0(output, ".xz")
if (!overwrite && file.exists(output))
stop(
"output file already exists; use overwrite = TRUE to overwrite it",
call. = FALSE
)
}
if (isTRUE(cache)) {
temp <- dirname(tempdir())
if (basename(temp) == "working_dir") temp <- dirname(dirname(temp))
cache <- paste0(temp, "/receptiviti_cache")
if (!dir.exists(cache)) {
if (
interactive() &&
!isFALSE(getOption("receptiviti.cache_prompt")) &&
grepl(
"^(?:[Yy1]|$)",
readline("Do you want to establish a default cache? [Y/n] ")
)
) {
} else {
options(receptiviti.cache_prompt = FALSE)
cache <- ""
}
}
}
if (!is.character(cache)) cache <- ""
if (cache != "") {
if (!requireNamespace("arrow", quietly = TRUE)) {
stop("install the `arrow` package to enable the cache", call. = FALSE)
}
if (!(cache_format %in% c("parquet", "feather"))) {
stop("cache format can only be `parquet` or `feather`", call. = FALSE)
}
if (clear_cache) unlink(cache, TRUE)
dir.create(cache, FALSE, TRUE)
cached_parts <- list.files(
cache,
cache_format,
recursive = TRUE,
full.names = TRUE
)
}
st <- proc.time()[[3]]
if (is.character(custom_context)) {
context <- custom_context
custom_context <- TRUE
}
if (context != "written") {
norming_status <- receptiviti_norming(
name_only = TRUE,
url = url,
key = key,
secret = secret,
verbose = FALSE
)
if (verbose) {
message(
"retrieved custom norming context list (",
round(proc.time()[[3]] - st, 4),
")"
)
}
context_id <- if (custom_context) paste0("custom/", context) else context
if (!length(norming_status) || !(context_id %in% norming_status)) {
stop(
"custom norming context ",
context,
" is not on record or is not complete",
call. = FALSE
)
}
}
if (
length(frameworks) &&
!("all" %in% frameworks) &&
grepl("2", version, fixed = TRUE)
) {
api_args$frameworks <- paste0(
frameworks[frameworks != "summary"],
collapse = ","
)
}
if (!is.null(api_args$frameworks)) {
available_frameworks <- c(
"summary",
receptiviti_frameworks(url, key, secret)
)
if (verbose)
message(
"retrived frameworks list (",
round(proc.time()[[3]] - st, 4),
")"
)
arg_frameworks <- unlist(strsplit(api_args$frameworks, ",", fixed = TRUE))
su <- !(arg_frameworks %in% available_frameworks)
if (any(su)) {
stop(
"requested framework(s) are not available to your account: ",
paste(arg_frameworks[su], collapse = ", "),
call. = FALSE
)
}
}
res <- manage_request(
text,
id = id,
text_column = text_column,
id_column = id_column,
files = files,
dir = dir,
file_type = file_type,
encoding = encoding,
context = if (custom_context) paste0("custom/", context) else context,
api_args = api_args,
bundle_size = bundle_size,
bundle_byte_limit = bundle_byte_limit,
collapse_lines = collapse_lines,
retry_limit = retry_limit,
clear_scratch_cache = clear_scratch_cache,
request_cache = request_cache,
cores = cores,
use_future = use_future,
in_memory = in_memory,
verbose = verbose,
make_request = make_request,
text_as_paths = text_as_paths,
cache = cache,
cache_overwrite = cache_overwrite,
cache_format = cache_format,
key = key,
secret = secret,
url = url,
version = version,
endpoint = endpoint
)
data <- res$data
final_res <- res$final_res
# defragment cache
if (cache != "") {
cache <- normalizePath(cache, "/", FALSE)
exclude <- c("id", "bin", names(api_args))
bin_dirs <- list.dirs(cache)
if (length(bin_dirs) > 1) {
if (verbose)
message("defragmenting cache (", round(proc.time()[[3]] - st, 4), ")")
write_time <- as.numeric(Sys.time())
for (bin_dir in bin_dirs[-1]) {
files <- list.files(bin_dir, cache_format, full.names = TRUE)
if (length(files) > 1) {
previous <- files[!(files %in% cached_parts)]
if (collect_results && length(previous)) {
existing_cols <- unique(c(
"id",
"bin",
names(arrow::schema(arrow::open_dataset(
previous[[1]],
format = cache_format
)))
))
if (
length(existing_cols) != ncol(final_res) ||
!all(existing_cols %in% colnames(final_res))
) {
if (verbose)
message(" clearing existing cache since columns did not align")
unlink(previous)
}
}
bin_content <- dplyr::compute(arrow::open_dataset(
bin_dir,
format = cache_format
))
su <- !duplicated(as.character(bin_content$text_hash))
if (!all(su)) bin_content <- bin_content[su, ]
writer <- if (cache_format == "parquet") arrow::write_parquet else
arrow::write_feather
all_rows <- nrow(bin_content)
for (i in seq_len(ceiling(all_rows / 1e9))) {
writer(
bin_content[seq((i - 1) * 1e9 + 1, min(all_rows, i * 1e9)), ],
paste0(bin_dir, "/part-", write_time, "-", i, ".", cache_format)
)
}
unlink(files)
}
}
}
}
if (!collect_results) {
if (verbose) message("done (", round(proc.time()[[3]] - st, 4), ")")
return(invisible(NULL))
}
# prepare final results
if (verbose)
message("preparing output (", round(proc.time()[[3]] - st, 4), ")")
rownames(final_res) <- final_res$id
rownames(data) <- data$id
data$text_hash <- structure(
final_res$text_hash,
names = data[final_res$id, "text"]
)[data$text]
final_res <- cbind(
data[,
c(if (return_text) "text", if (res$provided_id) "id", "text_hash"),
drop = FALSE
],
final_res[
structure(final_res$id, names = final_res$text_hash)[data$text_hash],
!colnames(final_res) %in% c("id", "bin", "text_hash", "custom"),
drop = FALSE
]
)
row.names(final_res) <- NULL
if (!is.null(output)) {
if (!grepl("\\.csv", output, TRUE)) output <- paste0(output, ".csv")
if (compress && !grepl(".xz", output, fixed = TRUE))
output <- paste0(output, ".xz")
if (grepl(".xz", output, fixed = TRUE)) compress <- TRUE
if (verbose)
message(
"writing results to file: ",
output,
" (",
round(proc.time()[[3]] - st, 4),
")"
)
dir.create(dirname(output), FALSE, TRUE)
if (overwrite) unlink(output)
if (compress) output <- xzfile(output)
arrow::write_csv_arrow(final_res, file = output)
}
if (is.character(frameworks) && frameworks[1] != "all") {
if (verbose)
message("selecting frameworks (", round(proc.time()[[3]] - st, 4), ")")
vars <- colnames(final_res)
sel <- grepl(
paste0("^(?:", paste(tolower(frameworks), collapse = "|"), ")"),
vars
)
if (any(sel)) {
if (
missing(framework_prefix) &&
(length(frameworks) == 1 && frameworks != "all")
)
framework_prefix <- FALSE
sel <- unique(c("text", "id", "text_hash", names(api_args), vars[sel]))
sel <- sel[sel %in% vars]
final_res <- final_res[, sel]
} else {
warning(
"frameworks did not match any columns -- returning all",
call. = FALSE
)
}
}
if (as_list) {
if (missing(framework_prefix)) framework_prefix <- FALSE
inall <- c("text", "id", "text_hash", names(api_args))
cols <- colnames(final_res)
inall <- inall[inall %in% cols]
pre <- sub("\\..*$", "", cols)
pre <- unique(pre[!pre %in% inall])
final_res <- lapply(structure(pre, names = pre), function(f) {
res <- final_res[, c(inall, grep(paste0("^", f), cols, value = TRUE))]
if (!framework_prefix) colnames(res) <- sub("^.+\\.", "", colnames(res))
res
})
} else if (!framework_prefix)
colnames(final_res) <- sub("^.+\\.", "", colnames(final_res))
if (verbose) message("done (", round(proc.time()[[3]] - st, 4), ")")
invisible(final_res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.