#' @title Detect the language that the abstract or other fields are written in
#'
#' @param CitDat A dataframe/tibble possibly returned by \code{\link[CitaviR]{read_Citavi_xlsx}}.
#' @param fieldsToDetectIn Character vector with names of fields whose text
#' language should be detected. Default is \code{c("Abstract")}. When multiple fields are given
#' (e.g. \code{c("Abstract", "Title")}), they are combined into a single string whose language
#' is then detected.
#' @param wantedLanguage Character vector with names of languages that are desired. Default is
#' \code{c("english")}. If not set to \code{NULL}, a new column \code{det_lang_wanted}
#' is created, which is \code{TRUE} if the detected language in \code{det_lang} is a wanted
#' language.
#'
#' @details
#' `r lifecycle::badge("experimental")` \cr
#' The underlying core function determining the language is \code{textcat::textcat()}.
#'
#' @examples
#' example_path <- example_file("3dupsin5refs/3dupsin5refs.ctv6")
#' read_Citavi_ctv6(example_path) %>%
#' detect_language() %>%
#' dplyr::select(Abstract, det_lang, det_lang_wanted)
#'
#' @return A tibble containing at least one additional column: \code{det_lang}.
#' @importFrom textcat textcat
#' @importFrom tidyr unite
#' @import dplyr
#' @export
#'
detect_language <- function(CitDat,
                            fieldsToDetectIn = c("Abstract"),
                            wantedLanguage = c("english")) {
  # stop if empty arguments -------------------------------------------
  # Plain base-R validation: tidyselect helpers such as all_of() are only
  # meaningful inside selecting verbs and must not be used for checks.
  if (is.null(fieldsToDetectIn) ||
      length(fieldsToDetectIn) == 0L ||
      anyNA(fieldsToDetectIn)) {
    stop("'fieldsToDetectIn' must not be NULL/NA.")
  }

  # check if fieldsToDetectIn are present -----------------------------------
  # setdiff() yields the missing names directly; the scalar length test keeps
  # `if` valid no matter how many fields were requested.
  missingFields <- setdiff(fieldsToDetectIn, names(CitDat))
  if (length(missingFields) > 0L) {
    stop(paste("Could not be found in dataset column names:\n",
               paste(missingFields, collapse = ", ")))
  }

  # collapse fieldsToDetectIn -----------------------------------------------
  # All requested fields are pasted into one temporary helper column
  # (NA entries are dropped). Rows where nothing remains get NA instead of ""
  # so that no language is guessed from an empty string.
  CitDat <- CitDat %>%
    tidyr::unite(
      col = "StringToDetect",
      all_of(fieldsToDetectIn), # all_of(): unambiguous external vector
      sep = "; ",
      remove = FALSE,
      na.rm = TRUE
    ) %>%
    mutate(
      StringToDetect = if_else(
        .data$StringToDetect == "",
        NA_character_,
        .data$StringToDetect
      )
    )

  # det_lang ----------------------------------------------------------------
  # Detect the language, then drop the temporary helper column again
  # (selected by name; `.data$` is not valid in selection contexts).
  CitDat <- CitDat %>%
    mutate(det_lang = textcat::textcat(.data$StringToDetect)) %>%
    select(-"StringToDetect")

  # det_lang_wanted ---------------------------------------------------------
  # Logical flag; stays NA when no language could be detected. The `true`
  # branch must be a logical NA so both branches of if_else() share one type.
  if (!is.null(wantedLanguage)) {
    CitDat <- CitDat %>%
      mutate(
        det_lang_wanted = if_else(
          is.na(.data$det_lang),
          NA,
          .data$det_lang %in% wantedLanguage
        )
      )
  }

  # return tibble -----------------------------------------------------------
  CitDat
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.