###############################################################################
###############################################################################
###############################################################################
# Module : dev_t.R
# Description : This module is used to gather all functions related to the T
# of the ELT methodology of the data pipelines at the CLESSN
#
# T: Stands for 'Tranform' and is about taking data from our data
# warehouse, combining it with other data from our data warehouse
# and enriching it and then store it in our datamarts area
# in order to consume it in its most refined form to answer
# questions, conduct scientific research, or visualize it in
# graphics.
#
# As a reminder, the data stored in our data warehouse is stored
# in databases in tables (rectangular format). Observations in
# data warehouses tables represent a structured reality as it was
# stored in the original raw data which was harvested in out data
# lake
#
# WARNING : The functions in this file HAVE NOT BEEN VERIFIED and HAVE NOT
# been subject to the CLESSN package VERIFICATION checklist
# Also, their relevance into the clessnverse package has not
# been oconfirmes either
###############################################################################
###############################################################################
###############################################################################
# DATAWAREHOUSE FUNCTIONS (READ)
###############################################################################
#' @title clessnverse::get_warehouse_table
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' get_warehouse_table allows the programmer to retrieve a data
#' table from the CLESSN data warehouse named hublot.
#'
#' @param table_name The name of the table to retrieve from the warehouse without
#' the 'chlub_tables_warehouse' prefix
#' @param credentials The hublot credentials obtained from the
#' hublot::get_credentials function
#' @param data_filter a filter on the data to be selected in the query
#' @param nbrows Optional argument
#' If nbrows is greater than 0, the dataframe returned will be
#' limited to nbrows observations. This is particularly useful
#' when trying to see if there are records in a table and what
#' how structured they are.
#' If nbrows is omitted, then all rows of the table are returned
#'
#' @return returns a dataframe containing the data warehouse table content
#' with the document.id and creation & update time stamps
#'
#' @examples
#' \dontrun{
#' # connect to hublot
#' credentials <- hublot::get_credentials(
#' Sys.getenv("HUB3_URL"),
#' Sys.getenv("HUB3_USERNAME"),
#' Sys.getenv("HUB3_PASSWORD")
#' )
#'
#' # gets the entire warehouse table 'people'
#' clessnverse::get_warehouse_table('people', credentials)
#'
#' # gets the first 10 rows of the 'political_parties_press_releases' table
#' clessnverse::get_warehouse_table(
#' table_name = 'political_parties_press_releases',
#' data_filter = list(),
#' credentials = credentials,
#' nbrows=10
#' )
#' }
#'
#' @export
#'
get_warehouse_table <- function(table_name, credentials, data_filter=list(), nbrows=0) {
function_name <- "get_warehouse_table"
# validate arguments
if (is.null(credentials$auth) || is.na(credentials$auth)) stop(
paste("You must supply valid hublot credentials in", function_name)
)
data <- hublot::list_tables(credentials)
hublot_tables_list <- tidyjson::spread_all(data)
if (!paste("warehouse_", table_name, sep="") %in% hublot_tables_list$table_name) stop(
paste("This table is not in hublot:", table_name)
)
table_longname <- paste("clhub_tables_warehouse_", table_name, sep="")
hublot::count_table_items(table_longname, credentials)
if (length(data_filter) == 0) {
page <- hublot::list_table_items(table_longname, credentials)
} else {
page <- hublot::filter_table_items(table_longname, credentials, data_filter)
}
data <- list()
repeat {
data <- c(data, page$results)
if (length(data_filter) == 0) {
page <- hublot::list_next(page, credentials)
} else {
page <- hublot::filter_next(page, credentials)
}
if (is.null(page) || (nbrows != 0 && length(data) >= nbrows)) {
break
}
}
if (length(data) == 0) {
warning(paste("table", table_name, "is empty in function", function_name))
return(data.frame())
}
if (nbrows != 0 && length(data) >= nbrows) data <- data[1:nbrows]
data1 <- replace_null(data)
df <- data.frame(t(sapply(data1,c)))
df_data <- data.frame(t(sapply(df$data,c)))
# Check if the structure is even or uneven
if (length(unique(sapply(df$data, length))) == 1) {
# This is very fast on large dataframes but only works on even data schemas
print("dataset is even")
df$data <- NULL
names(df) <- paste("hub.",names(df),sep="")
df <- as.data.frame(cbind(df,df_data))
#df <- df %>% replace(.data == "NULL", NA)
for (col in names(df)) df[,col] <- unlist(df[,col])
} else {
print("dataset is uneven")
# This is slower on larg data sets but works on uneven data schemas
df <- clessnverse::spread_list_to_df(data)
}
#test <- cbind(df[!sapply(df, is.list)],
# (t(apply(df[sapply(df, is.list)], 1, unlist))))
return(df)
}
###############################################################################
#' @title clessnverse::get_hub2_table
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' get_hub2_table allows the programmer to retrieve a data
#' table from the CLESSN hub2 data warehouse.
#' ** WARNING hub2 will be decommissionned by end of 2022 **
#'
#' @param table_name The name of the table to retrieve from the hub2 warehouse
#' @param data_filter A list containing the filters to apply against the query
#' to retrieve the data. Only observations in the table
#' complyingw with the filter conditions will be returned
#' @param max_pages The number of pages to return. A page is 1000 rows.
#' Tu return the entire table use *max_pages = -1*
#' @param hub_conf The hub2.0 credentials obtained from the
#' clessnhub::login function
#' @importFrom dplyr "%>%"
#' @return returns a dataframe containing the data warehouse table content
#'
#' @examples
#' \dontrun{
#' clessnhub::login(
#' Sys.getenv("HUB_USERNAME"),
#' Sys.getenv("HUB_PASSWORD"),
#' Sys.getenv("HUB_URL"))
#'
#' # get the journalists intervention in press conference from the
#' # 'agoraplus_interventions' table from hub2
#' data_filter = list(
#' type = "press_conference",
#' metadata__location = "CA-QC",
#' data__speakerType = "journalist",
#' data__eventDate__gte = "2021-01-01",
#' data__eventDate__lte = "2022-06-23"
#' )
#'
#' df <- clessnverse::get_hub2_table(
#' table_name = 'agoraplus_interventions',
#' data_filter = data_filter,
#' max_pages = -1,
#' hub_conf = hub_config
#' )
#' }
#'
#' @export
#'
get_hub2_table <- function(table_name, data_filter=NULL, max_pages=-1, hub_conf) {
function_name <- "get_hub2_table"
http_post <- function(path, body, options=NULL, verify=T, hub_c) {
token <- hub_c$token
token_prefix <- hub_c$token_prefix
response <- httr::POST(
url=paste0(hub_c$url, path),
body=body, httr::accept_json(),
httr::content_type_json(),
config=httr::add_headers(Authorization=paste(token_prefix, token)),
verify=verify,
httr::timeout(30))
return(response)
}
if (!is.null(data_filter) && !class(data_filter) == "list" || length(data_filter) == 0) data_filter <- NULL
data_filter <- jsonlite::toJSON(data_filter, auto_unbox = T)
path <- paste("/data/", table_name, "/count/", sep="")
response <- http_post(path, body=data_filter, hub_c = hub_conf)
result <- httr::content(response)
count <- result$count
print(paste("count:", count))
path <- paste("/data/", table_name, "/filter/", sep="")
response <- http_post(path, body=data_filter, hub_c = hub_conf)
page <- httr::content(response)
data = list()
repeat {
data <- c(data, page$results)
print(paste(length(data), "/", count))
path <- page$"next"
if (is.null(path)) {
break
}
max_pages <- max_pages - 1
if (max_pages == 0)
{
break
}
path <- strsplit(path, "science")[[1]][[2]]
response <- http_post(path, body=data_filter, hub_c = hub_conf)
page <- httr::content(response)
}
if (length(data) == 0) {
warning(paste("table", table_name, "is empty in function", function_name))
return(data.frame())
}
data1 <- replace_null(data)
df <- data.frame(t(sapply(data1,c)))
df_data <- data.frame(t(sapply(df$data,c)))
df_metadata <- data.frame(t(sapply(df$metadata,c)))
# Check if the structure is even or uneven
if (length(unique(sapply(df$data, length))) == 1 && length(unique(sapply(df$metadata, length))) == 1) {
# This is very fast on large dataframes but only works on even data schemas
df$data <- NULL
df$metadata <- NULL
names(df) <- paste("hub.",names(df),sep="")
df <- as.data.frame(cbind(df,df_data))
df <- as.data.frame(cbind(df,df_metadata))
#df <- df %>% replace(.data == "NULL", NA)
#df <- df %>% replace(is.null(.data), NA)
for (col in names(df)) {df[,col] <- unlist(df[,col])}
} else {
# This is slower on larg data sets but works on uneven data schemas
df <- clessnverse::spread_list_to_df(data)
df_metadata <- df[which(grepl("^metadata.",names(df)))]
df_data <- df[which(grepl("^data.",names(df)))]
df_hub <- dplyr::select(df, -c(c(names(df_data),names(df_metadata))))
names(df_data) <- gsub("^data.", "", names(df_data))
names(df_metadata) <- gsub("^metadata.", "", names(df_metadata))
names(df_hub) <- paste("hub.", names(df_hub), sep="")
df <- df_hub %>% bind_cols(df_metadata) %>% bind_cols(df_data)
}
return(df)
}
###############################################################################
###############################################################################
###############################################################################
# DATAMART FUNCTIONS (READ)
###############################################################################
#' @title clessnverse::get_mart_table
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' get_mart_table allows the programmer to retrieve a data
#' table from a CLESSN data mart.
#'
#' @param table_name The name of the table to retrieve from the warehouse without
#' the 'chlub_tables_mart' prefix
#' @param credentials The hublot credentials obtained from the hublot::
#' @param data_filter a filter on the data to be selected in the query
#' @param nbrows Optional argument
#' If nbrows is greater than 0, the dataframe returned will be
#' limited to nbrows observations. This is particularly useful
#' when trying to see if there are records in a table and what
#' how structured they are.
#' If nbrows is omitted, then all rows of the table are returned
#'
#' @return returns a dataframe containing the data warehouse table with a JSON
#' attribute as well as a document.id and creation & update time stamps
#'
#' @examples
#' \dontrun{
#' # connect to hublot
#' credentials <- hublot::get_credentials(
#' Sys.getenv("HUB3_URL"),
#' Sys.getenv("HUB3_USERNAME"),
#' Sys.getenv("HUB3_PASSWORD")
#' )
#'
#' # gets the entire datamart political_parties_press_releases_freq
#' datamart <- clessnverse::get_mart_table(
#' table_name = 'political_parties_press_releases_freq',
#' data_filter = list(),
#' credentials = credentials)
#'
#' # gets the first 10 rows of the warehouse table 'political_parties_press_releases_freq'
#' datamart <- clessnverse::get_mart_table(
#' table_name = 'political_parties_press_releases_freq',
#' data_filter = list(),
#' credentials = credentials,
#' nbrows=10)
#' }
#'
#' @export
#'
get_mart_table <- function(table_name, credentials, data_filter=list(), nbrows=0) {
function_name <- "get_mart_table"
# validate arguments
if (is.null(credentials$auth) || is.na(credentials$auth)) stop(
paste("You must supply valid hublot credentials in", function_name)
)
data <- hublot::list_tables(credentials)
hublot_tables_list <- tidyjson::spread_all(data)
if (!paste("mart_", table_name, sep="") %in% hublot_tables_list$table_name) stop(
paste("This table is not in hublot:", table_name)
)
table_longname <- paste("clhub_tables_mart_", table_name, sep="")
if (length(data_filter) == 0) {
page <- hublot::list_table_items(table_longname, credentials)
} else {
page <- hublot::filter_table_items(table_longname, credentials, data_filter)
}
data <- list()
repeat {
data <- c(data, page$results)
if (length(data_filter) == 0) {
page <- hublot::list_next(page, credentials)
} else {
page <- hublot::filter_next(page, credentials)
}
if (is.null(page) || (nbrows != 0 && length(data) >= nbrows)) {
break
}
}
if (length(data) == 0) {
warning(paste("table", table_name, "is empty in function", function_name))
return(data.frame())
}
if (nbrows != 0 && length(data) >= nbrows) data <- data[1:nbrows]
data1 <- replace_null(data)
df <- data.frame(t(sapply(data1,c)))
df_data <- data.frame(t(sapply(df$data,c)))
# Check if the structure is even or uneven
if (length(unique(sapply(df$data, length))) == 1) {
print("dataset is even")
# This is very fast on large dataframes but only works on even data schemas
df$data <- NULL
names(df) <- paste("hub.",names(df),sep="")
df <- as.data.frame(cbind(df,df_data))
#df <- df %>% replace(.data == "NULL", NA)
for (col in names(df)) df[,col] <- unlist(df[,col])
} else {
print("dataset is uneven")
# This is slower on larg data sets but works on uneven data schemas
df <- clessnverse::spread_list_to_df(data)
}
#test <- cbind(df[!sapply(df, is.list)],
# (t(apply(df[sapply(df, is.list)], 1, unlist))))
return(df)
}
###############################################################################
###############################################################################
###############################################################################
# DATAMART FUNCTIONS (WRITE)
###############################################################################
#' @title Write a row in a data table of a CLESSN data mart
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' @param table_name The name of the data mart table to write an observation to
#' without the 'chlub_tables_mart' prefix.
#' @param key A character string containing the unique primary key of this
#' observation in the table. Data integrity of the CLESSN data
#' model is maintained by having a unique key per observation in
#' each table.
#' @param row A named list containing the observation to write to the datamart
#' table. The names of the list *are the columns* of the table.
# @param mode A character string cintaining either "refresh" or "append".
# If mode = "refresh" then if an observation with a key = key
# already exists in the table, it will be overwritten with the
# new values.
# If mode = "append" then it will be added to the table. However
# if an existing observation with a key = key already exists in the
# table, a warning will be returned.
#' @param refresh_data Logical.
#' @param credentials The hublot credentials obtained from the
#' hublot::get_credentials function
#' @return returns a dataframe containing the data warehouse table with a JSON
#' attribute as well as a document.id and creation & update time stamps
#'
#' @examples
#' \dontrun{
#' # connect to hublot
#' credentials <- hublot::get_credentials(
#' Sys.getenv("HUB3_URL"),
#' Sys.getenv("HUB3_USERNAME"),
#' Sys.getenv("HUB3_PASSWORD")
#' )
#'
#'
#' clessnverse::commit_mart_row(
#' table_name = "political_parties_press_releases_freq",
#' key = "QS212022",
#' row = list(week_num=21, count=6, political_party="QS"),
#' mode = "refresh",
#' credentials = credentials)
#' }
#'
#' @export
#'
commit_mart_row <- function(table_name, key, row = list(), refresh_data = FALSE, credentials) {
# If the row with the same key exist and mode=refresh then overwrite it with the new data
# Otherwise, do nothing (just log a message)
table_name <- paste("clhub_tables_mart_", table_name, sep="")
data_filter <- list(key__exact = key)
item <- hublot::filter_table_items(table_name, credentials, data_filter)
if(length(item$results) == 0) {
# l'item n'existe pas deja dans hublot
hublot::add_table_item(table_name,
body = list(key = key, timestamp = Sys.time(), data = row),
credentials)
} else {
# l'item existe deja dans hublot
if (refresh_data) {
hublot::update_table_item(table_name,
id = item$result[[1]]$id,
body = list(key = key, timestamp = as.character(Sys.time()), data = jsonlite::toJSON(row, auto_unbox = T)),
credentials)
} else {
# Do nothing but log a message saying skipping
} # if (mode == "refresh")
} #if(length(item$results) == 0)
}
###############################################################################
#' @title clessnverse::commit_mart_table
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' commit_mart_row allows the programmer to write a tabe as a
#' CLESSN data mart.
#'
#' @param table_name The name of the data mart table to store without the
#' 'chlub_tables_mart' prefix.
#' @param df blah
#' @param key_column blah
#' @param mode A character string cintaining either "refresh" or "append".
#' If mode = "refresh" then if an observation with a key = key
#' already exists in the table, it will be overwritten with the
#' new values.
#' If mode = "append" then it will be added to the table. However
#' if an existing observation with a key = key already exists in the
#' table, a warning will be returned.
#' @param credentials The hublot credentials obtained from the
#' hublot::get_credentials function
#'
#' @return returns a dataframe containing the data warehouse table with a JSON
#' attribute as well as a document.id and creation & update time stamps
#'
#' @examples
#' \dontrun{
#' # connect to hublot
#' credentials <- hublot::get_credentials(
#' Sys.getenv("HUB3_URL"),
#' Sys.getenv("HUB3_USERNAME"),
#' Sys.getenv("HUB3_PASSWORD")
#' )
#'
#' # Writes a data frame into political_parties_press_releases_freq
#' datamart <- clessnverse::commit_mart_table(
#' table_name = 'political_parties_press_releases_freq',
#' df = data.frame(key = "123", count = 1, week_num = "28", party = "CAQ"),
#' key_column = 'key',
#' mode = 'add',
#' credentials = credentials)
#' }
#'
#' @export
#'
commit_mart_table <- function(table_name, df, key_column, mode, credentials) {
table_name <- paste("clhub_tables_mart_", table_name, sep="")
df <- as.data.frame(df)
pb_chap <- utils::txtProgressBar(min = 0, # Minimum value of the progress bar
max = nrow(df), # Maximum value of the progress bar
style = 3, # Progress bar style (also available style = 1 and style = 2)
width = 80, # Progress bar width. Defaults to getOption("width")
char = "=") # Character used to create the bar
for (i in 1:nrow(df)) {
utils::setTxtProgressBar(pb_chap, i)
key <- df[[key_column]][i]
data_filter <- list(key__exact = key)
item <- hublot::filter_table_items(table_name, credentials, data_filter)
data_row <- as.list(df[i,] %>% select(-c("key")))
if(length(item$results) == 0) {
# l'item n'existe pas deja dans hublot
hublot::add_table_item(table_name,
body = list(key = key, timestamp = Sys.time(), data = data_row),
credentials)
} else {
# l'item existe deja dans hublot
if (mode == "refresh") {
hublot::update_table_item(table_name,
id = item$result[[1]]$id,
body = list(key = key, timestamp = as.character(Sys.time()), data = jsonlite::toJSON(data_row, auto_unbox = T)),
credentials)
} else {
# Do nothing but log a message saying skipping
} # if (mode == "refresh")
} #if(length(item$results) == 0)
} #for (i in 1:nrow(df))
}
###############################################################################
###############################################################################
###############################################################################
# DICTIONARIES FUNCTIONS (READ)
###############################################################################
#' Retrieves a dictionary from hublot.
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Creates a dictionary object from a dictionary located in
#' the CLESSN data lake (hublot).
#' @param topic The name or topic of the dictionary to retrieve from hublot.
#' @param lang The language of the dictionary. "en" is for English,
#' "fr" is for French. Both are included by default.
#' @param credentials The user's personal credentials from hublot.
#' @return A quanteda type dictionary object.
#' @author CLESSN
#' @examples
#'
#' \dontrun{
#' # get credentials from hublot
#' credentials <- hublot::get_credentials(
#' Sys.getenv("HUB3_URL"),
#' Sys.getenv("HUB3_USERNAME"),
#' Sys.getenv("HUB3_PASSWORD")
#' )
#' # retrieve the COVID dictionary in both EN and FR
#' clessnverse::get_dictionary("covid", c("en", "fr"), credentials)
#' }
#' @export
#'
get_dictionary <-
function(topic, lang = c("en","fr"), credentials) {
# Validate arguments
file_info <- hublot::retrieve_file("config_dict", credentials)
config_dict <- utils::read.csv2(file_info$file)
if (is.null(credentials$auth) || is.na(credentials$auth)) {
stop("hublot credentials in clessnverse::get_dictionary are invalid")
}
if (!topic %in% config_dict$topic) {
stop (
paste(
"invalid topic in clessnverse::get_dictionary function:",
topic,
"\nvalid topics are",
paste(config_dict$topic, collapse = ", ")
)
)
}
if (!unique(unlist(strsplit(config_dict$lang, ","))) %vcontains% lang) {
stop (paste(
"invalid language in clessnverse::get_dictionary function:",
lang
))
}
# Get dictionary file from lake
file_key <- paste("dict_", topic, sep = "")
file_info <- hublot::retrieve_file(file_key, credentials)
dict_df <- utils::read.csv2(file_info$file, encoding = "UTF-8")
# Filter on language provided in lang if language is a dictionary feature
if (!is.null(dict_df$language)) {
dict_df <- dict_df[dict_df$language %in% lang, ]
# Remove language column
dict_df$language <- NULL
}
dict_list <- list()
for (c in unique(dict_df$category)) {
dict_list[[c]] <- dict_df$item[dict_df$category == c]
}
# Convert dataframe to quanteda dict and return it
qdict <- quanteda::dictionary(as.list(dict_list))
return(qdict)
}
###############################################################################
###############################################################################
###############################################################################
# DATA TRANSFORMATION
###############################################################################
#' @title clessnverse::compute_nb_sentences
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Calculates the number of sentences in a bloc of text
#' @param txt_bloc be documented
#' @return return
#' @examples # To be documented
#' @export
compute_nb_sentences <- function(txt_bloc) {
df_sentences <- tibble::tibble(txt = txt_bloc) %>%
tidytext::unnest_tokens("sentence", "txt", token="sentences",format="text", to_lower = T)
nb_sentences <- nrow(df_sentences)
return(nb_sentences)
}
###############################################################################
#' @title clessnverse::compute_nb_words
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Calculates the number of words in a bloc of text
#' @param txt_bloc be documented
#' @return return
#' @examples # To be documented
#' @export
compute_nb_words <- function(txt_bloc) {
df_words <- tibble::tibble(txt = txt_bloc) %>%
tidytext::unnest_tokens("words", "txt", token="words",format="text", to_lower = T)
nb_words <- nrow(df_words)
return(nb_words)
}
###############################################################################
#' @title clessnverse::clean_corpus
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' clesns a bloc of text by removing double spaces, non-brekeable
#' spaces and L, S, D apostrophe (in french language). This is particularly
#" useful before using the clessnverse::run_dictionary function.
#' @param txt_bloc blah
#' @return return
#' @examples # To be documented
#'
#' @export
clean_corpus <- function(txt_bloc) {
# Prepare corpus
txt <- stringr::str_replace_all(string = txt_bloc, pattern = "M\\.|Mr\\.|Dr\\.", replacement = "")
txt <- stringr::str_replace_all(string = txt, pattern = "(l|L)\\'", replacement = "")
txt <- stringr::str_replace_all(string = txt, pattern = "(s|S)\\'", replacement = "")
txt <- stringr::str_replace_all(string = txt, pattern = "(d|D)\\'", replacement = "")
txt <- gsub("\u00a0", " ", txt)
txt <- stringr::str_replace_all(string = txt, pattern = " ", replacement = " ")
return(txt)
}
###############################################################################
#' @title clessnverse::compute_category_sentiment_score
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' calculates the sentiment score using a word stems dictionnary by category
#' @param txt_bloc : the bloc of text to study
#' @param category_dictionary : a topic dictionary containing the categories to calculate the sentiment on
#' @param sentiment_dictionary : sentiment lexicoder dictionary
#' @return returns a dataframe containing the sentiment score of each category in the category dictionary
#' @examples # To be documented
#'
#' @importFrom stats aggregate
#'
#' @export
compute_category_sentiment_score <- function(txt_bloc, category_dictionary, sentiment_dictionary) {
# Build one corpus per category and compute sentiment on each corpus
corpus <- data.frame(doc_id = integer(), category = character(), txt = character())
txt_bloc <- stringr::str_replace_all(string = txt_bloc, pattern = "M\\.|Mr\\.|Dr\\.", replacement = "")
txt_bloc <- stringr::str_replace_all(string = txt_bloc, pattern = "(l|L)\\'", replacement = "")
txt_bloc <- stringr::str_replace_all(string = txt_bloc, pattern = "(s|S)\\'", replacement = "")
txt_bloc <- stringr::str_replace_all(string = txt_bloc, pattern = "(d|D)\\'", replacement = "")
txt_bloc <- gsub("\u00a0", " ", txt_bloc)
txt_bloc <- stringr::str_replace_all(string = txt_bloc, pattern = " ", replacement = " ")
df_sentences <- tibble::tibble(txt = txt_bloc) %>%
tidytext::unnest_tokens("sentence", "txt", token="sentences",format="text", to_lower = T)
toks <- quanteda::tokens(df_sentences$sentence)
dfm_corpus <- quanteda::dfm(toks)
lookup <- quanteda::dfm_lookup(dfm_corpus, dictionary = category_dictionary, valuetype = "glob")
df <- quanteda::convert(lookup, to="data.frame") %>% select(-c("doc_id"))
df_sentences <- df_sentences %>% cbind(df)
df_sentences <- df_sentences %>% tidyr::pivot_longer(-c(.data$sentence), names_to = "category", values_to = "relevance")
df_sentences <- df_sentences %>% filter(.data$relevance > 0)
df_categories <- df_sentences %>%
dplyr::group_by(.data$category) %>%
dplyr::summarise(txt = paste(.data$sentence, collapse = " "), relevance = sum(.data$relevance))
df_categories$txt <- stringr::str_replace_all(string = df_categories$txt, pattern = "M\\.|Mr\\.|Dr\\.", replacement = "")
toks <- quanteda::tokens(df_categories$txt)
toks <- quanteda::tokens(df_categories$txt, remove_punct = TRUE)
# On n'enleve pas les stopwords parce qu'on veut garder "pas" ou "ne" car connotation negative
# toks <- quanteda::tokens_remove(toks, quanteda::stopwords("french"))
# toks <- quanteda::tokens_remove(toks, quanteda::stopwords("spanish"))
# toks <- quanteda::tokens_remove(toks, quanteda::stopwords("english"))
# toks <- quanteda::tokens_replace(
# toks,
# quanteda::types(toks),
# stringi::stri_replace_all_regex(quanteda::types(toks), "[lsd]['\\p{Pf}]", ""))
if (length(toks) == 0) {
tokens <- quanteda::tokens("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", remove_punct = TRUE)
}
dfm_corpus <- quanteda::dfm(toks)
lookup <- quanteda::dfm_lookup(dfm_corpus, dictionary = sentiment_dictionary, valuetype = "glob")
df <- quanteda::convert(lookup, to="data.frame") %>% select(-c("doc_id"))
df_categories <- df_categories %>%
cbind(df)
if (nrow(df_categories) > 0) {
df_categories <- df_categories %>%
dplyr::mutate(sentiment = .data$positive - .data$neg_positive - .data$negative + .data$neg_negative) %>%
select(-c("txt"))
}
df_category_pads <- data.frame(category = names(category_dictionary), relevance=rep(0L, length(category_dictionary)),
negative=rep(0L, length(category_dictionary)), positive=rep(0L, length(category_dictionary)),
neg_positive=rep(0L, length(category_dictionary)), neg_negative=rep(0L, length(category_dictionary)),
sentiment=rep(0L, length(category_dictionary)))
df_sentiments <- df_categories %>% rbind(df_category_pads)
df_sentiments <- stats::aggregate(df_sentiments[,-c(1)], list(df_sentiments$category), FUN=sum)
names(df_sentiments)[1] <- "category"
return(df_sentiments)
}
######################################################
#' @title clessnverse::detect_language
#' @description detects the language of the text provided as a parameter
#' @param text : the text to translate
#' @param engine : "azure" | "deeptranslate" | "fastText"
#' @return 2 chars language detected
#' @examples example
#'
#' @export
detect_language <- function(engine, text) {
if (is.null(text)) return(NA_character_)
if (is.na(text)) return(NA_character_)
if (nchar(trimws(text)) == 0) return(NA_character_)
if (!(engine %in% c("deeptranslate", "fastText"))) {
stop(paste("translation engine", engine,"not supported. Supported values are: fastText | deeptranslate"))
}
if (engine == "deeptranslate") {
# detect language first
url <- "https://deep-translate1.p.rapidapi.com/language/translate/v2/detect"
text <- gsub("\\\"","'", text)
text <- gsub("\\n"," ",text)
if (is.null(text)) return(NA_character_)
if (is.na(text)) return(NA_character_)
if (nchar(trimws(text)) == 0) return(NA_character_)
df <- tidytext::unnest_tokens(
data.frame(txt=text),
input = txt,
output = "Sentence",
token = "regex",
pattern = "(?<!\\b\\p{L}r)\\.|\\n\\n", to_lower=F)
#clessnverse::logit(scriptname, "detecting language", logger)
response <- httr::VERB(
"POST",
url,
body = paste("{\"q\":\"", df$Sentence[1],"\"}", sep=''),
httr::add_headers(
'X-RapidAPI-Key' = Sys.getenv("DEEP_TRANSLATE_KEY"),
'X-RapidAPI-Host' = 'deep-translate1.p.rapidapi.com'),
httr::content_type("application/octet-stream")
)
r <- jsonlite::fromJSON(httr::content(response, "text"))
lang <- r$data$detections$language
return(lang)
}
if (engine == "fastText") {
text <- gsub("\\\"","'", text)
text <- gsub("\\n"," ",text)
if (nchar(system.file(package="fastText")) == 0) stop("Package fastText is not installed. Please run remotes::install_github('mlampros/fastText')")
if (!exists("file_ftz")) file_ftz <<- system.file("language_identification/lid.176.ftz", package = "fastText")
lang <- fastText::language_identification(
input_obj = text,
pre_trained_language_model_path = file_ftz,
k = 1,
th = 0.0,
threads = 1,
verbose = FALSE
)
return(lang$iso_lang_1)
}
}
######################################################
#' @title clessnverse::translate_text
#' @description translates the text provided as a parameter using language autodetection in the language
#' @param text : the text to translate
#' @param engine : "azure" | "deeptranslate"
#' @param source_lang : the language to translate from - if NA, then automatic detection will be performed
#' @param target_lang : which language to translate to
#' @param translate : translates for real. this is helpful to set to FALSE when debugging to avoid translation API fees
#' @return a vector containing 1. a string = language detected and 2. a string = the translated text
#' @examples example
#'
#'
#' @export
translate_text <- function (text, engine = "azure", source_lang = NA, target_lang = "en", translate = FALSE) {
if (is.null(text)) return(NA_character_)
if (is.na(text)) return(NA_character_)
if (nchar(trimws(text)) == 0) return(NA_character_)
text <- gsub("\u00a0", " ", text)
if (!(engine %in% c("azure", "deeptranslate"))) {
stop(paste("translation engine", engine,"not supported. Supporter values are: azure | deeptranslate"))
}
if (engine == "azure") {
base_url <- "https://api.cognitive.microsofttranslator.com/"
path <- '/translate?api-version=3.0'
params = paste('&to=', target_lang, sep="")
url <- paste(base_url, path, params, sep="")
# There atr characters that need to be escaped (or even removed) in order for the translator to
# be able to take them
text <- stringr::str_replace_all(text, "\\'", "\\\\'")
text <- stringr::str_replace_all(text, "\\\u00ab", "")
text <- stringr::str_replace_all(text, "\\\u00bb", "")
text <- stringr::str_replace_all(text, "\\\u00ab", "")
text <- stringr::str_replace_all(text, "\\\u00bb", "")
text <- stringr::str_replace_all(text, "\\\u2019", "\\\\'")
key <- Sys.getenv("AZURE_TRANSLATE_KEY")
headers <- httr::add_headers(`Ocp-Apim-Subscription-Key`=key,
`Ocp-Apim-Subscription-Region`= "canadacentral",
`Content-Type` = "application/json")
if(translate) {
response <- httr::POST(url, headers,
body = paste("[{'Text':'",text,"'}]", sep = ""),
encode = "json")
response_json <- jsonlite::parse_json(httr::content(response, "text"))
while (!is.null(response_json[1][[1]]$code) && response_json[1][[1]]$code == "429001") {
Sys.sleep(30)
response <- httr::POST(url, headers,
body = paste("[{'Text':'",text,"'}]", sep = ""),
encode = "json")
response_json <- jsonlite::parse_json(httr::content(response, "text"))
}
if (is.na(source_lang)) {
return(
c(
response_json[1][[1]]$detectedLanguage[1]$language,
response_json[1][[1]]$translations[[1]]$text
)
)
} else {
return(response_json[1][[1]]$translations[[1]]$text)
}
} else {
return(
c(
"Fake lang code - use translate = TRUE if you want to consume the translation service",
"Fake translation text - use translate = TRUE if you want to consume the translation service"
)
)
} # if(translate)
} # if (engine == "azure")
if (engine == "deeptranslate") {
key <- Sys.getenv("DEEP_TRANSLATE_KEY")
text <- gsub("\\n", "\\[000\\]", text)
# if source_lang = NA let's detect the language first
if (is.na(source_lang)) source_lang <- clessnverse::detect_language(engine = "deeptranslate", text)
# translate next
url <- "https://deep-translate1.p.rapidapi.com/language/translate/v2"
if (nchar(text) > 3000) {
# more than 5000 characters
df <- tidytext::unnest_tokens(
data.frame(txt=text),
input = txt,
output = "Sentence",
token = "regex",
pattern = "(?<!\\b\\p{L}r)\\.|\\n\\n", to_lower=F)
result <- ""
payload_txt <- ""
for (i in 1:nrow(df)) {
if (is.null(df$Sentence[i])) next
if (is.na(df$Sentence[i])) next
if (nchar(trimws(df$Sentence[i])) == 0) next
if ( payload_txt == "" ) {
payload_txt <- trimws(df$Sentence[i])
} else {
if ( nchar(payload_txt) + nchar(df$Sentence[i]) < 3000 && i < nrow(df) ) {
payload_txt <- trimws(paste(payload_txt, trimws(df$Sentence[i]), sep = ". "))
next
}
payload_txt <- trimws(paste(payload_txt, trimws(df$Sentence[i]), sep = ". "))
payload_txt <- paste(payload_txt, ".", sep='')
payload <- paste("{\"q\":\"", payload_txt,"\",\"source\": \"",source_lang,"\",\"target\": \"",target_lang,"\"}", sep='')
encode <- "json"
#clessnverse::logit(scriptname, paste("translating language - pass", i), logger)
response <- httr::VERB(
"POST",
url,
body = payload,
httr::add_headers('X-RapidAPI-Key' = key,
'X-RapidAPI-Host' = 'deep-translate1.p.rapidapi.com'),
httr::content_type("application/json"),
encode = encode)
#clessnverse::logit(scriptname, paste("translating language done - pass", i), logger)
r <- jsonlite::fromJSON(httr::content(response, "text"))
result <- trimws(paste(result,r$data$translations$translatedText, sep=" "))
payload_txt <- ""
} #if (payload_txt == "")
} # for
} else {
# less than 5000 characters
payload <- paste("{\"q\":\"", text,"\",\"source\": \"",source_lang,"\",\"target\": \"",target_lang,"\"}", sep='')
encode <- "json"
#clessnverse::logit(scriptname, "translating language", logger)
response <- httr::VERB(
"POST",
url,
body = payload,
httr::add_headers('X-RapidAPI-Key' = key,
'X-RapidAPI-Host' = 'deep-translate1.p.rapidapi.com'),
httr::content_type("application/json"),
encode = encode)
#clessnverse::logit(scriptname, "translating language done", logger)
r <- jsonlite::fromJSON(httr::content(response, "text"))
result <- trimws(r$data$translations$translatedText)
} #if (nchar(text) > 5000)
result <- gsub("\u00a0", " ", result)
result <- gsub("\\[000\\]", "\\\n", result)
if (is.na(source_lang)) {
return(c(trimws(source_lang), trimws(result)))
} else {
return(trimws(result))
}
} #if (engine == "deeptranslate")
return(
c(
"Fake language code - use translate = TRUE if you want to consume the translation service",
"Fake translation text - use translate = TRUE if you want to consume the translation service"
)
)
}
######################################################
#' @title Replace accented characters with their non-accented counterparts
#' @param str : the text to remove accents from
#' @param pattern : patterns of the accents to remove
#' @return str without accent
#' @examples example
#'
#'
#' @export
rm_accents <- function(str,pattern="all") {
if(!is.character(str))
str <- as.character(str)
pattern <- unique(pattern)
if(any(pattern == "\u00c7"))
pattern[pattern == "\u00c7"] <- "\u00e7" # c cedil uppercase <- lowercase
symbols <- c(
acute = "\u00e1\u0107\u00e9\u00ed\u0144\u00f3\u015b\u00fa\u00c1\u0106\u00c9\u00cd\u0143\u00d3\u015a\u00da\u00fd\u00dd\u017a\u0179",
grave = "\u00e0\u00e8\u00ec\u00f2\u00f9\u00c0\u00c8\u00cc\u00d2\u00d9",
circunflex = "\u00e2\u00ea\u00ee\u00f4\u00fb\u00c2\u00ca\u00ce\u00d4\u00db",
tilde = "\u00e3\u00f5\u00c3\u00d5\u00f1\u00d1",
umlaut = "\u00e4\u00eb\u00ef\u00f6\u0151\u00fc\u00c4\u00cb\u00cf\u00d6\u0150\u00dc\u00ff",
cedil = "\u0105\u0104\u00e7\u00c7\u0119\u0118\u0146\u0145\u021b\u021a\u0219\u0218\u015f\u015e\u00df\u00df\u0163\u0162",
flex="\u0103\u0102\u010f\u010e\u011b\u011a\u0115\u0114\u0148\u0147\u0161\u0160\u010d\u010c\u0159\u0158\u017e\u017d\u0115\u0114",
dotted="\u0117\u0116\u017c\u017b",
round="\u00e5\u00c5",
bar="\u0101\u0100\u0113\u0112\u012b\u012a\u016b\u016a",
interlaced="\u00e6\u00c6",
cross="\u0142\u0141\u00f8\u00d8\u0111\u0110"
)
nudeSymbols <- c(
acute = "aceinosuACEINOSUyYzZ",
grave = "aeiouAEIOU",
circunflex = "aeiouAEIOU",
tilde = "aoAOnN",
umlaut = "aeioouAEIOOUy",
cedil = "aAcCenNEtTsSsSsStT",
flex="aAdDeEeEnNsScCrRzZeE",
dotted="eEzZ",
round="aA",
bar="aAeEiIuU",
interlaced="aA",
cross="lLoOdD"
)
accentTypes <- c("\u00b4","`","^","~","\u00a8","\u00e7","")
if(any(c("all","al","a","todos","t","to","tod","todo")%in%pattern))
return(chartr(paste(symbols, collapse=""), paste(nudeSymbols, collapse=""), str))
for(i in which(accentTypes%in%pattern))
str <- chartr(symbols[i],nudeSymbols[i], str)
return(str)
}
######################################################
#' @title Return a list of words of a sentence
#' @param txt the string
#' @return a list of words
#' @examples example
#'
#'
#'
#' @export
split_words <- function(txt) {
#txt <- gsub("[[:punct:][:blank:]]+", " ", txt)
txt <- gsub("[(\\!|\\'|\\#|\\%|\\&|\\'|\\(|\\)|\\+|\\,|\\/|\\:|\\;|\\<|\\=|\\>|\\?|\\@|\\[|\\/|\\]|\\^|\\_|\\{|\\||\\}|\\~)[:blank:]]+", " ", txt)
#txt <- gsub("[(\\,|\\.|\\;)[:blank:]]+", " ", txt)
txt <- trimws(txt, which="both")
list <- strsplit(txt, "\\s+")[[1]]
return(list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.