
###############################################################################
###############################################################################
###############################################################################
# Module      : dev_t.R
# Description : This module gathers all functions related to the T of the
#               ELT methodology of the data pipelines at the CLESSN
#
#               T stands for 'Transform' and is about taking data from our
#               data warehouse, combining it with other data from our data
#               warehouse, enriching it, and then storing it in our datamarts
#               area in order to consume it in its most refined form to
#               answer questions, conduct scientific research, or visualize
#               it in graphics.
#
#               As a reminder, the data stored in our data warehouse is stored
#               in databases in tables (rectangular format).  Observations in
#               data warehouse tables represent a structured reality as it was
#               stored in the original raw data which was harvested in our
#               data lake.
#
# WARNING     : The functions in this file HAVE NOT BEEN VERIFIED and HAVE NOT
#               been subject to the CLESSN package VERIFICATION checklist.
#               Also, their relevance to the clessnverse package has not been
#               confirmed either.







###############################################################################
###############################################################################
###############################################################################
#   DATAWAREHOUSE FUNCTIONS (READ)



###############################################################################
#' @title clessnverse::get_warehouse_table
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' get_warehouse_table allows the programmer to retrieve a data
#' table from the CLESSN data warehouse named hublot.
#'
#' @param table_name The name of the table to retrieve from the warehouse
#'                   without the 'clhub_tables_warehouse_' prefix
#' @param credentials The hublot credentials obtained from the
#'                    hublot::get_credentials function
#' @param data_filter A named list of filter conditions applied to the query
#'                    (for example list(key__exact = "some_key")).  Defaults
#'                    to an empty list, which selects all rows
#' @param nbrows   Optional argument
#'                 If nbrows is greater than 0, the dataframe returned will be
#'                 limited to nbrows observations.  This is particularly useful
#'                 when trying to see if there are records in a table and how
#'                 they are structured.
#'                 If nbrows is omitted, then all rows of the table are returned
#'
#' @return returns a dataframe containing the data warehouse table content
#'         with the document.id and creation & update time stamps
#'
#' @examples
#'  \dontrun{
#'  # connect to hublot
#'  credentials <- hublot::get_credentials(
#'    Sys.getenv("HUB3_URL"),
#'    Sys.getenv("HUB3_USERNAME"),
#'    Sys.getenv("HUB3_PASSWORD")
#'    )
#'
#'  # gets the entire warehouse table 'people'
#'  clessnverse::get_warehouse_table('people', credentials)
#'
#'  # gets the first 10 rows of the 'political_parties_press_releases' table
#'  clessnverse::get_warehouse_table(
#'    table_name = 'political_parties_press_releases',
#'    data_filter = list(),
#'    credentials = credentials,
#'    nbrows=10
#'    )
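#'
#'  # gets only the rows matching a hublot filter condition
#'  # (the key below is illustrative; use a key that exists in your table)
#'  clessnverse::get_warehouse_table(
#'    table_name = 'people',
#'    data_filter = list(key__exact = "some_key"),
#'    credentials = credentials
#'    )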
#'  }
#'
#' @export
#'
get_warehouse_table <- function(table_name, credentials, data_filter=list(), nbrows=0) {

  function_name <- "get_warehouse_table"
  # validate arguments
  if (is.null(credentials$auth) || is.na(credentials$auth)) stop(
    paste("You must supply valid hublot credentials in", function_name)
  )

  data <- hublot::list_tables(credentials)
  hublot_tables_list <- tidyjson::spread_all(data)

  if (!paste("warehouse_", table_name, sep="") %in% hublot_tables_list$table_name) stop(
    paste("This table is not in hublot:", table_name)
  )

  table_longname <- paste("clhub_tables_warehouse_", table_name, sep="")

  hublot::count_table_items(table_longname, credentials)

  if (length(data_filter) == 0) {
    page <- hublot::list_table_items(table_longname, credentials)
  } else {
    page <- hublot::filter_table_items(table_longname, credentials, data_filter)
  }
  data <- list()

  repeat {
    data <- c(data, page$results)
    if (length(data_filter) == 0) {
      page <- hublot::list_next(page, credentials)
    } else {
      page <- hublot::filter_next(page, credentials)
    }
    if (is.null(page) || (nbrows != 0 && length(data) >= nbrows)) {
      break
    }
  }

  if (length(data) == 0) {
    warning(paste("table", table_name, "is empty in function", function_name))
    return(data.frame())
  }

  if (nbrows != 0 && length(data) >= nbrows) data <- data[1:nbrows]

  data1 <- replace_null(data)

  df <- data.frame(t(sapply(data1,c)))
  df_data <-  data.frame(t(sapply(df$data,c)))

  # Check if the structure is even or uneven
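  # An "even" schema means every item's data list has the same fields, e.g.
  #   list(list(a = 1, b = 2), list(a = 3, b = 4))   # even
  #   list(list(a = 1),        list(a = 3, b = 4))   # uneven
  # Even schemas can be column-bound directly; uneven ones are spread instead.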
  if (length(unique(sapply(df$data, length))) == 1) {
    # This is very fast on large dataframes but only works on even data schemas
    print("dataset is even")
    df$data <- NULL
    names(df) <- paste("hub.",names(df),sep="")
    df <- as.data.frame(cbind(df,df_data))
    for (col in names(df)) df[,col] <- unlist(df[,col])
  } else {
    print("dataset is uneven")
    # This is slower on large data sets but works on uneven data schemas
    df <- clessnverse::spread_list_to_df(data)
  }


  return(df)
}




###############################################################################
#' @title clessnverse::get_hub2_table
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' get_hub2_table allows the programmer to retrieve a data
#' table from the CLESSN hub2 data warehouse.
#' ** WARNING: hub2 will be decommissioned by the end of 2022 **
#'
#' @param table_name The name of the table to retrieve from the hub2 warehouse
#' @param data_filter A list containing the filters to apply against the query
#'               to retrieve the data.  Only observations in the table
#'               complying with the filter conditions will be returned
#' @param max_pages The number of pages to return.  A page is 1000 rows.
#'                 To return the entire table use *max_pages = -1*
#' @param hub_conf The hub2.0 credentials obtained from the
#'                 clessnhub::login function
#' @importFrom dplyr "%>%"
#' @return returns a dataframe containing the data warehouse table content
#'
#' @examples
#' \dontrun{
#'  hub_config <- clessnhub::login(
#'    Sys.getenv("HUB_USERNAME"),
#'    Sys.getenv("HUB_PASSWORD"),
#'    Sys.getenv("HUB_URL"))
#'
#'  # get the journalists intervention in press conference from the
#'  # 'agoraplus_interventions' table from hub2
#'  data_filter = list(
#'    type = "press_conference",
#'    metadata__location = "CA-QC",
#'    data__speakerType = "journalist",
#'    data__eventDate__gte = "2021-01-01",
#'    data__eventDate__lte = "2022-06-23"
#'    )
#'
#'  df <- clessnverse::get_hub2_table(
#'    table_name = 'agoraplus_interventions',
#'    data_filter = data_filter,
#'    max_pages = -1,
#'    hub_conf = hub_config
#'    )
#'  }
#'
#' @export
#'
get_hub2_table <- function(table_name, data_filter=NULL, max_pages=-1, hub_conf) {

  function_name <- "get_hub2_table"

  http_post <- function(path, body, options=NULL, verify=T, hub_c) {
    token <- hub_c$token
    token_prefix <- hub_c$token_prefix
    response <- httr::POST(
      url=paste0(hub_c$url, path),
      body=body, httr::accept_json(),
      httr::content_type_json(),
      config=httr::add_headers(Authorization=paste(token_prefix, token)),
      verify=verify,
      httr::timeout(30))
    return(response)
  }

  if (!is.null(data_filter) && (!inherits(data_filter, "list") || length(data_filter) == 0)) data_filter <- NULL

  data_filter <- jsonlite::toJSON(data_filter, auto_unbox = TRUE)

  path <- paste("/data/", table_name, "/count/", sep="")
  response <- http_post(path, body=data_filter, hub_c = hub_conf)
  result <- httr::content(response)
  count <- result$count
  print(paste("count:", count))

  path <- paste("/data/", table_name, "/filter/", sep="")
  response <- http_post(path, body=data_filter, hub_c = hub_conf)
  page <- httr::content(response)
  data <- list()

  repeat {

    data <- c(data, page$results)
    print(paste(length(data), "/", count))
    path <- page$"next"

    if (is.null(path)) {
      break
    }

    max_pages <- max_pages - 1
    if (max_pages == 0)
    {
      break
    }

    # keep only the path portion of the 'next' URL (everything after the base
    # URL, which ends in "science")
    path <- strsplit(path, "science")[[1]][[2]]
    response <- http_post(path, body=data_filter, hub_c = hub_conf)
    page <- httr::content(response)
  }

  if (length(data) == 0) {
    warning(paste("table", table_name, "is empty in function", function_name))
    return(data.frame())
  }

  data1 <- replace_null(data)

  df <- data.frame(t(sapply(data1,c)))
  df_data <-  data.frame(t(sapply(df$data,c)))
  df_metadata <- data.frame(t(sapply(df$metadata,c)))

  # Check if the structure is even or uneven
  if (length(unique(sapply(df$data, length))) == 1 && length(unique(sapply(df$metadata, length))) == 1) {
    # This is very fast on large dataframes but only works on even data schemas
    df$data <- NULL
    df$metadata <- NULL
    names(df) <- paste("hub.",names(df),sep="")
    df <- as.data.frame(cbind(df,df_data))
    df <- as.data.frame(cbind(df,df_metadata))


    for (col in names(df)) {df[,col] <- unlist(df[,col])}

  } else {
    # This is slower on large data sets but works on uneven data schemas
    df <- clessnverse::spread_list_to_df(data)
    df_metadata <- df[which(grepl("^metadata\\.", names(df)))]
    df_data <- df[which(grepl("^data\\.", names(df)))]
    df_hub <- dplyr::select(df, -c(c(names(df_data), names(df_metadata))))
    names(df_data) <- gsub("^data\\.", "", names(df_data))
    names(df_metadata) <- gsub("^metadata\\.", "", names(df_metadata))
    names(df_hub) <- paste("hub.", names(df_hub), sep="")
    df <- df_hub %>% dplyr::bind_cols(df_metadata) %>% dplyr::bind_cols(df_data)
  }

  return(df)
}








###############################################################################
###############################################################################
###############################################################################
#   DATAMART FUNCTIONS (READ)



###############################################################################
#' @title clessnverse::get_mart_table
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' get_mart_table allows the programmer to retrieve a data
#' table from a CLESSN data mart.
#'
#' @param table_name The name of the table to retrieve from the data mart
#'                   without the 'clhub_tables_mart_' prefix
#' @param credentials The hublot credentials obtained from the
#'                    hublot::get_credentials function
#' @param data_filter A named list of filter conditions applied to the query
#'                    (for example list(key__exact = "some_key")).  Defaults
#'                    to an empty list, which selects all rows
#' @param nbrows Optional argument
#'               If nbrows is greater than 0, the dataframe returned will be
#'               limited to nbrows observations.  This is particularly useful
#'               when trying to see if there are records in a table and how
#'               they are structured.
#'               If nbrows is omitted, then all rows of the table are returned
#'
#' @return returns a dataframe containing the data mart table content
#'         with the document.id and creation & update time stamps
#'
#' @examples
#' \dontrun{
#'  # connect to hublot
#'  credentials <- hublot::get_credentials(
#'    Sys.getenv("HUB3_URL"),
#'    Sys.getenv("HUB3_USERNAME"),
#'    Sys.getenv("HUB3_PASSWORD")
#'    )
#'
#'  # gets the entire datamart political_parties_press_releases_freq
#'  datamart  <- clessnverse::get_mart_table(
#'    table_name = 'political_parties_press_releases_freq',
#'    data_filter = list(),
#'    credentials = credentials)
#'
#'  # gets the first 10 rows of the data mart table 'political_parties_press_releases_freq'
#'  datamart  <- clessnverse::get_mart_table(
#'    table_name = 'political_parties_press_releases_freq',
#'    data_filter = list(),
#'    credentials = credentials,
#'    nbrows=10)
#'  }
#'
#' @export
#'
get_mart_table <- function(table_name, credentials, data_filter=list(), nbrows=0) {

  function_name <- "get_mart_table"

  # validate arguments
  if (is.null(credentials$auth) || is.na(credentials$auth)) stop(
    paste("You must supply valid hublot credentials in", function_name)
  )

  data <- hublot::list_tables(credentials)
  hublot_tables_list <- tidyjson::spread_all(data)
  if (!paste("mart_", table_name, sep="") %in% hublot_tables_list$table_name) stop(
    paste("This table is not in hublot:", table_name)
  )

  table_longname <- paste("clhub_tables_mart_", table_name, sep="")

  if (length(data_filter) == 0) {
    page <- hublot::list_table_items(table_longname, credentials)
  } else {
    page <- hublot::filter_table_items(table_longname, credentials, data_filter)
  }
  data <- list()

  repeat {
    data <- c(data, page$results)

    if (length(data_filter) == 0) {
      page <- hublot::list_next(page, credentials)
    } else {
      page <- hublot::filter_next(page, credentials)
    }
    if (is.null(page) || (nbrows != 0 && length(data) >= nbrows)) {
      break
    }
  }

  if (length(data) == 0) {
    warning(paste("table", table_name, "is empty in function", function_name))
    return(data.frame())
  }


  if (nbrows != 0 && length(data) >= nbrows) data <- data[1:nbrows]

  data1 <- replace_null(data)

  df <- data.frame(t(sapply(data1,c)))
  df_data <-  data.frame(t(sapply(df$data,c)))

  # Check if the structure is even or uneven
  if (length(unique(sapply(df$data, length))) == 1) {
    print("dataset is even")
    # This is very fast on large dataframes but only works on even data schemas
    df$data <- NULL
    names(df) <- paste("hub.",names(df),sep="")
    df <- as.data.frame(cbind(df,df_data))
    for (col in names(df)) df[,col] <- unlist(df[,col])
  } else {
    print("dataset is uneven")
    # This is slower on large data sets but works on uneven data schemas
    df <- clessnverse::spread_list_to_df(data)
  }


  return(df)
}








###############################################################################
###############################################################################
###############################################################################
#   DATAMART FUNCTIONS (WRITE)

###############################################################################
#' @title Write a row in a data table of a CLESSN data mart
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' @param table_name The name of the data mart table to write an observation to
#'                   without the 'clhub_tables_mart_' prefix.
#' @param key A character string containing the unique primary key of this
#'            observation in the table.  Data integrity of the CLESSN data
#'            model is maintained by having a unique key per observation in
#'            each table.
#' @param row A named list containing the observation to write to the datamart
#'            table.  The names of the list *are the columns* of the table.
#' @param refresh_data Logical.
#'                     If TRUE and an observation with the same key already
#'                     exists in the table, it is overwritten with the new
#'                     values.
#'                     If FALSE, an existing observation with the same key is
#'                     left untouched and the new row is skipped.
#' @param credentials The hublot credentials obtained from the
#'                    hublot::get_credentials function
#' @return No return value.  Called for its side effect of writing an
#'         observation to the data mart table.
#'
#' @examples
#' \dontrun{
#' # connect to hublot
#' credentials <- hublot::get_credentials(
#'   Sys.getenv("HUB3_URL"),
#'   Sys.getenv("HUB3_USERNAME"),
#'   Sys.getenv("HUB3_PASSWORD")
#'   )
#'
#'
#' clessnverse::commit_mart_row(
#'   table_name = "political_parties_press_releases_freq",
#'   key = "QS212022",
#'   row = list(week_num=21, count=6, political_party="QS"),
#'   refresh_data = TRUE,
#'   credentials = credentials)
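#'
#' # same call, but leave any existing observation with that key untouched
#' clessnverse::commit_mart_row(
#'   table_name = "political_parties_press_releases_freq",
#'   key = "QS212022",
#'   row = list(week_num=21, count=6, political_party="QS"),
#'   refresh_data = FALSE,
#'   credentials = credentials)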
#' }
#'
#' @export
#'
commit_mart_row <- function(table_name, key, row = list(), refresh_data = FALSE, credentials) {
  # If a row with the same key exists and refresh_data is TRUE, overwrite it
  # with the new data.  Otherwise leave it untouched (and log a message).
  table_name <- paste("clhub_tables_mart_", table_name, sep="")

  data_filter <- list(key__exact = key)
  item <- hublot::filter_table_items(table_name, credentials, data_filter)

  if (length(item$results) == 0) {
    # the item does not already exist in hublot
    hublot::add_table_item(table_name,
                           body = list(key = key, timestamp = Sys.time(), data = row),
                           credentials)
  } else {
    # the item already exists in hublot
    if (refresh_data) {
      hublot::update_table_item(table_name,
                                id = item$results[[1]]$id,
                                body = list(key = key, timestamp = as.character(Sys.time()), data = jsonlite::toJSON(row, auto_unbox = TRUE)),
                                credentials)
    } else {
      message(paste("skipping existing key", key, "because refresh_data = FALSE"))
    } # if (refresh_data)
  } # if (length(item$results) == 0)
}





###############################################################################
#' @title clessnverse::commit_mart_table
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' commit_mart_table allows the programmer to write a table to a
#' CLESSN data mart.
#'
#' @param table_name The name of the data mart table to store without the
#'                   'clhub_tables_mart_' prefix.
#' @param df A dataframe containing the observations to write to the data
#'           mart table, one row per observation.
#' @param key_column The name of the column of df containing the unique
#'                   primary key of each observation.
#' @param mode A character string containing either "refresh" or "append".
#'             If mode = "refresh" then if an observation with a key = key
#'             already exists in the table, it will be overwritten with the
#'             new values.
#'             If mode = "append" then it will be added to the table.  However
#'             if an existing observation with a key = key already exists in the
#'             table, a warning will be returned.
#' @param credentials The hublot credentials obtained from the
#'                    hublot::get_credentials function
#'
#' @return No return value.  Called for its side effect of writing the
#'         dataframe to the data mart table.
#'
#' @examples
#' \dontrun{
#'  # connect to hublot
#'  credentials <- hublot::get_credentials(
#'    Sys.getenv("HUB3_URL"),
#'    Sys.getenv("HUB3_USERNAME"),
#'    Sys.getenv("HUB3_PASSWORD")
#'    )
#'
#'  # Writes a data frame into political_parties_press_releases_freq
#'  clessnverse::commit_mart_table(
#'    table_name = 'political_parties_press_releases_freq',
#'    df = data.frame(key = "123", count = 1, week_num = "28", party = "CAQ"),
#'    key_column = 'key',
#'    mode = 'refresh',
#'    credentials = credentials)
#' }
#'
#' @export
#'
commit_mart_table <- function(table_name, df, key_column, mode, credentials) {
  table_name <- paste("clhub_tables_mart_", table_name, sep="")

  df <- as.data.frame(df)

  pb_chap <- utils::txtProgressBar(min = 0,      # Minimum value of the progress bar
                                   max = nrow(df), # Maximum value of the progress bar
                                   style = 3,    # Progress bar style (also available style = 1 and style = 2)
                                   width = 80,  # Progress bar width. Defaults to getOption("width")
                                   char = "=")   # Character used to create the bar


  for (i in 1:nrow(df)) {
    utils::setTxtProgressBar(pb_chap, i)

    key <- df[[key_column]][i]

    data_filter <- list(key__exact = key)
    item <- hublot::filter_table_items(table_name, credentials, data_filter)

    data_row <- as.list(df[i, ] %>% dplyr::select(-dplyr::all_of(key_column)))

    if (length(item$results) == 0) {
      # the item does not already exist in hublot
      hublot::add_table_item(table_name,
                             body = list(key = key, timestamp = Sys.time(), data = data_row),
                             credentials)
    } else {
      # the item already exists in hublot
      if (mode == "refresh") {
        hublot::update_table_item(table_name,
                                  id = item$results[[1]]$id,
                                  body = list(key = key, timestamp = as.character(Sys.time()), data = jsonlite::toJSON(data_row, auto_unbox = TRUE)),
                                  credentials)
      } else {
        warning(paste("skipping existing key", key, "because mode is not 'refresh'"))
      } # if (mode == "refresh")
    } # if (length(item$results) == 0)

  } # for (i in 1:nrow(df))

  close(pb_chap)
}








###############################################################################
###############################################################################
###############################################################################
#   DICTIONARIES FUNCTIONS (READ)


###############################################################################
#' Retrieves a dictionary from hublot.
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Creates a dictionary object from a dictionary located in
#' the CLESSN data lake (hublot).
#' @param topic The name or topic of the dictionary to retrieve from hublot.
#' @param lang The language of the dictionary. "en" is for English,
#' "fr" is for French. Both are included by default.
#' @param credentials The user's personal credentials from hublot.
#' @return A quanteda type dictionary object.
#' @author CLESSN
#' @examples
#'
#' \dontrun{
#'  # get credentials from hublot
#'  credentials <- hublot::get_credentials(
#'    Sys.getenv("HUB3_URL"),
#'    Sys.getenv("HUB3_USERNAME"),
#'    Sys.getenv("HUB3_PASSWORD")
#'    )
#'  # retrieve the COVID dictionary in both EN and FR
#'  clessnverse::get_dictionary("covid", c("en", "fr"), credentials)
#' }
#' @export
#'
get_dictionary <-
  function(topic, lang = c("en","fr"), credentials) {
    # Validate arguments (check credentials before using them)
    if (is.null(credentials$auth) || is.na(credentials$auth)) {
      stop("hublot credentials in clessnverse::get_dictionary are invalid")
    }

    file_info <- hublot::retrieve_file("config_dict", credentials)
    config_dict <- utils::read.csv2(file_info$file)

    if (!topic %in% config_dict$topic) {
      stop (
        paste(
          "invalid topic in clessnverse::get_dictionary function:",
          topic,
          "\nvalid topics are",
          paste(config_dict$topic, collapse = ", ")
        )
      )
    }

    if (!unique(unlist(strsplit(config_dict$lang, ","))) %vcontains% lang) {
      stop (paste(
        "invalid language in clessnverse::get_dictionary function:",
        paste(lang, collapse = ", ")
      ))
    }

    # Get dictionary file from lake
    file_key <- paste("dict_", topic, sep = "")
    file_info <- hublot::retrieve_file(file_key, credentials)
    dict_df <- utils::read.csv2(file_info$file, encoding = "UTF-8")

    # Filter on language provided in lang if language is a dictionary feature
    if (!is.null(dict_df$language)) {
      dict_df <- dict_df[dict_df$language %in% lang, ]

      # Remove language column
      dict_df$language <- NULL
    }

    dict_list <- list()
    for (c in unique(dict_df$category)) {
      dict_list[[c]] <- dict_df$item[dict_df$category == c]
    }

    # Convert dataframe to quanteda dict and return it
    qdict <- quanteda::dictionary(as.list(dict_list))
    return(qdict)
  }







###############################################################################
###############################################################################
###############################################################################
#   DATA TRANSFORMATION


###############################################################################
#' @title clessnverse::compute_nb_sentences
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Calculates the number of sentences in a block of text
#' @param txt_bloc A character string containing the block of text
#' @return An integer: the number of sentences in txt_bloc
#' @examples
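#' # counts the sentences in a block of text (returns 2 here)
#' clessnverse::compute_nb_sentences(
#'   "This is a first sentence. This is a second one."
#' )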
#' @export
compute_nb_sentences <- function(txt_bloc) {
  df_sentences <- tibble::tibble(txt = txt_bloc) %>%
    tidytext::unnest_tokens("sentence", "txt", token="sentences",format="text", to_lower = T)

  nb_sentences <- nrow(df_sentences)

  return(nb_sentences)
}


###############################################################################
#' @title clessnverse::compute_nb_words
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Calculates the number of words in a block of text
#' @param txt_bloc A character string containing the block of text
#' @return An integer: the number of words in txt_bloc
#' @examples
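#' # counts the words in a block of text (returns 5 here)
#' clessnverse::compute_nb_words("one two three four five")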
#' @export
compute_nb_words <- function(txt_bloc) {
  df_words <- tibble::tibble(txt = txt_bloc) %>%
    tidytext::unnest_tokens("words", "txt", token="words",format="text", to_lower = T)

  nb_words <- nrow(df_words)

  return(nb_words)
}





###############################################################################
#' @title clessnverse::clean_corpus
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' cleans a block of text by removing double spaces, non-breakable
#' spaces and the French elisions l', s', and d' (as well as the
#' honorifics M., Mr. and Dr.).  This is particularly
#' useful before using the clessnverse::run_dictionary function.
#' @param txt_bloc A character string containing the block of text to clean
#' @return The cleaned character string
#' @examples
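#' # removes elisions and collapses double spaces
#' clessnverse::clean_corpus("l'exemple  d'un texte")
#' # returns "exemple un texte"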
#'
#' @export
clean_corpus <- function(txt_bloc) {
  # Prepare corpus
  txt <- stringr::str_replace_all(string = txt_bloc, pattern = "M\\.|Mr\\.|Dr\\.", replacement = "")
  txt <- stringr::str_replace_all(string = txt, pattern = "(l|L)\\'", replacement = "")
  txt <- stringr::str_replace_all(string = txt, pattern = "(s|S)\\'", replacement = "")
  txt <- stringr::str_replace_all(string = txt, pattern = "(d|D)\\'", replacement = "")
  txt <- gsub("\u00a0", " ", txt)
  txt <- stringr::str_replace_all(string = txt, pattern = "  ", replacement = " ")

  return(txt)
}




###############################################################################
#' @title clessnverse::compute_category_sentiment_score
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' calculates the sentiment score, by category, using a word stems dictionary
#' @param txt_bloc : the block of text to study
#' @param category_dictionary : a topic dictionary containing the categories to calculate the sentiment on
#' @param sentiment_dictionary : a sentiment (lexicoder) dictionary
#' @return returns a dataframe containing the sentiment score of each category in the category dictionary
#' @examples
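#' \dontrun{
#' # a sketch, assuming both dictionaries are retrieved from hublot with
#' # clessnverse::get_dictionary (the dictionary names are illustrative;
#' # the sentiment dictionary must contain the categories positive,
#' # negative, neg_positive and neg_negative)
#' credentials <- hublot::get_credentials(
#'   Sys.getenv("HUB3_URL"),
#'   Sys.getenv("HUB3_USERNAME"),
#'   Sys.getenv("HUB3_PASSWORD")
#'   )
#' topics <- clessnverse::get_dictionary("covid", c("en", "fr"), credentials)
#' sentiments <- clessnverse::get_dictionary("sentiments", c("en", "fr"), credentials)
#'
#' clessnverse::compute_category_sentiment_score(
#'   txt_bloc = "The vaccination campaign is going very well.",
#'   category_dictionary = topics,
#'   sentiment_dictionary = sentiments
#'   )
#' }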
#'
#' @importFrom stats  aggregate
#'
#' @export
compute_category_sentiment_score <- function(txt_bloc, category_dictionary, sentiment_dictionary) {
  # Build one corpus per category and compute the sentiment on each corpus

  # clean the corpus (same cleaning steps as clessnverse::clean_corpus)
  txt_bloc <- clean_corpus(txt_bloc)

  df_sentences <- tibble::tibble(txt = txt_bloc) %>%
    tidytext::unnest_tokens("sentence", "txt", token="sentences",format="text", to_lower = T)

  toks <- quanteda::tokens(df_sentences$sentence)

  dfm_corpus <- quanteda::dfm(toks)
  lookup <- quanteda::dfm_lookup(dfm_corpus, dictionary = category_dictionary, valuetype = "glob")
  df <- quanteda::convert(lookup, to="data.frame") %>% dplyr::select(-c("doc_id"))

  df_sentences <- df_sentences %>% cbind(df)
  df_sentences <- df_sentences %>% tidyr::pivot_longer(-c(.data$sentence), names_to = "category", values_to = "relevance")
  df_sentences <- df_sentences %>% dplyr::filter(.data$relevance > 0)

  df_categories <- df_sentences %>%
    dplyr::group_by(.data$category) %>%
    dplyr::summarise(txt = paste(.data$sentence, collapse = " "), relevance = sum(.data$relevance))

  df_categories$txt <- stringr::str_replace_all(string = df_categories$txt, pattern = "M\\.|Mr\\.|Dr\\.", replacement = "")

  toks <- quanteda::tokens(df_categories$txt, remove_punct = TRUE)
  # We do not remove stopwords, because we want to keep words like "pas" or
  # "ne", which carry a negative connotation
  # toks <- quanteda::tokens_remove(toks, quanteda::stopwords("french"))
  # toks <- quanteda::tokens_remove(toks, quanteda::stopwords("spanish"))
  # toks <- quanteda::tokens_remove(toks, quanteda::stopwords("english"))
  # toks <- quanteda::tokens_replace(
  #   toks,
  #   quanteda::types(toks),
  #   stringi::stri_replace_all_regex(quanteda::types(toks), "[lsd]['\\p{Pf}]", ""))


  if (length(toks) == 0) {
    toks <- quanteda::tokens("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", remove_punct = TRUE)
  }

  dfm_corpus <- quanteda::dfm(toks)
  lookup <- quanteda::dfm_lookup(dfm_corpus, dictionary = sentiment_dictionary, valuetype = "glob")
  df <- quanteda::convert(lookup, to="data.frame") %>% dplyr::select(-c("doc_id"))

  df_categories <- df_categories %>%
    cbind(df)

  if (nrow(df_categories) > 0) {
    df_categories <- df_categories %>%
      dplyr::mutate(sentiment = .data$positive - .data$neg_positive - .data$negative + .data$neg_negative) %>%
      dplyr::select(-c("txt"))
  }

  df_category_pads <- data.frame(category = names(category_dictionary), relevance=rep(0L, length(category_dictionary)),
                                 negative=rep(0L, length(category_dictionary)), positive=rep(0L, length(category_dictionary)),
                                 neg_positive=rep(0L, length(category_dictionary)), neg_negative=rep(0L, length(category_dictionary)),
                                 sentiment=rep(0L, length(category_dictionary)))

  df_sentiments <- df_categories %>% rbind(df_category_pads)

  df_sentiments <- stats::aggregate(df_sentiments[,-c(1)], list(df_sentiments$category), FUN=sum)
  names(df_sentiments)[1] <- "category"

  return(df_sentiments)
}




######################################################
#' @title clessnverse::detect_language
#' @description detects the language of the text provided as a parameter
#' @param engine : "deeptranslate" | "fastText"
#' @param text : the text whose language is to be detected
#' @return the detected language as a two-character ISO 639-1 code
#' @examples
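#' \dontrun{
#' # fastText runs locally (requires the fastText package);
#' # deeptranslate calls the Deep Translate API and needs the
#' # DEEP_TRANSLATE_KEY environment variable to be set
#' clessnverse::detect_language("fastText", "Bonjour tout le monde")
#' }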
#'
#' @export
detect_language <- function(engine, text) {
  if (is.null(text)) return(NA_character_)
  if (is.na(text)) return(NA_character_)
  if (nchar(trimws(text)) == 0) return(NA_character_)

  if (!(engine %in% c("deeptranslate", "fastText"))) {
    stop(paste("translation engine", engine,"not supported.  Supported values are: fastText | deeptranslate"))
  }

  if (engine == "deeptranslate") {
    # detect language first
    url <- "https://deep-translate1.p.rapidapi.com/language/translate/v2/detect"

    text <- gsub("\\\"","'", text)
    text <- gsub("\\n"," ",text)

    if (is.null(text)) return(NA_character_)
    if (is.na(text)) return(NA_character_)
    if (nchar(trimws(text)) == 0) return(NA_character_)

    df <- tidytext::unnest_tokens(
        data.frame(txt=text),
        input = txt,
        output = "Sentence",
        token = "regex",
        pattern = "(?<!\\b\\p{L}r)\\.|\\n\\n", to_lower=F)

    #clessnverse::logit(scriptname, "detecting language", logger)

    response <- httr::VERB(
      "POST",
      url,
      body = paste("{\"q\":\"", df$Sentence[1],"\"}", sep=''),
      httr::add_headers(
          'X-RapidAPI-Key' = Sys.getenv("DEEP_TRANSLATE_KEY"),
          'X-RapidAPI-Host' = 'deep-translate1.p.rapidapi.com'),
          httr::content_type("application/octet-stream")
    )

    r <- jsonlite::fromJSON(httr::content(response, "text"))

    lang <- r$data$detections$language
    return(lang)
  }

  if (engine == "fastText") {

    text <- gsub("\\\"","'", text)
    text <- gsub("\\n"," ",text)

    if (nchar(system.file(package="fastText")) == 0) stop("Package fastText is not installed.  Please run remotes::install_github('mlampros/fastText')")

    if (!exists("file_ftz")) file_ftz <<- system.file("language_identification/lid.176.ftz", package = "fastText")

    lang <- fastText::language_identification(
      input_obj = text,
      pre_trained_language_model_path = file_ftz,
      k = 1,
      th = 0.0,
      threads = 1,
      verbose = FALSE
    )

    return(lang$iso_lang_1)
  }
}



######################################################
#' @title clessnverse::translate_text
#' @description translates the text provided as a parameter, with optional
#'              automatic detection of the source language
#' @param text : the text to translate
#' @param engine : "azure" | "deeptranslate"
#' @param source_lang : the language to translate from - if NA, then automatic detection will be performed
#' @param target_lang : which language to translate to
#' @param translate : set to TRUE to actually call the translation service.  Setting it to FALSE is helpful when debugging, to avoid translation API fees
#' @return if source_lang is NA, a vector containing 1. the detected language and 2. the translated text; otherwise the translated text only
#' @examples
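#' \dontrun{
#' # azure needs the AZURE_TRANSLATE_KEY environment variable;
#' # with translate = FALSE, placeholder strings are returned instead of
#' # calling the API
#' clessnverse::translate_text(
#'   text = "Bonjour tout le monde",
#'   engine = "azure",
#'   source_lang = NA,
#'   target_lang = "en",
#'   translate = TRUE
#'   )
#' }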
#'
#'
#' @export
translate_text <- function (text, engine = "azure", source_lang = NA, target_lang = "en", translate = FALSE) {
  if (is.null(text)) return(NA_character_)
  if (is.na(text)) return(NA_character_)
  if (nchar(trimws(text)) == 0) return(NA_character_)

  text <- gsub("\u00a0", " ", text)


  if (!(engine %in% c("azure", "deeptranslate"))) {
    stop(paste("translation engine", engine,"not supported.  Supporter values are: azure | deeptranslate"))
  }

  if (engine == "azure") {
    base_url <- "https://api.cognitive.microsofttranslator.com/"
    path <- '/translate?api-version=3.0'
    params = paste('&to=', target_lang, sep="")
    url <- paste(base_url, path, params, sep="")

    # There are characters that need to be escaped (or even removed) in order
    # for the translator to be able to take them
    text <- stringr::str_replace_all(text, "\\'", "\\\\'")
    text <- stringr::str_replace_all(text, "\\\u00ab", "")
    text <- stringr::str_replace_all(text, "\\\u00bb", "")
    text <- stringr::str_replace_all(text, "\\\u00ab", "")
    text <- stringr::str_replace_all(text, "\\\u00bb", "")
    text <- stringr::str_replace_all(text, "\\\u2019", "\\\\'")

    key <- Sys.getenv("AZURE_TRANSLATE_KEY")

    headers <- httr::add_headers(`Ocp-Apim-Subscription-Key`=key,
                                 `Ocp-Apim-Subscription-Region`= "canadacentral",
                                 `Content-Type` = "application/json")

    if(translate) {
      response <- httr::POST(url, headers,
                           body = paste("[{'Text':'",text,"'}]", sep = ""),
                           encode = "json")
      response_json <- jsonlite::parse_json(httr::content(response, "text"))

      while (!is.null(response_json[1][[1]]$code) && response_json[1][[1]]$code == "429001") {
        Sys.sleep(30)

        response <- httr::POST(url, headers,
                               body = paste("[{'Text':'",text,"'}]", sep = ""),
                               encode = "json")

        response_json <- jsonlite::parse_json(httr::content(response, "text"))
      }

      if (is.na(source_lang)) {
        return(
          c(
            response_json[1][[1]]$detectedLanguage[1]$language,
            response_json[1][[1]]$translations[[1]]$text
          )
        )
      } else {
        return(response_json[1][[1]]$translations[[1]]$text)
      }
    } else {
      return(
        c(
          "Fake lang code - use translate = TRUE if you want to consume the translation service",
          "Fake translation text - use translate = TRUE if you want to consume the translation service"
        )
      )
    } # if(translate)
  } # if (engine == "azure")



  if (engine == "deeptranslate") {
    key <- Sys.getenv("DEEP_TRANSLATE_KEY")

    text <- gsub("\\n", "\\[000\\]", text)

    # if source_lang is NA, detect the language first and remember that we
    # did, so we can return the detected language along with the translation
    lang_was_detected <- is.na(source_lang)
    if (lang_was_detected) source_lang <- clessnverse::detect_language(engine = "deeptranslate", text)

    # translate next
    url <- "https://deep-translate1.p.rapidapi.com/language/translate/v2"

    if (nchar(text) > 3000) {
      # more than 3000 characters: split into sentences and translate in chunks
      df <- tidytext::unnest_tokens(
        data.frame(txt=text),
        input = txt,
        output = "Sentence",
        token = "regex",
        pattern = "(?<!\\b\\p{L}r)\\.|\\n\\n", to_lower=F)

      result <- ""
      payload_txt <- ""

      for (i in 1:nrow(df)) {
        if (is.null(df$Sentence[i])) next
        if (is.na(df$Sentence[i])) next
        if (nchar(trimws(df$Sentence[i])) == 0) next

        if ( payload_txt == "" ) {
          payload_txt <- trimws(df$Sentence[i])
        } else {
          if ( nchar(payload_txt) + nchar(df$Sentence[i]) < 3000 && i < nrow(df) ) {
            payload_txt <- trimws(paste(payload_txt, trimws(df$Sentence[i]), sep = ".  "))
            next
          }

          payload_txt <- trimws(paste(payload_txt, trimws(df$Sentence[i]), sep = ".  "))
          payload_txt <- paste(payload_txt, ".", sep='')
          payload <- paste("{\"q\":\"", payload_txt,"\",\"source\": \"",source_lang,"\",\"target\": \"",target_lang,"\"}", sep='')
          encode <- "json"

          #clessnverse::logit(scriptname, paste("translating language - pass", i), logger)

          response <- httr::VERB(
            "POST",
            url,
            body = payload,
            httr::add_headers('X-RapidAPI-Key' = key,
            'X-RapidAPI-Host' = 'deep-translate1.p.rapidapi.com'),
            httr::content_type("application/json"),
            encode = encode)

          #clessnverse::logit(scriptname, paste("translating language done - pass", i), logger)

          r <- jsonlite::fromJSON(httr::content(response, "text"))

          result <- trimws(paste(result,r$data$translations$translatedText, sep=" "))

          payload_txt <- ""
        } #if (payload_txt == "")
      } # for
    } else {
      # 3000 characters or less: translate in a single call
      payload <- paste("{\"q\":\"", text,"\",\"source\": \"",source_lang,"\",\"target\": \"",target_lang,"\"}", sep='')
      encode <- "json"

      #clessnverse::logit(scriptname, "translating language", logger)

      response <- httr::VERB(
        "POST",
        url,
        body = payload,
        httr::add_headers('X-RapidAPI-Key' = key,
        'X-RapidAPI-Host' = 'deep-translate1.p.rapidapi.com'),
        httr::content_type("application/json"),
        encode = encode)

      #clessnverse::logit(scriptname, "translating language done", logger)

      r <- jsonlite::fromJSON(httr::content(response, "text"))

      result <- trimws(r$data$translations$translatedText)
    } #if (nchar(text) > 3000)

    result <- gsub("\u00a0", " ", result)
    result <- gsub("\\[000\\]", "\\\n", result)

    if (lang_was_detected) {
      return(c(trimws(source_lang), trimws(result)))
    } else {
      return(trimws(result))
    }
  } #if (engine == "deeptranslate")


  return(
    c(
      "Fake language code - use translate = TRUE if you want to consume the translation service",
      "Fake translation text - use translate = TRUE if you want to consume the translation service"
    )
  )
}



######################################################
#' @title Replace accented characters with their non-accented counterparts
#' @param str : the text to remove accents from
#' @param pattern : the accent types to remove; the default, "all", removes
#'                  all accents
#' @return str without accents
#' @examples
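#' # strip all accents (the default)
#' clessnverse::rm_accents("crème brûlée")
#' # returns "creme brulee"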
#'
#'
#' @export
rm_accents <- function(str,pattern="all") {
  if(!is.character(str))
    str <- as.character(str)

  pattern <- unique(pattern)

  if(any(pattern == "\u00c7"))
    pattern[pattern == "\u00c7"] <- "\u00e7" # c cedil uppercase <- lowercase

  symbols <- c(
    acute = "\u00e1\u0107\u00e9\u00ed\u0144\u00f3\u015b\u00fa\u00c1\u0106\u00c9\u00cd\u0143\u00d3\u015a\u00da\u00fd\u00dd\u017a\u0179",
    grave = "\u00e0\u00e8\u00ec\u00f2\u00f9\u00c0\u00c8\u00cc\u00d2\u00d9",
    circunflex = "\u00e2\u00ea\u00ee\u00f4\u00fb\u00c2\u00ca\u00ce\u00d4\u00db",
    tilde = "\u00e3\u00f5\u00c3\u00d5\u00f1\u00d1",
    umlaut = "\u00e4\u00eb\u00ef\u00f6\u0151\u00fc\u00c4\u00cb\u00cf\u00d6\u0150\u00dc\u00ff",
    cedil = "\u0105\u0104\u00e7\u00c7\u0119\u0118\u0146\u0145\u021b\u021a\u0219\u0218\u015f\u015e\u00df\u00df\u0163\u0162",
    flex="\u0103\u0102\u010f\u010e\u011b\u011a\u0115\u0114\u0148\u0147\u0161\u0160\u010d\u010c\u0159\u0158\u017e\u017d\u0115\u0114",
    dotted="\u0117\u0116\u017c\u017b",
    round="\u00e5\u00c5",
    bar="\u0101\u0100\u0113\u0112\u012b\u012a\u016b\u016a",
    interlaced="\u00e6\u00c6",
    cross="\u0142\u0141\u00f8\u00d8\u0111\u0110"
  )

  nudeSymbols <- c(
    acute = "aceinosuACEINOSUyYzZ",
    grave = "aeiouAEIOU",
    circunflex = "aeiouAEIOU",
    tilde = "aoAOnN",
    umlaut = "aeioouAEIOOUy",
    cedil = "aAcCenNEtTsSsSsStT",
    flex="aAdDeEeEnNsScCrRzZeE",
    dotted="eEzZ",
    round="aA",
    bar="aAeEiIuU",
    interlaced="aA",
    cross="lLoOdD"
  )

  accentTypes <- c("\u00b4","`","^","~","\u00a8","\u00e7","")

  if(any(c("all","al","a","todos","t","to","tod","todo")%in%pattern))
    return(chartr(paste(symbols, collapse=""), paste(nudeSymbols, collapse=""), str))

  for(i in which(accentTypes%in%pattern))
    str <- chartr(symbols[i],nudeSymbols[i], str)

  return(str)
}


######################################################
#' @title Return the words of a sentence
#' @param txt the string to split into words
#' @return a character vector containing the words of txt
#' @examples
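#' # punctuation is stripped and the sentence is split on whitespace
#' clessnverse::split_words("Hello, world!")
#' # returns c("Hello", "world")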
#'
#'
#'
#' @export
split_words <- function(txt) {
  #txt <- gsub("[[:punct:][:blank:]]+", " ", txt)
  txt <- gsub("[(\\!|\\'|\\#|\\%|\\&|\\'|\\(|\\)|\\+|\\,|\\/|\\:|\\;|\\<|\\=|\\>|\\?|\\@|\\[|\\/|\\]|\\^|\\_|\\{|\\||\\}|\\~)[:blank:]]+", " ", txt)
  #txt <- gsub("[(\\,|\\.|\\;)[:blank:]]+", " ", txt)

  txt <- trimws(txt, which="both")

  list <- strsplit(txt, "\\s+")[[1]]

  return(list)
}