repvisForODK: Analyse data from ODK Central

Documented in free_text_wordcloud preprocess_wc_generation

#' Generates a wordcloud based on text from free text questions in an ODK form.
#'
#' The main computation, namely the creation of the corpus, the preprocessing of the text in the corpus
#' and the generation of the wordcloud is done through the function \code{\link{preprocess_wc_generation}}.
#' This function serves as a wrapper that applies the \code{\link{collection_period}} to every free text question
#' column that the user specified in the argument text_col.
#'
#' @param svc Logical that indicates whether the data shall be parsed using ruODK's \code{\link[ruODK]{odata_submission_get}}. Optional, defaults to FALSE.
#' @param df Data frame containing the ODK data that is to be used. Optional, defaults to NULL.
#' @param csv Character that specifies the path to the csv file that is to be read. Optional, defaults to NULL.
#' @param text_col Character or Character vector (if multiple questions shall be examined) that specifies the names of the columns of the free text questions.
#' @param lang_wc Character that specifies the language of the answers of the free text question. Check \code{\link[tm]{stopwords}} to find out more about stopwords list options.
#' @param lang Character that specifies the language of the answers of the free text question. Check \code{\link[tm]{stopwords}} to find out more about stopwords list options.
#' @param df_schema_ext Data frame that defines the schema of the from. Can be passed to the function to avoid downloading it multiple times. Optional, defaults to NULL.
#' @param choice_col String specifying the choices column that is to be changed, defaults to NULL.
#' @param label_col String specifying the labels column that is to be changed, defaults to NULL.
#'
#' @return List
#'
#' @export
#' @import tm wordcloud2
#'
#' @examples
#' \dontrun{
#' # 1. with SVC
#' # ruODK needs to be set up for this function to work
#' repvisforODK::setup_ruODK(svc = example/svc.svc, un = exampleusername, pw = examplepassword, tz = 'Europe/Berlin', verbose = TRUE)
#'
#' free_text_wordcloud(svc = TRUE, text_col = c('question2', 'question5'), lang_wc = 'french', lang = 'english', delimiter = ' ', choice_col = 'choices_english_(en)', label_col = 'label_english_(en)')
#'
#' # 2. with data frame and external form schema
#' df_schema = ruODK::form_schema_ext()
#'
#' free_text_wordcloud(df = df_odk_data, text_col = c('question2', 'question5'), lang_wc = 'french', lang = 'english', df_schema_ext = df_schema, delimiter = ' ', choice_col = 'choices_english_(en)', label_col = 'label_english_(en)')
#'
#' # 3. with csv and qvec
#' free_text_wordcloud(csv = 'example/file/odk_data.csv', text_col = c('question2', 'question5'), lang_wc = 'french', lang = 'english', qvec = c('question1', 'question4'), delimiter = ' ', choice_col = 'choices_english_(en)', label_col = 'label_english_(en)')
#' }
free_text_wordcloud <- function(svc = FALSE, df = NULL, csv = NULL, text_col, lang_wc, lang = NULL, df_schema_ext = NULL, choice_col = NULL, label_col = NULL) {

  if (svc & !is.null(df_schema_ext)) {
    stop('Please specify one and only one of the arguments "svc" and "df_schema_ext".')
  }

  # load data
  df <- repvisforODK::check_data_args(df, csv, svc)

  # calls function that creates word cloud for each free text column that was specified in the text_col argument
  wc_list = lapply(text_col,
                   preprocess_wc_generation, df_c = df, lang_wc_c = lang_wc, svc_c = svc, lang_c = lang, df_schema_ext_c = df_schema_ext, choice_col_c = choice_col, label_col_c = label_col)

  return(wc_list)
}

#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#' Takes a data frame column, generates a corpus, preprocesses it and generates a word cloud.
#'
#' This function is used within \code{\link{free_text_wordcloud}}. It uses the tm package to create
#' a corpus, apply preprocessing and create a Term Document Matrix. Then wordcloud2 is used to create
#' the word cloud.
#'
#' @param text_col Character or Character vector (if multiple questions shall be examined) that specifies the names of the columns of the free text questions.
#' @param lang_wc_c Character that specifies the language of the answers of the free text question. Check \code{\link[tm]{stopwords}} to find out more about stopwords list options.
#' @param df_c Data frame containing the ODK data that is to be used. Optional, defaults to df which is the df parsed in \code{\link{free_text_wordcloud}}.
#' @param lang_c Character containing the name of the language that is to be examined.
#' @param svc_c Logical that specifies whether the data is coming directly from ODK or not.
#' @param df_schema_ext_c Data frame that defines the schema of the from. Can be passed to the function to avoid downloading it multiple times. Optional, defaults to NULL.
#' @param choice_col_c String specifying the choices column that is to be changed, defaults to NULL.
#' @param label_col_c String specifying the labels column that is to be changed, defaults to NULL.
#'
#' @return wordcloud2 html-widget
#'
#' @export
#' @import tm wordcloud2 DT
#'
#' @examples
#' \dontrun{
#' Function is only used within free_text_wordcloud(). See the latter for example.
#' }
preprocess_wc_generation <- function(text_col, lang_wc_c, df_c = df, lang_c = lang, svc_c = svc, df_schema_ext_c = df_schema_ext, choice_col_c = choice_col, label_col_c = label_col) {

  # isolating text in vector
  text = df_c[[text_col]]
  text = text[!is.na(text)]

  # creating corpus
  corpus <- tm::Corpus(tm::VectorSource(text))

  # preprocessing corpus
  corpus <- corpus %>%
    tm::tm_map(removeNumbers) %>%
    tm::tm_map(removePunctuation) %>%
    tm::tm_map(stripWhitespace)
  corpus <- tm::tm_map(corpus, content_transformer(tolower))
  corpus <- tm::tm_map(corpus, removeWords, stopwords(lang_wc_c))

  # creating TD matrix
  dtm <- tm::TermDocumentMatrix(corpus)
  dtm_matrix <- as.matrix(dtm)
  words <- sort(rowSums(dtm_matrix),decreasing = TRUE)
  df_wc <- data.frame(word = names(words),freq = words)

  # defining title
  if (svc_c | !is.null(df_schema_ext_c)) {

    if (svc_c) {
      df_schema <- ruODK::form_schema_ext()
    } else df_schema <- df_schema_ext_c

    df_schema = repvisforODK::rename_schema(df = df_schema, lang = lang_c, choice_col = choice_col_c, label_col = label_col_c)

    title = df_schema$labels_fin[df_schema$ruodk_name == text_col]
  } else title = text_col

  # generating word cloud
  wc = wordcloud2::wordcloud2(df_wc,
                              size=1.6,
                              color=repvisforODK::set_color('contrast_scale')
                              )

  # adding title to the html widget
  wc <- repvisforODK::add_html_title_tag(wc, title)

  return(wc)
}