surveystats: Survey Monkey Statistical Analysis

Documented in load_data tab_all_survey_question tab_survey_question

#' Tabulate a survey question
#'
#' \code{tab_survey_question} Create a statistical table for a target question
#'
#' This function create a tibble table, with percent values for each answer and knowledge level for
#' all sub-question (if any) regarding the target main question. It takes a survey data, select the main question,
#' with it's respectives sub-questions, and calculate percent values for each answer, sub-question and knowledge level.
#' Also for each sub-question, it makes a chi-square test for the correlation between responses and knowledge level.
#' Finally, the function applies the question and answers labels.
#'
#' @author Leonardo Rocha
#'
#' @param surveydata List. A list of composed of tibbles of survey data, answers labels and questions labels, generated by \code{load_data} function.
#' @param question_number Numeric. Number of the main question
#'
#' @return Tibble. Table with percent values and p-value (chi-squared test between knowledge level) for sub-questions and knowledge levels.
#'
#' @importFrom magrittr %>%
#' @export
#'
#' @examples
#' tab_survey_question(surveydata = AI_survey, question_number = 4)
#' tab_survey_question(AI_survey, 2)
tab_survey_question <- function(surveydata, question_number) {

  knowledge_distinction = F
  order_reverse = T

  survey_data <- surveydata[['data']]
  question_label <- surveydata[['questions']]
  answer_label <- surveydata[['answers']]


  # geração do string que vai ser utilizado para poder selecionar as colunas de dt
  # que serão utilizadas para a tabulação
  question_start <- paste("q", question_number, "_", sep = "")

  # tabulação principal e simples
  # agrupando por subquestão (answer)
  # eliminando as linhas sem respostas
  # transformando as respostas e o conhecimento em fator
  # calculando as contagens por conhecimento e resposta (mas mantendo a questão para adicionar rotulo)
  # por fim completando aquelas respostas que não tiveram frequência para algum grupo com 0
  qtab <- survey_data %>%
    dplyr::select("q1_1", tidyselect::starts_with(question_start)) %>%
    dplyr::rename(knowledge = 1) %>%
    tidyr::gather(question, answer, -1) %>%
    dplyr::filter(answer != "") %>%
    dplyr::mutate(answer = factor(answer),
                  knowledge = factor(knowledge)) %>%
    dplyr::group_by(knowledge, question, answer) %>%
    dplyr::summarise(count = dplyr::n()) %>%
    tidyr::complete(answer, fill = list(count = 0)) %>%
    dplyr::ungroup()


  # calculando o p-valor do teste qui-quadrado por questão.
  # isso é para ver se tem alguma diferença por nível de conhecimento
  # para isso são geradas tabelas de contigência, agrupadas por questão
  # são então cada tabela de contigência é agrupada para questao em uma variável
  # data onde fica localizada a tabela de contigência
  p_values <- qtab %>%
    tidyr::spread(answer, count) %>%
    dplyr::select(-knowledge) %>%
    dplyr::group_by(question) %>%
    tidyr::nest() %>%
    dplyr::mutate(p_value = purrr::as_vector(lapply(data, chi.square_pvalue))) %>%
    dplyr::select(-data) %>%
    dplyr::ungroup()

  # geração de uma tabela com os totais, dos dois níveis de conhecimento
  # é adicionado o valor 3 para conhecimento que será representante de total
  qtab_total <- qtab %>%
    dplyr::group_by(question, answer) %>%
    dplyr::summarise(count = sum(count),
                     knowledge = "3") %>%
    dplyr::ungroup()

  # agrugem dos valores de p_valor por questão
  # nível de conhecimento total
  # também são contabilizados os totais por questão e nível de conhecimento,
  # e contabilizados os valores percentuais de cada resposta
  # ao final são aplicadas as labels às variáveis categóricas representadas por números:
  # questão principal
  # sub questão
  # resposta
  result <- qtab %>%
    dplyr::bind_rows(qtab_total) %>%
    dplyr::group_by(question, knowledge) %>%
    dplyr::mutate(total = sum(count),
                  percent = count / total) %>%
    dplyr::ungroup()


  # Isso daqui é uma estrutura provisória, testando um formato sem a distinção por conhecimento
  # porém os cálculos para fazer a segmentação por nível de conhecimento ainda são realizados
  # caso necessário seria fácil a reversão, bastando juntar a parte de baixo com a parte de cima
  # selecionadno apenas o caso em que há distinção por nível de conhecimento
  if (knowledge_distinction) {
    result <- result %>% dplyr::left_join(p_values, by = "question")
  } else {
    result <- result %>% dplyr::filter(knowledge == "3") %>% dplyr::select(-knowledge)
  }

  result <- result %>%
    apply_labels(question_label = question_label, answer_label = answer_label)

  # adaptação para contemplar os casos em que a base de dados está em uma ordem inversa das categorias
  if (order_reverse) {result$answer_label <- forcats::fct_rev(result$answer_label)}

  return(result)





#  qtab %>%
#    dplyr::bind_rows(qtab_total) %>%
#    dplyr::group_by(question, knowledge) %>%
#    dplyr::mutate(total = sum(count),
#                  percent = count / total) %>%
#    dplyr::ungroup() %>%
#    dplyr::left_join(p_values, by = "question") %>%
#    apply_labels(question_label = question_label, answer_label = answer_label)
}






#' Tabulate all survey questions
#'
#' Apply \code{tab_survey_question} to all questions in a survey data
#'
#' @author Leonardo Rocha
#'
#' @param surveydata List. A list of composed of tibbles of survey data, answers labels and questions labels, generated by \code{load_data} function.
#'
#' @return List. A list of Tables. A Table for each question
#' with percent values and p-value (chi-squared test between knowledge level)
#' for sub-questions and knowledge levels.
#'
#' @export
tab_all_survey_question <- function(surveydata) {


  n_questions <- get_n_questions(surveydata[['data']])
  questions_list <- seq(2, n_questions)
  data_list <- purrr::map(questions_list, tab_survey_question, surveydata = surveydata)
  name_list <- sapply(questions_list, function(x) {paste('q', x, sep = '')})
  names(data_list) <- name_list

  return(data_list)
}






#' Load data
#'
#' Load the survey data from an excel file. The data in the excel must be in a very specific form.
#'
#' @author Leonardo Rocha
#'
#' @param file_path Character. path to the XLSX file
#'
#' @return List. List with tree named Tibbles, 'data', 'questions' and 'answers'.
#'
#' @export
load_data <- function(file_path) {
  list(
    'data' = openxlsx::read.xlsx(file_path, sheet = 'data') %>%
      dplyr::mutate_all(as.character) %>%
      dplyr::as_tibble(),
    'questions' = openxlsx::read.xlsx(file_path, sheet = 'questions') %>%
      dplyr::mutate_all(as.character) %>%
      dplyr::as_tibble(),
    'answers' = openxlsx::read.xlsx(file_path, sheet = 'answers') %>%
      dplyr::mutate_all(as.character) %>%
      dplyr::as_tibble()
  )
}