#' Compute descriptive statistics for responses to a typeform question
#' @param x Data.frame with clean typeform survey results (one question only)
#' @return A data.frame where each element is a summary of the correponding question's results
#' @details Statistics are computed based on the type of question that is being asked.
#' @export
summarizeQuestion <- function (x) {
out <- NULL
question_type <- x$type
x <- x$results
if (question_type == 'list') {
n <- length(unique(x[['userid']]))
out <- data.frame(table(x[['value']]), stringsAsFactors=FALSE) %>%
dplyr::mutate(Pct = Freq/n) %>%
dplyr::arrange(desc(Freq))
}
if (question_type %in% c('dropdown','yesno')) {
out <- data.frame(table(x[['value']]), stringsAsFactors=FALSE) %>%
dplyr::mutate(Pct = Freq/sum(Freq)) %>%
dplyr::arrange(desc(Freq))
}
if (question_type == 'opinionscale') {
out <- data.frame(table(x[['value']]), stringsAsFactors=FALSE) %>%
dplyr::mutate(Pct = Freq/sum(Freq)) %>%
dplyr::arrange(Var1)
}
if (question_type == 'number') {
out <- data.frame(min = min(x[['value']], na.rm=TRUE),
q1 = quantile(x[['value']], 0.25, na.rm=TRUE),
median = quantile(x[['value']], 0.5, na.rm=TRUE),
mean = mean(x[['value']], na.rm=TRUE),
q3 = quantile(x[['value']], 0.75, na.rm=TRUE),
max = max(x[['value']], na.rm=TRUE),
nmiss = sum(is.na(x[['value']])))
}
if (question_type %in% c("textfield", "textarea", "email")) {
out <- data.frame(userid=x[['userid']],
value=x[['value']],
stringsAsFactors=FALSE)
}
return(out)
}
#' Compute descriptive statistics for a list of typeform question-responses
#' @param x List of data.frames, each with clean results for a single question
#' @return A list where each element is a summary of the correponding question's results
#' @details Text fields are combined into a single data.frame as the last element (called "Text")
#' @export
summarizeData <- function (x) {
## summarize each individual question
results <- lapply(x, typeformR::summarizeQuestion)
## deal with questions with text field responses
if (1 == 0) {
## combine text fields into their own unique list element (because I don't know what to use them for yet!)
text_id <- as.numeric(which(sapply(x, function(x) x$type %in% c("textfield", "textarea", "email"))))
for (i in seq_along(text_id)) {
id <- text_id[i]
if (i==1) {
out <- results[[id]]
} else {
out <- dplyr::left_join(out, results[[id]], by="userid")
}
names(out)[i+1] <- names(results)[id]
}
## remove from summary data, append to end of summary
results[text_id] <- NULL
results[[length(results)+1]] <- out
names(results)[length(results)] <- "Text"
}
return(results)
}
#' Merge user responses for three different questions
#' @param x List of data.frames, each with clean results for a single question
#' @param ids Vector of question indices
#' @return A list containing {data, type, question}
#' data -- data.frame with the merged data: {userid, value1, value2}
#' @export
mergeQuestions <- function(x, ids) {
# Merge all questions within `ids`
for (i in seq_along(ids)) {
if (i == 1) {
out <- setNames(x[[ ids[i] ]][['results']], c('userid','value1'))
} else {
out <- out %>% dplyr::left_join(setNames(x[[ ids[i] ]][['results']], c('userid',paste0('value',i))),
by = 'userid')
}
}
# Remove rows w/ missing data
out <- out[complete.cases(out), ]
# Store result as a list
list(data = out,
type = as.character(vapply(x[ids], function(x) x$type, character(1))),
question = as.character(vapply(x[ids], function(x) x$question, character(1))))
}
#' Create a bi-variate plot that describes the results of two questions
#' @param x List of data.frames, each with clean results for a single question
#' @param id1 Index of the first question
#' @param id2 Index of the second question
#' @param levels1 Ordered factor levels for the first question
#' @param levels2 Ordered factor levels for the second question
#' @param plot_style Plot style ("bar", "grid", "dot", "boxplot", "violin")
#' @return A plot
#' @import ggplot2
#' @importFrom magrittr "%>%"
#' @details Question type will dictate the plot that is produced:
#' \itemize{
#' \item{\bold{dropdown/list vs. dropdown/list} -- faceted bar graph}
#' }
#' @export
plotTwoQuestions <- function (x, id1, id2, levels1=NULL, levels2=NULL, plot_style=NULL) {
# merge responses
xt <- typeformR::mergeQuestions(x, ids=c(id1, id2))
# remove levels with only 1 response
removeSparseLevels <- function (vals) {
vals <- as.character(vals)
tmp <- data.frame(table(vals))
good_levels <- as.character(tmp[ tmp[,2]>1, 1])
vals[!(vals %in% good_levels)] <- "Other"
vals <- factor(vals, levels=c(good_levels,"Other"))
return(vals)
}
# -- perform on categorical variables only
if ( xt$type[1] %in% c("dropdown","list","yesno") ) {
xt$data$value1 <- removeSparseLevels(xt$data$value1)
}
if ( xt$type[2] %in% c("dropdown","list","yesno") ) {
xt$data$value2 <- removeSparseLevels(xt$data$value2)
}
# set factor levels
if (!is.null(levels1)) { xt$data$value1 <- factor(xt$data$value1, levels=levels1) }
if (!is.null(levels2)) { xt$data$value2 <- factor(xt$data$value2, levels=levels2) }
# -- and drop rows w/ missing data (note: this may be dangerous!)
xt$data <- xt$data[complete.cases(xt$data),]
# bivariate plot
# - categorical vs. categorical
if ( sum(xt$type %in% c("dropdown","list","yesno")) == 2 ) {
# default style = "bar" (if not set correctly)
if (is.null(plot_style) || !is.element(plot_style, c("grid","bar","dot"))) {
plot_style <- "bar"
}
tmp <- xt$data %>% dplyr::group_by(value1) %>%
dplyr::mutate(respondents = dplyr::n_distinct(userid)) %>%
dplyr::group_by(value1, value2) %>%
dplyr::summarize(respondents = head(respondents,1),
n = n()) %>%
dplyr::mutate(pct=n/respondents,
name = paste0(value1," (n = ", respondents, ")"))
if (plot_style == "bar") {
g1 <- tmp %>% ggplot(aes(x=value2, y=pct)) +
geom_bar(stat="identity", fill='orange') +
labs(x=NULL, y="Percentage of Respondents",
title=gsub("<strong>|</strong>", "", names(x)[id1]),
subtitle=gsub("<strong>|</strong>", "", names(x)[id2])) +
scale_y_continuous(labels=scales::percent) +
facet_wrap(~name) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.text.x = element_text(angle=-45, hjust=0))
} else if (plot_style == "grid") {
g1 <- expand.grid(value1=unique(xt$data$value1),value2=unique(xt$data$value2)) %>%
left_join(tmp, by=c('value1','value2')) %>%
dplyr::mutate(pct = ifelse(is.na(pct), 0, pct)) %>%
ggplot(aes(x=value1, y=value2, fill=pct)) +
geom_tile() +
geom_text(aes(label=ifelse(pct>0, scales::percent(round(pct,2)), "")),
color='black') +
labs(x=gsub("<strong>|</strong>", "", names(x)[id1]),
y=gsub("<strong>|</strong>", "", names(x)[id2]),
fill=NULL) +
scale_fill_gradient(labels=scales::percent, low='white', high='lightblue') +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.text.y = element_text(angle = 0, hjust = 1),
plot.title = element_text(hjust = 0.5)) +
guides(fill=FALSE)
} else if (plot_style == "dot") {
g1 <- tmp %>% ggplot() +
geom_point(aes(x=value1, y=value2, size=pct), color='darkred') +
labs(x=NULL, y=NULL, size=NULL,
title=gsub("<strong>|</strong>", "", names(x)[id1]),
subtitle=gsub("<strong>|</strong>", "", names(x)[id2])) +
scale_size_continuous(labels=scales::percent) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
}
}
# - numeric vs. numeric
if ( sum(xt$type %in% c("number","opinionscale")) == 2 ) {
g1 <- xt$data %>% ggplot(aes(x=value1, y=value2)) +
geom_jitter(width=0.15, height=0.15) +
geom_smooth(method = "lm", se=FALSE, color='darkgreen') +
labs(x=xt$question[1], y=xt$question[2],
title = paste0("Correlation = ", round(cor(xt$data$value1, xt$data$value2),2))) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
}
# - numeric vs. categorical
if ( sum(xt$type %in% c("dropdown","list","yesno")) == 1 &
sum(xt$type %in% c("number","opinionscale")) == 1 ) {
idx <- which(xt$type %in% c("dropdown","list","yesno"))
idy <- which(xt$type %in% c("number", "opinionscale"))
# default plot_style = "boxplot" (if not set correctly)
if (is.null(plot_style) || !is.element(plot_style, c("boxplot","violin"))) {
plot_style <- "boxplot"
}
if (plot_style == "boxplot") {
g1 <- xt$data %>% ggplot(aes_string(x=names(xt$data)[idx+1],
y=names(xt$data)[idy+1])) +
geom_boxplot(fill='dodgerblue') +
labs(x=xt$question[idx], y=xt$question[idy]) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
} else if (plot_style == "violin") {
g1 <- xt$data %>% ggplot(aes_string(x=names(xt$data)[idx+1],
y=names(xt$data)[idy+1])) +
geom_violin(fill='dodgerblue') +
labs(x=xt$question[idx], y=xt$question[idy]) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
}
}
g1
}
#' Test for an association between two survey questions
#' @param x A list where each element represents a single question
#' @param id1 Index of the first question
#' @param id2 Index of the second question
#' @return P-value for the test of association
#' @export
associationTest <- function (x, id1, id2) {
## Merge two questions
x <- typeformR::mergeQuestions(x, c(id1, id2))
## Question type
qtype_cat <- c('dropdown','list','yesno')
qtype_num <- c("number", "opinionscale")
## Test for an association
## - two category = chi-square test
## - one category, one numeric = anova test
## - two numeric = Pearson correlation test (<-- still need to add)
if ( length(unique(x$data$value1))<=1 | length(unique(x$data$value2))<=1 ) {
pval <- NA
} else if ( sum(x$type %in% qtype_cat) == 2 ) {
pval <- chisq.test(x$data$value1, x$data$value2)$p.value
} else if ( sum(x$type %in% qtype_num)==1 & sum(x$type %in% qtype_cat)==1 ) {
x_id <- which(x$type %in% qtype_cat) + 1
y_id <- which(x$type %in% qtype_num) + 1
pval <- anova(lm(x$data[,y_id] ~ x$data[,x_id]))[1,5]
} else if ( sum(x$type %in% qtype_num) == 2 ) {
pval <- cor.test(x$data$value1,x$data$value2)$p.value
} else {
pval <- NA
}
return(pval)
}
#' Test associations between all survey questions
#' @param data A data.frame returned from typeformR::cleanData()
#' @return A list that includes:
#' data -- data.frame with: {question1, question1, pvalue}
#' plot -- association heatmap
#' @export
getAssociations <- function (data) {
n <- length(data)
assoc_summary <- NULL
for (id in 1:n) {
idc <- setdiff(1:n, id)
tmp <- data.frame(question1 = rep(names(data)[id], n-1),
question2 = names(data)[idc],
pvalue = sapply(idc,
function(x) suppressWarnings(typeformR::associationTest(data, id, x))))
if (is.null(assoc_summary)) {
assoc_summary <- tmp
} else {
assoc_summary <- rbind(assoc_summary, tmp)
}
}
assoc_summary$question1 <- factor(assoc_summary$question1, levels=names(data))
assoc_summary$question2 <- factor(assoc_summary$question2, levels=names(data))
# -- stats
stats <- data.frame(n_pairs = nrow(assoc_summary),
n_missing = sum(is.na(assoc_summary$pvalue)),
n_sig_001 = sum(assoc_summary$pvalue < .001, na.rm=TRUE),
n_sig_01 = sum(assoc_summary$pvalue < .01, na.rm=TRUE),
n_sig_05 = sum(assoc_summary$pvalue < .05, na.rm=TRUE))
# -- heatmap
g1 <- assoc_summary %>%
ggplot(aes(x=question1, y=question2, fill=pvalue)) +
geom_tile(aes(text=sprintf("%s<br>%s<br>%s",
question1, question2,
sprintf('%0.3f',round(pvalue,3))))) +
scale_fill_gradient(low="red", high="white") +
labs(x="Question 1", y="Question 2", fill="P-value") +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
# -- P-value vs. [# of pairs]
g2 <- assoc_summary %>%
dplyr::arrange(pvalue) %>%
dplyr::mutate(count=row_number()) %>%
ggplot(aes(x=count, y=ifelse(is.na(pvalue), 1, pvalue),
color=ifelse(is.na(pvalue),'a','b'))) +
geom_line() +
scale_color_manual(values=c("a"="gray", "b"="black"), guide=FALSE) +
labs(x="# of Question Pairs", y="P-value") +
theme_bw()
list(data = assoc_summary,
stats = stats,
heatmap = g1,
plot = g2)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.