#' Credit goes to Zack Arno
#' Get NA percentages and frequency for each column in the data set
#' @param data The dataset (data.frame).
#' @return frequency and proportion table of NA rates.
#' @export
get_na_response_rates<-function(data){
na_count_per_question<-sapply(data, function(y) sum(length(which(is.na(y)))))
na_percent_per_question <-sapply(data, function(y) ((sum(length(which(is.na(y)))))/nrow(data))*100)
non_response_df<-data.frame(num_non_response=na_count_per_question,perc_non_response= na_percent_per_question)
non_response_df1<-non_response_df %>%
mutate(question=rownames(.)) %>%
dplyr::select(question, everything()) %>%
arrange(num_non_response, question)
}
#' @name auto_detect_sm_parents
#' @rdname auto_detect_sm_parents
#' @title Detect select multiple parent columns
#'
#' @description `auto_detect_sm_parents` is mean to detect select multiple parent columns in a way that does
#' not rely on the XLSForm as the input
#'
#' @param df a survey object or dataframe
#' @param sm_sep select multiple parent child separator. This is specific for XLSForm data (default = /).
#' If using read_csv to read in data the separator will most likely be '/' where as if using read.csv it will likely be '.'
#' @return a list of select multiple parent columns in data set.
#'
#'
#' @export
auto_detect_sm_parents<- function(df, sm_sep="."){
sm_parents<-sub(glue::glue('.[^\\{sm_sep}]*$'), '', colnames(df))
sm_parents<-data.frame(col_names=sm_parents[sm_parents!=""])
select_multiple_detected<-sm_parents %>%
group_by(col_names) %>%
summarise(n=n()) %>%
filter(n>1) %>%
select(col_names)
return(as.character(select_multiple_detected$col_names))
}
#' @name auto_sm_parent_child
#' @rdname auto_sm_parent_child
#' @title detect and group together select multiple parent and children columns
#' @description `auto_sm_parent_child` is mean to detect select multiple parent columns & children columns in a way that does
#' not rely on the XLSForm as the input
#' @param df a survey object or dataframe
#' @param sm_sep select multiple parent child separator. This is specific for XLSForm data (default = /).
#' If using read_csv to read in data the separator will most likely be '/' where as if using read.csv it will likely be '.'
#' @return a data frame containing the the child select multiple columns alongside there parents
#' @export
auto_sm_parent_child<- function(df, sm_sep="."){
sm_parents<-auto_detect_sm_parents(df, sm_sep)
sm_child<- df %>%
select(starts_with(glue::glue("{sm_parents}{sm_sep}"))) %>%
colnames()
tibble(
sm_parent=sub(glue::glue('.[^\\{sm_sep}]*$'),'',sm_child),
sm_child
)
}
#' @name survey_collapse_binary_long
#' @rdname survey_collapse_binary_long
#' @title Collapse logical binary columns into tidy long format
#'
#' @description `survey_collapse_binary_long()` uses the srvyr [srvyr::survey_mean] & survey package [survey::svymean] methods
#' to collapse/or aggregate binary logical data. This function can be used on its own, but was build mainly to for its use in [butteR::survey_collapse]
#' which is meant to help batch analyze data
#'
#' @param dfsvy a survey or preferably srvyr object
#' @param x columns to collapse
#' @param disag the columns to collapse/ subset by(analagous to [[dplyr::group_by]] to [[dplyr::summarise]]) flow
#' @param na_val if you want NA replaced by value. By default NA values will be removed prior to aggregation. It is recommended
#' that you do not adjust this value and deal with na values as a separate step
#' @param sm_sep select multiple parent child separator. This is specific for XLSForm data (default = /).
#' If using read_csv to read in data the separator will most likely be '/' where as if using read.csv it will likely be '.'
#' @return a long format data frame containing the collapsed data.
#'
#'
#' @export
survey_collapse_binary_long<- function(df,
x,
disag=NULL,
na_val=NA_real_,
sm_sep="/" ) {
if(is.na(na_val) & !all(!is.na(df$variables[[x]]))){
df<-df%>%
filter(!is.na(!!sym(x)))
}
if(!is.na(na_val)){
df<-df %>%
mutate(
!!x:=ifelse(is.na(x), na_val,x)
)
}
if(!is.null(disag)){
disag_syms<-syms(disag)
df<-df %>%
group_by(!!!disag_syms)
df_n<-df
if(is.logical(df$variables[[x]])) {
df_n<-df %>%
group_by(!!!disag_syms,!!x:=factor(!!sym(x)),.drop=FALSE)
vec_n<-df_n %>%
summarise(n_unweighted= unweighted(n())) %>%
filter(!!sym(x)==T) %>%
pull(n_unweighted)
}
if(!is.logical(df$variables[[x]])) {
df_n<-df %>%
group_by(!!!disag_syms,.drop=FALSE)
vec_n<-df_n %>%
summarise(n_unweighted= unweighted(n())) %>%
pull(n_unweighted)
vec_m<-df_n %>%
summarise(median= survey_median(!!sym(x),na.rm =T,vartype = "ci")) %>%
pull(median)
}
}
if(is.null(disag)){
if(is.logical(df$variables[[x]])) {
df_n<-df %>%
group_by(!!sym(x),.drop=F)
vec_n<-df_n %>%
summarise(n_unweighted= unweighted(n())) %>%
filter(!!sym(x)==T) %>%
pull(n_unweighted)
}
if(!is.logical(df$variables[[x]])){
vec_n<-df %>%
mutate(!!x := !is.na(!!sym(x))) %>%
summarise(n_unweighted= unweighted(n())) %>%
pull(n_unweighted)
vec_m<-df %>%
# mutate(!!x := !is.na(!!sym(x))) %>%
summarise(median= survey_median(!!sym(x),na.rm =T,vartype = "ci")) %>%
pull(median)
}
subset_names<- "dummy"
subset_vals<- "dummy"
}
if(length(vec_n)==0){
vec_n<-0
}
res<-df %>%
summarise(
stat=survey_mean(!!sym(x),na.rm=TRUE,vartype="ci"),
) %>%
mutate(variable_val=x) %>% # mean for intger
cbind(n_unweighted=vec_n)
if(!is.logical(df$variables[[x]])){
res <- res |> cbind(median = vec_m)
}
if(!is.null(disag)){
class(disag)
subset_names<- glue::glue("subset_{1:length(disag)}_name")
subset_vals<- glue::glue("subset_{1:length(disag)}_val")
# res<-
res<- res %>%
rename_at(.vars = disag,
.funs = function(x) glue::glue("subset_{1:length(x)}_val")) %>%
mutate_key_pair(names =subset_names,values = disag ) %>%
mutate_at(
.vars = subset_vals,.funs = function(x)as.character(x)
)
# res<-res %>%
# pivot_longer(disag,
# names_to="subset_name",
# values_to= "subset_value") %>%
# mutate(subset_value=as.character(subset_value))
}
res %>%
mutate(variable=sub(glue::glue('.[^\\{sm_sep}]*$'), '',
variable_val)) %>%
select(any_of(c ("variable",
"variable_val",
as.character(subset_names),
as.character(subset_vals))),
everything())
# dplyr::select(any_of(
# c("variable","variable_value","subset_name", "subset_value")
# ),
# everything())
}
#' @name survey_collapse_categorical_long
#' @rdname survey_collapse_categorical_long
#' @title Collapse categorical data into tidy long format
#'
#' @description `survey_collapse_categorical)long()` uses the srvyr [srvyr::survey_mean] & survey package [survey::svyciprop] methods
#' to collapse/or aggregate cateogrical data. This function can be used on its own, but was build mainly to for its use in [butteR::survey_collapse]
#' which is meant to help batch analyze data
#'
#' @param dfsvy a survey or preferably srvyr object
#' @param x columns to collapse
#' @param disag the columns to collapse/ subset by(analagous to [[dplyr::group_by]] to [[dplyr::summarise]]) flow
#' #' @param na_val if you want NA replaced by value. By default NA values will be removed prior to aggregation. It is recommended
#' that you do not adjust this value and deal with na values as a separate step
#' @param sm_sep select multiple parent child separator. This is specific for XLSForm data (default = /).
#' If using read_csv to read in data the separator will most likely be '/' where as if using read.csv it will likely be '.'
#' @return a long format data frame containing the collapsed data.
#'
#'
#' @export
survey_collapse_categorical_long<- function(df, x,disag=NULL,na_val=NA_character_) {
if(is.na(na_val)){
df<- df %>%
filter(!is.na(!!sym(x)))
}
if(!is.na(na_val)){
df %>%
mutate(
!!x:=ifelse(is.na(x), na_val,x)
)
}
if(!is.null(disag)){
group_by_vars<-syms(c(disag,x))
}else{
group_by_vars<-syms(c(x))
}
df<-df %>%
group_by(!!!group_by_vars,.drop=F)
res<-df %>%
summarise(
stat=survey_mean(na.rm=TRUE,vartype="ci"),
n_unweighted= unweighted(n())
) %>%
mutate(variable=x) %>%
rename(variable_val=x)
if(!is.null(disag)){
subset_names<- glue::glue("subset_{1:length(disag)}_name")
subset_vals<- glue::glue("subset_{1:length(disag)}_val")
res<- res %>%
rename_at(.vars = disag,
.funs = function(x) glue::glue("subset_{1:length(x)}_val")) %>%
mutate_key_pair(names =subset_names,values = disag ) %>%
mutate_at(
.vars = subset_vals,.funs = function(x)as.character(x)
)
}
res %>%
select(any_of(c ("variable",
"variable_val",
"subset_names", "subset_vals")),
everything())
}
#' @name survey_analysis
#' @rdname survey_analysis
#' @title Batch Collapse Survey Data into tidy long format
#'
#' @description `survey_analysis` uses the srvyr [srvyr::survey_mean] & survey package [survey::svymean] methods
#' to collapse/or aggregate survey data. This function uses `survey_collapse_categorical_long` and `survey_collapse_binary_long`
#' to perform the batch analysis. This function is extracted from butteR
#'
#' @param df a survey or preferably srvyr object
#' @param vars_to_analyze columns to collapse
#' @param disag the columns to collapse/ subset by(analagous to [[dplyr::group_by]] to [[dplyr::summarise]]) flow
#' @param na_val if you want NA replaced by value. By default NA values will be removed prior to aggregation. It is recommended
#' that you do not adjust this value and deal with na values as a separate step
#' @param sm_sep select multiple parent child separator. This is specific for XLSForm data (default = /).
#' @param kobo_path kobo tool path
#' @param question_lable A logical variable. Select TRUE if label from kobo is necessary in the analysis.
#' If using read_csv to read in data the separator will most likely be '/' where as if using read.csv it will likely be '.'
#' @return a long format data frame containing the collapsed data.
#'
#'
#' @export
survey_analysis<-function(df,
weights = F,
weight_column =NULL,
strata ,
vars_to_analyze=NULL,
disag=NULL,
na_val,
sm_sep="/",
question_lable = F,
kobo_path = NULL){
if(!is.null(vars_to_analyze)) {vars_to_analyze <- vars_to_analyze[vars_to_analyze %in% names(df)]}
if(is.null(vars_to_analyze)) {vars_to_analyze <- names(df)}
if(!is.null(weight_column)) {vars_to_analyze <- vars_to_analyze[!vars_to_analyze %in% weight_column]}
if(weights == T){
df <- as_survey(df,strata = strata, weight = weight_column)
}
if(weights == F){
df <- as_survey(df)
}
sm_parent_child_all<-auto_sm_parent_child(df$variables)
sm_parent_child_vars<- sm_parent_child_all %>%
filter(sm_parent %in% vars_to_analyze)
not_sm<-vars_to_analyze[!vars_to_analyze %in% sm_parent_child_vars$sm_parent]
vars_to_analyze<- c(not_sm, sm_parent_child_vars$sm_child)
######### check calculation type #############################################
calculation_type <- lapply(df$variables,class) %>% as.data.frame()
calculation_type <- calculation_type %>%
pivot_longer(cols = names(calculation_type),
names_to = "main_variable",values_to = "type") %>% mutate(
type = case_when(main_variable %in% sm_parent_child_all$sm_child ~ "logical", T~ type)
) %>% mutate(analysis_type = case_when(type %in% c("numeric","integer") ~ "mean",
type == "logical" ~ "prop_select_multiple",
T~ "prop_select_one")) %>% select(-type)
#########33
############################# NA RESPONSE #######################################################################################
na_response_rate <-get_na_response_rates(df$variables)
response_rate <- na_response_rate %>% mutate(
response_count = nrow(df$variables)- num_non_response
)
response_rate <- dplyr::filter(response_rate ,!grepl('\\.',question)) %>% tibble() %>% dplyr::select(question,response_count)
##############################################################################################################
if(!is.null(disag)){
vars_to_analyze<-setdiff(vars_to_analyze,disag )
}
res_list<-list()
for(i in vars_to_analyze){
print(i)
if(is.character(df$variables[[i]])|is.factor(df$variables[[i]])){
res_list[[i]] <-survey_collapse_categorical_long(df = df,
x = i,
disag = disag,
na_val = NA_character_
)
}
if(is.logical(df$variables[[i]])|is.numeric(df$variables[[i]])){
res_list[[i]] <-survey_collapse_binary_long(df = df,
x = i,
disag = disag,
na_val = NA_real_,
sm_sep = sm_sep
)
}
}
output_result<- bind_rows(res_list) %>% tidyr::separate(variable_val,
c("question", "options"),sep = "\\.",
extra='merge') %>% mutate(
main_variable = case_when(is.na(variable)| variable == "" ~question, T ~ variable),
choice = case_when(!is.na(options)|options!= ""~ options,T~question),
choice = case_when(main_variable == choice ~ NA_character_, T~ choice)
) %>% select(main_variable,choice,everything()) %>% select(-variable,-question,-options)
if(question_lable == T) {
read_sheets(kobo_path)
survey <- survey %>% select(name,starts_with("label::"))
choices <- choices %>% select(name,starts_with("label::"))%>% distinct(name,.keep_all = T)
names(choices) <- paste0("choice_", names(choices))
output_result<- output_result %>% left_join(survey,by = c("main_variable"= "name")) %>%
left_join(choices,by= c("choice"="choice_name")) %>% select(main_variable,starts_with("label::"),choice,starts_with("Choice_label"),everything())
output_result <- output_result %>% left_join(response_rate,by =c("main_variable"="question"))
}
if(question_lable == F) {
output_result <- output_result %>% left_join(response_rate,by =c("main_variable"="question"))
}
if(!is.null(disag)){
main_variable_list <- output_result$main_variable %>% unique()
count_by_location <- list()
for(i in main_variable_list){
df_2 <- df$variables %>% dplyr::select(i,disag) %>% dplyr::filter(!is.na(df$variables[i]))
count_by_location[[i]] <- df_2 %>% dplyr::group_by(!!!syms(disag)) %>% dplyr::summarise(
count_by_subset = n()) %>% mutate(
main_variable = i
)
}
count_by_location_df <- do.call("bind_rows",count_by_location)
for(i in 1:length(disag)){
new_col_name <- paste0("subset_",i,"_val")
count_by_location_df <- count_by_location_df %>% rename(
!!new_col_name:= disag[[i]]
)
}
count_by_location_df <- count_by_location_df %>% select(main_variable,count_by_subset,starts_with("subset_"))
output_result <- output_result %>% left_join(count_by_location_df) %>% distinct()
output_result <- output_result %>% left_join(calculation_type) %>% mutate(
analysis_type = case_when(is.na(choice) ~ "mean",
main_variable %in% sm_parent_child_all$sm_parent ~ "prop_select_multiple",
T~ analysis_type)
)
if(length(disag) == 1){output_result <- output_result %>% mutate(key_index = paste0(analysis_type," @/@ ", main_variable," ~/~ ",choice, " @/@ ",
subset_1_name, " ~/~ " , subset_1_val))}
if(length(disag) == 2){output_result <- output_result %>% mutate(key_index = paste0(analysis_type," @/@ ", main_variable," ~/~ ",choice, " @/@ ",
subset_1_name, " ~/~ " , subset_1_val, " ~/~ ",subset_2_name, " ~/~ " , subset_2_val))}
if(length(disag) == 3){output_result <- output_result %>% mutate(key_index = paste0(analysis_type," @/@ ", main_variable," ~/~ ",choice, " @/@ ",
subset_1_name, " ~/~ " , subset_1_val, " ~/~ ",subset_2_name, " ~/~ " , subset_2_val,
" ~/~ ",subset_3_name, " ~/~ " , subset_3_val))}
if(length(disag) == 4){output_result <- output_result %>% mutate(key_index = paste0(analysis_type," @/@ ", main_variable," ~/~ ",choice, " @/@ ",
subset_1_name, " ~/~ " , subset_1_val, " ~/~ ",subset_2_name, " ~/~ " , subset_2_val,
" ~/~ ",subset_3_name, " ~/~ " , subset_3_val,
" ~/~ ",subset_4_name, " ~/~ " , subset_4_val))}
if(length(disag) == 5){output_result <- output_result %>% mutate(key_index = paste0(analysis_type," @/@ ", main_variable," ~/~ ",choice, " @/@ ",
subset_1_name, " ~/~ " , subset_1_val,
" ~/~ ",subset_2_name, " ~/~ " , subset_2_val,
" ~/~ ",subset_3_name, " ~/~ " , subset_3_val,
" ~/~ ",subset_4_name, " ~/~ " , subset_4_val,
" ~/~ ",subset_5_name, " ~/~ " , subset_5_val))}
output_result <- output_result %>% relocate(analysis_type,.after = last_col()) %>% relocate(key_index,.after = last_col())
}
if(is.null(disag)){
output_result <- output_result %>% left_join(calculation_type) %>% mutate(
analysis_type = case_when(is.na(choice) ~ "mean",
main_variable %in% unique(sm_parent_child_all$sm_parent) ~ "prop_select_multiple",
T~ analysis_type)
) %>% relocate(response_count, .after = last_col())
output_result <- output_result %>% mutate(
key_index = paste0(analysis_type," @/@ ", main_variable," ~/~ ",choice, " @/@ ",
"NA ~/~ NA")
) %>% relocate(analysis_type,.after = last_col()) %>% relocate(key_index,.after = last_col())
}
output_result
}
#' @name mutate_key_pair
#' @rdname mutate_key_pair
#' @title Mutate columns on based on a list of names and values
#'
#' @description conditionally mutate on columns based
#' on a list of column names and values. This is mostly useful for conditional
#' mutate commands and can currently only mutate uniform columns.
#' It is used inside the survey collapse functions
#'
#' @param df dataframe
#' @param names names of columns to mutate
#' @param values uniform values to mutate
mutate_key_pair<- function(df, names, values){
df %>%
tibble::add_column(!!!set_names(as.list(values),nm=names))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.