TFstats <- function(input_stats,
batch_time = NULL,
cutoff_stats_flags,
max_chart_return,
max_chart_iterate,
number_of_unique_topline_charts = 1000,
number_of_unique_seg_and_crosstab_charts = 1000){
# number_of_unique_topline_charts = 1000
# number_of_unique_seg_and_crosstab_charts = 1000
# input_stats <- readRDS("~/TrendFinder/Outputs/2022-08-11/Output - Responses - Formatted - Batch Time 2022-08-11 09_23_26 EDT.rds")
# input_stats <- input_stats[which(input_stats$`Banner QID` %in% c(484, 6636, 161322, 161324, 15280)), ]
num_cores <- detectCores()
NA_stem_group_IDs <- which(is.na(input_stats$`Stem Group ID`))
NA_banner_group_IDs <- which(is.na(input_stats$`Banner Group ID`))
if(length(NA_stem_group_IDs) > 0){
input_stats$`Stem Group ID`[which(is.na(input_stats$`Stem Group ID`))] <- 0
}
if(length(NA_banner_group_IDs) > 0){
input_stats$`Banner Group ID`[which(is.na(input_stats$`Banner Group ID`))] <- 0
}
input_cols <- colnames(input_stats)
NA_stem_QID_rows <- which(is.na(input_stats$`Stem QID`))
NA_banner_QID_rows <- which(is.na(input_stats$`Banner QID`))
NA_rows <- c(NA_stem_QID_rows, NA_banner_QID_rows)
if(length(NA_rows) > 0){
input_stats <- input_stats[-NA_rows, ]
}
id_colnames <- c("Weighting Scheme", "Stem Answer ID", "Stem Group ID", "Banner Answer ID", "Banner Group ID")
id_cols <- which(input_cols %in% id_colnames)
response_cols <- c(grep('response count', input_cols))
total_cols <- c(grep('total responses', input_cols))
# Drop any rows that aren't "interesting"
input_stats_subset <- input_stats[input_stats$`Answer Flag` == 2, ]
# Drop any rows with less than 100 in the relevant columns for stats testing
input_stats_subset <- input_stats_subset %>% filter_at(vars(total_cols[length(total_cols)]), any_vars(. >= 100))
if(length(total_cols) > 1){
input_stats_subset <- input_stats_subset %>% filter_at(vars(total_cols[length(total_cols) - 1]), any_vars(. >= 100))
}
if(length(total_cols) > 2){
input_stats_subset <- input_stats_subset %>% filter_at(vars(total_cols[length(total_cols) - 2]), any_vars(. >= 100))
}
# All the emotions questions highly correlate with each other. There may be other groups that are wise to exclude from each other
stem_emotion_rows <- NULL
banner_emotion_rows <- NULL
emotion_questions_strings <- paste0('^', emotion_questions, '$')
for(i in 1:length(emotion_questions_strings)){
stem_emotion_rows_next <- grep(emotion_questions_strings[i], input_stats_subset$`Stem QID`)
banner_emotion_rows_next <- grep(emotion_questions_strings[i], input_stats_subset$`Banner QID`)
stem_emotion_rows <- c(stem_emotion_rows, stem_emotion_rows_next) %>% unique()
banner_emotion_rows <- c(banner_emotion_rows, banner_emotion_rows_next) %>% unique()
}
stem_emotion_question_indices <- which(stem_emotion_rows %in% banner_emotion_rows)
remove_rows <- stem_emotion_rows[stem_emotion_question_indices] %>% unique()
input_stats_subset <- input_stats_subset[-remove_rows, ]
unique_input_rows <- input_stats_subset[, id_colnames] %>% transpose() %>% as.list()
df_response <- input_stats_subset[, c(id_cols, response_cols)]
colnames(df_response) <- gsub(' - response count', '', colnames(df_response))
df_total <- input_stats_subset[, c(id_cols, total_cols)]
colnames(df_total) <- gsub(' - total responses', '', colnames(df_total))
names_to_def <- c("start_date", "end_date")
df_response_longer <- pivot_longer(df_response, cols = contains(" - "),
names_to = names_to_def,
names_sep = " - ",
values_to = "response count")
df_total_longer <- pivot_longer(df_total, cols = contains(" - "),
names_to = names_to_def,
names_sep = " - ",
values_to = "total responses")
df <- merge(df_response_longer, df_total_longer, by = c(id_colnames, names_to_def)) %>%
.[!duplicated(.), ]
stems <- df[, c('Weighting Scheme', 'Stem Answer ID', 'Stem Group ID')] %>%
.[!duplicated(.), ]
banners <- df[, c('Weighting Scheme', 'Banner Answer ID', 'Banner Group ID')] %>%
.[!duplicated(.), ]
start_and_end_dates <- df[, c('start_date', 'end_date')] %>%
.[!duplicated(.), ] %>%
setorder(., -"start_date")
row.names(start_and_end_dates) <- NULL
number_of_time_periods <- nrow(start_and_end_dates)
if(number_of_time_periods < 3){
total_date_periods <- number_of_time_periods
total_seg_periods <- number_of_time_periods
} else {
total_date_periods <- 3
total_seg_periods <- 2
}
# Test the last period with (up to) the two periods directly before it for significance
if(total_date_periods > 1){
date_sig_table <- mclapply(unique_input_rows, TFdatePropTest,
total_date_periods = total_date_periods,
df = df,
start_and_end_dates = start_and_end_dates,
id_colnames = id_colnames,
mc.cores = num_cores) %>%
rbindlist()
} else{
date_sig_table <- NULL
}
unique_banners <- input_stats_subset[, c("Weighting Scheme", "Banner Answer ID", "Banner Group ID")] %>%
.[!duplicated(.), ] %>%
transpose() %>%
as.list()
for(i in total_seg_periods:1){
seg_sig_table_interim <- mclapply(unique_banners, TFsegPropTest,
start_date = start_and_end_dates$start_date[i],
end_date = start_and_end_dates$end_date[i],
id_colnames = id_colnames,
df = df,
mc.cores = num_cores) %>%
rbindlist(., fill = TRUE)
remove_date_columns <- -grep('date', colnames(seg_sig_table_interim))
remove_count_columns <- -grep('response', colnames(seg_sig_table_interim))
remove_columns <- c(remove_date_columns, remove_count_columns)
seg_sig_table_interim <- seg_sig_table_interim[, ..remove_columns]
if(i == total_seg_periods){
seg_sig_table <- seg_sig_table_interim
} else{
seg_sig_table <- merge(seg_sig_table, seg_sig_table_interim, by = id_colnames)
}
}
output_stats <- left_join(input_stats, date_sig_table, by = id_colnames) %>%
left_join(., seg_sig_table, by = id_colnames)
sig_columns <- grep('significance', colnames(output_stats))
# Create identifier for how many times a row has recently had statistical significance either in time or relative to topline
output_stats$`Stats Flag` <- rowSums(as.data.frame(output_stats[, sig_columns]), na.rm = TRUE) %>%
abs()
output_stats_topline <- output_stats[which(output_stats$`Stem QID` == 0), ] %>%
.[which(.$`Answer Flag` == 2), ]
output_stats_seg_and_crosstab <- output_stats[which(output_stats$`Stem QID` != 0), ] %>%
.[which(.$`Answer Flag` == 2), ]
# Adjust for topline by using cutoff values among toplines and crosstabs separately
cutoff_stats_flags_topline <- cutoff_stats_flags
while(number_of_unique_topline_charts > max_chart_return){
output_stats_topline$decile <- ntile(output_stats_topline$`Stats Flag`, cutoff_stats_flags_topline)
number_of_unique_topline_charts <- output_stats_topline$`Banner QID`[which(output_stats_topline$decile == cutoff_stats_flags_topline)] %>%
length()
cutoff_stats_flags_topline <- cutoff_stats_flags_topline + max_chart_iterate
}
cutoff_stats_flags_seg_and_crosstab <- cutoff_stats_flags
while(number_of_unique_seg_and_crosstab_charts > max_chart_return){
output_stats_seg_and_crosstab$decile <- ntile(output_stats_seg_and_crosstab$`Stats Flag`, cutoff_stats_flags_seg_and_crosstab)
number_of_unique_seg_and_crosstab_charts <- output_stats_seg_and_crosstab$`Banner QID`[which(output_stats_seg_and_crosstab$decile == cutoff_stats_flags_seg_and_crosstab)] %>%
length()
cutoff_stats_flags_seg_and_crosstab <- cutoff_stats_flags_seg_and_crosstab + max_chart_iterate
}
if(length(output_stats_topline$decile) != 0){
output_stats_topline <- output_stats_topline[which(output_stats_topline$decile == max(output_stats_topline$decile)), ]
}
if(length(output_stats_seg_and_crosstab$decile) != 0){
output_stats_seg_and_crosstab <- output_stats_seg_and_crosstab[which(output_stats_seg_and_crosstab$decile == max(output_stats_seg_and_crosstab$decile)), ]
}
output_stats_deciles <- rbind(output_stats_topline, output_stats_seg_and_crosstab)
output_stats <- left_join(output_stats, output_stats_deciles, by = colnames(output_stats))
output_stats$Chart[which(!is.na(output_stats$decile))] <- 1
output_stats <- output_stats[ , -which(colnames(output_stats) == 'decile')]
if(!is.null(batch_time)){
output_statsName <- outputName("Output - Responses - Formatted with Stats", batch_time = batch_time)
} else{
output_statsName <- outputName("Output - Responses - Formatted with Stats")
}
# saveRDS(output_stats, file = paste0(output_statsName, " - ", Sys.time(), ".rds"))
# write.table(output_stats[which(output_stats$`Answer Flag` == 2),], file=paste0(output_statsName, " - ", Sys.time(), ".tsv"), quote=TRUE, sep='\t', row.names=FALSE)
return(output_stats)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.