R/TFstats.R

Defines functions TFstats

TFstats <- function(input_stats,
                    batch_time = NULL,
                    cutoff_stats_flags,
                    max_chart_return,
                    max_chart_iterate,
                    number_of_unique_topline_charts = 1000,
                    number_of_unique_seg_and_crosstab_charts = 1000){
  # number_of_unique_topline_charts = 1000
  # number_of_unique_seg_and_crosstab_charts = 1000
  # input_stats <- readRDS("~/TrendFinder/Outputs/2022-08-11/Output - Responses - Formatted - Batch Time 2022-08-11 09_23_26 EDT.rds")
  # input_stats <- input_stats[which(input_stats$`Banner QID` %in% c(484, 6636, 161322, 161324, 15280)), ]

  num_cores <- detectCores()
  
  NA_stem_group_IDs <- which(is.na(input_stats$`Stem Group ID`))
  NA_banner_group_IDs <- which(is.na(input_stats$`Banner Group ID`))
  
  if(length(NA_stem_group_IDs) > 0){
    input_stats$`Stem Group ID`[which(is.na(input_stats$`Stem Group ID`))] <- 0
  }
  if(length(NA_banner_group_IDs) > 0){
    input_stats$`Banner Group ID`[which(is.na(input_stats$`Banner Group ID`))] <- 0
  }
  
  
  input_cols <- colnames(input_stats)
  
  NA_stem_QID_rows <- which(is.na(input_stats$`Stem QID`))
  NA_banner_QID_rows <- which(is.na(input_stats$`Banner QID`))
  NA_rows <- c(NA_stem_QID_rows, NA_banner_QID_rows)
  
  if(length(NA_rows) > 0){
    input_stats <- input_stats[-NA_rows, ]
  }
  
  id_colnames <- c("Weighting Scheme", "Stem Answer ID", "Stem Group ID", "Banner Answer ID", "Banner Group ID")
  id_cols <- which(input_cols %in% id_colnames)
  
  response_cols <- c(grep('response count', input_cols))
  total_cols <- c(grep('total responses', input_cols))
  
  # Drop any rows that aren't "interesting"
  input_stats_subset <- input_stats[input_stats$`Answer Flag` == 2, ]
  
  # Drop any rows with less than 100 in the relevant columns for stats testing
  input_stats_subset <- input_stats_subset %>% filter_at(vars(total_cols[length(total_cols)]), any_vars(. >= 100))
  if(length(total_cols) > 1){
    input_stats_subset <- input_stats_subset %>% filter_at(vars(total_cols[length(total_cols) - 1]), any_vars(. >= 100))
  }
  if(length(total_cols) > 2){
    input_stats_subset <- input_stats_subset %>% filter_at(vars(total_cols[length(total_cols) - 2]), any_vars(. >= 100))
  }
  
  # All the emotions questions highly correlate with each other. There may be other groups that are wise to exclude from each other
  stem_emotion_rows <- NULL
  banner_emotion_rows <- NULL
  
  emotion_questions_strings <- paste0('^', emotion_questions, '$')
  
  
  for(i in 1:length(emotion_questions_strings)){
    stem_emotion_rows_next <- grep(emotion_questions_strings[i], input_stats_subset$`Stem QID`)
    
    banner_emotion_rows_next <- grep(emotion_questions_strings[i], input_stats_subset$`Banner QID`)
    
    stem_emotion_rows <- c(stem_emotion_rows, stem_emotion_rows_next) %>% unique()
    banner_emotion_rows <- c(banner_emotion_rows, banner_emotion_rows_next) %>% unique()
  }
  
  
  stem_emotion_question_indices <- which(stem_emotion_rows %in% banner_emotion_rows)
  remove_rows <- stem_emotion_rows[stem_emotion_question_indices] %>% unique()
  
  input_stats_subset <- input_stats_subset[-remove_rows, ]
  
  unique_input_rows <- input_stats_subset[, id_colnames] %>% transpose() %>% as.list()
  
  df_response <- input_stats_subset[, c(id_cols, response_cols)]
  colnames(df_response) <- gsub(' - response count', '', colnames(df_response))
  
  df_total <- input_stats_subset[, c(id_cols, total_cols)]
  colnames(df_total) <- gsub(' - total responses', '', colnames(df_total))
  
  names_to_def <- c("start_date", "end_date")
  
  df_response_longer <- pivot_longer(df_response, cols = contains(" - "),
                                     names_to = names_to_def,
                                     names_sep = " - ",
                                     values_to = "response count")
  
  df_total_longer <- pivot_longer(df_total, cols = contains(" - "),
                                  names_to = names_to_def,
                                  names_sep = " - ",
                                  values_to = "total responses")
  
  df <- merge(df_response_longer, df_total_longer, by = c(id_colnames, names_to_def)) %>%
    .[!duplicated(.), ]
  
  stems <-  df[, c('Weighting Scheme', 'Stem Answer ID', 'Stem Group ID')] %>%
    .[!duplicated(.), ]
  banners <-  df[, c('Weighting Scheme', 'Banner Answer ID', 'Banner Group ID')] %>%
    .[!duplicated(.), ]
  start_and_end_dates <- df[, c('start_date', 'end_date')] %>%
    .[!duplicated(.), ] %>%
    setorder(., -"start_date")
  
  row.names(start_and_end_dates) <- NULL
  
  number_of_time_periods <- nrow(start_and_end_dates)
  
  if(number_of_time_periods < 3){
    total_date_periods <- number_of_time_periods
    total_seg_periods <- number_of_time_periods
  } else {
    total_date_periods <- 3
    total_seg_periods <- 2
  }
  

  
  # Test the last period with (up to) the two periods directly before it for significance
  if(total_date_periods > 1){
    date_sig_table <- mclapply(unique_input_rows, TFdatePropTest,
                               total_date_periods = total_date_periods,
                               df = df,
                               start_and_end_dates = start_and_end_dates,
                               id_colnames = id_colnames,
                               mc.cores = num_cores) %>%
      rbindlist()
  } else{
    date_sig_table <- NULL
  }
  
  
  unique_banners <- input_stats_subset[, c("Weighting Scheme", "Banner Answer ID", "Banner Group ID")] %>%
    .[!duplicated(.), ] %>%
    transpose() %>%
    as.list()
  
  
  for(i in total_seg_periods:1){
    
    seg_sig_table_interim <- mclapply(unique_banners, TFsegPropTest,
                                      start_date = start_and_end_dates$start_date[i],
                                      end_date = start_and_end_dates$end_date[i],
                                      id_colnames = id_colnames,
                                      df = df,
                                      mc.cores = num_cores) %>%
      rbindlist(., fill = TRUE)
    
    remove_date_columns <- -grep('date', colnames(seg_sig_table_interim))
    remove_count_columns <- -grep('response', colnames(seg_sig_table_interim))
    remove_columns <- c(remove_date_columns, remove_count_columns)
    
    seg_sig_table_interim <- seg_sig_table_interim[, ..remove_columns]
    
    if(i == total_seg_periods){
      seg_sig_table <- seg_sig_table_interim
    } else{
      seg_sig_table <- merge(seg_sig_table, seg_sig_table_interim, by = id_colnames)
    }
    
  }
  
  output_stats <- left_join(input_stats, date_sig_table, by = id_colnames) %>%
    left_join(., seg_sig_table, by = id_colnames)
  
  sig_columns <- grep('significance', colnames(output_stats))
  
  
  # Create identifier for how many times a row has recently had statistical significance either in time or relative to topline
  
  output_stats$`Stats Flag` <- rowSums(as.data.frame(output_stats[, sig_columns]), na.rm = TRUE) %>%
    abs()
  
  output_stats_topline <- output_stats[which(output_stats$`Stem QID` == 0), ] %>%
    .[which(.$`Answer Flag` == 2), ]
  output_stats_seg_and_crosstab <- output_stats[which(output_stats$`Stem QID` != 0), ] %>%
    .[which(.$`Answer Flag` == 2), ]
  
  # Adjust for topline by using cutoff values among toplines and crosstabs separately
  cutoff_stats_flags_topline <- cutoff_stats_flags
  
  
  while(number_of_unique_topline_charts > max_chart_return){
    output_stats_topline$decile <- ntile(output_stats_topline$`Stats Flag`, cutoff_stats_flags_topline)
    number_of_unique_topline_charts <- output_stats_topline$`Banner QID`[which(output_stats_topline$decile == cutoff_stats_flags_topline)] %>%
      length()
    
    cutoff_stats_flags_topline <- cutoff_stats_flags_topline + max_chart_iterate
  }
  
  cutoff_stats_flags_seg_and_crosstab <- cutoff_stats_flags
  
  while(number_of_unique_seg_and_crosstab_charts > max_chart_return){
    output_stats_seg_and_crosstab$decile <- ntile(output_stats_seg_and_crosstab$`Stats Flag`, cutoff_stats_flags_seg_and_crosstab)
    number_of_unique_seg_and_crosstab_charts <- output_stats_seg_and_crosstab$`Banner QID`[which(output_stats_seg_and_crosstab$decile == cutoff_stats_flags_seg_and_crosstab)] %>%
      length()
    
    cutoff_stats_flags_seg_and_crosstab <- cutoff_stats_flags_seg_and_crosstab + max_chart_iterate
  }
  
  if(length(output_stats_topline$decile) != 0){
    output_stats_topline <- output_stats_topline[which(output_stats_topline$decile == max(output_stats_topline$decile)), ]
  }
  
  if(length(output_stats_seg_and_crosstab$decile) != 0){
    output_stats_seg_and_crosstab <- output_stats_seg_and_crosstab[which(output_stats_seg_and_crosstab$decile == max(output_stats_seg_and_crosstab$decile)), ]
  }
  
  output_stats_deciles <- rbind(output_stats_topline, output_stats_seg_and_crosstab)
  output_stats <- left_join(output_stats, output_stats_deciles, by = colnames(output_stats))
  output_stats$Chart[which(!is.na(output_stats$decile))] <- 1
  output_stats <- output_stats[ , -which(colnames(output_stats) == 'decile')]
  
  
  if(!is.null(batch_time)){
    output_statsName <- outputName("Output - Responses - Formatted with Stats", batch_time = batch_time)
    
  } else{
    output_statsName <- outputName("Output - Responses - Formatted with Stats")
  }
  
  # saveRDS(output_stats, file = paste0(output_statsName, " - ", Sys.time(), ".rds"))
  # write.table(output_stats[which(output_stats$`Answer Flag` == 2),], file=paste0(output_statsName, " - ", Sys.time(), ".tsv"), quote=TRUE, sep='\t', row.names=FALSE)
  
  return(output_stats)
  
  
  
}
emerson-civicscience/trendfinder documentation built on Sept. 4, 2022, 10:30 a.m.