R/engage.R

Defines functions engage

#' @export

engage <- function(bi.user = NULL,
									 bi.password = NULL,
									 batch_time = NULL,
									 stem_questions = NULL,
									 banner_questions = NULL,
									 segment_list = NULL,
									 segment_names = NULL,
									 segment_crosstabs = FALSE,
									 crosstab_input = NULL,
									 stem_start_dates = NULL,
									 stem_end_dates = NULL,
									 start_dates = "2021-01-01",
									 end_dates = NULL,
									 time_period = NULL,
									 date_stem_and_banner = FALSE,
									 weighting_schemes = NULL,
									 run_topline = TRUE,
									 run_crosstabs = TRUE,
									 manual_crosstab_input = NULL,
									 use_manual_crosstab_only = TRUE,
									 format_output = TRUE,
									 run_stats = FALSE,
									 use_default_answer_flag = FALSE,
									 cutoff_stats_flags = 10,
									 max_chart_return = 50,
									 max_chart_iterate = 5,
									 use_tags = TRUE,
									 must_plot = NULL,
									 plot_all = FALSE,
									 python_loc = NULL,
									 python_loc_movers = NULL,
									 plot_stem_override = FALSE,
									 letter_stats_output = FALSE,
									 period_type = "quarter",
									 baseline_name = NULL,
									 recent_movers_output = FALSE,
									 recent_movers_segments = NULL,
									 recent_movers_banners = NULL,
									 ancestry_output = FALSE,
									 make_charts = FALSE,
									 notes = NULL){

	stem_questions <- unique(stem_questions)
	banner_questions <- unique(banner_questions)
	manual_crosstab_input <- manual_crosstab_input[!duplicated(manual_crosstab_input), ]

	weighting_dict <- TFweights(weighting_schemes,
	                            bi.user = bi.user,
	                            bi.password = bi.password)

	if(is.null(end_dates)){
		data_end_dates <- TFdateHandler(start_dates)
	} else{
		data_end_dates <- TFdateHandler(start_dates, data_end_dates = end_dates)
	}
	
	if(!is.null(stem_start_dates)){
  	if(is.null(stem_end_dates)){
  	  stem_end_dates <- TFdateHandler(stem_start_dates)   
  	  } else{
  	    stem_end_dates <- TFdateHandler(stem_start_dates, data_end_dates = stem_end_dates)
  	  }
	}
	
	if(length(time_period) != length(start_dates)){
		time_period <- start_dates
	}

	questionList <- TFtoplineQuestionHandler(stem_questions = stem_questions,
																					 banner_questions = banner_questions,
																					 weighting_schemes = weighting_schemes)


	# TFcrosstabQuestionHandler() outputs a data frame with three columns: stem, banner, weights
	crosstab_input <- TFcrosstabQuestionHandler(manual_crosstab_input = manual_crosstab_input,
																							use_manual_crosstab_only = use_manual_crosstab_only,
																							stem_questions = stem_questions,
																							banner_questions = banner_questions,
																							weighting_schemes = weighting_schemes)


	### The function outputFilePathMaker generates the filepath for the output. Specifying fileLocation
	### will create a subfolder called "Outputs" in the fileLocation directory
	# outputDate <- today() - 1 # The default date in outputFilePathMaker is today()
	outputFilePath <- outputFilePathMaker()
	
	# save_variables_all <- readRDS('~/TrendFinder/Outputs/save_variables.rds')
	# save_variables <- NULL
	# 
	# for(i in save_variables_all){
	#   next_variable <- i[!is.null(eval(as.symbol(i)))]
	#   save_variables <- c(save_variables, next_variable)
	# }
	# 
	# save_variables <- ls()[which(ls() %in% save_variables)]

	image_file_name <- file.path(outputFilePath, paste0("TrendFinder Environment ", batch_time, ".RData"))
	save.image(file = image_file_name)
	
	# if(!is.null(crosstab_input)){
	#   
	#   crosstabStemList <- crosstab_input[, c("stem", "weight")]
	#   colnames(crosstabStemList) <- colnames(questionList)
	#   crosstabBannerList <- crosstab_input[, c("banner", "weight")]
	#   crosstabQuestionList <- rbind(crosstabStemList, crosstabBannerList) %>%
	#     .[!duplicated(.), ] 
	# 	
	# }

	# If no questions are specified for topline results but topline results are requested, create topline list of all
	# unique questions in crosstab input.
	if(is.null(questionList)){
		if(!is.null(crosstab_input)){
			questionList <- crosstabQuestionList
		} else{
			print("You don\'t have a topline list or a crosstab table/tibble to input.")
			stop()
		}
	} else{
	  questionList <- rbind(questionList, crosstabQuestionList) %>%
	    .[!duplicated(.), ] %>%
	    arrange(., banner)
	}

	anti_join_columns <- c("stem_start_date", "stem_end_date", "start_date", "end_date", "stem", "banner", "weighting_scheme") # Used for matching history with current preconditions
	condition_columns <- c("banner", "precondition", "weighting_scheme") # Used to remove unwanted columns from precondition tables

	batch_time_char <- format(batch_time, "%Y-%m-%d %H:%M:%S %Z")
	
	num_cores <- detectCores()

	if(run_topline){
	  


		questionListTopline <- transpose(questionList) %>%
			as.list()

		toplineConditions <- mclapply(questionListTopline, TFtoplinePreconditions,
		                              data_start_dates = start_dates,
																	data_end_dates = data_end_dates,
																	mc.cores = num_cores) %>%
			rbindlist()


		### Each time new data is computed for TrendFinder, it will be pushed to two files (essentially database tables):
		##### '~/TrendFinder/Outputs/trendfinder_results.rds'
		##### '~/TrendFinder/Outputs/trendfinder_history.rds'
		### trendfinder_history is a much smaller file as it is just metadat about when/how the data was computed and the
		### the unique topline quesetion ID/crosstab question IDs rather than the answer IDs and actual data
		### If the table trendfinder_history has a matching row for start_date, end_date, weights, stem, and banner,
		### it is excluded with the anti_join() function
		
		trendfinder_history <- readRDS('~/TrendFinder/Outputs/trendfinder_history.rds')

		toplineConditionsDeduped <- anti_join(toplineConditions, trendfinder_history, by=all_of(anti_join_columns))
		
		rm(list='trendfinder_history')
		gc()

		if(nrow(toplineConditionsDeduped) == 0){
		  toplineResults <- NULL
			outputResults <- NULL
			trendfinder_history_update <- NULL
			# allConditions <- toplineConditions
		} else{

			trendfinder_history_update <- toplineConditionsDeduped %>% select(all_of(anti_join_columns))

			toplineConditionsDeduped <- as_tibble(toplineConditionsDeduped)

			toplineConditionsDeduped <- toplineConditionsDeduped[, condition_columns]

			toplineConditionsList <- transpose(toplineConditionsDeduped) %>%
				as.list()


			toplineResults <- mclapply(toplineConditionsList, TFtopline,
																 mc.cores = num_cores) %>%
				rbindlist()

			# fileName <- outputName("Topline Results", batch_time = batch_time_char)
			# saveRDS(toplineResults, paste0(fileName, ".rds"))

			outputResults <- toplineResults
		}

		if(!is.null(outputResults)){
			outputResults$data.banner <- as.character(outputResults$data.banner)
		}

		allConditions <- toplineConditions %>% select(all_of(anti_join_columns))


	} else{

		toplineResults <- NULL
		outputResults <- NULL
		trendfinder_history_update <- NULL
		allConditions <- NULL

	}

	if(!is.null(segment_list)){

		questionListUnique <- unique(questionList$banner)

		segmentConditions <- mclapply(questionListUnique, TFsegmentPreconditions,
																	segment_list = segment_list,
																	stem_start_dates = stem_start_dates,
																	stem_end_dates = stem_end_dates,
																	data_start_dates = start_dates,
																	data_end_dates = data_end_dates,
																	date_stem_and_banner = date_stem_and_banner,
																	mc.cores = num_cores) %>%
			rbindlist()
		
		segment_names <- unique(segmentConditions$stem)

		weightingDictSegments <- tibble(weighting_scheme = segmentConditions$weighting_scheme, value = segmentConditions$weights) %>%
			unique()
		# packageVersion('tibble') # Currently using 2.1.3 and get https://github.com/tidyverse/tibble/issues/798
		
		trendfinder_history <- readRDS('~/TrendFinder/Outputs/trendfinder_history.rds')
		
		segmentConditionsDeduped <- anti_join(segmentConditions, trendfinder_history, by = all_of(anti_join_columns))
		
		rm(list='trendfinder_history')
		gc()
		

		if(nrow(segmentConditionsDeduped) == 0){
			segmentResultsChar <- NULL
		} else{
		  
		  
		  trendfinder_history_update <- rbind(trendfinder_history_update, segmentConditionsDeduped[, ..anti_join_columns])

			segmentConditionsDedupedSubset <- segmentConditionsDeduped[ , ..condition_columns]

			### Remove rows based on prior results run by TrendFinder
			### See comment just after toplineConditions (above) for more info

			segmentConditionsList <- transpose(segmentConditionsDedupedSubset) %>%
				as.list()

			segmentResults <- mclapply(segmentConditionsList, TFsegment,
																 weightingDictSegments = weightingDictSegments,
																 mc.cores = num_cores) %>%
				rbindlist()

			# segmentResults <- TFoutputResultsFormat(segmentResults, batch_time = batch_time_char)
			
			segmentConditionsDeduped <- as.data.frame(segmentConditionsDeduped)

			segmentConditionsDeduped <- segmentConditionsDeduped[, anti_join_columns] ### R is seemingly schizophrenic about whether it should be `..anti_join_columns` or `anti_join_columns`
			
			
			
			segmentResultsChar <- segmentResults
			segmentResultsChar$data.banner <- as.character(segmentResults$data.banner)
			
		}

		# fileName <- outputName("Segment Results", batch_time = batch_time_char)
		# saveRDS(segmentResults, paste0(fileName, ".rds"))

		
		# segmentResultsChar$batch <- batch_time_char
		# colnames(segmentResultsChar)[grep('data.', colnames(segmentResultsChar))] <- gsub('data.', '', colnames(segmentResultsChar)[grep('data', colnames(segmentResultsChar))])
		# colnames(segmentResultsChar)[grep('response.count', colnames(segmentResultsChar))] <- gsub('response.count', 'response_count', colnames(segmentResultsChar)[grep('response.count', colnames(segmentResultsChar))])

		outputResults <- rbind(outputResults, segmentResultsChar)
		
		segment_conditions_for_all <- segmentConditions[, ..anti_join_columns]
		allConditions <- rbind(allConditions, segment_conditions_for_all)
		segmentConditionsDeduped <- as.data.frame(segmentConditionsDeduped)
		trendfinder_history_update <- rbind(trendfinder_history_update, segmentConditionsDeduped[, anti_join_columns])

	}


	if(run_crosstabs){
		crosstabRows <- transpose(crosstab_input) %>%
			as.list()

		crosstabConditions <- mclapply(crosstabRows, TFcrosstabPreconditions,
		                               segment_list = segment_list,
		                               stem_start_dates = stem_start_dates,
		                               stem_end_dates = stem_end_dates,
																	 data_start_dates = start_dates,
																	 data_end_dates = data_end_dates,
																	 date_stem_and_banner = date_stem_and_banner,
																	 mc.cores = num_cores) %>%
			rbindlist()

		if(segment_crosstabs){
		  if(!is.null(segment_list)){
		    segment_crosstab_join <- left_join(crosstabConditions[, c("stem_start_date", "stem_end_date", "start_date", "end_date", "stem", "banner")], 
		                                       segmentConditions[, c("stem_start_date", "stem_end_date", "start_date", "end_date", "banner", "precondition", "weighting_scheme")], 
		                                       by = c("start_date", "end_date", "banner"))
		    crosstabConditions <- rbind(crosstabConditions, segment_crosstab_join)
		  }
		}
		
		# crosstabConditions$stem_start_date[crosstabConditions$stem_start_date == "NA"] <- NA
		# crosstabConditions$stem_end_date[crosstabConditions$stem_end_date == "NA"] <- NA
		
		
		### Remove rows based on prior results run by TrendFinder
		### See comment just after toplineConditions (above) for more info
		
		trendfinder_history <- readRDS('~/TrendFinder/Outputs/trendfinder_history.rds')
		
		crosstabConditionsDeduped <- anti_join(crosstabConditions, trendfinder_history, by=all_of(anti_join_columns))
		
		rm(list='trendfinder_history')
		gc()
    
    
		if(nrow(crosstabConditionsDeduped) == 0){
			crosstabResults <- NULL
		} else{
		  
			trendfinder_history_update <- rbind(trendfinder_history_update, crosstabConditionsDeduped %>% select(all_of(anti_join_columns)))
			
			crosstabConditionsDeduped <- as_tibble(crosstabConditionsDeduped)
			
			crosstabConditionsDeduped <- crosstabConditionsDeduped[, c("start_date", "end_date", "stem", condition_columns)]
			
			remove_stem <- which(is.na(crosstabConditionsDeduped$stem))
			remove_banner <- which(is.na(crosstabConditionsDeduped$banner))
			remove_rows <- c(remove_stem, remove_banner) %>% unique()
			
			if(!length(remove_rows) == 0){
			  crosstabConditionsDeduped <- crosstabConditionsDeduped[-remove_rows, ]
			}
			
			crosstabConditionsList <- transpose(crosstabConditionsDeduped) %>%
				as.list()
			
			crosstabResults <- mclapply(crosstabConditionsList, TFcrosstab,
																	weighting_dict = weighting_dict,
																	mc.cores = num_cores) %>%
				rbindlist()
		}

		# fileName <- outputName("Crosstab Results", batch_time = batch_time_char)
		# saveRDS(crosstabResults, paste0(fileName, ".rds"))

		crosstabResultsChar <- crosstabResults

		if(!is.null(crosstabResultsChar)){
			crosstabResultsChar$data.stem <- as.character(crosstabResultsChar$data.stem)
		}

		# crosstabResultsChar$data.banner <- as.character(crosstabResultsChar$data.banner)

		outputResults <- rbind(outputResults, crosstabResultsChar)
		crosstabConditions <- crosstabConditions %>% select(anti_join_columns)
		allConditions <- rbind(allConditions, crosstabConditions)



	} else{
		crosstabResults <- NULL
	}

	output_end_time <- Sys.time()
	elapsed_time <- output_end_time - batch_time
	print(elapsed_time) # For informational/diagnostic use
	# fileName <- outputName("Output Results", batch_time = batch_time_char)
	

	if(!is.null(outputResults)){
	  outputResults <- as_tibble(outputResults)

		outputResults <- TFoutputResultsFormat(outputResults, 
		                                       stem_start_dates,
		                                       stem_end_dates,
		                                       start_dates,
		                                       data_end_dates,
		                                       batch_time = batch_time_char)
		# write.table(outputResults, file=paste0(fileName,'.tsv'), quote=TRUE, sep='\t', row.names=FALSE)
		# saveRDS(outputResults, paste0(fileName, ".rds"))
	}

	trendfinder_results <- readRDS('~/TrendFinder/Outputs/trendfinder_results.rds')

	if(!is.null(trendfinder_history_update)){
	  if(nrow(trendfinder_history_update) > 0){
	      trendfinder_history_update$batch <- batch_time_char
	      trendfinder_history_update <- as.data.frame(trendfinder_history_update)
	      trendfinder_history <- readRDS('~/TrendFinder/Outputs/trendfinder_history.rds')
	      trendfinder_history_update <-trendfinder_history_update[, colnames(trendfinder_history)]
	      trendfinder_history <- rbind(trendfinder_history, trendfinder_history_update)
	      saveRDS(trendfinder_history, '~/TrendFinder/Outputs/trendfinder_history.rds')
	      trendfinder_results <- rbind(trendfinder_results, outputResults)
	      trendfinder_results <- trendfinder_results[!duplicated(trendfinder_results), ] # Just in case
	      saveRDS(trendfinder_results, '~/TrendFinder/Outputs/trendfinder_results.rds')
	    }
	  }


	# now have to mash up the already computed rows here with join instead of anti-join

	allConditionsAnswers <- left_join(allConditions, dataKey, by = c("stem" = "Question ID"))
	allConditionsAnswers$`Answer ID`[which(is.na(allConditionsAnswers$`Answer ID`))] <- allConditionsAnswers$stem[which(is.na(allConditionsAnswers$`Answer ID`))]
	allConditionsAnswers$stem <- allConditionsAnswers$`Answer ID`
	allConditionsAnswers <- allConditionsAnswers[, ..anti_join_columns]
	allConditionsAnswers <- left_join(allConditionsAnswers, dataKey, by = c("banner" = "Question ID"))
	allConditionsAnswers$banner <- allConditionsAnswers$`Answer ID`
	allConditionsAnswers <- allConditionsAnswers[, ..anti_join_columns]

	all_results <- left_join(allConditionsAnswers, trendfinder_results, by = all_of(anti_join_columns))
	
	if(exists('trendfinder_history')){
	  rm(list='trendfinder_history')
	}
	rm(list='trendfinder_results')
	gc()

	# This is here to remove joins from dataKey that were not actually calculated
	# When answer groupings in the calculation phase are done away with, this won't be necessary
	# outputResults <- outputResults[-which(is.na(outputResults$response_count)), ]

	all_results <- TFoutputResultsFormat(all_results, 
	                                     stem_start_dates,
	                                     stem_end_dates,
	                                     start_dates,
	                                     data_end_dates,
	                                     batch_time = batch_time_char)
	fileName <- outputName("All Results", batch_time = batch_time_char)
	# saveRDS(all_results, paste0(fileName, ".rds"))

	# if (!is.null(format_output)) {
	#   if (!is.null(cutoff_stats_flags)) {
	#     outputFormatted <- TFwider(outputResults) %>%
	#       TFformat(.) %>%
	#       TFstats(., cutoff_stats_flags)
	#   } else{
	#     outputFormatted <- TFwider(outputResults) %>%
	#       TFformat(.)
	#   }
	# }

	# outputWider <- TFwider(all_results)
	outputFormatted <- TFwider(all_results) %>%
	  TFformat(., time_period = time_period, segment_names = segment_names, use_default_answer_flag = use_default_answer_flag, batch_time = batch_time_char)
	
	if(ancestry_output){
	  outputFormatted <- TFancestryOutputFormat(outputFormatted, data_start_dates = start_dates, data_end_dates = data_end_dates, time_period = time_period)
	  remove_rows_ancestry <- which(is.na(outputFormatted$`Banner Name`))
	  if(length(remove_rows_ancestry) > 0){
	    outputFormatted <- outputFormatted[-remove_rows_ancestry, ]
	  }
	}
	
	
	if(run_stats){
	  outputStats <- TFstats(outputFormatted, 
	                         batch_time = batch_time,
	                         cutoff_stats_flags = cutoff_stats_flags,
	                         max_chart_return = max_chart_return,
	                         max_chart_iterate = max_chart_iterate)
	  
	  input_TFmakeCharts <- outputStats
	} else{
	  input_TFmakeCharts <- outputFormatted
	}
	
	if(letter_stats_output){
	  input_TFmakeCharts <- TFdatePropTestLetters(input_TFmakeCharts)
	  
	  
	  saveRDS(input_TFmakeCharts, paste0("~/TrendFinder/Outputs/", today(), "/Output - Responses - Formatted with Stats - Batch Time ", gsub(":", "_", batch_time_char), ".rds"))
	  ### Need to:
	  #### Remove unneeded rows from "Ancestry History" that result in NAs in output
	  #### Find way to run the data for two months if necessary
	  #### Find way to append an empty column of notes unless the last two months are needed or data is unavailable
	  #### Automate aggregate charts for the first few tabs if Ancestry wants
	}
	
	# View(outputFormatted)
	
	# write.table(outputFormatted, file=paste0(fileName,'.tsv'), quote=TRUE, sep='\t', row.names=FALSE)
	# write.table(outputResults, file=paste0('~/TrendFinder/Outputs/2021-11-09/outputResults.tsv'), quote=TRUE, sep='\t', row.names=FALSE)
	
	if(make_charts){
	  TFmakeCharts(input_TFmakeCharts,
	               segment_names = segment_names,
	               use_tags = use_tags,
	               must_plot = must_plot,
	               plot_all = plot_all,
	               python_loc = python_loc,
	               ancestry_output = ancestry_output)
	}
	
	
	if(recent_movers_output){
	  TFmakeMoversReport(input_TFmakeCharts,
	                     period_type = period_type,
	                     baseline_name = baseline_name,
	                     recent_movers_segments = recent_movers_segments,
	                     recent_movers_banners = recent_movers_banners,
	                     python_loc_movers = python_loc_movers)
	}
	
	}
emerson-civicscience/trendfinder documentation built on Sept. 4, 2022, 10:30 a.m.