surveyR: surveyR: Get things done with LimeSurvey!

Documented in Count_and_Filter Get_Interview_Time

#' Get Interview Timing
#'
#' This function takes a survey list (either raw or filtered) and returns a list with three entries:
#' the first is a data.table describing the timings of the responses, and the other two are median LOIs of either only the completes,
#' or of all responses.
#' @param Survey_List this is a list containing only the downloaded data from the survey for a multiple-part survey.
#' @param racer_percentile This is the percentile that's serves as the threshold for racers- entered as a number (default is 0.025), any times within this percentile are classified as racers. All racers are speeders, but in some projects we make racing a hard BQ criteria, and speeding needs to be combined with other criteria.
#' @param speeder_percentile This is the percentile that's serves as the threshold for speeders- entered as a number (default is 0.05), any times within this percentile are classified as speeders
#' @return A list with three components: 1) A Data.table that gives the key, qPID, qPart1ID, Start_Time, End_Time, interviewtime, whether the response ended in a complete, and whether it was a racer (bottom 2.5 percentile) or a speeder (bottom 5 percentile)
#' 2) Median LOI for completes, 3) Median LOI for all of the interviews passed to the function (includes those that made it to the last part but didn't submit on the last page)
#' @keywords Screenouts, Filtering
#' @export
#' @examples
#' Get_Interview_Time(Cint_Surveys_0)
Get_Interview_Time <- function(Survey_List, racer_percentile = 0.025, speeder_percentile = 0.05){
  # Function gives a list of the interview times
  if(length(Survey_List) > 1){

    Merged_First_and_Last <- merge.data.table(Survey_List[[1]],Survey_List[[length(Survey_List)]], by = c("key"), suffixes = c(".first", ".last"))
    Merged_First_and_Last[, ':='(qPID = qPID.first,
                                 qPart1ID = qPart1ID.first,
                                 interviewtime = time_length(lubridate::ymd_hms(datestamp.last) - lubridate::ymd_hms(startdate.first),unit = "minutes"),
                                 Start_Time = lubridate::ymd_hms(startdate.first),
                                 End_Time = lubridate::ymd_hms(datestamp.last),
                                 complete = ifelse(("complete" %in% names(Survey_List[[1]])) & ("complete" %in% names(Survey_List[[length(Survey_List)]])),  complete.first, NA))]

    LOI_Fast <- quantile(Merged_First_and_Last$interviewtime, probs = c(racer_percentile,speeder_percentile),na.rm=TRUE)

    Merged_First_and_Last[,  ':=' (racer = ifelse(interviewtime <= LOI_Fast[1], 1, 0),
                                   speeder = ifelse(interviewtime <= LOI_Fast[2], 1, 0))]

    Return_List <- list()
    # Information about all of the interviews given to the function (that contain both a first and last section- an incomplete that didn't make it to the last part at all is never included)
    Return_List[[1]] <- Merged_First_and_Last[, list(key,qPID,qPart1ID,Start_Time, End_Time, interviewtime, complete, racer, speeder)]
    # Median LOI for completes
    Return_List[[2]] <- data.table(Median_LOI = median(Merged_First_and_Last[complete == 1]$interviewtime, na.rm=TRUE))
    # Median LOI for all of the interviews passed to the function (includes those that made it to the last part but didn't submit on the last page)
    Return_List[[3]] <- data.table(Median_LOI = median(Merged_First_and_Last$interviewtime, na.rm=TRUE))
  } # end of multiple part surveys if statement
  else{

    Merged_First_and_Last <- copy(Survey_List[[1]])
    Merged_First_and_Last[, ':='(qPID = qPID,
                                 qPart1ID = qPart1ID,
                                 interviewtime = time_length(lubridate::ymd_hms(datestamp) - lubridate::ymd_hms(startdate),unit = "minutes"),
                                 Start_Time = lubridate::ymd_hms(startdate),
                                 End_Time = lubridate::ymd_hms(datestamp),
                                 complete = ifelse(lastpage == max(lastpage, na.rm = TRUE), 1, 0))]

    LOI_Fast <- quantile(Merged_First_and_Last$interviewtime, probs = c(racer_percentile,speeder_percentile),na.rm=TRUE)

    Merged_First_and_Last[,  ':=' (racer = ifelse(interviewtime <= LOI_Fast[1], 1, 0),
                                   speeder = ifelse(interviewtime <= LOI_Fast[2], 1, 0))]

    Return_List <- list()
    # Information about all of the interviews given to the function (that contain both a first and last section- an incomplete that didn't make it to the last part at all is never included)
    Return_List[[1]] <- Merged_First_and_Last[, list(key,qPID,qPart1ID,Start_Time, End_Time, interviewtime, complete, racer, speeder)]
    # Median LOI for completes
    Return_List[[2]] <- data.table(Median_LOI = median(Merged_First_and_Last[complete == 1]$interviewtime, na.rm=TRUE))
    # Median LOI for all of the interviews passed to the function (includes those that made it to the last part but didn't submit on the last page)
    Return_List[[3]] <- data.table(Median_LOI = median(Merged_First_and_Last$interviewtime, na.rm=TRUE))


  }



  return(Return_List)
}



#' Filter Raw Surveys, Remove Duplicates and Dropouts
#'
#' This function takes a raw survey list and returns a list with 8 entries: the first is a filtered version of the raw survey data.
#' This function does all of the heavy lifting in removing duplicate survey responses from single respondents, and provides almost all of
#' the counting functionality for the field report.
#'
#' @param Survey_List this is a list containing only the downloaded data from the survey for a multiple-part survey.
#' @param racer_percentile_definition This is the percentile that's serves as the threshold for racers- entered as a number (default is 0.025), any times within this percentile are classified as racers. All racers are speeders, but in some projects we make racing a hard BQ criteria, and speeding needs to be combined with other criteria.
#' @param speeder_percentile_definition This is the percentile that's serves as the threshold for speeders- entered as a number (default is 0.05), any times within this percentile are classified as speeders
#' @return A list with eight components:
#' 1) A list with the filtered survey parts as data.tables
#' 2) A data.table with the relevant information for the field report
#' 3) A data.table of survey IDs that were screened out
#' 4) A data.table of the first survey part with survey IDs deemed as "junk"- incomplete, but not screenouts
#' 5) The list returned by the Get_Interview_Time function on the filtered data
#' 6) data.table of Unique_Incomplete_IDs
#' 7) data.table of Panel_Report_ID_List
#' 8) data.table of Dropouts_First_Part_Pages
#' @keywords Screenouts, Filtering, Field Report
#' @export
#' @examples
#' Count_and_Filter(Cint_Surveys_0)
Count_and_Filter <- function(Survey_List, racer_percentile_definition = 0.025, speeder_percentile_definition = 0.05){


  # Info (clap open to read)

  # For a comparison of data tables and data frames (so that dataframe lovers can brush up on the syntax):
  # https://atrebas.github.io/post/2019-03-03-datatable-dplyr/

  # Battle plan for getting rid of all duplicates:
  # Find the "whitelist" (PID/Part1ID combinations that are not duplicated) for the final survey part- these are the good completes
  # Count these for reporting
  # Find the whitelist for the first survey part by finding one for screenouts and one for those that completed the first part
  # Count the non-duplicated screenouts for reporting
  # These whitelists will be combined and any duplicates will be removed.
  # Then, the first survey part will be filtered using this combined whitelist.

  # The whitelist containing just the incompletes and the completes is used to filter ALL the survey parts.

  # REQUIRED ELEMENTS OF THE SURVEY:
  # Column: "qPart1ID" - {SAVEID} from the first survey part
  # Column: "qPID" - These are the respondents' unique IDs
  # Column: "lastpage" - automatically present in Limesurvey data, this is the last page the respondent reached.
  # Survey must be packed up as a List

  # Return values:
  # A list
  # [[1]]: This is the entered Survey_List filtered (in all parts) with only the completes and incompletes
  # The completes and incompletes are noted as such in the column "complete"
  # All responses are given a "key" value in the key column- this is just the concatenation of the qPart1ID and the qPID
  # [[2]]: Low-level reporting on the number of respondents incoming, n.completes, n.incompletes, n.screenouts (undifferentiated), n.junk
  # [[3]]: Unique screenout entries to pass on to the reporting function to differentiate the reasons for screenouts (I don't want to handle that in the
  # filter function- I want to keep it widely applicable with no changes)
  # [[4]]: Junk entries
  # [[5]]: Time
  # [[6]]: Incompletes


  # Count total incoming traffic (including lastpage == -1 and responses without PID)
  n.incoming <- nrow(Survey_List[[1]])

  # Prepare the incoming data: identify screenouts
  Survey_List <- Identify_Screenouts(Survey_List)

  # Identify dropouts in the first part- this will be continued at the end of this function
  # Survey_List[[1]]$dropout_part1 <- ifelse(Survey_List[[1]]$lastpage <= 1 | (Survey_List[[1]]$lastpage < max(Survey_List[[1]]$lastpage, na.rm = TRUE) & Survey_List[[1]]$screenout != 1),1,0)
# Update 2021.09.13: I don't want to count clickouts as dropouts!
  Survey_List[[1]]$dropout_part1 <- ifelse((Survey_List[[1]]$lastpage < max(Survey_List[[1]]$lastpage, na.rm = TRUE) & Survey_List[[1]]$screenout != 1 & Survey_List[[1]]$lastpage >=1),1,0)



  # Prepare the incoming data: Get rid of missing qPIDs
  Survey_List[[1]]$qPID[Survey_List[[1]]$qPID == ""] <- NA
  Survey_List[[1]] <- Survey_List[[1]][!is.na(qPID)]
  # %>%na.omit(cols = c("qPID"))

  # Count the number that are eliminated by getting rid of entries without PID
  n.with_PID <- nrow(Survey_List[[1]])
  n.missing_PID <- n.incoming - n.with_PID

  # Full first survey part with all incoming traffic including click-aways (only thing missing are the empty PIDs)
  First_Survey_Part_All_Incoming_Traffic <- Survey_List[[1]]

  # Eliminate all those entries without a qPart1ID (important for the rest of the duplicate search and filtering process) or a qPID from all Survey parts:
  for(i in 1:length(Survey_List)){
    Survey_List[[i]]$qPID[Survey_List[[i]]$qPID == ""] <- NA
    Survey_List[[i]]$qPart1ID[Survey_List[[i]]$qPart1ID == ""] <- NA

    Survey_List[[i]] <- Survey_List[[i]][!is.na(qPID)]
    # %>%na.omit(cols = c("qPID"))
    Survey_List[[i]] <- Survey_List[[i]][!is.na(qPID)]
    # %>%na.omit(cols = c("qPart1ID"))
  }

  # Count how many are lost due to lack of qPart1ID
  n.with_Part1ID <- nrow(Survey_List[[1]])
  n.missing_PID <- n.with_PID - n.with_Part1ID

  # Create a column that is the identifier "key"- the combination of the Part1ID and the PID
  for(i in 1:length(Survey_List)){
    Survey_List[[i]][, key := paste0(qPart1ID, qPID)]
  }


  # Examine the last survey part:
  # Get the uniques, but only those that are completes, and only take the last complete if there are multiple completes from the same PID: OLD
  # Unique_Complete_IDs <- unique(Survey_List[[length(Survey_List)]][lastpage == max(Survey_List[[length(Survey_List)]][["lastpage"]], na.rm = TRUE)], by = c("qPID"), fromLast = TRUE)[, .(qPart1ID, qPID, key)]

  #UPDATED (17.03.2021)-
  Grouped_qPID_max_lastpage <- Survey_List[[length(Survey_List)]][, .(max = .I[which.max(lastpage)]), by = qPID]
  melted_g <- melt(Grouped_qPID_max_lastpage, id="qPID")
  Unique_Complete_IDs <- Survey_List[[length(Survey_List)]][melted_g$value][, v := melted_g$variable][]
  Unique_Complete_IDs <- Unique_Complete_IDs[lastpage == max(Survey_List[[length(Survey_List)]][["lastpage"]], na.rm = TRUE)]
  Unique_Complete_IDs <- unique(Unique_Complete_IDs, by = c("qPID"))[, .(qPart1ID, qPID, key)]


  n.completes <- nrow(Unique_Complete_IDs)
  #^^ With this, we have a list of our good completes. Only these should be kept from all of the sections for further analysis.
  # These are definitely those that will be kept in the first section, if these PIDs have multiple Part1IDs


  # Examine the first survey part:
  # Get the unique incompletes (defined as all those that completed the first section, but not the last)
  # Have to exclude the unique completes list!
  # OLD:
  # Unique_Incomplete_IDs <- unique(Survey_List[[1]][!(qPID %in% Unique_Complete_IDs$qPID) & lastpage == max(Survey_List[[1]][["lastpage"]], na.rm = TRUE)], by = c("qPID"), fromLast = TRUE)[, .(qPart1ID, qPID, key)]
  # n.incompletes <- nrow(Unique_Incomplete_IDs)

  #Updated: (17.03.2021)
  Grouped_qPID_max_lastpage <- Survey_List[[1]][!(qPID %in% Unique_Complete_IDs$qPID), .(max = .I[which.max(lastpage)]), by = qPID]
  melted_g <- melt(Grouped_qPID_max_lastpage, id="qPID")
  Unique_Incomplete_IDs <- Survey_List[[1]][melted_g$value][, v := melted_g$variable][]
  Unique_Incomplete_IDs <- Unique_Incomplete_IDs[lastpage == max(Survey_List[[1]][["lastpage"]], na.rm = TRUE)]
  Unique_Incomplete_IDs <- unique(Unique_Incomplete_IDs, by = c("qPID"))[, .(qPart1ID, qPID, key)]
  n.incompletes <- nrow(Unique_Incomplete_IDs)

  # Finish identifying the dropouts (includes duplicates! This is just for the field reporting to figure out if there are problems!)
  First_Survey_Part_All_Incoming_Traffic <- First_Survey_Part_All_Incoming_Traffic[ , dropout_part2 := ifelse( lastpage == max(Survey_List[[1]][["lastpage"]], na.rm = TRUE) & (qPID %in% Unique_Incomplete_IDs$qPID), 1, 0)]
  First_Survey_Part_All_Incoming_Traffic <- First_Survey_Part_All_Incoming_Traffic[ , dropout := ifelse( dropout_part1 == 1 | dropout_part2 == 1, 1, 0)]
  Panel_Report_ID_List <- First_Survey_Part_All_Incoming_Traffic[,.(qPID,lastpage,screenout,dropout_part1, dropout_part2, dropout)]

  n.Total_Dropouts_Part_1 <- nrow(Panel_Report_ID_List[dropout_part1 == 1])
  n.Unique_Dropouts_Part_1 <- nrow(unique(Panel_Report_ID_List[dropout_part1 == 1], by = c("qPID")))

  n.Total_Dropouts_Part_2 <- nrow(Panel_Report_ID_List[dropout_part2 == 1])
  n.Unique_Dropouts_Part_2 <-  nrow(unique(Panel_Report_ID_List[dropout_part2 == 1], by = c("qPID")))

  # Dropouts first part
  Dropouts_First_Part_Pages <- Panel_Report_ID_List[, count_lastpage := .N, by = lastpage]%>%
    unique(by = "lastpage")

  Dropouts_First_Part_Pages <- Dropouts_First_Part_Pages[order(lastpage)]

  # Dropouts last part
  Dropouts_Last_Part <- merge(unique(Survey_List[[1]][lastpage == max(Survey_List[[1]][["lastpage"]], na.rm = TRUE)], by = "key"), Survey_List[[length(Survey_List)]], by = "key", all.x = TRUE)[, count_lastpage := .N, by = "lastpage.y"]%>%
    unique(by = "lastpage.y")%>%
    setnames(old = "lastpage.y", new = "lastpage")

  Dropouts_Last_Part <- Dropouts_Last_Part[, .(lastpage,count_lastpage)]%>%
    setorder(cols = "lastpage")



  # Get the unique screenouts:
  Unique_Screenout_IDs <- unique(Survey_List[[1]][screenout == 1], by = c("qPID"), fromLast = TRUE)[, .(qPart1ID, qPID, key)]
  n.screenouts <- nrow(Unique_Screenout_IDs)

  # Now create the whitelist of responses to keep in:
  Whitelist_Keepers <- rbindlist(list(Unique_Complete_IDs,Unique_Incomplete_IDs),use.names = TRUE)


  # Filter all the survey parts using this whitelist and remove any remaining duplicates that happen to be on the whitelist
  # Also add a column that shows if the particular response belongs to a complete or not
  Filtered_Survey_List <- list()

  #OLD
  # for(i in 1:length(Survey_List)){
  #   Filtered_Survey_List[[i]] <- unique(Survey_List[[i]][key %in% Whitelist_Keepers$key], by = c("key"), fromLast = TRUE)
  #
  #   # This I changed (NEW GRADS 1.3.2021) Original:
  #   # Filtered_Survey_List[[i]][, complete := ifelse(key %in% Unique_Complete_IDs$key, 1, 0)]
  #   # New: (now it has to not only be a complete in the completes from the last section, but also complete in its own section)
  #   Filtered_Survey_List[[i]][, complete := ifelse(key %in% Unique_Complete_IDs$key & lastpage == max(Filtered_Survey_List[[i]][["lastpage"]], na.rm = TRUE), 1, 0)]
  # }

  #NEW as of 17.03.2021
  for(i in 1:length(Survey_List)){
    ID_Max_Laspage <- Survey_List[[i]][key %in% Whitelist_Keepers$key, .(max = .I[which.max(lastpage)]), by = qPID]
    ID_Max_Laspage_Melt <- melt(ID_Max_Laspage, id="qPID")


    Filtered_Survey_List[[i]] <- Survey_List[[i]][ID_Max_Laspage_Melt$value][, v := ID_Max_Laspage_Melt$variable]

    # This I changed (NEW GRADS 1.3.2021) Original:
    # Filtered_Survey_List[[i]][, complete := ifelse(key %in% Unique_Complete_IDs$key, 1, 0)]
    # New: (now it has to not only be a complete in the completes from the last section, but also complete in its own section)
    Filtered_Survey_List[[i]][, complete := ifelse(key %in% Unique_Complete_IDs$key & lastpage == max(Filtered_Survey_List[[i]][["lastpage"]], na.rm = TRUE), 1, 0)]
  }

  # https://stackoverflow.com/questions/25314336/extract-the-maximum-value-within-each-group-in-a-dataframe
  # Last <- aggregate(Merged_Surveys[[9]]$lastpage, by = list(Merged_Surveys[[9]]$qPID), max)
  # ZZ <- Merged_Surveys[[9]]%>%
  #   group_by(qPID)%>%
  #   summarise(lastpage = max(lastpage))


  # GOOD THING!!!! 17.03.2021
  # https://stackoverflow.com/questions/40749532/data-table-aggregate-by-group-and-keep-corresponding-values-from-other-columns
  # ZZZ <- Merged_Surveys[[9]][, .(max = .I[which.max(lastpage)]), by = qPID]
  # maggi <- melt(ZZZ, id="qPID")
  #
  # ZZZZZ <- Merged_Surveys[[9]][maggi$value][, v := maggi$variable][]
  # fwrite(ZZZZZ, "Test.csv", bom = TRUE, sep = ";")

  # Also (nice to have) a list of JUNK (these are the those that have lastpage -1 or otherwise have problems- useful to monitor the number in case there are too many)
  # There will be no special list of Junk, you can just look into the raw data if there's an indication that there's too much junk.
  # JUNK ALSO INCLUDES UNFILTERED DUPLICATES!!! This is important to know, because it gives a sense of whether a certain respondent is attempting the survey repeatedly.
  # Junk includes:
  # All responses that quit in first part before reaching the last page that ARE NOT screenouts
  # All duplicates
  # What isn't counted as junk are screenouts from respondents that went on to either submit an incomplete or a complete
  # So, one participant can have up to three non-junk, unique entries: A valid complete, a valid incomplete, and a valid screenout
  # But this is just for the junk count! In the final filtered lists, no respondent can have more than 1 response, and it will be their "most complete"
  # No screenouts are kept in the final filtered list.
  All_Non_Junk <- rbindlist(list(Whitelist_Keepers,Unique_Screenout_IDs), use.names = TRUE)
  Junk_Attempts <- Survey_List[[1]][!(key %in% All_Non_Junk$key)]
  n.junk <- nrow(Junk_Attempts)


  Times <- Get_Interview_Time(Filtered_Survey_List, racer_percentile = racer_percentile_definition, speeder_percentile = speeder_percentile_definition)
  Median_LOI_Surveys <- Times[[2]]$Median_LOI


  # Preparing the Return List:
  Return_List <- list()
  # Return our filtered list
  Return_List[[1]] <- Filtered_Survey_List
  # Return our interesting data
  Return_List[[2]] <- data.table(Total_Incoming_Traffic = n.incoming,
                                 Incoming_Traffic_No_Clickouts_No_Screenouts = n.incoming - n.screenouts - Dropouts_First_Part_Pages[lastpage == -1]$count_lastpage,
                                 # Traffic_with_PID = n.with_PID,
                                 # Traffic_without_PID = n.missing_PID,
                                 # Traffic_with_PID_and_Part1ID = n.with_Part1ID,
                                 # Traffic_without_PID_or_Part1ID = n.missing_PID,
                                 Total_Dropouts_Part_1 = n.Total_Dropouts_Part_1,
                                 Unique_Dropouts_Part_1 = n.Unique_Dropouts_Part_1,
                                 Total_Dropouts_Subsequent_Parts = n.Total_Dropouts_Part_2,
                                 Unique_Dropouts_Subsequent_Parts = n.Unique_Dropouts_Part_2,
                                 Total_Dropouts = n.Unique_Dropouts_Part_1 + n.Unique_Dropouts_Part_2,
                                 Completes = n.completes,
                                 Screenouts = n.screenouts,
                                 IR = (n.completes / (n.completes + n.screenouts)),
                                 Median_LOI = Median_LOI_Surveys,
                                 Incompletes = n.incompletes
                                 # Junk = n.junk
  )
  # Return the screenout list (to break down the reasons for screenouts after the filter)
  Return_List[[3]] <- Survey_List[[1]][(key %in% Unique_Screenout_IDs$key)]
  # Return the junk list (for fun!)
  Return_List[[4]] <- Junk_Attempts
  Return_List[[5]] <- Times # Of the filtered survey list
  Return_List[[6]] <- Unique_Incomplete_IDs
  Return_List[[7]] <- Panel_Report_ID_List
  Return_List[[8]] <- Dropouts_First_Part_Pages[, .(lastpage, count_lastpage)]
  Return_List[[9]] <- Dropouts_Last_Part[, .(lastpage, count_lastpage)]

  return(Return_List)
}






#' Perform automatic quality checks on the filtered surveys
#'
#' This function takes a survey list (filtered) and performs quality control checks on it. The QC revolves around four main aspects:
#' 1) Survey timing. The bottom 2.5% of times are automatically filtered out. The bottom 2.5%-5% are flagged, and need to be combined with one of the other three criteria to classify the complete as BQ.
#' 2) Variance in the driver questions: For the America's Best Employers Suite, we have a battery of "driver questions" that are 1-5 Likert type arrays. These all have the prefix "qd" in their variable codes and are found in the first survey section. The variance of these questions is tested, and the bottom 5% is flagged.
#' 3) "Q34" - Please answer this question with "5" if you are paying attention- otherwise the response is flagged.
#' 4) "Plausibility check" - Does the average driver response (good or bad)  generally agree with the overall level of recommendation (q5.1.)? Otherwise the response is flagged.
#' @param Filtered_Survey_List Contains the entire filtered survey list
#' @param HighNPS This is the "high" direct evaluation value for the plausibility check (default is 7)
#' @param LowNPS  This is the "low" direct evaluation value for the plausibility check (default is 2)
#' @param HighDriver This is the "high" mean value for the question battery for the plausibility check (default is 4)
#' @param LowDriver This is the "low" mean value for the question battery for the plausibility check (default is 3)
#' @param QCQ_Value This is the value that the respondent should click on for the Quality Check Question, so that it will NOT be flagged as bad quality.
#' @param Variance_Percentile This is the percentile of the variances under which the response will be flagged as potentially BQ.
#' @return The filtered survey list, but with a column in each list element that is called "BQ" to flag all of the responses that fail the quality control test.
#' @keywords Quality Control
#' @export
#' @examples
#' Quality_Control(Cint_Filtered_Surveys)
Quality_Control <- function(Filtered_Survey_List, HighNPS = 7, LowDriver = 2, LowNPS = 3, HighDriver = 4, QCQ_Value = 5, Variance_Percentile = 0.05){
  # All of the Quality checks outside of the time control deal with variables found in the first part of the survey (at least for this survey). I will therefore do the quality control on the first part (including on the incompletes- these I'll throw out of quality control at the last step where I will put "incomplete" in the BQ column instead of a value, and create a BQ column that will then be matched using the keys with all of the other survey parts)

  ZFiltered_Survey_Drivers2 <- Filtered_Survey_List[[1]][[1]]
  cols2 <- grep("^qd", names(ZFiltered_Survey_Drivers2), perl = TRUE)

  # VARIANCE TEST: see what the size of the variance is over all of the driver questions.
  # Create a column with the variance of the driver statements
  ZFiltered_Survey_Drivers2$variance_drivers <- apply(ZFiltered_Survey_Drivers2[, ..cols2], 1, function(x) var(x, na.rm=TRUE))
  # Identify the variances of the lower percentiles:
  Variance_Cheaters <- quantile(ZFiltered_Survey_Drivers2$variance_drivers, probs = c(Variance_Percentile),na.rm=TRUE)%>%
    t()
  # Create a helper column for "Variance Cheaters"- repsondents where the variance was too low (below the chosen percentile above)
  ZFiltered_Survey_Drivers2$variance_cheaters <- ifelse(ZFiltered_Survey_Drivers2$variance_drivers <= Variance_Cheaters[,1],1,0)

  # CONSISTENCY TEST
  # Here we want to check if the respondent's overall score of their company (q5.1.) is roughly in line with their satisfaction in the drivers.
  # To do this, we compare the overall score of company (q5.1.) with the mean of the drivers
  ZFiltered_Survey_Drivers2$mean_drivers <- apply(ZFiltered_Survey_Drivers2[, ..cols2], 1, function(x) mean(x, na.rm=TRUE))
  # Create a helper column for the "Inconsistent_Responses"
  # Case one: High q5.1.(NPS), low mean driver score
  # HighNPS <- 7
  # LowDriver <- 2
  # Case two: Low q5.1.(NPS), high mean driver score
  # LowNPS <- 3
  # HighDriver <- 4

  # Helper column for the consistency
  ZFiltered_Survey_Drivers2$inconsistent_responses <- ifelse(ZFiltered_Survey_Drivers2$q5.1.>=HighNPS & ZFiltered_Survey_Drivers2$mean_drivers <= LowDriver,1,ifelse(ZFiltered_Survey_Drivers2$q5.1.<=LowNPS & ZFiltered_Survey_Drivers2$mean_drivers >= HighDriver,1,0))


  # Combine the timing information
  ZFiltered_Survey_Drivers2 <- merge(ZFiltered_Survey_Drivers2, Filtered_Survey_List[[5]][[1]][,!c("complete")], by = c("key","qPID","qPart1ID"), all.x = TRUE)

  # Finding bad quality responses
  # BQ is defined as meeting 2 out of the four conditions:
  # 1) Speeder (speeders = 1)
  # 2) Variance too low (variance cheaters) (variance_cheaters = 1)
  # 3) Inconsistent answers (inconsistent_responses = 1)
  # 4) Not paying attention (q7a.q7b34. != 5)
  # There are six combinations here, I did this very fast, it could probably be made prettier/simpler

  # This just adds a helper column that identifies the BQ completes: 1 for bad quality, 0 for good quality
  ZFiltered_Survey_Drivers2[, BQ := ifelse(complete == 1, ifelse((racer == 1) | (speeder == 1 & variance_cheaters==1) | (speeder==1 & inconsistent_responses == 1) | (speeder==1 & qdxQCA.x001.!= QCQ_Value) | (variance_cheaters==1 & inconsistent_responses==1) | (variance_cheaters==1 & qdxQCA.x001.!= QCQ_Value) | (inconsistent_responses==1& qdxQCA.x001.!= QCQ_Value),1,0),2)]

  BQ_Keys <- ZFiltered_Survey_Drivers2[, .(key,BQ)]

  Filtered_Survey_List[[1]][[1]] <- ZFiltered_Survey_Drivers2

  # Add BQ Column to all of the other survey parts:
  for(i in 2:length(Filtered_Survey_List[[1]])){
    Filtered_Survey_List[[1]][[i]] <- merge(Filtered_Survey_List[[1]][[i]], BQ_Keys, by = "key", all.x = TRUE)
  }

  return(Filtered_Survey_List)

}



#' Perform automatic quality checks on the filtered surveys, but focuses solely on the survey times!
#'
#' This function takes a survey list (filtered) and performs a quality control check on it. This check is based on:
#' Survey timing. The bottom 5% are classified as BQ.
#' @param Filtered_Survey_List Contains the entire filtered survey list
#' @return The filtered survey list, but with a column in each list element that is called "BQ" to flag all of the responses that fail the quality control test.
#' @keywords Quality Control
#' @export
#' @examples
#' Quality_Control_Time_Only(Cint_Filtered_Surveys)
Quality_Control_Time_Only <- function(Filtered_Survey_List){

  # IMPORTANT: The quality control function adds the timing information to the first survey part!

  ZFiltered_Survey_Drivers2 <- Filtered_Survey_List[[1]][[1]]

  # Combine the timing information
  ZFiltered_Survey_Drivers2 <- merge(ZFiltered_Survey_Drivers2, Filtered_Survey_List[[5]][[1]][,!c("complete")], by = c("key","qPID","qPart1ID"), all.x = TRUE)

  # BQ is defined here as:
  # 1) Speeder or Racer (speeders = 1, Racer = 1)

  # This just adds a helper column that identifies the BQ completes: 1 for bad quality, 0 for good quality
  ZFiltered_Survey_Drivers2[, BQ := ifelse(complete == 1, ifelse((racer == 1) | (speeder == 1),1,0),2)]

  BQ_Keys <- ZFiltered_Survey_Drivers2[, .(key,BQ)]

  Filtered_Survey_List[[1]][[1]] <- ZFiltered_Survey_Drivers2

  # Add BQ Column to all of the other survey parts:
  if(length(Filtered_Survey_List[[1]]) > 1){
  for(i in 2:length(Filtered_Survey_List[[1]])){
    Filtered_Survey_List[[1]][[i]] <- merge(Filtered_Survey_List[[1]][[i]], BQ_Keys, by = "key", all.x = TRUE)
  }
  } # end if


  return(Filtered_Survey_List)

}


#' Merge Manual QC onto all of the survey parts
#'
#' This function allows you to merge a manual QC document back onto all of the automatically quality controlled survey parts and update the quality status. The manual QC document that you are merging needs to have the keys and a column titled "Suspicious" that simply has a "1" entry for any entry with a bad quality response
#'
#' @param Quality_Checked_Survey_List This is the automatically controlled survey list
#' @param Human_Checked_Datatable This is the singular human controlled survey table (can take any form, just needs to have a "key" column and a "Suspicious" column)
#' @return Updated survey list where the automatic BQ status is put into an Auto_BQ column, and the Suspicious entry trumps the Auto_BQ to result in a final BQ column.
#' @keywords Manual Quality Control
#' @export
#' @examples
#' Final_QC_Total <- Add_Final_Quality_Check(QC_Total, Human_Checked_Datatable_Manual_QC)
Add_Final_Quality_Check <- function(Quality_Checked_Survey_List, Human_Checked_Datatable){

  Human_Checked_Datatable <- Human_Checked_Datatable[, .(key,Suspicious)]
  Human_Checked_Datatable$Suspicious[is.na(Human_Checked_Datatable$Suspicious)] <- 0


  for(i in 1:length(Quality_Checked_Survey_List[[1]])){
    Quality_Checked_Survey_List[[1]][[i]] <- merge(Quality_Checked_Survey_List[[1]][[i]], Human_Checked_Datatable, by = "key", all.x = TRUE)
    Quality_Checked_Survey_List[[1]][[i]][, ':=' (Auto_BQ = BQ)]
    Quality_Checked_Survey_List[[1]][[i]][, ':=' (BQ = ifelse(is.na(Suspicious), Auto_BQ, ifelse(Suspicious == 1, 1, Auto_BQ)))]

  }

  return(Quality_Checked_Survey_List)

}


#' Filter Raw Surveys, Remove Duplicates and Dropouts, Between Specific Dates
#'
#' This function takes a raw survey list and returns a list with 8 entries: the first is a filtered version of the raw survey data.
#' This function does all of the heavy lifting in removing duplicate survey responses from single respondents, and provides almost all of
#' the counting functionality for the field report.
#'
#' @param Survey_List this is a list containing only the downloaded data from the survey for a multiple-part survey.
#' @param Earliest_First_Part_startdate in the form "ymd_hms"
#' @param Latest_First_Part_startdate in the form "ymd_hms"
#' @return A list with eight components:
#' 1) A list with the filtered survey parts as data.tables
#' 2) A data.table with the relevant information for the field report
#' 3) A data.table of survey IDs that were screened out
#' 4) A data.table of the first survey part with survey IDs deemed as "junk"- incomplete, but not screenouts
#' 5) The list returned by the Get_Interview_Time function on the filtered data
#' 6) data.table of Unique_Incomplete_IDs
#' 7) data.table of Panel_Report_ID_List
#' 8) data.table of Dropouts_First_Part_Pages
#' @keywords Screenouts, Filtering, Field Report
#' @export
#' @examples
#' Count_and_Filter_Between_Dates(Cint_Surveys_0, "2020-03-11 13:05:10", "2020-04-11 13:05:10")
Count_and_Filter_Between_Dates <- function(Survey_List, Earliest_First_Part_startdate, Latest_First_Part_startdate){


  # Changing the dates to lubridate and getting entries only within a date range:
  for(i in 1:length(Survey_List)){
    Survey_List[[i]] <- Survey_List[[i]][, startdate := lubridate::ymd_hms(startdate)]
    Survey_List[[i]] <- Survey_List[[i]][startdate %between% c(Earliest_First_Part_startdate,Latest_First_Part_startdate)]

  }
  # Now we'll have some completes that don't match to their start, because we've eliminated the start from the dataset

  # To fix this problem, we have to collect the IDs from the subsequent sections:
  # IDs from subsequent sections
  IDs <- list()

  for(i in 2:length(Survey_List)){

    IDs[[i-1]] <- Survey_List[[i]][, .(qPID)]

  }

  All_subsequent_IDs <- rbindlist(IDs)
  All_subsequent_IDs_unique <- unique(All_subsequent_IDs)
  # Only IDs that worked in our time frame
  All_subsequent_IDs_who_started_after_date <- All_subsequent_IDs_unique[qPID %in% Survey_List[[1]]$qPID]

  # Filter for only the desired IDs- don't filter in the first section!!!
  for(i in 2:length(Survey_List)){
    Survey_List[[i]] <- Survey_List[[i]][qPID %in% All_subsequent_IDs_who_started_after_date$qPID]
  }



  # This is a date-specific filter function, to report on specific blocks of the results and compare changes

  # Info (clap open to read)

  # For a comparison of data tables and data frames (so that dataframe lovers can brush up on the syntax):
  # https://atrebas.github.io/post/2019-03-03-datatable-dplyr/

  # Battle plan for getting rid of all duplicates:
  # Find the "whitelist" (PID/Part1ID combinations that are not duplicated) for the final survey part- these are the good completes
  # Count these for reporting
  # Find the whitelist for the first survey part by finding one for screenouts and one for those that completed the first part
  # Count the non-duplicated screenouts for reporting
  # These whitelists will be combined and any duplicates will be removed.
  # Then, the first survey part will be filtered using this combined whitelist.

  # The whitelist containing just the incompletes and the completes is used to filter ALL the survey parts.

  # REQUIRED ELEMENTS OF THE SURVEY:
  # Column: "qPart1ID" - {SAVEID} from the first survey part
  # Column: "qPID" - These are the respondents' unique IDs
  # Column: "lastpage" - automatically present in Limesurvey data, this is the last page the respondent reached.
  # Survey must be packed up as a List

  # Return values:
  # A list
  # [[1]]: This is the entered Survey_List filtered (in all parts) with only the completes and incompletes
  # The completes and incompletes are noted as such in the column "complete"
  # All responses are given a "key" value in the key column- this is just the concatenation of the qPart1ID and the qPID
  # [[2]]: Low-level reporting on the number of respondents incoming, n.completes, n.incompletes, n.screenouts (undifferentiated), n.junk
  # [[3]]: Unique screenout entries to pass on to the reporting function to differentiate the reasons for screenouts (I don't want to handle that in the
  # filter function- I want to keep it widely applicable with no changes)
  # [[4]]: Junk entries
  # [[5]]: Time
  # [[6]]: Incompletes

  # Count total incoming traffic (including lastpage == -1 and responses without PID)
  n.incoming <- nrow(Survey_List[[1]])

  # Prepare the incoming data: identify screenouts
  Survey_List <- Identify_Screenouts(Survey_List)

  # Identify dropouts in the first part- this will be continues at the end of this function
  Survey_List[[1]]$dropout_part1 <- ifelse(Survey_List[[1]]$lastpage <= 1 | (Survey_List[[1]]$lastpage < max(Survey_List[[1]][["lastpage"]], na.rm = TRUE) & Survey_List[[1]]$screenout != 1),1,0)

  # Prepare the incoming data: Get rid of missing qPIDs
  Survey_List[[1]]$qPID[Survey_List[[1]]$qPID == ""] <- NA
  Survey_List[[1]] <- Survey_List[[1]][!is.na(qPID)]
  # %>%na.omit(cols = c("qPID"))

  # Count the number that are eliminated by getting rid of entries without PID
  n.with_PID <- nrow(Survey_List[[1]])
  n.missing_PID <- n.incoming - n.with_PID

  # Full first survey part with all incoming traffic including click-aways (only thing missing are the empty PIDs)
  First_Survey_Part_All_Incoming_Traffic <- Survey_List[[1]]

  # Eliminate all those entries without a qPart1ID (important for the rest of the duplicate search and filtering process) or a qPID from all Survey parts:
  for(i in 1:length(Survey_List)){
    Survey_List[[i]]$qPID[Survey_List[[i]]$qPID == ""] <- NA
    Survey_List[[i]]$qPart1ID[Survey_List[[i]]$qPart1ID == ""] <- NA

    Survey_List[[i]] <- Survey_List[[i]][!is.na(qPID)]
    # %>%na.omit(cols = c("qPID"))
    Survey_List[[i]] <- Survey_List[[i]][!is.na(qPID)]
    # %>%na.omit(cols = c("qPart1ID"))
  }

  # Count how many are lost due to lack of qPart1ID
  n.with_Part1ID <- nrow(Survey_List[[1]])
  n.missing_PID <- n.with_PID - n.with_Part1ID

  # Create a column that is the identifier "key"- the combination of the Part1ID and the PID
  for(i in 1:length(Survey_List)){
    Survey_List[[i]][, key := paste0(qPart1ID, qPID)]
  }


  # Examine the last survey part:
  # Get the uniques, but only those that are completes, and only take the last complete if there are multiple completes from the same PID:
  Unique_Complete_IDs <- unique(Survey_List[[length(Survey_List)]][lastpage == max(Survey_List[[length(Survey_List)]][["lastpage"]], na.rm = TRUE)], by = c("qPID"), fromLast = TRUE)[, .(qPart1ID, qPID, key)]
  n.completes <- nrow(Unique_Complete_IDs)
  #^^ With this, we have a list of our good completes. Only these should be kept from all of the sections for further analysis.
  # These are definitely those that will be kept in the first section, if these PIDs have multiple Part1IDs


  # Examine the first survey part:
  # Get the unique incompletes (defined as all those that completed the first section, but not the last)
  # Have to exclude the unique completes list!
  Unique_Incomplete_IDs <- unique(Survey_List[[1]][!(qPID %in% Unique_Complete_IDs$qPID) & lastpage == max(Survey_List[[1]][["lastpage"]], na.rm = TRUE)], by = c("qPID"), fromLast = TRUE)[, .(qPart1ID, qPID, key)]
  n.incompletes <- nrow(Unique_Incomplete_IDs)

  # Finish identifying the dropouts (includes duplicates! This is just for the field reporting to figure out if there are problems!)
  First_Survey_Part_All_Incoming_Traffic <- First_Survey_Part_All_Incoming_Traffic[ , dropout_part2 := ifelse( lastpage == max(Survey_List[[1]][["lastpage"]], na.rm = TRUE) & (qPID %in% Unique_Incomplete_IDs$qPID), 1, 0)]
  First_Survey_Part_All_Incoming_Traffic <- First_Survey_Part_All_Incoming_Traffic[ , dropout := ifelse( dropout_part1 == 1 | dropout_part2 == 1, 1, 0)]
  Panel_Report_ID_List <- First_Survey_Part_All_Incoming_Traffic[,.(qPID,lastpage,screenout,dropout_part1, dropout_part2, dropout)]

  n.Total_Dropouts_Part_1 <- nrow(Panel_Report_ID_List[dropout_part1 == 1])
  n.Unique_Dropouts_Part_1 <- nrow(unique(Panel_Report_ID_List[dropout_part1 == 1], by = c("qPID")))

  n.Total_Dropouts_Part_2 <- nrow(Panel_Report_ID_List[dropout_part2 == 1])
  n.Unique_Dropouts_Part_2 <-  nrow(unique(Panel_Report_ID_List[dropout_part2 == 1], by = c("qPID")))

  # Dropouts first part
  Dropouts_First_Part_Pages <- Panel_Report_ID_List[, count_lastpage := .N, by = lastpage]%>%
    unique(by = "lastpage")

  Dropouts_First_Part_Pages <- Dropouts_First_Part_Pages[order(lastpage)]

  # # Dropouts last part
  # Dropouts_Last_Part <- QC_Completes[, count_lastpage := .N, by = "lastpage"]%>%
  #   unique(by = "lastpage")
  #
  # Dropouts_Last_Part <- Dropouts_Last_Part[, .(lastpage,count_lastpage)]%>%
  #   setorder(cols = "lastpage")



  # Get the unique screenouts:
  Unique_Screenout_IDs <- unique(Survey_List[[1]][screenout == 1], by = c("qPID"), fromLast = TRUE)[, .(qPart1ID, qPID, key)]
  n.screenouts <- nrow(Unique_Screenout_IDs)

  # Now create the whitelist of responses to keep in:
  Whitelist_Keepers <- rbindlist(list(Unique_Complete_IDs,Unique_Incomplete_IDs),use.names = TRUE)


  # Filter all the survey parts using this whitelist and remove any remaining duplicates that happen to be on the whitelist
  # Also add a column that shows if the particular response belongs to a complete or not
  Filtered_Survey_List <- list()

  for(i in 1:length(Survey_List)){
    Filtered_Survey_List[[i]] <- unique(Survey_List[[i]][key %in% Whitelist_Keepers$key], by = c("key"), fromLast = TRUE)
    Filtered_Survey_List[[i]][, complete := ifelse(key %in% Unique_Complete_IDs$key, 1, 0)]
  }


  # Also (nice to have) a list of JUNK (these are the those that have lastpage -1 or otherwise have problems- useful to monitor the number in case there are too many)
  # There will be no special list of Junk, you can just look into the raw data if there's an indication that there's too much junk.
  # JUNK ALSO INCLUDES UNFILTERED DUPLICATES!!! This is important to know, because it gives a sense of whether a certain respondent is attempting the survey repeatedly.
  # Junk includes:
  # All responses that quit in first part before reaching the last page that ARE NOT screenouts
  # All duplicates
  # What isn't counted as junk are screenouts from respondents that went on to either submit an incomplete or a complete
  # So, one participant can have up to three non-junk, unique entries: A valid complete, a valid incomplete, and a valid screenout
  # But this is just for the junk count! In the final filtered lists, no respondent can have more than 1 response, and it will be their "most complete"
  # No screenouts are kept in the final filtered list.
  All_Non_Junk <- rbindlist(list(Whitelist_Keepers,Unique_Screenout_IDs), use.names = TRUE)
  Junk_Attempts <- Survey_List[[1]][!(key %in% All_Non_Junk$key)]
  n.junk <- nrow(Junk_Attempts)


  Times <- Get_Interview_Time(Filtered_Survey_List)
  Median_LOI_Surveys <- Times[[2]]$Median_LOI


  # Preparing the Return List:
  Return_List <- list()
  # Return our filtered list
  Return_List[[1]] <- Filtered_Survey_List
  # Return our interesting data
  Return_List[[2]] <- data.table(Total_Incoming_Traffic = n.incoming,
                                 # Traffic_with_PID = n.with_PID,
                                 # Traffic_without_PID = n.missing_PID,
                                 # Traffic_with_PID_and_Part1ID = n.with_Part1ID,
                                 # Traffic_without_PID_or_Part1ID = n.missing_PID,
                                 Total_Dropouts_Part_1 = n.Total_Dropouts_Part_1,
                                 Unique_Dropouts_Part_1 = n.Unique_Dropouts_Part_1,
                                 Total_Dropouts_Subsequent_Parts = n.Total_Dropouts_Part_2,
                                 Unique_Dropouts_Subsequent_Parts = n.Unique_Dropouts_Part_2,
                                 Completes = n.completes,
                                 Screenouts = n.screenouts,
                                 IR = (n.completes / (n.completes + n.screenouts)),
                                 Median_LOI = Median_LOI_Surveys,
                                 Incompletes = n.incompletes
                                 # Junk = n.junk
  )
  # Return the screenout list (to break down the reasons for screenouts after the filter)
  Return_List[[3]] <- Survey_List[[1]][(key %in% Unique_Screenout_IDs$key)]
  # Return the junk list (for fun!)
  Return_List[[4]] <- Junk_Attempts
  Return_List[[5]] <- Times
  Return_List[[6]] <- Unique_Incomplete_IDs
  Return_List[[7]] <- Panel_Report_ID_List
  Return_List[[8]] <- Dropouts_First_Part_Pages[, .(lastpage, count_lastpage)]

  return(Return_List)
}









#' Perform automatic quality checks on the filtered surveys
#' THIS VERSION DOES NOT HAVE AN UP-TO-DATE DOCUMENTATION! This is misleading!
#'
#' This function takes a survey list (filtered) and performs quality control checks on it. The QC revolves around four main aspects:
#' 1) Survey timing. The bottom 2.5% of times are automatically filtered out. The bottom 2.5%-5% are flagged, and need to be combined with one of the other three criteria to classify the complete as BQ.
#' 2) Variance in the driver questions: For the America's Best Employers Suite, we have a battery of "driver questions" that are 1-5 Likert type arrays. These all have the prefix "qd" in their variable codes and are found in the first survey section. The variance of these questions is tested, and the bottom 5% is flagged.
#' 3) "Q34" - Please answer this question with "5" if you are paying attention- otherwise the response is flagged.
#' 4) "Plausibility check" - Does the average driver response (good or bad)  generally agree with the overall level of recommendation (q5.1.)? Otherwise the response is flagged.
#' @param Filtered_Survey_List Contains the entire filtered survey list
#' @param NPS_Name <optional, default is "q5.1."> The name of the variable that contains the NPS (or "most important" score, that can be compared against a battery of drivers). This is the "big" NPS question (or something similar, evaluated on a ten point scales and that can be compared with the drivers)
#' @param HighNPS This is the "high" direct evaluation value for the plausibility check (default is 7)
#' @param LowNPS  This is the "low" direct evaluation value for the plausibility check (default is 2)
#' @param Driver_Identifier <optional, default "^qd"> - this is whatever uniquely identifies the driver questions. You can use regex here.
#' @param HighDriver This is the "high" mean value for the question battery for the plausibility check (default is 4)
#' @param LowDriver This is the "low" mean value for the question battery for the plausibility check (default is 3)
#' @param QCQ_Value This is the value that the respondent should click on for the Quality Check Question, so that it will NOT be flagged as bad quality.
#' @param QCQ_Name <optional, default "qdxQCA.x001."> - this is the identifier of the quality control question
#' @param key_name default "key" - this is the unique ID name for each response
#' @param Variance_Percentile <optional, default 0.05> this is the percentile under which the variance of the drivers is considered a flag for BQ
#' @return The filtered survey list, but with a column in each list element that is called "BQ" to flag all of the responses that fail the quality control test.
#' @keywords Quality Control
#' @export
#' @examples
#' Quality_Control_Array_Writer(Filtered_Survey_List, NPS_Name = "q5.1.", HighNPS = 7, LowDriver = 2, LowNPS = 3, HighDriver = 4, QCQ_Value = 5, QCQ_Name = "qdxQCA.x001.", Variance_Percentile = 0.05, Driver_Identifier = "^qd", key_name = "key")
Quality_Control_Array_Writer <- function(Filtered_Survey_List, NPS_Name = "q5.1.", HighNPS = 7, LowDriver = 2, LowNPS = 3, HighDriver = 4, QCQ_Value = 5, QCQ_Name = "qdxQCA.x001.", Variance_Percentile = 0.05, Driver_Identifier = "^qd", key_name = "key"){
  # All of the Quality checks outside of the time control deal with variables found in the first part of the survey (at least for this survey). I will therefore do the quality control on the first part (including on the incompletes- these I'll throw out of quality control at the last step where I will put "incomplete" in the BQ column instead of a value, and create a BQ column that will then be matched using the keys with all of the other survey parts)

  # Just for readability, since the two functions take the same argument!
  the_key <- key_name
  # Grab the column names to focus on for the driver evaluations:
  Driver_List <- Filtered_Survey_List[[1]][[1]]
  # Columns to do the array parsing with
  dcols <- grep(paste0(Driver_Identifier,"|^",key_name), names(Driver_List), perl = TRUE)
  # Columns to remove from the set to replace with the parsed array columns:
  dcolsReplace <- grep(paste0(Driver_Identifier), names(Driver_List), perl = TRUE)



  # Parse the array writer arrays
  Driver_Table <- Parse_Array_Writer(Driver_List[, ..dcols], key_name = the_key)
  Driver_Table <- Driver_Table[, evaluation := as.numeric(evaluation)]
  # Cast the parsed data so that it resembles the old format:
  Drivers <- data.table::dcast(Driver_Table, get(key_name)~code, value.var = "evaluation")

  # Put the parsed driver columns back into the dataset:
  Driver_List <- merge(Driver_List[, !..dcolsReplace], Drivers, by.x = key_name, by.y = "key_name", all = TRUE)
  dcols2 <- grep(paste0(Driver_Identifier), names(Driver_List), perl = TRUE)

  # VARIANCE TEST: see what the size of the variance is over all of the driver questions.
  # Create a column with the variance of the driver statements
  Driver_List$variance_drivers <- apply(Driver_List[, ..dcols2], 1, function(x) var(x, na.rm=TRUE))
  # Identify the variances of the lower percentiles:
  Variance_Cheaters <- quantile(Driver_List$variance_drivers, probs = c(Variance_Percentile),na.rm=TRUE)%>%
    t()
  # Create a helper column for "Variance Cheaters"- repsondents where the variance was too low (below the chosen percentile above)
  Driver_List$variance_cheaters <- ifelse(Driver_List$variance_drivers <= Variance_Cheaters[,1],1,0)

  # CONSISTENCY TEST
  # Here we want to check if the respondent's overall score of their company (q5.1.) is roughly in line with their satisfaction in the drivers.
  # To do this, we compare the overall score of company (q5.1.) with the mean of the drivers
  Driver_List$mean_drivers <- apply(Driver_List[, ..dcols2], 1, function(x) mean(x, na.rm=TRUE))
  # Create a helper column for the "Inconsistent_Responses"
  # Case one: High q5.1.(NPS), low mean driver score
  # HighNPS <- 7
  # LowDriver <- 2
  # Case two: Low q5.1.(NPS), high mean driver score
  # LowNPS <- 3
  # HighDriver <- 4

  # Helper column for the consistency
  Driver_List <- Driver_List[, inconsistent_responses := ifelse( (get(NPS_Name) >= HighNPS) & (mean_drivers <= LowDriver), 1, ifelse((get(NPS_Name) <= LowNPS) & (mean_drivers >= HighDriver),1,0))]


  # Combine the timing information
  Driver_List <- merge(Driver_List, Filtered_Survey_List[[5]][[1]][,!c("complete")], by = c(key_name,"qPID","qPart1ID"), all.x = TRUE)

  # Finding bad quality responses
  # BQ is defined as meeting 2 out of the four conditions:
  # 1) Speeder (speeders = 1)
  # 2) Variance too low (variance cheaters) (variance_cheaters = 1)
  # 3) Inconsistent answers (inconsistent_responses = 1)
  # 4) Not paying attention (q7a.q7b34. != 5)
  # There are six combinations here, I did this very fast, it could probably be made prettier/simpler

  # This just adds a helper column that identifies the BQ completes: 1 for bad quality, 0 for good quality
  Driver_List[, BQ := ifelse(complete == 1, ifelse((racer == 1) | (speeder == 1 & variance_cheaters==1) | (speeder==1 & inconsistent_responses == 1) | (speeder==1 & get(QCQ_Name) != QCQ_Value) | (variance_cheaters==1 & inconsistent_responses==1) | (variance_cheaters==1 & get(QCQ_Name) != QCQ_Value) | (inconsistent_responses==1 & get(QCQ_Name) != QCQ_Value),1,0),2)]

  BQ_Keys <- Driver_List[, .(key,BQ)]

  Filtered_Survey_List[[1]][[1]] <- Driver_List

  # Add BQ Column to all of the other survey parts:
  # for(i in 2:length(Filtered_Survey_List[[1]])){
  #   Filtered_Survey_List[[1]][[i]] <- merge(Filtered_Survey_List[[1]][[i]], BQ_Keys, by = key_name, all.x = TRUE)
  # }

  return(Filtered_Survey_List)

}