# R/imports.R
#
# Compiles SWARM platform exports and Qualtrics survey data, and scores the
# individual-difference measures collected across challenge instances.

library(data.table)  # fread(), setnames(), setDT() and := are used throughout
# stringr is only called via stringr::str_detect(), so it is not attached here.

# Compile Data ------------------------------------------------------------

# Strip the "Problem #N - " prefixes from the problem titles.
tidyProblemNames = function(dt) {
    dt[problem == "Problem #1 - Foreign Fighters", problem := "Foreign Fighters"]
    dt[problem == "Problem #2 - Forecasting Piracy", problem := "Forecasting Piracy"]
    dt[problem == "Problem #3 - Corporate Espionage", problem := "Corporate Espionage"]
    dt[problem == "Problem #4 - The Park Young-min Case", problem := "The Park Young-min Case"]
    return(dt)
}

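# Add min-max rescaled versions of the engagement metrics to the analytics
# table, plus a weighted overall engagement score.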
addRescaledEngagementMetrics = function(analytics) {
    metric_cols = c("report_count",
                    "resource_count",
                    "comment_count",
                    "chat_count",
                    "comment_vote_count",
                    "resource_vote_count",
                    "simple_rating",
                    "partial_rating",
                    "complete_rating")
    # Min-max rescale to [0, 1], ignoring missing values.
    rescale = function(x) {
        ( x - min(x, na.rm = TRUE) ) / ( max(x, na.rm = TRUE) - min(x, na.rm = TRUE) )
    }
    for (a in metric_cols) {
        # A constant column would make the rescale divide by zero, so set it to 0 instead.
        if (max(analytics[[a]], na.rm = TRUE) - min(analytics[[a]], na.rm = TRUE) == 0) {
            analytics[[paste0(a, "_scaled")]] = 0
        } else {
            analytics[[paste0(a, "_scaled")]] = rescale(analytics[[a]])
        }
    }
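    # Weighted sum of the rescaled metrics: report writing is weighted most
    # heavily, followed by resources and complete ratings.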
    analytics$engagement_scaled = 7*analytics$report_count_scaled +
        3*analytics$resource_count_scaled +
        3*analytics$complete_rating_scaled +
        2*analytics$comment_count_scaled + 
        analytics$chat_count_scaled +
        analytics$simple_rating_scaled +
        analytics$partial_rating_scaled +
        analytics$comment_vote_count_scaled +
        analytics$resource_vote_count_scaled
    
    return(analytics)
}

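# Read the CSV exports for one platform instance into a named list of
# data.tables, standardise column names, drop dummy/teaser problems, and add
# engagement metrics to the analytics table.
# Example call (hypothetical paths): fetchPlatformData("data/", "2018_SwarmChallengeExp1")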
fetchPlatformData = function(path_to_data, instance_name) {
    path = paste0(path_to_data, instance_name, '/PlatformData/')
    
    # Initialise and populate list of instance data.
    instance_data = list()
    instance_data$analytics = fread(paste0(path, 'analytics/analytics.csv'))
    instance_data$authors = fread(paste0(path, 'authors/authors.csv'))
    instance_data$chat = fread(paste0(path, 'chat/chat.csv'))
    instance_data$comments = fread(paste0(path, 'comments/comments.csv'))
    instance_data$login = fread(paste0(path, 'login/login.csv'))
    instance_data$problems = fread(paste0(path, 'problems/problems.csv'))
    instance_data$ratings = fread(paste0(path, 'ratings/ratings.csv'))
    instance_data$relations = fread(paste0(path, 'relations/relations.csv'))
    instance_data$top_reports = fread(paste0(path, 'reports/top_reports.csv'))
    instance_data$responses = fread(paste0(path, 'responses/responses.csv'))
    instance_data$timeline = fread(paste0(path, 'timeline/timeline.csv'))
    
    # Improve consistency of column names.
    setnames(instance_data$problems, "problem_title", "problem")
    
    setnames(instance_data$analytics, "title", "problem")
    setnames(instance_data$analytics, "teamName", "team")
    setnames(instance_data$analytics, "userName", "user")
    setnames(instance_data$analytics, "teamId", "team_id")
    setnames(instance_data$analytics, "problemId", "problem_id")
    setnames(instance_data$analytics, "userId", "user_id")
    
    setnames(instance_data$authors, "team_name", "team")
    setnames(instance_data$authors, "author_id", "user_id")
    setnames(instance_data$authors, "author_name", "user")
    
    setnames(instance_data$chat, "author_name", "user")
    setnames(instance_data$chat, "author_id", "user_id")
    instance_data$chat$problem_title = NULL
    setnames(instance_data$chat, "team_name", "team")
    
    setnames(instance_data$comments, "team_name", "team")
    setnames(instance_data$comments, "author_name", "author")
    setnames(instance_data$comments, "commenter_name", "commenter")
    
    setnames(instance_data$login, "userName", "user")
    setnames(instance_data$login, "userId", "user_id")
    setnames(instance_data$login, "eventType", "event_type")
    setnames(instance_data$login, "timeStamp", "timestamp")
    
    setnames(instance_data$ratings, "author_name", "author")
    setnames(instance_data$ratings, "rater_name", "rater")
    
    setnames(instance_data$relations, "userName", "user")
    setnames(instance_data$relations, "title", "problem")
    setnames(instance_data$relations, "teamName", "team")
    setnames(instance_data$relations, "problemId", "problem_id")
    
    setnames(instance_data$top_reports, "title", "problem")
    setnames(instance_data$top_reports, "team_name", "team")
    
    setnames(instance_data$responses, "team_name", "team")
    
    setnames(instance_data$timeline, "userName", "user")
    setnames(instance_data$timeline, "title", "problem")
    setnames(instance_data$timeline, "tipe", "type")
    setnames(instance_data$timeline, "teamName", "team")
    setnames(instance_data$timeline, "chunkId", "chunk_id")
    setnames(instance_data$timeline, "parentId", "parent_id")
    setnames(instance_data$timeline, "problemId", "problem_id")
    setnames(instance_data$timeline, "teamId", "team_id")
    setnames(instance_data$timeline, "timeStamp", "timestamp")
    setnames(instance_data$timeline, "userId", "user_id")
    
    # Filter out dummy problems.
    dummy_problems = c("'Sandpit' Problem",
                       "Test Problem",
                       "Problem #1 Teaser - 'Foreign Fighters'",
                       "Problem #2 Teaser - 'Forecasting Piracy'",
                       "Problem #3 Teaser - 'Corporate Espionage'",
                       "Problem #4 Teaser - The Park Young-min Case")
    for (problem_name in dummy_problems) {
        instance_data$relations = instance_data$relations[problem != problem_name]   
        instance_data$timeline = instance_data$timeline[problem != problem_name]   
    }
    
    # Add problem title as variable to tables that only have problem ID.
    for (table_name in c("responses", "authors", "comments", "ratings", "chat")) {
        setDT(instance_data[[table_name]])[instance_data$problems, problem := i.problem, on = c(problem_id = "problem_id")]
    }
    
    # Tidy problem names.
    for (table_name in c("problems", "responses", "analytics", "relations", "timeline", "chat")) {
        instance_data[[table_name]] = tidyProblemNames(instance_data[[table_name]])
        
    }
    
    # Add normalised engagement metrics to analytics table.
    instance_data$analytics = addRescaledEngagementMetrics(instance_data$analytics)
    
    # Add misc. other metrics to analytics table.
    instance_data$analytics[,vote_count:=comment_vote_count + resource_vote_count]
    instance_data$analytics[,quick_rating:=simple_rating + partial_rating]
    
    return(instance_data)
}

# Helper function: reverse-score a single column.
# If a Likert column with values 1-5 is reverse-scored, a 5 becomes a 1,
# a 4 becomes a 2, and so on; in that example the reverse-scored value is
# 6 minus the original. In general it is (max_value + 1) - original.
reverse_score_column = function(column_vector, max_val, reverse = FALSE) {
    if (!reverse) {
        return(column_vector)
    } else {
        return((max_val + 1) - column_vector)
    }
}

# Score a whole psychological scale at once by specifying which columns are
# reverse-scored (FALSE means don't reverse) and the maximum value of each column.
# Expects a plain data.frame so that character j-indexing selects the item columns.
score_likert_scale = function(my_df, scale_col_names, scale_maxes, scale_reverses){
    # Apply reverse_score_column to each item column, then sum across items.
    scored_items = mapply(reverse_score_column, 
                          my_df[, scale_col_names], 
                          scale_maxes, scale_reverses)
    final_score = rowSums(scored_items)
    return(final_score)
}

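# Score the 11-item actively open-minded thinking (AOMT) scale: 7-point items,
# with items 4-7 and 9 reverse-scored.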
computeAOMT = function(dt) {
    aomt_colnames = paste0('aomt', 1:11)
    aomt_max_vec = rep(7, length(aomt_colnames))
    aomt_reverse_vec <- c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE)
    dt$aomt = score_likert_scale(dt,
                                 aomt_colnames,
                                 aomt_max_vec,
                                 aomt_reverse_vec)
    
    return(dt)
}

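# Score matrix reasoning as the number of items answered correctly, using the
# hard-coded answer key below.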
computeMatrixReasoning = function(dt) {
    dt$matrix = (dt$mat1 == "D") +
        (dt$mat2 == "C") +
        (dt$mat3 == "E") +
        (dt$mat4 == "B") +
        (dt$mat5 == "B") +
        (dt$mat6 == "D") +
        (dt$mat7 == "E") +
        (dt$mat8 == "C") +
        (dt$mat9 == "A") +
        (dt$mat10 == "D") +
        (dt$mat11 == "E")
    return(dt)
}

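# Compile participant data for the 2018 SWARM Challenge (Experiment 1): read
# the individual-differences Qualtrics export, rename and recode its columns,
# match responses to platform usernames via the admin lookup, and compute the
# AOMT and matrix reasoning scores.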
compile_parts_2018_SwarmChallengeExp1 = function(path_to_data, instance_name) {
    path = paste0(path_to_data, instance_name, '/QualtricsData/')
    
    ES = fread(paste0(path, "ind_diffs_responses_no_scoring.csv"))
    
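    # Assign descriptive column names to the raw Qualtrics export; the order
    # must match the column order of the export file.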
    colnames(ES) = c("startDate",
                     "endDate",
                     "status",
                     "IPaddress",
                     "progress",
                     "duration",
                     "finished",
                     "recordedDate",
                     "responseID",
                     "externalReference",
                     "latitude",
                     "longitude",
                     "distributionChannel",
                     "userLanguage",
                     
                     #"gaveConsent1", # Efficacy of the SWARM platform
                     
                     "yearsIntelAnalyticalExperience",
                     "yearsProfAnalyticalExperience",
                     "describeAnalyticalExperience",
                     "freeTime",
                     
                     "age",
                     "gender",
                     "education",
                     
                     "major1",
                     "major2",
                     "minor1",
                     "minor2",
                     
                     "englishProficiency",
                     "loteProficiency1",
                     "loteProficiencyText1",
                     "loteProficiency2",
                     "loteProficiencyText2",
                     "loteProficiency3",
                     "loteProficiencyText3",
                     "loteProficiency4",
                     "loteProficiencyText4",
                     "loteProficiency5",
                     "loteProficiencyText5",
                     "loteProficiency6",
                     "loteProficiencyText6",
                     
                     "enjoyLogicProbs",
                     "enjoyNumProbs",
                     
                     "exp1", # Math
                     "exp2", # Quant Model
                     "exp3", # Stats
                     "exp4", # Prob
                     "exp5", # Bayse Nets
                     "exp6", # Programming
                     "exp7", # Experimental Design
                     "exp8", # Risk Analysis
                     "exp9", # Forecasting
                     "exp10", # Decision Theory
                     "exp11", # Game Theory
                     "exp12", # SATs
                     "exp13", # Argument Mapping
                     "exp14", # Informal Logic
                     "exp15", # Sys Think
                     "exp16", # Image Analysis
                     "exp17", # Link Analysis
                     "exp18", # Graphic Design
                     "exp19", # Technical Writing
                     
                     "pc35Acomp",
                     "pc57A",
                     "pc14Acomp",
                     "pc60AUB",
                     "pc5AUB",
                     
                     "pr1",
                     "pr2",
                     "pr3",
                     "pr4",
                     "pr5",
                     "pr6",
                     "pr7",
                     "pr8",
                     "pr9",
                     "pr10",
                     "pr11",
                     "pr12",
                     "pr13",
                     "pr14",
                     "pr15",
                     "pr16",
                     
                     "mat1",
                     "mat2",
                     "mat3",
                     "mat4",
                     "mat5",
                     "mat6",
                     "mat7",
                     "mat8",
                     "mat9",
                     "mat10",
                     "mat11",
                     
                     "pc60Acomp",
                     "pc5Acomp",
                     "pc35A",
                     "pc57Acomp",
                     "pc14B",
                     
                     "crt1",
                     "crt2",
                     "crt3",
                     "crt4",
                     "crt5",
                     "crt6",
                     "crtSeenBefore",
                     
                     "aomt1",
                     "aomt2",
                     "aomt3",
                     "aomt4",
                     "aomt5",
                     "aomt6",
                     "aomt7",
                     "aomt8",
                     "aomt9",
                     "aomt10",
                     "aomt11",
                     
                     "bfi1",
                     "bfi2",
                     "bfi3",
                     "bfi4",
                     "bfi5",
                     "bfi6",
                     "bfi7",
                     "bfi8",
                     "bfi9",
                     "bfi10",
                     
                     "toa1",
                     "toa2",
                     "toa3",
                     "toa4",
                     "toa5",
                     "toa6",
                     "toa7",
                     "toa8",
                     "toa9",
                     "toa10",
                     "toa11",
                     "toa12",
                     "toa13",
                     "toa14",
                     "toa15",
                     "toa16",
                     
                     "pc14A",
                     "pc57AUB",
                     "pc5B",
                     "pc35AUB",
                     "pc60A",
                     
                     "rmePractice",
                     "rme1",
                     "rme2",
                     "rme3",
                     "rme4",
                     "rme5",
                     "rme6",
                     "rme7",
                     "rme8",
                     "rme9",
                     "rme10",
                     "rme11",
                     "rme12",
                     "rme13",
                     "rme14",
                     "rme15",
                     "rme16",
                     "rme17",
                     "rme18",
                     "rme19",
                     "rme20",
                     "rme21",
                     "rme22",
                     "rme23",
                     "rme24",
                     "rme25",
                     "rme26",
                     "rme27",
                     "rme28",
                     "rme29",
                     "rme30",
                     "rme31",
                     "rme32",
                     "rme33",
                     "rme34",
                     "rme35",
                     "rme36",
                     
                     "pc60B",
                     "pc57B",
                     "pc14AUB",
                     "pc5A",
                     "pc35B"
    )
    
    colsToRemove = c("startDate",
                     "endDate",
                     "status",
                     "IPaddress",
                     "duration",
                     "recordedDate",
                     "externalReference",
                     "latitude",
                     "longitude",
                     "distributionChannel",
                     "userLanguage"
    )
    ES = ES[,!..colsToRemove]
    
    ES[ES == ""] <- NA
    
    # ES[ES == "Not familiar with this domain"] <- 1
    # ES[ES == "Studied in school, but don't use it"] <- 2
    # ES[ES == "Use this knowledge occasionally"] <- 3
    # ES[ES == "Use this knowledge regularly"] <- 4
    # ES[ES == "I am a recognized expert"] <- 5
    # ES[ES == "I am an international authority"] <- 6
    
    # pcQs = c("pc5A",
    #          "pc5B",
    #          "pc5Acomp",
    #          "pc5AUB",
    #          "pc14A",
    #          "pc14B",
    #          "pc14Acomp",
    #          "pc14AUB",
    #          "pc35A",
    #          "pc35B",
    #          "pc35Acomp",
    #          "pc35AUB",
    #          "pc57A",
    #          "pc57B",
    #          "pc57Acomp",
    #          "pc57AUB",
    #          "pc60A",
    #          "pc60B",
    #          "pc60Acomp",
    #          "pc60AUB"
    # )
    
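    # Bin exact ages into age groups.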
    ES[age >= 18 & age <= 25, agegroup := "18-25"]
    ES[age >= 26 & age <= 35, agegroup := "26-35"]
    ES[age >= 36 & age <= 45, agegroup := "36-45"]
    ES[age >= 46 & age <= 55, agegroup := "46-55"]
    ES[age >= 56 & age <= 65, agegroup := "56-65"]
    ES[age >= 66, agegroup := "over 65"]
    
    # Re-encode AOMT and BFI questions.
    # ES[ES == "Strongly Disagree"] <- 1
    # ES[ES == "Strongly disagree"] <- 1
    # ES[ES == "Disagree"] <- 2
    # ES[ES == "Somewhat disagree"] <- 3
    # ES[ES == "Neither agree nor disagree"] <- 4
    # ES[ES == "Somewhat agree"] <- 5
    # ES[ES == "Agree"] <- 6
    # ES[ES == "Strongly agree"] <- 7
    
    
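    # Recode the matrix reasoning items from numeric response codes to answer
    # letters (assumed Qualtrics coding: 1 = A, 4 = B, 5 = C, 6 = D, 7 = E, 8 = F).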
    for (cl in paste0('mat', 1:11)) {
        ES[[cl]] = as.character(ES[[cl]])
        ES[get(cl) == 1, (cl) := "A"]
        ES[get(cl) == 4, (cl) := "B"]
        ES[get(cl) == 5, (cl) := "C"]
        ES[get(cl) == 6, (cl) := "D"]
        ES[get(cl) == 7, (cl) := "E"]
        ES[get(cl) == 8, (cl) := "F"]
    }
    
    
    ES$finished = (ES$finished == 1)
    
    ES$user = NA
    ES$type = NA
    
    # Reorder columns.
    ES = ES[,.(
        responseID,
        progress,
        finished,
        type,
        
        user,
        
        age,
        agegroup,
        gender,
        education,

        major1,
        major2,
        minor1,
        minor2,
        
        yearsIntelAnalyticalExperience,
        yearsProfAnalyticalExperience,
        describeAnalyticalExperience,
        freeTime,
        
        englishProficiency,
        loteProficiency1,
        loteProficiencyText1,
        loteProficiency2,
        loteProficiencyText2,
        loteProficiency3,
        loteProficiencyText3,
        loteProficiency4,
        loteProficiencyText4,
        loteProficiency5,
        loteProficiencyText5,
        loteProficiency6,
        loteProficiencyText6,
         
        enjoyLogicProbs,
        enjoyNumProbs,
         
        exp1,
        exp2,
        exp3,
        exp4,
        exp5,
        exp6,
        exp7,
        exp8,
        exp9,
        exp10,
        exp11,
        exp12,
        exp13,
        exp14,
        exp15,
        exp16,
        exp17,
        exp18,
        exp19,
         
        pc5A,
        pc5B,
        pc5Acomp,
        pc5AUB,
        pc14A,
        pc14B,
        pc14Acomp,
        pc14AUB,
        pc35A,
        pc35B,
        pc35Acomp,
        pc35AUB,
        pc57A,
        pc57B,
        pc57Acomp,
        pc57AUB,
        pc60A,
        pc60B,
        pc60Acomp,
        pc60AUB,
         
        pr1,
        pr2,
        pr3,
        pr4,
        pr5,
        pr6,
        pr7,
        pr8,
        pr9,
        pr10,
        pr11,
        pr12,
        pr13,
        pr14,
        pr15,
        pr16,
         
        mat1,
        mat2,
        mat3,
        mat4,
        mat5,
        mat6,
        mat7,
        mat8,
        mat9,
        mat10,
        mat11,
         
        crt1,
        crt2,
        crt3,
        crt4,
        crt5,
        crt6,
        crtSeenBefore,
         
        aomt1,
        aomt2,
        aomt3,
        aomt4,
        aomt5,
        aomt6,
        aomt7,
        aomt8,
        aomt9,
        aomt10,
        aomt11,
         
        bfi1,
        bfi2,
        bfi3,
        bfi4,
        bfi5,
        bfi6,
        bfi7,
        bfi8,
        bfi9,
        bfi10,
         
        toa1,
        toa2,
        toa3,
        toa4,
        toa5,
        toa6,
        toa7,
        toa8,
        toa9,
        toa10,
        toa11,
        toa12,
        toa13,
        toa14,
        toa15,
        toa16,
         
        rme1,
        rme2,
        rme3,
        rme4,
        rme5,
        rme6,
        rme7,
        rme8,
        rme9,
        rme10,
        rme11,
        rme12,
        rme13,
        rme14,
        rme15,
        rme16,
        rme17,
        rme18,
        rme19,
        rme20,
        rme21,
        rme22,
        rme23,
        rme24,
        rme25,
        rme26,
        rme27,
        rme28,
        rme29,
        rme30,
        rme31,
        rme32,
        rme33,
        rme34,
        rme35,
        rme36
    )]
    
    # Populate user and type from the admin response-to-username lookup.
    lookup = fread(paste0(path_to_data, instance_name, '/AdminData/match_ind_diffs_response_to_swarm_username.csv'))
    for (k in 1:nrow(ES)) {
        if (ES$responseID[k] %in% lookup$IDS_ResponseId) {
            i = which(lookup$IDS_ResponseId == ES$responseID[k])
            ES$user[k] = lookup$username[i]
            ES$type[k] = lookup$Type.x[i]
        }
    }
    
    ES$user = tolower(ES$user)
    
    ES <- ES[ES$finished]
    ES = ES[!is.na(ES$user)]
    ES = as.data.frame(ES)
    
    # Compute AOMT construct.
    ES = computeAOMT(ES)
    
    # Compute matrix reasoning score.
    ES = computeMatrixReasoning(ES)
    
    return(setDT(ES))
}

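# Compile participant data for the 2020 Hunt Challenge: read the public and
# organisation entry/exit Qualtrics exports, rename and recode their columns,
# merge entry and exit responses by user, and compute the AOMT score.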
compile_parts_2020_HuntChallenge = function(path_to_data, instance_name) {
    path = paste0(path_to_data, instance_name, '/QualtricsData/')
    
    entrySurveyPub = fread(paste0(path, "HC2020_EntrySurvey_Public.csv"))
    entrySurveyOrg = fread(paste0(path, "HC2020_EntrySurvey_Organisations.csv"))
    exitSurveyPub = fread(paste0(path, "HC2020_ExitSurvey_Public.csv"))
    exitSurveyOrg = fread(paste0(path, "HC2020_ExitSurvey_Organisations.csv"))
    
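    # Qualtrics CSV exports carry two extra header rows (question text and
    # import IDs); drop them.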
    entrySurveyPub = entrySurveyPub[3:nrow(entrySurveyPub)]
    entrySurveyOrg = entrySurveyOrg[3:nrow(entrySurveyOrg)]
    exitSurveyPub = exitSurveyPub[3:nrow(exitSurveyPub)]
    exitSurveyOrg = exitSurveyOrg[3:nrow(exitSurveyOrg)]
    
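    # Keep only the columns that are renamed below; the public exit survey
    # export carries extra trailing columns.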
    exitSurveyPub = exitSurveyPub[,1:101]
    
    colnames(entrySurveyPub) <- c("startDate",
                                  "endDate",
                                  "status",
                                  "IPaddress",
                                  "progress",
                                  "duration",
                                  "finished",
                                  "recordedDate",
                                  "responseID",
                                  "recipientLastName",
                                  "recipientFirstName",
                                  "recipientEmail",
                                  "externalReference",
                                  "latitude",
                                  "longitude",
                                  "distributionChannel",
                                  "userLanguage",
                                  
                                  "reservedPlace",
                                  "gaveConsent2", # Problem Solving in Online Groups
                                  "nickname",
                                  "email",
                                  "agreedToTerms",
                                  
                                  "interests",
                                  "interestsOtherInput",
                                  
                                  "enExpct1", # InterestingProblems
                                  "enExpct2", # TimeCommitment
                                  "enExpct3", # DifficultProblems
                                  "enExpct4", # LearnSkills
                                  "enExpct5", # AchievableProblems
                                  "enExpct6", # ProductivePlatform
                                  "enExpct7", # AnalyticalTraining
                                  "enExpct8", # PositiveExperience
                                  "enExpct9", # EffectiveCollaboration
                                  "enExpct10", # ApplicableToWork
                                  
                                  "pri1", # BenchmarkTeam
                                  "pri2", # TestSkills
                                  "pri3", # Fun
                                  "pri4", # DevelopSkills
                                  "pri5", # LearnPlatform
                                  "pri6", # LearnCA
                                  "pri7", # NewCollaborationStyle
                                  "pri8", # ResearchContribution
                                  "pri9", # SuperTeam
                                  "pri10", # LensKit
                                  "pri11", # Crowdsource
                                  "priOther", # Other
                                  "priOtherInput",
                                  
                                  "aomt1",
                                  "aomt2",
                                  "aomt3",
                                  "aomt4",
                                  "aomt5",
                                  "aomt6",
                                  "aomt7",
                                  "aomt8",
                                  "aomt9",
                                  "aomt10",
                                  "aomt11",
                                  
                                  "agegroup",
                                  "gender",
                                  "occupation",
                                  "education",
                                  "studyarea",
                                  "studyareaOtherInput",
                                  "yearsWorkExperience",
                                  "typeAnalyticalExperience",
                                  "yearsAnalyticalExperience",
                                  
                                  "enCap1", # ReportWriting
                                  "enCap2", # UsingSATs
                                  "enCap3", # OSINT
                                  "enCap4", # Frameworks
                                  "enCap5", # Assumptions
                                  "enCap6", # EvaluatingQoR
                                  "enCap7", # DecisionMaking
                                  
                                  "hasMultidisciplinaryExperience",
                                  "multidisciplinaryExperienceInput",
                                  
                                  "user",
                                  "team"
    )
    
    colnames(entrySurveyOrg) <- c("startDate",
                                  "endDate",
                                  "status",
                                  "IPaddress",
                                  "progress",
                                  "duration",
                                  "finished",
                                  "recordedDate",
                                  "responseID",
                                  "recipientLastName",
                                  "user",
                                  "recipientEmail",
                                  "externalReference",
                                  "latitude",
                                  "longitude",
                                  "distributionChannel",
                                  "userLanguage",
                                  
                                  "gaveConsent1", # Problem Solving in Online Groups
                                  "gaveConsent3", # Identifying and Rating Quality of Reasoning
                                  
                                  "interests",
                                  "interestsOtherInput",
                                  
                                  "enExpct1", # InterestingProblems
                                  "enExpct2", # TimeCommitment
                                  "enExpct3", # DifficultProblems
                                  "enExpct4", # LearnSkills
                                  "enExpct5", # AchievableProblems
                                  "enExpct6", # ProductivePlatform
                                  "enExpct7", # AnalyticalTraining
                                  "enExpct8", # PositiveExperience
                                  "enExpct9", # EffectiveCollaboration
                                  "enExpct10", # ApplicableToWork
                                  
                                  "pri1", # BenchmarkTeam
                                  "pri2", # TestSkills
                                  "pri3", # Fun
                                  "pri4", # DevelopSkills
                                  "pri5", # LearnPlatform
                                  "pri6", # LearnCA
                                  "pri7", # NewCollaborationStyle
                                  "pri8", # ResearchContribution
                                  "pri9", # SuperTeam
                                  "pri10", # LensKit
                                  "pri11", # Crowdsource
                                  "priOther", # Other
                                  "priOtherInput",
                                  
                                  "aomt1",
                                  "aomt2",
                                  "aomt3",
                                  "aomt4",
                                  "aomt5",
                                  "aomt6",
                                  "aomt7",
                                  "aomt8",
                                  "aomt9",
                                  "aomt10",
                                  "aomt11",
                                  
                                  "agegroup",
                                  "gender",
                                  "occupation",
                                  "education",
                                  "studyarea",
                                  "studyareaOtherInput",
                                  "yearsWorkExperience",
                                  "typeAnalyticalExperience",
                                  "yearsAnalyticalExperience",
                                  
                                  "enCap1", # ReportWriting
                                  "enCap2", # UsingSATs
                                  "enCap3", # OSINT
                                  "enCap4", # Frameworks
                                  "enCap5", # Assumptions
                                  "enCap6", # EvaluatingQoR
                                  "enCap7", # DecisionMaking
                                  
                                  "hasMultidisciplinaryExperience",
                                  "multidisciplinaryExperienceInput"
    )
    
    colnames(exitSurveyOrg) = c(
        "startDate",
        "endDate",
        "status",
        "IPaddress",
        "progress",
        "duration",
        "finished",
        "recordedDate",
        "responseID",
        "recipientLastName",
        "user",
        "recipientEmail",
        "externalReference",
        "latitude",
        "longitude",
        "distributionChannel",
        "userLanguage",
        "recaptcha",
        
        "username",
        "starRating",
        "timeWellSpent",
        "bestThing",
        "worstThing",
        
        "rate1", # onboarding process
        "rate2", # our communication
        "rate3", # the training
        "rate4", # the feedback
        "rate5", # the help center
        
        "exExpct1", # interesting problems
        "exExpct2", # reasonable time commitment
        "exExpct3", # difficult problems
        "exExpct4", # learning new skills and tools
        "exExpct5", # achievable problems
        "exExpct6", # platform will be productive work space
        "exExpct7", # training in OSINT tools
        "exExpct8", # positive team working experience
        "exExpct9", # effective collaboration, compared to normal methods
        "exExpct10", # can be applied in my workplace
        
        "hoursPerWeek",
        "enoughTime",
        "proportionOwnTime",
        
        "tw1", # enjoyed the team social experience
        "tw2", # enjoyed the team collaboration experience
        "tw3", # could positively contribute
        "tw4", # efforts recognised
        "tw5", # easy to keep track
        "tw6", # too dominant
        "tw7", # had to lead
        
        "pf1", # shared mission
        "pf2", # unified goal
        "pf3", # managing contributions
        "pf4", # keeping track
        "pf5", # flexible and agile
        "pf6", # innovative problem solving
        "pf7", # enabling efficient workflow
        "pf8", # meeting deadlines
        "pf9", # clear communication
        "pf10", # engaging/disengaging
        "pf11", # information sharing
        "pf12", # working together positively
        "pf13", # making decisions
        "pf14", # production of useful output
        "pfComments",
        
        "fb1", # accurately reflected
        "fb2", # understand strengths and weeknesses
        "fb3", # build expertise
        "fb4", # used feedback
        
        "exCap1", # analytic report writing
        "exCap2", # using SATs
        "exCap3", # using OSINT tools
        "exCap4", # applying strategic thinking
        "exCap5", # identifying and analysing assumptions
        "exCap6", # evaluating QoR
        "exCap7", # using decision making frameworks
        
        "mostValuable",
        "cha1", # more effective
        "cha2", # more engaging
        "cha3", # some people but not others
        
        "career1",
        "career2",
        
        "ca1", # understood CA
        "ca2", # used CA
        "ca3", # understoodimproved QoR
        "ca4", # intend to apply
        
        "swarm1", # better reasoned reports
        "swarm2", # team > individual
        "swarm3", # would improve intelligence analysis in org
        "swarm4", # would use it
        
        "lk1", # easy to find relevant tools
        "lk2", # tools were well explained
        "lk3", # was helpful
        "lk4", # preferred existing tools
        "lk5", # difficult to navigate
        "lk6", # used it often
        "lkSuggestions",
        
        "responseStatements",
        
        "ratingTool",
        "ratingToolWhyNot",
        "ratingToolPurpose",
        
        "featureRequests",
        "externalTools",
        "externalToolsComments",
        
        "bestQuestionNotAsked",
        
        "testimonial",
        "otherComments"
    )
    
    colnames(exitSurveyPub) = c(
        "startDate",
        "endDate",
        "status",
        "IPaddress",
        "progress",
        "duration",
        "finished",
        "recordedDate",
        "responseID",
        "recipientLastName",
        "recipientFirstName",
        "recipientEmail",
        "externalReference",
        "latitude",
        "longitude",
        "distributionChannel",
        "userLanguage",
        "recaptcha",
        
        "username",
        "starRating",
        "timeWellSpent",
        "bestThing",
        "worstThing",
        
        "rate1", # onboarding process
        "rate2", # our communication
        "rate3", # the training
        "rate4", # the feedback
        "rate5", # the help center
        
        "exExpct1", # interesting problems
        "exExpct2", # reasonable time commitment
        "exExpct3", # difficult problems
        "exExpct4", # learning new skills and tools
        "exExpct5", # achievable problems
        "exExpct6", # platform will be productive work space
        "exExpct7", # training in OSINT tools
        "exExpct8", # positive team working experience
        "exExpct9", # effective collaboration, compared to normal methods
        "exExpct10", # can be applied in my workplace
        
        # "hoursPerWeek",
        # "enoughTime",
        # "proportionOwnTime",
        
        "tw1", # enjoyed the team social experience
        "tw2", # enjoyed the team collaboration experience
        "tw3", # could positively contribute
        "tw4", # efforts recognised
        "tw5", # easy to keep track
        "tw6", # too dominant
        "tw7", # had to lead
        
        "pf1", # shared mission
        "pf2", # unified goal
        "pf3", # managing contributions
        "pf4", # keeping track
        "pf5", # flexible and agile
        "pf6", # innovative problem solving
        "pf7", # enabling efficient workflow
        "pf8", # meeting deadlines
        "pf9", # clear communication
        "pf10", # engaging/disengaging
        "pf11", # information sharing
        "pf12", # working together positively
        "pf13", # making decisions
        "pf14", # production of useful output
        "pfComments",
        
        "fb1", # accurately reflected
        "fb2", # understand strengths and weeknesses
        "fb3", # build expertise
        "fb4", # used feedback
        
        "exCap1", # analytic report writing
        "exCap2", # using SATs
        "exCap3", # using OSINT tools
        "exCap4", # applying strategic thinking
        "exCap5", # identifying and analysing assumptions
        "exCap6", # evaluating QoR
        "exCap7", # using decision making frameworks
        
        "mostValuable",
        "cha1", # more effective
        "cha2", # more engaging
        "cha3", # some people but not others
        
        "career1",
        
        "ca1", # understood CA
        "ca2", # used CA
        "ca3", # understoodimproved QoR
        "ca4", # intend to apply
        
        "swarm1", # better reasoned reports
        "swarm2", # team > individual
        "swarm3", # would improve intelligence analysis in org
        "swarm4", # would use it
        
        "lk1", # easy to find relevant tools
        "lk2", # tools were well explained
        "lk3", # was helpful
        "lk4", # preferred existing tools
        "lk5", # difficult to navigate
        "lk6", # used it often
        "lkSuggestions",
        
        "responseStatements",
        
        "ratingTool",
        "ratingToolWhyNot",
        "ratingToolPurpose",
        
        "featureRequests",
        "externalTools",
        "externalToolsComments",
        
        "bestQuestionNotAsked",
        
        "testimonial",
        "otherComments"
    )
    
    entrySurveyPub$isOrg = FALSE
    entrySurveyOrg$isOrg = TRUE
    exitSurveyPub$isOrg = FALSE
    exitSurveyOrg$isOrg = TRUE
    
    
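    # Stack public and organisation respondents into single entry (ES) and
    # exit (ExS) survey tables.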
    ES = rbind(entrySurveyPub,
               entrySurveyOrg,
               use.names = TRUE,
               fill = TRUE)
    ExS = rbind(exitSurveyPub,
                exitSurveyOrg,
                use.names = TRUE,
                fill = TRUE)
    
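    # The public exit survey lacks the platform username, so fill it from the
    # matching entry-survey response (matched on recipient email).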
    for (k in 1:nrow(ExS)) {
        if (!ExS$isOrg[k]) {
            ExS$user[k] = as.character(ES[recipientEmail == ExS$recipientEmail[k]]$user[1])
        }
    }
    
    colsToRemove <- c("startDate",
                      "endDate",
                      "status",
                      "IPaddress",
                      "duration",
                      "recordedDate",
                      "responseID",
                      "recipientLastName",
                      "recipientFirstName",
                      "recipientEmail",
                      "externalReference",
                      "latitude",
                      "longitude",
                      "distributionChannel",
                      "userLanguage",
                      "reservedPlace",
                      "nickname",
                      "email",
                      "team"
    )
    ES = ES[,!..colsToRemove]
    
    colsToRemove <- c("startDate",
                      "endDate",
                      "status",
                      "IPaddress",
                      "duration",
                      "recordedDate",
                      "responseID",
                      "recipientLastName",
                      "recipientFirstName",
                      "recipientEmail",
                      "externalReference",
                      "latitude",
                      "longitude",
                      "distributionChannel",
                      "userLanguage",
                      "recaptcha",
                      "username",
                      "career2"
    )
    ExS = ExS[,!..colsToRemove]
    
    ES[ES == ""] <- NA
    
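    # Recode categorical text responses in the entry survey to numeric codes.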
    for (cl in paste0('pri', 1:11)) {
        ES[get(cl) == "Not important", (cl) := 1]
        ES[get(cl) == "Neutral", (cl) := 2]
        ES[get(cl) == "Important", (cl) := 3]
    }
    
    ES[ES == "I don't expect this"] <- 1
    ES[ES == "Neutral"] <- 2
    ES[ES == "I do expect this"] <- 3
    
    # ES[ES == "Not important"] <- 1
    # ES[ES == "Important"] <- 2
    
    ES[ES == "Strongly Disagree"] <- 1
    ES[ES == "Strongly disagree"] <- 1
    ES[ES == "Disagree"] <- 2
    ES[ES == "Somewhat disagree"] <- 3
    ES[ES == "Neither agree nor disagree"] <- 4
    ES[ES == "Somewhat agree"] <- 5
    ES[ES == "Agree"] <- 6
    ES[ES == "Strongly agree"] <- 7
    for (i in 1:11) {
        ES[[paste0("aomt", i)]] = as.numeric(ES[[paste0("aomt", i)]])
    }
    
    ES[ES == "prefer not to say"] <- "Prefer not to say"
    ES[ES == "Other (please describe)"] <- "Other"
    
    ES[ES == "None"] <- 1
    ES[ES == "Low"] <- 2
    ES[ES == "Moderate"] <- 3
    ES[ES == "High"] <- 4
    
    ES[,`:=`(progress = as.numeric(progress),
             finished = (finished == "True"),
             gaveConsent2 = (gaveConsent2 == "Yes"),
             gaveConsent3 = (gaveConsent3 == "Yes"),
             agreedToTerms = (nchar(agreedToTerms) > 10),
             interests = lapply(strsplit(interests, ","), trimws),
             interestsOtherInput = ifelse(interestsOtherInput == "", NA, interestsOtherInput),
             studyarea = trimws(studyarea),
             hasMultidisciplinaryExperience = (hasMultidisciplinaryExperience == "Yes")
    )]
    
    # Separate out interests into their own columns.
    suppressWarnings(ES[,`:=`(
        int1 = stringr::str_detect(interests, "What the problems will be like"),
        int2 = stringr::str_detect(interests, "Platform functionality"),
        int3 = stringr::str_detect(interests, "How reports are created"),
        int4 = stringr::str_detect(interests, "Team collaboration experience"),
        int5 = stringr::str_detect(interests, "The structured training available"),
        int6 = stringr::str_detect(interests, "The tools in the Lens Kit"),
        int7 = stringr::str_detect(interests, "The Contending Analyses methodology"),
        int8 = stringr::str_detect(interests, "The evaluation methods"),
        int9 = stringr::str_detect(interests, "How my team performs"),
        int10 = stringr::str_detect(interests, "Whether the public do as well as the professionals"),
        int11 = stringr::str_detect(interests, "Other")
    )])
    
    # Separate out analytical experience into its own columns.
    ES[,`:=`(
        ae1 = stringr::str_detect(typeAnalyticalExperience, "No direct experience"),
        ae2 = stringr::str_detect(typeAnalyticalExperience, "Yes, in an intelligence or related field"),
        ae3 = stringr::str_detect(typeAnalyticalExperience, "Yes, in a scientific field"),
        ae4 = stringr::str_detect(typeAnalyticalExperience, "Yes, in another field"),
        ae5 = stringr::str_detect(typeAnalyticalExperience, "Prefer not to say")
    )]
    
    colsToRemove = c("interests",
                     "typeAnalyticalExperience")
    ES = ES[,!..colsToRemove]
    
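    # Recode categorical text responses in the exit survey to numeric codes.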
    ExS[ExS == ""] <- NA
    ExS[timeWellSpent == "No", timeWellSpent := 1]
    ExS[timeWellSpent == "Unsure", timeWellSpent := 2]
    ExS[timeWellSpent == "Yes", timeWellSpent := 3]
    
    for (cl in paste0('rate', 1:5)) {
        ExS[get(cl) == "Poor", (cl) := 1]
        ExS[get(cl) == "Average", (cl) := 2]
        ExS[get(cl) == "Good", (cl) := 3]
    }
    
    for (cl in paste0('exExpct', 1:10)) {
        ExS[get(cl) == "Below", (cl) := 1]
        ExS[get(cl) == "Below ", (cl) := 1]
        ExS[get(cl) == "Met", (cl) := 2]
        ExS[get(cl) == "Exceeded", (cl) := 3]
        ExS[get(cl) == "I had no expectations", (cl) := 4]
    }
    
    for (cl in paste0('tw', 1:7)) {
        ExS[get(cl) == "Disagree", (cl) := 1]
        ExS[get(cl) == "Neutral", (cl) := 2]
        ExS[get(cl) == "Agree", (cl) := 3]
    }
    
    for (cl in paste0('pf', 1:14)) {
        ExS[get(cl) == "No", (cl) := 1]
        ExS[get(cl) == "Yes", (cl) := 2]
        ExS[get(cl) == "Not sure", (cl) := 3]
    }
    
    for (cl in paste0('fb', 1:4)) {
        ExS[get(cl) == "Disagree", (cl) := 1]
        ExS[get(cl) == "Neutral", (cl) := 2]
        ExS[get(cl) == "Agree", (cl) := 3]
    }
    
    for (cl in paste0('exCap', 1:7)) {
        ExS[get(cl) == "No", (cl) := 1]
        ExS[get(cl) == "Somewhat", (cl) := 2]
        ExS[get(cl) == "Significantly", (cl) := 3]
    }
    
    for (cl in paste0('cha', 1:3)) {
        ExS[get(cl) == "Disagree", (cl) := 1]
        ExS[get(cl) == "Neutral", (cl) := 2]
        ExS[get(cl) == "Agree", (cl) := 3]
    }
    
    for (cl in paste0('career', 1)) {
        ExS[get(cl) == "No change", (cl) := 1]
        ExS[get(cl) == "Not changed", (cl) := 1]
        ExS[get(cl) == "No change\t", (cl) := 1]
        ExS[get(cl) == "Positively\t", (cl) := 2]
        ExS[get(cl) == "Increased", (cl) := 2]
    }
    
    for (cl in paste0('ca', 1:4)) {
        ExS[get(cl) == "No", (cl) := 1]
        ExS[get(cl) == "Yes", (cl) := 2]
        ExS[get(cl) == "Not Sure", (cl) := 3]
    }
    
    for (cl in paste0('swarm', 1:4)) {
        ExS[get(cl) == "Strongly disagree", (cl) := 1]
        ExS[get(cl) == "Disagree", (cl) := 2]
        ExS[get(cl) == "Neutral", (cl) := 3]
        ExS[get(cl) == "Agree", (cl) := 4]
        ExS[get(cl) == "Strongly agree", (cl) := 5]
    }
    
    for (cl in paste0('lk', 1:6)) {
        ExS[get(cl) == "Disagree", (cl) := 1]
        ExS[get(cl) == "Neutral", (cl) := 2]
        ExS[get(cl) == "Agree", (cl) := 3]
    }
    
    ExS[ratingTool == "No", ratingTool := 1]
    ExS[ratingTool == "Yes", ratingTool := 2]
    
    ExS[externalTools == "No", externalTools := 1]
    ExS[externalTools == "Yes", externalTools := 2]
    
    ExS[enoughTime == "No", enoughTime := 1]
    ExS[enoughTime == "Yes", enoughTime := 2]
    
    ExS[,`:=`(progress = as.numeric(progress),
              finished = (finished == "True"),
              starRating = as.integer(substr(starRating,1,1)),
              responseStatements = lapply(strsplit(responseStatements, ","), trimws),
              ratingToolPurpose = lapply(strsplit(ratingToolPurpose, ","), trimws)
    )]
    
    # Separate out responseStatements into their own columns.
    suppressWarnings(ExS[,`:=`(
        res1 = stringr::str_detect(responseStatements, "The resources were an important contribution to problem solving"),
        res2 = stringr::str_detect(responseStatements, "It was easy to keep track of all the resources posted on the Platform"),
        res3 = stringr::str_detect(responseStatements, "My team created a lot of resources"),
        res4 = stringr::str_detect(responseStatements, "It was too time consuming to read through everyone's resources.")
    )])
    
    # Separate out rating tool purposes into their own columns.
    suppressWarnings(ExS[,`:=`(
        whyRate1 = stringr::str_detect(ratingToolPurpose, "I used rating to fairly indicate the readiness or quality of a report"),
        whyRate2 = stringr::str_detect(ratingToolPurpose, "I used rating to give guidance to the author"),
        whyRate3 = stringr::str_detect(ratingToolPurpose, "I used rating to push my prefered report to the top")
    )])
    
    colsToRemove = c("responseStatements",
                     "ratingToolPurpose")
    ExS = ExS[,!..colsToRemove]
    
    
    # REORDER COLUMNS
    ES = ES[, .(
        progress,
        finished,
        gaveConsent2,
        gaveConsent3,
        agreedToTerms,
        user,
        agegroup,
        gender,
        occupation,
        isOrg,
        education,
        studyarea,
        studyareaOtherInput,
        int1,
        int2,
        int3,
        int4,
        int5,
        int6,
        int7,
        int8,
        int9,
        int10,
        int11,
        interestsOtherInput,
        enExpct1,
        enExpct2,
        enExpct3,
        enExpct4,
        enExpct5,
        enExpct6,
        enExpct7,
        enExpct8,
        enExpct9,
        enExpct10,
        pri1,
        pri2,
        pri3,
        pri4,
        pri5,
        pri6,
        pri7,
        pri8,
        pri9,
        pri10,
        pri11,
        priOther,
        priOtherInput,
        aomt1,
        aomt2,
        aomt3,
        aomt4,
        aomt5,
        aomt6,
        aomt7,
        aomt8,
        aomt9,
        aomt10,
        aomt11,
        enCap1,
        enCap2,
        enCap3,
        enCap4,
        enCap5,
        enCap6,
        enCap7,
        yearsWorkExperience,
        yearsAnalyticalExperience,
        ae1,
        ae2,
        ae3,
        ae4,
        ae5,
        hasMultidisciplinaryExperience,
        multidisciplinaryExperienceInput
    )]
    
    ExS = ExS[, .(
        progress,
        finished,
        isOrg,
        user,
        starRating,
        timeWellSpent,
        bestThing,
        worstThing,
        
        hoursPerWeek,
        enoughTime,
        proportionOwnTime,
        
        rate1,
        rate2,
        rate3,
        rate4,
        rate5,
        
        exExpct1,
        exExpct2,
        exExpct3,
        exExpct4,
        exExpct5,
        exExpct6,
        exExpct7,
        exExpct8,
        exExpct9,
        exExpct10,
        
        tw1,
        tw2,
        tw3,
        tw4,
        tw5,
        tw6,
        tw7,
        
        pf1,
        pf2,
        pf3,
        pf4,
        pf5,
        pf6,
        pf7,
        pf8,
        pf9,
        pf10,
        pf11,
        pf12,
        pf13,
        pf14,
        pfComments,
        
        fb1,
        fb2,
        fb3,
        fb4,
        
        exCap1,
        exCap2,
        exCap3,
        exCap4,
        exCap5,
        exCap6,
        exCap7,
        
        mostValuable,
        
        cha1,
        cha2,
        cha3,
        
        career1,
        
        ca1,
        ca2,
        ca3,
        ca4,
        
        swarm1,
        swarm2,
        swarm3,
        swarm4,
        
        lk1,
        lk2,
        lk3,
        lk4,
        lk5,
        lk6,
        lkSuggestions,
        
        res1,
        res2,
        res3,
        res4,
        
        whyRate1,
        whyRate2,
        whyRate3,
        
        ratingTool,
        ratingToolWhyNot,
        featureRequests,
        externalTools,
        externalToolsComments,
        bestQuestionNotAsked,
        testimonial,
        otherComments
    )]

    ES <- ES[finished & ((gaveConsent3 == TRUE) | is.na(gaveConsent3))]
    ES = ES[!is.na(ES$user)]
    
    ExS <- ExS[(finished)]
    ExS = ExS[!is.na(ExS$user)]
    
    ES = merge(ES, ExS, by = c("user", "isOrg"), all = T)
    ES = as.data.frame(ES)
    
    ES$user = tolower(ES$user)
    
    colsToRemove = c('finished.x', 'progress.x',
                     'finished.y', 'progress.y')
    for (cl in colsToRemove) {
        ES[[cl]] = NULL
    }
    
    # Compute AOMT construct.
    ES = computeAOMT(ES)
    
    return(setDT(ES))
}

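# Compile participant data for the 2020 Psychology Capstone: read the entry
# survey, its supplement, and the exit survey (de-identified Qualtrics exports)
# and rename their columns.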
compile_parts_2020_PsychologyCapstone = function(path_to_data, instance_name) {
    path = paste0(path_to_data, instance_name, '/QualtricsData/')
    
    entrySurvey = fread(paste0(path, "EntrySurvey-noPII.csv"))
    entrySurveySupplement = fread(paste0(path, "EntrySurveySupplement-noPII.csv"))
    exitSurvey = fread(paste0(path, "ExitSurvey-noPII.csv"))
    
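    # Drop the two extra Qualtrics header rows, as above.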
    entrySurvey = entrySurvey[3:nrow(entrySurvey)]
    entrySurveySupplement = entrySurveySupplement[3:nrow(entrySurveySupplement)]
    exitSurvey = exitSurvey[3:nrow(exitSurvey)]
    
    colnames(entrySurvey) <- c("startDate",
                                  "endDate",
                                  "status",
                                  "IPaddress",
                                  "progress",
                                  "duration",
                                  "finished",
                                  "recordedDate",
                                  "responseID",
                                  "recipientLastName",
                                  "recipientFirstName",
                                  "recipientEmail",
                                  "externalReference",
                                  "latitude",
                                  "longitude",
                                  "distributionChannel",
                                  "userLanguage",
                                  
                                  "gaveConsent4", # Psychology Capstone consent
                                  "user",
                                  
                                  "age",
                                  "gender",
                                  "education",
                                  
                                  "major1",
                                  "major2",
                                  "minor1",
                                  "minor2",
                                  
                                  "englishProficiency",
                                  "loteProficiency1",
                                  "loteProficiencyText1",
                                  "loteProficiency2",
                                  "loteProficiencyText2",
                                  "loteProficiency3",
                                  "loteProficiencyText3",
                                  "loteProficiency4",
                                  "loteProficiencyText4",
                                  "loteProficiency5",
                                  "loteProficiencyText5",
                                  "loteProficiency6",
                                  "loteProficiencyText6",
                                  
                                  "enjoyLogicProbs",
                                  "enjoyNumProbs",
                                  
                                  "exp1", # Math
                                  "exp2", # Quant Model
                                  "exp3", # Stats
                                  "exp4", # Prob
                                  "exp5", # Bayse Nets
                                  "exp6", # Programming
                                  "exp7", # Experimental Design
                                  "exp8", # Risk Analysis
                                  "exp9", # Forecasting
                                  "exp10", # Decision Theory
                                  "exp11", # Game Theory
                                  "exp12", # SATs
                                  "exp13", # Argument Mapping
                                  "exp14", # Informal Logic
                                  "exp15", # Sys Think
                                  "exp16", # Image Analysis
                                  "exp17", # Link Analysis
                                  "exp18", # Graphic Design
                                  "exp19", # Technical Writing
                                  
                                  "pc35Acomp",
                                  "pc57A",
                                  "pc14Acomp",
                                  "pc60AUB",
                                  "pc5AUB",
                                  
                                  "pr1",
                                  "pr2",
                                  "pr3",
                                  "pr4",
                                  "pr5",
                                  "pr6",
                                  "pr7",
                                  "pr8",
                                  "pr9",
                                  "pr10",
                                  "pr11",
                                  "pr12",
                                  "pr13",
                                  "pr14",
                                  "pr15",
                                  "pr16",
                                  
                                  "mat1",
                                  "mat2",
                                  "mat3",
                                  "mat4",
                                  "mat5",
                                  "mat6",
                                  "mat7",
                                  "mat8",
                                  "mat9",
                                  "mat10",
                                  "mat11",
                                  
                                  "pc60Acomp",
                                  "pc5Acomp",
                                  "pc35A",
                                  "pc57Acomp",
                                  "pc14B",
                                  
                                  "crt1",
                                  "crt2",
                                  "crt3",
                                  "crt4",
                                  "crt5",
                                  "crt6",
                                  "crtSeenBefore",
                                  "crtSeenBeforeText",
                                  
                                  "aomt1",
                                  "aomt2",
                                  "aomt3",
                                  "aomt4",
                                  "aomt5",
                                  "aomt6",
                                  "aomt7",
                                  "aomt8",
                                  "aomt9",
                                  "aomt10",
                                  "aomt11",
                                  
                                  "bfi1",
                                  "bfi2",
                                  "bfi3",
                                  "bfi4",
                                  "bfi5",
                                  "bfi6",
                                  "bfi7",
                                  "bfi8",
                                  "bfi9",
                                  "bfi10",
                                  
                                  "pc14A",
                                  "pc57AUB",
                                  "pc5B",
                                  "pc35AUB",
                                  "pc60A",
                                  
                                  "pc60B",
                                  "pc57B",
                                  "pc14AUB",
                                  "pc5A",
                                  "pc35B",
                                  
                                  "tp1",
                                  "tp2",
                                  "tp3",
                                  "tp4",
                                  "tp5",
                                  "tp6",
                                  "tp7",
                                  "tp8",
                                  "tp9",
                                  "tp10",
                                  "tp11",
                                  "tp12",
                                  "tp13",
                                  "tp14",
                                  "tp15",
                                  "tp16"
    )
    
    colnames(entrySurveySupplement) <- c("startDate",
                               "endDate",
                               "status",
                               "IPaddress",
                               "progress",
                               "duration",
                               "finished",
                               "recordedDate",
                               "responseID",
                               "recipientLastName",
                               "recipientFirstName",
                               "recipientEmail",
                               "externalReference",
                               "latitude",
                               "longitude",
                               "distributionChannel",
                               "userLanguage",
                               
                               "user",
                               
                               "tp1",
                               "tp2",
                               "tp3",
                               "tp4",
                               "tp5",
                               "tp6",
                               "tp7",
                               "tp8",
                               "tp9",
                               "tp10",
                               "tp11",
                               "tp12",
                               "tp13",
                               "tp14",
                               "tp15",
                               "tp16"
    )
    
    # Remove empty trailing columns.
    ES = entrySurvey[,1:(152-16)]
    
    colsToRemove = c("startDate",
                     "endDate",
                     "status",
                     "IPaddress",
                     "duration",
                     "recordedDate",
                     "externalReference",
                     "latitude",
                     "longitude",
                     "distributionChannel",
                     "userLanguage",
                     "responseID",
                     "recipientLastName",
                     "recipientFirstName",
                     "recipientEmail"
    )
    ES = ES[,!..colsToRemove]
    
    ES[ES == ""] <- NA
    
    # Add in missing team perceptions component.
    for (k in 1:nrow(entrySurveySupplement)) {
        ES[user == entrySurveySupplement$user[k],paste0('tp',1:16)] = entrySurveySupplement[k,paste0('tp',1:16)]
    }
    
    colnames(exitSurvey) <- c("startDate",
                             "endDate",
                             "status",
                             "IPaddress",
                             "progress",
                             "duration",
                             "finished",
                             "recordedDate",
                             "responseID",
                             "recipientLastName",
                             "recipientFirstName",
                             "recipientEmail",
                             "externalReference",
                             "latitude",
                             "longitude",
                             "distributionChannel",
                             "userLanguage",
                             
                             "user",
                             "deg",
                             "degOther",
                             "workExp",
                             
                             "tps1",
                             "tps2",
                             "tps3",
                             "tps4",
                             "tps5",
                             "tps6",
                             "tps7",
                             "tps8",
                             "tps9",
                             "tps10",
                             "tps11",
                             "tps12",
                             "tps13",
                             "tps14",
                             "tps15",
                             "tps16",
                             
                             "tmcoh1",
                             "tmcoh2",
                             "tmcoh3",
                             "tmcoh4",
                             
                             "swed1",
                             "swed2",
                             
                             paste0("BFI", 1:60),
                             paste0("jpc", 1:15)
    )
    
    colsToRemove = c("startDate",
                     "endDate",
                     "status",
                     "IPaddress",
                     "duration",
                     "externalReference",
                     "latitude",
                     "longitude",
                     "distributionChannel",
                     "userLanguage",
                     "responseID",
                     "recipientLastName",
                     "recipientFirstName",
                     "recipientEmail"
    )
    ExS = exitSurvey
    ExS = ExS[,!..colsToRemove]
    
    ExS[ExS == ""] <- NA
    
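    # Where a user submitted multiple exit-survey responses, keep only the last one recorded.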
    users = unique(ExS$user)
    rowsToKeep = c()
    for (u in users) {
        temp = which(ExS$user == u)
        rowsToKeep = c(rowsToKeep, temp[length(temp)])
    }
    ExS = ExS[rowsToKeep]
    
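    # Bin exact ages into age-group categories.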
    ES[age >= 18 & age <= 25, agegroup := "18-25"]
    ES[age >= 26 & age <= 35, agegroup := "26-35"]
    ES[age >= 36 & age <= 45, agegroup := "36-45"]
    ES[age >= 46 & age <= 55, agegroup := "46-55"]
    ES[age >= 56 & age <= 65, agegroup := "56-65"]
    ES[age >= 66, agegroup := "over 65"]
    
    ES[,`:=`(progress = as.numeric(progress),
             finished = (finished == "True"),
             gaveConsent4 = (gaveConsent4 == "I am happy to participate")
    )]
    ExS[,`:=`(progress = as.numeric(progress),
             finished = (finished == "True")
    )]
    
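    # Recode entry-survey items (exp, aomt, bfi, tp) from their text labels to numeric codes.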
    for (cl in paste0('exp', 1:19)) {
        ES[get(cl) == "Not familiar with this domain", (cl) := 1]
        ES[get(cl) == "Studied in school, but don't use it", (cl) := 2]
        ES[get(cl) == "Use this knowledge occasionally", (cl) := 3]
        ES[get(cl) == "Use this knowledge regularly", (cl) := 4]
        ES[get(cl) == "I am a recognized expert", (cl) := 5]
        ES[get(cl) == "I am an international authority", (cl) := 6]
        # ES[[cl]] = as.numeric(ES[[cl]])
    }
    
    for (cl in paste0('aomt', 1:11)) {
        ES[get(cl) == "Strongly Disagree", (cl) := 1]
        ES[get(cl) == "Strongly disagree", (cl) := 1]
        ES[get(cl) == "Disagree", (cl) := 2]
        ES[get(cl) == "Somewhat disagree", (cl) := 3]
        ES[get(cl) == "Neither agree nor disagree", (cl) := 4]
        ES[get(cl) == "Somewhat agree", (cl) := 5]
        ES[get(cl) == "Agree", (cl) := 6]
        ES[get(cl) == "Strongly agree", (cl) := 7]
        ES[[cl]] = as.numeric(ES[[cl]])
    }

    for (cl in paste0('bfi', 1:10)) {
        ES[get(cl) == "Strongly Disagree", (cl) := 1]
        ES[get(cl) == "Strongly disagree", (cl) := 1]
        ES[get(cl) == "Disagree", (cl) := 2]
        ES[get(cl) == "Somewhat disagree", (cl) := 3]
        ES[get(cl) == "Neither agree nor disagree", (cl) := 4]
        ES[get(cl) == "Somewhat agree", (cl) := 5]
        ES[get(cl) == "Agree", (cl) := 6]
        ES[get(cl) == "Strongly agree", (cl) := 7]
        # ES[[cl]] = as.numeric(ES[[cl]])
    }
    
    for (cl in paste0('tp', 1:16)) {
        ES[get(cl) == "Strongly disagree", (cl) := 1]
        ES[get(cl) == "Somewhat disagree", (cl) := 2]
        ES[get(cl) == "Neither agree nor disagree", (cl) := 3]
        ES[get(cl) == "Somewhat agree", (cl) := 4]
        ES[get(cl) == "Strongly agree", (cl) := 5]
        # ES[[cl]] = as.numeric(ES[[cl]])
    }
    
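    # Recode exit-survey items (tps, tmcoh, swed, BFI, jpc) from their text labels to numeric codes.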
    for (cl in paste0('tps', 1:16)) {
        ExS[get(cl) == "Strongly disagree", (cl) := 1]
        ExS[get(cl) == "Somewhat disagree", (cl) := 2]
        ExS[get(cl) == "Neither agree nor disagree", (cl) := 3]
        ExS[get(cl) == "Somewhat agree", (cl) := 4]
        ExS[get(cl) == "Strongly agree", (cl) := 5]
        # ExS[[cl]] = as.numeric(ExS[[cl]])
    }
    
    for (cl in paste0('tmcoh', 1:4)) {
        ExS[get(cl) == "Strongly Disagree", (cl) := 1]
        ExS[get(cl) == "Strongly disagree", (cl) := 1]
        ExS[get(cl) == "Disagree", (cl) := 2]
        ExS[get(cl) == "Somewhat disagree", (cl) := 3]
        ExS[get(cl) == "Neither agree nor disagree", (cl) := 4]
        ExS[get(cl) == "Somewhat agree", (cl) := 5]
        ExS[get(cl) == "Agree", (cl) := 6]
        ExS[get(cl) == "Strongly agree", (cl) := 7]
        # ExS[[cl]] = as.numeric(ExS[[cl]])
    }
    
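    # swed items: 3 = improved/increased, 2 = made no difference, 1 = hampered.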
    for (cl in paste0('swed', 1:2)) {
        ExS[get(cl) == "Improved collaboration", (cl) := 3]
        ExS[get(cl) == "Increased engagement", (cl) := 3]
        ExS[get(cl) == "Made no difference", (cl) := 2]
        ExS[get(cl) == "Hampered collaboration", (cl) := 1]
        ExS[get(cl) == "Hampered engagement", (cl) := 1]
        # ExS[[cl]] = as.numeric(ExS[[cl]])
    }
    
    for (cl in paste0('BFI', 1:60)) {
        ExS[get(cl) == "Disagree strongly", (cl) := 1]
        ExS[get(cl) == "Disagree a little", (cl) := 2]
        ExS[get(cl) == "Neutral; no opinion", (cl) := 3]
        ExS[get(cl) == "Agree a little", (cl) := 4]
        ExS[get(cl) == "Agree strongly", (cl) := 5]
        # ExS[[cl]] = as.numeric(ExS[[cl]])
    }
    
    for (cl in paste0('jpc', 1:15)) {
        ExS[get(cl) == "Strongly Disagree", (cl) := 1]
        ExS[get(cl) == "Strongly disagree", (cl) := 1]
        ExS[get(cl) == "Disagree", (cl) := 2]
        ExS[get(cl) == "Somewhat disagree", (cl) := 3]
        ExS[get(cl) == "Neither agree nor disagree", (cl) := 4]
        ExS[get(cl) == "Somewhat agree", (cl) := 5]
        ExS[get(cl) == "Agree", (cl) := 6]
        ExS[get(cl) == "Strongly agree", (cl) := 7]
        # ExS[[cl]] = as.numeric(ExS[[cl]])
    }
    
    
    
    # REORDER COLUMNS
    ES = ES[,.(
        progress,
        finished,
        gaveConsent4,
        user,
        
        agegroup,
        gender,
        education,
        
        major1,
        major2,
        minor1,
        minor2,
        
        englishProficiency,
        loteProficiency1,
        loteProficiencyText1,
        loteProficiency2,
        loteProficiencyText2,
        loteProficiency3,
        loteProficiencyText3,
        loteProficiency4,
        loteProficiencyText4,
        loteProficiency5,
        loteProficiencyText5,
        loteProficiency6,
        loteProficiencyText6,
        
        enjoyLogicProbs,
        enjoyNumProbs,
        
        exp1,
        exp2,
        exp3,
        exp4,
        exp5,
        exp6,
        exp7,
        exp8,
        exp9,
        exp10,
        exp11,
        exp12,
        exp13,
        exp14,
        exp15,
        exp16,
        exp17,
        exp18,
        exp19,
        
        pc5A,
        pc5B,
        pc5Acomp,
        pc5AUB,
        pc14A,
        pc14B,
        pc14Acomp,
        pc14AUB,
        pc35A,
        pc35B,
        pc35Acomp,
        pc35AUB,
        pc57A,
        pc57B,
        pc57Acomp,
        pc57AUB,
        pc60A,
        pc60B,
        pc60Acomp,
        pc60AUB,
        
        pr1,
        pr2,
        pr3,
        pr4,
        pr5,
        pr6,
        pr7,
        pr8,
        pr9,
        pr10,
        pr11,
        pr12,
        pr13,
        pr14,
        pr15,
        pr16,
        
        mat1,
        mat2,
        mat3,
        mat4,
        mat5,
        mat6,
        mat7,
        mat8,
        mat9,
        mat10,
        mat11,
        
        crt1,
        crt2,
        crt3,
        crt4,
        crt5,
        crt6,
        crtSeenBefore,
        crtSeenBeforeText,
        
        aomt1,
        aomt2,
        aomt3,
        aomt4,
        aomt5,
        aomt6,
        aomt7,
        aomt8,
        aomt9,
        aomt10,
        aomt11,
        
        bfi1,
        bfi2,
        bfi3,
        bfi4,
        bfi5,
        bfi6,
        bfi7,
        bfi8,
        bfi9,
        bfi10,
        
        tp1,
        tp2,
        tp3,
        tp4,
        tp5,
        tp6,
        tp7,
        tp8,
        tp9,
        tp10,
        tp11,
        tp12,
        tp13,
        tp14,
        tp15,
        tp16
    )]
    
    ExS = ExS[, .(
        progress,
        finished,
        user,
        deg,
        degOther,
        workExp,
        
        tps1,
        tps2,
        tps3,
        tps4,
        tps5,
        tps6,
        tps7,
        tps8,
        tps9,
        tps10,
        tps11,
        tps12,
        tps13,
        tps14,
        tps15,
        tps16,
        
        tmcoh1,
        tmcoh2,
        tmcoh3,
        tmcoh4,
        
        swed1,
        swed2,
        
        BFI1,
        BFI2,
        BFI3,
        BFI4,
        BFI5,
        BFI6,
        BFI7,
        BFI8,
        BFI9,
        BFI10,
        BFI11,
        BFI12,
        BFI13,
        BFI14,
        BFI15,
        BFI16,
        BFI17,
        BFI18,
        BFI19,
        BFI20,
        BFI21,
        BFI22,
        BFI23,
        BFI24,
        BFI25,
        BFI26,
        BFI27,
        BFI28,
        BFI29,
        BFI30,
        BFI31,
        BFI32,
        BFI33,
        BFI34,
        BFI35,
        BFI36,
        BFI37,
        BFI38,
        BFI39,
        BFI40,
        BFI41,
        BFI42,
        BFI43,
        BFI44,
        BFI45,
        BFI46,
        BFI47,
        BFI48,
        BFI49,
        BFI50,
        BFI51,
        BFI52,
        BFI53,
        BFI54,
        BFI55,
        BFI56,
        BFI57,
        BFI58,
        BFI59,
        BFI60,
        
        jpc1,
        jpc2,
        jpc3,
        jpc4,
        jpc5,
        jpc6,
        jpc7,
        jpc8,
        jpc9,
        jpc10,
        jpc11,
        jpc12,
        jpc13,
        jpc14,
        jpc15
    )]
    
    ES <- ES[finished & ((gaveConsent4 == TRUE) | is.na(gaveConsent4))]
    ES = ES[!is.na(ES$user)]
    
    ExS <- ExS[(finished)]
    ExS = ExS[!is.na(ExS$user)]
    
    
    
    ES = merge(ES, ExS, by = c("user"), all = T)
    ES = as.data.frame(ES)
    
    ES$user = tolower(ES$user)
    
    colsToRemove = c('finished.x', 'progress.x',
                     'finished.y', 'progress.y')
    for (cl in colsToRemove) {
        ES[[cl]] = NULL
    }
    
    # Compute AOMT construct.
    ES = computeAOMT(ES)
    
    # Compute matrix reasoning score.
    ES = computeMatrixReasoning(ES)
    
    # Add in participants who didn't complete the entry survey.
    users = c("gecko307","flyingfox753","koala806","fairywren473","thornydevil258","dingo367","pademelon117","goanna424","kookaburra542","galah431","corella928","currawong437","budgerigar571","echidna358","magpie806","kiwi231","emu750","quokka744","tassiedevil533","quoll339","sugarglider399","boobook990","cassowary120","numbat535","bilby505","blackswan808","kangaroo331","blackswan705","possum193","wombat650","bluewhale248","blackcockatoo586","flyingfox651","pobblebonk917","gecko593","crocodile147","wallaby468","frogmouth979","pharlap649","cockatoo319","kangaroo384","platypus604","wallaroo177","galah361","tassiedevil576","flyingfox488","cassowary792","budgerigar439","kiwi471","fairywren318","frogmouth679","kangaroo769")
    users = users[!(users %in% ES$user)]
    newES = data.table(user = users)
    ES = plyr::rbind.fill(ES, newES)
    
    return(setDT(ES))
}

compile_parts = function(path_to_data, instance_name) {
    
    # Lookup table of instance-specific functions. This is required because the
    # demographic surveys differ across experiments, so each requires custom code
    # to tidy the data into a consistent format.
    compile_parts = list(
        "x2018_SwarmChallengeExp1" = compile_parts_2018_SwarmChallengeExp1,
        "x2020_HuntChallenge" = compile_parts_2020_HuntChallenge,
        "x2020_PsychologyCapstone" = compile_parts_2020_PsychologyCapstone
    )
    
    if (instance_name %in% names(compile_parts)) {
        return(compile_parts[[instance_name]](path_to_data, instance_name))
    } else {
        return(NULL)
    }
}
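# Example usage (hypothetical path):
#   parts = compile_parts("~/SwarmData/", "x2018_SwarmChallengeExp1")
# Unrecognised instance names return NULL.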


compile_teamparts_2018_SwarmChallengeExp1 = function(repo, path_to_data, instance_name) {
    path = paste0(path_to_data, instance_name, '/QualtricsData/')
    
    parts = repo[[instance_name]]$CoreData$parts
    tmprt = data.table(
        user = parts$user,
        team = rep(NA, nrow(parts))
    )

    # Populate team.
    lookup = fread(paste0(path_to_data, instance_name, '/AdminData/match_ind_diffs_response_to_swarm_username.csv'))
    lookup$username = tolower(lookup$username)
    for (k in 1:nrow(tmprt)) {
        if (tmprt$user[k] %in% lookup$username) {
            i = which(lookup$username == tmprt$user[k])
            tmprt$team[k] = lookup$Team[i]
        }
    }
    
    tmprt = tmprt[!is.na(tmprt$team)]
    tmprt = as.data.frame(tmprt)
    
    return(setDT(tmprt))
}

compile_teamparts_2020_HuntChallenge = function(repo, path_to_data, instance_name) {
    path = paste0(path_to_data, instance_name, '/QualtricsData/')
    
    parts = repo[[instance_name]]$CoreData$parts
    
    pubLookup = fread(paste0(path, "HC2020_EntrySurvey_Public.csv"))
    supLookup = fread(paste0(path_to_data, instance_name, '/AdminData/Superteams.csv'))
    orgLookup = fread(paste0(path_to_data, instance_name, '/AdminData/OrgMaster.csv'))
    
    pubLookup = pubLookup[,user:team]
    supLookup = supLookup[,.(username, group_code)]
    supLookup = supLookup[, `:=`(
        user = username,
        team = group_code
    )]
    supLookup = supLookup[,user:team]
    orgLookup = orgLookup[,.(username, group_code)]
    orgLookup = orgLookup[, `:=`(
        user = username,
        team = group_code
    )]
    orgLookup = orgLookup[,user:team]
    
    orgLookup$user = tolower(orgLookup$user)
    pubLookup$user = tolower(pubLookup$user)
    supLookup$user = tolower(supLookup$user)
    
    pubLookup = pubLookup[user %in% parts$user]
    supLookup = supLookup[user %in% parts$user]
    orgLookup = orgLookup[user %in% parts$user]
    
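    # Superteam group codes are comma-separated; keep only the second element.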
    supLookup$team = sapply(strsplit(supLookup$team, ','), function(x) {return(x[2])})
    
    tmprt = rbind(
        pubLookup,
        supLookup,
        orgLookup
    )
    
    tmprt = tmprt[!is.na(tmprt$team)]
    tmprt = as.data.frame(tmprt)
    
    return(setDT(tmprt))
}

compile_teamparts_2020_PsychologyCapstone = function(repo, path_to_data, instance_name) {
    
    usrs = repo[[instance_name]]$CoreData$parts$user
    
    teamLookup = fread(paste0(path_to_data, instance_name, '/AdminData/usernames.csv'))
    teamLookup$Username = tolower(teamLookup$Username)
    teamLookup = teamLookup[Username %in% usrs]
    
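    # Preallocate up to 1000 rows and fill them one user-team pair at a time; unused (empty) rows are dropped below.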
    tmprt = data.table(
        user = character(1000),
        team = character(1000)
    )
    
    k = 1L
    for (j in 1:nrow(teamLookup)) {
        
        tms = paste(teamLookup[['Challenge Team']][j], sep=',')
        tms = strsplit(tms, split = ',')[[1]]
        for (tm in tms) {
            set(tmprt, k, 'user', teamLookup$Username[j])
            set(tmprt, k, 'team', tm)
            k = k + 1L
        }
    }
    
    tmprt = tmprt[user != ""]
    
    tmprt = as.data.frame(tmprt)
    return(setDT(tmprt))
}

compile_teamparts = function(repo, path_to_data, instance_name) {
    
    # Lookup table of instance-specific functions. This is required because the
    # demographic surveys differ across experiments, so each requires custom code
    # to tidy the data into a consistent format.
    compile_teamparts = list(
        "x2018_SwarmChallengeExp1" = compile_teamparts_2018_SwarmChallengeExp1,
        "x2020_HuntChallenge" = compile_teamparts_2020_HuntChallenge,
        "x2020_PsychologyCapstone" = compile_teamparts_2020_PsychologyCapstone
    )
    
    if (instance_name %in% names(compile_teamparts)) {
        return(compile_teamparts[[instance_name]](repo, path_to_data, instance_name))
    } else {
        return(NULL)
    }
}


compile_teams_2018_SwarmChallengeExp1 = function(repo, path_to_data, instance_name) {
    
    path = paste0(path_to_data, instance_name, "/KnackData/products.csv")
    
    K = as.data.frame(fread(path))
    colnames(K) <- c("reportCode",
                     "problem",
                     "nRatings",
                     "avgIC",
                     "min",
                     "max",
                     "range",
                     "username",
                     "team_alt",
                     "team",
                     "week",
                     "submitted")
    
    parts = repo[[instance_name]]$CoreData$parts
    teamparts = repo[[instance_name]]$CoreData$teamparts
    parts = merge(teamparts, parts, by = c("user"))
    
    # Create teams table.
    
    tms = unique(teamparts$team)
    teams = data.frame(team = tms,
                       AOMT = NA,
                       divAOMT = NA,
                       medianEdu = NA,
                       type = NA)
    
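    # Team-level AOMT diversity: mean pairwise distance between members' AOMT scores.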
    getDivAOMT = function(tm) {
        scores = parts[parts$team == tm,]$aomt
        scores = scores[!is.na(scores)]
        return(mean(c(dist(scores))))
    }
    
    getMedianEdu = function(tm) {
        eds = parts[parts$team == tm,]$education
        eds[eds == "High School or GED Equivalency"] = 1
        eds[eds == "Some College"] = 2
        eds[eds == "Bachelor's Degree"] = 3
        eds[eds == "Associate's Degree"] = 4
        eds[eds == "Master's Degree"] = 5
        eds[eds == "Professional or Doctoral Degree (e.g. MD, JD, PhD)"] = 6
        eds = as.numeric(eds)
        
        return(median(eds, na.rm = T))
    }
    
    for (k in 1:nrow(teams)) {
        teams$AOMT[k] = median(parts[parts$team == teams$team[k],]$aomt, na.rm = T)
        teams$divAOMT[k] = getDivAOMT(teams$team[k])
        teams$medianEdu[k] = getMedianEdu(teams$team[k])
        teams$type[k] = parts[parts$team == teams$team[k],]$type[1]
    }
    
    return(setDT(teams))
}

compile_teams_2020_HuntChallenge = function(repo, path_to_data, instance_name) {
    
    path = paste0(path_to_data, instance_name, "/KnackData/teams2020challenge.csv")
    
    K = as.data.frame(fread(path))
    colnames(K) <- c("team",
                     "avgAll",
                     "type",
                     "points",
                     "avg2",
                     "avg3",
                     "avg4",
                     "avg1",
                     "nRatings1",
                     "nRatings2",
                     "nRatings3",
                     "nRatings4",
                     "nRatings2020",
                     "nGeoCorrect")
    K = K[K$type != "Calibration",]
    
    parts = repo[[instance_name]]$CoreData$parts
    teamparts = repo[[instance_name]]$CoreData$teamparts
    parts = merge(teamparts, parts, by = c("user"))
    
    # Create teams table.
    
    # tms = unique(repo[[instance_name]]$PlatformData$analytics$team)
    tms = unique(teamparts$team)
    tms = tms[tms != "melcreate"]
    teams = data.frame(team = tms,
                       AOMT = NA,
                       divAOMT = NA,
                       medianEdu = NA,
                       type = NA)
    
    getDivAOMT = function(tm) {
        scores = parts[parts$team == tm,]$aomt
        scores = scores[!is.na(scores)]
        return(mean(c(dist(scores))))
    }
    
    getMedianEdu = function(tm) {
        eds = parts[parts$team == tm,]$education
        eds[eds == "High School"] = 1
        eds[eds == "Trade or Technical Qualification"] = 2
        eds[eds == "Bachelors"] = 3
        eds[eds == "Graduate Certificate, Diploma or equivalent"] = 4
        eds[eds == "Masters"] = 5
        eds[eds == "Phd"] = 6
        eds[eds == "Prefer not to say"] = NA
        eds = as.numeric(eds)
        
        return(median(eds, na.rm = T))
    }
    
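    # Team codes classified as OT or ST; all remaining teams are typed as PT in the loop below.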
    OTs = c("kosciuszko00219",
            "otway00219",
            "noosa00219",
            "uluru00219",
            "kakadu00219",
            "grampians00219",
            "daintree00219")
    STs = c("tongariro00311",
            "joondalup00311",
            "murramang00311",
            "warrumbungle00311",
            "aoraki00311")
    
    for (k in 1:nrow(teams)) {
        teams$AOMT[k] = median(parts[parts$team == teams$team[k],]$aomt, na.rm = T)
        teams$divAOMT[k] = getDivAOMT(teams$team[k])
        teams$medianEdu[k] = getMedianEdu(teams$team[k])
        if (teams$team[k] %in% STs) {
            teams$type[k] = 'ST'
        } else if (teams$team[k] %in% OTs) {
            teams$type[k] = 'OT'
        } else {
            teams$type[k] = 'PT'
        }
    }
    
    return(setDT(teams))
}

compile_teams_2020_PsychologyCapstone = function(repo, path_to_data, instance_name) {
    
    parts = repo[[instance_name]]$CoreData$parts
    teamparts = repo[[instance_name]]$CoreData$teamparts
    parts = merge(teamparts, parts, by = c("user"))
    
    # Create teams table.
    
    tms = unique(teamparts$team)
    teams = data.frame(team = tms,
                       AOMT = NA,
                       divAOMT = NA,
                       medianEdu = NA,
                       type = rep('UT',length(tms))
    )
    getDivAOMT = function(tm) {
        scores = parts[parts$team == tm,]$aomt
        scores = scores[!is.na(scores)]
        return(mean(c(dist(scores))))
    }
    
    getMedianEdu = function(tm) {
        eds = parts[parts$team == tm,]$education
        eds[eds == "High School or GED Equivalency"] = 1
        eds[eds == "Some College"] = 2
        eds[eds == "Bachelor's Degree"] = 3
        eds[eds == "Associate's Degree"] = 4
        eds[eds == "Master's Degree"] = 5
        eds[eds == "Professional or Doctoral Degree (e.g. MD, JD, PhD)"] = 6
        eds = as.numeric(eds)
        
        return(median(eds, na.rm = T))
    }
    
    for (k in 1:nrow(teams)) {
        teams$AOMT[k] = median(parts[parts$team == teams$team[k],]$aomt, na.rm = T)
        teams$divAOMT[k] = getDivAOMT(teams$team[k])
        teams$medianEdu[k] = getMedianEdu(teams$team[k])
    }
    
    return(setDT(teams))
}

compile_teams = function(repo, path_to_data, instance_name) {
    
    # Lookup table of instance-specific functions. This is required because the
    # demographic surveys differ across experiments, so each requires custom code
    # to tidy the data into a consistent format.
    compile_teams = list(
        "x2018_SwarmChallengeExp1" = compile_teams_2018_SwarmChallengeExp1,
        "x2020_HuntChallenge" = compile_teams_2020_HuntChallenge,
        "x2020_PsychologyCapstone" = compile_teams_2020_PsychologyCapstone
    )
    
    if (instance_name %in% names(compile_teams)) {
        return(compile_teams[[instance_name]](repo, path_to_data, instance_name))
    } else {
        return(NULL)
    }
}

compile_probteams_2018_SwarmChallengeExp1 = function(repo, path_to_data, instance_name) {
    
    path = paste0(path_to_data, instance_name, "/KnackData/products.csv")
    
    K = as.data.frame(fread(path))
    colnames(K) <- c("reportCode",
                     "problem",
                     "nRatings",
                     "avgIC",
                     "min",
                     "max",
                     "range",
                     "username",
                     "team_alt",
                     "team",
                     "week",
                     "submitted")
    
    # Correct typos in team names in K.
    K[K$team == "Witjirra4","team"] = "Witjira4"
    K[K$team == "Garawilla1","team"] = "Garrawilla1"
    
    parts = repo[[instance_name]]$CoreData$parts
    teams = repo[[instance_name]]$CoreData$teams
    tms = unique(teams$team)
    problems = c("How Did Arthur Allen Die?", "Kalukistan", "Three Nations", "Drug Interdiction")
    analytics = repo[[instance_name]]$PlatformData$analytics
    responses = repo[[instance_name]]$PlatformData$responses
    
    response_path = paste0(path_to_data,instance_name,"/PlatformData/responses/text/")
    
    probteam = data.frame(team = rep(tms, length(problems)),
                          problem = rep(problems, each = length(tms)),
                          probNum = NA,
                          type = NA,
                          avgIC = NA,
                          nIC = NA,
                          rankIC = NA,
                          activeUsers = NA,
                          textSimReports = NA,
                          textSimResponses = NA,
                          AOMT = NA,
                          divAOMT = NA,
                          medianEdu = NA)
    
    getActiveUsers = function(tm, pr) {
        team_members = analytics[team == tm & problem == pr]
        nActive = sum(team_members$engagement_scaled > 0)
        return(nActive)
    }
    
    getTextSim = function(tm, pr, scope) {
        if (scope == "reports") {
            file_names = responses[team == tm & problem == pr & response_type == "report"]$response_text
        } else if (scope == "resources") {
            file_names = responses[team == tm & problem == pr & response_type == "resource"]$response_text
        } else if (scope == "responses") {
            file_names = responses[team == tm & problem == pr]$response_text
        }
        
        if (length(file_names) > 1) {
            reports = suppressWarnings(readtext::readtext(paste0(response_path,file_names[1])))  # suppress "*.md" warnings
            
            for (j in 2:length(file_names)) {
                reports = rbind(reports, suppressWarnings(readtext::readtext(paste0(response_path,file_names[j]))))
            }
            
            CORPUS = quanteda::corpus(reports)
            DFM = quanteda::dfm(CORPUS,
                                remove = quanteda::stopwords("english"),
                                stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
            DistMat = quanteda::textstat_simil(DFM, method="cosine")
            Distances = DistMat[lower.tri(DistMat)]
            
            library(quanteda)
            # return( (1 - mean(Distances)) / sum(summary(CORPUS)$Tokens) )
            return( mean(Distances) )
        } else {
            return( 1 )
        }
        
    }
    
    getAOMT = function(tm, pr) {
        active_team_members = analytics[team == tm & problem == pr & engagement_scaled > 0]$user
        return( median(parts[parts$user %in% active_team_members,]$aomt, na.rm = T) )
    }
    
    getDivAOMT = function(tm, pr) {
        active_team_members = analytics[team == tm & problem == pr & engagement_scaled > 0]$user
        scores = parts[parts$user %in% active_team_members,]$aomt
        # scores = scores[!is.na(scores)]
        # rval = mean(c(dist(scores)))
        n = length(scores) * (length(scores) - 1) / 2 # Number of unique pairs.
        rval = sum(dist(scores), na.rm = T) / n
        if (is.na(rval)) {
            rval = 0
        }
        return(rval)
    }
    
    getMedianEdu = function(tm, pr) {
        active_team_members = analytics[team == tm & problem == pr & engagement_scaled > 0]$user
        eds = parts[parts$user %in% active_team_members,]$education
        eds[eds == "High School or GED Equivalency"] = 1
        eds[eds == "Some College"] = 2
        eds[eds == "Bachelor's Degree"] = 3
        eds[eds == "Associate's Degree"] = 4
        eds[eds == "Master's Degree"] = 5
        eds[eds == "Professional or Doctoral Degree (e.g. MD, JD, PhD)"] = 6
        eds = as.numeric(eds)
        
        return(median(eds, na.rm = T))
    }
    
    for (k in 1:nrow(probteam)) {
        i = intersect( which(K$team == probteam$team[k]), which(K$problem == probteam$problem[k]))
        l = which(K[K$problem == probteam$problem[k] & nchar(K$team) > 0,]$team == probteam$team[k])
        if (length(i) == 0) {
            next
        }
        j = which(teams$team == probteam$team[k])
        probteam$probNum[k] = K$week[i]
        probteam$type[k] = teams$type[j]
        probteam$avgIC[k] = K$avgIC[i]
        probteam$nIC[k] = K$nRatings[i]
        probteam$rankIC[k] = rank(-K[K$problem == probteam$problem[k] & nchar(K$team) > 0,]$avgIC, ties.method = "min")[l]
        probteam$activeUsers[k] = getActiveUsers(probteam$team[k], probteam$problem[k])
        probteam$textSimReports[k] = getTextSim(probteam$team[k], probteam$problem[k], 'reports')
        probteam$textSimResponses[k] = getTextSim(probteam$team[k], probteam$problem[k], 'responses')
        probteam$AOMT[k] = getAOMT(probteam$team[k], probteam$problem[k])
        probteam$divAOMT[k] = getDivAOMT(probteam$team[k], probteam$problem[k])
        probteam$medianEdu[k] = getMedianEdu(probteam$team[k], probteam$problem[k])
    }
    
    probteam = probteam[!is.na(probteam$avgIC),]
    
    return(setDT(probteam))
}

compile_probteams_2020_HuntChallenge = function(repo, path_to_data, instance_name) {
    
    path = paste0(path_to_data, instance_name, "/KnackData/")
    
    K = as.data.frame(fread(paste0(path, "teams2020challenge.csv")))
    colnames(K) <- c("team",
                     "avgAll",
                     "inRound1",
                     "inRound2",
                     "type",
                     "points",
                     "avg2",
                     "avg3",
                     "avg4",
                     "avg1",
                     "nRatings1",
                     "nRatings2",
                     "nRatings3",
                     "nRatings4",
                     "nRatings2020",
                     "nGeoCorrect",
                     "tightness",
                     "nRedactionEstimates",
                     "probabilityEstimate",
                     "misc1",
                     "misc2")
    K = K[K$type != "Calibration",]
    for (k in 1:4) {
        if (sum( !is.na(K[[paste0('avg',k)]]) & K[[paste0('avg',k)]] == 0 ) > 0) {
            K[!is.na(K[[paste0('avg',k)]]) & K[[paste0('avg',k)]] == 0,][[paste0('avg',k)]] = NA
        }
    }
    
    ratings = fread(paste0(path, "ratings2020challenge.csv"))
    colnames(ratings) = c(
        "reportID",
        "password1",
        "password2",
        "password3",
        "problem",
        "team",
        "participant",
        "created",
        "rater",
        "c1", "c1comment", "c1distinction",
        "c2", "c2comment", "c2distinction",
        "c3", "c3distinction", "c3comment",
        "c4", "c4comment",
        "c5", "c5comment",
        "c6", "c6comment",
        "c7", "c7comment",
        "c8",
        "geo1",
        "geo2",
        "geo3",
        "geo4",
        "c8comment",
        "c1score",
        "c2score",
        "c3score",
        "c4score",
        "c5score",
        "c6score",
        "c7score",
        "c8score",
        "IC",
        "c1na",
        "c2na",
        "c3na",
        "c4na",
        "c5na",
        "c6na",
        "c7na",
        "c8na",
        "ruleBased",
        "ruleBasedScore",
        "ruleBaseAlexAns",
        "c4distinction",
        "c5distinction",
        "c6distinction",
        "c7distinction",
        "c8distinction",
        "lensKit",
        "geo1score",
        "geo2score",
        "geo3score",
        "geo4score",
        "geoOverall",
        "isRedactionTestRating",
        "raterProbabilityEstimate",
        "estTimeTaken",
        "estJustification",
        "estComments",
        "bayes1",
        "bayes2",
        "bayes3",
        "bayes1score",
        "bayes2score",
        "bayes3score",
        "flaw1",
        "flaw2",
        "flaw3",
        "flaw4"
    )
    ratings[ratings == ""] = NA
    ratings[IC == 0,"IC"] = NA
    ratings = ratings[,.(
        problem,
        team,
        participant,
        rater,
        geo1,
        geo2,
        geo3,
        geo4,
        IC,
        geo1score,
        geo2score,
        geo3score,
        geo4score,
        geoOverall,
        isRedactionTestRating,
        raterProbabilityEstimate,
        estTimeTaken,
        estJustification,
        estComments,
        bayes1,
        bayes2,
        bayes3,
        bayes1score,
        bayes2score,
        bayes3score,
        flaw1,
        flaw2,
        flaw3,
        flaw4
    )]
    ratings[,bayesScore := bayes1score + bayes2score + bayes3score]
    
    # RFDratings = fread(paste0(path, "extraRFDratings.csv"))
    # RFDratings[,nFlawsDetected := flaw1 + flaw2 + flaw3 + flaw4]
    
    parts = repo[[instance_name]]$CoreData$parts
    teams = repo[[instance_name]]$CoreData$teams
    tms = unique(teams$team)
    problems = c("Foreign Fighters", "Forecasting Piracy", "Corporate Espionage", "The Park Young-min Case")
    analytics = repo[[instance_name]]$PlatformData$analytics
    responses = repo[[instance_name]]$PlatformData$responses
    
    response_path = paste0(path_to_data,instance_name,"/PlatformData/responses/text/")
    
    probteam = data.frame(team = rep(tms, length(problems)),
                          problem = rep(problems, each = length(tms)),
                          probNum = rep(1:length(problems), each = length(tms)),
                          type = NA,
                          avgIC = NA,
                          nIC = NA,
                          rankIC = NA,
                          nGeoCorrect = NA,
                          probabilityEstimate = NA,
                          tightness = NA,
                          nBayesCorrect = NA,
                          nFlawsDetected = NA,
                          activeUsers = NA,
                          textSimReports = NA,
                          textSimResponses = NA,
                          AOMT = NA,
                          divAOMT = NA,
                          medianEdu = NA)
    
    getActiveUsers = function(tm, pr) {
        team_members = analytics[team == tm & problem == pr]
        nActive = sum(team_members$engagement_scaled > 0)
        return(nActive)
    }
    
    getTextSim = function(tm, pr, scope) {
        if (scope == "reports") {
            file_names = responses[team == tm & problem == pr & response_type == "report"]$response_text
        } else if (scope == "resources") {
            file_names = responses[team == tm & problem == pr & response_type == "resource"]$response_text
        } else if (scope == "responses") {
            file_names = responses[team == tm & problem == pr]$response_text
        }
        
        if (length(file_names) > 1) {
            reports = suppressWarnings(readtext::readtext(paste0(response_path,file_names[1])))  # suppress "*.md" warnings
            
            for (j in 2:length(file_names)) {
                reports = rbind(reports, suppressWarnings(readtext::readtext(paste0(response_path,file_names[j]))))
            }
            
            CORPUS = quanteda::corpus(reports)
            DFM = quanteda::dfm(CORPUS,
                                remove = quanteda::stopwords("english"),
                                stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
            DistMat = quanteda::textstat_simil(DFM, method="cosine")
            Distances = DistMat[lower.tri(DistMat)]
            
            library(quanteda)
            # return( (1 - mean(Distances)) / sum(summary(CORPUS)$Tokens) )
            return( mean(Distances) )
        } else {
            return( 1 )
        }
        
    }
    
    getAOMT = function(tm, pr) {
        active_team_members = analytics[team == tm & problem == pr & engagement_scaled > 0]$user
        return( median(parts[parts$user %in% active_team_members,]$aomt, na.rm = T) )
    }
    
    getDivAOMT = function(tm, pr) {
        active_team_members = analytics[team == tm & problem == pr & engagement_scaled > 0]$user
        scores = parts[parts$user %in% active_team_members,]$aomt
        # scores = scores[!is.na(scores)]
        # rval = mean(c(dist(scores)))
        n = length(scores) * (length(scores) - 1) / 2 # Number of unique pairs.
        rval = sum(dist(scores), na.rm = T) / n
        if (is.na(rval)) {
            rval = 0
        }
        return(rval)
    }
    
    getMedianEdu = function(tm, pr) {
        active_team_members = analytics[team == tm & problem == pr & engagement_scaled > 0]$user
        eds = parts[parts$user %in% active_team_members,]$education
        eds[eds == "High School"] = 1
        eds[eds == "Trade or Technical Qualification"] = 2
        eds[eds == "Bachelors"] = 3
        eds[eds == "Graduate Certificate, Diploma or equivalent"] = 4
        eds[eds == "Masters"] = 5
        eds[eds == "Phd"] = 6
        eds[eds == "Prefer not to say"] = NA
        eds = as.numeric(eds)
        
        return(median(eds, na.rm = T))
    }
    
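    # Number of flaws judged detected, taking the majority rater response for each of the four flaw items.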
    getNumFlawsDetected = function(tm) {
        rts = ratings[team == tm & problem == "Park Young-min Case",.(flaw1, flaw2, flaw3, flaw4)]
        rts[rts == "Yes"] = 1
        rts[rts == "No"] = 0

        flaw1 = round(mean(as.numeric(rts$flaw1)))
        flaw2 = round(mean(as.numeric(rts$flaw2)))
        flaw3 = round(mean(as.numeric(rts$flaw3)))
        flaw4 = round(mean(as.numeric(rts$flaw4)))

        return(flaw1 + flaw2 + flaw3 + flaw4)
    }
    
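    # Tightness: mean squared difference between the team's probability estimate and the raters' estimates.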
    getTightness = function(tm, k) {
        i = which(probteam$team == tm & probteam$problem == "Forecasting Piracy")
        team_prob = probteam$probabilityEstimate[k]
        rater_probs = ratings[team == tm & isRedactionTestRating == "Yes"]$raterProbabilityEstimate/100
        tightness = mean((rater_probs - team_prob)^2)
        return(tightness)
    }
    
    OTs = c("kosciuszko00219","otway00219","noosa00219","uluru00219","kakadu00219","grampians00219","daintree00219")
    STs = c("tongariro00311",
            "joondalup00311",
            "murramang00311",
            "warrumbungle00311",
            "aoraki00311")
    
    for (k in 1:nrow(probteam)) {
        i = which(K$team == probteam$team[k])
        if (probteam$team[k] %in% OTs) {
            probteam$type[k] = "OT"
        } else if (probteam$team[k] %in% STs) {
            probteam$type[k] = "ST"
        } else {
            probteam$type[k] = "PT"
        }
        probteam$avgIC[k] = K[[paste0("avg", probteam$probNum[k])]][i]
        probteam$nIC[k] = K[[paste0("nRatings", probteam$probNum[k])]][i]
        probteam$rankIC[k] = rank(-K[[paste0("avg", probteam$probNum[k])]], ties.method = "min")[i]
        if (probteam$problem[k] == "Foreign Fighters") {
            probteam$nGeoCorrect[k] = K$nGeoCorrect[i]
        }
        if (probteam$problem[k] == "Forecasting Piracy" & !is.na(probteam$avgIC[k])) {
            probteam$probabilityEstimate[k] = K$probabilityEstimate[i]/100
            probteam$tightness[k] = getTightness(probteam$team[k], k)
        }
        if (probteam$problem[k] == "Corporate Espionage" & !is.na(probteam$avgIC[k])) {
            probteam$nBayesCorrect[k] = round(mean(ratings[team == probteam$team[k] & problem == "Corporate Espionage"]$bayesScore))
        }
        if (probteam$problem[k] == "The Park Young-min Case" & !is.na(probteam$avgIC[k])) {
            probteam$nFlawsDetected[k] = getNumFlawsDetected(probteam$team[k])
            
        }
        probteam$activeUsers[k] = getActiveUsers(probteam$team[k], probteam$problem[k])
        probteam$textSimReports[k] = getTextSim(probteam$team[k], probteam$problem[k], 'reports')
        probteam$textSimResponses[k] = getTextSim(probteam$team[k], probteam$problem[k], 'responses')
        probteam$AOMT[k] = getAOMT(probteam$team[k], probteam$problem[k])
        probteam$divAOMT[k] = getDivAOMT(probteam$team[k], probteam$problem[k])
        probteam$medianEdu[k] = getMedianEdu(probteam$team[k], probteam$problem[k])
    }
    
    probteam = probteam[!is.na(probteam$avgIC),]
    
    return(setDT(probteam))
}

compile_probteams_2020_PsychologyCapstone = function(repo, path_to_data, instance_name) {
    
    path = paste0(path_to_data, instance_name, "/AdminData/")
    
    R = fread(paste0(path, "ratings.csv"))
    R[R == ""] = NA
    
    scores = list(
        Poor = 1,
        Fair = 2,
        Good = 3,
        Excellent = 4
    )
    
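    # Overall IC rating per row: sum of the eight criterion ratings (Poor = 1, ..., Excellent = 4).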
    for (k in 1:nrow(R)) {
        R$IC[k] = scores[[R$c1[k]]] +
            scores[[R$c2[k]]] +
            scores[[R$c3[k]]] +
            scores[[R$c4[k]]] +
            scores[[R$c5[k]]] +
            scores[[R$c6[k]]] +
            scores[[R$c7[k]]] +
            scores[[R$c8[k]]]
    }
    
    parts = repo[[instance_name]]$CoreData$parts
    teams = repo[[instance_name]]$CoreData$teams
    tms = unique(teams$team)
    problems = c("Foreign Fighters", "Corporate Espionage")
    analytics = repo[[instance_name]]$PlatformData$analytics
    responses = repo[[instance_name]]$PlatformData$responses
    
    response_path = paste0(path_to_data,instance_name,"/PlatformData/responses/text/")
    
    probteam = data.frame(team = rep(tms, length(problems)),
                          problem = rep(problems, each = length(tms)),
                          probNum = rep(1:length(problems), each = length(tms)),
                          type = NA,
                          avgIC = NA,
                          nIC = NA,
                          rankIC = NA,
                          nGeoCorrect = NA,
                          nBayesCorrect = NA,
                          activeUsers = NA,
                          textSimReports = NA,
                          textSimResponses = NA,
                          AOMT = NA,
                          divAOMT = NA,
                          medianEdu = NA)
    
    getActiveUsers = function(tm, pr) {
        team_members = analytics[team == tm & problem == pr]
        nActive = sum(team_members$engagement_scaled > 0)
        return(nActive)
    }
    
    getTextSim = function(tm, pr, scope) {
        if (scope == "reports") {
            file_names = responses[team == tm & problem == pr & response_type == "report"]$response_text
        } else if (scope == "resources") {
            file_names = responses[team == tm & problem == pr & response_type == "resource"]$response_text
        } else if (scope == "responses") {
            file_names = responses[team == tm & problem == pr]$response_text
        }
        
        if (length(file_names) > 1) {
            reports = suppressWarnings(readtext::readtext(paste0(response_path,file_names[1])))  # suppress "*.md" warnings
            
            for (j in 2:length(file_names)) {
                reports = rbind(reports, suppressWarnings(readtext::readtext(paste0(response_path,file_names[j]))))
            }
            
            CORPUS = quanteda::corpus(reports)
            DFM = quanteda::dfm(CORPUS,
                                remove = quanteda::stopwords("english"),
                                stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
            DistMat = quanteda::textstat_simil(DFM, method="cosine")
            Distances = DistMat[lower.tri(DistMat)]
            
            library(quanteda)
            # return( (1 - mean(Distances)) / sum(summary(CORPUS)$Tokens) )
            return( mean(Distances) )
        } else {
            return( 1 )
        }
        
    }
    
    getAOMT = function(tm, pr) {
        active_team_members = analytics[team == tm & problem == pr & engagement_scaled > 0]$user
        return( median(parts[parts$user %in% active_team_members,]$aomt, na.rm = T) )
    }
    
    getDivAOMT = function(tm, pr) {
        active_team_members = analytics[team == tm & problem == pr & engagement_scaled > 0]$user
        scores = parts[parts$user %in% active_team_members,]$aomt
        # scores = scores[!is.na(scores)]
        # rval = mean(c(dist(scores)))
        n = length(scores) * (length(scores) - 1) / 2 # Number of unique pairs.
        rval = sum(dist(scores), na.rm = T) / n
        if (is.na(rval)) {
            rval = 0
        }
        return(rval)
    }
    
    getMedianEdu = function(tm, pr) {
        active_team_members = analytics[team == tm & problem == pr & engagement_scaled > 0]$user
        eds = parts[parts$user %in% active_team_members,]$education
        eds[eds == "High School or GED Equivalency"] = 1
        eds[eds == "Some College"] = 2
        eds[eds == "Bachelor's Degree"] = 3
        eds[eds == "Associate's Degree"] = 4
        eds[eds == "Master's Degree"] = 5
        eds[eds == "Professional or Doctoral Degree (e.g. MD, JD, PhD)"] = 6
        eds[eds == "Prefer not to say"] = NA
        eds = as.numeric(eds)
        
        return(median(eds, na.rm = T))
    }
    
    for (k in 1:nrow(probteam)) {
        probteam$type[k] = "UT"
        probteam$avgIC[k] = mean(R[team == probteam$team[k] & problem == probteam$problem[k]]$IC)
        probteam$nIC[k] = nrow(R[team == probteam$team[k] & problem == probteam$problem[k]])
        # probteam$rankIC[k] = rank(-K[[paste0("avg", probteam$probNum[k])]], ties.method = "min")[i]
        if (probteam$problem[k] == "Foreign Fighters") {
            probteam$nGeoCorrect[k] = round(mean(R[team == probteam$team[k] & problem == probteam$problem[k]]$nGeoCorrect))
        }
        if (probteam$problem[k] == "Corporate Espionage" & !is.na(probteam$avgIC[k])) {
            probteam$nBayesCorrect[k] = round(mean(R[team == probteam$team[k] & problem == probteam$problem[k]]$nBayesCorrect))
        }
        probteam$activeUsers[k] = getActiveUsers(probteam$team[k], probteam$problem[k])
        probteam$textSimReports[k] = getTextSim(probteam$team[k], probteam$problem[k], 'reports')
        probteam$textSimResponses[k] = getTextSim(probteam$team[k], probteam$problem[k], 'responses')
        probteam$AOMT[k] = getAOMT(probteam$team[k], probteam$problem[k])
        probteam$divAOMT[k] = getDivAOMT(probteam$team[k], probteam$problem[k])
        probteam$medianEdu[k] = getMedianEdu(probteam$team[k], probteam$problem[k])
    }
    
    # probteam = probteam[!is.na(probteam$avgIC),]
    
    return(setDT(probteam))
}



compile_probteams = function(repo, path_to_data, instance_name) {
    
    # Lookup table of instance-specific functions. This is required because the
    # demographic surveys differ across experiments, so each requires custom code
    # to tidy the data into a consistent format.
    compile_probteams = list(
        "x2018_SwarmChallengeExp1" = compile_probteams_2018_SwarmChallengeExp1,
        "x2020_HuntChallenge" = compile_probteams_2020_HuntChallenge,
        "x2020_PsychologyCapstone" = compile_probteams_2020_PsychologyCapstone
    )
    
    if (instance_name %in% names(compile_probteams)) {
        return(compile_probteams[[instance_name]](repo, path_to_data, instance_name))
    } else {
        return(NULL)
    }
}

compile_probparts = function(repo, nClusters, generatePlots = F) {
    
    set.seed(5678)
    
    anal = repo[[1]]$PlatformData$analytics
    anal$probteam = NA
    anal$teamFinished = NA
    anal = anal[0,]
    
    for (nm in names(repo)) {
    # for (nm in c("x2020_HuntChallenge","x2018_SwarmChallengeExp1")) {
        analytics = repo[[nm]]$PlatformData$analytics
        probteams = repo[[nm]]$CoreData$probteams
        
        analytics$probteam = paste0(analytics$team, analytics$problem)
        probteams$probteam = paste0(probteams$team, probteams$problem)
        
        teamFinished = function(pt) {
            return(pt %in% probteams$probteam)
        }
        analytics$teamFinished = sapply(analytics$probteam, teamFinished)
        analytics = analytics[analytics$teamFinished,]
        
        anal = rbind(anal, analytics)
    }
    
    # Select contributions for each user instance.
    anal = anal %>%
        dplyr::select(team, problem, user, report_count, resource_count, comment_count, vote_count,
               quick_rating, complete_rating, chat_count)
    
    # Limit outliers: cap chat counts at 100 so extreme values do not dominate the clustering.
    anal[anal$chat_count >= 100,]$chat_count = 100
    
    # Standardise each metric to mean 0 and standard deviation 1 (puts high chat counts etc. on a comparable scale).
    scanal = apply(anal[, 4:10], 2, function(x) {(x-mean(x))/sd(x)})
    scanal = cbind(anal[,1:3], scanal)
    
    # Get Euclidean distances 
    d = dist(scanal[, 4:10]) 
    
    # Consult multiple criteria to decide on the number of clusters (commented out because it is slow to run).
    # nc = NbClust::NbClust(scanal[,4:10], distance = "euclidean", min.nc = 2, max.nc = 15, method = "ward.D")
    # table(nc$Best.nc[1,])
    # We want a bit of distinction between profiles, so we look for a larger number of clusters.
    # Try 10 (supported by 3 criteria).
    
    # Run hierarchical clustering with Ward method to determine the centers
    fit.ward = hclust(d, method = "ward.D")
    clusters = cutree(fit.ward, k = nClusters) # ward results are much more promising
    
    # Plot dendrogram.
    if (generatePlots) {
        plot(fit.ward, hang = -1, cex = 0.6,
             main = paste0("Ward Linkage Clustering\n", nClusters, " Cluster Solution"))
        rect.hclust(fit.ward, k = nClusters)
    }
    centers = aggregate(scanal[,4:10], list(clusters), median)
    
    # Test hierarchical with silhouette method.
    sil = cluster::silhouette(clusters, d)
    # Nice visualisation of the silhouette width.
    p = factoextra::fviz_silhouette(sil, print.summary = FALSE) +
        theme_minimal()+
        theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
    if (generatePlots) {
        print(p)
    }
    
    # Now use centers as starting points for k-means clustering.
    set.seed(42)
    fit.km = kmeans(scanal[,4:10], centers = centers[, 2:8], nstart = 1)
    
    # Silhouette width now increased to 0.3
    sil = cluster::silhouette(fit.km$cluster, d)
    p = factoextra::fviz_silhouette(sil, print.summary = FALSE) +
        theme_minimal()+
        theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())  
    if (generatePlots) {
        print(p)
    }
    
    # Stripcharts for k-means: 
    anal[['cluster']] = factor(fit.km$cluster)
    anal[['clusterLabel']] = character(nrow(anal))
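    # NOTE: the labels below assume nClusters == 10.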
    anal[anal$cluster == 1,]$clusterLabel = 'Report Guru' # 'Talkative Multi-talent (Tier 2)'
    anal[anal$cluster == 2,]$clusterLabel = 'Allrounder' # 'Slow-rating Multi-talent (Tier 1)'
    anal[anal$cluster == 3,]$clusterLabel = 'Slow-rating Multi-talent (Tier 2)' # 'Speed-rating Multi-talent (Tier 1)'
    anal[anal$cluster == 4,]$clusterLabel = 'Drop In' # 'Speed-rating Multi-talent (Tier 1)' # 'Allrounder (Tier 1)'
    anal[anal$cluster == 5,]$clusterLabel = 'Talkative Multi-talent (Tier 2)' # 'Report Guru' # 'Slow-rating Multi-talent (Tier 2)'
    anal[anal$cluster == 6,]$clusterLabel = 'Slow-rating Multi-talent (Tier 1)' # 'Report Guru'
    anal[anal$cluster == 7,]$clusterLabel = 'Speed-rating Multi-talent (Tier 2)' # 'Allrounder (Tier 2)'
    anal[anal$cluster == 8,]$clusterLabel = 'Talkative Multi-talent (Tier 1)' # 'Slow-rating Multi-talent (Tier 1)' # 'Drop In' # 'Speed-rating Multi-talent (Tier 2)'
    anal[anal$cluster == 9,]$clusterLabel = 'Resource Guru' # 'Slow-rating Multi-talent (Tier 2)' # 'Single-minded Raters'
    anal[anal$cluster == 10,]$clusterLabel = 'Speed-rating Multi-talent (Tier 1)' # 'Drop In'
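    # The labels above are descriptive names for the cluster profiles (see the
    # stripcharts below); a quick numeric summary of each cluster can be obtained with, e.g.:
    # aggregate(anal[, 4:10], list(cluster = anal$cluster), mean)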
    
    # For plotting, rescale each metric to [0, 1] as a proportion of its column maximum.
    D = anal
    for (cn in colnames(D)[4:10]) {
        D[[cn]] = D[[cn]]/max(D[[cn]])
    }
    D = melt(D, id.vars = c("team","problem","user","cluster","clusterLabel"))
    P = list()
    for (p in 1:10) {
        P[[p]] = ggplot(D[D$cluster == p,], aes(x = variable, y = value)) +
            geom_jitter(aes(colour = variable), alpha = 0.3) + ylim(0,1) +
            coord_flip() +
            guides(color = FALSE) +
            theme_linedraw() +
            labs(x = "",
                 y = "Percentile",
                 title = paste("Cluster", p),
                 subtitle = D[D$cluster == p,]$clusterLabel[1]) +
            theme(panel.grid.major = element_blank(), panel.grid.minor.y = element_line(colour="grey", size=0.2))
    }
    pw = P[[1]] + P[[2]] + P[[3]] + P[[4]] + P[[5]] + P[[6]] + P[[7]] + P[[8]] + P[[9]] + P[[10]]
    
    if (generatePlots) {
        ggpubr::ggexport(pw,
                 filename = "Cluster Overview.png",
                 width = 4181,
                 height = 2000,
                 pointsize = 11,
                 res = 300)
        message("Exported 'Cluster Overview.png'")
    }
    
    # Create probparts tables: copy each instance's analytics and attach the cluster
    # assignment for each participant (matched on problem and user; participants
    # filtered out above receive NA).
    for (nm in names(repo)) {
        probparts = repo[[nm]]$PlatformData$analytics
        probparts$cluster = NA
        probparts$clusterLabel = NA
        
        for (k in 1:nrow(probparts)) {
            i = which((anal$problem == probparts$problem[k]) & (anal$user == probparts$user[k]))[1]
            probparts$cluster[k] = as.character(anal$cluster[i])
            probparts$clusterLabel[k] = as.character(anal$clusterLabel[i])
        }
        
        probparts$team_id = NULL
        probparts$problem_id = NULL
        probparts$user_id = NULL
        
        repo[[nm]]$CoreData$probparts = probparts
    }
    
    return(repo)
}
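
# Example usage (sketch; `repo` is assumed to have been populated as in compile_data() below):
# repo = compile_probparts(repo, nClusters = 10, generatePlots = TRUE)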

compile_rates_2020_HuntChallenge = function(repo, path_to_data, instance_name) {
    
    path = paste0(path_to_data, instance_name, "/KnackData/")
    
    ratings = fread(paste0(path, "ratings2020challenge.csv"))
    # Replace the raw export's column headers with short, consistent names.
    colnames(ratings) = c(
        "reportID",
        "password1",
        "password2",
        "password3",
        "problem",
        "team",
        "participant",
        "created",
        "rater",
        "c1", "c1comment", "c1distinction",
        "c2", "c2comment", "c2distinction",
        "c3", "c3distinction", "c3comment",
        "c4", "c4comment",
        "c5", "c5comment",
        "c6", "c6comment",
        "c7", "c7comment",
        "c8",
        "geo1",
        "geo2",
        "geo3",
        "geo4",
        "c8comment",
        "c1score",
        "c2score",
        "c3score",
        "c4score",
        "c5score",
        "c6score",
        "c7score",
        "c8score",
        "IC",
        "c1na",
        "c2na",
        "c3na",
        "c4na",
        "c5na",
        "c6na",
        "c7na",
        "c8na",
        "ruleBased",
        "ruleBasedScore",
        "ruleBaseAlexAns",
        "c4distinction",
        "c5distinction",
        "c6distinction",
        "c7distinction",
        "c8distinction",
        "lensKit",
        "geo1score",
        "geo2score",
        "geo3score",
        "geo4score",
        "nGeoCorrect",
        "isRedactionTestRating",
        "raterProbabilityEstimate",
        "estTimeTaken",
        "estJustification",
        "estComments",
        "bayes1",
        "bayes2",
        "bayes3",
        "bayes1score",
        "bayes2score",
        "bayes3score",
        "flaw1",
        "flaw2",
        "flaw3",
        "flaw4"
    )
    # Treat empty strings and zero IC scores as missing.
    ratings[ratings == ""] = NA
    ratings[IC == 0, "IC"] = NA
    # Keep only the columns used downstream, dropping passwords and other admin fields.
    ratings = ratings[,.(
        problem,
        team,
        rater,
        c1, c1comment,
        c2, c2comment,
        c3, c3comment,
        c4, c4comment,
        c5, c5comment,
        c6, c6comment,
        c7, c7comment,
        c8, c8comment,
        geo1,
        geo2,
        geo3,
        geo4,
        IC,
        geo1score,
        geo2score,
        geo3score,
        geo4score,
        nGeoCorrect,
        isRedactionTestRating,
        raterProbabilityEstimate,
        estTimeTaken,
        estJustification,
        estComments,
        bayes1,
        bayes2,
        bayes3,
        bayes1score,
        bayes2score,
        bayes3score,
        flaw1,
        flaw2,
        flaw3,
        flaw4
    )]
    
    # Convert the rater's probability estimate from a percentage to a proportion.
    ratings$raterProbabilityEstimate = ratings$raterProbabilityEstimate/100
    
    return(ratings)
}

compile_rates_2020_PsychologyCapstone = function(repo, path_to_data, instance_name) {
    
    path = paste0(path_to_data, instance_name, "/AdminData/")
    
    ratings = fread(paste0(path, "ratings.csv"))
    ratings[ratings == ""] = NA
    
    # Map the qualitative criterion ratings to numeric scores.
    scores = list(
        Poor = 1,
        Fair = 2,
        Good = 3,
        Excellent = 4
    )
    
    # Sum the eight criterion scores into an overall IC rating for each report.
    for (k in 1:nrow(ratings)) {
        ratings$IC[k] = scores[[ratings$c1[k]]] +
            scores[[ratings$c2[k]]] +
            scores[[ratings$c3[k]]] +
            scores[[ratings$c4[k]]] +
            scores[[ratings$c5[k]]] +
            scores[[ratings$c6[k]]] +
            scores[[ratings$c7[k]]] +
            scores[[ratings$c8[k]]]
    }
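    # A vectorised alternative (sketch; assumes the c1..c8 columns contain no NAs):
    # score_map = c(Poor = 1, Fair = 2, Good = 3, Excellent = 4)
    # ratings$IC = rowSums(sapply(paste0("c", 1:8), function(cn) score_map[ratings[[cn]]]))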
    
    ratings = ratings[,.(
        problem,
        team,
        rater,
        c1, c1comment,
        c2, c2comment,
        c3, c3comment,
        c4, c4comment,
        c5, c5comment,
        c6, c6comment,
        c7, c7comment,
        c8, c8comment,
        IC,
        nGeoCorrect,
        nBayesCorrect
    )]
    
    return(ratings)
}

compile_rates = function(repo, path_to_data, instance_name) {
    
    # Lookup table of instance-specific functions. This is required because the ratings
    # format differs across experiments, so each instance requires custom code to tidy
    # the data into a consistent format.
    compile_rates = list(
        "x2020_HuntChallenge" = compile_rates_2020_HuntChallenge,
        "x2020_PsychologyCapstone" = compile_rates_2020_PsychologyCapstone
    )
    
    if (instance_name %in% names(compile_rates)) {
        return(compile_rates[[instance_name]](repo, path_to_data, instance_name))
    } else {
        return(NULL)
    }
}
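
# To support ratings from a new experiment instance, one would (sketch) write a
# compile_rates_<instance_name> function returning the same columns and register it
# in the lookup table above; "compile_rates_NewInstance" below is hypothetical:
# compile_rates = list(
#     "x2020_HuntChallenge" = compile_rates_2020_HuntChallenge,
#     "x2020_PsychologyCapstone" = compile_rates_2020_PsychologyCapstone,
#     "xNewInstance" = compile_rates_NewInstance
# )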

#' Compile data from 'raw' SWARM, Qualtrics & Knack CSVs
#'
#' \code{compile_data} compiles data from 'raw' CSVs exported from the various
#' platforms used (SWARM, Qualtrics, Knack) for the SWARM experiments, including
#' the Hunt Challenge 2020, and saves tidied versions of them to the package
#' data file, overwriting any that were already saved.
#'
#' Use this function to refresh the tidied versions of the data whenever the
#' raw data is updated.
#'
#' @param path Path to the directory containing the raw instance data (one
#'   sub-directory per instance).
#' @param use_previous_redactions If \code{TRUE}, reuse the redactions saved from a
#'   previous PII-redaction session instead of starting a new one.
#' @param redaction_patterns Additional patterns (literal strings or regular
#'   expressions) to redact; only used when \code{use_previous_redactions} is
#'   \code{FALSE}.
#'
#' @export
#' 
#' @examples
#' \dontrun{
#' # (re-)compile data
#' compile_data()
#' }
compile_data = function(path = "data/",
                        use_previous_redactions = TRUE,
                        redaction_patterns = c()) {
    
    require(dplyr)
    require(data.table)
    
    instances = c(
        'x2020_PsychologyCapstone',
        'x2020_HuntChallenge',
        'x2018_SwarmChallengeExp1'
    )
    
    # Initialise repo list.
    repo = list()
    for (instance_name in instances) {
        repo[[instance_name]] = list()
    }
    
    # Populate repo with platform data.
    for (instance_name in instances) {
        repo[[instance_name]][['PlatformData']] = fetchPlatformData(path, instance_name)
    }
    
    # Compile useful tables.
    for (instance_name in instances) {
        repo[[instance_name]][['CoreData']] = list()
        repo[[instance_name]][['CoreData']]$parts = compile_parts(path, instance_name)
        repo[[instance_name]][['CoreData']]$teamparts = compile_teamparts(repo, path, instance_name)
        repo[[instance_name]][['CoreData']]$teams = compile_teams(repo, path, instance_name)
        repo[[instance_name]][['CoreData']]$probteams = compile_probteams(repo, path, instance_name)
        repo[[instance_name]][['CoreData']]$rates = compile_rates(repo, path, instance_name)
    }
    repo = compile_probparts(repo, nClusters = 10, generatePlots = F)
    
    # Reorder each instance's sub-lists so CoreData appears before PlatformData.
    for (instance_name in instances) {
        repo[[instance_name]] = list(
            CoreData = repo[[instance_name]][['CoreData']],
            PlatformData = repo[[instance_name]][['PlatformData']]
        )
    }
    
    if (use_previous_redactions) {
        repo = apply_previous_redactions(
            repo,
            instances = c('x2020_HuntChallenge',
                          'x2020_PsychologyCapstone'),
            path = 'experiment-data'
        )
    } else {
        patterns = c(redaction_patterns,
                 'phone number', 'my number is',
                 "[[:alnum:]._-]+@[[:alnum:].-]+", # basic regex for email addresses
                 "(^| |\\+)[0-9]{3,}( |-)[0-9]{3}( |-)[0-9]{3}( |$)"    # basic regex for phone numbers
                 )
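        # Quick sanity check of the default patterns (sketch; the indices assume
        # redaction_patterns is empty and the example strings are made up):
        # grepl(patterns[3], "contact me at jane.doe@example.org")  # email regex -> TRUE
        # grepl(patterns[4], "call me on 0412 345 678")             # phone regex -> TRUE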
        repo = run_PII_redaction_session(
            repo,
            patterns,
            c('x2020_HuntChallenge'),
            path = 'experiment-data'
        )
    }
    
    
    # Anonymise real names of raters.
    for (instance_name in instances) {
        if ('rates' %in% names(repo[[instance_name]]$CoreData)) {
            rates = repo[[instance_name]]$CoreData$rates
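            # as.integer(factor(...)) maps each distinct rater name to an integer
            # (1, 2, ...) in sorted-name order, so real names do not appear in the
            # exported rates tables.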
            rates$rater = as.integer(factor(rates$rater))
            repo[[instance_name]]$CoreData$rates$rater = rates$rater
        }
    }
    
    # Save copy of 'tidy' repo version to experiment-data repository.
    export_repo_to_CSV(repo, 'tidy')
    
    # Remove columns that contain free-text user input.
    user_input_cols = c(
        "interestsOtherInput",
        "priOtherInput",
        "multidisciplinaryExperienceInput",
        "describeAnalyticalExperience",
        "bestThing",
        "worstThing",
        "pfComments",
        "mostValuable",
        "lkSuggestions",
        "ratingToolWhyNot",
        "featureRequests",
        "externalToolsComments",
        "bestQuestionNotAsked",
        "testimonial",
        "otherComments",
        "chat_text",
        "response_title"
    )
    for (instance_name in instances) {
        fldrs = names(repo[[instance_name]])
        for (fldr in fldrs) {
            tbls = names(repo[[instance_name]][[fldr]])
            for (tbl in tbls) {
                cols = names(repo[[instance_name]][[fldr]][[tbl]])
                for (user_input_col in user_input_cols) {
                    if (user_input_col %in% cols) {
                        repo[[instance_name]][[fldr]][[tbl]][[user_input_col]] = NULL
                    }
                }
            }
        }
    }
    
    # Save 'noPII' repo version to experiment-data repository.
    export_repo_to_CSV(repo, 'noPII')
    
    # Save the compiled data into the package, drop the local copy, and reload the package.
    save(repo,
         file="huntr/data/repo.RData")
    remove(repo)
    message("Reloading package...")
    devtools::load_all("huntr")
    
}