source_folder <- "data-raw/dicts/means_only/"
means_file_list <- list.files(source_folder)
# set up base dataframe
cnames <- c(
"term",
"dataset",
"context",
"year",
"component",
"instcodes",
"group",
"E",
"P",
"A",
"n_E",
"n_P",
"n_A",
"sd_E",
"sd_P",
"sd_A",
"cov_EE",
"cov_EP",
"cov_EA",
"cov_PE",
"cov_PP",
"cov_PA",
"cov_AE",
"cov_AP",
"cov_AA"
)
meannames <- c(
"term",
"dataset",
"context",
"year",
"component",
"instcodes",
"group",
"E",
"P",
"A"
)
individual_keys <- c(
"dukecommunity2015",
"dukestudent2015",
"uga2015",
"usstudent2015",
"usmturk2015",
"egypt2015",
"morocco2015",
"usfullsurveyor2015",
"occs2019",
"occs2020",
"occs2021",
"artifactmods2022",
"humanvalues2022",
"products2022"
)
# meta <- utils::read.csv("data-raw/dicts/dict_info.csv")
exclude_keys <- c("prisonersdilemma") #, "occs2019", "occs2020")
mean_epa <- data.frame(matrix(nrow = 0, ncol = length(meannames)))
names(mean_epa) <- meannames
for(file in means_file_list){
path <- paste0(source_folder, "/", file)
component <- stringr::str_extract(file, "(?<=_)[[:alpha:]]*(?=.)")
key <- stringr::str_extract(file, "^[[:alnum:]]*(?=_)")
# we don't need to include the public means-only data for the sets we have individual level data for.
if(!(key %in% individual_keys) & !(key %in% exclude_keys)){
# context <- stringr::str_extract(key, "^[[:alpha:]]*(?=[[:digit:]])")
year <- stringr::str_extract(key, "[[:digit:]]*$")
context <- dict_meta[dict_meta$key == key, "context"]
# I'm leaving year as a regex instead of taking it from meta because this ensures it is numeric and therefore sortable
# year <- meta[meta$key == key, "year"]
filetype <- stringr::str_extract(file, "\\.[[:alpha:]]*$")
datatype <- dplyr::case_when(
grepl("COV", file) ~ "COV",
grepl("SD", file) ~ "SD",
TRUE ~ "mean"
)
con <- file(path, "r")
firstline <- readLines(con = con, n = 1)
close(con)
if(grepl(',', firstline)){
# the file is comma-separated
sep <- ","
} else {
# file is space separated
sep <- " "
}
if(grepl('term', firstline)){
head <- TRUE
} else {
head <- FALSE
}
data <- read.table(path, header = head, sep = sep, quote = "", fill=FALSE)
nc <- ncol(data)
namesplaceholder <- list()
for(i in 1:nc){
namesplaceholder <- append(namesplaceholder, paste0("V", i))
}
names(data) <- namesplaceholder
if(datatype == "mean"){
male <- data %>%
dplyr::select(-c(V5:V7))
female <- data %>%
dplyr::select(-c(V2:V4))
average <- data %>%
dplyr::rowwise() %>%
dplyr::mutate(avE = mean(c(V2, V5)),
avP = mean(c(V3, V6)),
avA = mean(c(V4, V7))) %>%
dplyr::select(-c(V2:V7))
if(ncol(female) == 5){
colnames(male) <- c("term", "E", "P", "A", "instcodes")
colnames(female) <- c("term", "E", "P", "A", "instcodes")
colnames(average) <- c("term", "instcodes", "E", "P", "A")
average <- dplyr::select(average, term, E, P, A, instcodes)
} else {
male$instcodes <- NA
female$instcodes <- NA
average$instcodes <- NA
colnames(male) <- c("term", "E", "P", "A", "instcodes")
colnames(female) <- c("term", "E", "P", "A", "instcodes")
colnames(average) <- c("term", "E", "P", "A", "instcodes")
}
# standardize terms
male <- standardize_terms(male, key, component)
female <- standardize_terms(female, key, component)
average <- standardize_terms(average, key, component)
term_table_input <- cbind(average[,1], rep(1, length(average[,1])))
colnames(term_table_input) <- c("term", key)
# # get institution codes; add to aggregate list
# if("instcodes" %in% colnames(average)){
# inst_table_input <- dplyr::select(average, term, instcodes)
# colnames(inst_table_input) <- c("term", key)
#
# if(component == 'identities'){
# inst_code_table_ident <- dplyr::full_join(inst_code_table_ident, inst_table_input, by = "term")
# } else if (component == 'behaviors'){
# inst_code_table_beh <- dplyr::full_join(inst_code_table_beh, inst_table_input, by = "term")
# } else if (component == 'mods'){
# inst_code_table_mod <- dplyr::full_join(inst_code_table_mod, inst_table_input, by = "term")
# } else if (component == 'settings'){
# inst_code_table_set <- dplyr::full_join(inst_code_table_set, inst_table_input, by = "term")
# }
# }
# are the male and female sets identical? If so the reported values are averages already and need only print one file
if(isTRUE(all.equal(male$E, female$E))){
avonly <- TRUE
} else {
avonly <- FALSE
}
# format average file for long format and append to complete df
average$dataset <- key
average$context <- context
average$year <- year
average$component <- component
average$group <- "all"
average <- dplyr::select(average, all_of(meannames))
mean_epa <- rbind(mean_epa, average)
# then do the same with male and female files if necessary
if(avonly == FALSE){
male$dataset <- key
male$context <- context
male$year <- year
male$component <- component
male$group <- "male"
male <- dplyr::select(male, all_of(meannames))
female$dataset <- key
female$context <- context
female$year <- year
female$component <- component
female$group <- "female"
female <- dplyr::select(female, all_of(meannames))
if(key == "internationaldomesticrelations1981"){
male$group <- "professional"
female$group <- "nonprofessional"
} else if(key == "gaymensanfrancisco1980"){
male$group <- "unsafebetter"
female$group <- "safebetter"
}
mean_epa <- rbind(mean_epa, male)
mean_epa <- rbind(mean_epa, female)
}
}
}
}
mean_epa <- mean_epa %>%
dplyr::rename(comp = component) %>%
dplyr::mutate(
comp = dplyr::case_when(
comp == "identities" ~ "identity",
comp == "behaviors" ~ "behavior",
comp == "mods" ~ "modifier",
comp == "settings" ~ "setting",
TRUE ~ NA_character_
)
) %>%
dplyr::rename(component = comp)
# usethis::use_data(mean_epa, overwrite = TRUE)
#### MEANS DATASETS THAT NEED MORE REFORMATTING: CALCUTTA, LULHAM AND SHANK, MOSTAFAVI ####################################################
source_folder <- "data-raw/dicts/summary_raw"
files <- list.files(source_folder)
for(file in files[files != "17_Cultures_EPA.xlsx"]){
path <- paste0(source_folder, "/", file)
key <- stringr::str_extract(file, "^[[:alnum:]]*")
year <- stringr::str_extract(key, "[[:digit:]]*$")
# print(key)
if(grepl("calcutta", key)){
# the calcutta dataset should be split into two: one for all respondents, one for the subset of respondents that uses the scales correctly.
data <- readr::read_csv(path) %>%
dplyr::rename(term = English) %>%
dplyr::select(-Bengali) %>%
dplyr::rename_with(~stringr::str_replace(., "@", "all_")) %>%
dplyr::rename_with(~stringr::str_replace(., "%", "subset_")) %>%
dplyr::mutate(term = stringr::str_to_lower(term))
# standardize terms
data_std <- standardize_terms(data, key = "calcutta", component = "undetermined") %>%
mutate(term = dplyr::case_when(term == "know_it_all" & all_mE == .16 ~ "know_it_all_translation_1",
term == "know_it_all" & all_mE == -.57 ~ "know_it_all_translation_2",
TRUE ~ term))
# term_table_input_calcutta <- cbind(data_std[,1], rep(1, length(data_std[,1])))
# colnames(term_table_input_calcutta) <- c("term", key)
calcutta <- data_std %>%
tidyr::pivot_longer(cols = c(starts_with("all"), starts_with("subset")),
names_to = c("dataset", ".value"), names_pattern = "([[:alpha:]]*)_(.*)") %>%
dplyr::rowwise() %>%
dplyr::mutate(aE = (mE*mEN + fE*fEN)/(mEN + fEN),
aP = (mP*mPN + fP*fPN)/(mPN + fPN),
aA = (mA*mAN + fA*fAN)/(mAN + fAN),
aEN = mEN + fEN,
aPN = mPN + fPN,
aAN = mAN + fAN,
aE_SD = sqrt(((mEN - 1)*mE_SD^2 + (fEN - 1)*fE_SD^2)/(mEN + fEN - 2)),
aP_SD = sqrt(((mPN - 1)*mP_SD^2 + (fPN - 1)*fP_SD^2)/(mPN + fPN - 2)),
aA_SD = sqrt(((mAN - 1)*mA_SD^2 + (fAN - 1)*fA_SD^2)/(mAN + fAN - 2))
) %>%
dplyr::ungroup() %>%
tidyr::pivot_longer(cols = c(-term, -component, -dataset),
names_to = c("group", ".value"),
names_pattern = "^(.)(.*)") %>%
dplyr::rename(n_E = EN,
n_P = PN,
n_A = AN,
sd_E = E_SD,
sd_P = P_SD,
sd_A = A_SD) %>%
dplyr::mutate(context = dict_meta[dict_meta$key == "calcuttaall2017", "context"],
year = year,
dataset = dplyr::case_when(dataset == "all" ~ "calcuttaall2017",
dataset == "subset" ~ "calcuttasubset2017"),
group = dplyr::case_when(group == "m" ~ "male",
group == "f" ~ "female",
group == "a" ~ "all")) %>%
dplyr::mutate(across(where(is.numeric), ~round(., digits = 2))) %>%
dplyr::mutate(component = ifelse(.data$component == "other", "artifact", .data$component))
} else {
data <- read.csv(path)
if(grepl("employeeorg", key)){
data_std <- data %>%
dplyr::mutate(
term = tolower(stringr::str_squish(term)),
# there are two term formats here; companies and the people who work for them
term = ifelse(condition == "org",
term,
paste0("employee of ", term)
),
mean_E = stringr::str_extract(stringr::str_squish(E), ".*(?=\\s)"),
sd_E = stringr::str_extract(stringr::str_squish(E), "(?<=\\().*(?=\\))"),
mean_P = stringr::str_extract(stringr::str_squish(P), ".*(?=\\s)"),
sd_P = stringr::str_extract(stringr::str_squish(P), "(?<=\\().*(?=\\))"),
mean_A = stringr::str_extract(stringr::str_squish(A), ".*(?=\\s)"),
sd_A = stringr::str_extract(stringr::str_squish(A), "(?<=\\().*(?=\\))"),
component = "identity",
group = "all"
) %>%
dplyr::mutate(across(c(starts_with("mean"), starts_with("sd")), ~round(as.numeric(.), digits = 2))) %>%
dplyr::select(term, component, group, starts_with("mean"), starts_with("sd")) %>%
dplyr::rename(E = mean_E,
P = mean_P,
A = mean_A) %>%
standardize_terms(key = key)
} else if (grepl("generaltech", key)){
data_std <- data %>%
dplyr::rename(term = MEANS,
sd_E = E.1,
sd_P = P.1,
sd_A = A.1) %>%
dplyr::select(-SDs) %>%
dplyr::mutate(across(c(E, P, A, starts_with("sd")), ~round(as.numeric(.), digits = 2))) %>%
dplyr::mutate(component = "artifact",
group = "all") %>%
standardize_terms(key = key)
} else if (grepl("groups2017", key)){
data_std <- data %>%
dplyr::rename(term = Group_concept,
E = Evaluation,
P = Potency,
A = Activity) %>%
dplyr::mutate(component = "identity",
group = "all") %>%
dplyr::mutate(across(c(E, P, A), ~round(as.numeric(.), digits = 2))) %>%
standardize_terms(key = key)
} else if (grepl("groups2019", key)){
data_std <- data %>%
dplyr::rename(term = X.1,
E = Means,
P = X.2,
A = X.3) %>%
dplyr::select(-X) %>%
dplyr::filter(term != "") %>%
dplyr::mutate(component = "identity",
group = "all") %>%
dplyr::mutate(across(c(E, P, A), ~round(as.numeric(.), digits = 2))) %>%
standardize_terms(key = key)
} else if (grepl("nounphrasegrammar2019", key)){
data_std <- data %>%
standardize_terms(key = key) %>%
dplyr::mutate(
mean = as.numeric(stringr::str_extract(stringr::str_squish(rating), ".*(?=\\s)")),
sd = as.numeric(stringr::str_extract(stringr::str_squish(rating), "(?<=\\().*(?=\\))")),
plural = paste0(term, "s")) %>%
dplyr::mutate(term_new = dplyr::case_when(
condition == 1 & stringr::str_extract(term, "^.") %in% c("a", "e", "i", "o", "u") ~ paste0("an_", term),
condition == 1 ~ paste0("a_", term),
condition == 2 ~ paste0("the_", term),
condition == 3 ~ plural,
condition == 4 ~ paste0("the_", plural),
condition == 5 ~ paste0("all_", plural)
)) %>%
dplyr::select(term_new, dimension, mean, sd) %>%
tidyr::pivot_wider(names_from = dimension, values_from = c("mean", "sd")) %>%
dplyr::rename(E = mean_E,
P = mean_P,
A = mean_A,
term = term_new) %>%
dplyr::mutate(across(c(E, P, A, starts_with("sd")), ~round(as.numeric(.), digits = 2))) %>%
dplyr::mutate(component = "identity",
group = "all")
} else if (grepl("techvshuman", key)){
data_std <- data %>%
dplyr::rename(
term_human = X,
term_comp = X.1,
term_ai = X.2,
E_human = Evaluation,
E_comp = X.3,
E_ai = X.4,
P_human = Potency,
P_comp = X.5,
P_ai = X.6,
A_human = Activity,
A_comp = X.7,
A_ai = X.8
) %>%
dplyr::filter(term_human != "human identity") %>%
tidyr::pivot_longer(cols = everything(),
names_to = c(".value", "actortype"),
names_pattern = "(.*)_(.*)") %>%
dplyr::mutate(across(c(E, P, A), ~round(as.numeric(.), digits = 2))) %>%
dplyr::select(-actortype) %>%
dplyr::mutate(component = "identity",
group = "all") %>%
standardize_terms(key = key)
} else if (grepl("ugatech", key)){
data_std <- data %>%
dplyr::mutate(n_E_male = n_male,
n_P_male = n_male,
n_A_male = n_male,
n_E_female = n_female,
n_P_female = n_female,
n_A_female = n_female) %>%
dplyr::rowwise() %>%
dplyr::mutate(
mean_E_average = (mean_E_male * n_male + mean_E_female * n_female)/(n_male + n_female),
mean_P_average = (mean_P_male * n_male + mean_P_female * n_female)/(n_male + n_female),
mean_A_average = (mean_A_male * n_male + mean_A_female * n_female)/(n_male + n_female),
sd_E_average = sqrt(((n_male - 1)*sd_E_male^2 + (n_female - 1)*sd_E_female^2)/(n_male + n_female - 2)),
sd_P_average = sqrt(((n_male - 1)*sd_P_male^2 + (n_female - 1)*sd_P_female^2)/(n_male + n_female - 2)),
sd_A_average = sqrt(((n_male - 1)*sd_A_male^2 + (n_female - 1)*sd_A_female^2)/(n_male + n_female - 2)),
n_E_average = n_male + n_female,
n_P_average = n_male + n_female,
n_A_average = n_male + n_female,
) %>%
dplyr::ungroup() %>%
dplyr::select(-n_male, -n_female) %>%
tidyr::pivot_longer(cols = c(contains("male"), contains("average")),
names_to = c(".value", "dimension", "group"),
names_sep = "_") %>%
dplyr::group_by(term, group) %>%
dplyr::mutate(across(c("mean", "sd"), ~round(., digits = 2))) %>%
tidyr::pivot_wider(names_from = "dimension", values_from = c("mean", "sd", "n")) %>%
dplyr::rename(E = mean_E,
P = mean_P,
A = mean_A) %>%
dplyr::mutate(group = ifelse(group == "average", "all", group))
} else if(grepl("mostafavi", key)){
data_std <- data %>%
dplyr::select(Original_term, component, E, P, A, E_std, P_std, A_std) %>%
dplyr::rename(term = Original_term,
sd_E = E_std,
sd_P = P_std,
sd_A = A_std) %>%
dplyr::mutate(group = "all") %>%
standardize_terms(key = key) %>%
dplyr::filter(term != "")
}
}
data_std <- data_std %>%
dplyr::mutate(context = ifelse(!grepl("calcutta", key), dict_meta[dict_meta$key == key, "context"], context),
year = year,
dataset = key) %>%
dplyr::select(any_of(cnames))
mean_epa <- mean_epa %>%
dplyr::full_join(data_std)
}
#### NOW INDIVIDUAL DATA: USE TO CALCULATE SD/COV AND LUMP TOGETHER TO SAVE #######################
mean_variance_epa <- data.frame(matrix(nrow = 0, ncol = length(cnames) - 1))
individual <- data.frame()
names(mean_variance_epa) <- cnames[cnames != "instcodes"]
source_folder <- "data-raw/dicts/individual"
ind_file_list <- grep("RDS$", list.files(source_folder), value = TRUE)
for(file in ind_file_list){
path <- paste0(source_folder, "/", file)
key <- stringr::str_extract(file, "^[[:alnum:]]*(?=_)")
if(!(key %in% exclude_keys)){
# context <- stringr::str_extract(key, "^[[:alpha:]]*(?=[[:digit:]])")
context <- dict_meta[dict_meta$key == key, "context"]
# I'm leaving year as a regex instead of taking it from meta because this ensures it is numeric and therefore sortable
year <- stringr::str_extract(key, "[[:digit:]]*$")
# year <- meta[meta$key == key, "year"]
data <- readRDS(file = path) %>%
dplyr::mutate(dataset = key,
context = context,
year = year) %>%
dplyr::select(dataset, context, year, dplyr::everything()) %>%
dplyr::mutate(across(everything(), as.character))
sum_data <- epa_summary(data)
# commented out because it fails with the mturk dictionary, I think because it calculates the sd with all values but the vcov matrix only with complete pairs.
# if(!check_sd_cov_vals(sum_data)){
# stop(print(paste("error with current dataset ", key)))
# }
# add in dataset level variables
# in principle you could calculate different gender versions but in practice these are generally not useful so I am going to lump all together.
# if someone wants to do this (for any characteristic) they can using the individual data.
sum_data <- sum_data %>%
dplyr::mutate(dataset = key,
context = context,
year = year,
group = "all",
E = mean_E,
P = mean_P,
A = mean_A) %>%
dplyr::select(any_of(cnames))
# some of this info is duplicative but also this is what bayesact expects so perhaps it's worth keeping all and just making note... the datasets aren't that big
mean_variance_epa <- rbind(mean_variance_epa, sum_data)
if(nrow(individual) == 0){
individual <- data.frame(matrix("", nrow = 0, ncol = ncol(data)))
names(individual) <- names(data)
}
individual <- dplyr::full_join(individual, data)
}
}
duplicateid <- individual %>%
dplyr::select(dataset, userid) %>%
dplyr::distinct() %>%
dplyr::group_by(userid) %>%
dplyr::mutate(n = dplyr::n()) %>%
dplyr::filter(n > 1,
dataset %in% c("usfullsurveyor2015", "usstudent2015")) %>%
dplyr::select(-n)
individual <- individual %>%
dplyr::anti_join(duplicateid)
# # no uga userIDs left in the usfullsurveyor dataset--all UGA folks are accounted for under the uga dataset.
# unique(stringr::str_extract(dplyr::filter(individual, dataset == "usfullsurveyor2015")$userid, "[[:alpha:]]*"))
#
# duplicated <- individual %>%
# dplyr::inner_join(dplyr::select(duplicateid, userid)) %>%
# dplyr::distinct(dplyr::across(-dataset))
# # Only issue is that two people in duke community got the same identifier--modify so they are different.
# problemmatch <- duplicated %>%
# dplyr::group_by(userid, term, component) %>%
# dplyr::mutate(n = dplyr::n()) %>%
# dplyr::filter(n > 1)
#### INSTITUTION CODES #################################
instcodes_df <- utils::read.csv2("data-raw/dicts/instcodes.csv", header = FALSE, sep = ",", col.names = c("term", "component", "instcodes"))
instcodes_df <- rbind(standardize_terms(instcodes_df[which(instcodes_df$component == "identity"),], key = "uga2015", component = "identity"),
standardize_terms(instcodes_df[which(instcodes_df$component == "behavior"),], key = "uga2015", component = "behavior"),
standardize_terms(instcodes_df[which(instcodes_df$component == "modifier"),], key = "uga2015", component = "modifier"),
standardize_terms(instcodes_df[which(instcodes_df$component == "setting"),], key = "uga2015", component = "setting")) %>%
mutate(instcodes = stringr::str_trim(instcodes))
individual <- individual %>%
dplyr::mutate(
userid = dplyr::case_when(userid == "DComm597" & gender == "Female" ~ "DComm597a",
userid == "DComm597" & gender == "Male" ~ "DComm597b",
TRUE ~ userid)
) %>%
dplyr::left_join(instcodes_df, by = c("term", "component")) %>%
dplyr::select(any_of(c("dataset", "context", "year", "userid", "gender", "age",
"raceeth", "race", "race1", "race2", "hisp", "term", "component", "instcodes")),
everything())
individual <- tibble::as_tibble(individual) %>%
dplyr::mutate(E = as.numeric(E),
P = as.numeric(P),
A = as.numeric(A))
epa_summary_statistics <- dplyr::bind_rows(mean_variance_epa, mean_epa) %>%
dplyr::full_join(calcutta) %>%
dplyr::filter(!is.na(E) | !is.na(P) | !is.na(A)) %>%
dplyr::filter(!is.na(term)) %>%
dplyr::arrange(dataset, term) %>%
dplyr::rename(instcodes_old = instcodes) %>%
dplyr::left_join(instcodes_df, by = c("term", "component")) %>%
# merge the old and new instititution codes together -- take the uga one unless there is no uga one
dplyr::mutate(instcodes = ifelse(is.na(instcodes), instcodes_old, instcodes)) %>%
dplyr::mutate(instcodes = stringr::str_trim(instcodes)) %>%
dplyr::select(-instcodes_old) %>%
dplyr::select(term, component, dataset, context, year, group, instcodes, everything())
epa_summary_statistics <- tibble::as_tibble(epa_summary_statistics) %>%
dplyr::mutate(E = as.numeric(E),
P = as.numeric(P),
A = as.numeric(A))
# there are 642 instances where institution codes do not agree between the uga set and whatever the old set was
# I am overwriting the old codes with the uga codes, for consistency
# notequal <- dplyr::filter(epa_summary_statistics, instcodes != stringr::str_trim(instcodes_old))
# # there are 10389 instances where a term has an institution code in some dataset but not in uga
# # are there inconsistencies here?
# notinuga <- dplyr::filter(epa_summary_statistics, is.na(instcodes) & !is.na(instcodes_old)) %>%
# dplyr::select(term, component, instcodes_old) %>%
# dplyr::distinct() %>%
# dplyr::group_by(term, component) %>%
# dplyr::mutate(n = dplyr::n()) %>%
# filter(n > 1)
#
# # there are 27 terms which do not have a code in the uga dataset and which have conflicting codes in other datasets
# length(unique(notinuga$term))
# save the combined summary statistic dataframe
# usethis::use_data(mean_variance_epa, overwrite = TRUE)
usethis::use_data(epa_summary_statistics, overwrite = TRUE, compress = "bzip2")
# save the combined individual dataframe
usethis::use_data(individual, overwrite = TRUE, compress = "bzip2")
check_sd_cov_vals <- function(data){
# check if the diagonal of the vcov matrix is the same as the standard deviation squared (ie is it really the variance)
dat_check <- data %>%
dplyr::mutate(E_var_sd2 = sd_E*sd_E,
P_var_sd2 = sd_P*sd_P,
A_var_sd2 = sd_A*sd_A,
var_E_equal = abs(E_var_sd2 - cov_EE) <= 1e-1,
var_P_equal = abs(P_var_sd2 - cov_PP) <= 1e-1,
var_A_equal = abs(A_var_sd2 - cov_AA) <= 1e-1,
refl1 = cov_EP == cov_PE,
refl2 = cov_EA == cov_AE,
refl3 = cov_AP == cov_PA)
if(!all(dat_check$var_E_equal, na.rm = TRUE) |
!all(dat_check$var_P_equal, na.rm = TRUE) |
!all(dat_check$var_A_equal, na.rm = TRUE)) {
return(FALSE)
} else {
print("variance and sd check out")
}
# check for expected reflection in the vcov matrix
if(!all(dat_check$refl1, na.rm = TRUE) |
!all(dat_check$refl2, na.rm = TRUE) |
!all(dat_check$refl3, na.rm = TRUE)){
return(FALSE)
} else {
print("matrix is upper triangular")
}
return(TRUE)
}
##### CREATE TERM TABLE
keys <- unique(epa_summary_statistics$dataset)
term_table <- epa_summary_statistics %>%
dplyr::select(term, component) %>%
dplyr::distinct()
for(key in keys){
subset_idents <- epa_summary_statistics[epa_summary_statistics[["dataset"]] == key,] %>%
dplyr::select(term, component) %>%
dplyr::distinct() %>%
dplyr::mutate({{key}} := 1)
term_table <- term_table %>%
dplyr::left_join(subset_idents, by = c("term", "component"))
}
term_table <- term_table %>%
dplyr::mutate(across(-c("term", "component"), ~replace(., is.na(.), 0))) %>%
dplyr::select(term, component, everything())
usethis::use_data(term_table, overwrite = TRUE, compress = "bzip2")
#
# instcode_rowequal <- function(row){
# noterm <- row[,2:ncol(row)]
# noterm <- as.character(noterm)
# noterm <- noterm[!is.na(noterm)]
# if(length(unique(noterm)) > 1){
# return(FALSE)
# }
# else{
# return(TRUE)
# }
# }
# inst_code_table_ident <- dplyr::select(inst_code_table_ident, -allsame)
# inst_code_table_beh <- dplyr::select(inst_code_table_beh, -allsame)
# inst_code_table_mod <- dplyr::select(inst_code_table_mod, -allsame)
# inst_code_table_set <- dplyr::select(inst_code_table_set, -allsame)
#
#
# # figure out which are duplicates
# inst_code_table_ident <- inst_code_table_ident %>%
# dplyr::filter(!is.na(term))
# inst_code_table_beh <- inst_code_table_beh %>%
# dplyr::filter(!is.na(term))
# inst_code_table_mod <- inst_code_table_mod %>%
# dplyr::filter(!is.na(term))
# inst_code_table_set <- inst_code_table_set %>%
# dplyr::filter(!is.na(term))
#
#
# inst_code_table_ident_temp <- inst_code_table_ident
# for(i in 1:nrow(inst_code_table_ident)){
# inst_code_table_ident_temp[i, "allsame"] <- instcode_rowequal(inst_code_table_ident[i,])
# }
# inst_code_table_ident <- inst_code_table_ident_temp
#
# inst_code_table_beh_temp <- inst_code_table_beh
# for(i in 1:nrow(inst_code_table_beh)){
# inst_code_table_beh_temp[i, "allsame"] <- instcode_rowequal(inst_code_table_beh[i,])
# }
# inst_code_table_beh <- inst_code_table_beh_temp
#
# inst_code_table_mod_temp <- inst_code_table_mod
# for(i in 1:nrow(inst_code_table_mod)){
# inst_code_table_mod_temp[i, "allsame"] <- instcode_rowequal(inst_code_table_mod[i,])
# }
# inst_code_table_mod <- inst_code_table_mod_temp
#
# inst_code_table_set_temp <- inst_code_table_set
# for(i in 1:nrow(inst_code_table_set)){
# inst_code_table_set_temp[i, "allsame"] <- instcode_rowequal(inst_code_table_set[i,])
# }
# inst_code_table_set <- inst_code_table_set_temp
#
# inst_dups_ident <- dplyr::filter(inst_code_table_ident, allsame == FALSE)
# inst_dups_beh <- dplyr::filter(inst_code_table_beh, allsame == FALSE)
# inst_dups_mod <- dplyr::filter(inst_code_table_mod, allsame == FALSE)
# inst_dups_set <- dplyr::filter(inst_code_table_set, allsame == FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.