Nothing
#Rrepest Isolated for one database (group)
rrepest_base <- function(data, svy, est, by = NULL, over = NULL,
test = FALSE, user_na= FALSE, show_na= FALSE, flag = FALSE,
fast = FALSE, cores = NULL,...) {
# Goal: Dataframe with statistics for TALIS
# ------ INPUTS ------.
######### WHO #########
# data : (dataframe) df to analyze
# svy : List of possible projects to analyse TALISSCH and TALISTCH
######### WHAT #########
# est : (est function) that takes arguments what = estimate, tgt = target, rgr = regressor
# what : (string vector) accepts "mean","var","std", "quant", "iqr", "freq", "lm", "log", "corr", "cov"
# tgt : (string vector) variable from where to get frequencies
# rgr : (string vector) independant variable for regression (1+)
######### WHERE #########
# by.var : (string vector) column in which we'll break down results
# over : (vector string) columns over which to do analysis
######### HOW #########
# test : (bool) If TRUE will calculate the difference between over variables
# user_na : (Bool) TRUE → show nature of user defined missing values for by.var
# show_na : (Bool) TRUE → include na in frequencies of x
# flag : (Bool) TRUE → Show NaN when there is not enough cases (or schools)
# fast : (Bool) TRUE → Only do 6 replicated weights
# ...
# isced : (number) isced level to analyze
# Argument renaming to be adapted in v2 -----------------------------------
by.var <- by
what <- est$what
tgt <- est$tgt
rgr <- est$rgr
# Groups ------------------------------------------------------------------
# if (!is.null(group)) {
# data <- do_group(data = data, grp.l = group)
# }
# Plausible Values DIGITS--------------------------------------------------
# If tgt or rgr has an @ then get digits of PVs
if (any(grepl("@", c(tgt,rgr,by.var,over)))) {
# Get all PV digits
pv.digits <- digits.pvs(svy = svy, ...)
pv <- T #To know when to parallelize
} else {
# if no @ the just iterate over 1 value
pv.digits <- "@"
pv <- F #To know when to parallelize
}
# FORMATING ---------------------------------------------------------------.
# Get optional arguments
extra.args <- list(...)
# If no by.var do it in all df
if (is.null(by.var)) {
data["tot.df"] <- "Total"
by.var <- "tot.df"
}
# TEST: Remove NAs from last variable in over, to avoid yy..(xx-NA) --------.
if (test){
data <- data %>%
drop_na(all_of(over[length(over)]))
}
#GENERAL option: If what is not "gen" convert to lowercase (might be functions in upper case)
if (!("gen" %in% what)) {
# All variables in dataframe must be LOWERCASE
over <- tolower(over)
by.var <- tolower(by.var)
tgt <- tolower(tgt)
rgr <- tolower(rgr)
}
# Formatting depending on continuous or categorical tgt
if ("freq" %in% what) {
# Data formatting
df <- format_data_repest(df = data,
svy = svy,
x = NULL,
# all arguments with an @ get converted into pvs
by.over = get.pv.arguments(pv.digits, c(by.var,over,tgt)),
user_na = user_na,
isced = is.there(extra.args$isced),
...)
# # If there is an OVER variable remove NAs from OVER vars
# if (length(get.pv.arguments(pv.digits,over)) > 0) {
# if (!any(show_na,user_na)) {
# for (i in get.pv.arguments(pv.digits,over)) {
# df <- df %>% drop_na(i)
# }
# }
# }
# # X is categorical in a frequency, must be formatted accordingly
# df <- format_data_categ_vars(df,
# get.pv.arguments(pv.digits, tgt),
# show_na)
}
else if (!("gen" %in% what)) {
# Data formatting
df <- format_data_repest(df = data,
svy = svy,
x = NULL,
# all arguments with an @ get converted into pvs
by.over = get.pv.arguments(pv.digits, c(by.var,over)),
user_na = user_na,
isced = is.there(extra.args$isced),
...)
# If there is an OVER variable remove NAs from OVER vars
if (length(get.pv.arguments(pv.digits,over)) > 0) {
for (i in get.pv.arguments(pv.digits,over)) {
df %>% drop_na(i)
}
}
# Format continuous vars
df <- format_data_cont_vars(df, cont.vars = get.pv.arguments(pv.digits, c(tgt,rgr)))
}
# CLUSTER -----------------------------------------------------------------.
# If pvs, set up cluster
if (pv) {
#Get repweights
rep_weights <- replicated_w_names(svy = svy, ...)
# Select necessary data and convert to to datable
# GENERAL option: Assumes only essential data is preselected by user
if (!("gen" %in% what)) {data.par <- as.data.table(df[,get.pv.arguments(pv.digits,c(tgt,rgr,rep_weights,by.var,over))])}
else {data.par <- as.data.table(data)}
# Set up cluster working nodes and what they have to know
if(is.null(cores)){
# If NULL, action the max of cores minus 1
cl <- makeCluster(max(detectCores()-1,1))
} else {
# If no null, set the minimum between the set number and max-1
cl <- makeCluster(min(detectCores()-1,cores))
}
clusterExport(cl, "data.par",envir = environment())
doParallel::registerDoParallel(cl)
}
# PLAUSIBLE VALUES LOOP -------------------------------------------------
pv.l <- lapply(pv.digits, function(pv.d.i) {
if (pv) message(as.character(pv.d.i)) #counter of pvs
# PARALLEL UNIVERSES ---------------------------------------------------.
# replace @ with that digit in tgt and rgr
tgt <- gsub("@", as.character(pv.d.i), tgt)
rgr <- gsub("@", as.character(pv.d.i), rgr)
# Also for over variables in the over and by.vars if categories created from pvs
by.var <- gsub("@", as.character(pv.d.i), by.var)
over <- gsub("@", as.character(pv.d.i), over)
# REPLICATED WEIGHTS ---------------------------------------------------.
res.l <- lapply(tgt, function(tgt.i){
res.tgt.i <- list()
#-------------- STATISTICS --------------.
# If there is something else than freq
if(any(c("mean","means","meanpct","meanspct","var","std", "sd", "quant", "iqr") %in% what )){
# remove "log","lm" and "freq"
what.statistic <- what[! what %in% c("lm","log","freq","corr","cov")]
# if flag show.n as "flag"
if (flag) {show.n = "flag"} else {show.n = "n"}
# Do statistics
res.stats <-
pv.rrepest.statistics(data = df, svy = svy, statistic = what.statistic,
x = tgt.i, by.var = by.var,over = over, test = test,
flag = flag, user_na = user_na, fast = fast, pv = pv,
...)
# Rename to by.var first column to merge all results
colnames(res.stats)[1] <- "by.var"
# Append result of statistics to list of results
res.tgt.i <- append(res.tgt.i, list(res.stats))
}
#---------------------------------------.
#-------------- FREQUENCIES --------------.
if ("freq" %in% what) {
# Do frequencies
res.freq <-
pv.rrepest.frequencies(data = df, svy = svy, x = tgt.i, by.var = by.var,
over = over, test = test, user_na = user_na,
show_na = show_na, flag = flag, fast = fast, pv = pv,
pvdigits = pv.digits, # Insert pv.digits to erase NAs from over variables
#pcluster = ifelse(pv,cl,c()), # Cluster of workers in case pv exists
...)
# Append result of frequencies to list of results
res.tgt.i <- append(res.tgt.i, list(res.freq))
}
#---------------------------------------.
#-------------- LINEAR REGRESSION --------------.
if ("lm" %in% what) {
# Do regression
res.lm <-
pv.rrepest.lm(data = df, svy = svy, x = rgr, y = tgt.i, by.var = by.var,
over = over, test = test, user_na = user_na, flag = flag,
fast = fast, pv = pv, ...)
# Append result of regression to list of results
res.tgt.i <- append(res.tgt.i, list(res.lm))
}
#---------------------------------------.
#-------------- LOGISTIC REGRESSION --------------.
if ("log" %in% what) {
# Do regression
res.logreg <-
pv.rrepest.logreg(data = df, svy = svy, x = rgr, y = tgt.i, by.var = by.var,
over = over, test = test, user_na = user_na, flag = flag,
fast = fast, pv = pv, ...)
# Append result of regression to list of results
res.tgt.i <- append(res.tgt.i, list(res.logreg))
}
#---------------------------------------.
#-------------- ODDS RATIO DELTA METHOD Part 1: Names --------------.
if ("odr" %in% what) {
# Do regression
res.odr <-
pv.rrepest.odr(data = df, svy = svy, x = rgr, y = tgt.i, by.var = by.var,
over = over, test = test, user_na = user_na, flag = flag,
fast = fast, pv = pv, ...)
# Append result of regression to list of results
res.tgt.i <- append(res.tgt.i, list(res.odr))
}
#---------------------------------------.
#-------------- GENERAL OPTION --------------.
if ("gen" %in% what) {
# Do general calculation
res.gen <- # Here the argument for data is data and not df as no data preparation is done in "gen"
pv.rrepest.gen(data = data, svy = svy, y = tgt.i, by.var = by.var,
over = over, test = test, user_na = user_na, flag = flag,
fast = fast, pv = pv, ...)
# Append result of regression to list of results
res.tgt.i <- append(res.tgt.i, list(res.gen))
}
#---------------------------------------.
# Merge all in list of results if there is a result
if (length(res.tgt.i) > 0) {
res.tgt.i <- reduce(res.tgt.i, full_join, by = "by.var")
}
return(res.tgt.i)
})
# If there is nothing on res.l initialize it
res.l <- Filter(length, res.l)
#-------------- CORRELATION AND COVARIANCE --------------.
# Since corr will use all the variables in tgt we must do a separate loop
if (any(c("corr","cov","cor") %in% what)) {
# Leave what with only cor,corr,cov
what.cov.cor <- what[what %in% c("corr","cov","cor")]
# Loop over all left
res.l.corr <- lapply(what.cov.cor, function(what.i){
corr <- !(what.i == "cov")
res.corr <- pv.rrepest.corr.cov(data = df, svy = svy, x = tgt, by.var = by.var,
over = over, test = test, user_na=user_na, pv = pv,
show_na=show_na, flag = F, fast = fast, corr = corr,
...) #Can't accept flags since many variables
return(res.corr)
})
res.l <- append(res.l,res.l.corr)
}
#---------------------------------------.
# return all results merged into one dataframe
return(reduce(res.l, full_join, by = "by.var"))
})
# Stop the cluster if pvs
if(pv) stopCluster(cl)
###if PIAAC, modify pv.l to account for by.var specific variance factors:
if (svy=="PIAAC") {
pv.l<- add_PIAAC_variance_factors(pv.l=pv.l, pv=pv, df=df,by.var=by.var)
}
# If there are no PVs return first element of the list
if (!pv) {
result <- pv.l[[1]] %>% as_tibble()
} else {
# Check which inputs have an @
pv.inputs <- c(tgt,rgr,by.var,over)[grepl("@", c(tgt,rgr,by.var,over))]
#modify the names and make sure that the pv appears, along with the name of the pv vars in case there is a over variable.
pv.inputs.collapsed<- paste0(pv.inputs,collapse="")
for(pv.i in digits.pvs(svy = svy,...)) {
pv.i.inputs_collapsed<- pv.inputs.collapsed %>% gsub(x = .,pattern="@",replacement=pv.i)
names(pv.l[[pv.i]]) <- c(names(pv.l[[pv.i]])[1:length(by.var)],
paste0(
names(pv.l[[pv.i]])[-c(1:length(by.var))],
pv.i.inputs_collapsed))
}
# If there are PVs Calculate B and Imputation and Sampling Variance for SE
result <- b.se.pv(res.l = pv.l, pv.inputs = pv.inputs, statistic = what) %>%
as_tibble() %>% setNames(nm = str_remove(names(.),paste0(pv.inputs.collapsed,"$")))
}
#-------------- LOGISTIC REGRESSION DELTA METHOD Part 2: Estimates --------------.
if("odr" %in% what){
# Grab b. columns and strip the of the b.
result_names <- names(result)
estimate_names <- substring(result_names[grep("^b\\.",result_names)], first = 3)
# Loop over estimate names and do for β: exp(b.odr), for SE: exp(b.odr) * se.odr
for (est_i in estimate_names){
result[[paste0("se.",est_i)]] <- exp(result[[paste0("b.",est_i)]]) * result[[paste0("se.",est_i)]]
result[[paste0("b.",est_i)]] <- exp(result[[paste0("b.",est_i)]])
}
}
#----------------------------.
return(result)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.