R/repest_auxiliaries.R


################# General statistics on Continuous variables #################
# 12 Feb 2023
# Rodolfo Ilizaliturri
#############################################################
# Goal: Obtain several statistics using Fay's BRR model

# ------------------.
# --------- WORK FUNCTIONS
# ------------------.

# Variance factors for SE
rrvar.pv <- function(df.rr, svy, pv = F, ...) {
  # Goal: Dispersion (variance) from total mean to get SE of statistic
  # ------ INPUTS ------.
  # df.rr : (dataframe) df containing the total (1st column) and replicate weighted statistics ONLY
  # svy : (string) List of possible projects to analyse PIAAC, PISA, TALISSCH and TALISTCH
  # pv : (Bool) TRUE → We are in plausible values and keep calculation as variance, not SE
  
  #Get args ...
  arguments <- list(...)
  
  # f : (number) variance factor, depends on the replication method (BRR, jackknife-1, jk-2, ...);
  #     e.g. Fay's BRR with k = 0.5 gives f = 1/(1-k)^2 = 4
  if (svy %in% c("TALISTCH","TALISSCH","PISA","PISA2015","PBTS")) f = 1/(0.5^2)
  else if (svy == "SSES") f = 76/2
  else if (svy == "SSES2023") f = 1/(0.5^2) #Changed from Jk1 method to BBR method 
  else if (svy %in% c("ALL","IALS")) f = 30
  else if (svy == "IELS") f = 92/23
  ### for the case of PIAAC, f depends on countries. The correct variance factor will be applied in rrepest_base
  ### before b.se.pv is called.
  else if (svy == "PIAAC") f = 80
  else if (svy == "PISAOOS") f = 29
  else if (svy %in% c("TIMSS","PIRLS")) f = 1/2*150
  else if (svy %in% c("ICCS","ICILS")) f = 1*75
  else if (svy %in% c("ICCS_T","ICILS_T")) f = 1*75
  else if (svy %in% c("ICCS_C","ICILS_C")) f = 1*75
  else if (svy %in% c("LEADER_TALISEC","STAFF_TALISEC","TALISEC_LEADER","TALISEC_STAFF")) f = 92/23

  
  #if there is custom variance factor grab it from the arguments
  if (!is.null(arguments$var.factor)) {
    f = arguments$var.factor
  }
  
  # Calculate dispersion from the mean, squared, times the variance factor according to survey
  m <- df.rr[1]
  vec <- df.rr[-1]
  rr.var <- mean((m - vec)^2)*f
  
  if (!pv) {
    # Only one estimation (n.PV = 1), then can take sqrt of Var to get SE
    rr.se <- sqrt(rr.var)
    return(rr.se)
  } else {
    # Return variance for calculation of Sampling and Imputation variance
    return(rr.var)
  }
}
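# A minimal illustration with made-up estimates: the first element is the full-sample
# estimate, the remaining elements are replicate estimates.
#EX. rrvar.pv(c(500, 499, 501, 498, 502), svy = "PISA")            # sqrt(mean(c(1,1,4,4)) * 4) = sqrt(10)
#EX. rrvar.pv(c(500, 499, 501, 498, 502), svy = "PISA", pv = TRUE) # returns the variance (10) instead of the SE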

#Number of observations valid for x
n.obs.x <- function(df, by.var, x, svy) {
  # Goal: Number of observations (and, where applicable, schools) the calculation is performed on
  # ------ INPUTS ------.
  # df : (dataframe) df to analyze
  # by.var : (string) column in which we'll break down results
  # x : (string) variable from where to get means
  # svy : List of possible projects to analyse PIAAC, PISA, TALISSCH and TALISTCH
  
  #All lowercase column names
  colnames(df) <- colnames(df) %>% tolower()
  by.var <- by.var %>% tolower()
  x <- x %>% tolower()
  
  tot.n <- df %>%
    group_by(across(all_of(by.var))) %>%
    {if (svy == "TALISSCH") summarise(., n.obs = sum(!is.na(get(x)))) 
      else if (svy == "TALISTCH") {mutate(.,school.n = ifelse(is.na(get(x)) == T, NA, .data$idschool)) %>% 
          summarise(., n.obs = sum(!is.na(get(x))),
                    n.sch = n_distinct(school.n, na.rm = T))}
      else if (svy == "TALISEC_STAFF") {mutate(.,school.n = ifelse(is.na(get(x)) == T, NA, .data$idcentre)) %>% 
          summarise(., n.obs = sum(!is.na(get(x))),
                    n.sch = n_distinct(school.n, na.rm = T))}
      else if (svy == "PISA2015") {mutate(.,school.n = ifelse(is.na(get(x)) == T, NA, .data$cntschid)) %>% 
          summarise(., n.obs = sum(!is.na(get(x))),
                    n.sch = n_distinct(school.n, na.rm = T))}
      else if (svy == "PISA") {mutate(.,school.n = ifelse(is.na(get(x)) == T, NA, .data$schoolid)) %>% 
          summarise(., n.obs = sum(!is.na(get(x))),
                    n.sch = n_distinct(school.n, na.rm = T))}
      else if (svy %in% c("SSES","SSES2023")) {mutate(.,school.n = ifelse(is.na(get(x)) == T, NA, .data$schid)) %>% 
          summarise(., n.obs = sum(!is.na(get(x))),
                    n.sch = n_distinct(school.n, na.rm = T))}
      else summarise(., n.obs = sum(!is.na(get(x))))
    } %>%
    unite("by.group", all_of(by.var), sep = "|")
  
  #append x to column names
  colnames(tot.n)[-1] <- paste(colnames(tot.n)[-1], tolower(x), sep = ".")
  
  return(tot.n)
}
# n.obs.x(df = df.qqq,by.var = "cnt",x = "wb173q03ha",svy = "PISA2015")
# n.obs.x(df = df.p,by.var = "cntry",x = "tc3g06b",svy = "TALISSCH")
# n.obs.x(df = df.t,by.var = "cntry",x = "tt3g01",svy = "TALISTCH")


#' Formatting target, by, and over variables for Rrepest.
#'
#' @param df (data frame) Data to analyze.
#' @param svy (string) Possible projects to analyse: PIAAC, PISA, TALISSCH, TALISTCH, etc.
#' @param x (string vector) Target variables.
#' @param by.over (string vector) Variables to break analysis by.
#' @param user_na (bool) TRUE → show nature of user defined missing values
#' @param ... Optional arguments such as custom weights (cm.weights) 
#'
#' @return Data frame with variables in numeric format for analysis.
#' @export
#'
#' @examples
#' df1 <- format_data_repest(df_pisa18, "PISA", "pv1math", "cnt")
#' df2 <- format_data_repest(df_pisa18, "PISA", "pv1math", c("cnt","st004d01t"))
#' df3 <- format_data_repest(df_pisa18, "PISA", "pv1math", c("cnt","st004d01t","iscedl"))
#' df4 <- format_data_repest(df_talis18, "TALISTCH", "tt3g02", "cntry", isced = 2)
format_data_repest <- function(df, svy, x, by.over, user_na = F, ...) {
  # Goal: Get database into appropriate format
  # ------ INPUTS ------.
  # df : (dataframe) df to analyze
  # svy : List of possible projects to analyse PIAAC, PISA, TALISSCH and TALISTCH
  # by.over : (string vector) variables to break analysis by
  # user_na : (Bool) TRUE → show nature of user defined missing values
  
  #Get isced from ...
  arguments <- list(...)
  
  # Lowercase variables
  names(df) <- names(df) %>% tolower()
  
  # If isced specified for TALIS filter
  if (svy %in% c("TALISSCH","TALISTCH") & !is.null(arguments$isced)) {
    df <- df %>% 
      filter(idpop %in% arguments$isced)
  }
  
  # Weight names depending on project (TALIS, PISA, etc.)
  if (svy == "TALISTCH") weight.variables <- c("tchwgt",paste0("trwgt",1:100))
  else if (svy == "TALISSCH") weight.variables <- c("schwgt",paste0("srwgt",1:100))
  else if (svy %in% c("PISA","PISA2015")) weight.variables <- c("w_fstuwt",paste0("w_fsturwt",1:80))
  else if (svy == "SSES") weight.variables <- c("wt2019",paste0("rwgt",1:76))
  else if (svy == "SSES2023") weight.variables <- c("wt2023",paste0("rwgt",1:80))
  else if (svy == "PISAOOS") weight.variables <- c("spfwt0",paste0("spfwt",1:30))
  else if (svy == "TALISEC_STAFF") weight.variables <- c("staffwgt",paste0("srwgt",1:92))
  else if (svy == "TALISEC_LEADER") weight.variables <- c("cntrwgt",paste0("crwgt",1:92))
  else if (svy == "PIAAC") weight.variables <- c("spfwt0",paste0("spfwt",1:80))
  else if (svy %in% c("ALL","IALS")) weight.variables <- c("popwt",paste0("replic",1:30))
  else if (svy == "IELS") weight.variables <- c("childwgt",paste0("srwgt",1:92))
  else if (svy =="PBTS") weight.variables <- c("w_stu",paste0("rwgt",1:80))
  else if (svy %in% c("TIMSS","PIRLS")) weight.variables <- c("wgr",paste0("jr",1:150))
  else if (svy %in% c("ICCS","ICILS")) weight.variables <- c("totwgts",paste0("srwgt",1:75))
  else if (svy %in% c("ICCS_T","ICILS_T")) weight.variables <- c("totwgtt",paste0("trwgt",1:75))
  else if (svy %in% c("ICCS_C","ICILS_C")) weight.variables <- c("totwgtc",paste0("crwgt",1:75))
  else if (svy == "SVY") weight.variables <- arguments$cm.weights #custom optional weights that must exist if "SVY"
  
  #if there is custom name of weights grab it from the arguments
  if (!is.null(arguments$cm.weights)) {
    weight.variables <- arguments$cm.weights #custom optional weights
  }
  
  #if PIAAC, compute variance factor at the respondent level
  if (svy=="PIAAC") {
    df <- df %>% mutate(var_fac=
                          case_when(vemethodn==1 ~ (venreps-1)/venreps,
                                    vemethodn==2 ~ 1,
                                    vemethodn==3 ~ 1/venreps,
                                    vemethodn==4 ~ 1/(venreps*(1-vefayfac)^2)
                          ))
  }
  
  
  #If x has an @ for PVs
  if (grep("@", x) %>% length() != 0) {
    x <- get.pv.names(df, x)
  }
  
  num.vars <- c(weight.variables, x)
  
  #Convert x and weights to NUMERIC variables, removing user-defined missing codes and leaving plain NA
  df.res <- df %>% 
    mutate(across(contains(num.vars) & ( where(is.labelled)), remove_user_na, user_na_to_na = T)) %>%
    mutate(across(contains(num.vars) & ( where(is.labelled)), as.numeric))
  
  #If we want to know NA identity or not (eg. Not reached, Absent)
  if (!user_na) {
    df.res <- df.res %>% 
      mutate_at(by.over,to_character, user_na_to_na = TRUE)
  }else{
    df.res <- df.res %>% 
      mutate_at(by.over,to_character)
  }
  
  #Show variables that create over categories in column names of results
  if (!is.null(arguments$show_over_vars)) {
    for (i in tolower(arguments$show_over_vars)) {
      df.res[[i]] <- paste(i, df.res[[i]], sep = "..")
    }
  }
  
  
  return(df.res)
}

do_group <- function(data, grp.l, user_na = F, ...){
  # Goal: Create data frames for the groups defined on the given variables and values
  # ------ INPUTS ------.
  # data : (dataframe) Data to analyze
  # grp.l : (list) list of groups to redefine: {group_name = {column, values_in_group}, ...}
  res.l <- lapply(names(grp.l), function(g) {
    # Assign name of group to all the column value
    data %>% 
      filter(get(grp.l[[g]][["column"]]) %in% grp.l[[g]][["cases"]]) %>% 
      mutate(!!grp.l[[g]][['column']] := g) %>% 
      return()
  })
  
  
  for (g.i in sapply(names(grp.l),function(g) grp.l[[g]][["column"]]) %>% unique()) {
    #If we want to know NA identity or not (eg. Not reached, Absent) to get correct format in column
    if (!user_na) {
      data <- data %>% 
        mutate_at(g.i,to_character, user_na_to_na = TRUE)
    }else{
      data <- data %>% 
        mutate_at(g.i,to_character)
    }
  }
  

  return(append(res.l,data))
  
}
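# A minimal sketch of the expected grp.l structure (group names and values are illustrative):
# each element names the column to recode and the values collapsed into that group.
#EX. grps <- list("Nordic" = list(column = "cnt", cases = c("FIN", "SWE", "NOR")),
#EX.              "Baltic" = list(column = "cnt", cases = c("EST", "LVA", "LTU")))
#EX. res <- do_group(df, grps)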

#Check existence of argument and use it if so
is.there <- function(argument) {
  if(!is.null(argument)) return(argument)
  else return(NULL)
}


#Plausible Values names from unknown digits of PVs
get.pv.names <- function(df, pv.var) {
  # Goal: Expand a PV pattern containing @ into the matching plausible-value column names
  # ------ INPUTS ------.
  # df : (dataframe) df to analyze
  # pv.var : variable that contains an @ symbol to interpret as PVs
  
  pv.split <- strsplit(pv.var,"@")[[1]]
  pv.gex <- paste0("^",pv.split[1],".*",pv.split[2],"$")
  pv.names <- grep(pv.gex,colnames(df) %>% tolower(), value = T)
  
  return(pv.names)
}
#EX. x <- get.pv.names(df,"pv@math")

#Weight names
replicated_w_names <- function(svy, ...) {
  # Goal: Names of weighting variables corresponding to survey
  # ------ INPUTS ------.
  # svy : List of possible projects to analyse PIAAC, PISA, TALISSCH and TALISTCH
  
  #Get custom arguments...
  arguments <- list(...)
  
  # Weight names depending on project (TALIS, PISA, etc.)
  if (svy == "TALISTCH") weight.variables <- c("tchwgt",paste0("trwgt",1:100))
  else if (svy == "TALISSCH") weight.variables <- c("schwgt",paste0("srwgt",1:100))
  else if (svy %in% c("PISA","PISA2015")) weight.variables <- c("w_fstuwt",paste0("w_fsturwt",1:80))
  else if (svy == "PISAOOS") weight.variables <- c("spfwt0",paste0("spfwt",1:30))
  else if (svy == "SSES") weight.variables <- c("wt2019",paste0("rwgt",1:76))
  else if (svy == "SSES2023") weight.variables <- c("wt2023",paste0("rwgt",1:80))
  else if (svy == "TALISEC_STAFF") weight.variables <- c("staffwgt",paste0("srwgt",1:92))
  else if (svy == "TALISEC_LEADER") weight.variables <- c("cntrwgt",paste0("crwgt",1:92))
  else if (svy == "PIAAC") weight.variables <- c("spfwt0",paste0("spfwt",1:80))
  else if (svy %in% c("ALL","IALS")) weight.variables <- c("popwt",paste0("replic",1:30))
  else if (svy == "IELS") weight.variables <- c("childwgt",paste0("srwgt",1:92))
  else if (svy =="PBTS") weight.variables <- c("w_stu",paste0("rwgt",1:80))
  else if (svy %in% c("TIMSS","PIRLS")) weight.variables <- c("wgr",paste0("jr",1:150))
  else if (svy %in% c("ICCS","ICILS")) weight.variables <- c("totwgts",paste0("srwgt",1:75))
  else if (svy %in% c("ICCS_T","ICILS_T")) weight.variables <- c("totwgtt",paste0("trwgt",1:75))
  else if (svy %in% c("ICCS_C","ICILS_C")) weight.variables <- c("totwgtc",paste0("crwgt",1:75))
  else if (svy == "SVY") weight.variables <- arguments$cm.weights #custom optional weights that must exist if "SVY"
  
  #if there is custom name of weights grab it from the arguments
  if (!is.null(arguments$cm.weights)) {
    weight.variables <- arguments$cm.weights #custom optional weights
  }
  
  return(c(weight.variables[1], weight.variables[-1]))
}
#EX. wts <- replicated_w_names("PISA")
#EX. wts <- replicated_w_names("TALISTCH")

add_PIAAC_variance_factors <- function(pv.l, pv, df,by.var){
  # GOAL: multiply each se in pv.l by the variance factor specific to the by.var
  # ------ INPUTS ------
  # pv.l (list of df) for each pv, a df with bs and ses
  # pv (boolean) is there any pv
  # df (df) respondent level data
  # by.var (string) value of the by.var
  
  
  # In case the value of the variance factor is not constant within the estimation sample,
  # take the average and print a message
  # (this can happen when the sample contains several countries)
  var_facs <- df %>%  group_by(across(all_of(by.var))) %>%  
    distinct(across("var_fac")) %>% 
    rename(`by.var` = all_of(by.var))
  
  check <- var_facs %>%  count() %>% 
    ungroup() %>% summarize(max(n)) %>%  pull()
  
  if (check>1) {
    var_facs <- var_facs %>%  group_by(.data$by.var) %>% 
      summarise(var_fac=mean(.data$var_fac, na.rm=TRUE))
    message("Variance factors are not constant within by.var")
  } 
  
  # if there is no pv the 'ses' in pv.l are multiplied by the sqrt of the variance factor
  if (!pv) {
    pv.l[[1]] <-  pv.l[[1]] %>% full_join(var_facs, by= "by.var") %>% 
      mutate(across(starts_with("se"),
                    ~ .x *sqrt(.data$var_fac))) %>% 
      select(-"var_fac")
  } else {
    # if there is a pv the 'ses' in pv.l are multiplied by the variance factor 
    # (in such a case, the 'ses' are actually the square of the true ses)!!!
    pv.l <- lapply(pv.l, FUN = function(pv.i) {
      pv.i %>% full_join(var_facs, by= "by.var") %>% 
        mutate(across(starts_with("se"),
                      ~ .x *.data$var_fac)) %>% 
        select(-"var_fac")
    })
    
  }
  
  
  return(pv.l)
}
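# A minimal sketch of the expected inputs (values are illustrative); df must already carry
# the PIAAC var_fac column created in format_data_repest().
#EX. pv.l <- list(data.frame(by.var = c("USA", "DEU"), b.mean = c(270, 272), se.mean = c(1.1, 0.9)))
#EX. pv.l <- add_PIAAC_variance_factors(pv.l, pv = FALSE, df = df, by.var = "cntryid")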





get.pv.arguments <- function(digit.pvs, argument) {
  # Goal: Replace @ with all preset PV digits from arguments
  # ------ INPUTS ------.
  # digit.pvs (Numeric vector) : 1:x list of possible digits (comes from digits.pvs)
  # argument (String vector) : names of variables to analyse, at least one with a @
  
  res <- lapply(digit.pvs, function(x.i){
    gsub("@", as.character(x.i), argument)
  }) %>%
    # Merge all with only unique elements
    reduce(union)
  
  return(res)
}
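# Example of expanding a PV pattern over the preset digits:
#EX. get.pv.arguments(1:5, "pv@math")
#EX. # returns c("pv1math", "pv2math", "pv3math", "pv4math", "pv5math")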

#Digits in PVs
digits.pvs <- function(svy, ...) {
  # Goal: Number of plausible values corresponding to survey
  # ------ INPUTS ------.
  # svy : List of possible projects to analyse PIAAC, PISA, TALISSCH and TALISTCH
  
  # Extract number of pvs if svy = "SVY"
  arguments <- list(...)
  
  # Number of plausible values (TALIS, PISA, etc.)
  if (svy %in% c("PISA2015","PISAOOS","PIAAC","ALL","IALS")) n.d.pvs <- 1:10
  else if (svy %in% c("PISA","IELS","PBTS","ICCS",
                      "ICILS","ICCS_T","ICILS_T",
                      "ICCS_C","ICILS_C","TIMSS","PIRLS")) n.d.pvs <- 1:5
  
  #if there is custom number of pvs grab it from the arguments
  if (!is.null(arguments$n.pvs)) {
    n.d.pvs <- 1:arguments$n.pvs
  }
  
  
  return(n.d.pvs)
}
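# Examples of the number of plausible values per survey:
#EX. digits.pvs("PISA")             # 1:5
#EX. digits.pvs("PIAAC")            # 1:10
#EX. digits.pvs("SVY", n.pvs = 10)  # 1:10, via the custom n.pvs argument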

#Statistics
weighted.var <- function(x, w, na.rm = T) {
  # Goal: Obtain the weighted variance of a vector
  # ------ INPUTS ------.
  # x : (vector) variable to analyze
  # w : (vector) vector of weights
  # na.rm : (bool) if TRUE (default) remove missing values.
  
  w.var <- sum(w*(x-weighted.mean(x,w,na.rm = na.rm))^2, na.rm = na.rm)/sum(w,na.rm = na.rm)
  return(w.var)
}
weighted.std <- function(x, w, na.rm = T) {
  # Goal: Obtain the weighted standard deviation of a vector
  # ------ INPUTS ------.
  # x : (vector) variable to analyze
  # w : (vector) vector of weights
  # na.rm : (bool) if TRUE (default) remove missing values.
  
  w.std <- (sum(w*(x-weighted.mean(x,w,na.rm = na.rm))^2, na.rm = na.rm)/sum(w,na.rm = na.rm))^(1/2)
  return(w.std)
}
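# Quick check with equal weights (note these are population formulas, dividing by sum(w) rather than n - 1):
#EX. weighted.var(c(2, 4, 6), c(1, 1, 1))  # 8/3 ~ 2.667
#EX. weighted.std(c(2, 4, 6), c(1, 1, 1))  # sqrt(8/3) ~ 1.633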

#Core loop
doing_statistic_string <- function(statistic, over.by){
  # Goal: Build the data.table expression string to be evaluated in the loop, grouped by over.by
  # ------ INPUTS ------.
  # over.by : (string vector) variables to break analysis by
  # statistic : (string) accepts "mean","var","std", "quant", "iqr"
  
  #Afterwards, evaluate it with eval(parse(text = "..."))
  
  #Create string for over.by in data.table
  over.by.str <- paste0("c('",paste0(over.by,collapse = "','"),"')")
  
  #Do string for data.table and statistic
  if (statistic %in% c("quant","iqr")) {
    fun.str <- paste0("dt[, .(w.",
                      statistic," = weighted.",
                      statistic,"(get(x.i),get(w.i),q.iqr)), by = ",
                      over.by.str,"]")

  }else{
    fun.str <- paste0("dt[, .(w.",
                      statistic," = weighted.",
                      statistic,"(get(x.i),get(w.i),na.rm = T)), by = ",
                      over.by.str,"]")
  }
  return(fun.str)
}
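# Example of the generated string (x.i and w.i are resolved later, inside the loop):
#EX. doing_statistic_string("mean", c("cnt", "st004d01t"))
#EX. # "dt[, .(w.mean = weighted.mean(get(x.i),get(w.i),na.rm = T)), by = c('cnt','st004d01t')]"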

#Sampling error
# TALIS
brr.se <- function(v, Fay = 0.5) {
  # Goal: Dispersion (like variance) from total mean to get SE of statistic
  # ------ INPUTS ------.
  # v : (dataframe) df containing total and replicate weighted statistic
  # Fay : (number) Fay adjustment factor of the replication method (default 0.5, giving 1/Fay^2 = 4)
  m <- v[1]
  vec <- v[-1]
  rr.var <- mean((m - vec)^2)*1/Fay^2
  rr.se <- sqrt(rr.var)
  
  return(rr.se)
  
}
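# Illustration with made-up estimates: the first element is the full-sample estimate,
# the rest are replicate estimates.
#EX. brr.se(c(10, 9, 11))  # mean((10 - c(9, 11))^2) * 1/0.5^2 = 4, so SE = 2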
# PISA

vectorize.stat.on.weights <- function (df, statistic = "mean", rep_weights, x, by.var, over = NULL, test = F, ...) {
  # Goal: Get list of dataframes with total and replicated weight estimates for x (either x or pvs)
  # ------ INPUTS ------.
  # df : (dataframe) df to analyze
  # rep_weights : (string vector) names of replicated weight vars
  # over : (vector string) columns over which to do analysis
  # x : (string vector) variable or pvs from which to extract statistics
  # by.var : (string vector) variables to break analysis by
  # statistic : (string) accepts "mean","var","std", "quant", "iqr"
  # ... Dots
  # q.iqr : (vector numeric) of length 1 if quantile, of length 2 if interquantile range
  
  arguments <- list(...)
  
  dt <- df %>%
    select(all_of(c(rep_weights, x, by.var, over))) %>% #Select only data to analyze
    filter_at(vars(all_of(x)), all_vars(!is.na(.))) %>%  #Keep only non-NA values (placed here for performance)
    as.data.table() # Convert to data.table
  
  
  # Create string to be evaluated
  # If quantile or iqr add q.iqr to string
  if (!is.null(arguments$q.iqr)) {
    stat.str <- doing_statistic_string(statistic, c(by.var, over))
    stat.str <- sub("q.iqr",
                    paste0("c(",paste0(arguments$q.iqr,collapse = ","),")"),
                    stat.str)
  } else {
    stat.str <- doing_statistic_string(statistic, c(by.var, over))
  }
  
  #----- Loop over x's
  res.x.par <- lapply(x, function(x.i) {
    #----- Loop over weights
    res.w.par <- lapply(rep_weights, function(w.i) {
      #----- Calculate statistic from string stat.str
      res.col <- eval(parse(text = stat.str))
    
      #------ If test for over add tests to res.col
      if (test) {
        res.col <- over.test(res.col, over) %>% 
          column_to_rownames(var= "by.group")
      }else{
        res.col <- res.col %>% 
          remove_rownames %>%
          unite("by.group", all_of(c(by.var,over)), sep = "|") %>% 
          column_to_rownames(var= "by.group")
      }
      
      # -- Track weight number
      names(res.col)[ncol(res.col)] <- paste0("w.",w.i)
      return(res.col)
    })
    res.w.par <- res.w.par %>% reduce(cbind) %>% as.data.frame()
  })
  
  return(res.x.par)
}
#EX. res.list <- vectorize.stat.on.weights(df,"mean",wts,"pv1math","cnt")
#EX. res.list <- vectorize.stat.on.weights(df,"mean",wts, x,"cnt")
#EX. res.list <- vectorize.stat.on.weights(df,"mean",wts,"pv1math","cnt","st004d01t")
#EX. res.list <- vectorize.stat.on.weights(df,"mean",wts,"pv1math","cnt","st004d01t",T)
#EX. res.list <- vectorize.stat.on.weights(df,"mean",wts,"pv1math","cnt",c("st004d01t","iscedl"),T)
#EX. res.list <- vectorize.stat.on.weights(df,"mean",wts,"pv1math","cnt",c("iscedl","st004d01t"),T)
#EX. res.list <- vectorize.stat.on.weights(df,"mean",wts, "tt3g02","cntry")
#EX. res.list <- vectorize.stat.on.weights(df,"quant",wts, "tt3g02","cntry", q.iqr = .5)
#EX. res.list <- vectorize.stat.on.weights(df,"iqr",wts, "tt3g02","cntry", q.iqr = c(.5,.9))


rrepest.stats <- function(df, svy, statistic = "mean", x, by.var, over, test, show.n = "n", user_na=F, ...){
  # Note: Same as rrepest.gstats without the number of schools and teachers
  # Goal: Dataframe with statistic and SE by Fay's BRR model
  # ------ INPUTS ------.
  # df : (dataframe) df to analyze
  # svy : (string) List of possible projects to analyse PIAAC, PISA, TALISSCH and TALISTCH
  # statistic : (string) accepts "mean","var","std"
  # by.var : (string) column in which we'll break down results !IOP!: several variables
  # x : (string) variable from where to get statistics
  # over : (vector string) columns over which to do analysis
  # test : (Bool) make differences across over categories
  # user_na : (Bool) TRUE → show nature of user defined missing values in by.var
  # show.n : (string options) "n" shows n values and keeps all data, "flag" removes n values and places NaN in rows
  # ... Dot arguments
  # isced : (number) isced level to analyze
  
  #grab optional arguments
  extra <- list(...)
  
  # If there is an OVER variable the data must be flagged and NAs removed from OVER vars
  if (length(over) > 0) {
    show.n <- "flag"
    
    #remove NAs on over
    for (i in over) {
      df <- df %>% drop_na(all_of(i))
    }
  }
  # If there is an @ in x grab PVs
  if (grep("@", x) %>% length() != 0) {
    x <- get.pv.names(df, x)
  }
  # Get weight names
  wts <- replicated_w_names(svy)
  # Format data according to survey
  df <- format_data_repest(df, svy, x, c(by.var,over), user_na, isced = is.there(extra$isced))
  
  # Loop over weights
  res.list <- vectorize.stat.on.weights(df = df,
                                        statistic = statistic,
                                        rep_weights = wts,
                                        x = x,
                                        by.var = by.var,
                                        over = over,
                                        test = test)
  
  return(res.list)
}



separate_test_name <- function(test_name){
  # GOAL: Create two strings from the test column name for coverage
  # ------ INPUTS ------.
  # test_name : (string) column name from a test, with structure prefix..(x-y)
  name_parts <- strsplit(test_name,"..",fixed = TRUE)[[1]]
  test_parts <- strsplit(name_parts[length(name_parts)],"-")[[1]]
  
  string_1 <- paste(c(name_parts[1:(length(name_parts)-1)],substr(test_parts[1],2,nchar(test_parts[1]))),collapse = "..")
  string_2 <- paste(c(name_parts[1:(length(name_parts)-1)],substr(test_parts[2],1,nchar(test_parts[2])-1)),collapse = "..")
  return(c(string_1,string_2))
}
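# Example with a hypothetical test column name:
#EX. separate_test_name("mean..tt3g02..(Female-Male)")
#EX. # returns c("mean..tt3g02..Female", "mean..tt3g02..Male")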

nans_from_se2b <- function(df){
  # GOAL: Propagate NaNs appearing in se. columns (from coverage and flags) to the matching b. columns
  # ------ INPUTS ------.
  # df : (dataframe) Resulting df from analysis with se. and b. columns
  for (col_i in names(df)){
    # Get se. columns
    if(startsWith(col_i,"se")){
      # Find the NaN in the column
      nan_s <- which(is.nan(df[col_i][[1]]))
      # If there are, replace the same values with NaN in b.
      if (length(nan_s)>0){
        common_name <- substr(col_i,4,nchar(col_i))
        df[paste0("b.",common_name)][[1]][nan_s] <- NaN
      }
    }
  }
  return(df)
}
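# Small illustration with made-up columns:
#EX. d <- data.frame(b.mean.x = c(500, 480), se.mean.x = c(2.1, NaN))
#EX. nans_from_se2b(d)  # b.mean.x becomes NaN in the second row as well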

n_obs_sch_freq_fix <- function(n_df, by.var, over, x){
  # Goal: Adjust the n of observations and schools for frequencies, for use in flags.
  # The rule followed by PISA for flagging simple breakdowns/frequencies is to apply the flag at the QUESTION level.
  # ------ INPUTS ------.
  # n_df : (dataframe) Data with n of observations and schools
  # by.var : (string) column in which we'll break down results !IOP!: several variables
  # x : (string) variable from where to get statistics
  # over : (vector string) columns over which to do analysis
  n_df %>% 
    # Separate x from everything else
    separate(col = by.group,into = c(by.var, over, x), sep = "\\|") %>%
    unite(col= "by.var.nox",c(by.var, over),sep = "|") %>% 
    # Sum by breakdown excluding x
    group_by(.data$by.var.nox) %>% 
    mutate(across(where(is.numeric), ~ sum(.,na.rm = TRUE))) %>% 
    ungroup() %>% 
    # Reunite into by.group
    unite(col = "by.group",c("by.var.nox",x),sep = "|")
}
