R/pv_rrepest_statistics.R
In Rrepest: An Analyzer of International Large Scale Assessments in Education

################# Multiple statistics on Continuous variables #################
# 22 Mar 2023
# Rodolfo Ilizaliturri
#############################################################
# Goal: Obtain several statistics using Fay's BRR model


# ------------------.
# --------- WORK FUNCTIONS
# ------------------.

pv.brr.stat.by <- function(df, statistic = "mean",x, w, by.var, over, test, n.df = list(), svy,...) {
  # Goal: Statistics from final weight
  # ------ INPUTS ------.
  # df : (dataframe) df to analize
  # x: (string) variable from where to get means
  # w : (string) weighting variable
  # by.var : (string vector) variables to break analysis by
  # over : (vector string) columns over which to do analysis
  
  arg <- list(...)
  options(dplyr.summarise.inform = FALSE)
  
  res.col <- df %>% 
    group_by(across(all_of(by.var))) 
  
  # STATISTICS --------------------------------------------------------------.
  if(statistic == "quant") {res.col <- summarise(res.col, w.quant = weighted.quant(get(x),get(w),arg$q.iqr))}
  else if(statistic == "iqr") {res.col <- summarise(res.col, w.iqr = weighted.iqr(get(x),get(w),arg$q.iqr))}
  else if(statistic %in% c("mean","means")) {res.col <- summarise(res.col, w.mean = weighted.mean(get(x),get(w),na.rm = T))}
  else if(statistic %in% c("meanpct","meanspct")) {res.col <- summarise(res.col, w.mean = weighted.mean(get(x),get(w),na.rm = T)*100)}
  else if(statistic == "var") {res.col <- summarise(res.col, w.var = weighted.var(get(x),get(w),na.rm = T))}
  else if(statistic %in% c("std","sd")) {res.col <- summarise(res.col, w.std = weighted.std(get(x),get(w),na.rm = T))}

  # FLAG --------------------------------------------------------------------.
  #If flag attach n.df a nd remove based on condition
  if (length(n.df) > 0) {
    # make into one column over and by var
    res.col <- res.col %>% 
      unite("by.group", all_of(c(by.var, over)), sep = "|")
    
    # Merge both dfs together n and freqs
    res.col <- left_join(res.col, n.df, by = "by.group")
    
    #Assign NaN when not enough coverage for estimation
    res.col <- flags(data = res.col, svy = svy) %>% 
      separate("by.group",into = all_of(by.var), sep = "\\|" )
  }
  

  
  
  #------ If test for over add tests to res.col
  if (test) {
    res.col <- over.test(res.col, over) %>% 
      column_to_rownames(var= "by.group")
  }else{
    res.col <- res.col %>% 
      remove_rownames %>%
      unite("by.group", all_of(by.var), sep = "|") %>% 
      column_to_rownames(var= "by.group")
  }
  
  
  
  # -- Track weight number
  colnames(res.col) <- paste("w.",
                             sub("RWGT","",w),
                             sep = "")
  
  return(res.col)
}

pv.brr.stat.by.PAR <- function(df, statistic = "mean",x, w, by.var, over, test, n.df = list(), svy,...) {
  # Goal: Statistics from final weight in for each since in packages the info can only go 1 enviroment deep
  # ------ INPUTS ------.
  # df : (dataframe) df to analize
  # x: (string) variable from where to get means
  # w : (string) weighting variable
  # by.var : (string vector) variables to break analysis by
  # over : (vector string) columns over which to do analysis
  # ...
  #(functions) weighted.iqr, weighted.quant, weighted.var, weighted.std
  
  arg <- list(...)
  options(dplyr.summarise.inform = FALSE)
  
  res.col <- df %>% 
    group_by(across(all_of(by.var))) 
  
  # STATISTICS --------------------------------------------------------------.
  if(statistic == "quant") {res.col <- summarise(res.col, w.quant = arg$weighted.quant(get(x),get(w),arg$q.iqr))}
  else if(statistic == "iqr") {res.col <- summarise(res.col, w.iqr = arg$weighted.iqr(get(x),get(w),arg$q.iqr))}
  else if(statistic %in% c("mean","means")) {res.col <- summarise(res.col, w.mean = weighted.mean(get(x),get(w),na.rm = T))}
  else if(statistic %in% c("meanpct","meanspct")) {res.col <- summarise(res.col, w.mean = weighted.mean(get(x),get(w),na.rm = T)*100)}
  else if(statistic == "var") {res.col <- summarise(res.col, w.var = arg$weighted.var(get(x),get(w),na.rm = T))}
  else if(statistic %in% c("std","sd")) {res.col <- summarise(res.col, w.std = arg$weighted.std(get(x),get(w),na.rm = T))}
  
  # FLAG --------------------------------------------------------------------.
  #If flag attach n.df a nd remove based on condition
  if (length(n.df) > 0) {
    # make into one column over and by var
    res.col <- res.col %>% 
      unite("by.group", all_of(c(by.var, over)), sep = "|")
    
    # Merge both dfs together n and freqs
    res.col <- left_join(res.col, n.df, by = "by.group")
    
    #Assign NaN when not enough coverage for estimation
    res.col <- arg$flags(data = res.col, svy = svy) %>% 
      separate("by.group",into = all_of(by.var), sep = "\\|" )
  }
  
  
  #------ If test for over add tests to res.col
  if (test) {
    res.col <- arg$over.test(res.col, over) %>% 
      column_to_rownames(var= "by.group")
  }else{
    res.col <- res.col %>% 
      remove_rownames %>%
      unite("by.group", all_of(by.var), sep = "|") %>% 
      column_to_rownames(var= "by.group")
  }
  
  
  
  # -- Track weight number
  colnames(res.col) <- paste("w.",
                             sub("RWGT","",w),
                             sep = "")
  
  return(res.col)
}

pv.loop.stat.on.weights <- function (data, statistic = "mean", rep_weights,
                                     x, by.var, over, test, n.df = list(),
                                     svy, pv = F, ...) {
  # Goal: Means from final weight
  # ------ INPUTS ------.
  # data : (dataframe) df to analize
  # rep_weights : (string vector) names of replicated weight vars
  # over : (vector string) columns over which to do analysis
  # x: (string) variable from where to get means
  # pv : (Bool) TRUE → We are in plausible values and must parallelize

  
  if (!pv) {
    # NON PARALLEL ------------------------------------------------------------.
    #Filter on only non NA values move here due to performance
    df <- data %>% filter(!is.na(get(x)))
    
    rep.w.stat <- lapply(rep_weights, function(i) pv.brr.stat.by(df = df,
                                                              statistic = statistic,
                                                              x = x,
                                                              w = i,
                                                              by.var = by.var,
                                                              over = over,
                                                              test = test,
                                                              n.df = n.df,
                                                              svy = svy,
                                                              ...))
  } else {
    # PARALLEL ---------------------------------------------------------------.
    rep.w.stat <- foreach(w.i = rep_weights,
                          .packages = c("dplyr","tidyr","purrr","tibble", "data.table"),
                          .export = c("pv.brr.stat.by.PAR",
                                      "weighted.std","weighted.var",
                                      "weighted.quant","weighted.iqr",
                                      "flags","over.test","...")) %dopar% {
                                        
                                        #Filter on only non NA values move here due to performance
                                        data.par <- na.omit(data.par, cols = x)
                                        #NOTE: na.omit uses a data.table since it is 7 times faster
                                        pv.brr.stat.by.PAR(df = data.par,
                                                    statistic = statistic,
                                                    x = x,
                                                    w = w.i,
                                                    by.var = by.var,
                                                    over = over,
                                                    test = test,
                                                    n.df = n.df,
                                                    svy = svy,
                                                    weighted.std = weighted.std,
                                                    weighted.var = weighted.var,
                                                    weighted.quant = weighted.quant,
                                                    weighted.iqr = weighted.iqr,
                                                    flags = flags,
                                                    over.test = over.test,
                                                    ... = ...)
                                      }
    
    
  }
  
  
  
  return(reduce(rep.w.stat, cbind))
  }
  

pv.brr.gstats <- function(data, svy, x, by.var, statistic = "mean",
                          over, test, flag, fast, pv = F, ...) {
  # Note: Same as rrepest.gstats without the number of schools and teachers
  # Goal: Dataframe with statistic and SE by Fay's BRR model
  # ------ INPUTS ------.
  # data : (dataframe) df to analyze
  # statistic : (string) accepts "mean","var","std"
  # q.iqr : (vector numeric) of length 1 if quantile, of length 2 if interquantile range 
  # svy : List of possible projects to analyse PIAAC, PISA, TALISSCH and TALISTCH
  # by.var : (string) column in which we'll break down results !IOP!: several variables
  # x : (string) variable from where to get statistics
  # pv : (Bool) TRUE → We are in plausible values and must parallelize
  # user_na : (Bool) TRUE → show nature of user defined missing values in by.var
  
  eclass <- list()
  arg <- list(...)
  
  
  # Get weight names
  weight.names <- replicated_w_names(svy, ...)
  # If fast then less replicated weights
  if (fast) {
    weight.names <- weight.names[1:6]
  }
  
  # Get n data if flag needed
  if (flag) {
    n.df <-  n.obs.x(df = data, by.var = c(by.var, over), x = x, svy = svy)
  } else {
    n.df <- list()
  }
  
  
  # Loop over weights
  brr.res <- pv.loop.stat.on.weights(data = data,
                                  statistic = statistic,
                                  rep_weights = weight.names,
                                  x = x,
                                  by.var = by.var, 
                                  over = over,
                                  test = test,
                                  n.df = n.df,
                                  svy = svy,
                                  pv = pv,
                                  ...)
  
  #----------statistic
  m <- brr.res[1] %>% 
    set_names(nm = paste0("b.",statistic,
                          if_else(statistic %in% c("quant","iqr"),
                                  str_replace_all(
                                    paste0(arg$q.iqr,collapse = "-"),
                                    "0.","0"),
                                  ""),
                          ".", tolower(x) )) %>% 
    rownames_to_column(var = "by.group")
  
  eclass <- append(eclass,list("m"=m))
  
  #----------se
  se <- apply(brr.res, 1, rrvar.pv, svy=svy, pv=pv, ... = ...) %>% 
    as.data.frame() %>% 
    set_names(nm = paste0("se.",statistic,
                          if_else(statistic %in% c("quant","iqr"),
                                  str_replace_all(
                                    paste0(arg$q.iqr,collapse = "-"),
                                    "0.","0"),
                                  ""),
                          ".", tolower(x) )) %>% 
    rownames_to_column(var = "by.group")
  
  eclass <- append(eclass,list("se"=se))
  
  
  return(reduce(eclass, full_join, by = "by.group"))
}


# --------- FINAL FUNCTION



pv.rrepest.statistics <- function(data, svy, statistic = "mean", x, by.var = NULL,
                               over = NULL, test = FALSE, flag = FALSE, user_na=FALSE, 
                               fast = FALSE, pv = FALSE, ...) {
  # Goal: Dataframe with statistic and SE by Fay's BRR model
  # Note: All formatting depends on higher function
  # ------ INPUTS ------.
  # data : (dataframe) df to analyze
  # svy : List of possible projects to analyse PIAAC, PISA, TALISSCH and TALISTCH
  # statistic : (string vector) accepts "mean","var","std", "quant", "iqr"
  # q.iqr : (vector numeric) of length 1 if quantile, of length 2 if interquantile range 
  # by.var : (vector string) column in which we'll break down results
  # over : (vector string) columns over which to do analysis
  # test : (bool) If TRUE will calculate the difference between over variables
  # x : (string) variable from where to get statistics
  # user_na : (Bool) TRUE → show nature of user defined missing values in by.var
  # flag : (Bool) TRUE → Show NaN when there is not enough observations (or schools)
  # pv : (Bool) TRUE → We are in plausible values and must parallelize
  # ...
  # isced : (number) isced level to analyze
  
  #Store resulting DFs
  results <- list()
  
  stats_rr <- statistic
  
  # Data formatting
  by.over <- c(by.var,over)
  
  # In multiple statistics vector, skipping iterations if "quant" or "iqr"
  skip_iter <- 0
  for (i in seq_along(stats_rr)) {
    if (skip_iter > 0) {
      skip_iter <- skip_iter-1
      next}
    
    if (stats_rr[i] == "quant") {
      skip_iter <- 1
      
      #add brr statistics to original results
      results <- append(results, list(
        pv.brr.gstats(data = data, svy = svy, x = x,by.var = by.over, 
                   statistic = "quant", test = test, over = over, flag = flag,
                   fast = fast, pv = pv, 
                   q.iqr = stats_rr[i+1] %>% as.numeric(), ...) )
      )
      
    }else if (stats_rr[i] == "iqr"){
      skip_iter <- 2
      
      #add brr statistics to original results
      results <- append(results, list(
        pv.brr.gstats(data = data, svy = svy, x = x, by.var = by.over, 
                   statistic = "iqr", test = test, over = over, flag = flag,
                   fast = fast, pv = pv, 
                   q.iqr = c(stats_rr[i+1] %>% as.numeric(),
                             stats_rr[i+2] %>% as.numeric()),...) )
      )
      
    }else{
      #add brr statistics to original results
      results <- append(results, list(
        pv.brr.gstats(data = data, svy = svy, x = x, by.var = by.over,
                   test = test, over = over, flag = flag,
                   fast = fast, pv = pv, 
                   statistic = stats_rr[i],...))
      )
    }
    
  }
  
  # Merge all results together
  df.results <- reduce(results, full_join, by = "by.group")
  
  if (length(over) > 0) {
    #Separate all of by.group by | and unite only by by.var, leaving columns from over
    df.results <- df.results %>% 
      separate(by.group, into = by.over, sep = "\\|") %>% 
      unite("by.group", all_of(by.var), sep = "|") 
    
    #Pivot on the columns from over
    df.results <- df.results %>% 
      pivot_wider(names_from = all_of(over),
                  values_from = colnames(df.results)[! colnames(df.results) %in% c("by.group",over)],
                  names_sep = "..")
  }
  
  #If there is an over reorder results to have se following b
  if (length(over) > 0) {
    
    #reorder df with intercalating columns
    n.betas <- (ncol(df.results) - 1)/2
    betas.order <- lapply(1:n.betas,function(x) c(x,x+n.betas)) %>% 
      reduce(function (x,y) c(x,y))
    df.results <- df.results[,c(1,betas.order + 1)]
  }
  
  
  return(df.results)
}
Any scripts or data that you put into this service are public.
Rrepest documentation built on April 4, 2025, 2:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Rrepest
An Analyzer of International Large Scale Assessments in Education

R/pv_rrepest_statistics.R
In Rrepest: An Analyzer of International Large Scale Assessments in Education

Defines functions pv.rrepest.statistics pv.brr.gstats pv.brr.stat.by.PAR pv.brr.stat.by

Try the Rrepest package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Rrepest An Analyzer of International Large Scale Assessments in Education

R/pv_rrepest_statistics.R In Rrepest: An Analyzer of International Large Scale Assessments in Education

Defines functions pv.rrepest.statistics pv.brr.gstats pv.brr.stat.by.PAR pv.brr.stat.by

Try the Rrepest package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Rrepest
An Analyzer of International Large Scale Assessments in Education

R/pv_rrepest_statistics.R
In Rrepest: An Analyzer of International Large Scale Assessments in Education