## STATISTICS functions
#' Describe data
#'
#' Compute descriptive statistics for the numeric variables in a data frame, optionally by group. Logical variables are converted to numeric, and character/factor variables are dropped.
#'
#' @param x A data frame
#' @param group An optional grouping vector
#' @param all_vars Whether to keep all statistics columns from psych::describe instead of the default subset
#' @return A data frame
#' @export
#'
#' @examples
#' describe2(iris)
#' describe2(mpg)
#' describe2(iris, group = iris$Species)
describe2 = function(x, group = NULL, all_vars = F) {
#convert logical variables to numeric
for (v in names(x)) {
if (is.logical(x[[v]])) x[[v]] = x[[v]] %>% as.numeric()
}
#remove factors and chrs
x_nonnum = purrr::map_lgl(x, ~is.character(.)|is.factor(.))
x = x[!x_nonnum]
#group level?
if (!is.null(group)) {
#factor for the right order of groups, if the user wants
group = as.factor(group)
#loop groups and bind
y = plyr::ddply(bind_cols(x, ..group = group), .variables = "..group", .fun = function(dd) {
dd %>% select(-`..group`) %>% describe2(all_vars = all_vars)
}) %>% rename(group = ..group)
} else {
#get results
y = psych::describe(as.data.frame(x), na.rm = TRUE, interp = FALSE, skew = TRUE, ranges = TRUE,
trim = 0.1, type = 3, check = TRUE, fast = NULL, quant = NULL,
IQR = FALSE, omit = FALSE, data = NULL)
#fix class of output
class(y) = "data.frame"
#subset, explicit rownames
#subset
if (!all_vars) {
y %<>%
tibble::rownames_to_column(var = "var") %>%
dplyr::select(var, n, mean, median, sd, mad, min, max, skew, kurtosis)
} else {
y %<>%
tibble::rownames_to_column(var = "var")
}
}
as_tibble(y)
}
#' Adjust Cohen's d for measurement error
#'
#' Adjusts Cohen's d for measurement error using the Pearson r approach.
#'
#' @param x A vector of d values
#' @param rel A vector of reliabilities
#'
#' @return A vector of adjusted d values
#' @export
#'
#' @examples
#' adj_d_reliability(1.00, 0.8)
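#' #perfect reliability returns the input unchanged
#' adj_d_reliability(1.00, 1)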
adj_d_reliability = function(x, rel) {
#nothing to do
if (rel == 1) return(x)
if (rel == 0) return(Inf * sign(x))
if (!is_between(rel, a = 0, b = 1)) stop("Reliability must be within [0, 1]", call. = F)
#convert d to r (assumes equal group sizes)
r = (x/(x^2+4)^0.5)
#adjust for unreliability
r_adj = r/sqrt(rel)
#back to d
2 * r_adj/sqrt(1 - r_adj^2)
}
#' Compute correlation confidence intervals from correlations and standard errors
#'
#' @param cor A correlation value or matrix
#' @param se A standard error value or matrix
#' @param ci The desired confidence level
#' @param winsorise Keep values within -1 to 1
#'
#' @return If you entered a single value, you get 2 values back: lower and upper confidence numbers. If you entered matrices, you get 2 matrices back in a list.
#' @export
#'
#' @examples
#' #simple results
#' cor_CI_from_SE(.50, .10)
#' cor_CI_from_SE(.50, .10, ci = .99)
#' #matrices
#' iris_res = suppressWarnings(weights::wtd.cor(iris[-5]))
#' cor_CI_from_SE(iris_res$correlation, iris_res$std.err)
cor_CI_from_SE = function(cor, se, ci = .95, winsorise = T) {
#matrix input?
assert_that(length(cor) == length(se))
#CI
y = c(
cor - qnorm(ci + ((1-ci)/2), lower.tail = T) * se,
cor + qnorm(ci + ((1-ci)/2), lower.tail = T) * se
)
if (winsorise) {
y %<>% winsorise(upper = 1, lower = -1)
}
#matrix input, then restore the matrix
if (is.matrix(cor)) {
#split sizes
num_in_matrix = nrow(cor) * ncol(cor)
y_lower = matrix(y[1:num_in_matrix], nrow = nrow(cor), ncol = ncol(cor))
y_upper = matrix(y[(num_in_matrix+1):(num_in_matrix*2)], nrow = nrow(cor), ncol = ncol(cor))
return(list(
lower = y_lower,
upper = y_upper
))
}
y
}
#' Correlation matrix
#'
#' Outputs a correlation matrix. Supports weights, confidence intervals, correcting for measurement error and rounding.
#'
#' Correction for measurement error is done using the standard Pearson formula: r_true = r_observed / sqrt(reliability_x * reliability_y).
#'
#' Weighted correlations are calculated using wtd.cor or wtd.cors from weights package.
#'
#' `rank_order` can take either a logical scalar or a character scalar. If given TRUE, the data are ranked with the default settings (average ranks). If given a character scalar, it is used as the `ties.method` for the ranking. If given FALSE (default), the data are not ranked.
#'
#' Confidence intervals are analytic confidence intervals based on the standard error.
#'
#' @param data (data.frame or coercible into data.frame) The data.
#' @param weights (numeric vector, numeric matrix/data.frame or character scalar) Weights to use for the correlations. Can be a numeric vector with weights, the name of a variable in data, or a matrix/data.frame with weights for each variable. If the latter, then harmonic means are used. If none given, defaults to rep(1, nrow(data)).
#' @param by Grouping variable
#' @param reliabilities (num vector) Reliabilities used to correct for measurement error. If not present, assumed to be 1.
#' @param CI (numeric scalar) The confidence level to use as a fraction.
#' @param CI_template (character scalar) A template to use for formatting the confidence intervals.
#' @param skip_nonnumeric (logical scalar) Whether to skip non-numeric variables. Defaults to TRUE.
#' @param CI_round (whole number scalar) If confidence intervals are used, how many digits should be shown?
#' @param p_val (log scalar) Add p values or not.
#' @param p_template (chr scalar) If p values are desired, the template to use.
#' @param p_round (int scalar) Number of digits to round p values to. Uses scientific notation for small numbers.
#' @param asterisks The thresholds to use for p value asterisks
#' @param asterisks_only Whether to only include asterisks, not numerical values
#' @param rank_order (lgl or chr) Whether to use rank ordered data so as to compute Spearman's correlations instead.
#'
#' @export
#' @examples
#' cor_matrix(iris) #just correlations
#' cor_matrix(iris, CI = .95) #with confidence intervals
#' cor_matrix(iris, CI = .99) #with 99% confidence intervals
#' cor_matrix(iris, p_val = T) #with p values
#' cor_matrix(iris, p_val = T, p_template = "%r (%p)", asterisks_only = F) #p values with an alternative template
#' cor_matrix(iris, reliabilities = c(.8, .9, .7, .75)) #correct for measurement error
#' cor_matrix(iris, reliabilities = c(.8, .9, .7, .75), CI = .95) #correct for measurement error + CI
#' cor_matrix(iris, rank_order = T) #rank order correlations, default method
#' cor_matrix(iris, rank_order = "first") #rank order correlations, specific method
#' cor_matrix(iris, weights = "Petal.Width") #weights from name
#' cor_matrix(iris, weights = 1:150) #weights from vector
#' #complex weights
#' cor_matrix(iris, weights = matrix(runif(nrow(iris) * 4), nrow = nrow(iris)))
#' cor_matrix(iris, weights = matrix(runif(nrow(iris) * 4), nrow = nrow(iris)), CI = .95)
#' #groups
#' cor_matrix(iris, by = iris$Species)
#' cor_matrix(iris, by = iris$Species, weights = 1:150)
cor_matrix = function(data, weights = NULL, by = NULL, reliabilities = NULL, CI = NULL, CI_template = "%r [%lower %upper]", skip_nonnumeric = T, CI_round = 2, p_val = F, p_template = "%r [%p]", p_round = 3, rank_order = F, asterisks = c(.01, .005, .001), asterisks_only = T) {
#checks
data = as.data.frame(data)
if (skip_nonnumeric) data = extract_num_vars(data)
if (!is_numeric(data)) stop("data contains non-numeric columns!")
is_(rank_order, class = c("character", "logical"), size = 1, error_on_false = T)
#CI and p vals
if (!is.null(CI) && p_val) stop("Cannot both calculate CIs and p values!")
v_noextras = is.null(CI) && !p_val
#grouping
if (!is.null(by)) {
assert_that(is.character(by) | is.factor(by) | is.logical(by))
#drop rows where group is NA but throw warning as this might be an error
if (anyNA(by)) {
warning(call. = F, "Grouping variable contains `NA` values, check your data")
data = data[!is.na(by), ]
#error if no data remaining
if (nrow(data) == 0) stop("Grouping variable contains only `NA` values!", call. = F)
}
#groups and their order
groups = levels(forcats::fct_drop(as.factor(by)))
}
#reliabilities
if (is.null(reliabilities)) {
reliabilities = rep(1, ncol(data))
} else {
#check length
if (length(reliabilities) != ncol(data)) stop("reliabilities length incorrect")
}
#weights not given or as character
weights_used = !is.null(weights)
if (is.null(weights)) weights = rep(1, nrow(data))
if (is.character(weights)) {
weights = data[[weights]] #fetch from data
}
#simple weights?
simpleweights = !(is.matrix(weights) || is.data.frame(weights))
if (simpleweights) {
weights = as.numeric(weights) #this prevents some weird data type errors!
if (length(weights) != nrow(data)) stop("weights not the same length as the data!")
}
#rank order data?
if (is.logical(rank_order) && rank_order) {
data = df_rank(data)
}
if (is.character(rank_order)) {
data = df_rank(data, ties.method = rank_order)
}
##simple weights and no extras?
if (simpleweights && v_noextras) {
make_cor_mat_simple = function(group) {
#subset data if any group
if (!is.null(group)) {
data = data[NA_to_F(by == group), ]
weights = weights[NA_to_F(by == group)]
#error if no data for group
if (nrow(data) == 0) stop(stringr::str_glue("No data for group {group}!"))
}
#suppress warnings, they just indicate near-1 correlations
m_res = suppressWarnings(weights::wtd.cor(data, weight = weights))
m = m_res$correlation
#correct for unreliability
m = psych::correct.cor(m, reliabilities)
#winsor to -1 to 1
m = winsorise(m, lower = -1, upper = 1)
#reliabilities in diagonal
diag(m) = reliabilities
return(m)
}
#groups or not?
if (!is.null(by)) {
m = purrr::map(groups, make_cor_mat_simple)
names(m) = groups
} else {
m = make_cor_mat_simple(group = NULL)
}
return(m)
}
#complex weights check
if (length(get_dims(weights)) == 2) {
#do weights fit the data?
weights_data = all(get_dims(weights) == get_dims(data))
#do weights fit the correlation matrix?
#weights_cor_matrix = get_dims(weights) == rep(ncol(data), 2)
if (!weights_data) stop(str_c("weights did not fit the data!"))
}
#inner function
#use a function so it can loop across groups if needed
make_cor_mat = function(group = NULL) {
#subset data if any group
if (!is.null(group)) {
data = data[NA_to_F(by == group), ]
weights = weights[NA_to_F(by == group), ]
#error if no data for group
if (nrow(data) == 0) stop(stringr::str_glue("No data for group {group}!"))
}
#make matrix
m = matrix(ncol = ncol(data), nrow = ncol(data))
#fill in results
for (row in 1:nrow(m)) {
for (col in 1:ncol(m)) {
#next if diagonal or above
if (col >= row) next
#simple weights & CI
if (simpleweights && !is.null(CI)) {
#weighted cor
#suppress warnings, they just indicate near-1 correlations
r_obj = suppressWarnings(weights::wtd.cor(data[, c(row, col)], weight = weights))
#correct for unreliability
r_obj$correlation %<>% {. / sqrt(reliabilities[col] * reliabilities[row])}
#winsorize
r_obj$correlation %<>% winsorise(1, -1)
#sample size
r_n = psych::pairwiseCount(data[row], data[col])
#format cor
r_r = r_obj$correlation[1, 2] %>% str_round(digits = CI_round)
#confidence interval
r_CI = cor_CI_from_SE(r_obj$correlation[1, 2], se = r_obj$std.err[1, 2], ci = CI)
#rounding
r_CI %<>% str_round(digits = CI_round)
#format and save
m[row, col] = stringr::str_replace(CI_template, "%r", r_r) %>%
str_replace("%lower", r_CI[1]) %>%
str_replace("%upper", r_CI[2])
}
#simple weights & p_val
if (simpleweights && p_val) {
#observed r
r_obj = weights::wtd.cor(data[row], data[col], weight = weights)
#correct for unreliability
r_obj[1] %<>% {. / sqrt(reliabilities[col] * reliabilities[row])}
#winsorize
r_obj[1] %<>% winsorise(1, -1)
#sample size
r_n = psych::pairwiseCount(data[row], data[col])
#rounding
r_r = r_obj[1] %>% str_round(digits = CI_round)
#finalize with p value
if (asterisks_only) {
m[row, col] = r_r + p_to_asterisk(r_obj[, "p.value"], asterisks = asterisks, asterisks_only = T)
} else {
m[row, col] = p_template %>%
str_replace("%r", r_r) %>%
str_replace("%p", p_to_asterisk(r_obj[, "p.value"], asterisks = asterisks, asterisks_only = F))
}
}
#complex weights
if (!simpleweights) {
#for each case, compute the harmonic mean weight
v_weights = weights[, c(row, col)] %>%
t() %>%
psych::harmonic.mean()
#if plain correlation output, compute and be done
if (v_noextras) {
m[row, col] = weights::wtd.cors(data[row], data[col], weight = v_weights) / sqrt(reliabilities[row] * reliabilities[col])
}
#if CI wanted
if (!is.null(CI)) {
#observed r
r_obj = weights::wtd.cor(data[row], data[col], weight = v_weights)
#correct for unreliability
r_obj[1] %<>% {. / sqrt(reliabilities[col] * reliabilities[row])}
#winsorize
r_obj[1] %<>% winsorise(1, -1)
#sample size
r_n = psych::pairwiseCount(data[row], data[col])
#format r
r_r = r_obj[1] %>% str_round(digits = CI_round)
#confidence interval
r_CI = cor_CI_from_SE(r_obj[1], se = r_obj[2], ci = CI)
#rounding
r_CI %<>% str_round(digits = CI_round)
#format and save
m[row, col] = stringr::str_replace(CI_template, "%r", r_r) %>%
str_replace("%lower", r_CI[1]) %>%
str_replace("%upper", r_CI[2])
}
#if p value wanted
if (p_val) {
#observed r
r_obj = weights::wtd.cor(data[row], data[col], weight = v_weights)
#correct for unreliability
r_obj[1] %<>% {. / sqrt(reliabilities[col] * reliabilities[row])}
#winsorize
r_obj[1] %<>% winsorise(1, -1)
#n
r_n = psych::pairwiseCount(data[row], data[col])
#format r
r_r = r_obj[1] %>% str_round(digits = CI_round)
#finalize with p value
if (asterisks_only) {
m[row, col] = r_r + p_to_asterisk(r_obj[, "p.value"], asterisks = asterisks, asterisks_only = T)
} else {
m[row, col] = p_template %>%
str_replace("%r", r_r) %>%
str_replace("%p", p_to_asterisk(r_obj[, "p.value"], asterisks = asterisks, asterisks_only = F))
}
}
}
}
}
#make symmetric
m = MAT_half(m) %>% MAT_vector2full()
#dimnames
rownames(m) = colnames(m) = colnames(data)
#diag
diag(m) = reliabilities
m
}
#groups or not?
if (!is.null(by)) {
m = purrr::map(groups, make_cor_mat)
names(m) = groups
} else {
m = make_cor_mat(group = NULL)
}
m
}
#' Remove redundant variables
#'
#' Remove redundant variables from a data.frame based on a threshold value. This is done by calculating all the intercorrelations, then finding those that correlate at or above the threshold (absolute value), then removing the second pair of each variable and not removing more variables than strictly necessary.
#' @param df (data.frame) A data.frame with numeric variables.
#' @param threshold (numeric scalar) A threshold above which intercorrelations are removed. Defaults to .9.
#' @param cor_method (character scalar) The correlation method to use. Parameter is fed to cor(). Defaults to "pearson".
#' @param messages (boolean) Whether to print diagnostic messages.
#' @export
#' @examples
#' remove_redundant_vars(iris[-5]) %>% head
remove_redundant_vars = function(df, threshold = .9, cor_method = "pearson", messages = T) {
#Check input
if (!is.data.frame(df)) {
stop(paste0("First parameter is not a data frame. Instead it is ", class(df)))
}
if (!is.numeric(threshold)) {
stop(paste0("Second parameter is not numeric. Instead it is ", class(threshold)))
}
if (!(threshold > 0 && threshold <= 1)) stop("threshold must be in (0, 1]!")
if (!is.logical(messages)) stop("messages parameter was not a logical!")
#Old variable names
old_names = colnames(df) #save old variable names
#correlation matrix
m = cor(df, use = "p", method = cor_method)
#to long form
m_long = cbind(expand.grid(rownames(m), colnames(m), stringsAsFactors = F),
r = as.vector(m),
abs_r = as.vector(m) %>% abs,
keep = upper.tri(m) %>% as.vector)
#remove self-correlations and duplicates
m_long = m_long[m_long$keep, ]
#sort by abs r
m_long = m_long[order(m_long$abs_r, decreasing = T), ]
#subset
m_long = m_long[1:3]
#over threshold message
m_long_threshold = m_long[m_long$r >= threshold | m_long$r <= -threshold, ]
if(nrow(m_long_threshold) != 0) {
silence({message("The following variable pairs had stronger intercorrelations than |", threshold, "|:")}, messages = messages)
if (messages) df_round(m_long_threshold) %>% print #round and print
} else {
silence({message("No variables needed to be excluded.")}, messages = messages)
return(df)
}
#exclude variables
#one cannot just remove the ones in the Var2 col because this can result in variables being removed despite them not correlating >threshold with any variable
vars_to_exclude = character() #for the varnames
while(T) {
#exit loop if done
if (nrow(m_long_threshold) == 0) break
#add top var in Var2 to the exclusion vector and remove top row
vars_to_exclude = c(vars_to_exclude, m_long_threshold$Var2[1])
m_long_threshold = m_long_threshold[-1, ]
#remove rows that contain any variable from the exclusion vector
exclude_rows = m_long_threshold$Var1 %in% vars_to_exclude | m_long_threshold$Var2 %in% vars_to_exclude
m_long_threshold = m_long_threshold[!exclude_rows, ]
}
#exclude variables
silence({message(str_c("The following variables were excluded:"))}, messages = messages)
silence({message(str_c(vars_to_exclude, collapse = ", "))}, messages = messages)
df = df[!colnames(df) %in% vars_to_exclude]
#return reduced df
return(df)
}
#' Semi-partial correlation with weights
#'
#'
#' Returns a list with the zero-order correlation and the semi-partial correlation. Weights may be used.
#' @param x A numeric vector to correlate with y.
#' @param y A numeric vector to correlate with x after partialing out z.
#' @param z A numeric vector to partial out of y.
#' @param weights A numeric vector of weights to use. If none given, will return unweighted results.
#' @param complete_cases Whether to use complete cases only (default TRUE).
#' @export
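#' @examples
#' #a minimal sketch with iris: correlate Sepal.Length with Sepal.Width
#' #after partialing Petal.Length out of Sepal.Width
#' semi_par(iris$Sepal.Length, iris$Sepal.Width, iris$Petal.Length)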
semi_par = function(x, y, z, weights = NULL, complete_cases = T) {
#x vector
x = as.vector(x)
y = as.vector(y)
#weights
if (is.null(weights)) {
weights = rep(1, length(x))
} else {
weights = as.vector(weights)
}
#data.frame
df = data.frame(x = as.vector(x), #we vectorize the input because otherwise may get strange
y = as.vector(y), #results when input is a df or matrix
z = as.vector(z),
w = as.vector(weights))
#complete cases only
if (complete_cases) {
df = df[complete.cases(df), ]
}
#model
df$y_res = resid(lm(y ~ z, weights = w, data = df))
r_sp = weights::wtd.cor(df$x, df$y_res, weight = df$w)
r = weights::wtd.cor(df$x, df$y, weight = df$w)
return(list(normal = r,
semi_partial = r_sp))
}
#' Serial semi-partial correlations with weights
#'
#'
#' Returns a table of semi-partial correlations where a dependent variable has first been regressed on the primary predictor variable, and then correlated with each of the secondary predictors in turn.
#' @param df A data frame with the variables.
#' @param dependent A string with the name of the dependent variable.
#' @param primary A string with the name of the primary predictor variable.
#' @param secondaries A character vector with the names of the secondary predictor variables.
#' @param weights A string with the name of the variable to use for weights.
#' @param standardize Whether to standardize the data frame before running results. The weights variable will not be standardized.
#' @export
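#' @examples
#' #an illustrative run with iris: what do the petal variables add to Sepal.Length
#' #beyond what Sepal.Width already explains?
#' semi_par_serial(iris, dependent = "Sepal.Length", primary = "Sepal.Width",
#'   secondaries = c("Petal.Length", "Petal.Width"))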
semi_par_serial = function(df, dependent, primary, secondaries, weights = NULL, standardize = T) {
#subset and deal with lack of weights
if (is.null(weights)) {
weights = "weights_var"
df[weights] = rep(1, nrow(df))
} else {
df["weights_var"] = df[weights] #move the weights var to another name
weights = "weights_var"
}
#subset
df = subset(df, select = c(dependent, primary, secondaries, weights))
#complete cases only
df = na.omit(df)
#standardize
if (standardize) df = df_standardize(df, exclude = weights)
#primary
r_prim = round(weights::wtd.cor(df[, dependent], df[, primary], weight = df[, weights]), 2)
#make results object
results = data.frame(matrix(nrow = length(secondaries), ncol = 2))
rownames(results) = secondaries
colnames(results) = c("Orig. cor", "Semi-partial cor")
#loop over each secondary
for (sec_idx in seq_along(secondaries)) {
#the current secondary var
tmp_secondary = secondaries[sec_idx]
#make the model
tmp_model = stringr::str_c(dependent, " ~ ", primary)
#regress
df$tmp_resids = resid(lm(as.formula(tmp_model), weights = df[, weights], data = df))
#secondary original
r_sec = weights::wtd.cor(df[, dependent], df[, tmp_secondary], weight = df[, weights])[1]
#semi-partial
r_sec_sp = weights::wtd.cor(df$tmp_resids, df[, tmp_secondary], weight = df[, weights])[1]
#save
results[sec_idx, ] = c(r_sec, r_sec_sp)
}
return(results)
}
#' Calculate a partial correlation.
#'
#' Calculates the partial correlation.
#' @param df A data frame.
#' @param x String with the name of the first variable.
#' @param y String with the name of the second variable.
#' @param z String vector with the name(s) of the control variable(s).
#' @param weights_var String with the name of the weights variable. Can be left out.
#' @export
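#' @examples
#' #partial correlation of the sepal dimensions, controlling for Petal.Length
#' MOD_partial(iris, "Sepal.Length", "Sepal.Width", "Petal.Length")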
MOD_partial = function(df, x, y, z, weights_var = NULL) {
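#force evaluation of arguments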
df
x
y
z
#make or move weights
if (is.null(weights_var)) {
df$weights___ = rep(1, nrow(df)) #make unit weights
} else {
df$weights___ = df[[weights_var]] #reassign weights var
}
weights_var = "weights___"
#build models
mod1 = stringr::str_c(x, " ~ ", str_c(z, collapse = " + "))
mod2 = stringr::str_c(y, " ~ ", str_c(z, collapse = " + "))
#fit models
fit1 = lm(mod1, data = df, weights = weights___, na.action = na.exclude)
fit2 = lm(mod2, data = df, weights = weights___, na.action = na.exclude)
#na.exclude is important because otherwise rows with NA are dropped and the residuals no longer align with the data
#get residuals
resid1 = resid(fit1)
resid2 = resid(fit2)
#correlate
r = weights::wtd.cor(resid1, resid2, weight = df$weights___)
return(r[1])
}
#' Get t value by confidence-level and degree of freedom
#'
#' Wrapper function for \code{qt} to get the t value needed for finding confidence intervals or p values.
#' @param conf (numeric scalar) The confidence level desired as a fraction.
#' @param df (numeric scalar) The degrees of freedom.
#' @param ... (other named arguments) Other arguments to pass to \code{qt}. See that function for details.
#' @export
#' @examples
#' #get t value for 95 pct. confidence interval with df = 20
#' get_t_value(.95, 20)
get_t_value = function(conf, df, ...) {
value = conf + ((1 - conf) / 2)
qt(value, df = df, ...)
}
#' Compute width of confidence interval
#'
#' Calculate the width of a confidence interval given a standard error and sample size. Based on the t distribution.
#'
#' @param se (numeric scalar) The standard error.
#' @param n (numeric scalar) The sample size. Defaults to Inf.
#' @param confidence_level (numeric scalar) The confidence level. Defaults to .95.
#' @param single_arm (logical scalar) Whether to calculate the width for a single arm. Defaults to TRUE.
#'
#' @return The width of the confidence interval.
#' @export
#'
#' @examples
#' conf_interval_width(1) #about 1.96
#' conf_interval_width(1, confidence_level = .99) #about 2.58
conf_interval_width <- function(se, n = Inf, confidence_level = 0.95, single_arm = T) {
#use t distribution, but if no sample size is given, assume infinite degrees of freedom, same as z dist
t_value <- qt(1 - (1 - confidence_level) / 2, df = n - 1)
#single arm width
width <- t_value * se
#double it if desired
if (!single_arm) {
width <- 2 * width
}
return(width)
}
#' Pooled sd
#'
#' Calculate pooled sd.
#' @param x (numeric vector) The data.
#' @param group (vector) Group membership.
#' @export
#' @examples
#' #Wikipedia's example https://en.wikipedia.org/wiki/Pooled_variance
#' v_test_vals = c(31, 30, 29, 42, 41, 40, 39, 31, 28, 23, 22, 21, 19, 18, 21, 20, 19, 18, 17)
#' v_test_group = c(rep(1, 3), rep(2, 4), rep(3, 2), rep(4, 5), rep(5, 5))
#' pool_sd(v_test_vals, v_test_group)
pool_sd = function(x, group) {
#validate input
if (!is_simple_vector(x)) stop("x must be a vector!")
group = as.factor(group)
if (!is.factor(group)) stop("group must be a factor or convertible to one!")
#to df
d = data.frame("x" = x, "group" = group)
#summarize
d_sum = plyr::ddply(d, "group", plyr::summarize,
df = sum(!is.na(x)) - 1,
var = var(x, na.rm = TRUE)) %>%
na.omit() #missing data groups
#weighted sum divided by weights (with Bessel's correction), then square root
#https://en.wikipedia.org/wiki/Pooled_variance
sqrt(sum(d_sum$df * d_sum$var) / sum(d_sum$df))
}
#' Standardized mean differences
#'
#' Calculate standardized mean differences between all groups.
#' @param x (numeric vector) A vector of values.
#' @param group (vector) A vector of group memberships.
#' @param central_tendency (function) A function to use for calculating the central tendency. Must support a parameter called na.rm. Ideal choices: mean, median.
#' @param dispersion (character or numeric scalar) Either the name of the metric to use (sd or mad) or a value to use.
#' @param dispersion_method (character scalar) If using one of the built in methods for dispersion, then a character indicating whether to use the pooled value from the total dataset (all), the pairwise comparison (pair), or the sd from the total dataset (total).
#' @param extended_output (lgl) Whether to output a list of matrices. Useful for computational reuse.
#' @param CI (num) Confidence interval coverage.
#' @param str_template (chr) A string template to use.
#' @param digits (whole number scalar) Number of digits to round to in the string output.
#' @param reliability (num) A reliability to use for correcting for measurement error. Done via [kirkegaard::adj_d_reliability()].
#' @param se_analytic (lgl) Use analytic standard errors. If not, bootstrapping is used (slower). Bootstrapping is always used for non-mean functions.
#' @param ... (other arguments) Additional arguments to pass to the central tendency function.
#' @export
#' @examples
#' SMD_matrix(iris$Sepal.Length, iris$Species)
#' SMD_matrix(iris$Sepal.Length, iris$Species, extended_output = T)
#' #pairwise SDs
#' SMD_matrix(iris$Sepal.Length, iris$Species, extended_output = T, dispersion_method = "pair")
SMD_matrix = function(x,
group,
central_tendency = wtd_mean,
dispersion = "sd",
dispersion_method = "all",
extended_output = F,
CI = .95,
str_template = "%d [%lower %upper]",
digits = 2,
reliability = 1,
se_analytic = T,
...) {
#input
x
group
#CI z score
se_CI = qnorm(CI + (1 - CI)/2)
#df form
d_x = data.frame(x = x, group = as.factor(group)) %>%
#remove missing data
na.omit
#find uniqs
uniq = levels(d_x$group)
#how many groups
n_groups = length(uniq)
#group sample sizes
group_ns = table2(d_x[[2]], include_NA = F)
group_ns = set_names(group_ns$Count, group_ns$Group) #change to named vector
#make matrices for results
m = matrix(NA, nrow = n_groups, ncol = n_groups)
#set names
colnames(m) = rownames(m) = uniq
#copies
CI_lower = m
CI_upper = m
se = m
pairwise_n = m
m_str = m
pval = m
#loop for each combo
for (row_i in seq_along(uniq)) {
for (col_i in seq_along(uniq)) {
#skip if on or above the diagonal
if (col_i >= row_i) next
#set values
col = uniq[col_i]
row = uniq[row_i]
#partition data
d_comb = d_x[d_x$group %in% c(col, row), ]
#remove unused factor levels
d_comb$group = forcats::fct_drop(d_comb$group)
#dispersion
if (dispersion == "sd") {
if (dispersion_method == "all") {
disp = pool_sd(d_x$x, d_x$group)
}
if (dispersion_method == "pair") {
disp = pool_sd(d_comb$x, d_comb$group)
}
if (dispersion_method == "total") disp = sd(d_x$x)
} else if (dispersion == "mad") {
#mean of medians, robust
if (dispersion_method == "all") {
disp = plyr::daply(d_x, "group", function(part) {
stats::mad(part$x)
}) %>% mean
}
if (dispersion_method == "pair") {
disp = plyr::daply(d_comb, "group", function(part) {
stats::mad(part$x)
}) %>% mean
}
} else if (is.numeric(dispersion)) disp = dispersion #use given number
#difference
diff = central_tendency(d_comb$x[d_comb$group == col], ...) - central_tendency(d_comb$x[d_comb$group == row], ...)
#divide by dispersion measure
SMD = diff / disp
#save effect size
m[row, col] = SMD
#pairwise sample size
pairwise_n[row, col] = d_comb %>% nrow
#group sample sizes
pair_ns = group_ns[c(row, col)]
#adjust for reliability?
if (reliability != 1) {
m[row, col] = adj_d_reliability(m[row, col], rel = reliability)
}
#standard error
#https://stats.stackexchange.com/questions/144084/variance-of-cohens-d-statistic
#note variance vs. se (hence sqrt)!
se[row, col] = sqrt((sum(pair_ns)/prod(pair_ns)) + ((m[row, col]^2) / (2*(sum(pair_ns) - 2))))
#CIs
CI_upper[row, col] = m[row, col] + se_CI * se[row, col]
CI_lower[row, col] = m[row, col] - se_CI * se[row, col]
#p val, 2-tailed
pval[row, col] = pnorm(abs(m[row, col]) / se[row, col], lower.tail = F) * 2
m_str[row, col] = str_template %>%
#insert estimate
str_replace_all(pattern = "%d", replacement = m[row, col] %>% str_round(digits = digits)) %>%
#insert upper CI
str_replace_all(pattern = "%upper", replacement = CI_upper[row, col] %>% str_round(digits = digits)) %>%
#insert lower CI
str_replace_all(pattern = "%lower", replacement = CI_lower[row, col] %>% str_round(digits = digits)) %>%
#insert n
str_replace_all(pattern = "%n", replacement = pairwise_n[row, col] %>% as.character()) %>%
#insert se
str_replace_all(pattern = "%se", replacement = se[row, col] %>% str_round(digits = digits))
}
}
#names
copy_names(m, m_str)
copy_names(m, se)
copy_names(m, CI_lower)
copy_names(m, CI_upper)
copy_names(m, pval)
copy_names(m, pairwise_n)
#fill upper halves
m %<>% MAT_half2full(diag=T)
m_str %<>% MAT_half2full(diag=T)
se %<>% MAT_half2full(diag=T)
CI_upper %<>% MAT_half2full(diag=T)
CI_lower %<>% MAT_half2full(diag=T)
pval %<>% MAT_half2full(diag=T)
pairwise_n %<>% MAT_half2full(diag=T)
#mode(pairwise_n) = "integer" #change to integer to prevent downstream problems
#output
if (!extended_output) {
return(m)
} else {
list(
d = m,
d_string = m_str,
CI_lower = CI_lower,
CI_upper = CI_upper,
se_z = se_CI,
se = se,
p = pval,
pairwise_n = pairwise_n
)
}
}
#' Calculate homogeneity/heterogeneity
#'
#' Calculate a simple index of homogeneity for nominal data, that is variously called Simpson's, Herfindahl's or Hirschman's index.
#' @param x (a vector) A vector of values.
#' @param reverse (log scalar) Whether to reverse the index to index heterogeneity (default false).
#' @param summary (log scalar) Whether to treat data as summary statistics of the group proportions (default false). If data are given in 0-100 format, it will automatically convert.
#' @export
#' @examples
#' homogeneity(iris$Species)
#' homogeneity(iris$Species, reverse = T)
#' homogeneity(c(.7, .2, .1), summary = T)
#' homogeneity(c(80, 15, 5), summary = T)
homogeneity = function(x, reverse = F, summary = F) {
#not using summary statistics
if (!summary) {
#not reversed
if (!reverse) {
return(table(x) %>% prop.table %>% as.vector %>% raise_to_power(2) %>% sum)
}
#reversed
return(table(x) %>% prop.table %>% as.vector %>% raise_to_power(2) %>% sum %>% subtract(1, .))
}
#using summary statistics
if (sum(x) %>% is_between(.99, 1.01)) {
#not reversed
if (!reverse) {
return(x %>% raise_to_power(2) %>% sum)
}
#reversed
return(x %>% raise_to_power(2) %>% sum() %>% subtract(1, .))
} else if (sum(x) %>% is_between(99, 101)) {
#not reversed
if (!reverse) {
return(x %>% divide_by(100) %>% raise_to_power(2) %>% sum)
}
#reversed
return(x %>% divide_by(100) %>% raise_to_power(2) %>% sum %>% subtract(1, .))
} else {
stop("Tried to use summary statistics, but they did not sum to either around 1 or 100 ( 1%)")
}
}
#' Calculate weighted standard deviation
#'
#' Calculates the weighted standard deviation using a vector of values and a vector of weights.
#' @param x (num vector) A vector of values.
#' @param w (num vector) A vector of weights.
#' @param sample (log scalar) Whether this is a sample as opposed to a population (default true).
#' @param error (lgl scalar) Whether to throw an error if there is no data at all or no pairwise complete cases. Defaults to FALSE.
#' @export
#' @examples
#' set.seed(1)
#' X = rnorm(100)
#' set.seed(1)
#' W = runif(100)
#' sd(X) #0.898
#' wtd_sd(X, W) #slightly different
#' wtd_sd(X) #0.898, not using weights
wtd_sd = function(x, w = NULL, sample = T, error = F) {
#x vector
x = as.vector(x)
#weights
if (is.null(w)) {
w = rep(1, length(x))
} else {
w = as.vector(w)
}
#make temp df
d = data.frame(x = x, w = w) %>% na.omit
#check sample
if (nrow(d) == 0 & error) stop("There were no complete cases!")
if (nrow(d) == 0) return(NA) #return NA on no cases
#weighted mean
x_mean = wtd_mean(d$x, d$w, error = error)
#normalize weights to sum to the number of complete cases
w_norm = d$w / sum(d$w) * nrow(d)
#weighted squared deviations from the weighted mean
diffs_sq = w_norm * (d$x - x_mean)^2
#weighted variance
if (sample) wtd_var = sum(diffs_sq) / (nrow(d) - 1)
if (!sample) wtd_var = sum(diffs_sq) / nrow(d)
#weighted sd
sqrt(wtd_var)
}
#' Calculate a weighted mean
#'
#' This is an improvement on \code{\link{weighted.mean}} in \code{base-r}.
#'
#' The original function returns \code{NA} when there are missing values in the weights vector despite na.rm=T. This function avoids that problem. It also returns a useful error message if there are no complete cases. The function wraps base-r's function.
#' @param x (num vector) A vector of values.
#' @param w (num vector) A vector of weights.
#' @param error (lgl scalar) Whether to throw an error if there is no data at all or no pairwise complete cases. Defaults to FALSE.
#' @export
#' @examples
#' set.seed(1)
#' X = rnorm(100)
#' set.seed(1)
#' W = runif(100)
#' wtd_mean(X) # not using weights
#' mean(X) #same as above
#' wtd_mean(X, W) #slightly different
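#' #unlike weighted.mean(), a missing weight does not make the result NA
#' W_na = W
#' W_na[1] = NA
#' weighted.mean(X, W_na, na.rm = TRUE) #NA
#' wtd_mean(X, W_na) #drops the incomplete pair and proceeds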
wtd_mean = function(x, w = NULL, error = F) {
#x vector
x = as.vector(x)
#weights
if (is.null(w)) {
w = rep(1, length(x))
} else {
w = as.vector(w)
}
#lengths
if (!lengths_match(x, w)) stop("Lengths of x and w do not match!")
#make temp df
d = data.frame(x = x, w = w) %>% na.omit
#check sample
if (nrow(d) == 0 & error) stop("There were no complete cases!")
if (nrow(d) == 0) return(NA) #return NA on no cases
#else
stats::weighted.mean(x = d$x, w = d$w)
}
#' Calculate a weighted sum
#'
#' This is an improvement on \code{\link{sum}} in \code{base-r}.
#'
#' It automatically handles missing data. It returns a useful error message if there are no complete cases.
#' @param x (num vector) A vector of values.
#' @param w (num vector) A vector of weights.
#' @param error (lgl scalar) Whether to throw an error if there is no data at all or no pairwise complete cases. Defaults to FALSE.
#' @export
#' @examples
#' set.seed(1)
#' X = rnorm(100)
#' set.seed(1)
#' W = runif(100)
#' wtd_sum(X) # not using weights
#' sum(X) #same as above
#' wtd_sum(X, W) #different
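#' #missing values are dropped before summing
#' wtd_sum(c(1, 2, NA)) #3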
wtd_sum = function(x, w = NULL, error=F) {
#x vector
x = as.vector(x)
#weights
if (is.null(w)) {
w = rep(1, length(x))
} else {
w = as.vector(w)
}
#lengths
if (!lengths_match(x, w)) stop("Lengths of x and w do not match!")
#make temp df
d = data.frame(x = x, w = w) %>% na.omit
#check sample
if (nrow(d) == 0 & error) stop("There were no complete cases!")
if (nrow(d) == 0) return(NA) #return NA on no cases
#calculate
x_w = sum(d$x * d$w, na.rm = T) # sum of x * w
w_sum = sum(d$w, na.rm = T) # sum of w
(x_w / w_sum) * miss_count(d$x, reverse = T) #weighted sum
}
#' Standardize a vector
#'
#' Standardize a vector. Can use weights and robust measures of central tendency and dispersion. Returns a clean vector as opposed to base-r's \code{\link{scale}}.
#' @param x (num vector) A vector of values.
#' @param w (num vector) A vector of weights.
#' @param robust (log vector) Whether to use robust measures (default false). See \code{\link{mad}} and \code{\link{median}}.
#' @param sample (log scalar) Whether this is a sample as opposed to a population (default true).
#' @param focal_group (lgl vector) A subset of the data to standardize the values to. This is useful when you want one subgroup to be the focal group, using their mean/sd as 0/1.
#' @export
#' @examples
#' set.seed(1)
#' X = rnorm(100, mean = 10, sd = 5)
#' set.seed(1)
#' W = runif(100)
#' standardize(X, W)
#' standardize(X, robust = T) #almost the same for these data
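#' #standardize against a focal group, here the cases with values below 10
#' standardize(X, focal_group = X < 10)[1:5]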
standardize = function(x, w = NULL, robust = F, sample = T, focal_group = NULL) {
#x vector
x = as.vector(x)
#weights
if (is.null(w)) {
w = rep(1, length(x))
} else {
w = as.vector(w)
}
#subset?
if (is.null(focal_group)) focal_group = rep(T, length(x))
#assert equal lengths
assertthat::assert_that(length(focal_group) == length(x))
assertthat::assert_that(is.logical(focal_group))
if (anyNA(focal_group)) {
warning("`focal_group` contains `NA` values. These were converted to `FALSE` following tidyverse convention.")
focal_group = focal_group %>% kirkegaard::NA_to_F()
}
#full x
full_x = x
x = x[focal_group]
w = w[focal_group]
#parametric
if (!robust) {
#weighted mean and sd
x_mean = wtd_mean(x, w)
x_sd = wtd_sd(x, w, sample = sample)
#standardize
return((full_x - x_mean) / x_sd)
}
#robust
#note: weights are not used with the robust measures
if (robust) {
#median
mdn = median(x, na.rm = T)
#mad
mad = mad(x, na.rm = T)
#standardize
return((full_x - mdn) / mad)
}
}
#' Transform to 0-1 scale
#'
#' @param x Numeric vector
#'
#' @return Numeric vector rescaled to 0-1
#' @export
#'
#' @examples
#' transform_01(1:5)
#' transform_01(c(1:3, NA, 4:5))
transform_01 = function(x) {
assertthat::assert_that(!is.character(x))
assertthat::assert_that(!is.factor(x))
#0 length
if (length(x) == 0) return(c())
#set to 0-1 scale
#subtract minimum
x = x - min(x, na.rm = T)
#divide by maximum
x = x / max(x, na.rm = T)
x
}
#' Find a cutoff of a normal distribution that results in a given mean trait value above the cutoff
#'
#' Assuming a normal distribution for a trait and a cutoff value. Estimate what this cutoff value is to obtain a population above the cutoff with a known mean trait value.
#' @param mean_above (num scalar) Mean trait level of population above the cutoff.
#' @param mean_pop (num scalar) Mean trait level of population. Default = 100 (IQ scale).
#' @param sd_pop (num scalar) Standard deviation of the trait in the population. Default = 15 (IQ scale).
#' @param n (num scalar) Sample size to generate in the process. More results in higher precision and memory use. Default = 1e4.
#' @param precision (num scalar) The precision to use. Default = .1.
#' @param below (log scalar) Reverse the model to find cutoffs for a population below the cutoff instead.
#' @export
#' @examples
#' #what cutoff is needed to get a population above the cutoff with a mean of 115 when the population mean is 100?
#' find_cutoff(115)
#' #try an impossible value (throws an error)
#' try(find_cutoff(95))
find_cutoff = function(mean_above, mean_pop = 100, sd_pop = 15, n = 1e4, precision = .1, below = F) {
#check
if (mean_above < mean_pop) stop("This model is inapplicable if the mean trait level is lower than the population mean!")
#dangerous loop!
cutoff = mean_pop #begin with unselected group
iter = 1
#make a replicable population once, the same one is reused every iteration
set.seed(1)
population = rnorm(n = n, mean = mean_pop, sd = sd_pop)
while (T) {
#get the population above the cutoff
population_above = population[population > cutoff]
#mean above
population_above_mean = mean(population_above)
#check
v_diff = mean_above - population_above_mean
if (abs(v_diff) < precision) {
return(cutoff)
}
#adjust cutoff
if (v_diff > 0) cutoff = cutoff + precision
if (v_diff < 0) cutoff = cutoff - precision
#iter + 1
iter = iter + 1
#check if infinite
if (iter >= 1e5) stop("Loop reached 100k iterations without finding a solution! Use a lower level of precision!")
}
}
#' Calculate row-wise representativeness
#'
#' @param x Data frame of numerical data
#' @param central_tendency_fun Function to use for central tendency
#' @param central_tendencies If custom values for central tendency, input here
#' @param standardize Whether to standardize values to ensure common metric. If not done, variables with larger SDs will be given more influence.
#' @param absolute Whether to use absolute errors (if not, then there is also an attempt to ensure the error directions are balanced across variables)
#'
#' @return A data frame of error values and their mean and medians
#' @export
#'
#' @examples
#' calc_row_representativeness(iris)
calc_row_representativeness = function(x, central_tendency_fun = median, central_tendencies = NULL, standardize = T, absolute = T) {
#skip non-numeric variables
is_num = map_lgl(x, is.numeric)
if (any(!is_num)) message("Skipped non-numeric variables")
x = x[is_num]
#standardize if desired
if (standardize) x = map_df(x, kirkegaard::standardize)
#compute central tendencies
if (is.null(central_tendencies)) {
#median
if (identical(central_tendency_fun, median)) {
central_tendencies = map_dbl(x, median, na.rm = T)
} else if (identical(central_tendency_fun, mean)) {
#mean
central_tendencies = map_dbl(x, mean, na.rm = T)
} else {
#custom central tendency function
central_tendencies = map_dbl(x, central_tendency_fun, na.rm = T)
}
} else {
#ensure matching lengths
assert_that(ncol(x) == length(central_tendencies))
}
#subtract central tendencies
if (!absolute) {
x_error = t(t(x) - central_tendencies) %>% as.data.frame()
} else {
x_error = abs(t(t(x) - central_tendencies)) %>% as.data.frame()
}
#add row means and medians, dplyr way
#https://dplyr.tidyverse.org/articles/rowwise.html#per-row-summary-statistics-1
x_error %>%
rowwise() %>%
mutate(
mean = mean(c_across(everything()), na.rm = T),
median = median(c_across(everything()), na.rm = T)
) %>% ungroup() %>%
mutate(row = 1:n()) %>%
select(row, everything())
}
#internal helper functions
#get model R2 adj
get_r2adj = function(x) {
x %>% summary.lm() %>% .$adj.r.squared
}
#boomer code
#https://rdrr.io/cran/rms/src/R/ols.s#sym-print.ols
get_p = function(x) {
1 - pchisq(x$stats['Model L.R.'], x$stats['d.f.'])
}
#' Test for heteroscedasticity
#'
#' @param resid Model residuals
#' @param x Model predictor of interest
#'
#' @return Data frame of results
#' @export
#'
#' @examples
#' #look for mostly nonexistent HS in iris
#' test_HS(resid = resid(lm(Sepal.Length ~ Petal.Length, data = iris2)), x = iris2$Petal.Length)
#' #a lot of HS here
#' test_HS(resid = resid(lm(Petal.Width ~ Petal.Length, data = iris2)), x = iris2$Petal.Length)
test_HS = function(resid, x) {
#data
d = tibble(
resid = standardize(abs(resid)),
x = standardize(x),
resid_rank = rank(resid),
x_rank = rank(x)
)
#fits
mods = list(
fit_linear = rms::ols(resid ~ x, data = d),
fit_spline = rms::ols(resid ~ rms::rcs(x), data = d),
fit_linear_rank = rms::ols(resid_rank ~ x_rank, data = d),
fit_spline_rank = rms::ols(resid_rank ~ rms::rcs(x_rank), data = d)
)
#summarize
y = tibble(
test = c("linear raw", "spline raw", "linear rank", "spline rank"),
r2adj = purrr::map_dbl(mods, get_r2adj),
p = purrr::map_dbl(mods, get_p),
fit = mods
)
#for the rcs, we have to compute the model improvement vs. linear
#https://stackoverflow.com/questions/47937065/how-can-i-interpret-the-p-value-of-nonlinear-term-under-rms-anova
#https://stats.stackexchange.com/questions/405961/is-there-formal-test-of-non-linearity-in-linear-regression
y$p[2] = rms::lrtest(mods$fit_linear, mods$fit_spline)$stats["P"]
y$p[4] = rms::lrtest(mods$fit_linear_rank, mods$fit_spline_rank)$stats["P"]
y %>% dplyr::mutate(log10_p = -log10(p))
}
#' Quantile smoothening
#'
#' @param x Predictor variable
#' @param y Outcome variable
#' @param quantile Which quantile (e.g., .95)
#' @param method Method to use
#' @param k Number of knots for method qgam, see [qgam::qgam]
#' @param window Window size for method running, see [caTools::runquantile]
#'
#' @return Vector of fitted values
#' @export
#'
#' @examples
#' quantile_smooth(iris$Petal.Length, iris$Sepal.Length, quantile = .90)
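#' #spline quantile regression via rms instead (assumes the rms package is installed)
#' quantile_smooth(iris$Petal.Length, iris$Sepal.Length, quantile = .90, method = "Rq")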
quantile_smooth = function(x, y, quantile, method = c("qgam", "Rq", "running"), k = 5, window = 50) {
#method
method = match.arg(method)
if (method == "qgam") {
#same as in example here
#https://rdrr.io/cran/qgam/man/qgam.html
assertthat::assert_that(nchar(system.file(package = "qgam")) > 0)
sink(tempfile())
fit = qgam::qgam(list(y~s(x, k = k, bs = "cr"), ~ s(x, k = k, bs = "cr")),
data = data.frame(x = x, y = y), qu = quantile)
sink()
return(predict(fit))
} else if (method == "Rq") {
#fit with spline
#https://rdrr.io/cran/rms/man/Rq.html
assertthat::assert_that(nchar(system.file(package = "rms")) > 0)
fit = rms::Rq(y ~ rms::rcs(x), data = data.frame(x = x, y = y), tau = quantile)
return(fitted(fit) %>% as.vector())
} else if (method == "running") {
#beware, need to sort the y by x first
#and then return it to original state
assertthat::assert_that(nchar(system.file(package = "caTools")) > 0)
x_order = order(x)
return(caTools::runquantile(y[x_order], k = k, probs = quantile)[x_order])
} else {
stop(str_glue("method not recognized: {method}"), call. = F)
}
}
#' Compute many proportion tests
#'
#' This is a convenient tidy wrapper for `stats::prop.test()`
#'
#' @param x A factor variable
#' @param group A grouping variable
#' @param correct a logical indicating whether Yates' continuity correction should be applied where possible.
#' @param conf_level confidence level of the returned confidence interval. Must be a single number between 0 and 1. Only used when testing the null that a single proportion equals a given value, or that two proportions are equal; ignored otherwise.
#' @param alternative a character string specifying the alternative hypothesis, must be one of "two.sided" (default), "greater" or "less". You can specify just the initial letter. Only used for testing the null that a single proportion equals a given value, or that two proportions are equal; ignored otherwise.
#'
#' @return A data frame, which may have missing levels if a combination did not exist in the data.
#' @export
#'
#' @examples
#' prop_tests(mpg$cyl, mpg$manufacturer)
prop_tests = function(x, group, correct = T, conf_level = .95, alternative = c("two.sided", "less", "greater")) {
#make data frame
tibble(
x = x %>% as.factor(),
group = group %>% as.factor()
) -> xdata
#begin pipe
xdata %>%
#filter NA
filter(!is.na(x), !is.na(group)) %>%
#loop groups
plyr::ddply(("group"), function(dd) {
#loop levels
map_df(levels(dd$x), function(this_level) {
#suppress the continuity correction warnings
suppressWarnings(prop.test(sum(dd$x==this_level), n = nrow(dd), correct = correct, conf.level = conf_level, alternative = alternative) %>% broom::tidy() %>% mutate(n = nrow(dd), n_level = sum(dd$x == this_level)))
}) %>%
mutate(level = levels(dd$x))
}) %>%
mutate(
level = level %>% factor(levels = levels(xdata$x)),
group = group %>% factor(levels = levels(xdata$group)),
) %>%
as_tibble() %>%
select(group, level, everything())
}
#' Test for differential item functioning (DIF)
#'
#' Tests are done following the mirt package approach outlined by Chalmers.
#'
#' @param items Item data to use
#' @param model Item model
#' @param group Group (2 groups at most)
#' @param fscores_pars Any extra scoring parameters used
#' @param messages Show messages
#' @param method Method
#' @param technical Further technical args to pass to mirt
#' @param itemtype Item type
#' @param verbose Verbose output
#' @param DIF_args Arguments to pass to mirt::DIF
#' @param multiple_testing_method Method to use for multiple testing correction, see p.adjust(). Default is "bonferroni"
#' @param ... Other arguments passed to mirt functions
#'
#' @return A list of results
#' @export
#'
#' @examples
#' library(mirt)
#' n = 1000
#' n_items = 10
#'
#' #slopes
#' set.seed(1)
#' a1 = runif(n_items, min = .5, max = 2)
#' a2 = a1
#' a2[1] = 0 #item doesn't work for this group
#' #intercepts
#' i1 = rnorm(n_items, mean = -0.5, sd = 2)
#' i2 = i1
#' i2[2] = -2 #item much harder for this group
#'
#' #simulate data twice
#' d1 = simdata(
#' a1,
#' i1,
#' N = n,
#' itemtype = "2PL",
#' mu = 0
#' )
#'
#' d2 = simdata(
#' a2,
#' i2,
#' N = n,
#' itemtype = "2PL",
#' mu = 1
#' )
#'
#' #combine
#' d = rbind(
#' d1 %>% set_names("item_" + 1:n_items),
#' d2 %>% set_names("item_" + 1:n_items)
#' ) %>% as.data.frame()
#'
#' #find the bias
#' DIF_results = DIF_test(d, model = 1, itemtype = "2PL", group = rep(c(1, 2), each = n))
#' DIF_results$effect_size_items$conservative
#' plot(DIF_results$fits$anchor_conservative)
#' plot(DIF_results$fits$anchor_conservative, type = "trace")
DIF_test = function(items, model, group, fscores_pars = list(full.scores = T, full.scores.SE = T), messages = T, method = "EM", technical = list(), itemtype = NULL, verbose = T, DIF_args = NULL, multiple_testing_method = "bonferroni", ...) {
#deal with missing data in the group var
group_keep = !is.na(group)
items = items[group_keep, ]
group = group[group_keep]
#make mirt args
mirt_args = c(list(data = items, model = model, technical = technical, verbose = verbose, method = method, itemtype = itemtype), list(...))
mirt_args_set2 = mirt_args[!names(mirt_args) %in% c("model", "itemtype")]
#regular fit joint group
if (messages) message("There are 8 steps")
if (messages) message("Step 1: Initial joint fit\n")
mirt_fit = rlang::exec(mirt::mirt, !!!mirt_args)
#ensure group is a factor
if (!is.character(group) && !is.factor(group)) group = factor(group)
if (messages) message("\nStep 2: Initial MI fit")
mirt_fit_MI = rlang::exec(mirt::multipleGroup, !!!mirt_args, group = group, invariance = c('intercepts','slopes', 'free_means', 'free_var'))
#DIFs
if (messages) message("\nStep 3: Leave one out MI testing")
if (is.null(DIF_args)) {
DIF_args = list(
#test all pars in model
which.par = mirt::coef(mirt_fit, simplify = T)$items %>% colnames(),
scheme = "drop"
)
}
#call DIF() with arguments
DIFs = rlang::exec(
mirt::DIF,
MGmodel = mirt_fit_MI,
!!!(mirt_args[!names(mirt_args) %in% c("data", "model", "group", "itemtype")]),
!!!DIF_args
)
DIFs = DIFs %>% rownames_to_column("item")
DIFs$number = 1:nrow(DIFs)
#adjust p values
DIFs$p_adj = DIFs$p %>% p.adjust(method = multiple_testing_method)
#with significant DIF
DIFs_detected_liberal = DIFs %>% filter(p < .05)
DIFs_detected_conservative = DIFs %>% filter(p_adj < .05)
#subset items
items_noDIF_liberal = items %>% dplyr::select(!!setdiff(DIFs$item, DIFs_detected_liberal$item))
items_noDIF_conservative = items %>% dplyr::select(!!setdiff(DIFs$item, DIFs_detected_conservative$item))
#subset models
#tricky!
#if its a g only model, we dont have to do anything
#but if its complex we need name format or Q matrix format
#extract loadings matrix
#convert to Q matrix
mirt_fit_loadings = mirt_fit@Fit$`F`
model_noDIF_liberal_Q = mirt_fit_loadings %>% apply(MARGIN = 2, as.logical) %>% magrittr::set_rownames(rownames(mirt_fit_loadings))
model_noDIF_conservative_Q = model_noDIF_liberal_Q
#set unused items' rows to FALSE
model_noDIF_liberal_Q[DIFs_detected_liberal$item, ] = F
model_noDIF_conservative_Q[DIFs_detected_conservative$item, ] = F
#fit together without DIF
if (messages) message("\nStep 4: Fit without DIF items, liberal threshold")
mirt_fit_noDIF_liberal = rlang::exec(mirt::mirt, model = mirt::mirt.model(model_noDIF_liberal_Q), !!!mirt_args_set2)
if (messages) message("\nStep 5: Fit without DIF items, conservative threshold")
mirt_fit_noDIF_conservative = rlang::exec(mirt::mirt, model = mirt::mirt.model(model_noDIF_conservative_Q), !!!mirt_args_set2)
#with anchors
if (messages) message("\nStep 6: Fit with anchor items, liberal threshold")
mirt_fit_anchors_liberal = rlang::exec(mirt::multipleGroup, !!!mirt_args, group = group, invariance = c(items_noDIF_liberal %>% names(), 'free_means', 'free_var'))
if (messages) message("\nStep 7: Fit with anchor items, conservative threshold")
mirt_fit_anchors_conservative = rlang::exec(mirt::multipleGroup, !!!mirt_args, group = group, invariance = c(items_noDIF_conservative %>% names(), 'free_means', 'free_var'))
#get scores
if (messages) message("\nStep 8: Get scores")
orig_scores = do.call(what = mirt::fscores, args = c(list(object = mirt_fit), fscores_pars))
noDIF_scores_liberal = do.call(what = mirt::fscores, args = c(list(object = mirt_fit_noDIF_liberal), fscores_pars))
noDIF_scores_conservative = do.call(what = mirt::fscores, args = c(list(object = mirt_fit_noDIF_conservative), fscores_pars))
anchor_scores_liberal = do.call(what = mirt::fscores, args = c(list(object = mirt_fit_anchors_liberal), fscores_pars))
anchor_scores_conservative = do.call(what = mirt::fscores, args = c(list(object = mirt_fit_anchors_conservative), fscores_pars))
#in a data frame
scores = list(
#original scores
original = orig_scores,
#after DIF removal
noDIF_liberal = noDIF_scores_liberal,
noDIF_conservative = noDIF_scores_conservative,
#anchor scores
anchor_liberal = anchor_scores_liberal,
anchor_conservative = anchor_scores_conservative
)
#effect sizes
#this only works with 1 dimensional models
#https://groups.google.com/forum/#!topic/mirt-package/hAj7jfdzsxY
if (ncol(mirt_fit_loadings) == 1) {
#item level
effect_size_items = list(
liberal = mirt::empirical_ES(mirt_fit_anchors_liberal, DIF = T, plot = F),
conservative = mirt::empirical_ES(mirt_fit_anchors_conservative, DIF = T, plot = F)
)
#test level
effect_size_test = list(
liberal = mirt::empirical_ES(mirt_fit_anchors_liberal, DIF = F, plot = F),
conservative = mirt::empirical_ES(mirt_fit_anchors_conservative, DIF = F, plot = F)
)
} else {
#we fill in NULLS to keep structure
#item
effect_size_items = list(
liberal = NULL,
conservative = NULL
)
#test
effect_size_test = list(
liberal = NULL,
conservative = NULL
)
}
#out
list(
scores = scores,
fits = list(
original = mirt_fit,
noDIF_liberal = mirt_fit_noDIF_liberal,
noDIF_conservative = mirt_fit_noDIF_conservative,
anchor_liberal = mirt_fit_anchors_liberal,
anchor_conservative = mirt_fit_anchors_conservative
),
DIF_stats = DIFs,
effect_size_items = effect_size_items,
effect_size_test = effect_size_test
)
}
#convert from slope to loading
#' Convert from discrimination (slope) to factor loading
#'
#' @param x Vector of slopes (discriminations)
#' @param logit_scaling Whether to use logit scaling
#'
#' @return A vector of factor loadings
#' @export
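#' @examples
#' #a 2PL slope of 1.7 corresponds to a loading of about .70 under logit scaling
#' slope_to_loading(1.7)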
slope_to_loading = function(x, logit_scaling = T) {
if (logit_scaling) {
scaling_factor = 3
} else {
scaling_factor = 1
}
sqrt(x^2 / (x^2 + scaling_factor))
}
#' Convert from factor loading to discrimination (slope)
#'
#' @param x A vector of factor loadings
#' @param logit_scaling Whether to use logit scaling
#'
#' @return A vector of slopes (discriminations)
#' @export
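#' @examples
#' #a loading of .70 corresponds to a slope of about 1.7 under logit scaling
#' loading_to_slope(.70)
#' #round trip recovers the loading
#' slope_to_loading(loading_to_slope(.70))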
loading_to_slope = function(x, logit_scaling = T) {
if (logit_scaling) {
scaling_factor = 3
} else {
scaling_factor = 1
}
sqrt((scaling_factor*x^2) / (1 - x^2))
}
#' Calculate item gaps
#'
#' @param x Item data frame
#' @param group A grouping variable
#' @param return_data_frame Whether you want a data frame back, or just a vector
#'
#' @return a data frame or a vector
#' @export
#' @examples
#' set.seed(1)
#' X = matrix(rbinom(10000, 1, .5), ncol = 10)
#' group = rbinom(1000, 1, .5)
#' calc_item_gaps(X, group)
calc_item_gaps = function(x, group, return_data_frame = T) {
#subset to complete cases for group
no_na = !is.na(group)
x = x[no_na, ]
group = group[no_na] %>% as.factor()
#input check
assert_that(is.matrix(x) | is.data.frame(x))
x = as.data.frame(x)
assert_that(all(map_lgl(x, is.numeric)))
assert_that(length(levels(group)) == 2)
#decide focal group
focal_group = levels(group)[1]
alt_group = levels(group)[2]
message(str_glue("Focal group is {focal_group}. Positive values mean that {focal_group} > {alt_group}"))
#prep data frame
res = tibble(
item_i = 1:ncol(x),
item = colnames(x),
focal_pass_rate = map_dbl(x[group == focal_group, ], mean, na.rm = T),
alt_pass_rate = map_dbl(x[group == alt_group, ], mean, na.rm = T),
d_gap = qnorm(focal_pass_rate) - qnorm(alt_pass_rate)
)
#return
if (return_data_frame) {
return(res)
} else {
return(res$d_gap)
}
}
#' Create norms and adjust scores for age effects
#'
#' This function adjusts for the effect of age on the mean and standard deviation of a score, and then creates norms for setting the resulting scores to follow the IQ scale of 100/15 for the chosen subgroup.
#'
#' @param score A vector of scores
#' @param age A vector of ages
#' @param norm_group A logical vector of the same length as the score vector, indicating the norm group. Default is all cases.
#' @param p_value The p value threshold for model inclusion. Default is .01.
#'
#' @return A list of results, including corrected scores, age correction models used, and means and standard deviations for the norm group post-correction.
#' @export
#' @rdname Norms_and_age_corrections
#'
#' @examples
#' #simulate some data, mess up the norms, and get them back
#' set.seed(1)
#' data = tibble(
#' true_IQ = c(rnorm(1000, mean = 100, sd = 15), rnorm(1000, mean = 90, sd = 15)),
#' true_z = (true_IQ - 100 ) / 15,
#' age = runif(2000, min = 18, max = 80),
#' norm_group = c(rep(T, 1000), rep(F, 1000))
#' ) %>% mutate(
#' #add an effect of age on the mean and dispersion of scores
#' age_mean_effect = age * 0.3 - 0.3 * 18,
#' age_sd_effect = true_z * 15 * rescale(age, new_min = .80, new_max = 1.20),
#' score = 100 + age_sd_effect + age_mean_effect
#' )
#'
#' #restore the norms
#' norms = make_norms(data$score, data$age, data$norm_group)
#' data$IQ = norms$data$IQ
#' #manually apply norms
#' data$IQ2 = apply_norms(data$score, data$age, prior_norms = norms)
#' data$IQ3 = apply_norms(data$score, data$age,
#' age_model_slope = norms$age_model$coefficients[2],
#' age_model_intercept = norms$age_model$coefficients[1],
#' age_model_abs_slope = norms$age_model_abs$coefficients[2],
#' age_model_abs_intercept = norms$age_model_abs$coefficients[1],
#' mean = norms$norm_desc$mean,
#' sd = norms$norm_desc$sd
#' )
#'
#' #verify that scores were made correct
#' cor(data)
#' GG_scatter(data, "age", "score")
#' #can we detect the age heteroscedasticity too?
#' test_HS(resid = resid(lm(score ~ age, data = data)), x = data$age)
#' #plot the results
#' GG_scatter(data, "true_IQ", "score") + geom_abline(slope = 1, intercept = 0, linetype = 2)
#' GG_scatter(data, "true_IQ", "IQ") + geom_abline(slope = 1, intercept = 0, linetype = 2)
make_norms = function(score, age, norm_group = NULL, p_value = .01) {
#if no norm group, use all rows
if (is.null(norm_group)) {
norm_group = rep(T, length(score))
}
#make a data frame with variables
d_all = tibble(
score = score,
age = age,
norm_group = norm_group
)
#norm subset
d_norm = d_all %>%
filter(norm_group)
#correct for the mean effect of age
age_model = lm(score ~ age, data = d_norm)
age_model_glance = broom::glance(age_model)
#if the p value is too high, we dont use the model
if (age_model_glance$p.value >= p_value) {
d_norm$score_ageadj1 = d_norm$score
} else {
d_norm$score_ageadj1 = resid(age_model)
}
#fit model for the SD effect of age
age_model_abs = lm(abs(score_ageadj1) ~ age, data = d_norm)
age_model_abs_glance = broom::glance(age_model_abs)
#apply corrections to the full data
#if model is not significant, we dont use it
if (age_model_glance$p.value >= p_value) {
message(str_glue("No detected linear effect of age on the score (p = {p_to_asterisk(age_model_glance$p.value)}). Model not used."))
d_all$score_ageadj1 = d_all$score
} else {
message(str_glue("Detected linear effect of age on the score (p = {p_to_asterisk(age_model_glance$p.value)}). Model used."))
d_all$score_ageadj1 = d_all$score - predict(age_model, newdata = d_all)
}
#if model is not significant, we dont use it
if (age_model_abs_glance$p.value >= p_value) {
message(str_glue("No detected variance effect of age on the score (p = {p_to_asterisk(age_model_abs_glance$p.value)}). Model not used."))
d_all$score_ageadj2 = d_all$score_ageadj1
} else {
message(str_glue("Detected variance effect of age on the score (p = {p_to_asterisk(age_model_abs_glance$p.value)}). Model used."))
d_all$score_ageadj2 = d_all$score_ageadj1 / predict(age_model_abs, newdata = d_all)
}
#find the norm group mean/SD
norm_desc = d_all %>% filter(norm_group) %>% select(score_ageadj2) %>% describe2()
d_all$score_ageadj3 = (d_all$score_ageadj2 - norm_desc$mean) / norm_desc$sd
#output IQ scores
d_all$IQ = d_all$score_ageadj3 * 15 + 100
#list of results
list(
#adjusted data
data = d_all,
#models fitted
age_model = age_model,
age_model_abs = age_model_abs,
#used models?
age_model_used = age_model_glance$p.value < p_value,
age_model_abs_used = age_model_abs_glance$p.value < p_value,
#means and SD for norm group post corrections
norm_desc = norm_desc
)
}
#make a function that applies norms to a new data set based on prior results
#' Apply norms to a new data set
#'
#' @param score A vector of scores to correct
#' @param age A vector of ages to use
#' @param prior_norms A list of prior norms to use from `make_norms`
#' @param age_model_slope A manually supplied slope for the age model
#' @param age_model_intercept A manually supplied intercept for the age model
#' @param age_model_abs_slope A manually supplied slope for the age model
#' @param age_model_abs_intercept A manually supplied intercept for the age model
#' @param mean A manually supplied mean for the norm group
#' @param sd A manually supplied standard deviation for the norm group
#'
#' @return A vector of IQ scores
#' @export
#' @rdname Norms_and_age_corrections
apply_norms = function(score,
age,
prior_norms = NULL,
age_model_slope = 0,
age_model_intercept = 0,
age_model_abs_slope = 0,
age_model_abs_intercept = 0,
mean = NULL,
sd = NULL) {
#if prior norms, use them
if (!is.null(prior_norms)) {
#pull values from prior norms
#but don't if they are non-significant
if (prior_norms$age_model_used) {
age_model_slope = prior_norms$age_model$coefficients[2]
age_model_intercept = prior_norms$age_model$coefficients[1]
}
if (prior_norms$age_model_abs_used) {
age_model_abs_slope = prior_norms$age_model_abs$coefficients[2]
age_model_abs_intercept = prior_norms$age_model_abs$coefficients[1]
}
mean = prior_norms$norm_desc$mean
sd = prior_norms$norm_desc$sd
}
#apply the norms
#apply linear correction
if (age_model_slope == 0) {
score_ageadj1 = score
} else {
score_ageadj1 = score - age_model_slope * age - age_model_intercept
}
#apply variance correction
if (age_model_abs_slope == 0) {
score_ageadj2 = score_ageadj1
} else {
score_ageadj2 = score_ageadj1 / (age_model_abs_slope * age + age_model_abs_intercept)
}
#apply norm group correction
score_ageadj3 = (score_ageadj2 - mean) / sd
#IQ scale
IQ = score_ageadj3 * 15 + 100
IQ
}