#' Pre-process data.
#'
#' Performs listwise deletion of missing observations, validates user inputs,
#' merges user-supplied control parameters with defaults, determines item
#' response types, builds starting values, and allocates result containers
#' for regularized DIF estimation.
#'
#' @param item.data Matrix or data frame of item responses.
#' @param pred.data Matrix or data frame of DIF and/or impact predictors.
#' @param prox.data Vector of observed proxy scores.
#' @param item.type Character value or vector indicating the item response
#' distributions.
#' @param pen.type Character indicating type of penalty.
#' @param tau Optional numeric vector of tau values.
#' @param num.tau Numeric indicating number of tau values to run Reg-DIF on.
#' @param anchor Optional numeric value or vector indicating which item
#' response(s) are anchors (e.g., \code{anchor = 1}).
#' @param stdz Logical value indicating whether to standardize DIF and
#' impact predictors for regularization.
#' @param control Optional list of additional model specification and
#' optimization parameters.
#' @param call Defined from regDIF.
#'
#' @return a \code{"list"} of pre-processed data, starting values, result
#' containers, and controls for \code{"em_estimation"}
#'
#' @keywords internal
#'
preprocess <-
  function(item.data,
           pred.data,
           prox.data,
           item.type,
           pen.type,
           tau,
           num.tau,
           anchor,
           stdz,
           control,
           call) {

  # Remove observations with any NA (listwise deletion).
  if (is.null(prox.data)) {
    combined.data <- cbind(pred.data, item.data)
  } else {
    combined.data <- cbind(pred.data, item.data, prox.data)
  }
  NA_cases <- apply(combined.data, 1, function(x) any(is.na(x)))
  if (sum(NA_cases) == nrow(combined.data)) {
    stop(paste0("No observations remain after performing listwise deletion. Consider removing ",
                "variables with the greatest amount of missingness."),
         call. = FALSE)
  }
  # drop = FALSE keeps single-column inputs from collapsing to vectors,
  # which would break apply()/dim() calls below.
  item.data <- item.data[!NA_cases, , drop = FALSE]
  pred.data <-
    if (is.null(dim(pred.data))) {
      as.matrix(pred.data[!NA_cases])
    } else {
      as.matrix(pred.data[!NA_cases, , drop = FALSE])
    }
  prox.data <- if (!is.null(prox.data)) prox.data[!NA_cases]

  # Control parameters: defaults first, then overwritten by any
  # user-supplied entries of the same name.
  final_control <- list(impact.mean.data = pred.data,
                        impact.var.data = pred.data,
                        tol = 10^-5,
                        maxit = 2000,
                        adapt.quad = FALSE,
                        num.quad = 21,
                        # Gaussian (cfa) items need narrower integration limits.
                        int.limits = if (any(item.type == "cfa")) c(-3, 3) else c(-6, 6),
                        optim.method = "UNR",
                        start.values = list())
  if (length(control) > 0) final_control[names(control)] <- control

  # Pre-process errors and warnings.
  if (!(any(item.type == "rasch") ||
        any(item.type == "2pl") ||
        any(item.type == "graded") ||
        any(item.type == "cfa") ||
        is.null(item.type))) {
    stop("Item response types must be \"rasch\", \"2pl\", \"graded\", or \"cfa\".",
         call. = FALSE)
  }
  if (any(tau < 0)) {
    stop("Tau values must be non-negative.", call. = FALSE)
  }
  if (length(tau) > 1 && all(diff(tau) >= 0)) {
    # Example corrected: taus must be descending and non-negative.
    stop("Tau values must be in descending order (e.g., tau = c(3,2,1)).", call. = FALSE)
  }
  if (is.null(anchor) && length(tau) == 1) {
    if (tau == 0) {
      stop("Anchor item must be specified with tau = 0.", call. = FALSE)
    }
  }
  if (!is.null(anchor) && !is.numeric(anchor)) {
    stop("Anchor items must be numeric (e.g., anchor = 1).", call. = FALSE)
  }
  if (isTRUE(final_control$adapt.quad)) {
    warning(paste0("Adaptive quadrature is not fully supported. Fixed-point ",
                   "quadrature is recommended at this time."), call. = FALSE, immediate. = TRUE)
  }
  if (any(NA_cases)) {
    warning(paste0("Removed the following cases with missing values (NA) - ",
                   paste0(which(NA_cases), collapse = ", ")),
            call. = FALSE,
            immediate. = TRUE)
  }

  # Define number of tau values. When tau is NULL, a large placeholder
  # signals that the tau grid must be identified from the data.
  if (is.null(tau)) {
    tau_vec <- 1e20
    id_tau <- TRUE
  } else {
    num.tau <- length(tau)
    tau_vec <- tau
    id_tau <- FALSE
  }

  # Coerce to numeric matrices to speed up computation.
  item_data <- as.matrix(apply(as.matrix(item.data), 2, as.numeric))
  pred_data <- as.matrix(apply(as.matrix(pred.data), 2, as.numeric))
  prox_data <- if (!is.null(prox.data)) scale(as.matrix(as.numeric(prox.data)))
  mean_predictors <-
    as.matrix(apply(as.matrix(final_control$impact.mean.data), 2, as.numeric))
  var_predictors <-
    as.matrix(apply(as.matrix(final_control$impact.var.data), 2, as.numeric))

  # Remove any variables with no variance.
  item_data_no_var <- apply(item_data, 2, var) == 0
  pred_data_no_var <- apply(pred_data, 2, var) == 0
  item_data <- item_data[, !item_data_no_var, drop = FALSE]
  pred_data <- pred_data[, !pred_data_no_var, drop = FALSE]
  if (any(item_data_no_var) || any(pred_data_no_var)) {
    warning(paste0("Removing the following variables from analysis because they have no variance ",
                   "(before or after list-wise deletion):"),
            paste0("\n"),
            paste0(names(item.data)[item_data_no_var],
                   names(pred.data)[pred_data_no_var], collapse = " "),
            call. = FALSE, immediate. = TRUE)
  }
  # Keep item.data column-aligned with item_data for the categorical
  # re-coding step below (previously the columns could misalign after
  # zero-variance removal).
  item.data <- item.data[, !item_data_no_var, drop = FALSE]
  # NOTE(review): user-supplied impact.mean.data/impact.var.data are not
  # re-aligned when predictor columns are dropped for zero variance —
  # TODO confirm intended behavior.

  # Get dimensions of data.
  samp_size <- dim(item_data)[1]
  num_items <- dim(item_data)[2]
  num_predictors <- dim(pred_data)[2]

  # Determine initial number of responses per item.
  num_responses <-
    as.vector(apply(item_data, 2, function(x) length(unique(na.omit(x)))))

  # Expand item.type to one entry per item.
  if (is.null(item.type)) {
    # Infer each item's type from its number of unique responses:
    # 2 -> binary 2PL; 3-6 -> graded; 7+ -> continuous (cfa).
    item_type <- sapply(seq_len(num_items), function(item) {
      if (num_responses[item] == 2) {
        "2pl"
      } else if (num_responses[item] < 7) {
        "graded"
      } else {
        "cfa"
      }
    })
  } else if (length(item.type) == 1) {
    item_type <- rep(item.type, num_items)
  } else if (length(item.type) == num_items) {
    # BUG FIX: a full-length vector of item types was previously ignored,
    # leaving item_type undefined and crashing downstream.
    item_type <- item.type
  } else {
    stop("item.type must be a single value or one value per item.",
         call. = FALSE)
  }

  # Re-code categorical item responses to consecutive integer codes.
  cat_items <- item_type == "rasch" |
    item_type == "2pl" |
    item_type == "graded"
  if (any(cat_items)) {
    item_data[, which(cat_items)] <-
      apply(item.data[, which(cat_items), drop = FALSE],
            2,
            function(x) as.numeric(as.factor(x)))
    num_responses[which(cat_items)] <-
      apply(item_data[, which(cat_items), drop = FALSE],
            2,
            function(x) length(unique(na.omit(x))))
  }

  # Update number of quad pts for ordered categorical or gaussian items
  # (only when the user kept the default of 21).
  if (any(num_responses != 2) && final_control$num.quad == 21) {
    if (any(num_responses > 6)) {
      final_control$num.quad <- 101
    } else {
      final_control$num.quad <- 51
    }
  }

  # Define fixed quadrature points.
  theta <- seq(final_control$int.limits[1],
               final_control$int.limits[2], length.out = final_control$num.quad)
  if (!isTRUE(final_control$adapt.quad) && final_control$num.quad < 21) {
    warning(paste0("When using fixed quadrature, greater than 20 points for ",
                   "binary item responses or 50 points for ordered ",
                   "responses is recommended to yield precise estimates."), call. = FALSE,
            immediate. = TRUE)
  }

  # Standardize predictors for regularization.
  if (isTRUE(stdz)) {
    pred_data <- scale(pred_data)
    mean_predictors <- scale(mean_predictors)
    var_predictors <- scale(var_predictors)
  }

  # Penalty type (lasso by default).
  pen_type <- if (is.null(pen.type)) "lasso" else pen.type

  # Starting values: one list entry per item, plus impact mean and
  # impact variance parameter vectors.
  p <- replicate(n = num_items + 2, list(NA), simplify = FALSE)
  for (item in seq_len(num_items)) {
    # Different item response types.
    if (item_type[item] == "cfa") {
      # Continuous item: intercept, loading, DIF terms, residual variance.
      p[[item]] <- c(mean(item_data[, item]),
                     sqrt(.5 * var(item_data[, item])),
                     rep(0, num_predictors),
                     rep(0, num_predictors),
                     .5 * var(item_data[, item]),
                     rep(0, num_predictors))
      names(p[[item]]) <- c(paste0('c0_item', item, "_int"),
                            paste0('a0_item', item, "_"),
                            paste0('c1_item', item, "_cov", 1:num_predictors),
                            paste0('a1_item', item, "_cov", 1:num_predictors),
                            paste0('s0_item', item, "_"),
                            paste0('s1_item', item, "_cov", 1:num_predictors))
    } else if (num_responses[item] == 2) {
      # Binary item: single intercept, slope, DIF terms.
      p[[item]] <- c(0,
                     1,
                     rep(0, num_predictors),
                     rep(0, num_predictors))
      names(p[[item]]) <- c(paste0('c0_item', item, "_int1"),
                            paste0('a0_item', item, "_"),
                            paste0('c1_item', item, "_cov", 1:num_predictors),
                            paste0('a1_item', item, "_cov", 1:num_predictors))
    } else {
      # Ordered categorical item: one threshold per boundary.
      p[[item]] <- c(0,
                     seq(.25, 1, length.out = num_responses[item] - 2),
                     1,
                     rep(0, num_predictors),
                     rep(0, num_predictors))
      names(p[[item]]) <- c(paste0('c0_item', item, "_int",
                                   1:(num_responses[item] - 1)),
                            paste0('a0_item', item, "_"),
                            paste0('c1_item', item, "_cov", 1:num_predictors),
                            paste0('a1_item', item, "_cov", 1:num_predictors))
    }
  }
  p[[(num_items + 1)]] <- rep(0, ncol(mean_predictors))
  p[[(num_items + 2)]] <- rep(0, ncol(var_predictors))
  names(p[[(num_items + 1)]]) <- paste0(rep(paste0('mean',
                                                   1:ncol(mean_predictors))))
  names(p[[(num_items + 2)]]) <- paste0(rep(paste0('var',
                                                   1:ncol(var_predictors))))

  # Update starting values if provided by the user.
  if (length(final_control$start.values) > 0) {
    for (parm in seq_len(ncol(mean_predictors))) {
      p[[(num_items + 1)]][[parm]] <- final_control$start.values$impact[parm]
    }
    for (parm in seq_len(ncol(var_predictors))) {
      p[[(num_items + 2)]][[parm]] <-
        final_control$start.values$impact[ncol(mean_predictors) + parm]
    }
    for (item in seq_len(num_items)) {
      p[[item]][[1]] <- final_control$start.values$base[item]
      p[[item]][[2]] <- final_control$start.values$base[num_items + item]
      for (cov in seq_len(num_predictors)) {
        p[[item]][[2 + cov]] <-
          final_control$start.values$dif[((item - 1) * num_predictors) + cov]
        p[[item]][[length(p[[item]]) - num_predictors + cov]] <-
          final_control$start.values$dif[(num_items * num_predictors) + ((item - 1) * num_predictors) + cov]
      }
    }
  }

  # Count baseline and DIF parameters (cfa items add s0/s1 terms).
  if (any(item_type == "cfa")) {
    num_base_parms <- length(c(unlist(p)[grep('c0', names(unlist(p)))],
                               unlist(p)[grep('a0', names(unlist(p)))],
                               unlist(p)[grep('s0', names(unlist(p)))]))
    num_dif_parms <- length(c(unlist(p)[grep('c1', names(unlist(p)))],
                              unlist(p)[grep('a1', names(unlist(p)))],
                              unlist(p)[grep('s1', names(unlist(p)))]))
  } else {
    num_base_parms <- length(c(unlist(p)[grep('c0', names(unlist(p)))],
                               unlist(p)[grep('a0', names(unlist(p)))]))
    num_dif_parms <- length(c(unlist(p)[grep('c1', names(unlist(p)))],
                              unlist(p)[grep('a1', names(unlist(p)))]))
  }

  # Allocate result containers, one column/entry per tau value.
  final_length <- if (is.null(tau)) num.tau else length(tau)
  final <- list(tau_vec = rep(NA, final_length),
                aic = rep(NA, final_length),
                bic = rep(NA, final_length),
                impact = matrix(NA, ncol = final_length,
                                nrow = (ncol(mean_predictors) +
                                          ncol(var_predictors))),
                base = matrix(NA, ncol = final_length,
                              nrow = num_base_parms),
                dif = matrix(NA, ncol = final_length,
                             nrow = num_dif_parms),
                # EAP containers keep one row per original observation,
                # including listwise-deleted cases.
                eap = list(scores = matrix(NA, ncol = final_length,
                                           nrow = length(NA_cases)),
                           sd = matrix(NA, ncol = final_length,
                                       nrow = length(NA_cases))),
                estimator_history = lapply(seq_len(num.tau),
                                           function(x) {
                                             mat <- matrix(0,
                                                           ncol = 1,
                                                           nrow = length(unlist(p)) + 1)
                                             rownames(mat) <- c(names(unlist(p)),
                                                                "observed_ll")
                                             return(mat)
                                           }),
                log_lik = NA,
                complete_ll_info = list(),
                data = vector("list", 3),
                exit_code = 0,
                call = call)

  return(list(p = p,
              final = final,
              item_data = item_data,
              pred_data = pred_data,
              prox_data = prox_data,
              mean_predictors = mean_predictors,
              var_predictors = var_predictors,
              item_type = item_type,
              theta = theta,
              final_control = final_control,
              num_responses = num_responses,
              num_predictors = num_predictors,
              samp_size = samp_size,
              num_items = num_items,
              pen_type = pen_type,
              tau_vec = tau_vec,
              num_tau = num.tau,
              id_tau = id_tau,
              num_quad = final_control$num.quad,
              adapt_quad = final_control$adapt.quad,
              optim_method = final_control$optim.method,
              estimator_history = final$estimator_history,
              estimator_limit = FALSE,
              exit_code = 0,
              NA_cases = NA_cases))
}