#' Flexible maximum-likelihood-based regression function
#' @param controlList one or more xregControl-objects (concatenated using c() ) to control maximum-likelihood estimation for different types or sets of data provided in dataList. Can alternatively provide a single formula or list of formulas, in which case an xregControl object will be generated on the fly. An xreg-object can be provided, in which case the xregControlList will be extracted. See examples
#' @param dataList Named list of data.frames for which maximum sum-likelihood should be estimated. Names must correspond to names of xregControl objects. If no names are provided, order will be used to decide which xregControl-object to use.
#' @param start_values Optional named list or vector of numeric start values, to override those provided in the xregControl obects. Previously estimated xreg-objects can be used, in which case fitted values will be extracted.
#' @param fixed_values Optional named list or vector of numeric values that will be held as fixed, and will be available in the same way as the fitted parameters. An xreg-object can be used.
#' @param latent_classes Optional integer parameter defining the number of latent classes to be fitted
#' @param latent_class_parameter Optional character vector with names corresponding to the parameters that are to be varied in the latent classes (if latent_classes > 1)
#' @param latent_id_colname Optional parameter defining the name of the id column defining the individuals (or groups) over which latent classes are to be fitted
#' @param return_type Optional debug function. Default value, "fit", returns and xreg-object with fitted values. "df" returns the final data.frame of estimated values. "first" returns likelihood for providedd start values. "first_df" returns the initial data.frame with start_values. "predict" returns final data.frame. "precalc_df" returns data.frame prior to calculation of likelihood. Useful for debugging of likelihood-function.
#' @param print_sum Optional argument to force continuous printing of the progress of fitting. Can be useful for debugging.
#' @param ... optional arguments to be forwarded to x_mle, optim, xregControl, etc.
#' @description Allows a list of formulas and fitting functions to be used for each type or set of data.
#' @aliases summary.xreg, print.xreg
#' @author Kim Rand
#' @examples
#' control_continuous <- xregControl(formulas = list(x ~ y * YVAR + z + ZVAR, value ~ INTERCEPT + x * XVAR), start_values = c(INTERCEPT = 0, XVAR = 1, YVAR = 1, ZVAR = 1), p_fun = cont_normal, name = "CONTINUOUS")
#' control_dichotomos <- xregControl(formulas = list(value ~ INTERCEPT2 + z * ZVAR), start_values = c(INTERCEPT2 = 0, ZVAR = 1), p_fun = dich_logistic, name = "DICHOTOMOUS")
#' joint_control <- c(control_continuous, control_dichotomous)
#' xreg(controlList = joint_control, dataList = list(CONTINUOUS = df1, DICHOTOMUS = df2))
#' @export
xreg <- function(controlList,
dataList,
start_values = numeric(),
fixed_values = numeric(),
latent_classes = 0,
latent_class_parameters = character(),
latent_id_colname = character(),
return_type = "fit",
print_sum = F,
...) {
dot_args <- list(...)
dot_args[c("p_fun", "p_aggrgation_fun")] <- NULL
if(latent_classes < 2) latent_class_parameters <- character()
if(!is.character(latent_class_parameters)) {
warning("Latent_class_parameters not character. Latent classes will be disregarded.")
latent_class_parameters <- character()
latent_classes <- 0
latent_id_colname <- character()
}
if(!is.character(latent_id_colname)) {
warning("latent_id_colname not character. Latent classes will be disregarded.")
latent_class_parameters <- character()
latent_classes <- 0
latent_id_colname <- character()
}
if(length(latent_id_colname)) latent_id_colname <- latent_id_colname[1]
lik_fun = xlik
startValues <- attr(controlList, "startValues")
if("xreg" %in% class(controlList)) {
if(is.na(start_values[1])) start_values <- controlList$coef
controlList <- controlList$controlList
}
formulas <- controlList
#return(match_formals(fun = xregControl, formulas = controlList))
controlList <- do.call(xregControl, match_formals(fun = xregControl, formulas = controlList))
#print(controlList)
#if("p_fun" %in% names(list(...))) list(...)[["p_fun"]] <- NULL
if("xreg" %in% class(start_values)) startValues <- start_values$coef
if("numeric" %in% class(start_values) & length(start_values) > 0) startValues <- start_values
if(class(dataList) == "data.frame") {
dataList <- list(dataList)
names(dataList)[1] <- names(controlList)[1]
warning(paste0("Dataframe provided where list expected in dataList argument. The dataframe has been enclosed in a list, and named \"", names(controlList)[1], "\" to match the first xregControl-object."))
}
if(!all(names(dataList) %in% names(controlList))) stop(paste("No control list provided for ", paste(names(dataList)[!names(dataList) %in% names(controlList)], collapse = ", ")))
if(is.null(names(dataList))) {
if(length(dataList) <= length(controlList)) {
warning("Provided datalist was unnamed. Names coerced from controlList")
names(dataList) <- names(controlList)[1:length(dataList)]
} else {
stop("Provided datalist was unnamed, and longer than provided controllist. Exiting.")
}
}
if(length(unique(names(startValues))) < length(startValues)) {
warning(paste0("More than one start value assigned for parameters (", paste(names(startValues)[-match(unique(names(startValues)), names(startValues))], collapse = ", "), "). First value used."))
startValues <- startValues[unique(names(startValues))]
}
if(length(fixed_values)) {
if("xreg" %in% class(fixed_values)) {
fixed_values <- fixed_values$pars
#print("Gothere")
}
if("data.frame" %in% class(fixed_values)) {
fixed_df <- fixed_values[, 1:2]
fixed_values <- fixed_values[,1]
names(fixed_values) <- rownames(fixed_df)
} else {
#print(fixed_values)
fixed_df <- data.frame(Estimate = fixed_values, 'Std. Error' = rep(NA, NROW(fixed_values)))
colnames(fixed_df) <- c('Estimate', 'Std. Error')
rownames(fixed_df) <- names(fixed_values)
}
} else {
fixed_df <- data.frame(a = numeric(), b = numeric())
}
colnames(fixed_df) <- c('Estimate', 'Std. Error')
fixedValues <- numeric()
this_call <- match.call()
initControlList <- controlList
aggr <- TRUE
return_lik_df <- (return_type == "df")
return_first <- (return_type == "first")
#return_lik_df <- FALSE
if(return_type == "predict") {
aggr <- FALSE
return_lik_df = TRUE
return_type = "first_df"
#print("GOTHERE")
# print(fixed_values)
}
if(return_type == "first_df") {
return_first <- TRUE
return_first_df <- TRUE
} else {
return_first_df <- FALSE
}
if(return_type == "precalc_df") {
return_first <- TRUE
return_lik_df <- FALSE
return_first_df <- TRUE
}
#print(return_lik_df)
start_vals = vector()
prep_out <- list()
newDataList <- list()
all_defined_vars <- character()
undefined_vars <- character()
for(dataName in names(dataList)) {
rel_col <- vector()
thisControl <- controlList[[dataName]]
data_df <- dataList[[dataName]]
orig_cols <- colnames(data_df)
valueVar <- NA
defined_vars <- colnames(data_df)
local_vars <- character()
for(formula in thisControl$formulas) {
targetName <- as.character(formula[[2]])
rel_col <- c(rel_col, all.vars(formula)[!all.vars(formula) %in% rel_col])
if(is.na(valueVar)) if(targetName %in% colnames(data_df) | (length(grep(paste0(targetName, "\\."), colnames(data_df), value = TRUE)) == 2)) valueVar <- targetName
rhs_vars <- all.vars(formula[[3]])
undefined_vars <- c(undefined_vars, rhs_vars[!rhs_vars %in% defined_vars])
all_defined_vars <- c(all_defined_vars, undefined_vars[!undefined_vars %in% all_defined_vars])
if(!targetName %in% c(colnames(data_df), valueVar)) defined_vars <- c(defined_vars, targetName)
local_vars <- c(local_vars, all.vars(formula), targetName)
}
local_vars <- local_vars[!local_vars %in% colnames(data_df)]
#print(local_vars)
controlList[[dataName]][["defined_vars"]] <- local_vars
censor_bounds <- thisControl$censor_bounds
#print(colnames(data_df))
#print(valueVar)
grep_vals <- grep(paste0(valueVar, "\\."), colnames(data_df), value = TRUE)
if(length(grep_vals) > 2) stop("More than two columns matching target variable.")
if(length(grep_vals ) == 2) {
# Two values identified. Check if one is allways >= the other
if(all((data_df[,grep_vals[1]]-data_df[,grep_vals[2]])>=0)) {
upper_bound_var <- grep_vals[1]
lower_bound_var <- grep_vals[2]
} else if(all((data_df[,grep_vals[1]]-data_df[,grep_vals[2]])<=0)) {
upper_bound_var <- grep_vals[2]
lower_bound_var <- grep_vals[1]
} else stop(paste0("Two columns matched the left-hand side: ", grep_vals[1], " and ", grep_vals[2], ". However, neither was allways >= the other, so they cannot be used as upper/lower bounds. Quitting."))
} else {
if(!valueVar %in% colnames(data_df)) {
stop("No value columns specified. ")
} else {
upper_bound_var <- lower_bound_var <- valueVar
}
}
controlList[[dataName]]$upper_bound_var <- upper_bound_var
controlList[[dataName]]$lower_bound_var <- lower_bound_var
controlList[[dataName]]$valueVar <- valueVar
data_df[, c("internal_ub", "internal_lb")] <- data_df[, c(upper_bound_var, lower_bound_var)]
formula[[2]] <- substitute(value)
e_params <- list(...)
if(length(latent_id_colname)) {
if(latent_id_colname %in% colnames(data_df)) data_df$internal_classgroup <- data_df[, latent_id_colname]
}
else data_df$internal_classgroup <- paste0(dataName, "." , 1:NROW(data_df))
data_df <- within(data_df, {
both_na <- is.na(internal_ub) * is.na(internal_lb)
internal_ub[internal_ub >= max(thisControl$censor_bounds)] <- Inf
internal_lb[internal_lb <= min(thisControl$censor_bounds)] <- -Inf
internal_lb[internal_lb >= max(thisControl$censor_bounds)] <- max(thisControl$censor_bounds)
internal_ub[internal_ub <= min(thisControl$censor_bounds)] <- min(thisControl$censor_bounds)
internal_ub[is.na(internal_ub)] <- Inf
internal_lb[is.na(internal_lb)] <- -Inf
internal_type <- rep(3, NROW(internal_ub))
# print(rbind(int_type = internal_type, int_lb = internal_lb, int_ub = internal_ub))
internal_type[internal_ub == internal_lb] <- 1
# print(internal_ub == internal_lb)
# print(rbind(int_type = internal_type, int_lb = internal_lb, int_ub = internal_ub))
internal_type[internal_ub == Inf] <- 0
# print(rbind(int_type = internal_type, int_lb = internal_lb, int_ub = internal_ub))
internal_type[internal_lb == -Inf] <- 2
# print(rbind(int_type = internal_type, int_lb = internal_lb, int_ub = internal_ub))
internal_count <- 1
})
#print(data_df)
if(sum(data_df$both_na)) {
warning(paste0(sum(data_df$both_na)," rows in which ", valueVar, " was NA were removed."))
data_df <- data_df[!data_df$both_na, -(NCOL(data_df))]
}
if(!is.na((weights_var <- thisControl$weights_var)) & weights_var %in% colnames(data_df)) data_df$internal_count <- data_df[, weights_var]
relevant_columns <- c(rel_col, colnames(data_df)[grep("internal_", colnames(data_df))])
if(aggr) {
new_df <- data_df[, colnames(data_df) %in% relevant_columns]
new_df$internal_unique <- as.numeric(as.factor(apply(new_df, MARGIN = 1, FUN = "paste", collapse = "")))
unique_df <- unique(new_df)
unique_df$internal_count <- aggregate(new_df$internal_count, by = list(new_df$internal_unique), FUN = "sum")[unique_df$internal_unique, 2]
newDataList[[dataName]] <- unique_df
} else {
newDataList[[dataName]] <- data_df
}
controlList[[dataName]]$orig_cols <- orig_cols
startValues <- c(startValues, thisControl$start_values[!names(thisControl$start_values) %in% names(startValues)])
fixedValues <- c(fixedValues, thisControl$fixed_values[!names(thisControl$fixed_values) %in% names(fixedValues)])
all_defined_vars <- c(all_defined_vars, defined_vars[!defined_vars %in% all_defined_vars])
controlList[[dataName]]$obs_types <- c(Uncensored = sum(data_df$internal_count * (data_df$internal_ub == data_df$internal_lb)),
'Left censored' = sum(data_df$internal_count * (data_df$internal_ub > data_df$internal_lb) * (data_df$internal_ub < Inf) * (data_df$internal_lb == -Inf)),
'Right censored'= sum(data_df$internal_count * (data_df$internal_ub > data_df$internal_lb) * (data_df$internal_lb > -Inf) * (data_df$internal_ub == Inf)),
'Intervals'= sum(data_df$internal_count * (data_df$internal_ub > data_df$internal_lb) * (data_df$internal_lb > -Inf) * (data_df$internal_ub < Inf))
)
}
if(length(fixedValues)) {
fixed_values[names(fixedValues)] <- fixedValues
tmp <- rep(NA, NROW(fixed_values))
names(tmp) <- names(fixed_values)
tmp[rownames(fixed_df)] <- fixed_df[,2]
fixed_df <- data.frame(Estimate = fixed_values, 'Std. Error' = tmp)
colnames(fixed_df) <- c('Estimate', 'Std. Error')
}
startValues[names(fixed_values)[names(fixed_values) %in% names(startValues)]] <- fixed_values[names(fixed_values) %in% names(startValues)]
undefined_vars <- unique(undefined_vars[!undefined_vars %in% c(names(startValues), latent_class_parameters)])
undefined_vars <- undefined_vars[!undefined_vars %in% c("Xb", names(fixedValues))]
if(length(undefined_vars)) {
tmp <- paste(undefined_vars, collapse = ", ")
used_from <- "xreg"
if("run_from" %in% names(dot_args)) used_from <- dot_args[["run_from"]]
if(!used_from == "hyreg") warning(paste0("Used formulas contained variables not found in start_values or column names (", tmp,"). These will be initiated with a starting value of 0.1. Defining start values either in xregControl or in the call to xreg is highly recommended."))
tmp <- rep(0.1, length(undefined_vars))
names(tmp) <- undefined_vars
startValues <- c(startValues, tmp)
}
if(latent_classes > 1) {
for(lc in latent_class_parameters) {
tmp <- 0.1
names(tmp) <- lc
if(lc %in% names(startValues)) {
tmp <- startValues[lc]
startValues <- startValues[-which(names(startValues) == lc)]
all_defined_vars <- all_defined_vars[-which(all_defined_vars== lc)]
}
for(i in 1:latent_classes) {
lci <- paste(lc, i, sep = ".")
if(!lci %in% names(startValues)) {
startValues <- c(startValues, tmp)
names(startValues)[length(startValues)] <- lci
}
if(!lci %in% all_defined_vars) {
all_defined_vars <- c(all_defined_vars, lci)
}
}
}
}
if(length((tmp <- names(startValues[!names(startValues) %in% all_defined_vars])))) {
startValues <- startValues[!names(startValues) %in% tmp]
tmp <- paste(tmp, collapse = ", ")
warning(paste0("Start values defined for variables (", tmp, ") not found in column names or formulas. These have been removed."))
}
if("optim_control" %in% names(e_params)){
control <- e_params[["optim_control"]]
dot_args['optim_control'] <- NULL
} else {
boxconstr <- F
if("method" %in% names(list(...))) {
meth <- list(...)[["method"]]
if(meth == "L-BFGS-B") boxconstr <- T
}
# if(boxconstr) control <- list(maxit = 10000, factr = .Machine$double.eps)
# else control <- list(maxit = 10000, abstol = .Machine$double.eps, reltol = .Machine$double.eps)
if(boxconstr) control <- list(maxit = 10000, factr = 1e-11)
else control <- list(maxit = 10000, abstol = 1e-9, reltol = 1e-11)
#control <- list(maxit = 5, abstol = .Machine$double.eps, reltol = .Machine$double.eps)
}
startValues[names(fixed_values)] <- fixed_values
# print(names(newDataList[[1]]))
if(return_first) {
ret <- do.call(lik_fun, c(startValues,
list(
controlList = controlList,
dataList = newDataList,
print_sum = F,
return_lik_df = return_lik_df,
return_first_df = return_first_df,
latent_classes = latent_classes,
latent_class_parameters = latent_class_parameters,
dot_args)))
for(nm in names(ret)) {
#ret[[nm]] <- ret[[nm]][, c(orig_cols, "Xb", "p")]
}
return(ret)
}
testmle <- do.call(x_mle, c(list(minuslogl = lik_fun,
start = as.list(startValues),
controlList = controlList,
dataList = newDataList,
control = control,
print_sum = print_sum,
fixed = as.list(fixed_values),
latent_classes = latent_classes,
latent_class_parameters = latent_class_parameters),
dot_args))
dfs <- do.call(lik_fun, c(c(testmle$coef,fixed_values),
list(
controlList = controlList,
dataList = newDataList,
print_sum = F,
return_lik_df = T,
return_first_df = T,
latent_classes = latent_classes,
latent_class_parameters = latent_class_parameters,
dot_args)))
pars <-cbind(testmle$fullcoef, type = rep("Fitted", NROW(testmle$fullcoef)))
testmle$fixed_values <- fixed_df
testmle$fixed_values <- fixed_df
pars <- rbind(pars, cbind(fixed_df, type = rep("Fixed", NROW(fixed_df))))
res_types <- list()
# res_types["p_sum"] <- numeric()
# res_types["counts"] <- numeric()
pars[, paste0("n_", names(dfs))] <- 0
obs_types <- as.data.frame(matrix(rep(0, length(names(dfs))*4), nrow = 4))
colnames(obs_types) <- names(dfs)
rownames(obs_types) <- c("Uncensored", "Left-censored", "Right-censored", "Intervals")
for(nm in names(dfs)) {
res_types[["p_sum"]][nm] <- sum(controlList[[dataName]]$p_aggregation_fun(dfs[[nm]]))
pars[rownames(pars) %in% controlList[[nm]]$defined_vars,paste0("n_", nm)] <- res_types[["counts"]][nm] <- sum(dfs[[nm]]$internal_count)
obs_types[,nm] <- controlList[[nm]]$obs_types
}
obs_types$total <- rowSums(obs_types)
pars[, "n_sum"] <- rowSums(pars[, paste0("n_", names(dfs)), F])
pars[, 't value'] <- abs(pars$Estimate)/pars$'Std. Error'
pars[,'Pr(>|t|)'] <- 2*pt(pars$'t value', df = pars$n_sum-1, lower.tail = F)
testmle$pars <- pars
testmle$pars_v <- pars[,1]
testmle$coef <- testmle$pars[,1]
names(testmle$coef) <- rownames(testmle$pars)
res_types[["counts"]][["total_count"]] <- sum(res_types[["counts"]])
res_types[["p_sum"]]["total"] <- sum(res_types[["p_sum"]])
testmle$minima <- res_types$p_sum
testmle$obs_types <- obs_types
testmle$info <- rbind(obs_types, total_obs = res_types$counts, minima = res_types$p_sum, logLik = -res_types$p_sum)
names(testmle$pars_v) <- rownames(pars)
res <- list(controlList = controlList,
start_values = start_values,
mle_obj = testmle,
coef = testmle$coef,
coefficients = testmle$coef,
full_coef = testmle$pars,
valueVar = valueVar,
fixed_values = fixed_df,
minima = res_types[["p_sum"]],
counts = res_types[["counts"]],
obs_types = obs_types,
pars = pars,
pars_v = testmle$pars_v)
class(res) <- c("xreg", "list")
return(res)
}
#' @export
summary.xreg <- function(x, ...) summary(x$mle_obj)
#' @export
print.xreg <- function(x, ...) print(x$mle_obj)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.