# Utils
# Generics ------------------
#' Return number of cells for the object
#'
#' Returns the number of cells for the an object
#'
#' @param obj Object
#'
#' @return Number of cells in partition (at least 1)
#' @export
num_cells <- function(obj) {
UseMethod("num_cells", obj)
}
#' Get descriptive data.frame
#'
#' Get information for each cell
#'
#' @param obj partition object
#' @param cont_bounds_inf Should "Inf" be used for continuous bounds (otherwise
#' the bounds from X_range)
#' @param do_str If True, use a string like "(a, b]", otherwise have two
#' separate columns with a and b
#' @param drop_unsplit If True, drop columns for variables overwhich the
#' partition did not split
#' @param digits digits Option (default is NULL)
#' @param unsplit_cat_star Should unsplit categorical variables be listed as
#' "*", otherwise all factor labels will be used.
#' @param ... Additional arguments.
#'
#' @return data.frame with columns: partitioning columns
#' @export
get_desc_df <- function(obj, cont_bounds_inf=TRUE, do_str=FALSE, drop_unsplit=FALSE,
digits=NULL, unsplit_cat_star=TRUE, ...) {
UseMethod("get_desc_df", obj)
}
# General Utils ----------------
#handles vectors and 2D structures
row_sample <- function(data, sample) {
if(is.null(data)) return(NULL)
if(is.null(ncol(data)))
return(data[sample])
return(data[sample, , drop=FALSE])
}
is_vec <- function(X) {
return(is.null(ncol(X)) || ncol(X)==1)
}
get_dim_cat <- function(X) {
if(is.data.frame(X)) {
return(sapply(X, is.factor) & !sapply(X, is.ordered))
}
return(rep(F, ncol(X)))
}
update_names <- function(X) {
if(is.null(colnames(X))){
colnames(X) = paste("X", 1:ncol(X), sep="")
}
return(X)
}
#Note that the verbosity passed in here could be different than a member of the params
my_apply <- function(X, fn_k, apply_verbosity, pr_cl, params) {
K = length(X)
if(requireNamespace("pbapply", quietly = TRUE) & (apply_verbosity>0) & (is.null(pr_cl) || length(pr_cl)<K)) {
rets = do.call(pbapply::pblapply, c(list(X, fn_k), params, list(cl=pr_cl)))
}
else if(!is.null(pr_cl)) {
rets = do.call(parallel::parLapply, c(list(pr_cl, X, fn_k), params))
}
else {
rets = do.call(lapply, c(list(X, fn_k), params))
}
return(rets)
}
replace_k_factor <- function(base_facts, k, new_fact) {
base_facts[[k]] = new_fact
return(base_facts)
}
is_factor_dim_k <- function(X, k) {
return(is.factor(X[, k]))
}
# Standard way to check if vector is constant is const_vectr(), but is O(n).
# Checking element-by-element would often be faster, but this is inefficient in R
# and faster in C. const_vect1() and const_vect2() were two versions (first using
# 'inline', second just Rcpp), but couldn't get to work in building a package.
# The Rcpp version is now in a separate file.
# const_vectr <- function(x) {
# if(length(x)==0) return(TRUE)
# r = range(x)
# return(r[1]==r[2])
# }
# Fold utils --------------------------
gen_folds <- function(y, nfolds) {
assert_that(nfolds>1, msg="Need nfolds>1")
idxOut_new = caret::createFolds(y, k=nfolds, list=TRUE)
idx_new = list()
for(f in 1:nfolds) {
idx_new[[f]] = sort(unlist(idxOut_new[-f], use.names=FALSE))
}
foldids = foldlists_to_foldids(idxOut_new)
return(list(foldids=foldids, index=idx_new, indexOut=idxOut_new))
}
factor_from_idxs <-function(N, nfolds, indexOut) {
folds = vector("numeric", N)
for(f in 1:nfolds) {
folds[indexOut[[f]]] = f
}
folds_f = as.factor(folds)
return(folds_f)
}
foldlists_to_foldids <- function(indexOut) {
nfolds = length(indexOut)
N = sum(sapply(indexOut, length))
foldids = rep(0, N)
for(f in 1:nfolds) {
foldids[indexOut[[f]]] = f
}
return(foldids)
}
foldids_to_foldlists <- function(foldids, nfolds) {
index = list()
indexOut = list()
for(f in 1:nfolds){
index[[f]] = which(foldids!=f)
indexOut[[f]] = which(foldids==f)
}
return(list(index=index, indexOut=indexOut))
}
DS.SINGLE = 0
DS.MULTI_SAMPLE = 1
DS.MULTI_D = 2
DS.MULTI_Y = 3
expand_fold_info <- function(y, cv_folds, m_mode=DS.SINGLE) {
if(length(cv_folds)==1) {
nfolds = cv_folds
folds_ret = gen_folds_m(y, nfolds, m_mode)
foldids = foldlists_to_foldids_m(folds_ret, m_mode)
}
else {
foldids = cv_folds
nfolds = if(m_mode!=DS.MULTI_SAMPLE) max(foldids) else max(foldids[[1]])
folds_ret = foldids_to_foldlists_m(foldids, nfolds, m_mode)
}
return(list(nfolds, folds_ret, foldids))
}
expand_bump_samples <- function(bump_samples, bump_ratio, N, m_mode) {
if(length(bump_samples)==1) {
bump_B = bump_samples
bump_samples <- lapply(seq_len(bump_B), function(b){sample_m(bump_ratio, N, m_mode==DS.MULTI_SAMPLE)})
}
return(bump_samples)
}
# Multi-sample utils ----------------------
is_sep_sample <- function(X) {
return(is.list(X) & !is.data.frame(X))
}
is_sep_estimators <- function(m_mode) {
return(m_mode==DS.MULTI_SAMPLE || m_mode==DS.MULTI_Y)
}
sum_m <- function(data, M_mult) {
if(!M_mult) return(sum(data))
return(sapply(data, sum))
}
ensure_good_X <- function(X) {
if(is_sep_sample(X)) {
return(lapply(X, ensure_good_X))
}
if (is.matrix(X)) {
are_equal(mode(X), "numeric")
}
else {
assert_that(is.data.frame(X), msg="X is not a matrix or data.frame")
if (inherits(X, "tbl")) X <- as.data.frame(X) # tibble's return tibble (rather than vector) for X[,k], making is.factor(X[,k]) and others fail. Could switch to doing X[[k]] for df-like objects
for (k in seq_len(ncol(X))) are_equal(mode(X[[k]]), "numeric")
}
assert_that(ncol(X) >= 1, msg="X has no columns.")
return(X)
}
get_sample_type <- function(y, X, d=NULL, checks=FALSE) {
if(is_sep_sample(X)) { #Different samples
m_mode=DS.MULTI_SAMPLE
M = length(X)
N = sapply(X, nrow)
K = ncol(X[[1]])
if(checks) {
check_list_dims <- function(new_type) {
assert_that(is.list(new_type), length(new_type)==M, msg="Separate samples, but aux param isn't list of same length")
for(m in 1:M) assert_that(length(new_type[[m]])==N[[m]], msg="Separate samples, but aux param's list elements aren't the right length.")
}
check_list_dims(y)
if(!is.null(d)) check_list_dims(d)
for(m in 1:M) {
assert_that(ncol(X[[m]])==K, msg="Separate samples, but X's don't all have the same number of columns.")
}
}
}
else { #Same sample
N = nrow(X)
K = ncol(X)
if(!is.null(d) && is.matrix(d) && ncol(d)>1) {
m_mode= DS.MULTI_D
M = ncol(d)
if(checks){
assert_that(!inherits(d, "tbl"), msg="d not allowed to be a tibble") #TODO: Could silently conver
assert_that(nrow(d)==N, length(y)==N, msg="d and N don't have the right number of rows.")
}
}
else if(!is.null(d) && is.matrix(y) && ncol(y)>1) {
m_mode= DS.MULTI_Y
M = ncol(y)
N = nrow(X)
if(checks){
assert_that(!inherits(y, "tbl"), msg="d not allowed to be a tibble") #TODO: Could silently conver
assert_that(is.null(d) || length(d)==N, nrow(y)==N, msg="d and N don't have the right number of rows.")
}
}
else {
m_mode= DS.SINGLE
M=1
if(checks)
assert_that(is.null(d) || length(d)==N, length(y)==N, msg="d and N don't have the right number of rows.")
}
if(M>1) N= rep(N, M)
}
return(list(M, m_mode, N, K))
}
check_M_K <- function(M, m_mode, K, X_aux, d_aux) {
if(m_mode==DS.MULTI_SAMPLE) {
assert_that(length(X_aux)==M, is.null(d_aux) || length(d_aux)==M, msg="Separate samples, but X_aux or d_aux don't have the right structure.")
for(m in 1:M) assert_that(ncol(X_aux[[m]])==K, msg="Separate samples, but an element of X_aux doesn't have the right number of columns.")
}
else {
assert_that(ncol(X_aux)==K, msg="X_aux doesn't have the right number of columns.")
if(m_mode==DS.MULTI_D) assert_that(ncol(d_aux)==M, msg="MULTI_D case but not the right number of cols in d.")
}
}
# Multi-sample wrappers --------------------
valid_partition_m <- function(M_mult, valid_fn, cell_factor, d, cell_factor_aux=NULL, d_aux=NULL, min_size=0) {
if(M_mult) {
for(m in 1:length(cell_factor)) {
v_ret = valid_fn(cell_factor[[m]], d[[m]], cell_factor_aux[[m]], d_aux[[m]], min_size)
if(v_ret$fail) return(v_ret)
}
return(list(fail=FALSE))
}
return(valid_fn(cell_factor, d, cell_factor_aux, d_aux, min_size))
}
#length of N is M (so even if same sample)
nrow_m <- function(X, M) {
if(is_sep_sample(X)) {
return(sapply(X, nrow))
}
N = nrow(X)
if(M==1) return(N)
return(rep(N, M))
}
# Return M-list if mode_m==1 else sample
sample_m <- function(ratio, N, M_mult) {
if(!M_mult) {
if(length(N)>1) N=N[1] #for modes 2 & 3
return(sample(N, N*ratio, replace=TRUE))
}
return(lapply(N, function(N_s) sample(N_s, N_s*ratio, replace=TRUE)))
}
#assumes separate samples if m_mode==DS.MULTI_SAMPLE
subsample_m <- function(y, X, d, sample) {
M_mult = is_sep_sample(X)
if(!M_mult) {
return(list(row_sample(y,sample), X[sample,,drop=FALSE], row_sample(d,sample)))
}
return(list(mapply(function(y_s, sample_s) y_s[sample_s], y, sample, SIMPLIFY=FALSE),
mapply(function(X_s, sample_s) X_s[sample_s,,drop=FALSE], X, sample, SIMPLIFY=FALSE),
mapply(function(d_s, sample_s) d_s[sample_s], d, sample, SIMPLIFY=FALSE)))
}
gen_split_m <- function(N, tr_split, M_mult) {
if(!M_mult) {
if(length(N>1)) N=N[1] # mode 2 & 3
return(base::sample(N, tr_split*N))
}
return(lapply(N, function(n) base::sample(n, tr_split*n)))
}
split_sample <- function(y, X, d, index_tr) {
list[y_tr, y_es] = list(row_sample(y, index_tr), row_sample(y, -index_tr))
list[d_tr, d_es] = list(row_sample(d, index_tr), row_sample(d, -index_tr))
X_tr = X[index_tr, , drop=FALSE]
X_es = X[-index_tr, , drop=FALSE]
N_est = nrow(X_es)
return(list(y_tr, y_es, X_tr, X_es, d_tr, d_es, N_est))
}
split_sample_m <- function(y, X, d, index_tr) {
if(!is_sep_sample(X)) {
return(split_sample(y, X, d, index_tr))
}
else {
y_tr = y_es = X_tr = X_es = d_tr = d_es = list()
N_est = rep(0, length(X))
for(m in 1:length(X))
list[y_tr[[m]], y_es[[m]], X_tr[[m]], X_es[[m]], d_tr[[m]], d_es[[m]], N_est[m]] = split_sample(y[[m]], X[[m]], d[[m]], index_tr[[m]])
N_est = sapply(X_es, nrow)
}
return(list(y_tr, y_es, X_tr, X_es, d_tr, d_es, N_est))
}
gen_folds_m <-function(y, nfolds, m_mode) {
if(m_mode!=DS.MULTI_SAMPLE) {
if(is.list(y)) y = y[[1]]
if(is.matrix(y)) y = y[,1]
return(gen_folds(y, nfolds))
}
M = length(y)
return(lapply(1:M, function(m) gen_folds(y[[m]], nfolds)))
}
foldlists_to_foldids_m <- function(folds_ret, m_mode) {
if(m_mode!=DS.MULTI_SAMPLE) return(foldlists_to_foldids(folds_ret$indexOut))
return(lapply(folds_ret, function(f_ret) foldlists_to_foldids(f_ret$indexOut)))
}
foldids_to_foldlists_m <- function(foldids, nfolds, m_mode) {
if(m_mode!=DS.MULTI_SAMPLE) return(foldids_to_foldlists(foldids, nfolds))
return(lapply(foldids, function(f_ids) foldids_to_foldlists(f_ids, nfolds)))
}
split_sample_folds_m <- function(y, X, d, folds_ret, f) {
if(!is_sep_sample(X)) {
list[y_f_tr, y_f_cv] = list(row_sample(y, folds_ret$index[[f]]), row_sample(y, folds_ret$indexOut[[f]]))
list[d_f_tr, d_f_cv] = list(row_sample(d, folds_ret$index[[f]]), row_sample(d, folds_ret$indexOut[[f]]))
X_f_tr = X[folds_ret$index[[f]], , drop=FALSE]
X_f_cv = X[folds_ret$indexOut[[f]], , drop=FALSE]
}
else {
y_f_tr = y_f_cv = X_f_tr = X_f_cv = d_f_tr = d_f_cv = list()
for(m in 1:length(X))
list[y_f_tr[[m]], y_f_cv[[m]], X_f_tr[[m]], X_f_cv[[m]], d_f_tr[[m]], d_f_cv[[m]]] = split_sample_folds_m(y[[m]], X[[m]], d[[m]], folds_ret[[m]], f)
}
return(list(y_f_tr, y_f_cv, X_f_tr, X_f_cv, d_f_tr, d_f_cv))
}
fit_and_residualize_m <- function(est_plan, X_tr, y_tr, d_tr, cv_folds, y_es, X_es, d_es, m_mode, M, verbosity, dim_cat) {
if(!is_sep_estimators(m_mode))
return(fit_and_residualize(est_plan, X_tr, y_tr, d_tr, cv_folds, y_es, X_es, d_es, verbosity, dim_cat))
if(m_mode==DS.MULTI_SAMPLE) {
for(m in 1:M)
list[est_plan[[m]], y_tr[[m]], d_tr[[m]], y_es[[m]], d_es[[m]]] = fit_and_residualize(est_plan[[m]], X_tr[[m]], y_tr[[m]], d_tr[[m]], cv_folds[[m]], y_es[[m]], X_es[[m]], d_es[[m]], verbosity, dim_cat)
return(list(est_plan, y_tr, d_tr, y_es, d_es))
}
#m_mode==DS.MULTI_Y
#We overwrite the d's
for(m in 1:M)
list[est_plan[[m]], y_tr[,m], d_tr, y_es[,m], d_es] = fit_and_residualize(est_plan[[m]], X_tr, y_tr[,m], d_tr, cv_folds, y_es[,m], X_es, d_es, verbosity, dim_cat)
return(list(est_plan, y_tr, d_tr, y_es, d_es))
}
Param_Est_m <- function(est_plan, y_cell, d_cell, X_cell, sample=sample, ret_var=FALSE, m_mode) {
if(!is_sep_estimators(m_mode)) { #single estimation
return(est_params(est_plan, y_cell, d_cell, X_cell, sample=sample, ret_var))
}
if(m_mode==DS.MULTI_SAMPLE){
if(ret_var) {
rets = mapply(function(est_plan_s, y_cell_s, d_cell_s, X_cell_s)
unlist(est_params(est_plan_s, y_cell_s, d_cell_s, X_cell_s, sample=sample, ret_var)),
est_plan, y_cell, d_cell, X_cell, SIMPLIFY = TRUE)
return(list(param_ests=rets[1,], var_ests=rets[2,]))
}
rets = mapply(function(est_plan_s, y_cell_s, d_cell_s, X_cell_s)
est_params(est_plan_s, y_cell_s, d_cell_s, X_cell_s, sample=sample, ret_var)[[1]],
est_plan, y_cell, d_cell, X_cell, SIMPLIFY = TRUE)
return(list(param_ests = rets))
}
#m_mode==DS.MULTI_Y
M = ncol(y_cell)
if(ret_var) {
rets = sapply(1:M, function(m) unlist(est_params(est_plan[[m]], y_cell[,m], d_cell, X_cell, sample=sample, ret_var)))
return(list(param_ests=rets[1,], var_ests=rets[2,]))
}
rets = sapply(1:M, function(m) est_params(est_plan[[m]], y_cell[,m], d_cell, X_cell, sample=sample, ret_var)[[1]])
return(list(param_ests = rets))
}
interaction_m <- function(facts, M_mult=FALSE, drop=FALSE) {
if(!M_mult) {
return(interaction(facts, drop=drop))
}
return(lapply(facts, function(f) interaction(f, drop=drop)))
}
interaction2_m <- function(f1, f2, M_mult=FALSE, drop=FALSE) {
if(!M_mult) {
return(interaction(f1, f2, drop=drop))
}
return(mapply(function(f1_s, f2_s) interaction(f1_s, f2_s, drop=drop), f1, f2, SIMPLIFY=FALSE))
}
gen_holdout_interaction_m <- function(factors_by_dim, k, M_mult) {
if(!M_mult)
return(gen_holdout_interaction(factors_by_dim, k))
return(lapply(factors_by_dim, function(f_by_dim) gen_holdout_interaction(f_by_dim, k)))
}
is_factor_dim_k_m <- function(X, k, M_mult) {
if(!M_mult)
return(is_factor_dim_k(X, k))
return(is_factor_dim_k(X[[1]], k))
}
droplevels_m <- function(factor, M_mult) {
if(!M_mult) return(droplevels(factor))
return(lapply(factor, droplevels))
}
apply_mask_m <- function(data, mask, M_mult) {
if(is.null(data)) return(NULL)
if(!M_mult) return(row_sample(data, mask))
return(mapply(function(data_s, mask_s) row_sample(data_s, mask_s), data, mask, SIMPLIFY=FALSE))
}
any_const_m <- function(d_shifted, shifted_cell_factor_nk, m_mode) {
if(m_mode==DS.SINGLE || m_mode==DS.MULTI_Y)
return(any(by(d_shifted, shifted_cell_factor_nk, FUN=const_vect)))
if(m_mode==DS.MULTI_SAMPLE)
return( any(mapply(function(d_shifted_s, shifted_cell_factor_nk_s)
any(by(d_shifted_s, shifted_cell_factor_nk_s, FUN=const_vect))
, d_shifted, shifted_cell_factor_nk )) )
#m_mode==DS.MULTI_D
return( any(apply(d_shifted, 2, function(d_shifted_s) any(by(d_shifted_s, shifted_cell_factor_nk, FUN=const_vect)) )) )
}
gen_cat_window_mask_m <- function(X, k, window) {
if(is.null(X)) return(NULL)
M_mult = is_sep_sample(X)
if(!M_mult) return(X[, k] %in% window)
return(lapply(X, function(X_s) X_s[, k] %in% window))
}
gen_cat_win_split_cond_m <- function(X, win_mask, k, win_split_val) {
M_mult = is_sep_sample(X)
if(!M_mult)
return(factor(X[win_mask, k] %in% win_split_val, levels=c(FALSE, TRUE)))
return(mapply(function(X_s, win_mask_s) factor(X_s[win_mask_s, k] %in% win_split_val, levels=c(FALSE, TRUE)), X, win_mask, SIMPLIFY=FALSE))
}
gen_cont_window_mask_m <- function(X, k, win_LB, win_UB) {
if(is.null(X)) return(NULL)
M_mult = is_sep_sample(X)
if(!M_mult) return(win_LB<X[, k] & X[, k]<=win_UB)
return(lapply(X, function(X_s) win_LB<X_s[, k] & X_s[, k]<=win_UB))
}
gen_cont_win_split_cond_m <- function(X, win_mask, k, X_k_cut) {
M_mult = is_sep_sample(X)
if(!M_mult)
return(factor(X[win_mask, k] <= X_k_cut, levels=c(FALSE, TRUE)))
return(mapply(function(X_s, win_mask_s) factor(X_s[win_mask_s, k] <= X_k_cut, levels=c(FALSE, TRUE)), X, win_mask, SIMPLIFY=FALSE))
}
replace_k_factor_m <- function(base_facts, k, new_fact, M_mult) {
if(!M_mult) {
return(replace_k_factor(base_facts, k, new_fact))
}
for(m in 1:length(base_facts)) {
base_facts[[m]][[k]] = new_fact[[m]]
}
return(base_facts)
}
update_names_m <- function(X) {
if(!is_sep_sample(X))
return(update_names(X))
for(m in 1:length(X)) {
X[[m]] = update_names(X[[m]])
}
return(X)
}
get_dim_cat_m <- function(X) {
if(!is_sep_sample(X))
return(get_dim_cat(X))
return(get_dim_cat(X[[1]]))
}
drop_col_k_m <- function(X, k) {
if(!is_sep_sample(X))
return(X[,-k, drop=FALSE])
for(m in 1:length(X)) {
X[[m]] = X[[m]][,-k, drop=FALSE]
}
return(X)
}
get_col_k_m <- function(X, k) {
if(!is_sep_sample(X))
return(X[,k, drop=FALSE])
for(m in 1:length(X)) {
X[[m]] = X[[m]][,k, drop=FALSE]
}
return(X)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.