# ---------------------------
# generate_analysis.R
# Generate seed_results for
# all method conditions,
# etc.
# ---------------------------
# Necessary functions
# ---------------------------
# ---------------------------
# Need to have UnprotNum, ProtNum, InfNum, and UninfNum defined globally
# With the supplied method and its options:
# 1. Do your method on varying efficacies, at various degrees of distinction
# 2. Save those results (performed on Inf/Uninf)
# Only save the data that you need:
# 1. MCCs
# 2. p-values
# 3. effect sizes
# 4. new group labels
#' efficacy_distance
#'
#' @export
efficacy_distance = function(model_data, nvars, dists, effs, cust_method, wrap_method_params, method_params, score_types, label_types, seed){
train_data = model_data[,colnames(model_data) != "prot_groups"]
colnames(train_data)[colnames(train_data) == "inf_groups"] = "groups"
protect_data = model_data[,colnames(model_data) != "inf_groups"]
colnames(protect_data)[colnames(protect_data) == "prot_groups"] = "groups"
if (!is.null(globals_list$par_effs) && globals_list$par_effs){
eff_results_list = foreach(eff=effs, .inorder=FALSE, .export=c("globals_list"), .packages=c("ksmthesis")) %dopar% {
dists_results = lapply(dists, generate_dist_results, eff, nvars,
train_data, protect_data, cust_method,
wrap_method_params, method_params, seed)
dists_frame = do.call(rbind, lapply(dists_results, generate_dist_frames,
nvars, score_types, label_types))
dists_new_groups = do.call(rbind, lapply(dists_results, generate_dist_new_groups))
gc()
return(list(dists_frame=dists_frame, dists_new_groups=dists_new_groups))
}
} else {
eff_results_list = lapply(effs, function(eff, dists, nvars, train_data,
protect_data, cust_method,
wrap_method_params, method_params,
score_types, label_types, seed){
dists_results = lapply(dists, generate_dist_results, eff, nvars,
train_data, protect_data, cust_method,
wrap_method_params, method_params, seed)
dists_frame = do.call(rbind, lapply(dists_results, generate_dist_frames,
nvars, score_types, label_types))
dists_new_groups = do.call(rbind, lapply(dists_results, generate_dist_new_groups))
gc()
return(list(dists_frame=dists_frame, dists_new_groups=dists_new_groups))
}, dists, nvars, train_data, protect_data, cust_method, wrap_method_params,
method_params, score_types, label_types, seed)
}
effs_frame = lapply(eff_results_list, function(x){ return(x[["dists_frame"]]) })
effs_frame = do.call(rbind, effs_frame)
effs_new_groups = lapply(eff_results_list, function(x){ return(x[["dists_new_groups"]]) })
effs_new_groups = do.call(rbind, effs_new_groups)
gc()
return(list(effs_frame=effs_frame, effs_new_groups=effs_new_groups))
}
#' generate_dist_results
#'
#' @export
generate_dist_results = function(dist, eff, nvars, train_data, protect_data, cust_method, wrap_method_params, method_params, seed){
model_train = train_data[train_data[,"eff"] == eff,]
model_train = model_train[model_train[,"dist"] == dist,]
model_train = as.matrix(model_train[,colnames(model_train) != "eff"])
model_train = as.matrix(model_train[,colnames(model_train) != "dist"])
model_protect = protect_data[protect_data[,"eff"] == eff,]
model_protect = model_protect[model_protect[,"dist"] == dist,]
model_protect = as.matrix(model_protect[,colnames(model_protect) != "eff"])
model_protect = as.matrix(model_protect[,colnames(model_protect) != "dist"])
results = wrap_method(cust_method, wrap_method_params, nvars,
model_train, method_params, model_protect,
seed)
results$dist = dist
results$eff = eff
return(results)
}
#' generate_dist_frames
#'
#' @export
generate_dist_frames = function(dist_results, nvars, score_types, label_types){
# NOTE: jank
# Does not allow for (totally) arbitrary names
colheaders = c(paste("score", score_types, sep='_'),
paste("p_value", label_types, sep='_'),
paste("effect_size", label_types, sep='_'),
"dist", "eff", "component")
stats_frame = lapply(1:nvars, function(i, dist_results){
return(c(dist_results$scores, dist_results$test_p_values[i,],
dist_results$test_effect_sizes[i,], dist_results$dist,
dist_results$eff, i))
}, dist_results)
stats_frame = do.call(rbind, stats_frame)
colnames(stats_frame) = colheaders
return(stats_frame)
}
#' generate_dist_new_groups
#'
#' @export
generate_dist_new_groups = function(dist_results){
tmp = as.matrix(data.frame(new_groups=dist_results$data_sets$new[,"groups"],
prot_groups=dist_results$data_sets$true[,"groups"],
dist=dist_results$dist, eff=dist_results$eff))
tmp = apply(tmp, 2, as.numeric)
return(tmp)
}
# NOTE: cleanup
# ---------------------------
# METHOD WRAPPERS
# ---------------------------
# A method wrapper accepts:
# 1. train_data - Data without group identifiers
# 2. full_data - Data with infection status identifiers
# 3. method_params - Any parameters it requires, bundled in a list
# 4. protect_data - Data with protection status identifiers
# The parent wrapper is required to return:
# 1. The original data, the data with new labels, and the ground truth data
# 2. Scores comparing new assignments to old labels and to true labels
# 3. Test results for each pair of labels, for each component
# 4. Fold changes for each pair of labels, for each component
# A method wrapper is required to:
# 1. Parse its method_params list
# 2. Return:
# a. full_data as old_data (i.e. old/inf labels)
# b. train_data with the new labels as new_data (i.e. predicted status)
# ---------------------------
# Standardized interface to wrapper methods
#' wrap_method
#'
#' @export
wrap_method = function(cust_method, wrap_method_params, nvars, full_data, method_params, protect_data, seed, score_method=mcc_score){
# Remove group information
train_data = as.matrix(full_data[,colnames(full_data) != "groups"])
rownames(train_data) = 1:nrow(train_data)
# [[train_folds/test_folds]]
# [[repeat]]
# [[fold]]
# Generate positive training and unlabeled testing folds
# for One-SVM. Traditional way doesn't apply.
if (identical(cust_method, wrap_one_svm)){
positive_indices = full_data[,"groups"] == globals_list$InfNum
positive_y = full_data[,"groups"][positive_indices]
unlabeled_y = full_data[,"groups"][!positive_indices]
repeat_folds = generate_pu_folds(full_data[,"groups"], globals_list$cv_repeats,
globals_list$cv_folds, seed)
# Generate folds balanced for Inf/Uninf membership (which won't
# actually matter since they're always equal).
} else {
repeat_folds = generate_label_folds(full_data[,"groups"], globals_list$cv_repeats,
globals_list$cv_folds, seed)
}
# Require both since they are wrapper wrappers
if (identical(cust_method, wrap_pu_bagging) || identical(cust_method, wrap_pu_reliable_negative)){
dotboi = list(wrap_method_params=wrap_method_params,
method_params=method_params)
} else {
dotboi = list(method_params=method_params)
}
prot_groups = c(globals_list$UnprotNum, globals_list$ProtNum)
# NOTE: Can't predict using kmeans, and wrap_pu_bagging is already an ensemble,
# so just run them once.
if (identical(cust_method, wrap_kmeans) || identical(cust_method, wrap_pu_bagging)){
results_list = evaluate_method(cust_method, train_data, full_data[,"groups"],
protect_data, dotboi, score_method, prot_groups,
nvars, wrap_method_params)
# NOTE: One-SVM performs like shit when CV'd because MCC is
# supposed to blow up when one class isn't predicted.
# Will this be a problem for other methods?
} else {
results_list = evaluate_method_cv(repeat_folds, cust_method,
train_data, full_data[,"groups"],
protect_data, dotboi, score_method,
prot_groups, nvars, wrap_method_params)
}
data_sets = list(full_data, results_list$new_data, protect_data)
names(data_sets) = c("old", "new", "true")
results_list$data_sets = data_sets
results_list$new_data = NULL
gc()
# Contains:
# $data_sets, a list of data frames
# $scores, a vector of newold newtrue accuracy
# $test_p_values, a data frame of nrow nvars and ncol length(data_sets)
# $test_effect_sizes, a data frame of nrow nvars and ncol length(data_sets)
return(results_list)
}
#' evaluate_method
#'
#' @export
evaluate_method = function(cust_method, x, y, protect_data, dotboi, score_method, prot_groups, nvars, wrap_method_params){
model_results = cust_method(x, y, ...=dotboi)
return(generate_results_list(model_results, x, y, score_method,
prot_groups, protect_data, nvars,
wrap_method_params))
}
#' evaluate_method_cv
#'
#' @export
evaluate_method_cv = function(repeat_folds_list, cust_method, x, y, protect_data, dotboi, score_method, prot_groups, nvars, wrap_method_params){
# Iterate over train and test simultaneously
repeat_results = mapply(function(train_folds, test_folds, cust_method,
x, y, protect_data, dotboi,
score_method, prot_groups,
nvars, wrap_method_params){
fold_results = mapply(function(train_indices, test_indices,
cust_method, x, y,
protect_data, dotboi,
score_method, prot_groups,
nvars, wrap_method_params){
x_test = x[test_indices,]
# Used as part of "old_data" for wrap_test
y_test = y[test_indices]
# NOTE: for wrap_pu_reliable_negative to select best
# preds
dotboi$method_params$y_test = y_test
# TODO: check that this works correctly
if (identical(cust_method, wrap_svm)){
train_x = protect_data[train_indices,]
train_x = train_x[,!(colnames(train_x) == "groups")]
train_y = protect_data[,"groups"][train_indices]
} else {
train_x = x[train_indices,]
train_y = y[train_indices]
}
model_results = cust_method(train_x, train_y, x_test, ...=dotboi)
gc()
return(generate_results_list(model_results, x_test, y_test,
score_method, prot_groups,
protect_data[test_indices,],
nvars, wrap_method_params))
}, train_folds, test_folds, SIMPLIFY=FALSE,
MoreArgs=list(cust_method=cust_method, x=x, y=y,
protect_data=protect_data,
dotboi=dotboi, score_method=score_method,
prot_groups=prot_groups, nvars=nvars,
wrap_method_params=wrap_method_params))
# Average results over folds
aggregated_repeat_results = aggregate_cv_results(fold_results, x)
gc()
return(aggregated_repeat_results)
}, repeat_folds_list$train_folds, repeat_folds_list$test_folds, SIMPLIFY=FALSE,
MoreArgs=list(cust_method=cust_method, x=x, y=y, protect_data=protect_data,
dotboi=dotboi, score_method=score_method,
prot_groups=prot_groups, nvars=nvars,
wrap_method_params=wrap_method_params))
# Average results over repeats
aggregated_results = aggregate_cv_results(repeat_results, x)
gc()
return(aggregated_results)
}
# ---------------------------
# Requires:
# library(effsize)
#' wrap_test
#'
#' @export
wrap_test = function(data_sets, nvars, resnames, wrap_method_params){
p_values = matrix(NA, nrow=nvars, ncol=length(data_sets))
colnames(p_values) = resnames
effect_sizes = matrix(NA, nrow=nvars, ncol=length(data_sets))
colnames(effect_sizes) = resnames
wrap_test_params = wrap_method_params$wrap_test_params
test_params = wrap_method_params$test_params
if (is.null(test_params$alternative) || is.null(wrap_test_params$mu_scalar)){
warning("test_params$alternative, wrap_test_params$mu_scalar, or both were not set. Aborting...", call.=TRUE)
quit()
}
if (is.null(wrap_test_params$method)){
warning("wrap_test_params$method not set. Aborting...", call.=TRUE)
quit()
}
# NOTE: lapply to hellapply
for (i in 1:nvars){
for (j in 1:length(data_sets)){
measure = as.numeric(data_sets[[j]][,i])
group = data_sets[[j]][,"groups"]
group = as.character(group)
group = factor(group, levels=c(globals_list$UnprotNum, globals_list$ProtNum))
# mu is the scalar times the range of all the data
if (!is.null(wrap_test_params$scale_all_measures) && wrap_test_params$scale_all_measures){
test_params$mu = wrap_test_params$mu_scalar*(max(measure) - min(measure))
}
if (length(unique(group)) > 1){
test_params$x = measure[group == levels(group)[1]]
test_params$y = measure[group == levels(group)[2]]
# mu is the scalar times the range of one set of data
if (is.null(test_params$mu)){
test_params$mu = wrap_test_params$mu_scalar*(max(test_params$x) - min(test_params$y))
}
# NOTE:
# mu < 0 and alternative = "less" means you expect
# x will be shifted to the left of y by
# "less" than mu, where "less" is in the
# negative sense (i.e. abs(shift) > abs(mu),
# but shift < mu).
test_result = do.call(wrap_test_params$method, test_params)
p_values[i,j] = test_result$p.value
# NOTE: not well-behaved
# NOTE: save confidence interval?
effect_sizes[i,j] = cliff.delta(test_params$y, test_params$x,
conf.level=0.95)$estimate
} else {
p_values[i,j] = 1
effect_sizes[i,j] = 0
}
}
}
gc()
return(list(p_values=p_values, effect_sizes=effect_sizes))
}
# ---------------------------
# ANALYSIS WRAPPERS
# ---------------------------
#' analyze_method
#'
#' @export
analyze_method = function(method, method_name, wrap_method_params, method_params, model_data, dims, dists, effs, seed=NULL){
# NOTE: de-jank
# Does not allow for arbitrary names, types
score_types = c("newold", "newtrue")
label_types = c("old", "new", "true")
stat_types = list(score=score_types, p_value=label_types, effect_size=label_types)
results = vector("list", 4)
names(results) = c("score", "p_value", "effect_size", "effs_new_groups")
eff_dist_results = efficacy_distance(model_data, dims, dists, effs,
method, wrap_method_params, method_params,
score_types, label_types, seed=seed)
effs_frame = eff_dist_results$effs_frame
effs_new_groups = eff_dist_results$effs_new_groups
for (j in 1:length(stat_types)){
r_name = names(results)[j]
r_types = stat_types[[j]]
tmp_results = lapply(1:dims, function(i, effs_frame, r_name, r_types){
var_results = effs_frame[effs_frame[,"component"] == i,]
tmp = var_results[,which(colnames(var_results) %in%
c(paste(r_name, r_types, sep='_'),
"dist", "eff"))]
tmp = cbind(tmp, component=rep(i, nrow(tmp)), seed=seed)
return(tmp)
}, effs_frame=effs_frame, r_name=r_name, r_types=r_types)
results[[r_name]] = do.call(rbind, tmp_results)
}
results[["effs_new_groups"]] = cbind(effs_new_groups, seed=seed)
gc()
return(results)
}
#' analyze_method_permuted
#'
#' @export
analyze_method_permuted = function(i, method, method_name, wrap_method_params, method_params, permuted_effs_data, model_data, dims, dists, effs, seed=NULL){
permuted_model_data = model_data
if (!is.null(wrap_method_params$use_prot_perm) && wrap_method_params$use_prot_perm){
permuted_labels = permuted_effs_data[permuted_effs_data[,"perm_iter"] == i,"prot_groups"]
permuted_model_data[,"prot_groups"] = permuted_labels
} else {
permuted_labels = permuted_effs_data[permuted_effs_data[,"perm_iter"] == i,"inf_groups"]
permuted_model_data[,"inf_groups"] = permuted_labels
}
if (is.null(seed)){
return(analyze_method(method, method_name, wrap_method_params, method_params,
permuted_model_data, dims, dists, effs))
} else {
return(analyze_method(method, method_name, wrap_method_params, method_params,
permuted_model_data, dims, dists, effs, seed))
}
}
#' wrap_permuted_analyze
#'
#' @export
wrap_permuted_analyze = function(method, method_name, wrap_method_params, method_params, permuted_effs_data, model_data, dims, dists, effs, seed=NULL){
if (!is.null(wrap_method_params$use_prot_perm) && wrap_method_params$use_prot_perm){
permuted_effs_data = permuted_effs_data$prot
} else {
if (is.null(wrap_method_params$use_prot_perm)){
warning("wrap_method_params$use_prot_perm not set - defaulting to FALSE.", call.=TRUE)
}
permuted_effs_data = permuted_effs_data$inf
}
return(lapply(fccu_plevels(permuted_effs_data[,"perm_iter"]),
analyze_method_permuted, method, method_name,
wrap_method_params, method_params, permuted_effs_data,
model_data, dims, dists, effs, seed))
}
# NOTE: seed=NULL is here so that parameter searching code isn't borked. Can
# always change that later.
#' wrap_analyze
#'
#' @export
wrap_analyze = function(model_data, dims, dists, effs, methods, method_names, wrap_params, params, results_dir, seed=NULL){
wrap_analyze_results_file = paste(results_dir, "wrap_analyze_results", '_', seed, sep='')
if (globals_list$do_resume_wrap_analyze && file.exists(wrap_analyze_results_file)){
load(wrap_analyze_results_file)
} else {
methods_results = vector("list", length(methods))
names(methods_results) = method_names
if (is.null(globals_list$do_permute) || !globals_list$do_permute){
model_data = model_data
} else {
permuted_effs_data = model_data$permuted_effs_data
model_data = model_data$effs_data
# of length length(methods), each entry is of length globals_list$permute_iters
permuted_results = mapply(wrap_permuted_analyze, methods, method_names, wrap_params, params,
MoreArgs=list(permuted_effs_data=permuted_effs_data,
model_data=model_data, dims=dims,
dists=dists, effs=effs, seed=seed),
SIMPLIFY=FALSE)
# NOTE: This will propagate NAs! Better check first!
# permuted_results[[1:length(methods)]][[1:permute_iters]][[1-4]]
avg_permuted_results = lapply(permuted_results, function(x){
# NOTE: jank
final_list = vector("list", length(x[[1]]))
# Add the results, divide by number of permutations
for (i in 1:length(final_list)){
final_list[[i]] = lapply(x, function(sub_x, i){ return(sub_x[[i]]) }, i)
# NOTE: If NAs make up > 50% of results, then return NA.
# Otherwise, return the addition of the others.
final_list[[i]] = fccu_add(final_list[[i]], ratio=0.5)
final_list[[i]] = final_list[[i]] / globals_list$permute_iters
}
# NOTE:
# NOTE:
# NOTE: Make the permutations optimistic, just like P/U Bagging is
to_unprot = (final_list[["effs_new_groups"]][,"new_groups"] <
((globals_list$UnprotNum + globals_list$ProtNum) / 2))
# >=
to_prot = !to_unprot
final_list[["effs_new_groups"]][,"new_groups"][to_unprot] = globals_list$UnprotNum
final_list[["effs_new_groups"]][,"new_groups"][to_prot] = globals_list$ProtNum
return(final_list)
})
}
methods_results = mapply(analyze_method, methods, method_names, wrap_params, params,
MoreArgs=list(model_data=model_data, dims=dims, dists=dists,
effs=effs, seed=seed),
SIMPLIFY=FALSE)
if (is.null(globals_list$do_permute) || !globals_list$do_permute){
wrap_analyze_results = methods_results
} else {
wrap_analyze_results = list(methods_results=methods_results,
avg_permuted_methods_results=avg_permuted_results)
}
save(wrap_analyze_results, file=wrap_analyze_results_file)
}
gc()
return(wrap_analyze_results)
}
# ---------------------------
# WRAPPER FOR IT ALL
# ---------------------------
# User doesn't care how you get it, just that they get it
#' generate_seed_results
#'
#' @export
generate_seed_results = function(seed_data, dims, dists, effs, methods, method_names, wrap_params, params, results_dir, seeds, fresh_run){
if (fresh_run){
w_a_expr = expression(
wrap_analyze(seed_data[[s_idx]], dims, dists, effs, methods,
method_names, wrap_params, params, results_dir,
seeds[s_idx])
)
cat("Evaluating methods...\n")
if (globals_list$par_seeds){
seed_results = foreach (s_idx=1:length(seeds), .inorder=FALSE, .export=c("globals_list"), .packages=c("ksmthesis")) %dopar% {
return(eval(w_a_expr))
}
} else {
seed_results = foreach (s_idx=1:length(seeds)) %do% {
return(eval(w_a_expr))
}
}
save(seed_results, file=paste(results_dir, "seed_results", sep=''))
} else {
load(paste(results_dir, "seed_results", sep=''))
}
return(seed_results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.