# ---------------------------
# analysis_helpers.R
# Helper functions for aggregating
# cross-validation results, building
# CV folds, and scoring predictions.
# Requires: caret (createFolds,
# confusionMatrix).
# ---------------------------
#' aggregate_cv_results
#'
#' Average predictions, scores, test p-values, and effect sizes across all
#' cross-validation repeats in results_list.
#'
#' @export
aggregate_cv_results = function(results_list, x){
    # Average results across all repeats. The anonymous-function argument is
    # named res, not x, so it cannot shadow the data argument x used below.
    preds = lapply(results_list, function(res){ return(res$new_data[,"groups"]) })
    rnames = lapply(results_list, function(res){ return(rownames(res$new_data)) })
    scores = lapply(results_list, function(res){ return(res$scores) })
    test_p_values = lapply(results_list, function(res){ return(res$test_p_values) })
    test_effect_sizes = lapply(results_list, function(res){ return(res$test_effect_sizes) })
big_len = length(results_list)
pred = generate_pu_pred(preds, rnames)
# "This is probably dangerous"
scores = fccu_as_evil("+", scores) / big_len
test_p_values = fccu_as_evil("+", test_p_values) / big_len
test_effect_sizes = fccu_as_evil("+", test_effect_sizes) / big_len
    # NOTE: "optimistic" rounding; midpoint ties go to the protected group.
    if (is.logical(results_list[[1]]$pred)){
        pred = pred >= 0.5
        # NOTE: recode TRUE before FALSE. The first assignment coerces the
        #       vector to numeric (FALSE becomes 0), so matching FALSE/0
        #       second stays safe as long as UnprotNum is nonzero.
        pred[pred == TRUE] = globals_list$UnprotNum
        pred[pred == FALSE] = globals_list$ProtNum
} else {
pred[pred < ((globals_list$UnprotNum + globals_list$ProtNum) / 2)] =
globals_list$UnprotNum
pred[pred >= ((globals_list$UnprotNum + globals_list$ProtNum) / 2)] =
globals_list$ProtNum
}
    # Find the matching rows of x and attach pred, indexing pred by row name
    # so prediction order cannot drift from row order.
    new_data = x[rownames(x) %in% names(pred),]
    new_data = cbind(new_data, groups=pred[rownames(new_data)])
return(list(new_data=new_data, scores=scores, test_p_values=test_p_values,
test_effect_sizes=test_effect_sizes))
}
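# The averaging above assumes fccu_as_evil("+", ...) folds a list with "+"
# the way base R's Reduce("+", ...) does. A minimal self-contained sketch of
# that step (demo_average_scores is a hypothetical name, not package API):
demo_average_scores = function(){
    scores_by_repeat = list(c(newold=0.80, newtrue=0.70),
                            c(newold=0.90, newtrue=0.80))
    # Element-wise sum across repeats, divided by the repeat count
    return(Reduce("+", scores_by_repeat) / length(scores_by_repeat))
}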
#' generate_results_list
#'
#' Score one model's predictions against both the original and the protected
#' ("true") test labels, and run the wrapped statistical tests on the old,
#' new, and true data sets.
#'
#' @export
generate_results_list = function(model_results, x_test, y_test, score_method,
                                 prot_groups, protect_data_test, nvars,
                                 wrap_method_params){
new_data = model_results$new_data
protect_y_test = protect_data_test[,"groups"]
    scores = vector("numeric", 2)
    # newold: new predictions vs. original labels; newtrue: vs. protected labels
    names(scores) = c("newold", "newtrue")
score_frame = cbind(obs=y_test, pred=new_data[,"groups"])
scores[1] = score_method(score_frame, lev=prot_groups)
score_frame = cbind(obs=protect_y_test, pred=new_data[,"groups"])
scores[2] = score_method(score_frame, lev=prot_groups)
# NOTE: We are evaluating test results, i.e. how well
# the model did compared to the truth. So we
# compare to the actual labels for that test
# fold. Then, we will average together
# performance on test folds.
data_sets = list(cbind(x_test, groups=y_test), new_data, protect_data_test)
names(data_sets) = c("old", "new", "true")
test_results = wrap_test(data_sets, nvars, c("old", "new", "true"),
wrap_method_params)
test_p_values = test_results[["p_values"]]
test_effect_sizes = test_results[["effect_sizes"]]
return(list(new_data=new_data, scores=scores, test_p_values=test_p_values,
test_effect_sizes=test_effect_sizes))
}
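# generate_results_list expects score_method to follow the same contract as
# the metric functions at the bottom of this file: a two-column object with
# "obs" and "pred", plus lev. A hedged toy call (requires caret; the labels
# 1/2 here are illustrative, not the package's real group numbers):
demo_score_method_call = function(){
    score_frame = cbind(obs=c(1, 1, 2, 2), pred=c(1, 2, 2, 2))
    return(balanced_accuracy(score_frame, lev=c(1, 2)))
}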
#' generate_pu_pred
#'
#' Collapse repeated positive-unlabeled predictions to one value per row via
#' the per-row median, then threshold to group numbers.
#'
#' @export
generate_pu_pred = function(pred, rnames){
pred = mapply(function(pred, rnames){
return(cbind(pred=as.numeric(pred),
rnames=as.numeric(rnames)))
}, pred, rnames, SIMPLIFY=FALSE)
pred = do.call(rbind, pred)
    # No prediction should be 0; valid values lie in UnprotNum:ProtNum. A 0
    # therefore means one-class SVM output whose logicals were coerced to 0/1.
    if (any(pred[,"pred"] == 0)){
        # NOTE: recode 1 (TRUE) before 0 (FALSE), and only touch the pred
        #       column so the rnames column is never clobbered.
        pred[pred[,"pred"] == 1, "pred"] = globals_list$UnprotNum
        pred[pred[,"pred"] == 0, "pred"] = globals_list$ProtNum
    }
# NOTE: is it possible some data will never be predicted on?
# Average results together for all entries. Also reorders them!
tmp = aggregate(as.numeric(pred[,!(colnames(pred) == "rnames")]),
list(pred[,"rnames"]), FUN=median)
pred = tmp[,!(grepl("Group.1", colnames(tmp), fixed=TRUE))]
names(pred) = tmp[,grepl("Group.1", colnames(tmp), fixed=TRUE)]
# NOTE: Current solution to uncertainty is to assign every uncertain
# to protected i.e. an "optimistic guesser".
pred[pred < ((globals_list$UnprotNum + globals_list$ProtNum) / 2)] = globals_list$UnprotNum
pred[pred >= ((globals_list$UnprotNum + globals_list$ProtNum) / 2)] = globals_list$ProtNum
return(pred)
}
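# A base-R sketch of the core of generate_pu_pred: repeated predictions for
# the same row collapse to a per-row median keyed by row name. Note that
# aggregate() also sorts by the grouping key (demo name is hypothetical):
demo_median_by_row = function(){
    pred = c(1, 2, 2, 1, 1)
    rnames = c(20, 10, 20, 10, 20)
    tmp = aggregate(pred, list(rnames), FUN=median)
    out = tmp$x
    names(out) = tmp$Group.1
    # 10 -> 1.5, 20 -> 1; still needs the midpoint thresholding above
    return(out)
}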
# Generate testing folds for labels y (inf/prot)
#' generate_label_folds
#'
#' Build cv_repeats sets of k stratified test folds over y, plus the
#' complementary training folds.
#'
#' @export
generate_label_folds = function(y, cv_repeats, k, seed){
# Ensure caret is reproducible
set.seed(seed)
repeat_test_folds = lapply(rep(list(y), cv_repeats), function(y, k){
return(createFolds(y, k))
}, k)
# Training folds are the non-testing indices
repeat_train_folds = lapply(repeat_test_folds, function(x, y){
folds = lapply(x, function(fold, y){
return((1:length(y))[-fold])
}, y)
return(folds)
}, y)
return(list(train_folds=repeat_train_folds, test_folds=repeat_test_folds))
}
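# A small sketch of the fold construction above, assuming library(caret) is
# attached. createFolds returns test indices; training indices are simply
# the complement (demo_label_folds is a hypothetical name):
demo_label_folds = function(){
    y = factor(rep(c("prot", "unprot"), each=10))
    set.seed(1)
    test_folds = createFolds(y, k=5)
    train_folds = lapply(test_folds, function(fold){
        return(seq_along(y)[-fold])
    })
    return(list(train_folds=train_folds, test_folds=test_folds))
}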
# Generate training and testing folds for positive and unlabeled data
# NOTE: This might only work because Uninf and Inf are always equally
# represented, so positive and unlabeled are always equally
# represented. With different group sizes, this might break
# down.
# NOTE: Done this way so that you get back indices in x and y, not indices
# in positive and unlabeled. See usage of test_indices in
# evaluate_method_cv for wrap_one_svm for why you should care.
#' generate_pu_folds
#'
#' Build stratified folds over all of y, then restrict training folds to the
#' positive (InfNum) indices and test folds to the unlabeled indices.
#'
#' @export
generate_pu_folds = function(y, cv_repeats, k, seed){
# Ensure caret is reproducible
set.seed(seed)
# Positive folds are training folds
positive_folds = lapply(rep(list(y), cv_repeats), function(y, k, InfNum){
folds = createFolds(as.factor(y), k, returnTrain=TRUE)
folds = lapply(folds, function(fold, y, InfNum){
return(fold[fold %in% which(y == InfNum)])
}, y, InfNum)
return(folds)
}, k, globals_list$InfNum)
# Unlabeled folds are testing folds
unlabeled_folds = lapply(rep(list(y), cv_repeats), function(y, k, InfNum){
folds = createFolds(as.factor(y), k)
folds = lapply(folds, function(fold, y, InfNum){
return(fold[fold %in% which(y != InfNum)])
}, y, InfNum)
return(folds)
}, k, globals_list$InfNum)
return(list(train_folds=positive_folds, test_folds=unlabeled_folds))
}
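# Sketch of the positive/unlabeled split above (hypothetical demo; assumes
# caret and a toy positive label of 1): folds are stratified over all of y,
# then filtered to positives (train) or non-positives (test), so the
# returned indices still point into the full x and y.
demo_pu_folds = function(InfNum=1){
    y = rep(c(1, 2), each=10)
    set.seed(1)
    train_folds = lapply(createFolds(as.factor(y), k=5, returnTrain=TRUE),
                         function(fold){ return(fold[fold %in% which(y == InfNum)]) })
    test_folds = lapply(createFolds(as.factor(y), k=5),
                        function(fold){ return(fold[fold %in% which(y != InfNum)]) })
    return(list(train_folds=train_folds, test_folds=test_folds))
}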
# ---------------------------
# CALCULATE STATISTICS
# ---------------------------
# All require:
# library(caret)
# ---------------------------
#' balanced_accuracy
#'
#' caret-style summary function: the mean of sensitivity and specificity.
#'
#' @export
balanced_accuracy = function(data, lev, ...){
pred = data[,"pred"]
groups = data[,"obs"]
if (!(length(unique(pred)) > 1)){
return(0)
}
    if (is.null(lev) || (length(unique(lev)) > 2)){
        # stop() raises a catchable error instead of killing the R session
        stop("Either lev was not supplied or it has more than two classes.",
             call.=TRUE)
    }
pred = factor(pred, levels=lev)
groups = factor(groups, levels=lev)
cmatrix = confusionMatrix(pred, groups)
tp = cmatrix$table[1,1]
tn = cmatrix$table[2,2]
fp = cmatrix$table[1,2]
fn = cmatrix$table[2,1]
bacc = as.double((as.double(tp / (tp + fn)) + as.double(tn / (tn + fp))) / 2)
    # If the NaN arose because we correctly predicted the only group present,
    # score 1; otherwise something went wrong, so score 0.
if (is.nan(bacc)){
if ((length(unique(pred)) == 1) && (length(unique(groups)) == 1)){
bacc = 1
} else {
bacc = 0
}
}
names(bacc) = "Balanced Accuracy"
return(bacc)
}
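# Worked sketch of the balanced accuracy formula used above, from raw counts
# rather than a confusionMatrix (hypothetical demo name and counts):
demo_balanced_accuracy = function(tp=40, tn=30, fp=20, fn=10){
    sensitivity = tp / (tp + fn)             # 40/50 = 0.8
    specificity = tn / (tn + fp)             # 30/50 = 0.6
    return((sensitivity + specificity) / 2)  # 0.7
}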
#' f1_score
#'
#' caret-style summary function: the harmonic mean of precision and recall.
#'
#' @export
f1_score = function(data, lev, ...){
pred = data[,"pred"]
groups = data[,"obs"]
if (!(length(unique(pred)) > 1)){
return(0)
}
    if (is.null(lev) || (length(unique(lev)) > 2)){
        # stop() raises a catchable error instead of killing the R session
        stop("Either lev was not supplied or it has more than two classes.",
             call.=TRUE)
    }
pred = factor(pred, levels=lev)
groups = factor(groups, levels=lev)
cmatrix = confusionMatrix(pred, groups)
tp = cmatrix$table[1,1]
tn = cmatrix$table[2,2]
fp = cmatrix$table[1,2]
fn = cmatrix$table[2,1]
precision = as.double(tp) / as.double(tp + fp)
recall = as.double(tp) / as.double(tp + fn)
f1 = as.double(2 * ((precision * recall) / (precision + recall)))
names(f1) = "F1 Score"
return(f1)
}
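# The same idea for F1: the harmonic mean of precision and recall
# (hypothetical demo name and counts):
demo_f1 = function(tp=40, fp=20, fn=10){
    precision = tp / (tp + fp)  # 40/60
    recall = tp / (tp + fn)     # 40/50
    return(2 * precision * recall / (precision + recall))  # ~0.727
}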
#' mcc_score
#'
#' caret-style summary function: the Matthews correlation coefficient.
#'
#' @export
mcc_score = function(data, lev, ...){
pred = data[,"pred"]
groups = data[,"obs"]
if (!(length(unique(pred)) > 1)){
return(0)
}
    if (is.null(lev) || (length(unique(lev)) > 2)){
        # stop() raises a catchable error instead of killing the R session
        stop("Either lev was not supplied or it has more than two classes.",
             call.=TRUE)
    }
pred = factor(pred, levels=lev)
groups = factor(groups, levels=lev)
cmatrix = confusionMatrix(pred, groups)
tp = cmatrix$table[1,1]
tn = cmatrix$table[2,2]
fp = cmatrix$table[1,2]
fn = cmatrix$table[2,1]
numerator = as.double(tp * tn) - as.double(fp * fn)
denominator = as.double(sqrt(as.double(tp + fp) *
as.double(tp + fn) *
as.double(tn + fp) *
as.double(tn + fn)))
mcc = ifelse(denominator > 0, numerator / denominator, 0)
names(mcc) = "MCC"
return(mcc)
}
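# And for MCC, with the same zero-denominator guard as mcc_score above
# (hypothetical demo name and counts):
demo_mcc = function(tp=40, tn=30, fp=20, fn=10){
    numerator = tp * tn - fp * fn
    denominator = sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return(ifelse(denominator > 0, numerator / denominator, 0))
}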