R/analysis_helpers.R

Defines functions mcc_score, f1_score, balanced_accuracy, generate_pu_folds, generate_label_folds, generate_pu_pred, generate_results_list, and aggregate_cv_results


# ---------------------------
# analysis_helpers.R
# Functions to assist in
#	generating results.
# ---------------------------

#' aggregate_cv_results
#'
#' Average cross-validation results (scores, test p-values, and effect sizes)
#' across all repeats and attach the combined predictions to the new data.
#'
#' @export
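#' @examples
#' \dontrun{
#' # Illustrative sketch only: `results_list` is assumed to be a list of
#' # per-repeat outputs from generate_results_list(), and `x` the full
#' # feature data.frame; neither object is defined here.
#' agg = aggregate_cv_results(results_list, x)
#' table(agg$new_data[, "groups"])
#' agg$scores
#' }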
aggregate_cv_results = function(results_list, x){
	# Average results across all repeats
	preds = lapply(results_list, function(res){ return(res$new_data[,"groups"]) })
	rnames = lapply(results_list, function(res){ return(rownames(res$new_data)) })

	scores = lapply(results_list, function(res){ return(res$scores) })
	test_p_values = lapply(results_list, function(res){ return(res$test_p_values) })
	test_effect_sizes = lapply(results_list, function(res){ return(res$test_effect_sizes) })

	big_len = length(results_list)

	pred = generate_pu_pred(preds, rnames)

	# "This is probably dangerous"
	scores = fccu_as_evil("+", scores) / big_len
	test_p_values = fccu_as_evil("+", test_p_values) / big_len
	test_effect_sizes = fccu_as_evil("+", test_effect_sizes) / big_len

	# NOTE: optimistic -- see the "optimistic guesser" note in generate_pu_pred.
	if (is.logical(results_list[[1]]$pred)){
		pred = ifelse(pred >= 0.5, TRUE, FALSE)
		# NOTE: replace TRUE first -- assigning a number coerces the
		#	logical vector to numeric, after which TRUE and FALSE
		#	compare as 1 and 0.
		pred[pred == TRUE] = globals_list$UnprotNum
		pred[pred == FALSE] = globals_list$ProtNum

	} else {
		pred[pred < ((globals_list$UnprotNum + globals_list$ProtNum) / 2)] =
			globals_list$UnprotNum
		pred[pred >= ((globals_list$UnprotNum + globals_list$ProtNum) / 2)] =
			globals_list$ProtNum
	}

	# Find the correct new_data and attach pred
	new_data = x[rownames(x) %in% names(pred),]
	new_data = cbind(new_data, groups=pred)

	return(list(new_data=new_data, scores=scores, test_p_values=test_p_values,
		    test_effect_sizes=test_effect_sizes))
}

#' generate_results_list
#'
#' Score one cross-validation repeat: compare the new labels against the old
#' and true labels, then run wrap_test() to collect p-values and effect sizes.
#'
#' @export
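#' @examples
#' \dontrun{
#' # Illustrative sketch only: every input (the fitted model_results, the
#' # test-fold data, and wrap_method_params) is assumed to exist already;
#' # balanced_accuracy is just one possible caret-style score_method.
#' res = generate_results_list(model_results, x_test, y_test,
#'                             balanced_accuracy, prot_groups,
#'                             protect_data_test, nvars, wrap_method_params)
#' res$scores
#' }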
generate_results_list = function(model_results, x_test, y_test, score_method,
				 prot_groups, protect_data_test, nvars,
				 wrap_method_params){
	new_data = model_results$new_data

	protect_y_test = protect_data_test[,"groups"]

	scores = vector("numeric", 2)
	names(scores) = c("newold", "newtrue")

	score_frame = cbind(obs=y_test, pred=new_data[,"groups"])
	scores[1] = score_method(score_frame, lev=prot_groups)

	score_frame = cbind(obs=protect_y_test, pred=new_data[,"groups"])
	scores[2] = score_method(score_frame, lev=prot_groups)

	# NOTE: We are evaluating test results, i.e. how well
	#	the model did compared to the truth. So we
	#	compare to the actual labels for that test
	#	fold. Then, we will average together
	#	performance on test folds.

	data_sets = list(cbind(x_test, groups=y_test), new_data, protect_data_test)
	names(data_sets) = c("old", "new", "true")

	test_results = wrap_test(data_sets, nvars, c("old", "new", "true"),
				 wrap_method_params)
	test_p_values = test_results[["p_values"]]
	test_effect_sizes = test_results[["effect_sizes"]]

	return(list(new_data=new_data, scores=scores, test_p_values=test_p_values,
		    test_effect_sizes=test_effect_sizes))
}

#' generate_pu_pred
#'
#' Combine per-repeat PU predictions (parallel lists of predictions and row
#' names) into a single named vector of group labels, using the median.
#'
#' @export
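#' @examples
#' \dontrun{
#' # Illustrative sketch only: two repeats of predictions over overlapping
#' # samples; the values and row names are made up, and globals_list must
#' # already be defined for the thresholding step.
#' preds = list(c(1, 2, 2), c(2, 2, 1))
#' rnames = list(c("1", "2", "3"), c("2", "3", "4"))
#' generate_pu_pred(preds, rnames)
#' }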
generate_pu_pred = function(pred, rnames){
	pred = mapply(function(pred, rnames){
		return(cbind(pred=as.numeric(pred),
			     rnames=as.numeric(rnames)))
	}, pred, rnames, SIMPLIFY=FALSE)
	pred = do.call(rbind, pred)

	# No prediction should be 0 (values should lie in UnprotNum:ProtNum).
	#	If any are, these are one-class SVM results and coerced
	#	TRUE/FALSE (0/1) values are present.
	if (any(pred[,"pred"] == 0)){
		# NOTE: remap the coerced TRUE (1) values to UnprotNum and the
		#	FALSE (0) values to ProtNum, touching only the "pred"
		#	column so the numeric row names are left alone.
		pred[pred[,"pred"] == 1, "pred"] = globals_list$UnprotNum
		pred[pred[,"pred"] == 0, "pred"] = globals_list$ProtNum
	}

	# NOTE: is it possible some data will never be predicted on?
	# Combine (median across repeats) the results for each entry. Also reorders them!
	tmp = aggregate(as.numeric(pred[,!(colnames(pred) == "rnames")]),
			list(pred[,"rnames"]), FUN=median)
	pred = tmp[,!(grepl("Group.1", colnames(tmp), fixed=TRUE))]
	names(pred) = tmp[,grepl("Group.1", colnames(tmp), fixed=TRUE)]

	# NOTE: Current solution to uncertainty is to assign every uncertain
	#	to protected i.e. an "optimistic guesser".
	pred[pred < ((globals_list$UnprotNum + globals_list$ProtNum) / 2)] = globals_list$UnprotNum
	pred[pred >= ((globals_list$UnprotNum + globals_list$ProtNum) / 2)] = globals_list$ProtNum

	return(pred)
}

# Generate testing folds for labels y (inf/prot)
#' generate_label_folds
#'
#' Create repeated k-fold training and testing index lists for labels y.
#'
#' @importFrom caret createFolds
#' @export
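#' @examples
#' \dontrun{
#' # Illustrative sketch only: the label values are made up.
#' y = rep(c("inf", "prot"), each=20)
#' folds = generate_label_folds(y, cv_repeats=5, k=10, seed=1)
#' str(folds$test_folds[[1]])
#' }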
generate_label_folds = function(y, cv_repeats, k, seed){
	# Ensure caret is reproducible
	set.seed(seed)

	repeat_test_folds = lapply(rep(list(y), cv_repeats), function(y, k){
		return(createFolds(y, k))
	}, k)

	# Training folds are the non-testing indices
	repeat_train_folds = lapply(repeat_test_folds, function(x, y){
		folds = lapply(x, function(fold, y){
			return((1:length(y))[-fold])
		}, y)

		return(folds)
	}, y)

	return(list(train_folds=repeat_train_folds, test_folds=repeat_test_folds))
}

# Generate training and testing folds for positive and unlabeled data
# NOTE: This might only work because Uninf and Inf are always equally
#	represented, so positive and unlabeled are always equally
#	represented. With different group sizes, this might break
#	down.
# NOTE: Done this way so that you get back indices into x and y, not indices
#	into positive and unlabeled. See the usage of test_indices in
#	evaluate_method_cv (for wrap_one_svm) for why this matters.
#' generate_pu_folds
#'
#' Create repeated k-fold positive (training) and unlabeled (testing) index lists.
#'
#' @importFrom caret createFolds
#' @export
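#' @examples
#' \dontrun{
#' # Illustrative sketch only: assumes globals_list$InfNum is defined; the 0
#' # here just stands in for whatever value marks unlabeled samples.
#' y = rep(c(globals_list$InfNum, 0), each=20)
#' folds = generate_pu_folds(y, cv_repeats=5, k=10, seed=1)
#' str(folds$train_folds[[1]])
#' }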
generate_pu_folds = function(y, cv_repeats, k, seed){
	# Ensure caret is reproducible
	set.seed(seed)

	# Positive folds are training folds
	positive_folds = lapply(rep(list(y), cv_repeats), function(y, k, InfNum){
		folds = createFolds(as.factor(y), k, returnTrain=TRUE)

		folds = lapply(folds, function(fold, y, InfNum){
			return(fold[fold %in% which(y == InfNum)])
		}, y, InfNum)

		return(folds)
	}, k, globals_list$InfNum)

	# Unlabeled folds are testing folds
	unlabeled_folds = lapply(rep(list(y), cv_repeats), function(y, k, InfNum){
		folds = createFolds(as.factor(y), k)

		folds = lapply(folds, function(fold, y, InfNum){
			return(fold[fold %in% which(y != InfNum)])
		}, y, InfNum)

		return(folds)
	}, k, globals_list$InfNum)

	return(list(train_folds=positive_folds, test_folds=unlabeled_folds))
}


# ---------------------------
# CALCULATE STATISTICS
# ---------------------------
# All require:
# library(caret)
# ---------------------------

#' balanced_accuracy
#'
#' caret-style summary function: (sensitivity + specificity) / 2.
#'
#' @importFrom caret confusionMatrix
#' @export
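#' @examples
#' \dontrun{
#' # Illustrative toy input: a caret-style frame with obs/pred columns; the
#' # class labels "A"/"B" are placeholders, with lev[1] as the first class.
#' toy = data.frame(obs=c("A", "A", "B", "B"), pred=c("A", "B", "B", "B"))
#' balanced_accuracy(toy, lev=c("A", "B"))
#' }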
balanced_accuracy = function(data, lev, ...){
	pred = data[,"pred"]
	groups = data[,"obs"]

	if (!(length(unique(pred)) > 1)){
		return(0)
	}

	if (is.null(lev) || (length(unique(lev)) > 2)){
		stop("Either lev was not supplied or it has more than two classes.")
	}

	pred = factor(pred, levels=lev)
	groups = factor(groups, levels=lev)

	cmatrix = confusionMatrix(pred, groups)
	tp = cmatrix$table[1,1]
	tn = cmatrix$table[2,2]
	fp = cmatrix$table[1,2]
	fn = cmatrix$table[2,1]

	bacc = as.double((as.double(tp / (tp + fn)) + as.double(tn / (tn + fp))) / 2)

	# If NaN is because we correctly predicted only one group, set to 1
	# Else we did something wrong, and set to 0
	if (is.nan(bacc)){
		if ((length(unique(pred)) == 1) && (length(unique(groups)) == 1)){
			bacc = 1

		} else {
			bacc = 0
		}
	}

	names(bacc) = "Balanced Accuracy"

	return(bacc)
}

#' f1_score
#'
#' caret-style summary function: harmonic mean of precision and recall.
#'
#' @importFrom caret confusionMatrix
#' @export
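#' @examples
#' \dontrun{
#' # Illustrative toy input; class labels are placeholders and lev[1] is
#' # treated as the positive class for precision/recall.
#' toy = data.frame(obs=c("A", "A", "B", "B"), pred=c("A", "B", "B", "B"))
#' f1_score(toy, lev=c("A", "B"))
#' }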
f1_score = function(data, lev, ...){
	pred = data[,"pred"]
	groups = data[,"obs"]

	if (!(length(unique(pred)) > 1)){
		return(0)
	}

	if (is.null(lev) || (length(unique(lev)) > 2)){
		stop("Either lev was not supplied or it has more than two classes.")
	}

	pred = factor(pred, levels=lev)
	groups = factor(groups, levels=lev)

	cmatrix = confusionMatrix(pred, groups)
	tp = cmatrix$table[1,1]
	tn = cmatrix$table[2,2]
	fp = cmatrix$table[1,2]
	fn = cmatrix$table[2,1]

	precision = as.double(tp) / as.double(tp + fp)
	recall = as.double(tp) / as.double(tp + fn)
	f1 = as.double(2 * ((precision * recall) / (precision + recall)))

	names(f1) = "F1 Score"

	return(f1)
}

#' mcc_score
#'
#' caret-style summary function: Matthews correlation coefficient.
#'
#' @importFrom caret confusionMatrix
#' @export
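#' @examples
#' \dontrun{
#' # Illustrative toy input; class labels are placeholders.
#' toy = data.frame(obs=c("A", "A", "B", "B"), pred=c("A", "B", "B", "B"))
#' mcc_score(toy, lev=c("A", "B"))
#' }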
mcc_score = function(data, lev, ...){
	pred = data[,"pred"]
	groups = data[,"obs"]

	if (!(length(unique(pred)) > 1)){
		return(0)
	}

	if (is.null(lev) || (length(unique(lev)) > 2)){
		stop("Either lev was not supplied or it has more than two classes.")
	}

	pred = factor(pred, levels=lev)
	groups = factor(groups, levels=lev)

	cmatrix = confusionMatrix(pred, groups)
	tp = cmatrix$table[1,1]
	tn = cmatrix$table[2,2]
	fp = cmatrix$table[1,2]
	fn = cmatrix$table[2,1]

	numerator = as.double(tp * tn) - as.double(fp * fn)
	denominator = as.double(sqrt(as.double(tp + fp) *
				     as.double(tp + fn) *
				     as.double(tn + fp) *
				     as.double(tn + fn)))

	mcc = ifelse(denominator > 0, numerator / denominator, 0)
	names(mcc) = "MCC"

	return(mcc)
}