
# ---------------------------
# generate_analysis.R
# Generate seed_results for
#	all method conditions,
#	etc.
# ---------------------------
# Necessary functions
# ---------------------------



# ---------------------------
# Need UnprotNum, ProtNum, InfNum, and UninfNum defined in globals_list
# With the supplied method and its options:
#	1. Do your method on varying efficacies, at various degrees of distinction
#	2. Save those results (performed on Inf/Uninf)
# Only save the data that you need:
#	1. MCCs
#	2. p-values
#	3. effect sizes
#	4. new group labels
#' efficacy_distance
#'
#' Run `cust_method` at every efficacy in `effs` and every degree of
#' distinction in `dists`, collecting per-component statistics and the
#' relabelled group assignments. Parallelizes over efficacies when
#' `globals_list$par_effs` is TRUE.
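#'
#' @examples
#' \dontrun{
#' # A hypothetical call sketch; `my_model_data` is assumed to carry
#' # "inf_groups", "prot_groups", "eff", and "dist" columns, and the
#' # parameter lists are placeholders for whatever the chosen wrapper
#' # expects.
#' res = efficacy_distance(my_model_data, nvars=5, dists=c(1, 2),
#'                         effs=c(0.3, 0.6), cust_method=wrap_kmeans,
#'                         wrap_method_params=list(), method_params=list(),
#'                         score_types=c("newold", "newtrue"),
#'                         label_types=c("old", "new", "true"), seed=42)
#' str(res$effs_frame)
#' }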
#' @export
efficacy_distance = function(model_data, nvars, dists, effs, cust_method, wrap_method_params, method_params, score_types, label_types, seed){
	train_data = model_data[,colnames(model_data) != "prot_groups"]
	colnames(train_data)[colnames(train_data) == "inf_groups"] = "groups"

	protect_data = model_data[,colnames(model_data) != "inf_groups"]
	colnames(protect_data)[colnames(protect_data) == "prot_groups"] = "groups"

	if (!is.null(globals_list$par_effs) && globals_list$par_effs){
		eff_results_list = foreach(eff=effs, .inorder=FALSE, .export=c("globals_list"), .packages=c("ksmthesis")) %dopar% {
			dists_results = lapply(dists, generate_dist_results, eff, nvars,
					       train_data, protect_data, cust_method,
					       wrap_method_params, method_params, seed)

			dists_frame = do.call(rbind, lapply(dists_results, generate_dist_frames,
							    nvars, score_types, label_types))

			dists_new_groups = do.call(rbind, lapply(dists_results, generate_dist_new_groups))

			gc()
			return(list(dists_frame=dists_frame, dists_new_groups=dists_new_groups))
		}

	} else {
		eff_results_list = lapply(effs, function(eff, dists, nvars, train_data,
							 protect_data, cust_method,
							 wrap_method_params, method_params,
							 score_types, label_types, seed){

			dists_results = lapply(dists, generate_dist_results, eff, nvars,
					       train_data, protect_data, cust_method,
					       wrap_method_params, method_params, seed)

			dists_frame = do.call(rbind, lapply(dists_results, generate_dist_frames,
							    nvars, score_types, label_types))

			dists_new_groups = do.call(rbind, lapply(dists_results, generate_dist_new_groups))

			gc()
			return(list(dists_frame=dists_frame, dists_new_groups=dists_new_groups))

		}, dists, nvars, train_data, protect_data, cust_method, wrap_method_params,
			method_params, score_types, label_types, seed)
	}

	effs_frame = lapply(eff_results_list, function(x){ return(x[["dists_frame"]]) })
	effs_frame = do.call(rbind, effs_frame)

	effs_new_groups = lapply(eff_results_list, function(x){ return(x[["dists_new_groups"]]) })
	effs_new_groups = do.call(rbind, effs_new_groups)

	gc()
	return(list(effs_frame=effs_frame, effs_new_groups=effs_new_groups))
}

#' generate_dist_results
#'
#' Subset the training and protection data to one (eff, dist) pair, run
#' `wrap_method` on the result, and tag the output with `dist` and `eff`.
#' @export
generate_dist_results = function(dist, eff, nvars, train_data, protect_data, cust_method, wrap_method_params, method_params, seed){
	model_train = train_data[train_data[,"eff"] == eff,]
	model_train = model_train[model_train[,"dist"] == dist,]
	model_train = as.matrix(model_train[,colnames(model_train) != "eff"])
	model_train = as.matrix(model_train[,colnames(model_train) != "dist"])

	model_protect = protect_data[protect_data[,"eff"] == eff,]
	model_protect = model_protect[model_protect[,"dist"] == dist,]
	model_protect = as.matrix(model_protect[,colnames(model_protect) != "eff"])
	model_protect = as.matrix(model_protect[,colnames(model_protect) != "dist"])

	results = wrap_method(cust_method, wrap_method_params, nvars,
			      model_train, method_params, model_protect,
			      seed)

	results$dist = dist
	results$eff = eff

	return(results)
}

#' generate_dist_frames
#'
#' Flatten one (eff, dist) result into a per-component matrix of scores,
#' p-values, and effect sizes.
#' @export
generate_dist_frames = function(dist_results, nvars, score_types, label_types){
	# NOTE: jank
	# Does not allow for (totally) arbitrary names
	colheaders = c(paste("score", score_types, sep='_'),
		       paste("p_value", label_types, sep='_'),
		       paste("effect_size", label_types, sep='_'),
		       "dist", "eff", "component")

	stats_frame = lapply(1:nvars, function(i, dist_results){
		return(c(dist_results$scores, dist_results$test_p_values[i,],
			 dist_results$test_effect_sizes[i,], dist_results$dist,
			 dist_results$eff, i))
	}, dist_results)

	stats_frame = do.call(rbind, stats_frame)
	colnames(stats_frame) = colheaders

	return(stats_frame)
}

#' generate_dist_new_groups
#'
#' Pair the predicted group labels with the true protection labels for one
#' (eff, dist) result, as a numeric matrix tagged with `dist` and `eff`.
#' @export
generate_dist_new_groups = function(dist_results){
	tmp = as.matrix(data.frame(new_groups=dist_results$data_sets$new[,"groups"],
				   prot_groups=dist_results$data_sets$true[,"groups"],
				   dist=dist_results$dist, eff=dist_results$eff))
	tmp = apply(tmp, 2, as.numeric)

	return(tmp)
}


# NOTE: cleanup
# ---------------------------
# METHOD WRAPPERS
# ---------------------------
# A method wrapper accepts:
#	1. train_data - Data without group identifiers
#	2. full_data - Data with infection status identifiers
#	3. method_params - Any parameters it requires, bundled in a list
#	4. protect_data - Data with protection status identifiers
# The parent wrapper is required to return:
#	1. The original data, the data with new labels, and the ground truth data
#	2. Scores comparing new assignments to old labels and to true labels
#	3. Test results for each pair of labels, for each component
#	4. Fold changes for each pair of labels, for each component
# A method wrapper is required to:
#	1. Parse its method_params list
#	2. Return:
#		a. full_data as old_data (i.e. old/inf labels)
#		b. train_data with the new labels as new_data (i.e. predicted status)
# ---------------------------
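# A minimal, hypothetical sketch of a conforming wrapper, kept commented
#	out. The name and body are illustrative only; see the real
#	wrappers for how the bundled parameters are actually unpacked.
#
# wrap_example = function(x, y, ...){
#	# The single bundled list (dotboi) arrives through the dots
#	params = list(...)[[1]]
#	method_params = params$method_params
#
#	# Fit on x/y, then predict a label for every row
#	#	(here: a trivial constant prediction)
#	preds = rep(globals_list$InfNum, nrow(x))
#
#	# Contract: old_data = the data with its old (inf) labels,
#	#	new_data = the data with the predicted labels
#	old_data = cbind(x, groups=y)
#	new_data = cbind(x, groups=preds)
#
#	return(list(old_data=old_data, new_data=new_data))
# }
# ---------------------------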
# Standardized interface to wrapper methods
#' wrap_method
#'
#' Standardized entry point for the method wrappers: strips the group
#' labels, builds CV folds (PU-style folds for One-SVM), dispatches to
#' evaluate_method or evaluate_method_cv, and packages the old/new/true
#' data sets together with scores, p-values, and effect sizes.
#' @export
wrap_method = function(cust_method, wrap_method_params, nvars, full_data, method_params, protect_data, seed, score_method=mcc_score){
	# Remove group information
	train_data = as.matrix(full_data[,colnames(full_data) != "groups"])
	rownames(train_data) = 1:nrow(train_data)

	# [[train_folds/test_folds]]
	#	 [[repeat]]
	#		[[fold]]
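	# e.g. repeat_folds$train_folds[[2]][[3]] holds the training row
	#	indices for fold 3 of repeat 2 (a sketch of the layout
	#	produced by the fold generators below).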

	# Generate positive training and unlabeled testing folds
	#	for One-SVM. Traditional way doesn't apply.
	if (identical(cust_method, wrap_one_svm)){
		repeat_folds = generate_pu_folds(full_data[,"groups"], globals_list$cv_repeats,
						 globals_list$cv_folds, seed)

	# Generate folds balanced for Inf/Uninf membership (which won't
	#	actually matter since they're always equal).
	} else {
		repeat_folds = generate_label_folds(full_data[,"groups"], globals_list$cv_repeats,
						    globals_list$cv_folds, seed)
	}

	# These wrap other wrappers, so they need both parameter lists
	if (identical(cust_method, wrap_pu_bagging) || identical(cust_method, wrap_pu_reliable_negative)){
		dotboi = list(wrap_method_params=wrap_method_params,
			      method_params=method_params)

	} else {
		dotboi = list(method_params=method_params)
	}

	prot_groups = c(globals_list$UnprotNum, globals_list$ProtNum)

	# NOTE: Can't predict using kmeans, and wrap_pu_bagging is already an ensemble,
	#	so just run them once.
	if (identical(cust_method, wrap_kmeans) || identical(cust_method, wrap_pu_bagging)){
		results_list = evaluate_method(cust_method, train_data, full_data[,"groups"],
					       protect_data, dotboi, score_method, prot_groups,
					       nvars, wrap_method_params)

	# NOTE: One-SVM performs poorly when CV'd because MCC is
	#	designed to blow up when one class isn't predicted.
	# 	Will this be a problem for other methods?
	} else {
		results_list = evaluate_method_cv(repeat_folds, cust_method,
						  train_data, full_data[,"groups"],
						  protect_data, dotboi, score_method,
						  prot_groups, nvars, wrap_method_params)
	}

	data_sets = list(full_data, results_list$new_data, protect_data)
	names(data_sets) = c("old", "new", "true")

	results_list$data_sets = data_sets
	results_list$new_data = NULL

	gc()
	# Contains:
	# 	$data_sets, a list of data frames
	# 	$scores, a vector of new-vs-old and new-vs-true scores (MCC by default)
	# 	$test_p_values, with nvars rows and length(data_sets) columns
	# 	$test_effect_sizes, with nvars rows and length(data_sets) columns
	return(results_list)
}


#' evaluate_method
#'
#' Run `cust_method` once on the full data (no cross-validation) and
#' summarize the results.
#' @export
evaluate_method = function(cust_method, x, y, protect_data, dotboi, score_method, prot_groups, nvars, wrap_method_params){
	model_results = cust_method(x, y, ...=dotboi)

	return(generate_results_list(model_results, x, y, score_method,
				     prot_groups, protect_data, nvars,
				     wrap_method_params))
}

#' evaluate_method_cv
#'
#' Run `cust_method` over repeated CV folds, aggregating results first over
#' folds within each repeat, then over repeats.
#' @export
evaluate_method_cv = function(repeat_folds_list, cust_method, x, y, protect_data, dotboi, score_method, prot_groups, nvars, wrap_method_params){
	# Iterate over train and test simultaneously
	repeat_results = mapply(function(train_folds, test_folds, cust_method,
					    x, y, protect_data, dotboi,
					    score_method, prot_groups,
					    nvars, wrap_method_params){

		fold_results = mapply(function(train_indices, test_indices,
					       cust_method, x, y,
					       protect_data, dotboi,
					       score_method, prot_groups,
					       nvars, wrap_method_params){

			x_test = x[test_indices,]
			# Used as part of "old_data" for wrap_test
			y_test = y[test_indices]

			# NOTE: for wrap_pu_reliable_negative to select best
			#	preds
			dotboi$method_params$y_test = y_test

			# TODO: check that this works correctly
			if (identical(cust_method, wrap_svm)){
				train_x = protect_data[train_indices,]
				train_x = train_x[,!(colnames(train_x) == "groups")]
				train_y = protect_data[,"groups"][train_indices]
			
			} else {
				train_x = x[train_indices,]
				train_y = y[train_indices]
			}

			model_results = cust_method(train_x, train_y, x_test, ...=dotboi)

			gc()
			return(generate_results_list(model_results, x_test, y_test,
						     score_method, prot_groups,
						     protect_data[test_indices,],
						     nvars, wrap_method_params))

		}, train_folds, test_folds, SIMPLIFY=FALSE,
		MoreArgs=list(cust_method=cust_method, x=x, y=y,
			      protect_data=protect_data,
			      dotboi=dotboi, score_method=score_method,
			      prot_groups=prot_groups, nvars=nvars,
			      wrap_method_params=wrap_method_params))

		# Average results over folds
		aggregated_repeat_results = aggregate_cv_results(fold_results, x)
		
		gc()
		return(aggregated_repeat_results)

	}, repeat_folds_list$train_folds, repeat_folds_list$test_folds, SIMPLIFY=FALSE,
	MoreArgs=list(cust_method=cust_method, x=x, y=y, protect_data=protect_data,
		      dotboi=dotboi, score_method=score_method,
		      prot_groups=prot_groups, nvars=nvars,
		      wrap_method_params=wrap_method_params))

	# Average results over repeats
	aggregated_results = aggregate_cv_results(repeat_results, x)

	gc()
	return(aggregated_results)
}


# ---------------------------
# Requires:
# library(effsize)

#' wrap_test
#'
#' For each component and each data set, test the Unprot/Prot difference
#' with `wrap_test_params$method` and compute Cliff's delta (from the
#' effsize package) as the effect size.
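#'
#' @examples
#' \dontrun{
#' # A hypothetical toy call; assumes globals_list is configured and the
#' # effsize package is loaded. Column layout and parameters here are
#' # illustrative only.
#' toy = cbind(matrix(rnorm(40), ncol=2),
#'             groups=rep(c(globals_list$UnprotNum, globals_list$ProtNum),
#'                        each=10))
#' wmp = list(wrap_test_params=list(method="wilcox.test", mu_scalar=0),
#'            test_params=list(alternative="two.sided"))
#' wrap_test(list(old=toy), nvars=2, resnames="old",
#'           wrap_method_params=wmp)
#' }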
#' @export
wrap_test = function(data_sets, nvars, resnames, wrap_method_params){
	p_values = matrix(NA, nrow=nvars, ncol=length(data_sets))
	colnames(p_values) = resnames
	effect_sizes = matrix(NA, nrow=nvars, ncol=length(data_sets))
	colnames(effect_sizes) = resnames

	wrap_test_params = wrap_method_params$wrap_test_params
	test_params = wrap_method_params$test_params

	if (is.null(test_params$alternative) || is.null(wrap_test_params$mu_scalar)){
		stop("test_params$alternative, wrap_test_params$mu_scalar, or both were not set.", call.=TRUE)
	}

	if (is.null(wrap_test_params$method)){
		stop("wrap_test_params$method not set.", call.=TRUE)
	}

	# NOTE: lapply to hellapply
	for (i in 1:nvars){
		for (j in 1:length(data_sets)){
			measure = as.numeric(data_sets[[j]][,i])
			group = data_sets[[j]][,"groups"]

			group = as.character(group)
			group = factor(group, levels=c(globals_list$UnprotNum, globals_list$ProtNum))

			# mu is the scalar times the range of all the data
			if (!is.null(wrap_test_params$scale_all_measures) && wrap_test_params$scale_all_measures){
				test_params$mu = wrap_test_params$mu_scalar*(max(measure) - min(measure))
			}

			if (length(unique(group)) > 1){
				test_params$x = measure[group == levels(group)[1]]
				test_params$y = measure[group == levels(group)[2]]

				# mu is the scalar times the range of x
				#	(once set, mu is reused for later
				#	components)
				if (is.null(test_params$mu)){
					test_params$mu = wrap_test_params$mu_scalar*(max(test_params$x) - min(test_params$x))
				}

				# NOTE:
				# mu < 0 with alternative = "less" tests whether
				#	x is shifted left of y by more than
				#	abs(mu): the true shift satisfies
				#	shift < mu, so abs(shift) > abs(mu).
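				# e.g. mu = -2 with "less": a true shift of -3
				#	falls in the alternative, a shift of -1
				#	does not.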
				test_result = do.call(wrap_test_params$method, test_params)
				p_values[i,j] = test_result$p.value

				# NOTE: not well-behaved
				# NOTE: save confidence interval?
				effect_sizes[i,j] = cliff.delta(test_params$y, test_params$x,
								conf.level=0.95)$estimate

			} else {
				p_values[i,j] = 1
				effect_sizes[i,j] = 0
			}
		}
	}

	gc()
	return(list(p_values=p_values, effect_sizes=effect_sizes))
}


# ---------------------------
# ANALYSIS WRAPPERS
# ---------------------------

#' analyze_method
#'
#' Run one method across all (eff, dist) conditions and reshape the results
#' into per-statistic frames (score, p_value, effect_size) plus the
#' relabelled groups.
#' @export
analyze_method = function(method, method_name, wrap_method_params, method_params, model_data, dims, dists, effs, seed=NULL){
	# NOTE: de-jank
	# Does not allow for arbitrary names, types
	score_types = c("newold", "newtrue")
	label_types = c("old", "new", "true")
	stat_types = list(score=score_types, p_value=label_types, effect_size=label_types)
	results = vector("list", 4)
	names(results) = c("score", "p_value", "effect_size", "effs_new_groups")

	eff_dist_results = efficacy_distance(model_data, dims, dists, effs,
					     method, wrap_method_params, method_params,
					     score_types, label_types, seed=seed)

	effs_frame = eff_dist_results$effs_frame
	effs_new_groups = eff_dist_results$effs_new_groups

	for (j in 1:length(stat_types)){
		r_name = names(results)[j]
		r_types = stat_types[[j]]

		tmp_results = lapply(1:dims, function(i, effs_frame, r_name, r_types){
			var_results = effs_frame[effs_frame[,"component"] == i,]

			tmp = var_results[,which(colnames(var_results) %in%
						 c(paste(r_name, r_types, sep='_'),
						   "dist", "eff"))]
			tmp = cbind(tmp, component=rep(i, nrow(tmp)), seed=seed)

			return(tmp)
		}, effs_frame=effs_frame, r_name=r_name, r_types=r_types)

		results[[r_name]] = do.call(rbind, tmp_results)
	}

	results[["effs_new_groups"]] = cbind(effs_new_groups, seed=seed)

	gc()
	return(results)
}

#' analyze_method_permuted
#'
#' Swap in the i-th permutation of the infection (or protection) labels and
#' re-run analyze_method on the permuted data.
#' @export
analyze_method_permuted = function(i, method, method_name, wrap_method_params, method_params, permuted_effs_data, model_data, dims, dists, effs, seed=NULL){
	permuted_model_data = model_data

	if (!is.null(wrap_method_params$use_prot_perm) && wrap_method_params$use_prot_perm){
		permuted_labels = permuted_effs_data[permuted_effs_data[,"perm_iter"] == i,"prot_groups"]
		permuted_model_data[,"prot_groups"] = permuted_labels

	} else {
		permuted_labels = permuted_effs_data[permuted_effs_data[,"perm_iter"] == i,"inf_groups"]
		permuted_model_data[,"inf_groups"] = permuted_labels
	}

	# seed defaults to NULL, so one call covers both cases
	return(analyze_method(method, method_name, wrap_method_params, method_params,
			      permuted_model_data, dims, dists, effs, seed=seed))
}

#' wrap_permuted_analyze
#'
#' Run analyze_method_permuted for every permutation iteration in
#' permuted_effs_data, selecting the protection or infection permutations
#' according to wrap_method_params$use_prot_perm.
#' @export
wrap_permuted_analyze = function(method, method_name, wrap_method_params, method_params, permuted_effs_data, model_data, dims, dists, effs, seed=NULL){
	if (!is.null(wrap_method_params$use_prot_perm) && wrap_method_params$use_prot_perm){
		permuted_effs_data = permuted_effs_data$prot

	} else {
		if (is.null(wrap_method_params$use_prot_perm)){
			warning("wrap_method_params$use_prot_perm not set - defaulting to FALSE.", call.=TRUE)
		}

		permuted_effs_data = permuted_effs_data$inf
	}

	return(lapply(fccu_plevels(permuted_effs_data[,"perm_iter"]),
		      analyze_method_permuted, method, method_name,
		      wrap_method_params, method_params, permuted_effs_data,
		      model_data, dims, dists, effs, seed))
}

# NOTE: seed=NULL is here so that parameter searching code isn't borked. Can
#	always change that later.
#' wrap_analyze
#'
#' Run every method (and, when globals_list$do_permute is set, its permuted
#' counterpart) on one seed's data, caching results to disk so runs can be
#' resumed.
#' @export
wrap_analyze = function(model_data, dims, dists, effs, methods, method_names, wrap_params, params, results_dir, seed=NULL){
	wrap_analyze_results_file = paste(results_dir, "wrap_analyze_results", '_', seed, sep='')

	if (globals_list$do_resume_wrap_analyze && file.exists(wrap_analyze_results_file)){
		load(wrap_analyze_results_file)

	} else {
		methods_results = vector("list", length(methods))
		names(methods_results) = method_names

		if (!is.null(globals_list$do_permute) && globals_list$do_permute){
			permuted_effs_data = model_data$permuted_effs_data
			model_data = model_data$effs_data

			# A list of length length(methods); each entry has
			#	length globals_list$permute_iters
			permuted_results = mapply(wrap_permuted_analyze, methods, method_names, wrap_params, params,
						  MoreArgs=list(permuted_effs_data=permuted_effs_data,
								model_data=model_data, dims=dims,
								dists=dists, effs=effs, seed=seed),
						  SIMPLIFY=FALSE)

			# NOTE: This will propagate NAs! Better check first!
			# permuted_results[[1:length(methods)]][[1:permute_iters]][[1-4]]
			avg_permuted_results = lapply(permuted_results, function(x){
				# NOTE: jank
				final_list = vector("list", length(x[[1]]))

				# Add the results, divide by number of permutations
				for (i in 1:length(final_list)){
					final_list[[i]] = lapply(x, function(sub_x, i){ return(sub_x[[i]]) }, i)
					# NOTE: If NAs make up > 50% of results, then return NA.
					#	Otherwise, return the addition of the others.
					final_list[[i]] = fccu_add(final_list[[i]], ratio=0.5)
					final_list[[i]] = final_list[[i]] / globals_list$permute_iters
				}

				# NOTE: Make the permutations optimistic, just
				#	like P/U Bagging is.
				# Averaged labels below the Unprot/Prot midpoint
				#	round down to Unprot; those at or above
				#	it round up to Prot.
				to_unprot = (final_list[["effs_new_groups"]][,"new_groups"] <
					     ((globals_list$UnprotNum + globals_list$ProtNum) / 2))
				to_prot = !to_unprot

				final_list[["effs_new_groups"]][,"new_groups"][to_unprot] = globals_list$UnprotNum
				final_list[["effs_new_groups"]][,"new_groups"][to_prot] = globals_list$ProtNum

				return(final_list)
			})
		}

		methods_results = mapply(analyze_method, methods, method_names, wrap_params, params,
					 MoreArgs=list(model_data=model_data, dims=dims, dists=dists,
						       effs=effs, seed=seed),
					 SIMPLIFY=FALSE)

		if (is.null(globals_list$do_permute) || !globals_list$do_permute){
			wrap_analyze_results = methods_results

		} else {
			wrap_analyze_results = list(methods_results=methods_results,
					     avg_permuted_methods_results=avg_permuted_results)
		}

		save(wrap_analyze_results, file=wrap_analyze_results_file)
	}

	gc()
	return(wrap_analyze_results)
}


# ---------------------------
# WRAPPER FOR IT ALL
# ---------------------------

# User doesn't care how you get it, just that they get it
#' generate_seed_results
#'
#' Top-level driver: run wrap_analyze for every seed (optionally in
#' parallel via globals_list$par_seeds), or load cached seed_results from
#' results_dir when fresh_run is FALSE.
#' @export
generate_seed_results = function(seed_data, dims, dists, effs, methods, method_names, wrap_params, params, results_dir, seeds, fresh_run){
	if (fresh_run){
		w_a_expr = expression(
			wrap_analyze(seed_data[[s_idx]], dims, dists, effs, methods,
				     method_names, wrap_params, params, results_dir,
				     seeds[s_idx])
		)
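		# The same expression is evaluated inside both the parallel
		#	and serial loops below, with s_idx bound by foreach.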

		cat("Evaluating methods...\n")
		if (globals_list$par_seeds){
			seed_results = foreach (s_idx=1:length(seeds), .inorder=FALSE, .export=c("globals_list"), .packages=c("ksmthesis")) %dopar% {
				return(eval(w_a_expr))
			}

		} else {
			seed_results = foreach (s_idx=1:length(seeds)) %do% {
				return(eval(w_a_expr))
			}
		}

		save(seed_results, file=paste(results_dir, "seed_results", sep=''))

	} else {
		load(paste(results_dir, "seed_results", sep=''))
	}

	return(seed_results)
}