R/method_helpers.R

Defines functions wrap_kmeans_bisect_internal wrap_boot generate_reliable_negatives_frame generate_reliable_negatives bisect_kmeans

Documented in bisect_kmeans generate_reliable_negatives generate_reliable_negatives_frame wrap_boot wrap_kmeans_bisect_internal

#' bisect_kmeans
#'
#' @export
bisect_kmeans = function(train_data, method_params){
	# Start with one cluster
	E = cbind(train_data, clusters=rep(1, nrow(train_data)))
	iter_max = 100

	while (length(unique(E[,"clusters"])) < method_params$k){
		# Identify largest cluster; if there's a tie, take the first
		clust_sums = sapply(unique(E[,"clusters"]), function(x, E){
			return(sum(E[,"clusters"] == x))
		}, E=E)
		large_clust = unique(E[,"clusters"])[which(clust_sums == max(clust_sums))[1]]

		# Split largest cluster into 2 clusters iter_max times and
		#	take the split with the highest similarity
		# If cluster to split is tiny, we're done
		sub_data = E[E[,"clusters"] == large_clust,]
		if (nrow(sub_data) <= 2){
			break
		}

		models = lapply(1:iter_max, wrap_kmeans_bisect_internal, sub_data)
		cand_clusts = lapply(models, function(x){ return(as.vector(x$cluster)) })
		cand_scores = sapply(models, function(x){ return(x$betweenss / x$totss) })

		# Select the candidate cluster with the best "score"
		# Create new labels for the clusters (instead of "1" and "2")
		final_clusts = cand_clusts[[which(cand_scores == max(cand_scores))[1]]]
		# Do them in this order so 1s that become 2s don't then become 3s, etc.
		final_clusts[final_clusts == 2] = max(E[,"clusters"]) + 2
		final_clusts[final_clusts == 1] = max(E[,"clusters"]) + 1
		E[,"clusters"][E[,"clusters"] == large_clust] = final_clusts
	}

	gc()
	return(E)
}

# Requires method_params$k for bisect_kmeans
# Optionally takes method_params$threshold for identifying clusters to add to RN
# Use bisect_kmeans to generate reliable negative data
# NOTE: if no reliable negatives are found, experiment with method_params$threshold
#' generate_reliable_negatives
#'
#' @export
generate_reliable_negatives = function(x, full_data, positive, method_params){
	# Iteratively define reliable negatives until stopping criteria, per Zhang, B. and Zuo, W. (2009)
	# 1. Run bisecting k-means to create k clusters from all of the Positive + Unlabeled data
	clustered = bisect_kmeans(x, method_params)

	# 2. foreach cluster:
	#	A. If the number of positives in the cluster is below the
	#		threshold, add it to the reliable negatives.
	reliable_negatives = lapply(fccu_plevels(clustered[,"clusters"]), generate_reliable_negatives_frame,
				    full_data, clustered, method_params$threshold)
	reliable_negatives = do.call(rbind, reliable_negatives)

	colnames(reliable_negatives)[colnames(reliable_negatives) == "clusters"] = "groups"
	reliable_negatives[,"groups"] = rep(globals_list$UninfNum, nrow(reliable_negatives))
	# NOTE: Ignore positives that get classified as negatives - not allowed!
	reliable_negatives = reliable_negatives[!(rownames(reliable_negatives) %in% rownames(positive)),]

	gc()
	return(reliable_negatives)
}

#' generate_reliable_negatives_frame
#'
#' @export
generate_reliable_negatives_frame = function(clust, full_data, clustered, threshold){
	pos_data = full_data[clustered[,"clusters"] == clust,]

	if (is.null(threshold)){
		warning("method_params$threshold is NULL, defaulting to 0", call.=TRUE)
		threshold = 0
	}

	# If fraction of positives is not larger than threshold...
	#	Written this way to accomodate threshold = 0.
	if ( !((sum(pos_data[,"groups"] == globals_list$InfNum) / nrow(pos_data)) > threshold) ){
		return(pos_data)
	}

	return(data.frame())
}

#' wrap_boot
#'
#' @export
wrap_boot = function(dummy, unlabeled, positive, k){
	boot_indices = sample(1:nrow(unlabeled), k, replace=TRUE)
	nonboot_indices = which(!(1:nrow(unlabeled) %in% boot_indices))

	booted = as.data.frame(unlabeled[boot_indices,])
	nonbooted = as.data.frame(unlabeled[nonboot_indices,])
	nonbooted = nonbooted[,!colnames(nonbooted) == "groups"]

	return(list(train=rbind(positive, booted), oob=nonbooted))
}

#' wrap_kmeans_bisect_internal
#'
#' @export
wrap_kmeans_bisect_internal = function(dummy, sub_data){
	return(kmeans(sub_data, centers=2, iter.max=nrow(sub_data), nstart=3))
}
kmorrisongr/ksmthesis documentation built on Oct. 5, 2020, 6:41 a.m.