#' bisect_kmeans
#'
#' Bisecting k-means: repeatedly split the largest cluster in two with k-means
#' until \code{method_params$k} clusters exist (or the largest remaining cluster
#' has two or fewer rows).
#'
#' @param train_data Numeric matrix or data frame of features to cluster.
#' @param method_params List; must contain \code{k}, the target number of clusters.
#' @return \code{train_data} with an appended \code{clusters} column of cluster labels.
#' @export
bisect_kmeans = function(train_data, method_params){
  # Start with one cluster
  E = cbind(train_data, clusters=rep(1, nrow(train_data)))
  iter_max = 100
  while (length(unique(E[,"clusters"])) < method_params$k){
    # Identify largest cluster; if there's a tie, take the first
    clust_sums = sapply(unique(E[,"clusters"]), function(x, E){
      return(sum(E[,"clusters"] == x))
    }, E=E)
    large_clust = unique(E[,"clusters"])[which(clust_sums == max(clust_sums))[1]]
    # Split the largest cluster into 2 with k-means, trying iter_max candidate
    # splits, and keep the split with the highest betweenss / totss ratio
    # If the cluster to split is tiny, we're done
    sub_data = E[E[,"clusters"] == large_clust,]
    if (nrow(sub_data) <= 2){
      break
    }
    models = lapply(1:iter_max, wrap_kmeans_bisect_internal, sub_data)
    cand_clusts = lapply(models, function(x){ return(as.vector(x$cluster)) })
    cand_scores = sapply(models, function(x){ return(x$betweenss / x$totss) })
    # Select the candidate clustering with the best score and create new labels
    # for its clusters (instead of "1" and "2")
    final_clusts = cand_clusts[[which(cand_scores == max(cand_scores))[1]]]
    # Do them in this order so 1s that become 2s don't then become 3s, etc.
    final_clusts[final_clusts == 2] = max(E[,"clusters"]) + 2
    final_clusts[final_clusts == 1] = max(E[,"clusters"]) + 1
    E[,"clusters"][E[,"clusters"] == large_clust] = final_clusts
  }
  gc()
  return(E)
}
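# A minimal usage sketch for bisect_kmeans, wrapped in `if (FALSE)` so it does not
# run when this file is sourced. The demo data and the value of k are hypothetical.
if (FALSE) {
  demo_data = data.frame(f1 = rnorm(100), f2 = rnorm(100))
  demo_clustered = bisect_kmeans(demo_data, method_params = list(k = 4))
  table(demo_clustered[, "clusters"])   # row counts per cluster label
}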
#' generate_reliable_negatives
#'
#' Use \code{bisect_kmeans} to generate reliable negative (RN) data: clusters whose
#' fraction of positives does not exceed \code{method_params$threshold} are added to
#' the reliable negatives. Requires \code{method_params$k} (passed to
#' \code{bisect_kmeans}) and optionally \code{method_params$threshold}.
#' NOTE: if no reliable negatives are found, experiment with \code{method_params$threshold}.
#'
#' @param x Feature matrix or data frame of the positive + unlabeled examples to cluster.
#' @param full_data The same rows as \code{x}, including a \code{groups} column.
#' @param positive Data frame of the known positives (row names matching \code{full_data}).
#' @param method_params List with \code{k} and, optionally, \code{threshold}.
#' @export
generate_reliable_negatives = function(x, full_data, positive, method_params){
  # Iteratively define reliable negatives until the stopping criteria are met,
  # per Zhang, B. and Zuo, W. (2009)
  # 1. Run bisecting k-means to create k clusters from all of the Positive + Unlabeled data
  clustered = bisect_kmeans(x, method_params)
  # 2. For each cluster:
  #    A. If the fraction of positives in the cluster does not exceed the
  #       threshold, add the whole cluster to the reliable negatives.
  reliable_negatives = lapply(fccu_plevels(clustered[,"clusters"]), generate_reliable_negatives_frame,
                              full_data, clustered, method_params$threshold)
  reliable_negatives = do.call(rbind, reliable_negatives)
  colnames(reliable_negatives)[colnames(reliable_negatives) == "clusters"] = "groups"
  reliable_negatives[,"groups"] = rep(globals_list$UninfNum, nrow(reliable_negatives))
  # NOTE: Ignore positives that get classified as negatives - not allowed!
  reliable_negatives = reliable_negatives[!(rownames(reliable_negatives) %in% rownames(positive)),]
  gc()
  return(reliable_negatives)
}
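# A minimal usage sketch for generate_reliable_negatives, wrapped in `if (FALSE)` so it
# does not run when this file is sourced. The demo objects, group codes, and parameter
# values are hypothetical; `globals_list` and `fccu_plevels` are assumed to be defined
# elsewhere in the package.
if (FALSE) {
  feats = data.frame(f1 = rnorm(100), f2 = rnorm(100))
  rownames(feats) = paste0("sample", 1:100)
  # "groups" marks the 20 known positives; 0 stands in for the unlabeled code here
  full = cbind(feats, groups = c(rep(globals_list$InfNum, 20), rep(0, 80)))
  pos = full[full[, "groups"] == globals_list$InfNum, ]
  rn = generate_reliable_negatives(feats, full, pos,
                                   method_params = list(k = 5, threshold = 0.1))
  nrow(rn)   # number of reliable negatives found
}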
#' generate_reliable_negatives_frame
#'
#' Helper for \code{generate_reliable_negatives}: returns all rows of \code{full_data}
#' in cluster \code{clust} if the cluster's fraction of positives does not exceed
#' \code{threshold}, otherwise an empty data frame.
#'
#' @export
generate_reliable_negatives_frame = function(clust, full_data, clustered, threshold){
  # All rows of full_data assigned to this cluster
  clust_data = full_data[clustered[,"clusters"] == clust,]
  if (is.null(threshold)){
    warning("method_params$threshold is NULL, defaulting to 0", call.=TRUE)
    threshold = 0
  }
  # If the fraction of positives is not larger than threshold, keep the whole cluster.
  # Written this way to accommodate threshold = 0.
  if ( !((sum(clust_data[,"groups"] == globals_list$InfNum) / nrow(clust_data)) > threshold) ){
    return(clust_data)
  }
  return(data.frame())
}
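# A small hypothetical check of the threshold comparison above: writing it as
# !(fraction > threshold) keeps zero-positive clusters even when threshold is 0.
if (FALSE) {
  frac = c(0, 0.02, 0.2)                 # fraction of positives in three example clusters
  sapply(frac, function(f) !(f > 0))     # TRUE FALSE FALSE -> only the 0-positive cluster kept
  sapply(frac, function(f) !(f > 0.1))   # TRUE TRUE FALSE  -> clusters at or below 10% kept
}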
#' wrap_boot
#'
#' Draw one bootstrap sample of size \code{k} (with replacement) from \code{unlabeled},
#' combine it with \code{positive} as training data, and return the unsampled
#' (out-of-bag) rows with the \code{groups} column removed.
#'
#' @export
wrap_boot = function(dummy, unlabeled, positive, k){
  boot_indices = sample(1:nrow(unlabeled), k, replace=TRUE)
  nonboot_indices = which(!(1:nrow(unlabeled) %in% boot_indices))
  booted = as.data.frame(unlabeled[boot_indices,])
  nonbooted = as.data.frame(unlabeled[nonboot_indices,])
  nonbooted = nonbooted[,!colnames(nonbooted) == "groups"]
  return(list(train=rbind(positive, booted), oob=nonbooted))
}
#' wrap_kmeans_bisect_internal
#'
#' Run one candidate 2-means split of \code{sub_data}; \code{dummy} only exists so
#' the function can be called from \code{lapply}.
#'
#' @export
wrap_kmeans_bisect_internal = function(dummy, sub_data){
  return(kmeans(sub_data, centers=2, iter.max=nrow(sub_data), nstart=3))
}