# Written by Kyle Morrison for analysis of Fc Array Data
library(doParallel)
library(caret)
# -------------------------------------------
# This function builds a random forest classification model using gafs
# -------------------------------------------
fc_gafs = function(fc, groups, stats_dir, folds=10, cor_cutoff=0.9, min_feat_scale=0.25, iters=50, popsize=50, elite=0, pmutation=0.2, pcrossover=0.8, do_permute=FALSE, do_cor=FALSE, do_penalize=FALSE){
# --------------------------------------------------
# Environment setup
# --------------------------------------------------
feat_sel_dir = paste(stats_dir, "gafs/", sep="")
dir.create(feat_sel_dir)
# My laptop has 6 physical cores
cl = makePSOCKcluster(5)
registerDoParallel(cl)
set.seed(1337)
# --------------------------------------------------
# Data prep, dealing with correlation and balancing
# --------------------------------------------------
local = fcdc_impute(fc, fccu_first_feat_col(fc))
if (do_permute){
feat_sel_dir = paste(stats_dir, "gafs/perm/", sep='')
dir.create(feat_sel_dir)
# Shuffle the class labels for permutation control
local[,"group"] = sample(local[,"group"])
cat("Shuffling the class labels\n")
}
if (do_cor){
local = fcff_remove_cor(local, cor_cutoff, "sample")
cat("Removing highly-correlated features (>", cor_cutoff, ")\n")
}
# TODO: STATE THAT LOCAL SHOULD BE SINGLE TIME POINT
fcs = fcmu_fcs_combs(local, groups, 2)
# Do balancing as necessary
group_counts = vector("numeric")
for (g in groups){
group_counts = c(group_counts, sum(local[,"group"] == g))
}
max_subjs = min(group_counts)
# This is not written to support >2 groups right now
min_idx = which(group_counts == max_subjs)
other_idx = which(group_counts != max_subjs)
# --------------------------------------------------
# Balance and/or holdout data
# --------------------------------------------------
if (group_counts[1] != group_counts[2]){
cat("Balancing the groups\n")
# Randomly drop the extra members of fcs[[other_idx]]
# For now, do nothing with the leftover data
balanced_data = rbind(fcs[[min_idx]], fcs[[other_idx]][sample(1:max_subjs, max_subjs),])
group = balanced_data[,"group"]
group = as.factor(group)
balanced_data = balanced_data[,-(1:2)]
# Run gafs on balanced data - should not need my own CV
ga = fcu_wrap_gafs(balanced_data, group, min_feat_scale, iters=iters, popsize=popsize, elite=elite, pmutation=pmutation, pcrossover=pcrossover, do_penalize)
print(ga)
final_feats = ga$ga$final
write(final_feats, file=paste(feat_sel_dir, "final_feats.txt", sep=''))
# --------------------------------------------------
# Use everything for building the model
# --------------------------------------------------
} else {
keep = 1:nrow(fcs[[other_idx]])
train = rbind(fcs[[min_idx]], fcs[[other_idx]][keep,])
group = train[,"group"]
group = as.factor(as.character(group))
train = train[,-1]
if (do_cor){
train = fcff_remove_cor(train, cor_cutoff, "sample")
cat("Removing highly-correlated features (>", cor_cutoff, ")\n")
}
# --------------------------------------------------
# Run gafs on each train, get score
ga = fc_gafs_wrap(train, group, min_feat_scale, iters=35, elite=0, pmutation=0.2, pcrossover=0.8, do_penalize)
print(ga)
final_feats = ga$ga$final
write(final_feats, file=paste(feat_sel_dir, "final_feats.txt", sep=''))
cat("All data used to train - couldn't do any testing\n")
}
}
# -------------------------------------------
# This is the wrapper interface to gafs() called by fc_gafs
# -------------------------------------------
fc_gafs_wrap = function(data, group, min_feat_scale, iters, popsize, elite, pmutation, pcrossover, do_penalize, functions=rfGA){
# Add penalties for feature number
if (do_penalize){
custom_functions = functions
custom_functions$fitness_intern = function(object, x, y, maximize, p){
# rfStats is model$rf$oob(object)
# rfStats(object)[1] gets either RMSE if regression or Accuracy if classification
# [2] gets either Rsquared if regression or Kappa if classification
acc = rfStats(object)[1]
d_acc = desirability::dMax(0,100)
# Fiddle with this until the final feature number settles to a place you are comfortable with
d_num_feats = desirability::dMin(1, p, min_feat_scale)
overall = desirability::dOverall(d_acc, d_num_feats)
des = predict(overall, data.frame(acc,ncol(x)))
return(c(D=des, Accuracy=as.vector(acc)))
}
ga_ctrl = gafsControl(functions=custom_functions, method="cv", number=10, verbose=TRUE, allowParallel=TRUE,
genParallel=TRUE, metric=c(internal="D", external="Accuracy"),
maximize=c(internal=TRUE, external=FALSE))
cat(paste("Maximizing Accuracy (0,100) and minimizing NumFeats (1,p,", min_feat_scale, ")\n", sep=''))
} else {
ga_ctrl = gafsControl(functions=functions, method="repeatedcv", repeats=10, number=10, verbose=TRUE, allowParallel=TRUE, genParallel=TRUE)
cat("Using default optimization\n")
}
ga = gafs(x=data, y=group, iters=iters, popSize=popsize, elite=elite, pmutation=pmutation, pcrossover=pcrossover, gafsControl=ga_ctrl)
return(ga)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.