R/nfold.xval.R

nfold.xval = function(
### Perform n-fold cross-validation to generate a perf object.
measurements, 
### A set of selection table rows with the Call_vs_Noise column filled in, and
### with features as generated by generate.seewave.measures().
annotation="", 
### An optional annotation for the plots.
fold=10, 
### n, i.e. how many rounds of cross-validation to perform.
plot.ROC=TRUE, 
### Whether or not to plot the ROC curve for the cross-validation run.
color.plot=TRUE
### Whether or not the plot should be in color.
) {
  seewave.measures = c("Rugosity", "Crest_Factor", "Temporal_Entropy", "Shannon_Entropy", "Shannon_Entropy_Bandlimited", 
                       "Spectral_Flatness_Measure", "Spectral_Flatness_Measure_Bandlimited", "Spectrum_Roughness", 
                       "Spectrum_Roughness_Bandlimited", "Autocorrelation_Mean", "Autocorrelation_Median", 
                       "Autocorrelation_Standard_Error", "Dominant_Frequency_Mean", "Dominant_Frequency_Standard_Error", 
                       "Specprop_Mean", "Specprop_SD", "Specprop_SEM", "Specprop_Median", "Specprop_Mode", "Specprop_Q25", 
                       "Specprop_Q75", "Specprop_IQR", "Specprop_Cent", "Specprop_Skewness", "Specprop_Kurtosis", 
                       "Specprop_SFM", "Specprop_SH")
  require(ROCR)
  require(randomForest)
  my.xval = list()
  my.xval$predictions = list()
  my.xval$labels = list()
  my.sampsize = 0.99  # fraction of the training set sampled for each tree
  if (nchar(annotation) == 0) {
    annotation = paste("sampsize =", my.sampsize)
  }
  wrongest = c()
  for (i in 1:fold) {
    if(i %% 10 == 0) {
      cat(paste("Fold", i, "of", fold, "\n"))
    }
    # 70/30 train/test split; splitdf is defined elsewhere in the package
    # (see the hypothetical sketch after this function)
    splits = splitdf(measurements, weight=7/10)
    rf = randomForest(Call_vs_Noise ~ ., data=splits$trainset[, c(seewave.measures, "Call_vs_Noise", "Detector")], 
                      na.action=na.omit, sampsize = floor(my.sampsize * nrow(splits$trainset)))
    rf.pred = predict(rf, splits$testset, type="prob")
    truth.numeric = ifelse(splits$testset$Call_vs_Noise == "Call", 0, 1)
    # rf.pred[, 2] is the predicted probability of the second class, so
    # wrongness is the absolute error of that probability against the truth
    wrongness = abs(rf.pred[, 2] - truth.numeric)
    # I'll have to change this from using the Selection Number to the GUID for later deployments
    names(wrongness) = splits$testset$Selection
    wrongness = wrongness[sample.int(length(wrongness))]  # shuffle so ties are not broken by position in the bootstrap table
    wrongness = sort(wrongness, decreasing = TRUE)
    # keep the top 1/k of the wrongest events, for a random integer k in [1, 99]
    wrongness = wrongness[1:floor(length(wrongness) / floor(runif(1, 1, 100)))]
    wrongest = c(wrongest, names(wrongness))
    #print(table(wrongest))
    # accumulate this fold's scores and labels for a pooled ROCR prediction
    pred = prediction(rf.pred[, 2], splits$testset$Call_vs_Noise)
    my.xval$predictions[[i]] = pred@predictions[[1]]
    my.xval$labels[[i]] = pred@labels[[1]]
  }
  # pool all folds into a single ROCR prediction and compute the ROC curve
  pp = my.xval$predictions
  ll = my.xval$labels
  pred = prediction(pp, ll)
  perf = performance(pred, "tpr", "fpr")

  if (plot.ROC) {
    dev.new()  # was windows(); dev.new() also works on non-Windows platforms
    if (color.plot) {
      plot(perf, avg = "threshold", spread.estimate = "stderror",
           spread.scale = abs(qnorm(0.025, mean = 0, sd = 1)),  # scale stderr to a 95% confidence interval
           #show.spread.at = c(0.824),
           lwd = 2, main = paste("Threshold avg w/ 95% confidence intervals\n", annotation),
           colorize = TRUE)
    } else {
      annotation = ""  # the black-and-white plot omits the annotation from its title
      plot(perf, avg = "threshold", print.cutoffs.at = c(0.1, 0.3, 0.6, 0.9, 0.99, 0.999),
           cutoff.label.function = function(x) return(""),
           #show.spread.at = c(0.824),
           lwd = 2, main = paste("Receiver Operating Characteristic", annotation))
    }
  }
  ##value<< A list with elements
  list(perf, ##<< a perf object
  difficulty=wrongest) ##<< a vector of Selection identifiers for the events the classifier got most wrong in each fold; identifiers that recur across folds mark consistently difficult rows
  ##end<<
}
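
# ---------------------------------------------------------------------------
# nfold.xval() calls a splitdf() helper that is defined elsewhere in
# flightcallr. The sketch below is NOT the package's definition; it is a
# hypothetical reconstruction inferred from the call site above, assuming
# `weight` is the fraction of rows assigned to the training set.
splitdf = function(dataframe, weight = 0.5) {
  index = sample(nrow(dataframe), floor(weight * nrow(dataframe)))
  list(trainset = dataframe[index, ],   # rows drawn for training
       testset  = dataframe[-index, ])  # held-out rows for testing
}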
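
# A minimal usage sketch. `selections` is a hypothetical selection table that
# has already been labeled in Call_vs_Noise and run through
# generate.seewave.measures(), per the parameter docs above.
result = nfold.xval(selections, annotation = "example run", fold = 10)
perf = result[[1]]  # the ROCR perf object (TPR vs. FPR)
# Selections that recur in `difficulty` were repeatedly misclassified:
head(sort(table(result$difficulty), decreasing = TRUE))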
