## Copyright (c) 2004-2015, Ph. Grosjean <phgrosjean@sciviews.org>
##
## This file is part of ZooImage
##
## ZooImage is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 2 of the License, or
## (at your option) any later version.
##
## ZooImage is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with ZooImage. If not, see <http://www.gnu.org/licenses/>.
## TODO: disable next button when in last fraction
## TODO: save temporary and final results in the zidb file
## TODO: correct bugs with the back button
## Load libraries
#library(svMisc) #, lib.loc = "./Libraries/V3")
#library(svDialogs) #, lib.loc = "./Libraries/V3")
#library(zooimage) #, lib.loc = "./Libraries/V3")
##library(RANN) # Only function: nn2()... apparently not used here!
#library(randomForest)
#library(e1071)
#library(RWeka)
#library(C50)
#library(ipred)
#### Functions not used, but kept for possible future use ######################
## Area under a curve given as (x, y) points, by the trapezoidal rule
auc <- function (x, y) {
    n <- length(x)
    if (n != length(y) || n < 2)
        stop("'x' and 'y' must have same length >= 2")
    ## Mean height of each pair of consecutive y values times the x spacing
    heights <- (head(y, -1) + tail(y, -1)) / 2
    sum(diff(x) * heights)
}
#### Functions NOT used by errorCorrection(), but by examples ##################
## TODO: corrHist() and addPoints() are used together... make it one plot() method?
## TODO: make it an S3 object with print, summary and plot methods + predict + ...
## Compute (without drawing) the size spectrum histogram of the particles
## in 'subset' for the size variable 'sizeParam', with bins of width
## 'interval'. Returns 0 when the subset is empty.
## Can be used for error correction of biomass (perspectives)
corrHist <- function (data, subset, sizeParam, interval) {
    sizes <- data[subset, sizeParam]
    if (!length(sizes)) return(0)
    ## Pad the break points by two intervals on each side of the range
    brks <- seq(min(sizes) - (2 * interval), max(sizes) + (2 * interval),
        interval)
    hist(sizes, breaks = brks, plot = FALSE)
}
## Overlay the non-empty bins of a histogram as points on the current plot
## x: an object of class 'histogram'; ...: passed to points()
addPoints <- function (x, ...) {
    if (!inherits(x, "histogram"))
        stop("x must be an object of class 'histogram'")
    ## Only plot bins that actually contain items
    nonzero <- x$counts > 0
    points(x = x$mids[nonzero], y = x$counts[nonzero], ...)
}
## Residual error rate: estimated total errors minus the errors already
## caught among validated items, expressed as a fraction of all items
residError <- function (ErrorEstim, Error, Validated, na.rm = FALSE) {
    nObs <- length(Validated)
    ## When every item is validated, no residual error remains
    if (sum(Validated) == nObs) return(0)
    nErrVal <- sum(Error & Validated, na.rm = na.rm)
    nErrRes <- ErrorEstim * nObs - nErrVal
    ## Clamp at zero to avoid potential negative values
    max(nErrRes / nObs, 0)
}
## Evolution of the residual error along the iterations: at iteration i,
## only items whose Step is earlier than i count as validated
residErrorEvol <- function (ErrorEstimEvol, Error, Validated, Step,
na.rm = FALSE) {
    nIter <- length(ErrorEstimEvol)
    ## vapply over seq_len() naturally yields numeric(0) when nIter == 0
    vapply(seq_len(nIter), function (i)
        residError(ErrorEstim = ErrorEstimEvol[i], Error = Error,
            Validated = Validated & Step < i, na.rm = na.rm),
        numeric(1))
}
#### Functions used both by errorCorrection() and by examples ##################
## Compute the Bray-Curtis dissimilarity index on two vectors
## Note that this is faster than vegan::vegdist for two vectors
## and it remains faster for a vector versus a matrix. But for all pairs,
## use vegan::vegdist(x, method = "bray") instead
## x must be a reference vector; y may be a vector of the same length,
## or a matrix/data frame whose row count equals length(x) (one
## dissimilarity is then returned per column of y)
dissimilarity <- function (x, y, na.rm = FALSE) {
    if (is.data.frame(y)) y <- as.matrix(y)
    if (!is.numeric(x) || !is.numeric(y))
        stop("'x' and 'y' must be numeric")
    if (!is.null(dim(x)))
        stop("'x' must be a vector, not a matrix or an array")
    if (is.matrix(y)) {
        if (nrow(y) != length(x))
            stop("'y' must have same rows as the length of 'x'")
        ## Column-wise version: sum|x - y| / (sum(y) + sum(x)) per column
        colSums(abs(x - y), na.rm = na.rm) /
            (colSums(y, na.rm = na.rm) + sum(x, na.rm = na.rm))
    } else {
        if (!is.null(dim(y)) || length(y) != length(x))
            stop("'x' and 'y' must be two vectors of same length")
        ## Plain vector-versus-vector version
        sum(abs(x - y), na.rm = na.rm) /
            (sum(x, na.rm = na.rm) + sum(y, na.rm = na.rm))
    }
}
#### Functions used only by errorCorrection() ##################################
## Calculation of the global error based on validated items
## /!\ NOT based on items randomly selected --> Bad approximation
## Trusted items are under-represented in the validated subset (suspects
## are validated preferentially), so their errors are re-weighted before
## computing the overall error rate.
.globalError <- function (suspect, error, subset, na.rm = FALSE) {
    ## Totals of suspect/trusted items in the whole sample
    totSusp <- sum(suspect, na.rm = na.rm)
    totTrust <- sum(!suspect, na.rm = na.rm)
    ## Same split, restricted to the validated subset
    subSusp <- suspect[subset]
    subTrust <- !suspect[subset]
    nSusp <- sum(subSusp, na.rm = na.rm)
    nTrust <- sum(subTrust, na.rm = na.rm)
    ## Weight = ratio of the validation probabilities of both fractions
    wTrust <- (nSusp / totSusp) / (nTrust / totTrust)
    if (is.na(wTrust) || !is.finite(wTrust)) wTrust <- 1
    subErr <- error[subset]
    wErrors <- sum(subErr[subSusp], na.rm = na.rm) +
        wTrust * sum(subErr[subTrust], na.rm = na.rm)
    wErrors / (nSusp + wTrust * nTrust)
}
## Error statistics in each of the four fractions obtained by crossing
## suspect/trusted with validated/not validated items.
## suspect, error, validated: logical vectors, one entry per item
## iteration: logical vector flagging items processed at the current step,
##   or NA (default) to compute statistics on the global sample only
## Returns a one-row data frame of error rates, counts, and the weight
## compensating the under-sampling of trusted items.
.errorInFractions <- function (suspect, error, validated, iteration = NA,
na.rm = FALSE) {
## The four fractions (logical indexes)
suspValIdx <- suspect & validated
suspNotValIdx <- suspect & !validated
trustValIdx <- !suspect & validated
trustNotValIdx <- !suspect & !validated
## Error flags restricted to each fraction
errSuspValIdx <- error[suspValIdx]
errSuspNotValIdx <- error[suspNotValIdx]
errTrustValIdx <- error[trustValIdx]
errTrustNotValIdx <- error[trustNotValIdx]
## Number of items in each fraction
nSuspVal <- sum(suspValIdx, na.rm = na.rm)
nSuspNotVal <- sum(suspNotValIdx, na.rm = na.rm)
nSuspTot <- nSuspVal + nSuspNotVal
nTrustVal <- sum(trustValIdx, na.rm = na.rm)
nTrustNotVal <- sum(trustNotValIdx, na.rm = na.rm)
nTrustTot <- nTrustVal + nTrustNotVal
## Number of errors in each fraction
nErrSuspVal <- sum(errSuspValIdx, na.rm = na.rm)
nErrSuspNotVal <- sum(errSuspNotValIdx, na.rm = na.rm)
nErrSuspTot <- nErrSuspVal + nErrSuspNotVal
nErrTrustVal <- sum(errTrustValIdx, na.rm = na.rm)
nErrTrustNotVal <- sum(errTrustNotValIdx, na.rm = na.rm)
nErrTrustTot <- nErrTrustVal + nErrTrustNotVal
## Expected number of errors in the validated fraction if the error
## were randomly distributed over each suspect/trusted fraction
nErrSuspValRd <- nErrSuspTot / nSuspTot * nSuspVal
nErrTrustValRd <- nErrTrustTot / nTrustTot * nTrustVal
## Error rate in each fraction (NaN when a fraction is empty)
errSuspVal <- nErrSuspVal / nSuspVal
errSuspNotVal <- nErrSuspNotVal / nSuspNotVal
errTrustVal <- nErrTrustVal / nTrustVal
errTrustNotVal <- nErrTrustNotVal / nTrustNotVal
## Weight for trusted items: ratio of validation probabilities
probaSusp <- nSuspVal / nSuspTot
probaTrust <- nTrustVal / nTrustTot
weightTrust <- probaSusp / probaTrust
## Results: one-row data frame
if (!all(is.na(iteration))) {
## Also report the error in the fraction validated at current iteration
suspStepIdx <- suspect & iteration
trustStepIdx <- !suspect & iteration
errSuspStepIdx <- error[suspStepIdx]
errTrustStepIdx <- error[trustStepIdx]
nSuspStep <- sum(suspStepIdx, na.rm = na.rm)
nTrustStep <- sum(trustStepIdx, na.rm = na.rm)
nErrSuspStep <- sum(errSuspStepIdx, na.rm = na.rm)
nErrTrustStep <- sum(errTrustStepIdx, na.rm = na.rm)
errSuspStep <- nErrSuspStep / nSuspStep
errTrustStep <- nErrTrustStep / nTrustStep
res <- data.frame(errSuspVal = errSuspVal, errTrustVal = errTrustVal,
errSuspNotVal = errSuspNotVal, errTrustNotVal = errTrustNotVal,
errSuspStep = errSuspStep, errTrustStep = errTrustStep,
nErrSuspTot = nErrSuspTot, nErrTrustTot = nErrTrustTot,
nErrSuspVal = nErrSuspVal, nErrTrustVal = nErrTrustVal,
nErrSuspStep = nErrSuspStep, nErrTrustStep = nErrTrustStep,
nErrSuspValRd = nErrSuspValRd, nErrTrustValRd = nErrTrustValRd,
nErrSuspNotVal = nErrSuspNotVal, nErrTrustNotVal = nErrTrustNotVal,
nSuspTot = nSuspTot, nTrustTot = nTrustTot, nSuspVal = nSuspVal,
nTrustVal = nTrustVal, nSuspStep = nSuspStep,
nTrustStep = nTrustStep, nSuspNotVal = nSuspNotVal,
nTrustNotVal = nTrustNotVal, weightTrust = weightTrust)
} else {
## Calculation of error in global sample (no per-iteration columns)
res <- data.frame(errSuspVal = errSuspVal, errTrustVal = errTrustVal,
errSuspNotVal = errSuspNotVal, errTrustNotVal = errTrustNotVal,
nErrSuspTot = nErrSuspTot, nErrTrustTot = nErrTrustTot,
nErrSuspVal = nErrSuspVal, nErrTrustVal = nErrTrustVal,
nErrSuspValRd = nErrSuspValRd, nErrTrustValRd = nErrTrustValRd,
nErrSuspNotVal = nErrSuspNotVal, nErrTrustNotVal = nErrTrustNotVal,
nSuspTot = nSuspTot, nTrustTot = nTrustTot, nSuspVal = nSuspVal,
nTrustVal = nTrustVal, nSuspNotVal = nSuspNotVal,
nTrustNotVal = nTrustNotVal, weightTrust = weightTrust)
}
res
}
## Compute per-item probabilities that will be used for suspect detection
## data: particle measurements; predicted: predicted classes
## classifier: a randomForest-based classifier (possibly wrapped in ZIClass)
## diff.max: maximal tolerated difference between first and second
##   membership probabilities; prop.bio: optional named vector of
##   biological proportions per class
## Returns a data frame of metrics with an "IdentParam" attribute keeping
## the parameters and the full membership probability table.
classProba <- function (data, predicted = data$Predicted, classifier,
diff.max = 0.2, prop.bio = NULL) {
## Get first and second identification probabilities
## TODO: only works for randomForest and does not use mlearning!
if (inherits(classifier, "randomForest")) {
if (inherits(classifier, "ZIClass")) {
data <- attr(classifier, "calc.vars")(data)
}
## Drop the leading class so predict() dispatches on randomForest
class(classifier) <- class(classifier)[-1]
prob <- predict(classifier, newdata = data, class.only = FALSE,
type = "membership")
} else stop("Suspect detection not yet implemented for this algorithm")
## Two highest membership probabilities for each item
max.ids <- apply(prob, 1, sort, decreasing = TRUE)[1:2, ]
first <- max.ids[1, ]
second <- max.ids[2, ]
## 1) Coefficient quantifying quality of identification (0 = ambiguous,
## capped at sqrt(first) when first - second >= first * diff.max)
identCoef <- sqrt(first * pmin((first - second) / (first * diff.max), 1))
## 2) Probability if identification in that group
#first
## 3) Difference between first and second probabilities
probaDiff <- first - second
## 4) Proportion of the group (rares groups tend to have more FP!)
predTable <- table(predicted)
prop <- predTable / sum(predTable)
gpProp <- prop[as.character(predicted)]
# 5) ProbaBio modulation?
if (is.null(prop.bio)) {
proba <- data.frame(IdentCoef = identCoef, TreeProp = first,
ProbaDiff = probaDiff, GpProp = gpProp)
} else {
bioFact <- prop.bio[as.character(predicted)]
proba <- data.frame(IdentCoef = identCoef, TreeProp = first,
ProbaDiff = probaDiff, GpProp = gpProp, BioFact = bioFact)
}
## Keep the identification parameters for later reuse (e.g. secondIdent)
attr(proba, "IdentParam") <- list(DiffMax = diff.max,
ProbaBio = prop.bio, ProbaTable = prob)
proba
}
## Compute the proportion of false positives according to Bayes theorem
## proba: data frame of per-item metrics (see classProba())
## predicted: factor of predicted classes; confusion: a confusion matrix
## (mlearning::confusion object); nobs: total number of observations
## Returns 'proba' with columns FP (estimated false-positive proportion of
## each item's predicted class) and GpFPDiff (group proportion minus FP).
corrProba <- function (proba, predicted, confusion, nobs) {
    ## Per-class recall and specificity from the confusion matrix
    stats <- summary(confusion, type = c("Recall", "Specificity"))
    table.pred <- table(predicted)
    prop <- table.pred / nobs
    ## Named accumulator, one slot per predicted class (the original grew
    ## an unnamed vector by name-assignment, leaving dead zero entries)
    fpprop <- setNames(numeric(length(table.pred)), names(table.pred))
    for (name in names(table.pred)) {
        prop.a <- prop[name]        # P(A): prior proportion of the class
        prop.b <- 1 - prop.a        # P(not A)
        recall.a <- stats[name, ]$Recall
        fprate <- 1 - stats[name, ]$Specificity
        b.in.a <- fprate * prop.b   # P(predicted A | not A) * P(not A)
        a.in.a <- recall.a * prop.a # P(predicted A | A) * P(A)
        fpprop[name] <- b.in.a / (b.in.a + a.in.a)
        ## 0/0 -> NaN (or NA recall/specificity): no information for this
        ## class, be conservative and assume everything is a false positive
        ## (was 'fpprop[name] == "NaN"', a numeric-vs-string comparison
        ## that errored on true NA values)
        if (is.na(fpprop[name])) fpprop[name] <- 1
    }
    proba$FP <- fpprop[as.character(predicted)]
    ## Compare this proportion to group proportion
    ## PhG: If proba$GpProp is not defined, I got an error here...
    ## ... so, try to recalculate it now in this case
    if (is.null(proba$GpProp)) {
        predTable <- table(predicted)
        prop <- predTable / sum(predTable)
        proba$GpProp <- prop[as.character(predicted)]
    }
    proba$GpFPDiff <- proba$GpProp - proba$FP
    proba
}
## Detect suspect particles using different algorithms
## data: particle measurements; proba: metrics from classProba()
## error: factor with levels c("Correct", "Error"); subset: the validated
## items used to train the detector; predicted: predicted classes
## confusion: confusion matrix used to update 'proba'
## Returns a factor with levels of 'error' ("Error" = suspect) per item.
suspect <- function (data, proba, error, subset, predicted, confusion,
algorithm = "rf", knn.k = 3, svm.kernel = "polynomial", svm.degree = 5, ...) {
    algorithm <- match.arg(algorithm,
        c("rf", "knn", "svm", "bayes", "c50", "glm"))
    ## In the improbable case we have no error at all, consider everyone as
    ## suspect... (fixed: was 'nrow(set)', but 'set' is only defined below,
    ## so this branch raised "object 'set' not found")
    if (all(as.character(error) != "Error"))
        return(factor(rep("Error", nrow(data)), levels = levels(error)))
    ## Update proba with latest calculations, according to a corrected
    ## confusion matrix
    proba <- corrProba(proba, predicted, confusion, nrow(data))
    set <- cbind(data, proba, Error = error)
    ## TODO: mlearning is there for complete interchangeability of algorithms
    ## thus, we should not need this switch() construct!
    susp <- switch(algorithm,
        rf = {
            ## Random forest trained on the validated subset; the training
            ## items themselves get out-of-bag predictions
            rf.model <- mlearning(formula = Error ~ ., data = set,
                subset = subset, method = "mlRforest", ...)
            res <- predict(rf.model, newdata = set, type = "class")
            res[subset] <- predict(rf.model, type = "class", method = "oob")
            res
        },
        knn = {
            # set <- subset(set, select = -c(Error))
            # warning("Prediction of training set by 'knn' method without cross validation")
            # susp <- knn(train = set[subset, ], test = set,
            #     cl = corr$Error[subset], k = knn.k, ...)
            stop("Not implemented yet!")
        },
        svm = {
            # svm.model <- mlearning(Error ~ ., data = set,
            #     subset = subset, kernel = svm.kernel, degree = svm.degree,
            #     method = "mlSvm", ...)
            # susp <- predict(svm.model, newdata = set, type = "class")
            # susp[subset] <- predict(svm.model, type = "class", method = "cv")
            stop("Not implemented yet!")
        },
        bayes = {
            # nb.model <- mlearning(Error ~ ., data = set,
            #     subset = subset, method = "mlNaiveBayes", ...)
            # susp <- predict(nb.model, newdata = set, type = "class")
            # susp[subset] <- predict(nb.model, type = "class", method = "cv")
            stop("Not implemented yet!")
        },
        c50 = {
            # C50.train <- set[subset, ]
            # c50.model <- C5.0(formula = Error ~ ., data = C50.train, ...)
            # susp <- predict(c50.model, newdata = set, type = "class")
            # susp[subset] <- cv(y = C50.train$Error, formula = Error ~ .,
            #     data = C50.train, model = C5.0, predict =
            #     function(object, newdata) predict(object, newdata,
            #     type = "class"), k = 10, predictions = TRUE)$predictions
            stop("Not implemented yet!")
        },
        glm = {
            # glm.model <- Logistic(Error ~ ., data = set, subset = subset, ...)
            # warning("Prediction of training set by 'glm' method without cross-validation")
            # susp <- predict(glm.model, newdata = set, type = "class")
            stop("Not implemented yet!")
        },
        stop("Unknown algorithm '", algorithm, "'")
    )
    susp
}
## Main function for error correction
## Replace data by ZIobj and put zidb in first position... or group both items
## and how to retrieve class then???
## Group options in a single object too
errorCorrection <- function (data, classifier, mode = "validation",
fraction = 0.05, sample.min = 100, sample.max = 200, grp.min = 2,
random.sample = 0.1, algorithm = "rf", diff.max = 0.2, prop.bio = NULL,
rare = 0.01, zidb = NULL, testdir = NULL, id = NULL,
result = ".last_valid", envir = parent.frame()) {
#### Parameters explanations
#### Data and classifier
## data -- the dataset to study
if (missing(data) || !inherits(data, "ZIDat"))
stop("data must be a ZIdat object")
## Temporary hack to eliminate possible unwanted columns!
data$Id.1 <- NULL
data$X.Item.1 <- NULL
## Make sure items are sorted alphabetically according to their id
if (is.null(data$Id)) {
data <- data[order(paste(data$Label, data$Item, sep = "_")), ]
} else data <- data[order(data$Id), ]
## classifier -- the classifier to use to classify particles
if (missing(classifier) || !inherits(classifier, "ZIClass"))
stop("classifier must be a ZIClass object")
####### calc.vars <- attr(classifier, "calc.vars")
#### General parameters of the iterative correction
## mode -- the mode (validation, stat or demo)
mode <- match.arg(mode, c("validation", "demo", "stat"))
if (mode != "validation" & is.null(data$Class))
stop("data requires a 'Class' column in this mode")
## fraction -- fraction of sample to validate
fraction <- as.numeric(fraction)[1]
if (fraction < 0.01 || fraction > 1)
stop("fraction must be between 0.01 and 1")
## sample.min -- minimum number of particles to validate at each step
sample.min <- as.integer(sample.min)[1]
if (sample.min < 1)
stop("sample.min must be a positive integer")
## sample.max -- maximum number of particles to validate at each step
sample.max <- as.integer(sample.max)[1]
if (sample.max < 1)
stop("sample.max must be a positive integer")
if (sample.max < sample.min)
stop("sample.max must be higher or equal than sample.min")
## grp.min -- minimum number of particles of each group to validate
grp.min <- as.integer(grp.min)[1]
if (grp.min < 1 || grp.min > sample.min)
stop("grp.min must be a positive integer and cannot be larger than sample.min")
## random.sample -- fraction of random sample in the validation set
random.sample <- as.numeric(random.sample)[1]
if (random.sample < 0 || random.sample > 1)
stop("random.sample must be a number between 0 and 1")
#### Parameters for the detection of suspects
## algorithm -- algorithm used to detect suspect particles
## diff.max -- maximum tolerated difference in probabilities for class identification
diff.max <- as.numeric(diff.max)[1]
if (diff.max < 0)
stop("diff.max must be a positive number or zero")
## proba.bio -- groups probabilities, using biological information
if (length(prop.bio) && (!is.numeric(prop.bio) || is.null(names(prop.bio))))
stop("prop.bio must be a named vector (groups) of numbers")
## rare -- detection of rare items
rare <- as.numeric(rare)[1]
if (rare < 0 || rare > 0.2)
stop("rare must be between 0 and 0.2")
## zidb -- path to the zidbfile to manually validate
## testdir -- path of the directory used for manual validation
## Variables
proba <- NULL # identification probabilities (additional vars for suspect detection)
corr <- NULL # main informations about the correction
validated <- NULL # validated class of the particles
validation.data <- NULL # Data send as validation of a given step
predicted <- NULL # predictions at the beginning
step <- -1 # current iteration
sample.size <- NULL # size of the next subsample to validate
## TODO: eliminate this and get it from table(corr$Step)
validated.fracs <- 0 # fraction of items validated at each step
## Manual validation variables
ntrusted <- NULL # number of trusted particles detected
nsuspect <- NULL # number of suspect particles detected
ntrusted.tovalid <- NULL # number of trusted particles to validate
nsuspect.tovalid <- NULL # number of suspect particles to validate
testset <- NULL # subsample to validate
## Validation of particles TODO: get rid of these two flags!
step.manual <- FALSE # should the user validate particles?
testset.validated <- FALSE # is the testset validated?
## Plankton sorter variables
sorter.title <- NULL # title of the plankton sorter page
sorter.id <- NULL # identifier of the plankton sorter page
## Results
manual.history <- NULL # history of the manual confusion matrix
manual2.history <- NULL # history of manual + 2nd ident confusion matrix
corr.confusion <- NULL # corrected confusion matrix
classRare <- NULL # String with the list of classes considered as rare
cell.confusion <- NULL # corrected confusion matrix for cells
bioweight.confusion <- NULL # corrected confusion matrix for bioweight
correction.history <- NULL # history of the correction confusion matrix
correctionCell.history <- NULL # history of the correction confusion matrix for cells
correctionBio.history <- NULL # history of the correction confusion matrix for bioweight
error.estim.data <- NULL # data used to estimate the error
error.estim <- NULL # history of the error estimation
error.estim.history <- NULL # evolution of the error estimation
error.estim.rd <- NULL # error using validated, randomly selected items
error.estim.rd.history <- NULL # evolution of the error estimation
suspect.history <- list() # history of suspect detection
validated.history <- list() # history of validated particles
error.valid.history <- list() # history of error among validated items
error.suspect.history <- list() # history of error among suspect items
error.trusted.history <- list() # history of error among suspect items
nsuspect.history <- NULL # history of the number of suspect items
ntrusted.history <- NULL # history of the number of trusted items
nsuspect.tovalid.history <- NULL # history of suspect validation
ntrusted.tovalid.history <- NULL # history of trusted validation
errInFract.history <- data.frame() # evolution of the error in each fraction
## Initialize the data for error correction (data, proba, corr and others)
## Side effects (via <<-): prepares testdir, sorts data, computes predicted,
## validated, proba, corr, classRare, correction/manual histories, and
## increments step from -1 to 0, then sets the first sample size.
initialize <- function () {
## Check that this directory exists and is empty
if (mode != "stat") {
if (is.null(testdir))
testdir <<- file.path(tempdir(),
noExtension(zidb))
if (file.exists(testdir)) {
## Ask before erasing a pre-existing validation directory
res <- dlgMessage(paste("Temporary validation directory already",
"exists. Do we erase old data there?"), type = "okcancel")$res
if (res == "cancel")
stop("testdir (", testdir, ") must not exist yet!")
unlink(testdir, recursive = TRUE)
}
dir.create(testdir, recursive = TRUE)
if (!file.exists(testdir))
stop("cannot create 'testdir'!")
testdir <<- normalizePath(testdir)
## Put required files there: create the planktonSorter directory
plSort <- file.path(testdir, "planktonSorter")
dir.create(plSort)
plFiles <- dir(system.file("planktonSorter", package = "zooimage"),
full.names = TRUE)
res <- file.copy(plFiles, file.path(plSort, basename(plFiles)))
if (any(!res))
stop("Problem when copying one or more plankton sorter files")
}
## In the other modes, indicate that we prepare the validation environment
cat("Preparing a validation environment...\n")
## Be sure that data is correctly ordered
data <<- data[order(data$Item), ]
## Be sure 'Id' exists in data
data$Id <- makeId(data)
## Make sure items' membership is predicted by the classifier
predicted <- data$Predicted
## TODO: shouldn't we do this all the time???
if (is.null(predicted)) # Predict class if it wasn't done yet
predicted <- predict(classifier, data, class.only = TRUE,
type = "class")
predicted <<- predicted
# if (mode != "validation") {
## Store validated items and table of validated items
validated <<- data$Class
## TODO: why this, and Class put in validated?
data$Class <- NULL
data <<- data
# }
## Keep all data
### data <<- attr(classifier, "calc.vars")(data)
### ## Do not keep uncomplete attributes
### data <<- data[, sapply(data, FUN = function(x) !any(is.na(x)))]
proba <<- classProba(data, predicted, classifier, diff.max = diff.max,
prop.bio = prop.bio)
nobs <- nrow(data)
## Initially, every item is flagged as a (potential) error
error <- factor(rep("Error", nobs), levels = c("Correct", "Error"))
## Compute the second possible identification (class with the second
## highest membership probability; falls back to the first when the
## top probability is exactly 1)
secondIdent <- function (groups) {
proba.table <- attr(proba, "IdentParam")$ProbaTable
proba.order <- apply(proba.table, 1, order, decreasing = TRUE)
fst.ids <- proba.order[1, ]
scd.ids <- proba.order[2, ]
proba.max <- apply(proba.table, 1, sort, decreasing = TRUE)[1, ]
max.ids <- proba.max == 1
scd.ids[max.ids] <- fst.ids[max.ids]
factor(groups[scd.ids], levels = groups)
}
predicted2 <- secondIdent(levels(predicted))
predTable <- table(predicted)
prop <- predTable / sum(predTable)
## Classes whose proportion is below 'rare' are flagged as rare
classRare <<- names(which(prop < rare))
## Creation of corr object (one row per item, bookkeeping for the
## whole iterative correction process)
corr <<- data.frame(Actual = predicted, Actual2 = predicted2,
Predicted = predicted, Predicted2 = predicted2, Validated = FALSE,
Error = error, Step = step, Suspect = rep(TRUE, nobs),
Rare = predicted %in% classRare, RdValidated = rep(Inf, nobs),
OtherGp = rep(FALSE, nobs))
## Confusion matrix of original classifier
train.validated <- attr(classifier, "response")
train.predicted <- attr(classifier, "cvpredict")
train.conf <- confusion(x = train.predicted, y = train.validated)
## First error correction: we start with Solow et al. estimation
## Compute correction of abundances using a method similar to Solow et al.,
## but that is not stuck with singular matrices problems
## TODO: this takes a long time => try Solow et al first before using this!
correctionLab <- function (x, conf) {
l <- length(x)
if (l != ncol(conf))
stop("'x' has not the same length than 'conf'")
toMat <- function(x, byrow = TRUE)
matrix(rep(x, l), ncol = l, byrow = byrow)
predMat <- toMat(colSums(conf))
xMat <- toMat(x)
## Replace zeros by ones to avoid division by zero
notZero <- function (x) {
if (0 %in% x) x[x==0] <- 1
x
}
confB <- conf / notZero(predMat) * xMat
x2 <- rowSums(confB)
classMat <- toMat(rowSums(conf), byrow = FALSE)
## Iterate until the corrected abundances converge
while (!isTRUE(all.equal(x, x2, tolerance = 0.00001))) {
x <- x2
confA <- conf / notZero(classMat) * toMat(x2, byrow = FALSE)
xMat2 <- toMat(colSums(confA))
confB <- confA / notZero(xMat2) * xMat
x2 <- rowSums(confB)
}
round(x2)
}
tablePredicted <- table(predicted)
correction.history <<- correctionLab(x = tablePredicted,
conf = train.conf)
manual.history <<- tablePredicted
manual2.history <<- manual.history
## Increment step (should be 0 now, because initial value is -1)
step <<- step + 1
## Determine the number of vignettes to manually validate
setSampleSize()
}
## Compute the size of the next subsample to validate: update sample.size
setSampleSize <- function () {
    ## Desired size: a fixed fraction of the whole sample...
    wanted <- ceiling(nrow(data) * fraction)
    ## ...but at least sample.min, and at least grp.min items per class
    ## of the training set
    wanted <- max(wanted, sample.min, grp.min * length(levels(predicted)))
    ## ...capped at sample.max
    wanted <- min(wanted, sample.max)
    ## ...and never more than the number of items left to validate
    sample.size <<- min(wanted, sum(!corr$Validated))
}
## Determine the subsample to validate
## Update Step and RdValidated (used for error estimation)
## Automatically place vignettes in directories for manual validation
## Determine the subsample to validate at the current step, update
## corr$Step and corr$RdValidated, and (outside "stat" mode) extract the
## corresponding vignettes and build the plankton sorter HTML page.
prepareTest <- function () {
nobs <- nrow(data)
if (step < 1) {
## At first time, take a random subsample
## Same as considering everything as suspect
#PhG nsuspect <<- nobs
#PhG ntrusted <<- 0
sample.ids <- sample(1:nobs, size = sample.size)
corr$Step[sample.ids] <<- step
corr$RdValidated[sample.ids] <<- step
nsuspect.tovalid.history <<- c(nsuspect.tovalid.history, sample.size)
ntrusted.tovalid.history <<- c(ntrusted.tovalid.history, 0)
nsuspect.history <<- c(nsuspect.history, nobs)
ntrusted.history <<- c(ntrusted.history, 0)
} else { # Step > 0
## Mix trusted and suspect particles
#validated <- corr[corr$Validated, ]
notvalidated <- corr[!corr$Validated, ]
nsuspect <<- sum(notvalidated$Suspect) #nrow(notvalidated[notvalidated$Suspect, ])
ntrusted <<- nrow(notvalidated) - nsuspect #nrow(notvalidated[!notvalidated$Suspect, ])
## Determine the number of random items used in error estimation
numRandom <- max(sample.size - nsuspect,
round(random.sample * sample.size))
ids <- as.character(1:nobs)
## All items not included in RdValidated (RdValidated == Inf)
tosample.ids <- ids[is.infinite(corr$RdValidated)]
## Items to validate
randomsample.ids <- as.numeric(sample(tosample.ids,
size = min(numRandom, length(tosample.ids))))
## Used to estimate error at step i
corr$RdValidated[randomsample.ids] <<- step
newstep <- corr$Step[randomsample.ids]
## Except those already validated and used for error estimation
newstep[newstep == -1] <- step
corr$Step[randomsample.ids] <<- newstep
notvalid.ids <- ids[!corr$Validated & corr$RdValidated == step]
## Number of items to valid in order to achieve sample.size
numSample <- sample.size - length(notvalid.ids)
if (numSample > 0) {
## Randomly select suspect items not validated
suspnotval.ids <- ids[!corr$Validated & corr$Suspect &
is.infinite(corr$RdValidated) & corr$Step == -1]
if (length(suspnotval.ids)) {
suspsample.ids <- as.numeric(sample(suspnotval.ids,
size = min(numSample, length(suspnotval.ids))))
corr$Step[suspsample.ids] <<- step
numSample <- numSample - length(suspsample.ids)
}
if (numSample > 0) {
## Randomly select trusted items not validated
trustnotval.ids <- ids[!corr$Validated & !corr$Suspect &
is.infinite(corr$RdValidated) & corr$Step == -1]
if (length(trustnotval.ids)) {
trustsample.ids <- as.numeric(sample(trustnotval.ids,
size = min(numSample, length(trustnotval.ids))))
corr$Step[trustsample.ids] <<- step
}
}
}
############### stratified random sampling ###############
# if (numSample > 0) {
# ## Select the same number of suspect items not validated in each class
# suspnotval.ids <- ids[!corr$Validated & corr$Suspect &
# is.infinite(corr$RdValidated) & corr$Step == -1]
# if (length(suspnotval.ids)) {
# splitGp <- split(suspnotval.ids, list(corr[suspnotval.ids,]$Predicted))
# strat.samples <- lapply(splitGp, function(x) x[sample(1:NROW(x),
# min(NROW(x), round(numSample/length(unique(corr$Predicted[as.numeric(suspnotval.ids)])))),
# replace = FALSE)])
# suspsample.ids <- as.numeric(do.call(c, strat.samples))
# corr$Step[suspsample.ids] <<- step
# numSample <- numSample - length(suspsample.ids)
# }
#
# if (numSample > 0) {
# ## If not completed, randomly select suspects items not validated
# suspnotval.ids <- ids[!corr$Validated & corr$Suspect &
# is.infinite(corr$RdValidated) & corr$Step == -1]
# if (length(suspnotval.ids)) {
# suspsample.ids <- as.numeric(sample(suspnotval.ids,
# size = min(numSample, length(suspnotval.ids))))
# corr$Step[suspsample.ids] <<- step
# numSample <- numSample - length(suspsample.ids)
# }
# }
#
# if (numSample > 0) {
# ## If not completed, Select the same number of trusted items not validated in each class
# trustnotval.ids <- ids[!corr$Validated & !corr$Suspect &
# is.infinite(corr$RdValidated) & corr$Step == -1]
# if (length(trustnotval.ids)) {
# splitGp <- split(trustnotval.ids, list(corr[trustnotval.ids,]$Predicted))
# strat.samples <- lapply(splitGp, function(x) x[sample(1:NROW(x),
# min(NROW(x), round(numSample/length(unique(corr$Predicted[as.numeric(trustnotval.ids)])))),
# replace = FALSE)])
# trustsample.ids <- as.numeric(do.call(c, strat.samples))
# corr$Step[trustsample.ids] <<- step
# numSample <- numSample - length(trustsample.ids)
# }
# }
#
# if (numSample > 0) {
# ## If not completed, randomly select trusted items not validated
# trustnotval.ids <- ids[!corr$Validated & !corr$Suspect &
# is.infinite(corr$RdValidated) & corr$Step == -1]
# if (length(trustnotval.ids)) {
# trustsample.ids <- as.numeric(sample(trustnotval.ids,
# size = min(numSample, length(trustnotval.ids))))
# corr$Step[trustsample.ids] <<- step
# numSample <- numSample - length(trustsample.ids)
# }
# }
############### ############### ###############
## NOTE(review): local '<-' here, so the closure-level nsuspect.tovalid
## and ntrusted.tovalid declared at the top of errorCorrection() are NOT
## updated by this branch -- confirm this is intended
nsuspect.tovalid <- length(ids[corr$Step == step & corr$Suspect])
ntrusted.tovalid <- length(ids[corr$Step == step & !corr$Suspect])
nsuspect.history <<- c(nsuspect.history, nsuspect)
ntrusted.history <<- c(ntrusted.history, ntrusted)
nsuspect.tovalid.history <<- c(nsuspect.tovalid.history, nsuspect.tovalid)
ntrusted.tovalid.history <<- c(ntrusted.tovalid.history, ntrusted.tovalid)
}
if (mode != "stat") {
## Make sure the R Httpd server is started
tools <- getNamespace("tools")
if (R.Version()$`svn rev` >= 67550) {
port <- tools::startDynamicHelp(NA)
} else {
port <- tools$httpdPort
}
if (port == 0) port <- startDynamicHelp(TRUE)
if (port == 0) stop("Impossible to start the R httpd server")
subdir <- paste0("step", step + 1) # Because it start at 0,
## but this is really the beginning of next step now
stepdir <- file.path(testdir, subdir)
dir.create(stepdir)
Zidb <- zidbLink(zidb)
## Write data in test directory
keepRows <- corr$Step == step
Vigns <- data[keepRows, "Id"]
imgext <- Zidb[[".ImageType"]]
## Extract all these images to stepdir
vigpath <- file.path(stepdir, paste(Vigns, imgext, sep = "."))
names(vigpath) <- Vigns
if (length(Vigns))
for (j in 1:length(Vigns))
writeBin(Zidb[[Vigns[j]]], vigpath[j])
## Create all directories of groups
path <- attr(classifier, "path")
names(path) <- basename(path)
## Create planktonSorter.html file
vigfiles <- basename(vigpath)
pred <- as.character(corr[keepRows, "Predicted"])
names(vigfiles) <- pred
Sample <- as.character(sub("\\+.+_[0-9]+", "", Vigns[1]))
if (step > 0) {
## Recreate previous page with sorter items
oldPage <- file.path(testdir, paste0("step", step),
"planktonSorter.html")
oldItems <- corr$Step == (step - 1)
oldVigns <- paste(data[oldItems, "Id"], imgext, sep = ".")
oldNames <- as.character(corr[oldItems, "Actual"])
oldNames[is.na(oldNames)] <- "[other]"
names(oldVigns) <- oldNames
res <- planktonSorterPage(groups = path, vigns = oldVigns,
title = sorter.title, id = sorter.id,
step = step, file = oldPage)
}
## Create new page
plSorterPage <- file.path(stepdir, "planktonSorter.html")
sorter.title <<- paste0(Sample, "/Step", step + 1)
sorter.id <<- paste(id, step + 1, sep = "/")
res <- planktonSorterPage(groups = path, vigns = vigfiles,
title = sorter.title, id = sorter.id,
step = step + 1, file = plSorterPage)
cat(paste("You have to validate vignettes in", subdir,
"directory", "before next iteration...\n"))
browseURL(paste0("file://", plSorterPage)) #Does not work on linux, "?v=", round(runif(1, max = 100000))))
}
testset.validated <<- FALSE
}
## Retrieve test set (validated or not): copy the items of the current
## step into the shared 'testset' variable of the closure
getTest <- function ()
testset <<- corr[corr$Step == step, ]
## Automatic validation of the particles (only in demo and stat mode)
# Read manual validation on hardrive
# Update corr$Actual column
## In "validation" mode, consumes the shared 'validation.data' (a matrix
## of step number and Id/Class pairs from the plankton sorter) and writes
## the user's classes back into corr$Actual for that step.
validate <- function () {
if (mode == "stat") {
## True classes are already known: just copy them for this step
corr[corr$Step == step, ]$Actual <<- validated[corr$Step == step]
} else {
## TODO: change this!
# ## Read Test set with getTest() from zooimage package
# SubDir <- file.path(testdir, paste0("step", step))
# TestSetStep <- getTest(SubDir)[, c("Id", "Class")]
## Id if items validated at step 'i' and included in data
## ????
# dfId <- data.frame(Id = as.factor(rownames(data))[corr$Step == step])
# ## Reorder TestSetStep to replace values in corr object
# test2 <- merge(dfId, TestSetStep, by = "Id", sort = FALSE)
# ## Be sure that names correspond
# if (!isTRUE(all(test2$Id == dfId$Id)))
# stop("'Id' in manual validation doesn't correspond with 'Id' ",
# "from dataset used in error correction.")
if (mode == "validation") {
if (is.null(validation.data)) {
## Nothing submitted yet: keep waiting
return()
} else {
## First row holds the (1-based) step number submitted by the page
stepVal <- as.numeric(validation.data[1, 2]) - 1
if (is.na(stepVal) || !length(stepVal) || stepVal < 0)
stop("Wrong validation data (step = ", stepVal, ")")
grps <- validation.data[-1, 1]
## Transform [other] into NA
grps[grps == "[other]"] <- NA
testStep <- data.frame(Id = validation.data[-1, 2],
Class = factor(grps, levels = levels(corr$Actual)))
validation.data <<- NULL
## Match validated Ids back to the rows of that step
dfId <- data.frame(Id = makeId(data)[corr$Step == stepVal])
test2 <- merge(dfId, testStep, by = "Id", sort = FALSE)
corr[corr$Step == stepVal, ]$Actual <<- test2$Class
step <<- stepVal
## TODO: save these results also in the zidb file!
}
} else {
## "demo" mode: copy the known classes, as in "stat" mode
corr[corr$Step == step, ]$Actual <<- validated[corr$Step == step]
}
}
## Update column for vignettes impossible to identify or that belong to
## other group than those included in the classifier
corr$OtherGp <<- is.na(corr$Actual)
testset.validated <<- TRUE
}
## Estimate global error in the sample based on RdValidated colum
## Updates error.estim, error.estim.rd and their histories; in "stat"
## mode, also accumulates the per-fraction error diagnostics.
estimateError <- function () {
## At first step, use all validated particles to estimate the error
if (step == 0) {
error.estim.data <<- corr[corr$Step == step, ]
## Manage NAs: items flagged OtherGp are excluded from the rate
error.estim <<- sum(error.estim.data$Actual[!error.estim.data$OtherGp] !=
error.estim.data$Predicted[!error.estim.data$OtherGp]) /
(nrow(error.estim.data) - sum(error.estim.data$OtherGp))
error.estim.rd <<- error.estim
error.estim.history <<- error.estim
error.estim.rd.history <<- error.estim.history
} else { # data used previously + a portion of the validated test set
## All error at step i
Error <- corr$Actual != corr$Predicted
## Calculation of the global error (weighted, see .globalError())
error.estim <<- .globalError(suspect = corr$Suspect[!corr$OtherGp],
error = Error[!corr$OtherGp],
subset = corr$Validated[!corr$OtherGp], na.rm = TRUE)
error.estim.history <<- c(error.estim.history, error.estim)
error.estim.rd.history <<- c(error.estim.rd.history, error.estim.rd)
}
## Error in the different fractions
#PhG if (mode != "validation") {
if (mode == "stat") {
error <- validated != corr$Predicted
errInFract <- .errorInFractions(suspect = corr$Suspect,
error = error, validated = corr$Validated,
iteration = (corr$Step == step), na.rm = TRUE)
errInFract.history <<- rbind(errInFract.history, errInFract)
}
}
## Compute the weighted sum of validated suspect and trusted particles
## Confusion matrix to estimate the abundance
## Estimate per-group abundances as a confusion matrix, extrapolating the
## validated subsamples to the whole sample. When the data contain the
## optional Nb_cells / BioWeight columns, equivalent matrices weighted by
## cell counts and by biovolume are computed too.
## Side effects (closure variables): corr.confusion, cell.confusion,
## bioweight.confusion and the error.*.history lists (indexed by step + 1).
estimateAbundance <- function () {
## At the first step all particles are considered suspect
if (step == 0) {
error.conf <- confusion(error.estim.data$Predicted,
error.estim.data$Actual, useNA = "no") # remove NAs
## Rescale the subsample confusion matrix to the total number of
## particles in the sample, excluding the [other] group
corr.confusion <<- error.conf / sum(error.conf) *
(nrow(data) - sum(corr$OtherGp)) # remove NAs
## For cells
if ("Nb_cells" %in% names(data)) {
error.conf.cell <- xtabs(data$Nb_cells[corr$Step==step] ~
error.estim.data$Actual + error.estim.data$Predicted,
exclude = c(NA, NaN))
cell.confusion <<- error.conf.cell /
sum(error.conf.cell) * (sum(data$Nb_cells) -
sum(data$Nb_cells[corr$OtherGp])) # remove NAs
}
## For biovolumes
if ("BioWeight" %in% names(data)) {
error.conf.bioweight <- xtabs(data$BioWeight[corr$Step==step] ~
error.estim.data$Actual + error.estim.data$Predicted,
exclude = c(NA, NaN))
bioweight.confusion <<- error.conf.bioweight /
sum(error.conf.bioweight) * (sum(data$BioWeight) -
sum(data$BioWeight[corr$OtherGp])) # remove NAs
}
## Calculate error in valid data and in both suspect and trusted parts
error.valid.history[[step + 1]] <<-
error.estim.data$Actual != error.estim.data$Predicted
## At step 0 every particle is treated as suspect, so suspect errors
## equal all errors and the trusted part has no errors by definition
error.suspect.history[[step + 1]] <<-
error.estim.data$Actual != error.estim.data$Predicted
error.trusted.history[[step + 1]] <<- rep(FALSE, sum(error.conf))
} else {
## Confusion matrix suspect vs trusted
nSuspTot <- sum(corr$Suspect & !corr$OtherGp)
valSuspIdx <- corr$Validated & !corr$OtherGp & corr$Suspect
valTrustIdx <- corr$Validated & !corr$OtherGp & !corr$Suspect
nSuspVal <- sum(valSuspIdx)
nTrustVal <- sum(valTrustIdx)
confSuspVal <- confusion(corr$Predicted[valSuspIdx],
corr$Actual[valSuspIdx])
confTrustVal <- confusion(corr$Predicted[valTrustIdx],
corr$Actual[valTrustIdx])
## Extrapolate the validated suspects to all suspects...
confSusp.w <- confSuspVal / nSuspVal * nSuspTot
notValTrustIdx <- !corr$Validated & !corr$Suspect
confNotValTrust <- confusion(corr$Predicted[notValTrustIdx],
corr$Actual[notValTrustIdx])
## ...then add validated trusted and non-validated trusted particles
corr.confusion <<- confSusp.w + confTrustVal + confNotValTrust
## For cells
if ("Nb_cells" %in% names(data)) {
nCellSuspTot <- sum(data$Nb_cells[corr$Suspect &
!corr$OtherGp])
nCellSuspVal <- sum(data$Nb_cells[valSuspIdx])
nCellTrustVal <- sum(data$Nb_cells[valTrustIdx])
confSuspValCell <- xtabs(data$Nb_cells[valSuspIdx] ~
corr$Actual[valSuspIdx] + corr$Predicted[valSuspIdx],
exclude = c(NA, NaN))
confTrustValCell <- xtabs(data$Nb_cells[valTrustIdx] ~
corr$Actual[valTrustIdx] + corr$Predicted[valTrustIdx],
exclude = c(NA, NaN))
## Same suspect-extrapolation scheme, weighted by cell counts
confSuspCell.w <- confSuspValCell / nCellSuspVal * nCellSuspTot
confNotValTrustCell <- xtabs(data$Nb_cells[notValTrustIdx] ~
corr$Actual[notValTrustIdx] + corr$Predicted[notValTrustIdx],
exclude = c(NA, NaN))
cell.confusion <<-
confSuspCell.w + confTrustValCell + confNotValTrustCell
}
## For biovolumes
if ("BioWeight" %in% names(data)) {
nBioSuspTot <- sum(data$BioWeight[corr$Suspect & !corr$OtherGp])
nBioSuspVal <- sum(data$BioWeight[valSuspIdx])
nBioTrustVal <- sum(data$BioWeight[valTrustIdx])
confSuspValBio <- xtabs(data$BioWeight[valSuspIdx] ~
corr$Actual[valSuspIdx] + corr$Predicted[valSuspIdx],
exclude = c(NA, NaN))
confTrustValBio <- xtabs(data$BioWeight[valTrustIdx] ~
corr$Actual[valTrustIdx] + corr$Predicted[valTrustIdx],
exclude = c(NA, NaN))
## Same suspect-extrapolation scheme, weighted by biovolume
confSuspBio.w <- confSuspValBio / nBioSuspVal * nBioSuspTot
confNotValTrustBio <- xtabs(data$BioWeight[notValTrustIdx] ~
corr$Actual[notValTrustIdx] + corr$Predicted[notValTrustIdx],
exclude = c(NA, NaN))
bioweight.confusion <<-
confSuspBio.w + confTrustValBio + confNotValTrustBio
}
## Record observed errors in the current test set, overall and split
## into suspect vs trusted parts (when those parts are non-empty)
error.valid.history[[step + 1]] <<- testset$Actual != testset$Predicted
if (nsuspect > 0) {
error.suspect.history[[step + 1]] <<-
testset[testset$Suspect, ]$Actual !=
testset[testset$Suspect, ]$Predicted
}
if (ntrusted > 0) {
error.trusted.history[[step + 1]] <<-
testset[!testset$Suspect, ]$Actual !=
testset[!testset$Suspect,]$Predicted
}
}
}
## Compute the corrected coefficients for particles, cells, biovolume
# estimateCoeffs <- function () {
# ## For particles (colonies)
# col.confusion <- table(corr$Predicted[corr$Validated], corr$Actual[corr$Validated], useNA = "no") # remove NAs
# corr.coeffs <- ifelse(!colSums(col.confusion), rowSums(col.confusion),
# rowSums(col.confusion)/colSums(col.confusion))
# ## For cells
# if ("Nb_cells" %in% names(data)) {
# cell.confusion <- xtabs(data$Nb_cells[corr$Validated] ~
# corr$Predicted[corr$Validated] +
# corr$Actual[corr$Validated], exclude = c(NA, NaN))
# corr.coeffs <- cbind(corr.coeffs, ifelse(!colSums(cell.confusion), rowSums(cell.confusion),
# rowSums(cell.confusion)/colSums(cell.confusion)))
# }
#
# ## For biovolumes
# if ("BioWeight" %in% names(data)) {
# bioweight.confusion <- xtabs(data$BioWeight[corr$Validated] ~
# corr$Predicted[corr$Validated] +
# corr$Actual[corr$Validated], exclude = c(NA, NaN))
# corr.coeffs <- cbind(corr.coeffs, ifelse(!colSums(bioweight.confusion), rowSums(bioweight.confusion),
# rowSums(bioweight.confusion)/colSums(bioweight.confusion)))
# }
# corr.coeffs
# }
## Estimate error and abundance
## Update Validated, training set and histories
## Run one correction iteration on the current subsample: mark it as
## validated, refresh the Error/Actual2 columns, re-estimate the error and
## the abundances, and append the results to the *history closure
## variables. Ends by computing the next subsample size.
correct <- function () {
getTest()
curr.validated.ids <- corr$Step == step
corr$Validated[curr.validated.ids] <<- TRUE
## Particles whose validated class matches the prediction are correct
corr$Error[corr$Validated & (corr$Actual == corr$Predicted)] <<- "Correct"
## Actual2 = validated class where available; for not-yet-validated
## suspects (step > 0), fall back on the second prediction
corr$Actual2 <<- corr$Actual
if (step > 0) {
ids <- !corr$Validated & corr$Suspect
corr$Actual2[ids] <<- corr$Predicted2[ids]
}
estimateError()
estimateAbundance()
#estimateCoeffs()
validated.fracs <<- c(validated.fracs, sample.size)
## Per-group abundances after correction (row sums of the confusion)
correction.history <<- cbind(correction.history,
rowSums(corr.confusion))
if ("Nb_cells" %in% names(data)) {
correctionCell.history <<- cbind(correctionCell.history,
rowSums(cell.confusion))
}
if ("BioWeight" %in% names(data)) {
correctionBio.history <<- cbind(correctionBio.history,
rowSums(bioweight.confusion))
}
## Abundances from manual classes, with and without 2nd prediction
manual.history <<- cbind(manual.history, table(corr$Actual))
manual2.history <<- cbind(manual2.history, table(corr$Actual2))
setSampleSize() # Set the next subsample size
}
## Advance the correction process by one phase. The function alternates
## between two states, tracked by the step.manual flag:
## - phase 1 (!step.manual): detect suspects (from step 1 on) and prepare
##   the next test set for manual validation;
## - phase 2 (step.manual, testset.validated): finalize the step — in
##   "stat" mode run the correction automatically — then increment step.
## Calling phase 2 before the test set is validated only emits a warning.
process <- function () {
if (!step.manual) {
if (step > 0) {
## Re-run suspect detection using current error/confusion info
susp <- suspect(attr(classifier, "calc.vars")(data), proba, error = corr$Error,
subset = corr$Validated, predicted = predicted,
confusion = corr.confusion, algorithm = algorithm)
## Update Suspect column
corr$Suspect <<- susp == "Error"
## Keep a track of the detection to evaluate performance
suspect.history[[step]] <<- susp
validated.history[[step]] <<- corr$Validated
}
prepareTest()
step.manual <<- TRUE
} else if (testset.validated) {
step.manual <<- FALSE
## In "stat" mode there is no UI: collect the test set and correct now
if (mode == "stat") {
getTest()
correct()
}
cat(paste("Step", step + 1, "finished \n"))
step <<- step + 1
} else warning("You have to complete the validation first \n")
flush.console()
}
## TODO: process called twice, and it seems that the step.manual flag is
## indeed triggering something else each time... This is *crazy*
## Do change this!!!
## One step of the interactive "validation" mode: hand off to process()
## (which itself alternates between the prepare and finalize phases) while
## some particles remain to be validated; otherwise report completion.
processValidation <- function () {
    if (sample.size <= 0) {
        cat("Correction done!\n")
    } else {
        process()
    }
}
## One step of the "demo" mode: same dispatch as the validation mode —
## run process() while particles remain, otherwise report completion.
processDemo <- function () {
    if (sample.size <= 0) {
        cat("Correction done!\n")
    } else {
        process()
    }
}
## Automatic ("stat") mode: iterate through all correction steps without
## user interaction. Each step calls process() twice (prepare, then
## finalize) around an automatic validate(). sample.size is refreshed by
## correct()/setSampleSize() inside the loop, which eventually stops it.
processStats <- function () {
    while (sample.size > 0) {
        process()    # phase 1: detect suspects, prepare test set
        validate()   # automatic validation (no UI in this mode)
        process()    # phase 2: finalize the step
    }
    cat("Correction done!\n")
}
## Accessors
## Run one iteration of the correction process, initializing the closure
## state first when needed (step < 0 means not yet initialized), then
## dispatch on the operating mode.
iterate <- function () {
    if (step < 0) initialize()
    if (mode == "validation") {
        processValidation()
    } else if (mode == "demo") {
        processDemo()
    } else if (mode == "stat") {
        processStats()
    } else {
        stop("'mode' must be 'validation', 'demo', or 'stat'")
    }
}
## Callback invoked by the plankton sorter UI once the user has finished
## (re)validating a page of vignettes: record the validation data, run the
## correction, rebuild the sorter page for this step, produce diagnostic
## graphs and an HTML report, open it in the browser, then start the next
## step. Returns the path to the report file.
setValidation <- function (validation.data = NULL) {
validation.data <<- validation.data
validate()
correct()
## Recreate the page with current sorting by default
Zidb <- zidbLink(zidb)
imgext <- Zidb[[".ImageType"]]
## Create all directories of groups
path <- attr(classifier, "path")
names(path) <- basename(path)
## Recreate previous page with sorter items
oldPage <- file.path(testdir, paste0("step", step + 1),
"planktonSorter.html")
oldItems <- corr$Step == step
oldVigns <- paste(data[oldItems, "Id"], imgext, sep = ".")
oldNames <- as.character(corr[oldItems, "Actual"])
## NA classes are displayed under the special [other] group
oldNames[is.na(oldNames)] <- "[other]"
names(oldVigns) <- oldNames
res <- planktonSorterPage(groups = path, vigns = oldVigns,
title = sorter.title, id = sorter.id,
step = step + 1, file = oldPage)
## Create the diagnostic graphs
reportdir <- file.path(testdir, paste0("step", step + 1))
reportplot <- file.path(reportdir, "ReportError.png")
unlink(reportplot)
png(reportplot, width = 800, height = 500)
plotResult("b")
dev.off()
## Create a page displaying the current state of correction process
reportfile <- file.path(reportdir, "planktonSorterResults.html")
res <- planktonSorterReport(title = sorter.title, id = sorter.id,
step = step + 1, file = reportfile)
## This is a trick to avoid using cached version in memory!
browseURL(paste0("file://", reportfile)) # Does not work on linux, "?v=", round(runif(1, max = 100000))))
process()
return(reportfile)
}
## Terminate the correction process: build a ZITest object holding the
## validated classes, assign it under the name 'result' in 'envir'
## (closure variables), remove the temporary working directory, and
## return the estimated per-group abundances.
finish <- function () {
cat("Validation done...\n")
abd <- getAbundance()
print(abd)
## Create an object with these results...
test <- data.frame(Id = makeId(data), data, Class = corr$Actual, Validated = corr$Validated, Suspect = corr$Suspect)
#test <- data.frame(Id = makeId(data), data, Class = corr$Actual)
attr(test, "path") <- attr(classifier, "path")
class(test) <- unique(c("ZI3Test", "ZITest", class(data)))
## Make the result available to the caller under the requested name
assign(result, test, envir = envir)
cat("Object `", result, "` created with validation results!\n", sep = "")
## Erase the temporary directory on disk...
unlink(testdir, recursive = TRUE)
## Return abundances in this file
abd
}
## Return the estimated abundance
## Accessor: per-group abundance estimates, i.e. the row sums of the
## current corrected confusion matrix.
getAbundance <- function () {
    rowSums(corr.confusion)
}
## Return the estimated error
## Accessor: the current global error estimate.
getErrorEstimation <- function () {
    error.estim
}
## Evaluate the quality of the classifier for the suspects
## Evaluate the quality of the suspect detection: for each past step,
## summarize a confusion matrix of true classification errors vs
## particles flagged as suspect, restricted to particles that were not
## yet validated at that step.
## Returns NULL when no detection was recorded yet; otherwise a list with
## 'error' (per-particle Correct/Error factor) and 'confusion' (one
## summary per step).
evaluateDetection <- function () {
if (!length(suspect.history)) return(NULL)
## True error of each particle, from the manually validated classes
## (closure variable 'validated')
error <- factor(rep("Error", nrow(data)), levels = c("Correct", "Error"))
error[validated == corr$Predicted] <- "Correct"
conf <- list()
## seq_along() is safe for empty input; the loop-local name no longer
## shadows the closure variable 'validated' used above
for (i in seq_along(suspect.history)) {
wasValidated <- validated.history[[i]]
conf[[paste0("Step", i)]] <-
summary(confusion(x = error[!wasValidated],
y = as.factor(suspect.history[[i]][!wasValidated])))
}
list(error = error, confusion = conf)
}
## Compare graphically the result with manual validation
## and manual validation + second identification
## Plot the state of the error correction process. Three plot types:
## - "dissimilarity": dissimilarity (%) between reference abundances and
##   both manual and corrected abundances, per validated fraction;
## - "partition": % of suspects and of errors at each iteration;
## - "barplot": remaining suspects + differential dissimilarity (top) and
##   suspects vs corrected error (bottom), with SD/EC stopping marks.
## Fix: the default used to be the misspelled "dissimilary", which does
## not partial-match the match.arg() choices, so plotResult() with no
## argument raised an error.
plotResult <- function (type = c("dissimilarity", "partition", "barplot")) {
#if (mode == "validation")
# stop("This function is not available in validation mode.")
type <- match.arg(type[1], c("dissimilarity", "partition", "barplot"))
if (type == "dissimilarity") {
if (is.null(validated)) {
## In case we are in validation mode, we don't have this!...
## We start from original prediction
abundances <- as.vector(table(predicted[!corr$OtherGp]))
} else {
abundances <- as.vector(table(validated[!corr$OtherGp]))
}
error1 <- dissimilarity(abundances, manual.history, na.rm = TRUE) * 100
error3 <- dissimilarity(abundances, correction.history, na.rm = TRUE) * 100
par(mar = c(5, 4, 4, 4) + 0.1)
plot(cumsum(validated.fracs) / nrow(corr) * 100, error1,
type = "l", xlab = "Validated fraction (%)",
ylab = "Dissimilarity (%)", col = "green", xlim = c(0, 100),
ylim = c(0, max(error1, error3) * 1.02),
main = "Dissimilarity at each iteration")
## Baseline for manual validation
lines(c(0, 100), c(error1[1], 0), col = 1, lty = 2)
## Line for correction
lines(cumsum(validated.fracs) / nrow(corr) * 100, error3, col = "red")
grid()
legend("topright", legend = c("Random validation",
"Suspects validation", "Error correction"),
col = c("black", "green","red"), lty = c(2, 1, 1))
} else if (type == "partition") {
fracs <- cumsum(validated.fracs[-1]) / nrow(corr)
errByFrac <- sapply(error.valid.history, sum, na.rm = TRUE) /
validated.fracs[-1]
suspByFrac <- nsuspect.tovalid.history / validated.fracs[-1]
suspByFrac[1] <- 0
par(mar = c(5, 4, 4, 4) + 0.1)
plot(fracs * 100, errByFrac * 100, type = "l", xlab = "Validated fraction (%)",
ylab = "Suspect and error (%)", xlim = c(0, 100), ylim = c(0, 100), col = "red",
main = "Suspects and error at each iteration")
lines(fracs * 100, suspByFrac * 100, col = "black")
grid()
legend("topright", legend = c("Suspect", "Error"),
col = c("black", "red"), cex = 0.8, lwd = 2)
} else { # Should be type == "barplot"
thresholdDiffDiss <- 5 # Differential dissimilarity <= 5%
nbStep <- ceiling(nrow(data) / validated.fracs[-1][1])
errByFrac <- sapply(error.valid.history, sum, na.rm = TRUE) /
validated.fracs[-1]
suspByFrac <- nsuspect.tovalid.history / validated.fracs[-1]
#suspByFrac[1] <- 0
## case 1 item => projection, case more => another projection...
dat <- rbind(suspByFrac * 100, errByFrac * 100)
diffDiss <- sapply(2:ncol(correction.history), function (x)
dissimilarity(correction.history[, x - 1], correction.history[, x],
na.rm = TRUE) * 100
)
xcoord <-
seq(0.7, ceiling(nrow(data) / validated.fracs[-1][1]) * 1.2, by = 1.2)
if (step < 1) {
## No completed steps yet: project the SD (suspects done) step
suspRemain <- NA
stepSD <- round((errByFrac*nsuspect.history -
errByFrac*nsuspect.tovalid.history) /
nsuspect.tovalid.history) + (step+1)
idxStepSD <- stepSD
coordStepSD <- mean(c(xcoord[idxStepSD], xcoord[idxStepSD + 1]))
} else {
suspRemain <- c(NA, nsuspect.history[2:(step+1)] -
nsuspect.tovalid.history[2:(step+1)])
stepSD <- round(suspRemain / nsuspect.tovalid.history) + 1:(step+1)
if (length(which(suspRemain == 0)) > 0) {
idxStepSD <- which(suspRemain == 0)[1]
} else {
idxStepSD <- tail(stepSD,1)
}
coordStepSD <- mean(c(xcoord[idxStepSD], xcoord[idxStepSD + 1]))
}
par(mfrow = c(2, 1), mar = c(4, 4, 1, 4) + 0.1)
bp1 <- barplot(suspRemain, #xlab = "Validation step",
ylab = "Nb remaining suspects", xlim = c(0.2,
xcoord[ceiling(idxStepSD + (length(xcoord) - idxStepSD) / 3)]),
ylim = c(0, max(suspRemain, diffDiss, na.rm = TRUE)), yaxs = "r",
col = "grey10", cex.axis = .7, cex.main = 1, ann = FALSE,
yaxt = "n", #main = "Remaining suspects and differential dissimilarity")
)
## Three overlaid titles so each half gets its own color
title(expression(bold("Remaining suspects") *
phantom("\tand\tdifferential dissimilarity")),
col.main = "grey10", cex.main = 1)
title(expression(phantom("Remaining suspects\t") * "and" *
phantom("\tdifferential dissimilarity")),
col.main = "black", cex.main = 1)
title(expression(phantom("Remaining suspects\tand\t") *
bold("differential dissimilarity")),
col.main = "blue", cex.main = 1)
# legend("top", legend = c("Remaining suspects","Diff dissimilarity"),
# fill = c("grey20","blue"), cex = .6, bty = "n", adj = c(0,0))
axis(side = 1, at = seq(bp1[1], by = 1.2, length.out = nbStep),
labels = 1:nbStep, cex.axis = .7)
if (step > 0) axis(side = 2, cex.axis = .7)
par(new = TRUE)
## Overlay the differential dissimilarity curve on a secondary y axis
## (the previous '+' chaining of plot()/axis()/mtext() was bogus: the
## summed result was discarded; now plain sequential statements)
plot(bp1, diffDiss, type = "o", col = "blue", ylim = c(0, 100),
xlim = c(0.2, xcoord[ceiling(idxStepSD + (length(xcoord) -
idxStepSD) / 3)]), lwd = 3, axes = FALSE, ann = FALSE)
axis(side = 4, col = "blue", col.axis = "blue", cex.axis = .7)
mtext("Differential dissimilarity (%)", side = 4, line = 3,
col = "blue")
abline(v = coordStepSD, lwd = 2, lty = 2, col = "dimgrey")
text(x = coordStepSD + .5, y = 90, "SD", srt = -90, pos = 4,
cex = 0.6, col = "dimgrey")
if (length(which(diffDiss < thresholdDiffDiss)) > 0) {
coordStepEC <- mean(c(xcoord[which(diffDiss < thresholdDiffDiss)[1]],
xcoord[which(diffDiss < thresholdDiffDiss)[1]+1]))
abline(v = coordStepEC, lwd = 2, lty = 2, col = "darkgoldenrod")
text(x = coordStepEC + .5, y = 90, "EC", srt = -90, pos = 4,
cex = 0.6, col = "darkgoldenrod")
}
grid()
box()
bp2 <- barplot(dat, xlab = "Validated step", beside = TRUE,
ylab = "Suspect and corrected error (%)",
xlim = c(0.2, xcoord[ceiling(idxStepSD + (length(xcoord) -
idxStepSD) / 3)]),
ylim = c(0, 100), col = c("#dddddd", "#dd0000"),
cex.axis = .7, cex.main = 1, width = .5, space = c(0, .4),
#main = "Suspects and error corrected at each iteration")
)
title(expression(bold("Nbr of suspects") *
phantom("\tand\tcorrected error")),
col.main = "gray40", cex.main = 1)
title(expression(phantom("Nbr of suspects\t") * "and" *
phantom("\tcorrected error")),
col.main = "black", cex.main = 1)
title(expression(phantom("Nbr of suspects\tand\t") *
bold("corrected error")),
col.main = "#dd0000", cex.main = 1)
# legend("topright", legend = c("Nbr of suspect", "Corrected error"),
# fill = c("#dddddd", "#dd0000"), cex = .6, bty = "n", adj = c(0, 0))
axis(side = 1, at = seq(mean(bp2[, 1]), by = 1.2, length.out = nbStep),
labels = 1:nbStep, cex.axis = .7)
axis(side = 4, cex.axis = .7)
grid()
box()
}
}
getEnv <- function () {
    ## Expose the closure environment that holds all the correction state
    environment(getEnv)
}
invisible(list(iterate = iterate, validate = setValidation, done = finish, abundance = getAbundance,
error = getErrorEstimation, evaluate = evaluateDetection,
plot = plotResult, environment = getEnv))
}
## Validated vs Suspect???
## (Scraping artifact, commented out so the file parses as R:)
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.