######################################################################################
# tdmClassifyLoop:
#
#' Core classification double loop returning a \code{\link{TDMclassifier}} object.
#'
#' tdmClassifyLoop contains a double loop (opts$NRUN and CV-folds)
#' and calls \code{\link{tdmClassify}}. It is called by all classification R-functions main_*. \cr
#' It splits - if \code{tset} is NULL - the data in \code{dset} into training and validation
#' data according to \code{opts$TST.kind}.\cr
#' It returns an object of class \code{\link{TDMclassifier}}.
#'
#'
#' @param dset the data frame containing training and validation data.
#' @param tset [NULL] If not NULL, this is the test data set. If NULL, we are in tuning and the validation data
#' set is built from \code{dset} according to the procedure prescribed in \code{opts$TST.*}.
#' @param response.variables name of column which carries the target variable - or -
#' vector of names specifying multiple target columns
#' (these columns are not used during prediction, only for evaluation)
#' @param input.variables vector with names of input columns
#' @param opts a list from which we need here the following entries
#' \describe{
#' \item{\code{NRUN}}{ number of runs (outer loop)}
#' \item{\code{TST.SEED}}{ =NULL: get a new random number seed with \code{\link{tdmRandomSeed}}. =any value: set the random number seed
#' to this value to get reproducible random numbers and thus reproducible training-test-set-selection.
#' (only relevant in case TST.kind=="cv" or "rand") (see also MOD.SEED in \code{\link{tdmClassify}})}
#' \item{\code{TST.kind}}{ how to create cvi, handed over to \code{\link{tdmModCreateCVindex}}. If TST.kind="col", then cvi is taken from dset[,opts$TST.col].}
#' \item{\code{GD.RESTART}}{ [TRUE] =TRUE/FALSE: do/don't restart graphic devices}
#' \item{\code{GD.DEVICE}}{ ["non"|"win"|"pdf"|"png"]}
#' }
#'
#' @return \code{result}, an object of class \code{\link{TDMclassifier}}, this is a list with results, containing
#' \item{lastRes}{ last run, last fold: result from \code{\link{tdmClassify}}}
#' \item{C_train}{ classification error on training set}
#' \item{G_train}{ gain on training set}
#' \item{R_train}{ relative gain on training set (percentage of max. gain on this set)}
#' \item{*_vali}{ --- similar, with vali set instead of training set --- }
#' \item{*_vali2}{ --- similar, with vali2 set instead of training set --- }
#' \item{Err}{ a data frame with as many rows as opts$NRUN; its columns contain the nine
#' performance measures described above plus the case counts \code{ntrn} and \code{ntst} }
#' \item{predictions}{ last run: data frame with dimensions [nrow(dset),length(response.variables)]. In case of CV, all
#' CV predictions (for each record in dset), in other cases mixed validation / train set predictions. }
#' \item{predictTest}{ predictions on the test set \code{tset} (NULL if \code{tset==NULL} )}
#' \item{predProbList}{ a list, \code{predProbList[[i]]} has the prediction probabilities
#' of the ith run. See info on \code{predProb} in \code{\link{tdmClassify}}. }
#'
#' Each performance measure \code{C_*, G_*, R_*} is a vector of length \code{opts$NRUN}. To be specific, \code{C_train[i]} is the
#' classification error on the training set from the \code{i}-th run. This error is \code{mean(res$allEVAL$cerr.trn)}, i.e. the
#' mean of the classification errors from all response variables when \code{res} is the return value of \code{\link{tdmClassify}}.
#' In the case of cross validation, for each performance measure an additional averaging over all folds is done.
#'
#' @seealso \code{\link{print.TDMclassifier}}, \code{\link{tdmClassify}}, \code{\link{tdmRegress}}, \code{\link{tdmRegressLoop}}
#' @author Wolfgang Konen (\email{wolfgang.konen@@th-koeln.de}), THK
#' @aliases TDMclassifier
#' @example demo/demo00-0classif.r
#' @export
######################################################################################
tdmClassifyLoop <- function(dset,response.variables,input.variables,opts,tset=NULL) {
if (exists(".Random.seed")) SAVESEED<-.Random.seed #save the Random Number Generator RNG status
if (is.null(tset)) {
tsetStr = c("Validation", "validation");
} else {
tsetStr = c("Test", " test");
}
if (class(opts)[1] != "tdmOpts") stop("Class of object opts is not tdmOpts");
if (is.null(opts$PRE.PCA.numericV)) opts$PRE.PCA.numericV <- input.variables;
if (opts$NRUN<=0) stop(sprintf("opts$NRUN has to be positive, but it is %d",opts$NRUN));
if (opts$PRE.PCA!="none" & opts$PRE.SFA!="none") stop("It is not allowed to activate opts$PRE.PCA and opts$PRE.SFA simultaneously.")
#if (opts$READ.TST==TRUE & opts$TST.kind!="col")
# warning(sprintf("Are you sure you want opts$READ.TST=TRUE, but opts$TST.kind!='col'? Actual value is opts$TST.kind='%s'.",opts$TST.kind));
if (!all(response.variables %in% names(dset)))
stop(sprintf("Not all response.variables are in names(dset)!\n %s\n response.variables=%s",
"Note that response.variables have to be strings (names of columns in dset), not the columns themselves.",
paste(response.variables,collapse=",")))
if (!is.null(opts$CLS.gainmat)) {
#browser()
for (respVar in response.variables)
testit::assert(sprintf("tdmClassifyLoop: Size of opts$CLS.gainmat and number of levels in dset$%s do not match", respVar)
, nrow(opts$CLS.gainmat)==length(levels(dset[,respVar]))
, ncol(opts$CLS.gainmat)==length(levels(dset[,respVar])) )
}
predProbList=list();
C_train <- C_vali <- C_vali2 <- G_train <- G_vali <- NULL # reserve names (dynamic extension of
R_train <- R_vali <- R_vali2 <- G_vali2 <- NULL # these vectors at the end of the for-i loop)
Err <- NULL
for (i in 1:opts$NRUN) {
if (opts$NRUN>1) {
if (opts$GD.RESTART) tdmGraphicCloseDev(opts);
tdmGraphicInit(opts);
}
opts$i = i;
#=============================================================================
# PART 3: CREATE NFOLD CROSSVALIDATION INDEX OR DIVIDE IN TRAINING / VALI SET
#=============================================================================
if (is.null(opts$TST.SEED)) {
# NEW: when called via SPOT, the RNG might be at a (different but) fixed seed in each call.
# But we want different seeds (for test set selection) to see the variability
set.seed(tdmRandomSeed());
} else if (opts$TST.SEED=="algSeed") { # use the seed from SPOT:
# opts$ALG.SEED is set in tdmStartSpot2 to des$SEED[k]+r. This means that the overall r'th
# evaluation of a design point gets the seed spotConfig$alg.seed+r
newseed=opts$ALG.SEED+(opts$i-1)+opts$NRUN*(opts$rep-1);
set.seed(newseed);
} else {
newseed=opts$TST.SEED+(opts$i-1)+opts$NRUN*(opts$rep-1);
set.seed(newseed); # if you want reproducibly the same training/test sets,
} # but different for each run i and each repeat (opts$rep)
#print(.Random.seed[1:6])
cvi <- tdmModCreateCVindex(dset,response.variables,opts,stratified=TRUE);
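# Note (descriptive comment, inferred from the usage below): cvi is an index vector of
# length nrow(dset); records with cvi==k (k>0) form validation fold k, records with
# cvi==0 are used only for training, and records with cvi==-1 are disregarded
# (kept aside in d_dis, e.g. for active learning).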
nfold = max(cvi,1); # Why ',1'? - In the special case where all records belong to the training sample (max(cvi)=0), we want
# to have exactly one fold (one pass through the k-loop) and NOT k in { 1, 0 }
#=============================================
# PART 4: MODELING, EVALUATION, VISUALIZATION
#=============================================
alltrn = alltst = NULL;
predictions <- as.data.frame(dset[,response.variables]);
names(predictions) <- response.variables;
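# Descriptive note: 'predictions' starts out as a copy of the true response columns of dset;
# inside the k-loop below its rows are overwritten, fold by fold, with the model predictions.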
predProb=list();
for (k in 1:nfold) {
opts$k=k;
opts$the.nfold = nfold;
cat1(opts,"\n")
if (any(names(dset)=="IND.dset")) stop("Name clash in dset, which already has a column IND.dset. Please consider renaming it.")
if (is.null(tset)) {
d_test <- dset[cvi==k, ] # validation set
d_test <- tdmBindResponse(d_test , "IND.dset", which(cvi==k));
} else { # i.e. if (!is.null(tset))
# validation set (or test set) is passed in from the caller
if (opts$TST.kind %in% c("cv")) #,"col"))
stop(sprintf("Option opts$TST.kind=\"%s\" together with tset!=NULL is currently not implemented. Consider opts$TST.kind=\"rand\".",opts$TST.kind));
d_test <- tset;
d_test <- tdmBindResponse(d_test , "IND.dset", rep(-1,nrow(tset)));
}
d_train <- dset[cvi!=k & cvi>=0, ] # training set (disregard records with cvi<0)
d_dis <- dset[cvi!=k & cvi==-1, ] # disregard set (needed for active learning)
d_train <- tdmBindResponse(d_train, "IND.dset", which(cvi!=k & cvi>=0));
d_dis <- tdmBindResponse(d_dis , "IND.dset", which(cvi!=k & cvi==-1));
ntst=nrow(d_test);
ntrn=nrow(d_train);
#if (nrow(d_train)+nrow(d_test)+nrow(d_dis) != nrow(dset))
# stop("Something wrong, the union of d_train, d_test and d_dis does not cover dset");
d_preproc <- NULL; # no preprocessing -> no mem-consumption for d_preproc (if condition on next line is false)
if (opts$PRE.PCA!="none" | opts$PRE.SFA!="none") {
if (opts$PRE.allNonVali) {
d_preproc <- dset[cvi!=k, ] # all non-validation-data are used for preproc
} else {
d_preproc <- d_train
}
}
if (opts$PRE.PCA!="none") {
# a) do PCA on the numeric variables of d_train, if opts$PRE.PCA!="none"
# b) add monomials of degree 2 for the first opts$PRE.PCA.npc numeric variables
# c) apply this PCA and monomials to d_test and d_dis in the same way
other.variables <- setdiff(input.variables,opts$PRE.PCA.numericV);
pca <- tdmPrePCA.train(d_preproc,opts); # see tdmPreprocUtils.r
d_train <- tdmPrePCA.apply(d_train,pca$pcaList,opts,d_train)$dset;
d_test <- tdmPrePCA.apply(d_test,pca$pcaList,opts,d_train)$dset;
d_dis <- tdmPrePCA.apply(d_dis,pca$pcaList,opts,d_train)$dset;
input.variables <- union(pca$numeric.variables,other.variables);
#opts$PRE.SFA.numericV <- pca$numeric.variables; # -- now obsolete, no SFA after PCA allowed --
if (length(setdiff(input.variables,names(d_train)))>0)
stop("Some elements of input.variables are not columns of d_train");
}
# the SFA preprocessing is now in tdmClassify, inside response.variable for-loop
# (because SFA (for classification) depends on response.variable)
res <- tdmClassify(d_train,d_test,d_dis,d_preproc,response.variables,input.variables,opts,tsetStr)
# predProb$Val: bind the different folds of CV (cross validation) together. If no CV, then nfold=1 ->> only results from one vali set.
predProb$Val = rbind(predProb$Val,res$predProb$Val);
# predProb$Trn: take only the results from the first fold
if (k==1) predProb$Trn = res$predProb$Trn;
alltrn = rbind(alltrn,as.data.frame(list(cerr.trn=mean(res$allEVAL$cerr.trn) * ntrn
,gain.trn=mean(res$allEVAL$gain.trn) * ntrn
,rgain.trn=mean(res$allEVAL$rgain.trn) * ntrn
,ntrn=ntrn
)));
alltst = rbind(alltst,as.data.frame(list(cerr.tst=mean(res$allEVAL$cerr.tst) * ntst
,gain.tst=mean(res$allEVAL$gain.tst) * ntst
,rgain.tst=mean(res$allEVAL$rgain.tst) * ntst
,cerr.tst2=mean(res$allEVAL$cerr.tst2) * ntst
,gain.tst2=mean(res$allEVAL$gain.tst2) * ntst
,rgain.tst2=mean(res$allEVAL$rgain.tst2)
,ntst=ntst
)));
if (is.null(tset)) {
predictions[cvi==k,response.variables] <- res$d_test[,paste("pred_",response.variables,sep="")];
predictTest = NULL;
} else {
predictTest = res$d_test[,paste("pred_",response.variables,sep="")];
}
if (!(opts$TST.kind=="cv")) {
predictions[cvi!=k & cvi>=0,response.variables] <- res$d_train[1:ntrn,paste("pred_",response.variables,sep="")];
# Why '1:ntrn'? - If opts$ncopies>0, then tdmClassify has added
# opts$ncopies bootstrap patterns to the ntrn original patterns.
# These bootstrap patterns have to be discarded here.
}
} # for (k in 1:nfold)
### -- OLD: Err = colSums(allerr)/nfold # assumes that each fold has the same (or nearly the same) size
# each measure is a weighted average over folds, weighted with #cases in each fold:
Err = rbind(Err,c(colSums(alltrn)/sum(alltrn$ntrn),colSums(alltst)/sum(alltst$ntst)));
Err[i,"ntrn"]=ifelse(opts$TST.kind=="cv",nrow(dset),nrow(d_train)); # the number of distinct cases
Err[i,"ntst"]=ifelse(opts$TST.kind=="cv",nrow(dset),nrow(d_test)); # used for this line of measures
predProbList[[i]] <- list()
predProbList[[i]]$Val <- predProb$Val;
predProbList[[i]]$Trn <- predProb$Trn;
cat1(opts,"\n",ifelse(opts$TST.kind=="cv","CV","")
,paste0("Relative gain (",opts$rgain.type,") on training set "),Err[i,"rgain.trn"],"%\n")
cat1(opts,"", ifelse(opts$TST.kind=="cv","CV","")
,paste0("Relative gain (",opts$rgain.type,") on ",tsetStr[2]," set "),Err[i,"rgain.tst"],"%\n\n")
#opts$name.of.prediction <- paste("pred_", response.variable, sep="")
C_train[i] = Err[i,"cerr.trn"]
G_train[i] = Err[i,"gain.trn"]
R_train[i] = Err[i,"rgain.trn"]
C_vali[i] = Err[i,"cerr.tst"]
G_vali[i] = Err[i,"gain.tst"]
R_vali[i] = Err[i,"rgain.tst"]
C_vali2[i] = Err[i,"cerr.tst2"]
G_vali2[i] = Err[i,"gain.tst2"] # for comparison in case of opts$MOD.method="RF": results with
R_vali2[i] = Err[i,"rgain.tst2"] # default cutoff 1/n.class instead of opts$CLS.cutoff
#=============================================
# PART 5: SOME MORE GRAPHICS
#=============================================
if (opts$GD.DEVICE!="non" & !is.null(opts$gr.fctcall)) {
# execute the graphics command given in text string opts$gr.fctcall
eval(parse(text=opts$gr.fctcall));
}
} # for (i in 1:opts$NRUN)
#write.table(d_test, file=paste(opts$dir.output, sub(".csv", "", filename), "_predictions.csv", sep=""), quote=FALSE, sep=";", dec=".", row.names=FALSE, col.names=TRUE)
#=============================================
# PART 6: OVERALL EVALUATION
#=============================================
if (opts$NRUN>1) {
# print output averaged over all NRUN runs "for (i)"
cat1(opts,"\nAverage over all ",opts$NRUN," runs: \n")
cat1(opts,sprintf("cerr$train: (%7.5f +- %7.5f)%%\n", mean(C_train)*100, sd(C_train)*100));
cat1(opts,sprintf("cerr$vali: (%7.5f +- %7.5f)%%\n", mean(C_vali)*100, sd(C_vali)*100));
cat1(opts,sprintf("gain$train: (%7.2f +- %4.2f)\n", mean(G_train), sd(G_train)));
cat1(opts,sprintf("gain$vali: (%7.2f +- %4.2f)\n", mean(G_vali), sd(G_vali)));
cat1(opts,sprintf("rgain.train: %7.3f%%\n", mean(R_train)));
cat1(opts,sprintf("rgain.vali: %7.3f%%\n\n", mean(R_vali)));
}
result = list(lastRes = res # last run, last fold: result from tdmClassify
#, opts = res$opts # deprecated (12/2011), use result$lastRes$opts or Opts(result)
, C_train = C_train
, G_train = G_train
, R_train = R_train
, C_vali = C_vali
, G_vali = G_vali
, R_vali = R_vali
, C_vali2 = C_vali2
, G_vali2 = G_vali2
, R_vali2 = R_vali2
, Err = Err
, predictions = predictions # predictions on dset from the *last* run: all CV predictions in case of CV,
# mixed vali-train predictions else
, predictTest = predictTest
, predProbList = predProbList # ith element: a list with predProb data from the ith run
);
if (!is.null(opts$TST.COL)) # needed when result is input for coTraining (see ssl_methods.r)
if (opts$TST.COL %in% names(dset)) result$TST = dset[,opts$TST.COL]
class(result) <- c("TDMclassifier","TDM") # this causes > result; or > print(result);
# NOT to print out the whole list (might be very long!!)
# but to call instead the function print.TDMclassifier
# (which in turn calls tdmClassifySummary)
if (exists("SAVESEED")) assign(".Random.seed", SAVESEED, envir=globalenv()); #load the saved RNG status
result;
} # tdmClassifyLoop
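
## Hedged usage sketch (not run): a minimal call to tdmClassifyLoop. The data frame 'dset',
## its target column "Class" and the settings below are illustrative assumptions; 'opts' is
## assumed to be created with tdmOptsDefaultsSet() from this package.
#
#   opts <- tdmOptsDefaultsSet();
#   opts$NRUN <- 2;                  # two runs of the outer loop
#   opts$TST.kind <- "cv";           # cross validation inside each run ...
#   opts$TST.NFOLD <- 3;             # ... with three folds
#   input.vars <- setdiff(names(dset), "Class");
#   result <- tdmClassifyLoop(dset, "Class", input.vars, opts);
#   print(result);                   # dispatches to print.TDMclassifier
#   result$R_vali                    # relative gain on the validation set, one value per run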
######################################################################################
# tdmClassifySummary
#
#' Print summary output for \code{result} from \code{\link{tdmClassifyLoop}} and add \code{result$y}.
#'
#' \code{result$y} is "minus OOB rgain" on the training set for methods RF or MC.RF (if \code{opts$RF.OOB==TRUE}).
#' \code{result$y} is "minus rgain" on the test set (= validation set) in all other cases.
#' \code{result$y} is the quantity which the tuner seeks to minimize.
#'
#' @param result return value from a prior call to \code{\link{tdmClassifyLoop}}, an object of class \code{TDMclassifier}.
#' @param opts a list from which we need here the following entries
#' \describe{
#' \item{\code{NRUN}}{ number of runs (outer loop)}
#' \item{\code{MOD.method}}{ the model-building method; for "RF" or "MC.RF" with \code{opts$RF.OOB==TRUE} the OOB score on the training set enters \code{result$y}}
#' \item{\code{VERBOSE}}{ verbosity level for console output}
#' \item{\code{dset}}{ [NULL] if !=NULL, attach it to result}
#' }
#' @param dset [NULL] if not NULL, add this data frame to the return value (may cost a lot of memory!)
#'
#' @return \code{result}, an object of class \code{TDMclassifier}, with \code{result$y}, \code{result$sd.y}
#' (and optionally also \code{result$dset}) added
#'
#' @seealso \code{\link{tdmClassify}}, \code{\link{tdmClassifyLoop}}, \code{\link{print.TDMclassifier}}, \code{\link{tdmRegressSummary}}
#' @author Wolfgang Konen, FHK, Sep'2010 - Oct'2011
#' @export
######################################################################################
tdmClassifySummary <- function(result,opts,dset=NULL)
{
res <- result$lastRes;
cat1Records <- function (nrow_noCV) {
cat1(opts,ifelse(opts$TST.kind=="cv"
, sprintf(" (on %d records in %d folds)",nrow(res$d_train)+nrow(res$d_test),opts$TST.NFOLD)
, sprintf(" (on %d records)",nrow_noCV)
),"\n");
}
#print2(opts,res$allEVAL); # EVAL for each response variable , but only for lastRes
y = mean(result$R_vali);
ytr = mean(result$R_train);
maxScore = result$G_vali[1]/(result$R_vali[1]/100);
maxScore.tr = result$G_train[1]/(result$R_train[1]/100);
z=data.frame(TYPE=c("rgain","meanCA","minCA","bYouden")
,DESC=c("relative gain, i.e. percent of correctly classified records"
,"mean class accuracy, i.e. average over class levels"
,"minimum class accuracy", "balanced Youden index"));
cat1(opts,sprintf("\nRelative gain is \"%s\"",opts$rgain.type));
cat1(opts,sprintf(" (%s)", z$DESC[which(z$TYPE==opts$rgain.type)]));
if (opts$MOD.method %in% c("RF","MC.RF") & opts$RF.OOB==TRUE) {
cat1(opts,sprintf("\n%sTrain OOB relative gain: %7.3f",ifelse(opts$TST.kind=="cv","CV ",""),ytr));
cat1(opts,ifelse(opts$NRUN>1,sprintf(" +-%7.3f",sd(result$R_train)),""));
cat1Records(nrow(res$d_train));
# cat1(opts,sprintf(" (on %d records)",nrow(res$d_train)));
result$y=-ytr; # the score (to be minimized by SPOT) is "minus OOB score"
result$sd.y=sd(result$R_train);
} else {
cat1(opts,"\n");
result$y=-y; # the score (to be minimized by SPOT) is "minus test score"
result$sd.y=sd(result$R_vali);
}
cat1(opts,sprintf("%s Vali relative gain: %7.3f",ifelse(opts$TST.kind=="cv","CV ",""),y));
cat1(opts,ifelse(opts$NRUN>1,sprintf(" +-%7.3f",sd(result$R_vali)),""));
cat1Records(nrow(res$d_test));
cat1(opts,sprintf("%s Vali2 relative gain (predict always with %s): %7.3f"
,ifelse(opts$TST.kind=="cv","CV ",""), res$allEVAL$test2.string, mean(result$R_vali2)));
cat1(opts,ifelse(opts$NRUN>1,sprintf(" +-%7.3f",sd(result$R_vali2)),""));
cat1Records(nrow(res$d_test));
if (!is.null(dset)) result$dset=dset; # might cost a lot of memory
result;
} # tdmClassifySummary
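
## Hedged usage sketch (not run): summarizing a TDMclassifier object. 'result' and 'opts' are
## assumed to come from the tdmClassifyLoop sketch above.
#
#   result <- tdmClassifySummary(result, opts);
#   result$y          # the quantity a tuner would minimize (minus mean relative gain)
#   result$sd.y       # its standard deviation over the opts$NRUN runs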