Nothing
######################################################################################
######################################################################################
#
# HELPER FUNCTIONS FOR MODELING
#
######################################################################################
# require(randomForest); # now via direct call 'randomForest::'
######################################################################################
#
#
######################################################################################
# tdmModCreateCVindex:
#' Create and return a training-validation-set index vector.
#'
#' Depending on the value of member TST.kind in list opts, the returned index cvi is
#' \enumerate{
#' \item TST.kind="cv": a random cross validation index P([111...222...333...]) - or -
#' \item TST.kind="rand": a random index with P([00...11...-1-1...]) for training (0), validation (1) and disregard (-1) cases - or -
#' \item TST.kind="col": the column dset[,opts$TST.COL] contains the training (0), validation (1) and disregard (-1) set division
#' (and all records with a value <0 in column TST.COL are disregarded).
#' }
#' Here P(.) denotes random permutation of the sequence. \cr
#' The disregard set is optional, i.e. cvi may contain only 0 and 1, if desired. \cr
#' Special case TST.kind="cv" and TST.NFOLD=1: make *every* record a training record, i.e. index [000...]. \cr
#' In case TST.kind="rand" and stratified=TRUE a \emph{stratified} sample is drawn, where the strata in the
#' training case reflect the rel. frequency of each level of the **1st** response variable
#' and are ensured to be at least of size 1.\cr
#' In summary, TST.kind="cv" means cross validation (TST.NFOLD models are built with TST.NFOLD
#' different train-validation data sets), while TST.kind="rand" or "col" means one model build
#' with a random ("rand") or user-defined ("col") training-validation split.
#'
#' @param dset the data frame for which cvi is needed
#' @param response.variables issue a warning if \code{length(response.variables)>1}. Use the first
#' response variable for determining strata size.
#' @param opts a list from which we need here the following entries
#' \itemize{
#' \item TST.kind: ["cv"|"rand"|"col"]
#' \item TST.NFOLD: number of CV folds (only relevant in case TST.kind=="cv")
#' \item TST.COL: column of dset containing the (0/1/<0) index (only relevant in case TST.kind=="col")
#' or NULL if no such column exists
#' \item TST.valiFrac: fraction of records to set aside for validation (only relevant in case TST.kind=="rand")
#' \item TST.trnFrac: [1-opts$TST.valiFrac] fraction of records to use for training (only relevant in case TST.kind=="rand")
#' }
#' @param stratified [F] do stratified sampling for TST.kind="rand" with at least one training
#' record for each response variable level (classification)
#'
#' @return cvi training-validation-set (0/>0) index vector
#' (all records with cvi<0, e.g. from column TST.COL, are disregarded)
#'
#' @note Currently stratified sampling in case TST.KIND='rand' does only work correctly for \emph{one} response variable.
#' If there are more than one, the right fraction of validation records is taken, but the strata are drawn w.r.t. the
#' first response variable. (For multiple response variables we would have to return a list of cvi's or to
#' call tdmModCreateCVindex for each response variable anew.)
#
######################################################################################
tdmModCreateCVindex <- function(dset,response.variables,opts,stratified=FALSE) {
L = nrow(dset)
if (opts$TST.kind=="cv") {
#===============================================
# CREATE NFOLD CROSSVALIDATION INDEX
#===============================================
cat1(opts,opts$filename,": Creating cross validation index cvi ...\n")
nfold = opts$TST.NFOLD
if (opts$TST.NFOLD <= 0)
stop(sprintf("tdmModCreateCVindex: opts$TST.NFOLD must be > 0. Current value is opts$TST.NFOLD = %d", opts$TST.NFOLD));
cvi = NULL
ilow=1
for (m in 1:nfold) {
ihigh = floor(L/nfold*m)
cvi[ilow:ihigh] = m # i.e. [111..2222...333...]
ilow = ihigh+1
}
cvi = sample(cvi) # permute CVI randomly
if (opts$TST.NFOLD == 1)
cvi = 0*cvi; # special case: if NFOLD==1 make every record a training record
# TODO: check that each training set combination (n-1 folds) has at least one record per class level.
# Can be done by checking that each fold has less records of a certain level than the overall number of records for this level.
# If not, permute anew
} else {
#=============================================
# DIVIDE INTO TRAINING SET / VALIDATION SET
#=============================================
if (opts$TST.kind=="rand") { # make a (random) division
if (is.null(opts$TST.valiFrac))
stop(sprintf("tdmModCreateCVindex: opts$TST.valiFrac is not defined"));
if (is.null(opts$TST.trnFrac)) opts$TST.trnFrac = 1-opts$TST.valiFrac;
if (opts$TST.valiFrac >= 1 | opts$TST.valiFrac < 0)
stop(sprintf("tdmModCreateCVindex: opts$TST.valiFrac must be in [0,1). Current value is opts$TST.valiFrac = %f", opts$TST.valiFrac));
if (opts$TST.trnFrac > 1 | opts$TST.trnFrac <= 0)
stop(sprintf("tdmModCreateCVindex: opts$TST.trnFrac must be in (0,1]. Current value is opts$TST.trnFrac = %f", opts$TST.trnFrac));
if (opts$TST.valiFrac+opts$TST.trnFrac>1)
stop(sprintf("tdmModCreateCVindex: opts$TST.valiFrac+opts$TST.trnFrac > 1. Current value is opts$TST.valiFrac = %f"
, opts$TST.valiFrac,", opts$TST.trnFrac = %f", opts$TST.trnFrac ));
if (stratified) {
# ** NEW 06/2011 ** the division is done by ***stratified*** random sampling (recommended for classification):
cat1(opts,opts$filename,": Stratified random training-validation-index with opts$TST.valiFrac = ",opts$TST.valiFrac*100,"%\n");
rv <- dset[,response.variables[1]];
lrv = length(response.variables);
if (lrv>1) warning(sprintf("Stratified sampling is only done w.r.t. 1st response variable. It is not guaranteed to work for all %d response variables",lrv))
# calculate tfr, the number of training set records for each level of the response variable
tfr <- sapply(unique(rv),function(x) { round(opts$TST.trnFrac*length(which(rv==x))) });
# (this seems complicated, but the simpler command: tfr <- round((1-opts$TST.valiFrac)*table(rv));
# does not work, because 'table' orders the levels alphabetically but 'unique' (or 'strata') below
# requires them in the order they appear in column rv.)
#
tfr[tfr<1] <- 1; # ensure that there is at least one record for each class
cvi <- rep(-1,L);
urv <- unique(rv);
for (i in 1:length(urv)) cvi[ sample(which(rv==urv[i]), tfr[i]) ] <- 0;
#
#--- OLD and slow ---:
# the code below with strata from package sampling does the same as the two lines above,
# but it is prohibitively slow when dset gets larger(50000 rows and more)
#require(sampling);
#s2=strata(dset,c(response.variables[1]),size=tfr, method="srswor");
#cvi[s2$ID_unit] <- 0; # set the training records
# select opts$TST.valiFrac of the non-training data as validation data:
idxCviMinus <- which(cvi==-1);
p <- sample(length(idxCviMinus));
vfr <- opts$TST.valiFrac*L; # index where the validation data ends
if (trunc(vfr)>0) cvi[idxCviMinus[p[1:vfr]]] <- 1;
} else { # i.e. stratified=FALSE
# simple random sampling (recommended for regression):
p <- sample(L) # random permutation of indices 1:L
# calculate tfr, the record where the validation set starts (opts$TST.valiFrac)
tfr <- opts$TST.trnFrac*L; # index where the training data ends
vfr <- (1-opts$TST.valiFrac)*L; # index where the validation data starts
cat1(opts,opts$filename,": Random training-validation-index with opts$TST.valiFrac = ",opts$TST.valiFrac*100,"%\n")
cvi <- rep(-1,L);
if (trunc(tfr)==0) stop(sprintf("No training data! opts$TST.trnFrac=%7.2f",opts$TST.trnFrac));
cvi[p[1:tfr]] <- 0; # training set index ( opts$TST.trnFrac percent of the data)
if (trunc(vfr)<L) cvi[p[(vfr+1):L]] <- 1; # validation set index ( opts$TST.valiFrac percent of the data)
}
} # opts$TST.kind=="rand"
else # i.e. opts$TST.kind=="col"
{ # take the validation-training-division as delivered in dset[,opts$TST.COL]
if (is.null(opts$TST.COL))
stop(sprintf("tdmModCreateCVindex: opts$TST.COL is NULL, but opts$TST.kind=='col'"));
if (!(opts$TST.COL %in% names(dset)))
stop(sprintf("tdmModCreateCVindex: Data frame dset does not contain a column opts$TST.COL named \"%s\"", opts$TST.COL));
cat1(opts,opts$filename,": Using training-validation-index from column",opts$TST.COL,"\n");
cvi <- dset[,opts$TST.COL];
if (!is.null(opts$TST.trnFrac)) {
cat1(opts,opts$filename,": Selecting only ",opts$TST.trnFrac*100,"%% of the available training data for the actual training \n");
cat1(opts,opts$filename,": (If you are in unbiased run and want all available training data, set tdm$TST.trnFrac to NULL)\n");
# deselect 1-opts$TST.trnFrac of all training data (set their cvi to -1):
idxCviNull <- which(cvi==0);
L0 = length(idxCviNull);
p <- sample(L0);
vfr <- opts$TST.trnFrac*L0; # index where the training data to use ends
if (trunc(vfr)>0) cvi[idxCviNull[p[(vfr+1):L0]]] <- -1;
}
}
}
cvi;
}
######################################################################################
# adjust sampsize (RF) in case of classification:
# - if samp==NULL, set to 3000
# - if samp is scalar, clip it to total number of training records
# - if samp is vector of length n.class, clip each element to number of training
# records for that class level
######################################################################################
tdmModAdjustSampsizeC <- function(samp, to.model, response.variable, opts) {
if (is.null(samp)) samp=3000;
if (length(samp)==1) newsamp = min(nrow(to.model),samp)
else {
newsamp <- as.vector(summary(to.model[,response.variable]));
n.class <- length(newsamp);
#browser()
if (length(samp)!=n.class)
stop(sprintf("Wrong length of sampsize 'samp'! It must be either a scalar or a vector of length n.class=%d.\n samp = ",n.class),
sprintf("%d ",samp));
newsamp <- pmin(newsamp,samp); # clip any element of samp which is larger than the respective
# number of training records (newsamp)
}
if (any(is.na(newsamp)))
stop(sprintf("sampsize 'samp' contains NA's! Consider appropriate lines 'opts$RF.samp[i] = ...' in opts-construction.\n samp = "),
paste(samp,collapse=", "))
if (any(samp!=newsamp)) cat1(opts,"Clipping sampsize to ",newsamp,"\n");
newsamp;
}
######################################################################################
# adjust sampsize (RF) in case of regression:
# - if samp==NULL, set to 3000
# - clip it to total number of training records
######################################################################################
tdmModAdjustSampsizeR <- function(samp, to.model, response.variable, opts) {
if (is.null(samp)) samp=3000;
if (length(samp)==1) newsamp = min(nrow(to.model),samp)
else {
stop(sprintf("Wrong length of sampsize! It must be a scalar, but it is samp = "),
sprintf("%d ",samp));
}
if (any(samp!=newsamp)) cat1(opts,"Clipping sampsize to ",newsamp,"\n");
newsamp;
}
######################################################################################
# adjust cutoff (optional parameter for classification with Random Forest or SVM)
# a) if length(cutoff)=n.class-1, add cutoff[n.class] as the remainder to 1.
# b) if cutoff[w] < 0 for exactly one w, then set cutoff[w] as the remainder to 1.
# (If the others have a sum >= 1, then issue a warning and scale them to sum 0.9.)
# c) if sum(cutoff) != 1, scale it to 1
# d) if sum(cutoff) > 1 by a small amount (<=1e-8), reduce it to 1 or smaller.
######################################################################################
tdmModAdjustCutoff <- function(cutoff,n.class,text="cutoff")
{
if (!is.null(cutoff)) {
if (any(is.na(cutoff)))
stop(sprintf("cutoff '%s' contains NA's! Consider appropriate lines '%s[i] = ...' in opts-construction.\n %s = ",text,text,text),
paste(cutoff,collapse=", "))
if (length(cutoff)==n.class-1) cutoff=c(cutoff,-1); # assume that the last element is to be adjusted as remainder to 1
if (length(cutoff)!=n.class) stop(sprintf("length(cutoff)=%d differs from n.class=%d. cutoff = "
,length(cutoff),n.class)
,sprintf("%7.3f ",cutoff));
w = which(cutoff<0);
if (length(w)>1) stop("cutoff has more than one element < 0.\n cutoff = ",sprintf("%7.3f ",cutoff));
if (length(w)==1) {
s = sum(cutoff[-w]);
if (s>=1) {
warning(sprintf("One element of %s is not specified (<0), but the others have a sum>=1, there is no remainder. ",text),
sprintf("%s = c(%s).\n ",text,paste(cutoff,collapse=",")),
"Reducing the sum of the others to 0.9.")
#browser()
cutoff=cutoff/s*0.9;
}
cutoff[w] = 1-sum(cutoff[-w]);
}
if (abs(sum(cutoff)-1)>1e-8) cutoff = cutoff/sum(cutoff);
#
# Due to rounding inaccurarcies, it can happen that sum(cutoff)=1+ 2.2e-16,
# i.e. slightly larger than 1. randomForest does not allow this,
# sum(cutoff) may not be larger than 1. The following code enforces this
# by modifying cutoff as little as possible:
eps = sum(cutoff)-1;
if (eps>0) {
if (eps>1e-8) stop("Something wrong with eps"); # this should not happen
eps = max(1.5*eps,1e-10);
cutoff = cutoff-eps/n.class;
if (sum(cutoff)>1) stop("Something wrong, sum(cutoff) is still >1 ");
}
}
cutoff;
}
#--- the old, now deprecated version ----------------
######################################################################################
# adjust cutoff (optional parameter for Random Forest and SVM)
# a) if length(cutoff)=n.class-1, add cutoff[n.class] as the remainder to 1.
# b) if sum(cutoff) != 1, scale it to 1
# c) if sum(cutoff) > 1 by a small amount (<=1e-8), reduce it to 1 or smaller.
######################################################################################
tdmModAdjustCutoff.OLD <- function(cutoff,n.class)
{
if (!is.null(cutoff)) {
if (length(cutoff)!=n.class) {
if (length(cutoff)!=n.class-1)
stop("Length of cutoff is not n.class-1!");
cutoff[n.class] = 1-sum(cutoff[1:(n.class-1)]);
}
if (abs(sum(cutoff)-1)>1e-8) cutoff = cutoff/sum(cutoff);
#
# Due to rounding inaccurarcies, it can happen that sum(cutoff)=1+ 2.2e-16,
# i.e. slightly larger than 1. randomForest does not allow this,
# sum(cutoff) may not be larger than 1. The following code enforces this
# by modifying cutoff as little as possible:
eps = sum(cutoff)-1;
if (eps>0) {
if (eps>1e-8) stop("Something wrong with eps");
eps = max(1.5*eps,1e-10);
cutoff = cutoff-eps/n.class;
if (sum(cutoff)>1) stop("Something wrong, sum(cutoff) is still >1 ");
}
}
cutoff;
}
######################################################################################
# tdmModSortedRFimport:
#
#' Sort the input variables decreasingly by their RF-importance.
#'
#' Build a Random Forest using \code{importance=TRUE}. Usually the RF is smaller (50 trees), to speed up computation.
#' Use na.roughfix for missing value replacement.
#' Decide which input variables to keep and return them in SRF$input.variables
#'
#' @param d_train training set
#' @param response.variable the target column from \code{d_train} to use for the RF-model
#' @param input.variables the input columns from \code{d_train} to use for the RF-model
#' @param opts options, here we use the elements [defaults in brackets]:
#' \itemize{
#' \item SRF.kind: \cr
#' ="xperc": keep a certain importance percentage, starting from the most important variable \cr
#' ="ndrop": drop a certain number of least important variables \cr
#' ="nkeep": keep a certain number of most important variables \cr
#' ="none": do not call \code{\link{tdmModSortedRFimport}} at all (see tdmRegress.r and tdmClassify.r)
#' \item SRF.ndrop: [0] how many variables to drop (if SRF.kind=="ndrop")
#' \item SRF.XPerc: [0.95] if >=0, keep that importance percentage, starting with the most
#' important variables (if SRF.kind=="xperc")
#' \item SRF.calc: [TRUE] =TRUE: calculate importance & save on SRF.file, =F: load from SRF.file
#' (SRF.file = Output/<filename>.SRF.<response.variable>.Rdata)
#' \item SRF.ntree: [50] number of RF trees
#' \item SRF.verbose: [2]
#' \item SRF.maxS: [40] how many variables to show in plot
#' \item SRF.minlsi: [1] a lower bound for the length of SRF$input.variables
#' \item RF.sampsize: sampsize for RF, set prior to calling this func via tdmModAdjustSampsize(opts$SRF.samp,...)
#' \item GD.DEVICE: if !="non", then make a bar plot on current graphic device
#' \item CLS.CLASSWT: class weight vector to use in random forest training
#' }
#' @return \code{SRF}, a list with the following elements:
#' \item{input.variables}{ the vector of input variables which remain after importance
#' processing. These are sorted by decreasing importance.}
#' \item{s_input}{all input.variables sorted by decreasing (**NEW**) importance}
#' \item{s_imp1}{ the importance for s_input}
#' \item{s_dropped}{ vector with name of dropped variables}
#' \item{lsd}{ length of s_dropped}
#' \item{perc}{ the percentage of total importance which is in the dropped variables}
#' \item{opts}{ some defaults might have been added}
#'
#' @author Wolfgang Konen, Patrick Koch \email{wolfgang.konen@@th-koeln.de}
#' @export
######################################################################################
tdmModSortedRFimport <- function(d_train, response.variable, input.variables, opts)
{
# require(randomForest); # now via direct call 'randomForest::'
opts <- tdmOptsDefaultsSet(opts);
ptm <- proc.time();
filename <- opts$filename;
#--- this is now in tdmEnvTMakeNew.r ---
# dir.output <- paste(dirname(opts$dir.output),basename(opts$dir.output),sep="/") # remove trailing "/", if it exists
# if (!file.exists(dir.output)) {
# success = dir.create(dir.output);
# if (!success) stop(sprintf("Could not create dir.output=%s",dir.output));
# }
# SRF.file <- paste(paste(dir.output,filename,sep="/"),"SRF",response.variable,"Rdata",sep=".")
############################################################
# PART 1: Calculate SRF importance (or load it from opts)
############################################################
if (opts$SRF.calc==TRUE) {
if (opts$SRF.kind=="xperc") {
if (opts$SRF.XPerc<0) stop(sprintf("opts$SRF.XPerc < 0 : %f",opts$SRF.XPerc));
if (opts$SRF.XPerc>1) stop(sprintf("opts$SRF.XPerc > 1 : %f",opts$SRF.XPerc));
}
formul <- formula(paste(response.variable, "~ .")) # use all possible input variables
to.model <- d_train[,c(response.variable,input.variables)]
cat1(opts,filename,": Train RF (importance, sampsize=", opts$RF.sampsize,") ...\n")
if (!is.null(opts$CLS.CLASSWT)) {
cat1(opts,"Class weights: ", opts$CLS.CLASSWT,"\n")
#cwt = opts$CLS.CLASSWT*1;# strange, but necessary: if we omit '*1' then cwt seems to be a copy-by-reference of
# opts$CLS.CLASSWT. After randomForest call cwt is changed and also opts$CLS.CLASSWT would be changed (!)
# (Another way to hinder RF to change the parameter 'classwt' on output is to make this vector a *named*
# vector. This is actually the case, because in tdmClassify we decorate opts$CLS.CLASSWT with the
# level names of response.variable)
cwt = sprintf("classwt=c(%s)",paste(opts$CLS.CLASSWT,collapse=","))
} else { cwt="classwt=NULL"; }
if (!is.null(opts$SRF.cutoff)) {
cat1(opts,"Cutoff: ", opts$SRF.cutoff,"\n");
#cutoff = sprintf("cutoff=c(%s)",paste(opts$SRF.cutoff,collapse=",")) # DON*T!! may lead to "Incorrect cutoff" error due to rounding inaccuracies
} else { cutoff=NULL; }
res.SRF = switch(opts$SRF.method
,"RFimp" = fsRfImportance(opts,formul,to.model,cwt)
#,"lasso" = fsLasso(opts)
,"INVALID"
);
if (res.SRF[1]=="INVALID1") {
stop(sprintf("*** Invalid FS method=%s ***\n",opts$SRF.method));
}
if (is.null(res.SRF))
stop(sprintf("FS method %s did not return a suitable result 'res.SRF'",opts$SRF.method));
imp1 <- res.SRF$imp1;
s_input <- input.variables[order(imp1,decreasing=TRUE)] # input.variables sorted by increasing importance
s_imp1 <- imp1[order(imp1,decreasing=TRUE)]
s_imp2 <- pmax(s_imp1,0);
# It can happen that unimportant variables have a negative importance s_imp1. This is only a statistical
# artefact, the importance is compatible with 0 (see notes_RF.doc for details). Since cumsum below does not
# allow negative values, we make a copy s_imp2 with all negative values clipped to 0.
cat1(opts,filename,": Saving SRF (sorted RF) importance info on opts ...\n");
opts$srf[[response.variable]] = data.frame(s_input=s_input
,s_imp1=s_imp1
,s_imp2=s_imp2
);
}
else { # i.e. if (opts$SRF.calc==F)
cat1(opts,filename,": Loading sorted RF importance info from opts ...\n")
#if (!file.exists(SRF.file)) stop(sprintf("tdmModSortedRFimport: SRF.file=%s does not exist"));
#load(file=SRF.file);
if (!any(names(opts$srf)==response.variable))
stop(sprintf("list opts$srf contains no data frame named %s. Consider opts$SRF.calc==TRUE.",response.variable));
s_input=opts$srf[[response.variable]]$s_input;
s_imp1 =opts$srf[[response.variable]]$s_imp1;
s_imp2 <- pmax(s_imp1,0);
} # if (opts$SRF.calc)
if (is.null(s_input) | is.null(s_imp1))
stop("opts$SRF.s_input or opts$SRF.s_imp1 are not available. Consider opts$SRF.calc=TRUE.");
############################################################
# PART 2: Select important variables
############################################################
if (opts$SRF.kind=="ndrop") {
# remove the SRF.ndrop input variables which have the lowest importance
# (but keep at least one):
opts$SRF.ndrop <- min(opts$SRF.ndrop,length(s_input)-1)
lsi <- length(s_input)-opts$SRF.ndrop;
} else { # i.e. opts$SRF.kind=="xperc" or "nkeep"
if (opts$SRF.kind=="nkeep") {
opts$SRF.nkeep <- lsi <- min(opts$SRF.nkeep,length(s_input));
} else { # i.e. opts$SRF.kind=="xperc"
# keep the minimal number of those most important input variables which
# sum up together to more than opts$SRF.XPerc * (sum of all importances)
w = which(cumsum(s_imp2)/sum(s_imp2)<=opts$SRF.XPerc)
if (length(w)==0) w=0; # for the pathological case that s_imp2 is all-zero beyond the first element -> w is empty
lsi <- max(w)+1;
lsi <- min(lsi,length(s_imp1)); # for the case SRF.XPerc==1: lsi may not be larger than #input.variables
}
}
# don't let lsi = length(input.variables) become smaller than minlsi
minlsi <- min(length(s_input),opts$SRF.minlsi);
lsi <- max(lsi,minlsi);
input.variables <- as.character(s_input[1:lsi])
s_dropped <- setdiff(s_input,input.variables);
lsd <- length(s_dropped);
# what percentage of total importance is in the dropped variables:
perc <- (1-sum(s_imp2[1:(length(s_imp2)-lsd)])/sum(s_imp2))*100;
if (opts$SRF.kind=="xperc" ) {
if (sum(s_imp2)==0)
stop(sprintf("Can't continue: None of the importances is >0 --> probably you have very imbalanced data!\n %s %s",
"Consider to enlarge your dataset with more examples from every class - OR -\n",
"to set opts$SRF.kind='none'."));
if (perc/100>1-opts$SRF.XPerc | perc<0)
stop(sprintf("Something wrong with perc: %f",perc));
}
if (opts$SRF.verbose>=2) {
lmi <- length(s_input);
cat(sprintf("Variables sorted by importance (%d %s):\n", lmi,
ifelse(lmi>100,", we show first 100 only","")));
print(s_input[1:min(lmi,100)]);
if (opts$SRF.kind=="ndrop" | (opts$SRF.kind=="xperc" & opts$SRF.XPerc>0.5)) {
cat(sprintf("Dropped columns (%d [=%5.1f%% of total importance]%s):\n", lsd, perc,
ifelse(lsd>100,", we show first 100 only","")));
if (lsd>0) print(s_dropped[1:min(lsd,100)]);
} else { # i.e. opts$SRF.kind=="nkeep" or (..=="xperc" & SRF.XPerc <= 0.5)
cat(sprintf("Kept columns (%d [=%5.1f%% of total importance]%s):\n", lsi, 100-perc,
ifelse(lsi>100,", we show first 100 only","")));
if (lsi>0) print(s_input[1:min(lsi,100)]);
}
cat(sprintf("Proc time: %5.2f\n",(proc.time()-ptm)[1]));
}
############################################################
# PART 3: Plot the importance
############################################################
if (opts$GD.DEVICE!="non") {
maxS <- min(length(s_imp1),opts$SRF.maxS)
tdmGraphicNewWin(opts)
oldmar = par()$mar
if (opts$GD.DEVICE!="rstudio") { # some barplots cause an "Error in plot.new() : figure margins too large"
# on device RStudioGD >> we omit barplot if in RStudio
par(mar=c(10,3,2,2)+0.1)
barplot(t(s_imp1[1:maxS]), names.arg=s_input[1:maxS], las=3, cex.lab=0.75, col=2, main=paste(filename, response.variable, sep=" : "))
par(mar=oldmar)
}
tdmGraphicCloseWin(opts);
if (opts$SRF.calc==TRUE & opts$SRF.method=="RFimp") {
tdmGraphicNewWin(opts)
randomForest::varImpPlot(res.SRF,n.var=maxS) # NEW: plot both MeanDecreaseAccuracy and MeanDecreaseGini
tdmGraphicCloseWin(opts);
}
}
SRF = list(input.variables=input.variables
, s_input=s_input
, s_imp1=s_imp1
, s_dropped=s_dropped # vector with name of dropped variables
, lsd=lsd # length of s_dropped
, perc=perc # the percentage of total importance which is in the dropped variables
, opts=opts # some elements might have been added
);
}
# fsRfImportance: helper for tdmModSortedRFimport
fsRfImportance <- function(opts,formul, to.model, cwt) {
# require(randomForest); # for 'na.roughfix', now via direct call 'randomForest::'
testit::assert("Cutoff is bigger than 1",sum(opts$SRF.cutoff)<=1)
# we work here with a command text string and eval(...) to allow for the presence or
# absence of certain options like "mtry" or "cutoff" which are not allowed to be NULL:
rf.options = "ntree=opts$SRF.ntree, importance=TRUE"; # NEW: only with 'importance=TRUE' we see MeanDecreaseAccuracy,
# otherwise we get only MeanDecreaseGini (faster, but sometimes unreliable)
rf.options = paste(rf.options,"sampsize=opts$RF.sampsize",sep=",")
rf.options = paste(rf.options,cwt,"na.action=randomForest::na.roughfix","proximity=F",sep=",")
if (!is.null(opts$SRF.mtry)) rf.options = paste(rf.options,"mtry=opts$SRF.mtry",sep=",")
if (!is.null(opts$SRF.cutoff)) rf.options = paste(rf.options,"cutoff=opts$SRF.cutoff",sep=",")
#
#if (!is.null(opts$SRF.cutoff)) rf.options = paste(rf.options,cutoff,sep=",") # DON'T!! this may lead to "Incorrect cutoff" error due to rounding!
#
#dbg_chase_cutoff_bug(formul,to.model,d_train,response.variable,rf.options,opts);
#print(.Random.seed[1:6]);
flush.console();
res.SRF <- NULL;
#browser()
eval(parse(text=paste("res.SRF <- randomForest::randomForest( formul, data=to.model,",rf.options,")")));
# select MeanDecreaseAccuracy-importance (NEW 05/11, together with switch 'importance=TRUE' above)
res.SRF$imp1 <- randomForest::importance(res.SRF, type=1, scale=opts$SRF.scale);
#print(.Random.seed[1:6]);
# only for debug or in-depth-analysis:
#analyzeImportance(res.SRF,input.variables,opts);
res.SRF;
}
# fsLasso <- function(opts) {
# # *** does not yet work in all cases !!! ***
# require(lasso2);
# formul <- formula(paste("as.numeric(",response.variable,") ~ .")) # use all possible input variables
# res.SRF <- NULL;
# #browser()
# eval(parse(text=paste("res.SRF <- l1ce( formul, data=to.model, bound=0.5)")));
# res.SRF$imp1 <- as.matrix(res.SRF$coefficients[-1]);
#
# res.SRF;
# }
# helper for fsRfImportance
# (only for debug or in-depth-analysis of the importance)
analyzeImportance <- function(res.SRF,input.variables,opts) {
#
# 1st finding: variables which appear never as split variable have necessarily importance=0
#
splitVar=NULL; # the union of all variables which appear ever as a split in a tree of the forest
splitLst=NULL; # the list of all split variables (variables may appear more than once)
rn=rownames(res.SRF$imp1);
for (k in 1:res.SRF$ntree) splitVar=union(splitVar,randomForest::getTree(res.SRF,k=k,labelVar=T)$"split var")
splitVar = sort(splitVar); # remove the <NA>
noSplitVar = sort(setdiff(input.variables,splitVar))
for (k in 1:res.SRF$ntree) splitLst=c(splitLst,as.character(randomForest::getTree(res.SRF,k=k,labelVar=T)$"split var"))
splitLst=sort(splitLst);
impZeroVar = sort(rn[res.SRF$imp1==0])
cat(sprintf("%d variables have importance exactly zero\n",length(impZeroVar)));
cat(sprintf("%d variables never appear in a split\n",length(noSplitVar)));
if (length(setdiff(noSplitVar,impZeroVar))==0) cat("All variables never appearing in a split have importance exactly zero\n")
cat(sprintf("setdiff(impZeroVar,noSplitVar) is: ")); print(setdiff(impZeroVar,noSplitVar))
#
# 2nd finding: variables with negative importance have a sligthly smaller split count than those with positive importance
# Most variables with negative importance have a very small raw importance (%IncMSE), it is the division by
# importanceSD which makes the negative value big.
#
impNegVar = sort(rn[res.SRF$imp1<0]);
impPosVar = setdiff(splitVar,impNegVar);
cat("Split count for variables with negative importance (we show first 6 only):\n")
for (v in impNegVar[1:min(length(impNegVar),6)]) cat(sprintf("%s: %d %e\n", v,length(which(splitLst==v)),res.SRF$imp1[rownames(res.SRF$imp1)==v]))
avg=0; for (v in impNegVar) avg = avg + length(which(splitLst==v));
cat("Avg. count:", avg/length(impNegVar),"\n");
cat("Split count for variables with positive importance (we show first 6 only):\n")
for (v in impPosVar[1:min(length(impPosVar),6)]) cat(sprintf("%s: %d %f\n", v,length(which(splitLst==v)),res.SRF$imp1[rownames(res.SRF$imp1)==v]))
avg=0; for (v in impPosVar) avg = avg + length(which(splitLst==v));
cat("Avg. count:", avg/length(impPosVar),"\n");
#
par(mfcol=c(1,2))
ind1 = order(res.SRF$imp1)
plot(res.SRF$imp1[ind1],ylim=c(-2,3))
points(res.SRF$importance[ind1,1]*15,col="green")
ind2 = order(res.SRF$importance[,1])
plot(res.SRF$importance[ind2,1]*10,col="green",col.lab="green",ylim=c(-2,3))
points(res.SRF$imp1[ind2])
par(mfcol=c(1,1))
#
# 3rd finding: variables with |importance|=1.01... (exactly the same value) have mostly a split count of exactly 1.
# The quotient %IncMSE/importanceSD can be shown to be exactly +-sqrt(N/(N-1)) in this case.
#
N=opts$SRF.ntree;
impOneVar = sort(rn[abs(res.SRF$imp1)==sqrt(N/(N-1))]); # all variables with |importance| = 1.010153 in case N=50
cat(sprintf("Split count for variables with |importance| =%9.6f (we show first 6 only):\n",sqrt(N/(N-1)) ));
for (v in impOneVar[1:min(length(impOneVar),6)]) cat(sprintf("%s: %d %f\n", v,length(which(splitLst==v)),res.SRF$imp1[rownames(res.SRF$imp1)==v]))
avg=0; for (v in impOneVar) avg = avg + length(which(splitLst==v));
cat("Avg. count:", avg/length(impOneVar),"\n");
#browser()
}
######################################################################################
# tdmModVote2Target
#
#' Analyze how the vote fraction corresponds to reliability of prediction.
#'
#' This function analyzes whether in different vote bins the trained RF makes
#' predictions with different reliability. Only for RF-prediction in case of
#' binary (0/1) classification. \cr\cr Expected result: The larger the fraction
#' of trees voting for class 0 is, the smaller is the percentage of true class-1-
#' cases in this vote bin.
#' This function is somewhat specialized for the DMC2010-task.
#'
#' @param vote0 vector: which fraction of trees votes for class 0?
#' @param pred vector: the predicted class for each record (0/1)
#' @param target vector: the true class for each vector (0/1)
#'
#' @return a data frame with columns
#' \item{vcut}{ vote cut v}
#' \item{count}{ number of cases with vote fraction in [v[i-1],v[i]]}
#' \item{pred0}{ fraction of 0-predictions}
#' \item{pCorr}{ fraction of correct predictions}
#' \item{pR}{ fraction of true 1-cases}
#'
#' @author Wolfgang Konen \email{wolfgang.konen@@th-koeln.de}
#' @export
######################################################################################
tdmModVote2Target <- function(vote0,pred,target) {
tvot <- data.frame(v0=vote0,pred=pred,targ=target);
vcut <- c(0.4,0.5,0.6,0.65,0.70,0.73,0.75,0.80,0.85,0.90,0.95,1.0)
R <- nrow(tvot)
tres <- data.frame()
i=1; vprev=0;
for (v in vcut) {
ind <- which(vprev< tvot$v0 & tvot$v0 <= v)
I <- length(ind);
tres[i,"vcut"] <- v;
tres[i,"count"] <- I;
tres[i,"frac"] <- I/R;
tres[i,"pred0"] <- length(which(tvot$pred[ind]==0))/I;
tres[i,"pCorr"] <- length(which(tvot$targ[ind]==tvot$pred[ind]))/I;
tres[i,"pR"] <- length(which(tvot$targ[ind]==1))/I;
i=i+1; vprev=v;
}
tres;
}
######################################################################################
# tdmModConfmat
#
#' Calculate confusion matrix, gain and RGain measure.
#'
#' @param d data frame
#' @param colreal name of column in d which contains the real class
#' @param colpred name of column in d which contains the predicted class
#' @param opts a list from which we use the elements: \itemize{
#' \item \code{gainmat}: the gain matrix for each possible outcome, same size as \code{cm$mat} (see below). \cr
#' \code{gainmat[R1,P2]} is the gain associated with a record of real class R1 which we
#' predict as class P2. (gain matrix = - cost matrix)
#' \item \code{rgain.type}: one out of \{"rgain" | "meanCA" | "minCA" | "bYouden"
#' | "arROC" | "arLIFT" | "arPRE" \},
#' affects output \code{cm$mat} and \code{cm$rgain}, see below.
#' }
#' @param predProb if not NULL, a data frame with as many rows as data frame \code{d}, containing
#' columns (index, true label, predicted label, prediction score). Is only needed
#' for \code{opts$rgain.type=="ar*"}.
#'
#' @return \code{cm}, a list containing:
#' \item{mat}{ matrix with real class levels as rows, predicted class levels columns. \cr
#' \code{mat[R1,P2]} is the number of records with real class R1
#' predicted as class P2, if opts$rgain.type=="rgain".
#' If opts$rgain.type=="meanCA" or "minCA", then show this number as percentage
#' of "records with real class R1" (percentage of each row).
#' CAUTION: If there are NA's in column \code{colpred}, those cases are missing
#' in \code{mat} (!) (but the class errors are correct as long as there are
#' no NA's in column \code{colreal})}
#' \item{cerr}{ class error rates, vector of size nlevels(colreal)+1. \cr
#' \code{cerr[X]} is the misclassification rate for real class X. \cr
#' \code{cerr["Total"]} is the total classification error rate.}
#' \item{gain}{ the total gain (sum of pointwise product \code{opts$gainmat*cm$mat}) }
#' \item{gain.vector}{ gain.vector[X] is the gain attributed to real class label X.
#' gain.vector["Total"] is again the total gain.}
#' \item{gainmax}{ the maximum achievable gain, assuming perfect prediction}
#' \item{rgain}{ Depending on the value of \code{opts$rgain.type}: \cr
#' \code{"rgain"}: ratio gain/gainmax in percent, \cr
#' \code{"meanCA"}: mean class accuracy percentage (i.e. mean(diag(cm$mat)), \cr
#' \code{"minCA"}: min class accuracy percentage (i.e. min(diag(cm$mat)), \cr
#' \code{"bYouden"}: balanced Youden index: min(sensitivity,specificity), \cr
#' \code{"arROC"}: area under ROC curve (a number in [0,1]), \cr
#' \code{"arLIFT"}: area between lift curve and horizontal line 1.0, \cr
#' \code{"arPRE"}: area under precision-recall curve (a number in [0,1])
#' }
#'
#' @note For all measures \code{rgain} holds: The higher, the better. \cr
#' The last four elements of \code{opts$rgain.type=} \code{"bYouden","arROC",}
#' \code{"arLIFT","arPre"} are only available for binary classification. \cr
#' For case \code{"bYouden"}: \cr
#' sensitivity = \code{TP/(TP+FN)} \cr
#' specificity = \code{TN/(TN+FP)} \cr
#'
#' @author Wolfgang Konen (\email{wolfgang.konen@@th-koeln.de}), Patrick Koch
#' @seealso \code{\link{tdmClassify}} \code{\link{tdmROCRbase}}
#' @export
######################################################################################
tdmModConfmat <- function(d,colreal,colpred,opts,predProb=NULL)
{
if (is.null(opts$rgain.type)) opts$rgain.type="rgain";
if (!(opts$rgain.type %in% c("rgain","meanCA","minCA","bYouden","arROC","arLIFT","arPRE")))
stop(sprintf("Invalid option opts$rgain.type=\"%s\"\n Allowed values are \"rgain\",\"meanCA\",\"minCA\",\"bYouden\",\"arROC\",\"arLIFT\",\"arPRE\"",opts$rgain.type));
AREA = opts$rgain.type %in% c("arROC","arLIFT","arPRE");
if (AREA & is.null(predProb))
stop(sprintf("Can not calculate the measure rgain for opts$rgain.type==%s, if argument predProb is NULL",opts$rgain.type));
gainmat = opts$CLS.gainmat;
col.real <- factor(d[,colreal],levels=colnames(gainmat));
col.pred <- factor(d[,colpred],levels=colnames(gainmat));
#cmat = matrix(0,nrow=nlevels(col.real),ncol=nlevels(col.pred),
# dimnames=list(levels(col.real),levels(col.pred)))
#-- cmat now simpler to obtain with function table (see below)
ccase = matrix(0,nrow=1,ncol=nlevels(col.real)+1,
dimnames=list("class cases:",c(levels(col.real),"Total")))
# class cases
for (rc in levels(col.real)) {
ccase[1,rc] <- length(which(col.real==rc))
#d1 <- d[col.real==rc, ] # all records with real class rc
#ccase[1,rc] <- dim(d1)[1]
#for (pc in levels(col.pred)) {
# cmat[rc,pc] <- length(which(d1[,colpred]==pc))
#}
}
ccase[1,"Total"] <- sum(ccase) # all cases, including those with NA in col.pred
# (but excluding those with NA in col.real)
# confusion matrix
cmat <- table(actual=col.real,predicted=col.pred)
# each case with NA in colpred is in *no* cell of cmat
# class errors
cerr = matrix(0,nrow=1,ncol=nlevels(col.pred)+1,
dimnames=list("class errors:",c(levels(col.pred),"Total")))
for (rc in levels(col.real)) {
cerr[1,rc] <- 1-cmat[rc,rc]/ccase[1,rc]
}
cerr[1,"Total"] <- 1-sum(diag(cmat))/ccase[1,"Total"]
gain = sum(gainmat*cmat)
gain.vector = matrix(0,nrow=1,ncol=nlevels(col.pred)+1,
dimnames=list("gain.vector",c(levels(col.pred),"Total")))
for (rc in levels(col.real)) {
gain.vector[1,rc] <- sum(gainmat[rc,] * cmat[rc,])
}
gain.vector[1,"Total"] <- sum(gain.vector)
gainmax = sum(apply(gainmat,1,max)*rowSums(cmat))
# the 1st term is a vector containing the max gain for each row (true class)
# the 2nd term is a vector containing the #records for each true class
rgain = gain/gainmax*100;
nacase = length(which(is.na(col.pred)));
if (sum(cmat)+nacase!=ccase[1,"Total"]) {
#browser()
stop("tdmModConfmat: Something wrong in NA-counting!");
}
if (opts$rgain.type %in% c("meanCA","minCA")) {
for (rc in levels(col.real)) {
cmat[rc,] <- cmat[rc,]/ccase[1,rc];
}
rgain=switch(opts$rgain.type
, "meanCA" = mean(diag(cmat))
, "minCA" = min(diag(cmat))
);
}
if (AREA) {
perf=switch(opts$rgain.type
, "arROC" = tdmROCR_calc(predProb,"tpr","fpr")
, "arLIFT" = tdmROCR_calc(predProb,"lift","rpp")
, "arPRE" = tdmROCR_calc(predProb,"prec","rec")
);
rgain=switch(opts$rgain.type
, "arROC" =, "arPRE" = tdmROCR_area(perf,"ROC")
, "arLIFT" = tdmROCR_area(perf,"lift")
);
}
balancedYouden <- function(cm) {
testit::assert("Confusion matrix cm has to have two rows for opts$rgain.type=='bYouden'",nrow(cm)==2)
sensitivity <- cm[2,2]/(cm[2,1]+cm[2,2])
specificity <- cm[1,1]/(cm[1,1]+cm[1,2])
return(min(sensitivity,specificity))
}
if (opts$rgain.type=="bYouden") {
rgain = balancedYouden(cmat)
}
cm = list( mat=cmat
,cerr=cerr
,ccase=ccase
,nacase=nacase
,gain=gain
,gain.vector=gain.vector
,gainmax=gainmax
,rgain=rgain
);
return(cm)
}
# debug info to chase the "Incorrect cutoff specified bug" in training randomForest
dbg_chase_cutoff_bug <- function(formul,to.model,d_train,response.variable,rf.options,opts) {
print2(opts,c(Targets=NA,table(to.model[,response.variable])))
dir.Rdata <- paste(dirname(opts$dir.Rdata),basename(opts$dir.Rdata),sep="/") # remove trailing "/", if it exists
if (!file.exists(dir.Rdata)) {
success = dir.create(dir.Rdata);
if (!success) stop(sprintf("Could not create dir.Rdata=%s",dir.Rdata));
}
save(formul,to.model,response.variable,rf.options,opts,file=paste(dir.Rdata,"rf_input_dbg.Rdata",sep="/"));
cat1(opts,"RF-debug-data saved to", paste(dir.Rdata,"rf_input_dbg.Rdata",sep="/"),"\n");
if (!is.null(opts$SRF.cutoff))
if (length(levels(d_train[,response.variable]))!=length(opts$SRF.cutoff)) {
warning("Cutoff problems ahead!!",immediate.=TRUE)
browser()
}
if (sum(opts$SRF.cutoff)-1>0) {
warning(sprintf("Sorry, but sum(opts$SRF.cutoff) is larger than 1 by %f",sum(opts$SRF.cutoff)-1),immediate.=TRUE)
browser()
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.