R/CVSetup.R

Defines functions CVSetup preProc logFunc quantileFunc

source("quantilenorm.R")
CVSetup = function(Internal,mySettings,Xtrain,ytrain){
  'Do some setup and return PrepareDataInfo
  '
  PrepareDataInfo = list()
  sett = mySettings$preProcessing
  timeString = format(Sys.time(), "%Y%m%d%H%M")
  N = dim(Xtrain)[1] # number of samples
  # check missing values
  if(any(is.na(Xtrain))){
    cat("Missing values detected in Xtrain. Imputing by median of values present in the sample(s).")
    tempf=function(x){
      if(any(is.na(x)))
        x[is.na(x)]=median(x, na.rm=TRUE)
      return(x)
    }
    Xtrain=t(apply(Xtrain,1,tempf)) # transpose required because apply sucks
  }
  if(any(is.na(ytrain))){
    stop("ytrain must not have any missing values.")
  }
  
  #pre process full data and store
  preProcout = preProc(settings=sett,Xtrain=Xtrain)
  Xtrain.norm = preProcout$xouttrain
  PrepareDataInfo$PreProcModel = preProcout$Model # data.frame with as many entries as there are steps
  rm(preProcout)
  PrepareDataInfo$fulltraindatafilename = file.path(mySettings$directory$intermediate,
  sprintf("%s_FullData_%s.Rdata", mySettings$projectname,timeString))
  save(Xtrain.norm, ytrain, file = PrepareDataInfo$fulltraindatafilename)
  # CV enabled?
  if(sett$crossValidation$CVEnable){
    if(!require("cvTools")){
      stop("Package cvTools is a requirement to run this pipeline. Please install yaml.")
    }
    if(sett$crossValidation$LOOCV){
      CVRepeats=1
      CVFolds=N
    }
    else{ # not LOOCV
      CVRepeats=sett$crossValidation$CVRepeats
      CVFolds=sett$crossValidation$CVFolds
    }
    # CHANGE drawing of folds into stratified:
    set.seed(sett$crossValidation$randomSeed)
    cvfolds = cvFolds(n=N, K = CVFolds, R = CVRepeats)
    #print(cvfolds)
    PrepareDataInfo$cvfoldfilenames = data.frame(matrix(NA, nrow = CVRepeats, ncol = CVFolds))
    PrepareDataInfo$cvInfo = cvfolds # store for later use
    
    # loop through repeats and folds
    for (iii in 1:cvfolds$R){ # repeats
      parcvfoldfilenames = foreach(jjj = 1:cvfolds$K, .combine=c) %dopar% { # folds
        trainInds = cvfolds$subsets[cvfolds$which!=jjj,iii]
        testInds = cvfolds$subsets[cvfolds$which==jjj,iii]
        CVX.train = Xtrain[trainInds,,drop=F]
        CVX.test = Xtrain[testInds,,drop=F]
        if(is.matrix(ytrain)){
          CVy.train = ytrain[trainInds,,drop=F]
          CVy.test = ytrain[testInds,,drop=F]
        }
        else{ 
          CVy.train = ytrain[trainInds]
          CVy.test = ytrain[testInds]
        }

        ## pre-process X
        normTemp = preProc(CVX.train,settings=sett,Xtest=CVX.test)
        CVX.train.norm = normTemp$xouttrain
        CVX.test.norm = normTemp$xouttest
        #PrepareDataInfo$cvfoldfilenames[iii,jjj] = file.path(mySettings$directory$intermediate, sprintf("%s_CVRepeat%dFold%d_%s.Rdata",mySettings$projectname,iii,jjj,timeString))
        cvfoldfilename = file.path(mySettings$directory$intermediate, sprintf("%s_CVRepeat%dFold%d_%s.Rdata",mySettings$projectname,iii,jjj,timeString))
        save(CVX.train.norm, 
                  CVX.test.norm, 
                  CVy.train, 
                  CVy.test, 
                  trainInds, 
                  testInds, 
             file = cvfoldfilename)
        cvfoldfilename # return value for parallel loop 
      }
      PrepareDataInfo$cvfoldfilenames[iii,] = parcvfoldfilenames
    } 
  }
  return(PrepareDataInfo)
}
    
preProc=function(settings,Xtrain=NULL,Xtest=NULL,Model=NULL){
  output=list()
  # if normalisation etc. steps disabled: 
  if(is.null(settings$preProcSteps)||length(settings$preProcSteps)<1){
    output$xouttrain=Xtrain
    output$xouttest=Xtest
    return(output)
  }
  NumPreProcSteps=length(settings$preProcSteps)
  NewModel=data.frame(matrix(nrow=1,ncol=NumPreProcSteps)) # temporary storage for preprocessing models
  # otherwise, loop through preProcSteps
  for(ii in 1:NumPreProcSteps){
    SubModel=Model[[ii]] # extract preprocessing model for this step (possibly NULL)
    funcHandle=get( paste(tolower(settings$preProcSteps[[ii]]$method),"Func",sep="") )
    outtemp=funcHandle(settings$preProcSteps[[ii]],Xtrain,Xtest,SubModel)
    Xtrain=outtemp$xouttrain # update Xtrain
    Xtest=outtemp$xouttest # update Xtest
    NewModel[[ii]]=outtemp$Model
    rm(outtemp)
  }
  output=list(xouttrain=Xtrain,xouttest=Xtest,Model=NewModel)
  return(output)
}

logFunc=function(stepSettings,Xtrain=NULL,Xtest=NULL,Model=NULL){
  myBase=stepSettings$value
  if(!is.null(Xtrain))
    Xtrain=log(Xtrain,myBase)
  if(!is.null(Xtest))
    Xtest=log(Xtest,myBase)
  output=list(xouttrain=Xtrain,xouttest=Xtest,Model=NA)
  return(output)
}

quantileFunc=function(stepSettings,Xtrain=NULL,Xtest=NULL,Model=NULL){
  output=list(xouttrain=NULL,xouttest=NULL,Model=NULL)
  if(!is.null(Model)){
    quantiles=Model$quantiles
    if(!is.null(Xtrain)){
      temp=quantilenorm(Xtrain, refquant=quantiles) 
      output$xouttrain=temp$xout
      rm(temp)
    }
  }else{
    if(is.character(stepSettings$value)){
        my_quant_method = stepSettings$value
        my_quant_prob = NA 
    }else
    {
        my_quant_method = "quant"
        my_quant_prob = stepSettings$value
    }

    temp=quantilenorm(Xtrain, method=my_quant_method, quantprob=my_quant_prob)
    quantiles=temp$quantiles
    output$xouttrain=temp$xout
    rm(temp)
  }
  if(!is.null(Xtest)){
    temp2=quantilenorm(Xtest, refquant=quantiles)
    output$xouttest=temp2$xout
  }
  output$Model$quantiles = quantiles
  return(output)
}
mjafin/PredictiveModelPipeline documentation built on Feb. 11, 2020, 3:39 a.m.