# R/QlearningPersExpPath.R

#Model Persistent QPathing with Exploration Path

#' Default Parameters for (improved) Q-Learning
#' 
#' Returns the 'algo.par' necessary for \code{\link{Setup.QLearningPersExpPath}}, \code{\link{Initialise.QLearningPersExpPath}} and \code{\link{Train.QLearningPersExpPath}}
#' 
#' @param setting String which defines the setting of the default parameters. \itemize{
#'   \item "ThesisOpt.XGB" - Default setting. The same parameters are used as for the final results of the thesis of Martin Kies. Uses the Gradient Boosting variant.
#'   \item "ThesisOpt.RNN" - The same parameters are used as for the final results of the thesis of Martin Kies. Uses the Recurrent Neural Network (LSTM) variant.
#'   \item "Legacy.v.0.1.6" - Uses the default settings of version 0.1.6 of this package.
#'   \item "QLearning.Basic" - Uses the parameters of a basic Q-Learning setting.
#' }
#'
#' @export
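#'
#' @examples
#' \dontrun{
#' # Minimal usage sketch: fetch the default algorithm parameters for the
#' # gradient boosting variant used for the thesis results.
#' algo.par <- Get.Def.Par.QLearningPersExpPath(setting = "ThesisOpt.XGB")
#' }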
Get.Def.Par.QLearningPersExpPath <- function(setting="ThesisOpt.XGB"){
  
  if(is.null(setting)||setting=="ThesisOpt.XGB"){
    algo.params <- Get.Def.Par.QLearningPersExpPath.Legacy.ThesisOpt.XGB()
  } else if (setting=="ThesisOpt.RNN"){
    algo.params <- Get.Def.Par.QLearningPersExpPath.Legacy.ThesisOpt.RNN()
  } else if (setting=="Legacy.v.0.1.6"){
    algo.params <- Get.Def.Par.QLearningPersExpPath.Legacy.v.0.1.6()
  } else if (setting=="QLearning.Basic"){
    algo.params <- Get.Def.Par.QLearningPersExpPath.QLearning.Basic()
  } else {
    stop("Can't generate default parameters algo.par for QLearningPersExpPath due to misspecified setting.")
  }
  
  return(algo.params)
}

#' Controlled Copying of Models
#'
#' Returns a new, updated evaluator
#'
#' @param evaluator Evaluator consisting of the current and best main models and, optionally, the surprise and familiarity models.
#' @param new.model Main model with which we want to update. If \code{NULL}, no update is done.
#' @param new.model.surp If not \code{NULL}, the surprise model is updated accordingly.
#' @param new.model.fam If not \code{NULL}, the familiarity model is updated accordingly.
#' @param which.update May be \code{both}, \code{cur} or \code{best} and updates the respective main model(s).
#' @param model.par Model parameters of the main model.
#' @export
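#'
#' @examples
#' \dontrun{
#' # Illustrative sketch: replace only the current main model of an existing
#' # evaluator; 'evaluator', 'trained.model' and 'model.par' are assumed to
#' # come from Setup.QLearningPersExpPath() and a preceding training step.
#' evaluator <- Update.Evaluator.QLearningPersExpPath(evaluator,
#'   new.model = trained.model, which.update = "cur", model.par = model.par)
#' }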
Update.Evaluator.QLearningPersExpPath <- function(evaluator, new.model=NULL, new.model.surp=NULL, new.model.fam=NULL, which.update="both", model.par){
  #restore.point("Update.Evaluator.QLearningPersExpPath")

  #Update main model?
  if(!is.null(new.model)){
    switch(which.update,
           both={
             update.best <- TRUE
             update.cur <- TRUE
           },
           cur={
             update.best <- FALSE
             update.cur <- TRUE
           },
           best={
             update.best <- TRUE
             update.cur <- FALSE
           },
           {#default
             stop("which.update does not specify which model should be updated. 'both', 'cur' and 'best' are supported.")
           }
    )

    #Currently both model types are copied by simple assignment
    if(update.best) evaluator$model.best <- new.model
    if(update.cur) evaluator$model.cur <- new.model
  }

  #Update Surprise?
  if(!is.null(new.model.surp)){
    evaluator$model.surp <- new.model.surp
  }

  #Update Familiarity?
  if(!is.null(new.model.fam)){
    evaluator$model.fam <- new.model.fam
  }

  return(evaluator)
}

# Update the familiarity model with the observed state/action combinations
#' @export
Update.Familiarity.Model.QLearningPersExpPath <- function(game.object, model.fam, model.par.fam, x_train, action){
  restore.point("Update.Familiarity.Model.QLearningPersExpPath")

  n.act <- game.object$game.par(game.object)$output.nodes
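  # The experienced actions are labelled 1 (familiar) below, while all other
  # actions of the action space are labelled 0, so the familiarity model learns
  # which state/action combinations have actually been observed.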

  action.space.composite <- t(sapply(action,FUN=function(x){return((1:n.act)[-x])}))
  action.space.composite.long <- matrix(action.space.composite,ncol=1)
  action.all <- matrix(c(action,action.space.composite.long),ncol=1)
  y_train.all <- matrix(c(rep(1,length(action)),rep(0,length(action.all)-length(action))),ncol=1)

  if(model.par.fam$name=="RNN.Basic"){
    states.l.list <- lapply(1:n.act,FUN=function(x){x_train})
    x_train.all <- abind(states.l.list, along=1)
  } else {
    x_train.all <- do.call(rbind, replicate(n.act, x_train, simplify=FALSE))
  }

  model.fam <- model.par.fam$train(model.fam, model.par.fam, x_train.all, y_train.all, action.all)

  return(model.fam)
}

# Update the surprise model with the observed surprise scores
#' @export
Update.Surprise.Model.QLearningPersExpPath <- function(game.object, model.surp, model.par.surp, x_train, action, surprise.score){
  #restore.point("Update.Surprise.Model.QLearningPersExpPath")
  model.surp <- model.par.surp$train(model.surp, model.par.surp, x_train, surprise.score, action)$model
  return(model.surp)
}

#Memory.Function
#' Trains model based on memory
#'
#' @param model.par Model parameters for main model
#' @param model.par.surp Model parameters for the surprise model. If \code{NULL}, no surprise model is trained.
#' @param model.par.fam Model parameters for the familiarity model. If \code{NULL}, no familiarity model is trained.
#' @param type May be \code{init} for the initial training on memory or \code{replay} for experience replay.
#' @export
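#'
#' @examples
#' \dontrun{
#' # Hedged sketch of an initial training pass over the full memory;
#' # 'evaluator', 'model.par', 'algo.par', 'algo.var' and 'game.object' are
#' # assumed to have been created via the Setup/Initialise functions.
#' train.res <- Train.On.Memory.QLearningPersExpPath(model = evaluator$model.cur,
#'   model.par = model.par, algo.var = algo.var, algo.par = algo.par,
#'   game.object = game.object, type = "init")
#' }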
Train.On.Memory.QLearningPersExpPath <- function(model, model.surp=NULL, model.fam=NULL, model.par, model.par.surp=NULL, model.par.fam=NULL, algo.var, algo.par, game.object, type="init", mem.replay=NULL, ...){
  Weigh.History <- function(y.pred,Q.path.train, minibatch.no=NULL, algo.par, mem.length, type, MC.factor=NULL){
    #restore.point("Weigh.History")

    if(type=="init"){
      weights <- rep(1, times=mem.length)
    } else {
      if(algo.par$hybrid.switch){
        weights <- MC.factor
      } else {
        weights <- minibatch.no/mem.length #weights according to the age of the memory entry (newer entries get a higher weight)
      }

    }

    Q.path.train[is.na(Q.path.train)] <- y.pred[is.na(Q.path.train)]

    res <- (1-weights)*y.pred + weights*Q.path.train

    return(res)
  }
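  # Illustration (comment only) of Weigh.History in the non-hybrid replay case:
  # with mem.length = 100, an entry drawn from memory position 80 gets weight
  # 80/100 = 0.8, so its target is 0.2*y.pred + 0.8*Q.path.train, i.e. more
  # recent memories lean more heavily on the Monte-Carlo path value.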

  restore.point("Train.On.Memory.QLearningPersExpPath")

  #Generate Training sample (Batch) out of complete memory
  if(type=="replay"){
    if(length(algo.var$memory)<algo.par$batch.size){
      batch.size <- length(algo.var$memory)
    } else {
      batch.size <- algo.par$batch.size
    }

    if(algo.par$force.last>batch.size){
      force.last <- batch.size
    } else {
      force.last <- algo.par$force.last
    }
    if(force.last==batch.size){
      minibatch <- algo.var$memory[(length(algo.var$memory)-force.last+1):length(algo.var$memory)]
      minibatch.no <- (length(algo.var$memory)-force.last+1):length(algo.var$memory)
    } else {
      if(force.last==0){
        minibatch.no <- c(sample(1:length(algo.var$memory), batch.size))
      } else {
        minibatch.no <- c((length(algo.var$memory)-force.last+1):length(algo.var$memory),sample(1:length(algo.var$memory), batch.size-force.last))
      }
      minibatch <- algo.var$memory[minibatch.no]
    }
  } else if (type=="init"){
    minibatch <- algo.var$memory
    minibatch.no <- NULL
    batch.size <- length(minibatch)
  } else {
    stop("type in Train.On.Memory.QLearningPersExpPath not supported")
  }

  if(!algo.par$use.rnn){ #Default case (NN, XGBoost, ... etc)
    x_train <- t(sapply(minibatch, FUN=function(x){
      return(x$state)
    }))

    x_next_state <- t(sapply(minibatch, FUN=function(x){
      return(x$next.state)
    }))

    action <- sapply(minibatch, FUN=function(x){
      return(x$action)
    })
  } else {
    if(model.par$single.dimensional){
      input.nodes <- game.object$game.par(game.object)$input.nodes.time
      output.nodes <- game.object$game.par(game.object)$output.nodes
    } else {
      stop("With RNN only single.dimensional is supported") #Is not a problem here but later on
      input.nodes <- game.object$game.par(game.object)$input.nodes.time
    }
    dim.sizes <- c(batch.size,dim(minibatch[[1]]$state)[2],input.nodes)
    x_train <- array(NA, dim=dim.sizes)
    x.state.extract <- lapply(minibatch, FUN=function(x){
      return(t(x$state))
    })

    action <- sapply(minibatch, FUN=function(x){
       return(x$action)
    })

    for (j in 1:length(x.state.extract)) {
      x_train[j,,] <- x.state.extract[[j]]
    }

    x_next_state <- array(NA, dim=dim.sizes)
    x.next.state.extract <- lapply(minibatch, FUN=function(x){
      return(t(x$next.state))
    })
    for (j in 1:length(x.next.state.extract)) {
      x_next_state[j,,] <- x.next.state.extract[[j]]
    }

    # for (j in 1:length(x.next.state.extract)) {
    #   cur.round <- sum(action.hist[[j]][,1]!=model.par$mask.value)+1
    #   action.long <- rep(0,output.nodes)
    #   action.long[action[j]] <- 1
    #   x_next_state[j,,1:input.nodes.state] <- x.next.state.extract[[j]]
    #   x_next_state[j,,(input.nodes.state+1):input.nodes] <- action.hist[[j]]
    #   if(cur.round <= dim(x_next_state)[2]){
    #     x_next_state[j,cur.round,(input.nodes.state+1):input.nodes] <- action.long
    #   } else {
    #     #do nothing, but should never be reached.
    #   }
    # }
  }

  reward <- sapply(minibatch, FUN=function(x){
    return(x$reward)
  })

  done <- sapply(minibatch, FUN=function(x){
    return(x$done)
  })

  if(algo.par$hybrid.Q || algo.par$MC){
    Q.path <- sapply(minibatch, FUN=function(x){
      return(x$Q.path)
    })
  }

  if(algo.par$curio.beta>0){
    R.phi <- sapply(minibatch,FUN=function(x){
      R.phi.orig <- x$R.phi
      R.phi <- R.phi.orig * algo.par$curio.decay.memory^(algo.var$experience.count-x$experience.count)
      return(R.phi)
    })
  }

  if(!is.null(model.fam)){
    fam.score <- model.par.fam$predict(model.fam,model.par.fam, x_train, action)
    model.fam <- Update.Familiarity.Model.QLearningPersExpPath(game.object, model.fam, model.par.fam, x_train, action)$model #Train new model
  }

  if(type=="replay"){
    replay.no <- algo.par$replay.intensive
  } else {
    if(is.null(mem.replay)){
      replay.no <- 1
    } else {
      replay.no <- mem.replay
    }
  }
  if(algo.par$only.experienced){
    Q.model.old <- model.par$predict(model,model.par, x_train, action)
  } else {
    Q.model.old <- model.par$predict(model,model.par, x_train)
  }


  for(i in 1:replay.no){
    if(replay.no>1) print(paste0(i,". round of replay(",replay.no,")"))

    Q.model.next <- apply(model.par$predict(model,model.par, x_next_state), 1, FUN=max)
    #Go through all given that action was taken
    if(algo.par$only.experienced){
      y.pred <- (1-algo.par$a)*Q.model.old + algo.par$a * (reward + algo.par$gamma*Q.model.next) #standard Q-learning update
    } else {
      y.pred <- t(sapply(1:length(action),FUN=function(x){
        #restore.point("y_train.intern")
        res <- Q.model.old[x,]
        res[action[x]] <- (1-algo.par$a)*Q.model.old[x,action[x]] + algo.par$a * (reward[x] + algo.par$gamma*Q.model.next[x]) #standard Q-learning update for the taken action; the other actions keep their predicted values
        return(res)
      }))
    }
    if(algo.par$hybrid.Q || algo.par$MC){
      if(algo.par$only.experienced){
        Q.path.train <- (1-algo.par$hybrid.Q.a.MC)*Q.model.old + algo.par$hybrid.Q.a.MC*(Q.path)
      } else {
        Q.path.train <- t(sapply(1:length(action),FUN=function(x){
          #restore.point("y_train.intern")
          res <- Q.model.old[x,]
          res[action[x]] <- (1-algo.par$hybrid.Q.a.MC)*Q.model.old[x,action[x]] + algo.par$hybrid.Q.a.MC*(Q.path[x])
          return(res)
        }))
      }
      if(algo.par$hybrid.Q){
        y_train <- Weigh.History(y.pred,Q.path.train, minibatch.no, algo.par, mem.length=length(algo.var$memory), type, MC.factor=algo.var$MC.factor)
      } else if (algo.par$MC){
        y_train <- Q.path.train
      } else { # should never be reached
        stop("Something is wrong with the parameters hybrid.Q or MC")
      }
    } else {
        y_train <- y.pred
    }

    if(algo.par$curio.beta>0){
      y_train <- y_train+R.phi
    }

    if(algo.par$block.expl.surp!=0 || (algo.par$block.expl.multi!=0 && (algo.par$expl.path.multi.start.frac.surp!=0||algo.par$expl.path.multi.end.frac.surp!=0))){ #We want surprise scaling
      if(!algo.par$only.experienced){
        stop("Surprise factoring only implemented with the option \'only experienced\'")
      }
      surprise.score <- (Q.model.old-y_train)^2
      model.surp <- Update.Surprise.Model.QLearningPersExpPath(game.object, model.surp, model.par.surp, x_train, action, surprise.score) #Train new model
    }

    #Setup necessary precision
    if(!is.null(model.par$enforce.increasing.precision)&&model.par$enforce.increasing.precision==TRUE){
      prec.repeat <- TRUE
    } else {
      prec.repeat <- FALSE
    }

    if(type=="init"){
      model.par$epochs <- algo.par$mem.init.epochs
    }

    #Main Part -> Training of model
    ## We want stability and e.g. NN might produce NA values
    #good.vals <- (complete.cases(x_train)&complete.cases(y_train))
    #x_train <- x_train[good.vals,]
    #if(algo.par$only.experienced){
    #  y_train <- y_train[good.vals]
    #} else {
    #  y_train <- y_train[good.vals,]
    #}
    #action <- action[good.vals]

    #restore.point("before model.train")
    if(algo.par$only.experienced){
      model.train <- model.par$train(model, model.par, x_train, y_train, action)
    } else {
      model.train <- model.par$train(model, model.par, x_train, y_train)
    }

    model <- model.train$model
    fit.obj <- model.train$fit.obj
    #restore.point("before.pre.training")
    #If model is not trained enough, repeat Training until ready
    if(is.null(model.par$single.train) || !model.par$single.train){
      if(type=="replay"){
        if(prec.repeat && is.null(algo.var$cur.loss)){
          algo.var$cur.loss <- mean(fit.obj$metrics$loss)
        } else if (prec.repeat) {
          counter <- 0
          while(mean(fit.obj$metrics$loss)>algo.var$cur.loss){
            counter <- counter+1
            writeLines(paste0("Loss was still ",round(mean(fit.obj$metrics$loss),5), " but ",round(algo.var$cur.loss,5), " needed","\n",collapse=""))
            if(algo.par$only.experienced){
              model.train <- model.par$train(model, model.par, x_train, y_train, action)
            } else {
              model.train <- model.par$train(model, model.par, x_train, y_train)
            }
            model <- model.train$model
            fit.obj <- model.train$fit.obj
            if(counter>model.par$give.up.precision){
              break
            }
          }
        }
        if(prec.repeat){
          algo.var$cur.loss <- mean(fit.obj$metrics$loss)
        }
      } else {
        stop.cond <- FALSE
        while(!stop.cond){
          cur.loss <- mean(fit.obj$metrics$loss)
          writeLines(paste0("Loss is ",round(mean(fit.obj$metrics$loss),5), ", so we continue","\n",collapse=""))
          if(algo.par$only.experienced){
            model.train <- model.par$train(model, model.par, x_train, y_train, action)
          } else {
            model.train <- model.par$train(model, model.par, x_train, y_train)
          }
          model <- model.train$model
          fit.obj <- model.train$fit.obj
          if(mean(fit.obj$metrics$loss)>cur.loss*(1-algo.par$mem.expected.improve)){
            stop.cond <- TRUE
          }
        }
      }
    }
  }
  return(nlist(model, model.fam, model.surp, algo.var))
}

#Q-Learning with Discrete Choices
#' Sets up the models (evaluator) based on model parameters
#'
#' @param game.object Game Object as defined by \code{Get.Game.Object.<NAME>}.
#' @param algo.par Algorithm parameters. If \code{NULL}, the function \code{Get.Def.Par.QLearningPersExpPath()} is called.
#' @param model.par Model parameters of the main model. The \code{name} element has to be specified.
#' @param model.par.surp Model parameters for the surprise model. If \code{NULL}, we assume that we do not want to use a surprise model.
#' @param model.par.fam Model parameters for the model which tries to gauge familiarity with the specific state/action combination. If \code{NULL}, we assume that no familiarity model is needed.
#' @export
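#'
#' @examples
#' \dontrun{
#' # Illustrative setup sketch; 'game.object' and 'model.par' are assumed to be
#' # defined elsewhere (the game object via Get.Game.Object.<Name>, the model
#' # parameters via the respective model default function).
#' algo.par <- Get.Def.Par.QLearningPersExpPath()
#' evaluator <- Setup.QLearningPersExpPath(game.object, algo.par = algo.par,
#'   model.par = model.par)
#' }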
Setup.QLearningPersExpPath <- function(game.object, algo.par=NULL, model.par, model.par.surp=NULL, model.par.fam=NULL){
  restore.point("Setup.QLearningPersExpPath")
  if(is.null(algo.par)){
    algo.par <- Get.Def.Par.QLearningPersExpPath()
  }

  game.par <- game.object$game.par(game.object)

  if(is.null(model.par$name)){
    stop("name parameter of model.par missing!")
  }

  model.best <- model.par$setup(model.par, game.par)
  model.cur <- model.par$setup(model.par, game.par)

  if(algo.par$block.expl.fam>0||(algo.par$block.expl.multi>0 && (algo.par$expl.path.multi.start.frac.fam!=0||algo.par$expl.path.multi.end.frac.fam!=0))){ #Setup only relevant if used at all
    if(is.null(model.par.fam$name)){
      stop("name parameter of model.par.fam missing!")
    }
    model.fam <- model.par.fam$setup(model.par.fam, game.par)
  } else {
    model.fam <- NULL
  }

  if(algo.par$block.expl.surp>0||(algo.par$block.expl.multi>0 && (algo.par$expl.path.multi.start.frac.surp!=0||algo.par$expl.path.multi.end.frac.surp!=0))){ #Setup only relevant if used at all
    if(is.null(model.par.surp$name)){
      stop("name parameter of model.par.surp missing!")
    }
    model.surp <- model.par.surp$setup(model.par.surp, game.par)
  } else {
    model.surp <- NULL
  }

  evaluator <- nlist(model.best, model.cur, model.surp, model.fam)

  return(evaluator)
}

#' Set changeable model variables
#'
#' Returns a list with the following items \itemize{
#' \item \strong{epsilon} [only in case of epsilon.greedy] Specifies how often the algorithm tries a random move. Initialised with \code{epsilon.start} of \code{algo.par}.
#' \item \strong{memory} Memory of the algorithm given past games. May be initialised to give a starting point for learning.
#' }
#'
#' @param game.object A Game Object as defined by \code{Get.Game.Object.<Name>}. Necessary in the case of memory initialisation.
#' @param algo.par Algorithm parameters of the QLearning specification. Have to be specified and should be identical to the algo.par given to \code{Setup.QLearningPersExpPath()}.
#' @param memory.init Which type of initialization should take place? If \code{NULL}, the option \code{none} is used. The following types are supported \itemize{
#' \item \strong{none} No initialization takes place. Memory is an empty list.
#' \item \strong{self.play} The other strategies play against themselves - to understand possible secret handshakes. The following \code{memory.param} are expected: \itemize{
#' \item \strong{no} How often should the other strategies play against themselves?
#' }
#' }
#' If combinations of different memories are needed, one can use the function \code{Extend.Memory.QLearningPersExpPath()}
#' @param memory.param Parameters necessary for the chosen \code{memory.init}.
#' @param model.par Parameters of model i.e. Neural Network. Currently only used for RNN.
#'@export
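#'
#' @examples
#' \dontrun{
#' # Sketch of a memory initialisation through self play, assuming 'game.object',
#' # 'algo.par' and 'model.par' exist; memory.param$no controls how often the
#' # other strategies play against themselves.
#' algo.var <- Initialise.QLearningPersExpPath(game.object, algo.par = algo.par,
#'   memory.init = "self.play", memory.param = list(no = 10), model.par = model.par)
#' }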
Initialise.QLearningPersExpPath <- function(game.object=NULL, algo.par, memory.init=NULL, memory.param = NULL, model.par=NULL){
  restore.point("Initialise.QLearningPersExpPath")
  if(is.null(memory.init)){
    memory.init <- "none"
  }

  algo.var <- list()
  algo.var$epsilon <- algo.par$epsilon.start
  algo.var$path.goal.var <- algo.par$expl.path.var.start
  algo.var$path.goal.shock <- algo.par$expl.path.shock.start
  algo.var$path.goal.surp <- algo.par$expl.path.surp.start
  algo.var$path.goal.fam <- algo.par$expl.path.fam.start
  algo.var$expl.path.var <- algo.par$expl.path.var.start.var
  algo.var$expl.path.shock <- algo.par$expl.path.shock.start.shock
  algo.var$expl.path.surp <- algo.par$expl.path.surp.start.surp
  algo.var$expl.path.fam <- algo.par$expl.path.fam.start.fam
  algo.var$expl.path.vs.var <- algo.par$expl.path.vs.start.var
  algo.var$expl.path.vs.shock <- algo.par$expl.path.vs.start.shock
  algo.var$expl.path.multi.var <- algo.par$expl.path.multi.start.var
  algo.var$expl.path.multi.shock <- algo.par$expl.path.multi.start.shock
  algo.var$expl.path.multi.surp <- algo.par$expl.path.multi.start.surp
  algo.var$expl.path.multi.fam <- algo.par$expl.path.multi.start.fam
  algo.var$MC.factor <- 1
  algo.var$experience.count <- 1
  algo.var$memory <- list() #items: state, action, next.state, reward

  #Curiosity
  if(algo.par$curio.beta>0){
    algo.var$phi.table <- Start.Phi.Table(game.object$feature.types(game.object))
  } else {
    algo.var$phi.table <- NULL
  }


  if (memory.init != "none") {
    algo.var <- Extend.Memory.QLearningPersExpPath(algo.var, algo.par=algo.par, game.object=game.object, memory.type=memory.init, memory.param=memory.param, model.par=model.par)
  }

  return(algo.var)
}

#' Extend Memory by specified experiences
#'
#' Returns modified algo.var, where memory has been extended as specified.
#'
#' @param algo.var A variable algorithm object, where to be modified variables are saved. Given by \code{Initialise.QLearningPersExpPath()}
#' @param game.object A game object as defined by \code{Get.Game.Object.<Name>}.
#' @param memory.type Which type of extension should take place? The following types are supported \itemize{
#' \item \strong{self.play} The other strategies play against themselves - to understand possible secret handshakes. If I am myself part of the other strategies, the "self" strategy is ignored. The following \code{memory.param} are expected: \itemize{
#' \item \strong{no} How often should the other strategies play against themselves?
#' }
#' \item \strong{solid.foundation} Not only self.play, but also a random initialisation with increasing defect probabilities. The following \code{memory.param} are expected: \itemize{
#' \item \strong{self.no} How often should the other strategies play against themselves?
#' \item \strong{rep.no} How often should a random strategy be played? The defection probability is linearly increased.
#' }
#' }
#' If combinations of different memories are needed, one can use the function multiple times.
#' @param memory.param Parameters necessary for the chosen \code{memory.type}.
#' @param model.par Parameters of the model (i.e. Neural Network). Currently only used for the RNN's mask value.
#'@export
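#'
#' @examples
#' \dontrun{
#' # Sketch of extending an existing memory with a 'solid.foundation' block,
#' # assuming 'algo.var', 'algo.par', 'game.object' and 'model.par' exist.
#' algo.var <- Extend.Memory.QLearningPersExpPath(algo.var, algo.par = algo.par,
#'   game.object = game.object, memory.type = "solid.foundation",
#'   memory.param = list(self.no = 5, rep.no = 20), model.par = model.par)
#' }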
Extend.Memory.QLearningPersExpPath <- function(algo.var, algo.par=NULL, game.object, memory.type, memory.param=NULL, model.par=NULL){
  restore.point("Extend.Memory.QLearningPersExpPath")

  if(memory.type == "self.play"){
    new.mem <- unlist(lapply(1:memory.param$no,FUN=function(x){
      if(!is.null(game.object$supports) && any(game.object$supports == "memory.self.play")){
        return(game.object$memory.self.play(game.object, algo.par))
      }
    }), recursive=FALSE)
  } else if (memory.type== "solid.foundation"){
    self.mem <- unlist(lapply(1:memory.param$self.no,FUN=function(x){
      if(!is.null(game.object$supports) && any(game.object$supports == "memory.self.play")){
        return(game.object$memory.self.play(game.object, algo.par))
      }
    }), recursive=FALSE)
    def.arr <- seq(0,1,length.out = memory.param$rep.no)
    rand.mem <- unlist(lapply(def.arr,FUN=function(x){
      if(!is.null(game.object$supports) && any(game.object$supports == "memory.random.play")){
        algo.par$def.prob <- x
        return(game.object$memory.random.play(game.object, algo.par))
      }
    }), recursive=FALSE)
    new.mem <- c(self.mem, rand.mem)
  } else {
    stop(paste0("memory.type ",memory.type," not supported."))
  }
  if(algo.par$use.rnn){
    new.mem <- lapply(new.mem,FUN=function(x){
      if(any(is.na(x$state))) x$state[is.na(x$state)] <- model.par$mask.value
      if(any(is.na(x$next.state))) x$next.state[is.na(x$next.state)] <- model.par$mask.value
      return(x)
    })
  }
  algo.var <- Update.Memory.QLearningPersExpPath(algo.var, new.mem, game.object, algo.par)

  return(algo.var)
}

#' Add historic Q-Values and Curiosity to memory
#'
#' @export
Update.Memory.QLearningPersExpPath <- function(algo.var, new.mem, game.object, algo.par, best.effort=NULL){
  restore.point("Update.Memory.QLearning")

  if(is.null(new.mem)||length(new.mem)==0){
    return(algo.var)
  }

  #Relevant Portion for Hybrid Q and MC Q
  if(algo.par$hybrid.Q || algo.par$MC){
    #Identify start/end points
    start.candidates <- sapply(new.mem,FUN=function(x){x$start})
    if(is.null(unlist(start.candidates))){
      starts <- 1
    } else {
      starts <-  which(start.candidates)
    }
    ends <- which(sapply(new.mem,FUN=function(x){x$done}))
    no.paths <- length(starts)

    for(x in 1:no.paths){
      #restore.point("inside.update.memory.qlearningpersExpPath")
      if (algo.par$hybrid.Q.apply=="sensible" && !is.null(best.effort)){
        if(all(best.effort)){
          Q.path.on <- rep(TRUE,ends[x]-starts[x]+1)
        } else {
          deviation.point <- max(which(best.effort == FALSE))
          Q.path.on <- rep(TRUE,ends[x]-starts[x]+1)
          Q.path.on[1:deviation.point] <- FALSE
        }
      } else if (algo.par$hybrid.Q.apply=="always"){
        Q.path.on <- rep(TRUE,ends[x]-starts[x]+1)
      } else if (is.null(best.effort)){
        Q.path.on <- rep(FALSE,ends[x]-starts[x]+1)
      }
      for(i in ends[x]:starts[x]){
        if(i == ends[x]){
          if(Q.path.on[i-starts[x]+1]){
            new.mem[[i]]$Q.path <- new.mem[[i]]$reward
          } else {
            new.mem[[i]]$Q.path <- NA
          }
        } else {
          if(Q.path.on[i-starts[x]+1]){
            new.mem[[i]]$Q.path <- new.mem[[i]]$reward + algo.par$gamma * new.mem[[i+1]]$Q.path
          } else {
            new.mem[[i]]$Q.path <- NA
          }
        }
      }
    }
  }
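  # Worked illustration (comment only) of the Q.path recursion above: for a path
  # with rewards (1, 1, 3) and gamma = 0.95 the backward pass yields Q.path = 3
  # for the final entry, 1 + 0.95*3 = 3.85 for the second and
  # 1 + 0.95*3.85 = 4.6575 for the first, i.e. the discounted Monte-Carlo return
  # from each state onwards.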

  #Add Experience count to gauge age of experience
  if(!is.null(algo.var$experience.count)){
    new.mem <- lapply(new.mem,FUN=function(x){
      x$experience.count <- algo.var$experience.count
      return(x)
    })
  }

  #Add Curiosity
  if(algo.par$curio.beta>0){
    for(i in 1:length(new.mem)){
      rho <- Calc.Rho(phi.table=algo.var$phi.table,phi=new.mem[[i]]$next.state)
      if(algo.par$N.type=="naive"){
        t <- sum(algo.var$phi.table[[1]]$count)
        algo.var$phi.table <- Update.Phi.Table(phi.table=algo.var$phi.table,phi=new.mem[[i]]$next.state,phi.cont.digits=algo.par$phi.cont.digits)
        R.phi <- t*rho
      } else {
        restore.point("r.Phi")
        algo.var$phi.table <- Update.Phi.Table(phi.table=algo.var$phi.table,phi=new.mem[[i]]$next.state,phi.cont.digits=algo.par$phi.cont.digits)
        rho.next <- Calc.Rho(phi.table=algo.var$phi.table,phi=new.mem[[i]]$next.state)
        N.phi <- rho*(1-rho.next)/(rho.next-rho)
        R.phi <- algo.par$curio.beta/sqrt(N.phi)
      }
      new.mem[[i]]$R.phi <- min(R.phi,algo.par$curio.cap)
    }
  }

  algo.var$memory <- c(algo.var$memory,new.mem)

  return(algo.var)
}

#' Replay: train the current model of Q-learning on memory
#'
#' @param model.par.surp If \code{NULL}, no surprise model is generated/updated.
#' @param model.par.fam If \code{NULL}, no familiarity model is generated/updated.
#' @export
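#'
#' @examples
#' \dontrun{
#' # Sketch of a single replay step, assuming 'evaluator', 'model.par',
#' # 'algo.par', 'algo.var' and 'game.object' come from the Setup/Initialise
#' # functions; the updated evaluator and algo.var are returned as a list.
#' res <- Replay.QLearningPersExpPath(evaluator, model.par = model.par,
#'   algo.par = algo.par, algo.var = algo.var, game.object = game.object)
#' evaluator <- res$evaluator
#' algo.var <- res$algo.var
#' }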
Replay.QLearningPersExpPath <- function(evaluator, model.par, model.par.surp=NULL,model.par.fam=NULL, algo.par, algo.var, game.object){
  restore.point("Replay.QLearningPersExpPath")

  algo.var$experience.count <- algo.var$experience.count + 1 #With each Replay our experience increases
  train.res <- Train.On.Memory.QLearningPersExpPath(model=evaluator$model.cur, model.surp=evaluator$model.surp, model.fam=evaluator$model.fam, model.par=model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.var=algo.var, algo.par=algo.par, game.object=game.object, type="replay")
  evaluator <- Update.Evaluator.QLearningPersExpPath(evaluator, new.model=train.res$model, new.model.surp = train.res$model.surp, new.model.fam=train.res$model.fam, which.update="cur", model.par)
  algo.var <- train.res$algo.var

  return(nlist(evaluator, algo.var))
}

#' Change the changeable algorithm variables between training blocks
#'
#' Adjusts epsilon (in case of the epsilon.greedy action policy) or the exploration path control variables (in case of the exploration.path action policy) based on the scores of the previous blocks.
#'
#' @export
Change.Algo.Var.QLearningPersExpPath <- function(algo.par, algo.var, cur.block, blocks){
  restore.point("Change.Algo.Var.QLearningPersExpPath")

  if(algo.par$action.policy=="epsilon.greedy" && algo.var$epsilon > algo.par$epsilon.min && cur.block!=blocks){
    if(algo.par$epsilon.decay.type=="linear"){
      algo.var$epsilon <- seq(algo.par$epsilon.start,algo.par$epsilon.min,length.out=blocks)[cur.block+1]
    } else if(algo.par$epsilon.decay.type=="exponential"){
      decay.par <- (algo.par$epsilon.min/algo.par$epsilon.start)^(algo.par$replay.intensive/blocks)
      algo.var$epsilon <- decay.par*algo.var$epsilon
    } else {
      algo.var$epsilon <- algo.par$epsilon.decay*algo.var$epsilon
    }
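    # Illustration (comment only): with epsilon.start = 1, epsilon.min = 0.1 and
    # blocks = 10, the linear schedule walks through 1, 0.9, ..., 0.1, while the
    # exponential schedule multiplies epsilon by a constant decay factor on every call.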
  } else if (algo.par$action.policy=="exploration.path") {

    exploit.data.length <- max(length(algo.var$analysis$score.best.block),length(algo.var$analysis$score.cur.block), na.rm=TRUE)

    #Var data points
    if(!is.null(algo.par$block.expl.var)&&algo.par$block.expl.var!=0){
      if(exploit.data.length<algo.par$expl.path.var.data.base){
        rel.best.effort.var <-  mean(c(algo.var$analysis$score.best.block,algo.var$analysis$score.cur.block), na.rm=TRUE)
        rel.expl.var <- mean(c(algo.var$analysis$score.expl.block.var))
      } else {
        if(!is.null(algo.par$block.best)&&algo.par$block.best!=0){
          mean.best <- algo.var$analysis$score.best.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.var.data.base+1)]
        } else {
          mean.best <- NA
        }
        if(!is.null(algo.par$block.cur)&&algo.par$block.cur!=0){
          mean.cur <- algo.var$analysis$score.cur.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.var.data.base+1)]
        } else {
          mean.cur <- NA
        }

        rel.best.effort.var <-  mean(c(mean.best,mean.cur), na.rm=TRUE)
        rel.expl.var <- mean(c(algo.var$analysis$score.expl.block.var[length(algo.var$analysis$score.expl.block.var):(length(algo.var$analysis$score.expl.block.var)-algo.par$expl.path.var.data.base+1)]))
      }
    }

    #Shock data points
    if(!is.null(algo.par$block.expl.shock)&&algo.par$block.expl.shock!=0){
      if(exploit.data.length<algo.par$expl.path.shock.data.base){
        rel.best.effort.shock <-  mean(c(algo.var$analysis$score.best.block,algo.var$analysis$score.cur.block), na.rm=TRUE)
        rel.expl.shock <- mean(c(algo.var$analysis$score.expl.block.shock))
      } else {
        if(!is.null(algo.par$block.best)&&algo.par$block.best!=0){
          mean.best <- algo.var$analysis$score.best.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.shock.data.base+1)]
        } else {
          mean.best <- NA
        }
        if(!is.null(algo.par$block.cur)&&algo.par$block.cur!=0){
          mean.cur <- algo.var$analysis$score.cur.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.shock.data.base+1)]
        } else {
          mean.cur <- NA
        }

        rel.best.effort.shock <-  mean(c(mean.best,mean.cur), na.rm=TRUE)
        rel.expl.shock <- mean(c(algo.var$analysis$score.expl.block.shock[length(algo.var$analysis$score.expl.block.shock):(length(algo.var$analysis$score.expl.block.shock)-algo.par$expl.path.shock.data.base+1)]))
      }
    }

    #Surp data points
    if(!is.null(algo.par$block.expl.surp)&&algo.par$block.expl.surp!=0){
      if(exploit.data.length<algo.par$expl.path.surp.data.base){
        rel.best.effort.surp <-  mean(c(algo.var$analysis$score.best.block,algo.var$analysis$score.cur.block), na.rm=TRUE)
        rel.expl.surp <- mean(c(algo.var$analysis$score.expl.block.surp))
      } else {
        if(!is.null(algo.par$block.best)&&algo.par$block.best!=0){
          mean.best <- algo.var$analysis$score.best.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.surp.data.base+1)]
        } else {
          mean.best <- NA
        }
        if(!is.null(algo.par$block.cur)&&algo.par$block.cur!=0){
          mean.cur <- algo.var$analysis$score.cur.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.surp.data.base+1)]
        } else {
          mean.cur <- NA
        }

        rel.best.effort.surp <-  mean(c(mean.best,mean.cur), na.rm=TRUE)
        rel.expl.surp <- mean(c(algo.var$analysis$score.expl.block.surp[length(algo.var$analysis$score.expl.block.surp):(length(algo.var$analysis$score.expl.block.surp)-algo.par$expl.path.surp.data.base+1)]))
      }
    }

    #Familiarity data points
    if(!is.null(algo.par$block.expl.fam)&&algo.par$block.expl.fam!=0){
      if(exploit.data.length<algo.par$expl.path.fam.data.base){
        rel.best.effort.fam <-  mean(c(algo.var$analysis$score.best.block,algo.var$analysis$score.cur.block), na.rm=TRUE)
        rel.expl.fam <- mean(c(algo.var$analysis$score.expl.block.fam))
      } else {
        if(!is.null(algo.par$block.best)&&algo.par$block.best!=0){
          mean.best <- algo.var$analysis$score.best.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.fam.data.base+1)]
        } else {
          mean.best <- NA
        }
        if(!is.null(algo.par$block.cur)&&algo.par$block.cur!=0){
          mean.cur <- algo.var$analysis$score.cur.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.fam.data.base+1)]
        } else {
          mean.cur <- NA
        }

        rel.best.effort.fam <-  mean(c(mean.best,mean.cur), na.rm=TRUE)
        rel.expl.fam <- mean(c(algo.var$analysis$score.expl.block.fam[length(algo.var$analysis$score.expl.block.fam):(length(algo.var$analysis$score.expl.block.fam)-algo.par$expl.path.fam.data.base+1)]))
      }
    }

    #VarShock Combi data points
    if(!is.null(algo.par$block.expl.vs)&&algo.par$block.expl.vs!=0){
      #restore.point("varshock data")
      if(exploit.data.length<algo.par$expl.path.vs.data.base){
        rel.best.effort.vs <-  mean(c(algo.var$analysis$score.best.block,algo.var$analysis$score.cur.block), na.rm=TRUE)
        rel.expl.vs <- mean(c(algo.var$analysis$score.expl.block.vs))
        rel.sub.vs.var <- mean(algo.var$analysis$sub.block.vs.var)
        rel.sub.vs.shock <- mean(algo.var$analysis$sub.block.vs.shock)
      } else {
        if(!is.null(algo.par$block.best)&&algo.par$block.best!=0){
          mean.best <- algo.var$analysis$score.best.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.vs.data.base+1)]
        } else {
          mean.best <- NA
        }
        if(!is.null(algo.par$block.cur)&&algo.par$block.cur!=0){
          mean.cur <- algo.var$analysis$score.cur.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.vs.data.base+1)]
        } else {
          mean.cur <- NA
        }

        rel.best.effort.vs <-  mean(c(mean.best,mean.cur), na.rm=TRUE)
        rel.expl.vs <- mean(c(algo.var$analysis$score.expl.block.vs[length(algo.var$analysis$score.expl.block.vs):(length(algo.var$analysis$score.expl.block.vs)-algo.par$expl.path.vs.data.base+1)]))
        rel.sub.vs.var <- mean(algo.var$analysis$sub.block.vs.var[length(algo.var$analysis$score.expl.block.vs):(length(algo.var$analysis$score.expl.block.vs)-algo.par$expl.path.vs.data.base+1)])
        rel.sub.vs.shock <- mean(algo.var$analysis$sub.block.vs.shock[length(algo.var$analysis$score.expl.block.vs):(length(algo.var$analysis$score.expl.block.vs)-algo.par$expl.path.vs.data.base+1)])
      }
    }

    #Multi data points
    if(!is.null(algo.par$block.expl.multi)&&algo.par$block.expl.multi!=0){
      restore.point("multi data")

      #Aggregate cur and best
      aggr.vec <- sapply(1:exploit.data.length,FUN=function(x){
        mean(c(algo.var$analysis$score.best.block[x],algo.var$analysis$score.cur.block[x]), na.rm=TRUE)
      })

      #Smoothed vector of cur and best over db discounted data points
      aggr.vec.smoothed <- sapply(1:exploit.data.length, FUN=function(x){
        Weighted.Discount(aggr.vec[max(1,x-algo.par$expl.path.multi.best.db):x],discount=algo.par$expl.path.multi.best.disc)
      })

      #
      #
      # if(exploit.data.length<algo.par$expl.path.multi.data.base){
      #   rel.best.effort.multi <-  mean(c(algo.var$analysis$score.best.block,algo.var$analysis$score.cur.block), na.rm=TRUE)
      #   rel.expl.multi <- mean(c(algo.var$analysis$score.expl.block.multi))
      #   rel.sub.multi.var <- mean(algo.var$analysis$sub.block.multi.var)
      #   rel.sub.multi.shock <- mean(algo.var$analysis$sub.block.multi.shock)
      #   rel.sub.multi.surp <- mean(algo.var$analysis$sub.block.multi.surp)
      #   rel.sub.multi.fam <- mean(algo.var$analysis$sub.block.multi.fam)
      # } else {
      #   if(!is.null(algo.par$block.best)&&algo.par$block.best!=0){
      #     mean.best <- algo.var$analysis$score.best.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.multi.data.base+1)]
      #   } else {
      #     mean.best <- NA
      #   }
      #   if(!is.null(algo.par$block.cur)&&algo.par$block.cur!=0){
      #     mean.cur <- algo.var$analysis$score.cur.block[exploit.data.length:(exploit.data.length-algo.par$expl.path.multi.data.base+1)]
      #   } else {
      #     mean.cur <- NA
      #   }
      #
      #   rel.best.effort.multi <-  mean(c(mean.best,mean.cur), na.rm=TRUE)
      #   rel.expl.multi <- mean(c(algo.var$analysis$score.expl.block.multi[length(algo.var$analysis$score.expl.block.multi):(length(algo.var$analysis$score.expl.block.multi)-algo.par$expl.path.multi.data.base+1)]))
      #   rel.sub.multi.var <- mean(algo.var$analysis$sub.block.multi.var[length(algo.var$analysis$score.expl.block.multi):(length(algo.var$analysis$score.expl.block.multi)-algo.par$expl.path.multi.data.base+1)])
      #   rel.sub.multi.shock <- mean(algo.var$analysis$sub.block.multi.shock[length(algo.var$analysis$score.expl.block.multi):(length(algo.var$analysis$score.expl.block.multi)-algo.par$expl.path.multi.data.base+1)])
      #   rel.sub.multi.surp <- mean(algo.var$analysis$sub.block.multi.surp[length(algo.var$analysis$score.expl.block.multi):(length(algo.var$analysis$score.expl.block.multi)-algo.par$expl.path.multi.data.base+1)])
      #   rel.sub.multi.fam <- mean(algo.var$analysis$sub.block.multi.fam[length(algo.var$analysis$score.expl.block.multi):(length(algo.var$analysis$score.expl.block.multi)-algo.par$expl.path.multi.data.base+1)])
      # }
    }

    #Var Path
    if(!is.null(algo.par$block.expl.var)&&algo.par$block.expl.var!=0){
      if(algo.par$expl.path.var.decay.type=="linear"){
        algo.var$path.goal.var <- seq(algo.par$expl.path.var.start,algo.par$expl.path.var.end,length.out=blocks)[cur.block+1]
      } else if(algo.par$expl.path.var.decay.type=="exponential"){
        decay.par <- (algo.par$expl.path.var.end/algo.par$expl.path.var.start)^(algo.par$replay.intensive/blocks)
        algo.var$path.goal.var <- decay.par*algo.var$path.goal.var
      }
      if(rel.best.effort.var<0){
        path.goal.var <- rel.best.effort.var/algo.var$path.goal.var
      } else {
        path.goal.var <- rel.best.effort.var*algo.var$path.goal.var
      }
    }

    #Shock Path
    if(!is.null(algo.par$block.expl.shock)&&algo.par$block.expl.shock!=0){
      if(algo.par$expl.path.shock.decay.type=="linear"){
        algo.var$path.goal.shock <- seq(algo.par$expl.path.shock.start,algo.par$expl.path.shock.end,length.out=blocks)[cur.block+1]
      } else if(algo.par$expl.path.shock.decay.type=="exponential"){
        decay.par <- (algo.par$expl.path.shock.end/algo.par$expl.path.shock.start)^(algo.par$replay.intensive/blocks)
        algo.var$path.goal.shock <- decay.par*algo.var$path.goal.shock
      }
      if(rel.best.effort.shock<0){
        path.goal.shock <- rel.best.effort.shock/algo.var$path.goal.shock
      } else {
        path.goal.shock <- rel.best.effort.shock*algo.var$path.goal.shock
      }
    }

    #Surp Path
    if(!is.null(algo.par$block.expl.surp)&&algo.par$block.expl.surp!=0){
      if(algo.par$expl.path.surp.decay.type=="linear"){
        algo.var$path.goal.surp <- seq(algo.par$expl.path.surp.start,algo.par$expl.path.surp.end,length.out=blocks)[cur.block+1]
      } else if(algo.par$expl.path.surp.decay.type=="exponential"){
        decay.par <- (algo.par$expl.path.surp.end/algo.par$expl.path.surp.start)^(algo.par$replay.intensive/blocks)
        algo.var$path.goal.surp <- decay.par*algo.var$path.goal.surp
      }
      if(rel.best.effort.surp<0){
        path.goal.surp <- rel.best.effort.surp/algo.var$path.goal.surp
      } else {
        path.goal.surp <- rel.best.effort.surp*algo.var$path.goal.surp
      }
    }

    #fam Path
    if(!is.null(algo.par$block.expl.fam)&&algo.par$block.expl.fam!=0){
      if(algo.par$expl.path.fam.decay.type=="linear"){
        algo.var$path.goal.fam <- seq(algo.par$expl.path.fam.start,algo.par$expl.path.fam.end,length.out=blocks)[cur.block+1]
      } else if(algo.par$expl.path.fam.decay.type=="exponential"){
        decay.par <- (algo.par$expl.path.fam.end/algo.par$expl.path.fam.start)^(algo.par$replay.intensive/blocks)
        algo.var$path.goal.fam <- decay.par*algo.var$path.goal.fam
      }
      if(rel.best.effort.fam<0){
        path.goal.fam <- rel.best.effort.fam/algo.var$path.goal.fam
      } else {
        path.goal.fam <- rel.best.effort.fam*algo.var$path.goal.fam
      }
    }

    #VS Path
    if(!is.null(algo.par$block.expl.vs)&&algo.par$block.expl.vs!=0){
      if(algo.par$expl.path.vs.decay.type=="linear"){
        algo.var$path.goal.vs <- seq(algo.par$expl.path.vs.start,algo.par$expl.path.vs.end,length.out=blocks)[cur.block+1]
      } else if(algo.par$expl.path.vs.decay.type=="exponential"){
        decay.par <- (algo.par$expl.path.vs.end/algo.par$expl.path.vs.start)^(algo.par$replay.intensive/blocks)
        algo.var$path.goal.vs <- decay.par*algo.var$path.goal.vs
      }
      if(rel.best.effort.vs<0){
        path.goal.vs <- rel.best.effort.vs/algo.var$path.goal.vs
      } else {
        path.goal.vs <- rel.best.effort.vs*algo.var$path.goal.vs
      }
    }

    #Multi Path
    if(!is.null(algo.par$block.expl.multi)&&algo.par$block.expl.multi!=0){
      restore.point("multi.path")
      if(algo.par$expl.path.multi.decay.type=="linear"){
        algo.var$path.goal.multi <- seq(algo.par$expl.path.multi.start,algo.par$expl.path.multi.end,length.out=blocks)[1:cur.block]
      } else if(algo.par$expl.path.multi.decay.type=="exponential"){
        decay.par <- (algo.par$expl.path.multi.end/algo.par$expl.path.multi.start)^(algo.par$replay.intensive/blocks)
        algo.var$path.goal.multi <- algo.par$expl.path.multi.start*decay.par^(0:(cur.block-1))
      }
      path.goal.multi <- sapply(1:exploit.data.length,FUN=function(x){
        if(aggr.vec.smoothed[x]<0){
          return(aggr.vec.smoothed[x]/algo.var$path.goal.multi[x])
        } else {
          return(aggr.vec.smoothed[x]*algo.var$path.goal.multi[x])
        }
      })
    }

    #Recalibration Var
    if(!is.null(algo.par$block.expl.var)&&algo.par$block.expl.var!=0){
      if(rel.expl.var>rel.best.effort.var){ #We are doing something good here do not change anything
        print(paste0(c(" [Var] Best Effort: ", round(rel.best.effort.var,2), " and path goal is ", round(path.goal.var,2),". We have ", round(rel.expl.var,2), " which is better than best effort, so we do not change anything."),collapse=""))
      } else if(path.goal.var>rel.expl.var){ #We are worse than the Path
          print(paste0(c(" [Var] Best Effort: ", round(rel.best.effort.var,2), " so path goal is ", round(path.goal.var,2),". We have ", round(rel.expl.var,2), " so we decrease variance from ", round(algo.var$expl.path.var,3), " to ", round(algo.var$expl.path.var/algo.par$expl.path.var.momentum,3)),collapse=""))
          algo.var$expl.path.var <- algo.var$expl.path.var/algo.par$expl.path.var.momentum
      } else { #We are better than the path
          print(paste0(c(" [Var] Best Effort: ", round(rel.best.effort.var,2), " so path goal is ", round(path.goal.var,2),". We have ", round(rel.expl.var,2), " so we increase variance from ", round(algo.var$expl.path.var,3), " to ", round(algo.var$expl.path.var*algo.par$expl.path.var.momentum,3)),collapse=""))
          algo.var$expl.path.var <- algo.var$expl.path.var*algo.par$expl.path.var.momentum
      }
    }

    #Recalibration Shock
    if(!is.null(algo.par$block.expl.shock)&&algo.par$block.expl.shock!=0){
      if(rel.expl.shock>rel.best.effort.shock){ #We are doing something good here do not change anything
        print(paste0(c(" [Shock] Best Effort: ", round(rel.best.effort.shock,2), " and path goal is ", round(path.goal.shock,2),". We have ", round(rel.expl.shock,2), " which is better than best effort, so we do not change anything."),collapse=""))
      } else if(path.goal.shock>rel.expl.shock){ #We are worse than the Path
        print(paste0(c(" [Shock] Best Effort: ", round(rel.best.effort.shock,2), " so path goal is ", round(path.goal.shock,2),". We have ", round(rel.expl.shock,2), " so we decrease epsilon shock from ", round(algo.var$expl.path.shock,3), " to ", round(algo.var$expl.path.shock/algo.par$expl.path.shock.momentum,3)),collapse=""))
        algo.var$expl.path.shock <- algo.var$expl.path.shock/algo.par$expl.path.shock.momentum
      } else { #We are better than the path
        old.shock <- algo.var$expl.path.shock
        new.shock <- min(algo.var$expl.path.shock*algo.par$expl.path.shock.momentum,1)
        algo.var$expl.path.shock <- new.shock

        if(old.shock>=1){
            print(paste0(c(" [Shock] Best Effort: ", round(rel.best.effort.shock,2), " so path goal is ", round(path.goal.shock,2),". We have ", round(rel.expl.shock,2), " but increase in shock is not possible as already 1."),collapse=""))
        } else {
            print(paste0(c(" [Shock] Best Effort: ", round(rel.best.effort.shock,2), " so path goal is ", round(path.goal.shock,2),". We have ", round(rel.expl.shock,2), " so we increase epsilon shock from ", round(old.shock,3), " to ", round(new.shock,3)),collapse=""))
        }
      }
    }

    #Recalibration Surp
    if(!is.null(algo.par$block.expl.surp)&&algo.par$block.expl.surp!=0){
      if(rel.expl.surp>rel.best.effort.surp){ #We are doing something good here do not change anything
        print(paste0(c(" [Surp] Best Effort: ", round(rel.best.effort.surp,2), " and path goal is ", round(path.goal.surp,2),". We have ", round(rel.expl.surp,2), " which is better than best effort, so we do not change anything."),collapse=""))
      } else if(path.goal.surp>rel.expl.surp){ #We are worse than the Path
        print(paste0(c(" [Surp] Best Effort: ", round(rel.best.effort.surp,2), " so path goal is ", round(path.goal.surp,2),". We have ", round(rel.expl.surp,2), " so we decrease surprise factor from ", round(algo.var$expl.path.surp,3), " to ", round(algo.var$expl.path.surp/algo.par$expl.path.surp.momentum,3)),collapse=""))
        algo.var$expl.path.surp <- algo.var$expl.path.surp/algo.par$expl.path.surp.momentum
      } else { #We are better than the path
        print(paste0(c(" [Surp] Best Effort: ", round(rel.best.effort.surp,2), " so path goal is ", round(path.goal.surp,2),". We have ", round(rel.expl.surp,2), " so we increase surprise factor from ", round(algo.var$expl.path.surp,3), " to ", round(algo.var$expl.path.surp*algo.par$expl.path.surp.momentum,3)),collapse=""))
        algo.var$expl.path.surp <- algo.var$expl.path.surp*algo.par$expl.path.surp.momentum
      }
    }

    #Recalibration fam
    if(!is.null(algo.par$block.expl.fam)&&algo.par$block.expl.fam!=0){
      if(rel.expl.fam>rel.best.effort.fam){ #We are doing something good here do not change anything
        print(paste0(c(" [Fam] Best Effort: ", round(rel.best.effort.fam,2), " and path goal is ", round(path.goal.fam,2),". We have ", round(rel.expl.fam,2), " which is better than best effort, so we do not change anything."),collapse=""))
      } else if(path.goal.fam>rel.expl.fam){ #We are worse than the Path
        print(paste0(c(" [Fam] Best Effort: ", round(rel.best.effort.fam,2), " so path goal is ", round(path.goal.fam,2),". We have ", round(rel.expl.fam,2), " so we decrease fam factor from ", round(algo.var$expl.path.fam,3), " to ", round(algo.var$expl.path.fam/algo.par$expl.path.fam.momentum,3)),collapse=""))
        algo.var$expl.path.fam <- algo.var$expl.path.fam/algo.par$expl.path.fam.momentum
      } else { #We are better than the path
        print(paste0(c(" [Fam] Best Effort: ", round(rel.best.effort.fam,2), " so path goal is ", round(path.goal.fam,2),". We have ", round(rel.expl.fam,2), " so we increase fam factor from ", round(algo.var$expl.path.fam,3), " to ", round(algo.var$expl.path.fam*algo.par$expl.path.fam.momentum,3)),collapse=""))
        algo.var$expl.path.fam <- algo.var$expl.path.fam*algo.par$expl.path.fam.momentum
      }
    }

    #Recalibration VarShockCombi
    if(!is.null(algo.par$block.expl.vs)&&algo.par$block.expl.vs!=0){
      #restore.point("recalibrate.varshock")
      if(rel.expl.vs>rel.best.effort.vs){ #We are doing something good here do not change anything
        print(paste0(c(" [VS] Best Effort: ", round(rel.best.effort.vs,2), " and path goal is ", round(path.goal.vs,2),". We have ", round(rel.expl.vs,2), " which is better than best effort, so we do not change anything."),collapse=""))
      } else if(path.goal.vs>rel.expl.vs){ #We are worse than the Path

        old.vs.var <- algo.var$expl.path.vs.var
        old.vs.shock <- algo.var$expl.path.vs.shock

        target.shock <- max((1-cur.block/blocks)*(algo.par$expl.path.vs.start.shock.frac),algo.par$expl.path.vs.min.shock.frac)
        target.var <- 1-target.shock
        current.var <- rel.sub.vs.var/(rel.sub.vs.var+rel.sub.vs.shock)
        if(is.nan(current.var)) current.var <- 0
        current.shock <- 1-current.var
        diff.total <- abs(target.var-current.var) + abs(target.shock-current.shock)
        if(current.var>target.var){ #Var has to be reduced more severely
          w.var <- min(0.5+algo.par$expl.path.vs.strength*diff.total,1)
          w.shock <- 1-w.var
        } else {
          w.shock <- min(0.5 + algo.par$expl.path.vs.strength*diff.total,1)
          w.var <- 1-w.shock
        }

        new.vs.var <- old.vs.var/((algo.par$expl.path.vs.momentum-1)*w.var+1)
        new.vs.shock <- old.vs.shock/((algo.par$expl.path.vs.momentum-1)*w.shock+1)

        print(paste0(c(" [VS] Best Effort: ", round(rel.best.effort.vs,2), " so path goal is ", round(path.goal.vs,2),". We have ", round(rel.expl.vs,2), " so we decrease vs-epsilon shock from ", round(old.vs.shock,3), " to ", round(new.vs.shock,3)," and vs-var from ", round(old.vs.var,3), " to ", round(new.vs.var,3)," with a weight for var of ", round(w.var,2), " (>0.5 means more reduction of var)"),collapse=""))

        algo.var$expl.path.vs.var <- new.vs.var
        algo.var$expl.path.vs.shock <- new.vs.shock

      } else { #We are better than the path

        old.vs.var <- algo.var$expl.path.vs.var
        old.vs.shock <- algo.var$expl.path.vs.shock

        target.shock <- max((1-cur.block/blocks)*(algo.par$expl.path.vs.start.shock.frac),algo.par$expl.path.vs.min.shock.frac)
        target.var <- 1-target.shock
        current.var <- rel.sub.vs.var/(rel.sub.vs.var+rel.sub.vs.shock)
        if(is.nan(current.var)) current.var <- 0
        current.shock <- 1-current.var
        diff.total <- abs(target.var-current.var) + abs(target.shock-current.shock)
        if(current.var<target.var){ #Var has to be increased more severely
          w.var <- min(0.5+algo.par$expl.path.vs.strength*diff.total,1)
          w.shock <- 1-w.var
        } else {
          w.shock <- min(0.5 + algo.par$expl.path.vs.strength*diff.total,1)
          w.var <- 1-w.shock
        }

        new.vs.var <- old.vs.var*((algo.par$expl.path.vs.momentum-1)*w.var+1)
        new.vs.shock <- min(old.vs.shock*((algo.par$expl.path.vs.momentum-1)*w.shock+1),1)

        print(paste0(c(" [VS] Best Effort: ", round(rel.best.effort.vs,2), " so path goal is ", round(path.goal.vs,2),". We have ", round(rel.expl.vs,2), " so we increase vs-epsilon shock from ", round(old.vs.shock,3), " to ", round(new.vs.shock,3)," and vs-var from ", round(old.vs.var,3), " to ", round(new.vs.var,3)," with a weight for var of ", round(w.var,2), " (>0.5 means more increase of var)"),collapse=""))

        algo.var$expl.path.vs.var <- new.vs.var
        algo.var$expl.path.vs.shock <- new.vs.shock
      }
    }
  }

  #Recalibration Multi
  if(!is.null(algo.par$block.expl.multi)&&algo.par$block.expl.multi!=0){
    restore.point("change.path")

    old.multi.var <- algo.var$expl.path.multi.var
    old.multi.shock <- algo.var$expl.path.multi.shock
    old.multi.surp <- algo.var$expl.path.multi.surp
    old.multi.fam <- algo.var$expl.path.multi.fam

    #Target suboptimality fractions in respective round
    start.fracs <- c(algo.par$expl.path.multi.start.frac.var, algo.par$expl.path.multi.start.frac.shock,algo.par$expl.path.multi.start.frac.surp, algo.par$expl.path.multi.start.frac.fam)
    start.fracs <- start.fracs/sum(start.fracs) #norm for safety
    end.fracs <- c(algo.par$expl.path.multi.end.frac.var, algo.par$expl.path.multi.end.frac.shock,algo.par$expl.path.multi.end.frac.surp, algo.par$expl.path.multi.end.frac.fam)
    end.fracs <- end.fracs/sum(end.fracs)
    target.fracs <- t(sapply(1:cur.block,FUN=function(x){
      start.fracs+x/blocks*(end.fracs-start.fracs)
    }))

    #Observed suboptimality fractions
    total.sub <- algo.var$analysis$sub.block.multi.var + algo.var$analysis$sub.block.multi.shock + algo.var$analysis$sub.block.multi.surp + algo.var$analysis$sub.block.multi.fam
    fracs <- t(sapply(1:cur.block,FUN=function(x){
      cur.frac <- c(algo.var$analysis$sub.block.multi.var[x],algo.var$analysis$sub.block.multi.shock[x],algo.var$analysis$sub.block.multi.surp[x],algo.var$analysis$sub.block.multi.fam[x])/total.sub[x]
      if(total.sub[x]==0) cur.frac <- rep(0,4)
      return(cur.frac)
    }))

    #Transforming fractions to real values
    cur.abs.sub.var <- (aggr.vec.smoothed-algo.var$analysis$score.expl.block.multi)*fracs[,1]
    goal.abs.sub.var <- (aggr.vec.smoothed-path.goal.multi)*target.fracs[,1]
    err.vec.var <- goal.abs.sub.var - cur.abs.sub.var

    cur.abs.sub.shock <- (aggr.vec.smoothed-algo.var$analysis$score.expl.block.multi)*fracs[,2]
    goal.abs.sub.shock <- (aggr.vec.smoothed-path.goal.multi)*target.fracs[,2]
    err.vec.shock <- goal.abs.sub.shock - cur.abs.sub.shock

    cur.abs.sub.surp <- (aggr.vec.smoothed-algo.var$analysis$score.expl.block.multi)*fracs[,3]
    goal.abs.sub.surp <- (aggr.vec.smoothed-path.goal.multi)*target.fracs[,3]
    err.vec.surp <- goal.abs.sub.surp - cur.abs.sub.surp

    cur.abs.sub.fam <- (aggr.vec.smoothed-algo.var$analysis$score.expl.block.multi)*fracs[,4]
    goal.abs.sub.fam <- (aggr.vec.smoothed-path.goal.multi)*target.fracs[,4]
    err.vec.fam <- goal.abs.sub.fam - cur.abs.sub.fam

    pid.val.var <- PID.controller(err.vec=err.vec.var, Kp=algo.par$expl.path.multi.Kp.var, Ki=algo.par$expl.path.multi.Ki.var, Kd=algo.par$expl.path.multi.Kd.var, Kp.db=algo.par$expl.path.multi.Kp.db, Ki.db=algo.par$expl.path.multi.Ki.db, Kd.db=algo.par$expl.path.multi.Kd.db, Kp.disc=algo.par$expl.path.multi.Kp.disc, Kd.disc=algo.par$expl.path.multi.Kd.disc, Ki.disc=algo.par$expl.path.multi.Ki.disc)
    pid.val.shock <- PID.controller(err.vec=err.vec.shock, Kp=algo.par$expl.path.multi.Kp.shock, Ki=algo.par$expl.path.multi.Ki.shock, Kd=algo.par$expl.path.multi.Kd.shock, Kp.db=algo.par$expl.path.multi.Kp.db, Ki.db=algo.par$expl.path.multi.Ki.db, Kd.db=algo.par$expl.path.multi.Kd.db, Kp.disc=algo.par$expl.path.multi.Kp.disc, Kd.disc=algo.par$expl.path.multi.Kd.disc, Ki.disc=algo.par$expl.path.multi.Ki.disc)
    pid.val.surp <- PID.controller(err.vec=err.vec.surp, Kp=algo.par$expl.path.multi.Kp.surp, Ki=algo.par$expl.path.multi.Ki.surp, Kd=algo.par$expl.path.multi.Kd.surp, Kp.db=algo.par$expl.path.multi.Kp.db, Ki.db=algo.par$expl.path.multi.Ki.db, Kd.db=algo.par$expl.path.multi.Kd.db, Kp.disc=algo.par$expl.path.multi.Kp.disc, Kd.disc=algo.par$expl.path.multi.Kd.disc, Ki.disc=algo.par$expl.path.multi.Ki.disc)
    pid.val.fam <- PID.controller(err.vec=err.vec.fam, Kp=algo.par$expl.path.multi.Kp.fam, Ki=algo.par$expl.path.multi.Ki.fam, Kd=algo.par$expl.path.multi.Kd.fam, Kp.db=algo.par$expl.path.multi.Kp.db, Ki.db=algo.par$expl.path.multi.Ki.db, Kd.db=algo.par$expl.path.multi.Kd.db, Kp.disc=algo.par$expl.path.multi.Kp.disc, Kd.disc=algo.par$expl.path.multi.Kd.disc, Ki.disc=algo.par$expl.path.multi.Ki.disc)

    #new.multi.var <- max(old.multi.var*min(max((1+pid.val.var),0.5),1.5),0)
    #new.multi.shock <- min(max(old.multi.shock*min(max((1+pid.val.shock),0.5),1.5),0),1)
    #new.multi.surp <- max(old.multi.surp*min(max((1+pid.val.surp),0.5),1.5),0)
    #new.multi.fam <- max(old.multi.fam*min(max((1+pid.val.fam),0.5),1.5),0)
    new.multi.var <- max(old.multi.var+pid.val.var,0)
    new.multi.shock <- max(old.multi.shock+pid.val.shock,0)
    new.multi.surp <- max(old.multi.surp+pid.val.surp,0)
    new.multi.fam <- max(old.multi.fam+pid.val.fam,0)

    print(paste0(c(" [Multi] Best Effort: ", round(aggr.vec.smoothed[cur.block],2), " with path goal ", round(path.goal.multi[cur.block],2),". We have ", round(algo.var$analysis$score.expl.block.multi[cur.block],2),"."),collapse=""))
    print(paste0(c("  [Var] Changes: ", round(old.multi.var,2), " to ", round(new.multi.var,2),", as frac ", round(fracs[cur.block,1],3)," with goal ", round(target.fracs[cur.block,1],3), " and abs ",round(cur.abs.sub.var[cur.block],3)," with goal ",round(goal.abs.sub.var[cur.block],3)),collapse=""))
    print(paste0(c("  [Shock] Changes: ", round(old.multi.shock,2), " to ", round(new.multi.shock,2),", as frac ", round(fracs[cur.block,2],3)," with goal ", round(target.fracs[cur.block,2],3), " and abs ",round(cur.abs.sub.shock[cur.block],3)," with goal ",round(goal.abs.sub.shock[cur.block],3)),collapse=""))
    print(paste0(c("  [Surp] Changes: ", round(old.multi.surp,2), " to ", round(new.multi.surp,2),", as frac ", round(fracs[cur.block,3],3)," with goal ", round(target.fracs[cur.block,3],3), " and abs ",round(cur.abs.sub.surp[cur.block],3)," with goal ",round(goal.abs.sub.surp[cur.block],3)),collapse=""))
    print(paste0(c("  [Fam] Changes: ", round(old.multi.fam,2), " to ", round(new.multi.fam,2),", as frac ", round(fracs[cur.block,4],3)," with goal ", round(target.fracs[cur.block,4],3), " and abs ",round(cur.abs.sub.fam[cur.block],3)," with goal ",round(goal.abs.sub.fam[cur.block],3)),collapse=""))

    algo.var$expl.path.multi.var <- new.multi.var
    algo.var$expl.path.multi.shock <- new.multi.shock
    algo.var$expl.path.multi.surp <- new.multi.surp
    algo.var$expl.path.multi.fam <- new.multi.fam

    # if(rel.expl.multi>rel.best.effort.multi){ #We are doing something good here do not change anything
    #   print(paste0(c(" [Multi] Best Effort: ", round(rel.best.effort.multi,2), " and path goal is ", round(path.goal.multi,2),". We have ", round(rel.expl.multi,2), " which is better than best effort, so we do not change anything."),collapse=""))
    # } else { #We have to change the variables
    #   old.multi.var <- algo.var$expl.path.multi.var
    #   old.multi.shock <- algo.var$expl.path.multi.shock
    #   old.multi.surp <- algo.var$expl.path.multi.surp
    #   old.multi.fam <- algo.var$expl.path.multi.fam
    #
    #   #Target suboptimality fractions
    #   start.fracs <- c(algo.par$expl.path.multi.start.frac.var, algo.par$expl.path.multi.start.frac.shock,algo.par$expl.path.multi.start.frac.surp, algo.par$expl.path.multi.start.frac.fam)
    #   start.fracs <- start.fracs/sum(start.fracs) #norm for safety
    #   end.fracs <- c(algo.par$expl.path.multi.end.frac.var, algo.par$expl.path.multi.end.frac.shock,algo.par$expl.path.multi.end.frac.surp, algo.par$expl.path.multi.end.frac.fam)
    #   end.fracs <- end.fracs/sum(end.fracs)
    #   target.fracs <- start.fracs+cur.block/blocks*(end.fracs-start.fracs)
    #
    #   #Observed suboptimality fractions
    #   total.sub <- rel.sub.multi.var + rel.sub.multi.shock + rel.sub.multi.surp + rel.sub.multi.fam
    #   current.fracs <- c(rel.sub.multi.var,rel.sub.multi.shock,rel.sub.multi.surp,rel.sub.multi.fam)/total.sub
    #   if(total.sub==0) current.fracs <- rep(1/length(current.fracs),length(current.fracs))
    #
    #   #var
    #   if(target.fracs[1]<current.fracs[1]){ #We want to decrease the noise
    #     factor.var <- 1/algo.par$expl.path.multi.momentum.internal
    #   } else {
    #     factor.var <- algo.par$expl.path.multi.momentum.internal
    #   }
    #
    #   #shock
    #   if(target.fracs[2]<current.fracs[2]){ #We want to decrease the noise
    #     factor.shock <- 1/algo.par$expl.path.multi.momentum.internal
    #   } else {
    #     factor.shock <- algo.par$expl.path.multi.momentum.internal
    #   }
    #
    #   #surp
    #   if(target.fracs[3]<current.fracs[3]){ #We want to decrease the noise
    #     factor.surp <- 1/algo.par$expl.path.multi.momentum.internal
    #   } else {
    #     factor.surp <- algo.par$expl.path.multi.momentum.internal
    #   }
    #
    #   #fam
    #   if(target.fracs[4]<current.fracs[4]){ #We want to decrease the noise
    #     factor.fam <- 1/algo.par$expl.path.multi.momentum.internal
    #   } else {
    #     factor.fam <- algo.par$expl.path.multi.momentum.internal
    #   }
    #
    #   if(path.goal.multi>rel.expl.multi){ #We are worse than the Path
    #     #Worse than path means we want to increase our performance, which means decreasing the noise
    #     basic.multi.factor <- 1/algo.par$expl.path.multi.momentum
    #   } else { #We are better than the path
    #     basic.multi.factor <- algo.par$expl.path.multi.momentum
    #   }
    #
    #   new.multi.var <- old.multi.var*basic.multi.factor*factor.var
    #   new.multi.shock <- old.multi.shock*basic.multi.factor*factor.shock
    #   new.multi.surp <- old.multi.surp*basic.multi.factor*factor.surp
    #   new.multi.fam <- old.multi.fam*basic.multi.factor*factor.fam
    #
    #   print(paste0(c(" [Multi] Best Effort: ", round(rel.best.effort.multi,2), " with path goal ", round(path.goal.multi,2),". We have ", round(rel.expl.multi,2),"."),collapse=""))
    #   print(paste0(c("  [Var] Changes: ", round(old.multi.var,2), " to ", round(new.multi.var,2),", as frac ", round(current.fracs[1],3)," with goal ", round(target.fracs[1],3)),collapse=""))
    #   print(paste0(c("  [Shock] Changes: ", round(old.multi.shock,2), " to ", round(new.multi.shock,2),", as frac ", round(current.fracs[2],3)," with goal ", round(target.fracs[2],3)),collapse=""))
    #   print(paste0(c("  [Surp] Changes: ", round(old.multi.surp,2), " to ", round(new.multi.surp,2),", as frac ", round(current.fracs[3],3)," with goal ", round(target.fracs[3],3)),collapse=""))
    #   print(paste0(c("  [Fam] Changes: ", round(old.multi.fam,2), " to ", round(new.multi.fam,2),", as frac ", round(current.fracs[4],3)," with goal ", round(target.fracs[4],3)),collapse=""))
    #
    #   algo.var$expl.path.multi.var <- new.multi.var
    #   algo.var$expl.path.multi.shock <- new.multi.shock
    #   algo.var$expl.path.multi.surp <- new.multi.surp
    #   algo.var$expl.path.multi.fam <- new.multi.fam
    # }
  }

  algo.var$MC.factor <- algo.var$MC.factor*algo.par$hybrid.decay
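  #e.g. with hybrid.decay = 0.98 the Monte-Carlo weight shrinks geometrically and
  #reaches roughly 0.98^50 = 0.36 of its initial value after 50 blocks.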

  return(algo.var)
}

#' Determines which action to take
#'
#' @export
Act.QLearningPersExpPath <- function(state, model, model.surp=NULL, model.fam=NULL, model.par, model.par.surp=NULL, model.par.fam=NULL, algo.par, algo.var, game.object, eval.only=FALSE, best.guess.episode=FALSE, block.type=NULL){
  restore.point("Act.QLearningPersExpPath")

  Surprise.Formula.QLearningPersExpPath <- function(pred.values,surp.score,algo.par, algo.var){
    res <- pred.values + algo.var$expl.path.surp*surp.score
    return(res)
  }

  Familiarity.Formula.QLearningPersExpPath <- function(pred.values,fam.score,algo.par, algo.var){
    res <- pred.values + algo.var$expl.path.fam*(1-fam.score)
    return(res)
  }
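  #Toy illustration of the two formulas above (made-up numbers, not package output):
  #with pred.values=c(1.0, 0.9, 0.4), surp.score=c(0.0, 0.5, 0.1) and
  #algo.var$expl.path.surp=0.3 the surprise-adjusted values are c(1.00, 1.05, 0.43),
  #so action 2 is picked instead of the greedy action 1 at a suboptimality of 0.1.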

  res <- list(action=NA, action.info=list(best.effort=NA, suboptimality=0, noise.type=NA))

  if(eval.only||best.guess.episode){
    act.values <- model.par$predict(model,model.par,state)
    res$action <- which.is.max(act.values)
    if(is.na(res$action)) res$action <- sample(1:game.object$game.par(game.object)$output.nodes,1)
    res$action.info$best.effort <- TRUE
    return(res)
  }

  if(algo.par$action.policy=="epsilon.greedy"){
    if(runif(1) <= algo.var$epsilon){
      game.par <- game.object$game.par(game.object)
      res$action <- sample(1:game.par$output.nodes,1)
      pred.values <- model.par$predict(model,model.par,state)
      if(res$action==which.is.max(pred.values)){
        res$action.info$best.effort <- TRUE
      } else {
        res$action.info$best.effort <- FALSE
        res$action.info$suboptimality <- max(pred.values)-pred.values[res$action]
      }
      return(res)
    } else {
      act.values <- model.par$predict(model,model.par,state)
      res$action <- which.is.max(act.values)
      if(is.na(res$action)) res$action <- sample(1:game.object$game.par(game.object)$output.nodes,1)
      res$action.info$best.effort <- TRUE
      return(res)
    }
  } else if (algo.par$action.policy == "exploration.path") {
    #restore.point("within.expl.path")
    if(block.type=="var"){
      pred.values <- model.par$predict(model,model.par,state)
      rands <- rnorm(length(pred.values),mean=0, sd=algo.var$expl.path.var)
      act.values <- pred.values + rands
      res$action <- which.is.max(act.values)
      if(is.na(res$action)) res$action <- sample(1:game.object$game.par(game.object)$output.nodes,1)
      if(res$action==which.is.max(pred.values)){
        res$action.info$best.effort <- TRUE
      } else {
        res$action.info$best.effort <- FALSE
        res$action.info$suboptimality <- max(pred.values)-pred.values[res$action]
      }
      return(res)
    } else if(block.type=="shock"){
      if(runif(1) <= algo.var$expl.path.shock){
        game.par <- game.object$game.par(game.object)
        res$action <- sample(1:game.par$output.nodes,1)
        pred.values <- model.par$predict(model,model.par,state)
        if(res$action!=which.is.max(pred.values)){
          res$action.info$best.effort <- FALSE
          res$action.info$suboptimality <- max(pred.values)-pred.values[res$action]
        } else {
          res$action.info$best.effort <- TRUE
        }
        return(res)
      } else {
        act.values <- model.par$predict(model,model.par,state)
        res$action <- which.is.max(act.values)
        if(is.na(res$action)) res$action <- sample(1:game.object$game.par(game.object)$output.nodes,1)
        res$action.info$best.effort <- TRUE
        return(res)
      }
    } else if(block.type=="surp"){
      if(!is.null(model.surp)){
        pred.values <- model.par$predict(model,model.par,state)
        surp.score <- model.par.surp$predict(model.surp,model.par.surp,state)
        act.values <- Surprise.Formula.QLearningPersExpPath(pred.values,surp.score,algo.par, algo.var)
        res$action <- which.is.max(act.values)
        if(res$action!=which.is.max(pred.values)){
          res$action.info$best.effort <- FALSE
          res$action.info$suboptimality <- max(pred.values)-pred.values[res$action]
        } else {
          res$action.info$best.effort <- TRUE
        }
        return(res)
      } else {
        stop("Surprise Model not usable in Action selection")
      }
    } else if(block.type=="fam"){
      if(!is.null(model.fam)){
        pred.values <- model.par$predict(model,model.par,state)
        fam.score <- model.par.fam$predict(model.fam,model.par.fam,state)
        act.values <- Familiarity.Formula.QLearningPersExpPath(pred.values,fam.score,algo.par, algo.var)
        res$action <- which.is.max(act.values)
        if(res$action!=which.is.max(pred.values)){
          res$action.info$best.effort <- FALSE
          res$action.info$suboptimality <- max(pred.values)-pred.values[res$action]
        } else {
          res$action.info$best.effort <- TRUE
        }
        return(res)
      } else {
        stop("Familiarity Model not usable in Action selection")
      }
    } else if(block.type=="vs"){
      pred.values <- model.par$predict(model,model.par,state)

      #Shock part
      if(runif(1) <= algo.var$expl.path.vs.shock){
        game.par <- game.object$game.par(game.object)
        res$action <- sample(1:game.par$output.nodes,1)

        if(res$action==which.is.max(pred.values)){
          res$action.info$best.effort <- TRUE
        } else {
          res$action.info$best.effort <- FALSE
          res$action.info$suboptimality <- max(pred.values)-pred.values[res$action]
          res$action.info$noise.type <- "shock"
        }
        return(res)
      } else {
        rands <- rnorm(length(pred.values),mean=0, sd=algo.var$expl.path.vs.var)
        act.values <- pred.values + rands
        res$action <- which.is.max(act.values)
        if(is.na(res$action)) res$action <- sample(1:game.object$game.par(game.object)$output.nodes,1)

        if(res$action==which.is.max(pred.values)){
          res$action.info$best.effort <- TRUE
        } else {
          res$action.info$best.effort <- FALSE
          res$action.info$suboptimality <- max(pred.values)-pred.values[res$action]
          res$action.info$noise.type <- "var"
        }
        return(res)
      }
    } else if(block.type=="multi"){
      pred.values <- model.par$predict(model,model.par,state)
      best.action <- which.is.max(pred.values)

      #Shock part
      if(runif(1) <= algo.var$expl.path.multi.shock){
        game.par <- game.object$game.par(game.object)
        res$action <- sample(1:game.par$output.nodes,1)

        if(res$action==which.is.max(pred.values)){
          res$action.info$best.effort <- TRUE
        } else {
          res$action.info$best.effort <- FALSE
          res$action.info$suboptimality <- pred.values[best.action]-pred.values[res$action]
          res$action.info$noise.type <- "shock"
        }
        return(res)
      } else {
        rands <- rnorm(length(pred.values),mean=0, sd=algo.var$expl.path.multi.var)
        if(!is.null(model.fam)) {
          fam.noise <- algo.var$expl.path.multi.fam*(1-model.par.fam$predict(model.fam,model.par.fam,state))
        } else {
          fam.noise <- rep(0,length(pred.values))
        }
        if(!is.null(model.surp)) {
          surp.noise <- algo.var$expl.path.multi.surp*(model.par.surp$predict(model.surp,model.par.surp,state))
        } else {
          surp.noise <- rep(0,length(pred.values))
        }
        act.values <- pred.values + rands + fam.noise + surp.noise
        res$action <- which.is.max(act.values)
        if(is.na(res$action)) res$action <- sample(1:game.object$game.par(game.object)$output.nodes,1)
        if(res$action==which.is.max(pred.values)){
          res$action.info$best.effort <- TRUE
        } else {
          res$action.info$best.effort <- FALSE
          res$action.info$suboptimality <- pred.values[best.action]-pred.values[res$action]
          res$action.info$noise.type <- "multi"

          #Calculate suboptimality fractions attributable to each noise source
          total.deciding.noise <- (rands[res$action]-rands[best.action]) + (fam.noise[res$action]-fam.noise[best.action]) + (surp.noise[res$action]-surp.noise[best.action])
          if(total.deciding.noise==0) total.deciding.noise <- 1 #avoid division by zero; if the noise terms cancel, which.is.max decided by random tie-breaking
          res$action.info$noise.fractions <- c((rands[res$action]-rands[best.action])/total.deciding.noise, (fam.noise[res$action]-fam.noise[best.action])/total.deciding.noise, (surp.noise[res$action]-surp.noise[best.action])/total.deciding.noise)
          names(res$action.info$noise.fractions) <- c("var","fam","surp")
        }
        return(res)
      }
    } else {
      stop("block type not supported")
    }
  } else {
    stop("Wrong action policy specified in algo.par.")
  }
}
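#Minimal standalone sketch (an illustration, not part of the training API) of the two
#basic noise types used in Act.QLearningPersExpPath above: "var" adds Gaussian noise to
#the predicted Q-values before taking the argmax, while "shock" ignores them and picks a
#uniformly random action. Base which.max() is used instead of the which.is.max() helper
#used above, to keep the sketch self-contained.
Example.Noise.Types.Sketch <- function(pred.values=c(1.0, 0.8, 0.2), expl.var=0.3, expl.shock=0.1){
  if(runif(1) <= expl.shock){
    action <- sample(seq_along(pred.values), 1)                            #shock noise
  } else {
    noisy <- pred.values + rnorm(length(pred.values), mean=0, sd=expl.var) #var noise
    action <- which.max(noisy)
  }
  list(action=action,
       best.effort=(action==which.max(pred.values)),
       suboptimality=max(pred.values)-pred.values[action])
}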

#' Play a single game based on a given strategy
#'
#' @export
Play.On.Strategy.QLearningPersExpPath <- function(model, model.surp=NULL, model.fam=NULL, model.par, model.par.surp=NULL, model.par.fam=NULL, algo.par, algo.var, game.object, best.guess.episode=FALSE, eval.only=TRUE, block.type=NULL){
  restore.point("Play.On.Strategy.QLearningPersExpPath")
  state <- game.object$start.state(game.object)
  score <- 0
  new.mem <- c()
  best.effort <- c()
  suboptimality <- list()
  suboptimality$total <- 0
  start <- TRUE
  if(!is.null(block.type)&&block.type=="vs"){
    suboptimality$var <- 0
    suboptimality$shock <- 0
  }
  if(!is.null(block.type)&&block.type=="multi"){
    suboptimality$var <- 0
    suboptimality$shock <- 0
    suboptimality$surp <- 0
    suboptimality$fam <- 0
  }

  #if(algo.par$action.policy=="exploration.path"){
  #  algo.var$expl.path.var.cur <- runif(1,min=0,max=algo.var$expl.path.var)
  #}

  while(TRUE){
    restore.point("within.Train.QLearning.II")

    vis.state <- t(game.object$state.2.array(game.state=state, game.object=game.object)) # not a real state but what the algorithm sees. Could be a lot smaller than the real game state [but might depend on encoding]
    if(algo.par$use.rnn){
      if(any(is.na(vis.state))) vis.state[is.na(vis.state)] <- model.par$mask.value
    }
    action.full <- Act.QLearningPersExpPath(state=vis.state, model=model, model.surp=model.surp, model.fam=model.fam, model.par=model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par=algo.par, algo.var=algo.var, game.object=game.object, eval.only=eval.only, best.guess.episode=best.guess.episode, block.type=block.type)
    action <- action.full[["action"]]
    best.effort <- c(best.effort,action.full$action.info$best.effort)
    suboptimality$total <- suboptimality$total+action.full$action.info$suboptimality

    #Deal with suboptimality juggling with vs
    if(!is.null(block.type)&&block.type=="vs"){
      if(!is.na(action.full$action.info$noise.type)){
        if(action.full$action.info$noise.type == "var"){
          suboptimality$var <- suboptimality$var + action.full$action.info$suboptimality
        } else if(action.full$action.info$noise.type == "shock"){
          suboptimality$shock <- suboptimality$shock + action.full$action.info$suboptimality
        } else {
          stop("unknown noise type")
        }
      }
    }

    #Deal with suboptimality juggling with multi
    if(!is.null(block.type)&&block.type=="multi"){
      if(!is.na(action.full$action.info$noise.type)){
        if(action.full$action.info$noise.type == "multi"){
          suboptimality$var <- suboptimality$var + action.full$action.info$suboptimality*action.full$action.info$noise.fractions["var"]
          suboptimality$surp <- suboptimality$surp + action.full$action.info$suboptimality*action.full$action.info$noise.fractions["surp"]
          suboptimality$fam <- suboptimality$fam + action.full$action.info$suboptimality*action.full$action.info$noise.fractions["fam"]
        } else if(action.full$action.info$noise.type == "shock"){
          suboptimality$shock <- suboptimality$shock + action.full$action.info$suboptimality
        } else {
          stop("unknown noise type")
        }
      }
    }

    next.state.full <- game.object$state.transition(game.state=state,action=action,game.object=game.object)

    next.state <- next.state.full$next.state
    vis.next.state <- t(game.object$state.2.array(game.state=next.state, game.object=game.object))
    reward <- next.state.full$reward
    if("discounted.reward" %in% game.object$supports){
      score <- game.object$discounted.reward(game.object, game.state=state, old.score=score, reward=reward)
    } else {
      score <- score+reward
    }
    done <- next.state.full$game.finished

    #Build memory cache
    if(algo.par$use.rnn){
      if(any(is.na(vis.next.state))) vis.next.state[is.na(vis.next.state)] <- model.par$mask.value
    }
    new.mem[[length(new.mem)+1]] <- list(state=vis.state, action=action, next.state=vis.next.state, reward=reward, done=done, start=start)
    start <- FALSE
    if(done){
      break
    }
    state <- next.state
  }
  return(nlist(score, new.mem, best.effort, suboptimality))
}
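#For reference, each element of new.mem above is a transition of the form
#list(state=<encoded state>, action=<chosen action>, next.state=<encoded next state>,
#     reward=<numeric reward>, done=<TRUE if the episode ended>, start=<TRUE for the first move>),
#where the state encoding is whatever game.object$state.2.array produces for the game at hand.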

#' Play multiple games with a given model and exploration type
#'
#' @export
Play.Multiple.Games.QLearningPersExpPath <- function(episodes, model,model.surp=NULL, model.fam=NULL, model.par, model.par.surp=NULL, model.par.fam=NULL,algo.par,algo.var,game.object,type="best", eval.only=TRUE){
  #restore.point("Play.Multiple.Games")
  if(is.null(episodes)||is.na(episodes)||episodes==0){
    return(NULL)
  }
  if((type=="expl.var"||type=="expl.vs" || type=="expl.surp"|| type=="expl.fam" || type=="expl.multi") &&  algo.par$action.policy!='exploration.path'){
    return(NULL)
  }
  if(type=="best"){
    l.res <- lapply(1:episodes,FUN=function(x){
      return(Play.On.Strategy.QLearningPersExpPath(model, model.surp=model.surp, model.fam=model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par, algo.var, game.object, best.guess.episode = TRUE, eval.only=eval.only))
    })
  } else if(type=="cur"){
    l.res <- lapply(1:episodes,FUN=function(x){
      return(Play.On.Strategy.QLearningPersExpPath(model, model.surp=model.surp, model.fam=model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par, algo.var, game.object, best.guess.episode = TRUE, eval.only=eval.only))
    })
  } else if(type=="expl.var"){
    l.res <- lapply(1:episodes,FUN=function(x){
      return(Play.On.Strategy.QLearningPersExpPath(model, model.surp=model.surp, model.fam=model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par, algo.var, game.object, best.guess.episode = FALSE, eval.only=eval.only, block.type="var"))
    })
  } else if(type=="expl.shock"){
    l.res <- lapply(1:episodes,FUN=function(x){
      return(Play.On.Strategy.QLearningPersExpPath(model, model.surp=model.surp, model.fam=model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par, algo.var, game.object, best.guess.episode = FALSE, eval.only=eval.only, block.type="shock"))
    })
  } else if (type=="expl.surp"){
    l.res <- lapply(1:episodes,FUN=function(x){
      return(Play.On.Strategy.QLearningPersExpPath(model, model.surp=model.surp, model.fam=model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par, algo.var, game.object, best.guess.episode = FALSE, eval.only=eval.only, block.type="surp"))
    })
  } else if (type=="expl.fam"){
    l.res <- lapply(1:episodes,FUN=function(x){
      return(Play.On.Strategy.QLearningPersExpPath(model, model.surp=model.surp, model.fam=model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par, algo.var, game.object, best.guess.episode = FALSE, eval.only=eval.only, block.type="fam"))
    })
  } else if(type=="expl.vs"){
    l.res <- lapply(1:episodes,FUN=function(x){
      return(Play.On.Strategy.QLearningPersExpPath(model, model.surp=model.surp, model.fam=model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par, algo.var, game.object, best.guess.episode = FALSE, eval.only=eval.only, block.type="vs"))
    })
  } else if(type=="expl.multi"){
    l.res <- lapply(1:episodes,FUN=function(x){
      return(Play.On.Strategy.QLearningPersExpPath(model, model.surp=model.surp, model.fam=model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par, algo.var, game.object, best.guess.episode = FALSE, eval.only=eval.only, block.type="multi"))
    })
  }
  new.mem <- unlist(lapply(l.res,FUN=function(x){x$new.mem}), recursive=FALSE)
  score <- unlist(lapply(l.res,FUN=function(x){x$score}), recursive=FALSE)
  best.effort <- unlist(lapply(l.res,FUN=function(x){x$best.effort}), recursive=FALSE)
  suboptimality <- list()
  suboptimality$total <- unlist(lapply(l.res,FUN=function(x){x$suboptimality$total}), recursive=FALSE)
  suboptimality$var <- unlist(lapply(l.res,FUN=function(x){x$suboptimality$var}), recursive=FALSE)
  suboptimality$shock <- unlist(lapply(l.res,FUN=function(x){x$suboptimality$shock}), recursive=FALSE)
  suboptimality$surp <- unlist(lapply(l.res,FUN=function(x){x$suboptimality$surp}), recursive=FALSE)
  suboptimality$fam <- unlist(lapply(l.res,FUN=function(x){x$suboptimality$fam}), recursive=FALSE)
  return(nlist(new.mem,score,best.effort, suboptimality))
}

#' Train a model based on Q-Learning
#'
#' @export
Train.QLearningPersExpPath <- function(evaluator, model.par, algo.par, algo.var, model.par.surp=NULL, model.par.fam=NULL, game.object, blocks, eval.only=FALSE, eval.each.round=NULL, start.w.training=TRUE, out.file=NULL){
  restore.point("Train.QLearningPersExpPath")

  if(is.null(algo.var$analysis)){
    algo.var$analysis <- list()
    algo.var$analysis$score.expl.var <- NA
    algo.var$analysis$score.expl.block.var <- NA
    algo.var$analysis$score.expl.shock <- NA
    algo.var$analysis$score.expl.block.shock <- NA
    algo.var$analysis$score.expl.surp <- NA
    algo.var$analysis$score.expl.block.surp <- NA
    algo.var$analysis$score.expl.fam <- NA
    algo.var$analysis$score.expl.block.fam <- NA
    algo.var$analysis$score.best <- NA
    algo.var$analysis$score.best.block <- NA
    algo.var$analysis$score.cur <- NA
    algo.var$analysis$score.cur.block <- NA
    algo.var$analysis$score.expl.vs <- NA
    algo.var$analysis$score.expl.block.vs <- NA
    algo.var$analysis$score.expl.multi <- NA
    algo.var$analysis$score.expl.block.multi <- NA
    algo.var$analysis$sub.vs.var <- NA
    algo.var$analysis$sub.block.vs.var <- NA
    algo.var$analysis$sub.vs.shock <- NA
    algo.var$analysis$sub.block.vs.shock <- NA
    algo.var$analysis$sub.multi <- NA
    algo.var$analysis$sub.block.multi <- NA
    algo.var$analysis$sub.multi.var <- NA
    algo.var$analysis$sub.block.multi.var <- NA
    algo.var$analysis$sub.multi.shock <- NA
    algo.var$analysis$sub.block.multi.shock <- NA
    algo.var$analysis$sub.multi.surp <- NA
    algo.var$analysis$sub.block.multi.surp <- NA
    algo.var$analysis$sub.multi.fam <- NA
    algo.var$analysis$sub.block.multi.fam <- NA
    algo.var$analysis$sub.vs <- NA
    algo.var$analysis$sub.block.vs <- NA
    algo.var$analysis$sub.var <- NA
    algo.var$analysis$sub.block.var <- NA
    algo.var$analysis$sub.shock <- NA
    algo.var$analysis$sub.block.shock <- NA
    algo.var$analysis$sub.surp <- NA
    algo.var$analysis$sub.block.surp <- NA
    algo.var$analysis$sub.fam <- NA
    algo.var$analysis$sub.block.fam <- NA
    algo.var$analysis$time <- NA
    algo.var$analysis$external.eval <- NA
  }

  if(is.null(algo.var$path.goal.var)) algo.var$path.goal.var <- algo.par$expl.path.var.start
  if(is.null(algo.var$path.goal.shock)) algo.var$path.goal.shock <- algo.par$expl.path.shock.start
  if(is.null(algo.var$path.goal.surp)) algo.var$path.goal.surp <- algo.par$expl.path.surp.start
  if(is.null(algo.var$path.goal.fam)) algo.var$path.goal.fam <- algo.par$expl.path.fam.start
  if(is.null(algo.var$path.goal.vs)) algo.var$path.goal.vs <- algo.par$expl.path.vs.start
  if(is.null(algo.var$path.goal.multi)) algo.var$path.goal.multi <- algo.par$expl.path.multi.start

  #Toggle Surprise model
  if(algo.par$block.expl.surp==0 && (algo.par$block.expl.multi==0 || (algo.par$block.expl.multi>0 && (algo.par$expl.path.multi.start.frac.surp==0&&algo.par$expl.path.multi.end.frac.surp==0)))){
    model.par.surp <- NULL
  }
  if(algo.par$block.expl.fam==0 && (algo.par$block.expl.multi==0 || (algo.par$block.expl.multi>0 && (algo.par$expl.path.multi.start.frac.fam==0&&algo.par$expl.path.multi.end.frac.fam==0)))){
    model.par.fam <- NULL
  }

  if(length(algo.var$memory)>0 && start.w.training && !eval.only){
    replay.res <- Replay.QLearningPersExpPath(evaluator, model.par, model.par.surp = model.par.surp, model.par.fam=model.par.fam, algo.par, algo.var, game.object)
    evaluator <- Update.Evaluator.QLearningPersExpPath(evaluator=evaluator, new.model=replay.res$model, new.model.surp=replay.res$model.surp, new.model.fam=replay.res$model.fam, which.update="both", model.par=model.par)
    algo.var <- replay.res$algo.var
  }

  start.block <- max(length(algo.var$analysis$score.best.block),length(algo.var$analysis$score.cur.block))+1
  if(is.na(algo.var$analysis$score.best.block[1])&&is.na(algo.var$analysis$score.cur.block[1])){
    start.block <- 1
  }

  for(i in start.block:blocks){
    restore.point("within.Train.QLearningPersExpPath")

    #### Start Outside Info for self-play
    assign(game.object$basic.name.eval.model, evaluator$model.best, envir=.GlobalEnv)
    assign(game.object$basic.name.eval.model.par, model.par, envir=.GlobalEnv)
    ##### End Outside Info for self-play

    tic()
    best.results <- Play.Multiple.Games.QLearningPersExpPath(episodes=algo.par$block.best,model=evaluator$model.best,model.surp=evaluator$model.surp, model.fam=evaluator$model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par,algo.var,game.object,type="best", eval.only=eval.only)
    cur.results <- Play.Multiple.Games.QLearningPersExpPath(episodes=algo.par$block.cur,model=evaluator$model.cur,model.surp=evaluator$model.surp, model.fam=evaluator$model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par,algo.var,game.object,type="cur", eval.only=eval.only)
    expl.results.var <- Play.Multiple.Games.QLearningPersExpPath(episodes=algo.par$block.expl.var,model=evaluator$model.cur,model.surp=evaluator$model.surp, model.fam=evaluator$model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par,algo.var,game.object,type="expl.var", eval.only=eval.only)
    expl.results.shock <- Play.Multiple.Games.QLearningPersExpPath(episodes=algo.par$block.expl.shock,model=evaluator$model.cur,model.surp=evaluator$model.surp, model.fam=evaluator$model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par,algo.var,game.object,type="expl.shock", eval.only=eval.only)
    expl.results.surp <- Play.Multiple.Games.QLearningPersExpPath(episodes=algo.par$block.expl.surp,model=evaluator$model.cur,model.surp=evaluator$model.surp, model.fam=evaluator$model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par,algo.var,game.object,type="expl.surp", eval.only=eval.only)
    expl.results.fam <- Play.Multiple.Games.QLearningPersExpPath(episodes=algo.par$block.expl.fam,model=evaluator$model.cur,model.surp=evaluator$model.surp, model.fam=evaluator$model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par,algo.var,game.object,type="expl.fam", eval.only=eval.only)
    expl.results.vs <- Play.Multiple.Games.QLearningPersExpPath(episodes=algo.par$block.expl.vs,model=evaluator$model.cur,model.surp=evaluator$model.surp, model.fam=evaluator$model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par,algo.var,game.object,type="expl.vs", eval.only=eval.only)
    expl.results.multi <- Play.Multiple.Games.QLearningPersExpPath(episodes=algo.par$block.expl.multi,model=evaluator$model.cur,model.surp=evaluator$model.surp, model.fam=evaluator$model.fam, model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par,algo.var,game.object,type="expl.multi", eval.only=eval.only)

    #Update memory
    algo.var <- Update.Memory.QLearningPersExpPath(algo.var, best.results$new.mem, game.object, algo.par, best.effort=best.results$best.effort)
    algo.var <- Update.Memory.QLearningPersExpPath(algo.var, cur.results$new.mem, game.object, algo.par, best.effort=cur.results$best.effort)
    algo.var <- Update.Memory.QLearningPersExpPath(algo.var, expl.results.var$new.mem, game.object, algo.par, best.effort=expl.results.var$best.effort)
    algo.var <- Update.Memory.QLearningPersExpPath(algo.var, expl.results.shock$new.mem, game.object, algo.par, best.effort=expl.results.shock$best.effort)
    algo.var <- Update.Memory.QLearningPersExpPath(algo.var, expl.results.surp$new.mem, game.object, algo.par, best.effort=expl.results.surp$best.effort)
    algo.var <- Update.Memory.QLearningPersExpPath(algo.var, expl.results.fam$new.mem, game.object, algo.par, best.effort=expl.results.fam$best.effort)
    algo.var <- Update.Memory.QLearningPersExpPath(algo.var, expl.results.vs$new.mem, game.object, algo.par, best.effort=expl.results.vs$best.effort)
    algo.var <- Update.Memory.QLearningPersExpPath(algo.var, expl.results.multi$new.mem, game.object, algo.par, best.effort=expl.results.multi$best.effort)


    #If wished, eval.each.round evaluates the current model externally
    if(!is.null(eval.each.round)&&eval.each.round!=0){
      restore.point("eval.internal")
      if(is.null(game.object$external.eval)){
        stop("This game does not support External Evaluation!")
      }
      external.eval <- game.object$external.eval(game.object,eval.no=eval.each.round)
      if(is.na(algo.var$analysis$external.eval[1])){
        algo.var$analysis$external.eval <- external.eval
      } else {
        algo.var$analysis$external.eval <- c(algo.var$analysis$external.eval,external.eval)
      }
    }

    #Update scores
    if(!is.null(expl.results.var)){
      if(is.na(algo.var$analysis$score.expl.var[1])){
        algo.var$analysis$score.expl.var <- expl.results.var$score
        algo.var$analysis$score.expl.block.var <- mean(expl.results.var$score)
      } else {
        algo.var$analysis$score.expl.var <- c(algo.var$analysis$score.expl.var, expl.results.var$score)
        algo.var$analysis$score.expl.block.var <- c(algo.var$analysis$score.expl.block.var,mean(expl.results.var$score))
      }
    }

    if(!is.null(expl.results.shock)){
      if(is.na(algo.var$analysis$score.expl.shock[1])){
        algo.var$analysis$score.expl.shock <- expl.results.shock$score
        algo.var$analysis$score.expl.block.shock <- mean(expl.results.shock$score)
      } else {
        algo.var$analysis$score.expl.shock <- c(algo.var$analysis$score.expl.shock, expl.results.shock$score)
        algo.var$analysis$score.expl.block.shock <- c(algo.var$analysis$score.expl.block.shock,mean(expl.results.shock$score))
      }
    }

    if(!is.null(expl.results.surp)){
      if(is.na(algo.var$analysis$score.expl.surp[1])){
        algo.var$analysis$score.expl.surp <- expl.results.surp$score
        algo.var$analysis$score.expl.block.surp <- mean(expl.results.surp$score)
      } else {
        algo.var$analysis$score.expl.surp <- c(algo.var$analysis$score.expl.surp, expl.results.surp$score)
        algo.var$analysis$score.expl.block.surp <- c(algo.var$analysis$score.expl.block.surp,mean(expl.results.surp$score))
      }
    }

    if(!is.null(expl.results.fam)){
      if(is.na(algo.var$analysis$score.expl.fam[1])){
        algo.var$analysis$score.expl.fam <- expl.results.fam$score
        algo.var$analysis$score.expl.block.fam <- mean(expl.results.fam$score)
      } else {
        algo.var$analysis$score.expl.fam <- c(algo.var$analysis$score.expl.fam, expl.results.fam$score)
        algo.var$analysis$score.expl.block.fam <- c(algo.var$analysis$score.expl.block.fam,mean(expl.results.fam$score))
      }
    }

    if(!is.null(expl.results.vs)){
      if(is.na(algo.var$analysis$score.expl.vs[1])){
        algo.var$analysis$score.expl.vs <- expl.results.vs$score
        algo.var$analysis$score.expl.block.vs <- mean(expl.results.vs$score)
      } else {
        algo.var$analysis$score.expl.vs <- c(algo.var$analysis$score.expl.vs, expl.results.vs$score)
        algo.var$analysis$score.expl.block.vs <- c(algo.var$analysis$score.expl.block.vs,mean(expl.results.vs$score))
      }
    }

    if(!is.null(expl.results.multi)){
      if(is.na(algo.var$analysis$score.expl.multi[1])){
        algo.var$analysis$score.expl.multi <- expl.results.multi$score
        algo.var$analysis$score.expl.block.multi <- mean(expl.results.multi$score)
      } else {
        algo.var$analysis$score.expl.multi <- c(algo.var$analysis$score.expl.multi, expl.results.multi$score)
        algo.var$analysis$score.expl.block.multi <- c(algo.var$analysis$score.expl.block.multi,mean(expl.results.multi$score))
      }
    }

    if(!is.null(best.results)){
      if(is.na(algo.var$analysis$score.best[1])){
        algo.var$analysis$score.best <- best.results$score
        algo.var$analysis$score.best.block <- mean(best.results$score)
      } else {
        algo.var$analysis$score.best <- c(algo.var$analysis$score.best, best.results$score)
        algo.var$analysis$score.best.block <- c(algo.var$analysis$score.best.block,mean(best.results$score))
      }
    }

    if(!is.null(cur.results)){
      if(is.na(algo.var$analysis$score.cur[1])){
        algo.var$analysis$score.cur <- cur.results$score
        algo.var$analysis$score.cur.block <- mean(cur.results$score)
      } else {
        algo.var$analysis$score.cur <- c(algo.var$analysis$score.cur, cur.results$score)
        algo.var$analysis$score.cur.block <- c(algo.var$analysis$score.cur.block,mean(cur.results$score))
      }
    }

    #Update suboptimality
    if(!is.null(expl.results.var)){
      if(is.na(algo.var$analysis$sub.var[1])){
        algo.var$analysis$sub.var <- expl.results.var$suboptimality$total
        algo.var$analysis$sub.block.var <- mean(expl.results.var$suboptimality$total)
      } else {
        algo.var$analysis$sub.var <- c(algo.var$analysis$sub.var,expl.results.var$suboptimality$total)
        algo.var$analysis$sub.block.var <- c(algo.var$analysis$sub.block.var,mean(expl.results.var$suboptimality$total))
      }
    }

    if(!is.null(expl.results.shock)){
      if(is.na(algo.var$analysis$sub.shock[1])){
        algo.var$analysis$sub.shock <- expl.results.shock$suboptimality$total
        algo.var$analysis$sub.block.shock <- mean(expl.results.shock$suboptimality$total)
      } else {
        algo.var$analysis$sub.shock <- c(algo.var$analysis$sub.shock,expl.results.shock$suboptimality$total)
        algo.var$analysis$sub.block.shock <- c(algo.var$analysis$sub.block.shock,mean(expl.results.shock$suboptimality$total))
      }
    }

    if(!is.null(expl.results.surp)){
      if(is.na(algo.var$analysis$sub.surp[1])){
        algo.var$analysis$sub.surp <- expl.results.surp$suboptimality$total
        algo.var$analysis$sub.block.surp <- mean(expl.results.surp$suboptimality$total)
      } else {
        algo.var$analysis$sub.surp <- c(algo.var$analysis$sub.surp,expl.results.surp$suboptimality$total)
        algo.var$analysis$sub.block.surp <- c(algo.var$analysis$sub.block.surp,mean(expl.results.surp$suboptimality$total))
      }
    }

    if(!is.null(expl.results.fam)){
      if(is.na(algo.var$analysis$sub.fam[1])){
        algo.var$analysis$sub.fam <- expl.results.fam$suboptimality$total
        algo.var$analysis$sub.block.fam <- mean(expl.results.fam$suboptimality$total)
      } else {
        algo.var$analysis$sub.fam <- c(algo.var$analysis$sub.fam,expl.results.fam$suboptimality$total)
        algo.var$analysis$sub.block.fam <- c(algo.var$analysis$sub.block.fam,mean(expl.results.fam$suboptimality$total))
      }
    }

    if(!is.null(expl.results.vs)){
      if(is.na(algo.var$analysis$sub.vs[1])){
        algo.var$analysis$sub.vs <- expl.results.vs$suboptimality$total
        algo.var$analysis$sub.block.vs <- mean(expl.results.vs$suboptimality$total)
        algo.var$analysis$sub.vs.var <- expl.results.vs$suboptimality$var
        algo.var$analysis$sub.block.vs.var <- mean(expl.results.vs$suboptimality$var)
        algo.var$analysis$sub.vs.shock <- expl.results.vs$suboptimality$shock
        algo.var$analysis$sub.block.vs.shock <- mean(expl.results.vs$suboptimality$shock)
      } else {
        algo.var$analysis$sub.vs <- c(algo.var$analysis$sub.vs,expl.results.vs$suboptimality$total)
        algo.var$analysis$sub.block.vs <- c(algo.var$analysis$sub.block.vs,mean(expl.results.vs$suboptimality$total))
        algo.var$analysis$sub.vs.var <- c(algo.var$analysis$sub.vs.var,expl.results.vs$suboptimality$var)
        algo.var$analysis$sub.block.vs.var <- c(algo.var$analysis$sub.block.vs.var,mean(expl.results.vs$suboptimality$var))
        algo.var$analysis$sub.vs.shock <- c(algo.var$analysis$sub.vs.shock,expl.results.vs$suboptimality$shock)
        algo.var$analysis$sub.block.vs.shock <- c(algo.var$analysis$sub.block.vs.shock,mean(expl.results.vs$suboptimality$shock))
      }
    }

    if(!is.null(expl.results.multi)){
      if(is.na(algo.var$analysis$sub.multi[1])){
        algo.var$analysis$sub.multi <- expl.results.multi$suboptimality$total
        algo.var$analysis$sub.block.multi <- mean(expl.results.multi$suboptimality$total)
        algo.var$analysis$sub.multi.var <- expl.results.multi$suboptimality$var
        algo.var$analysis$sub.block.multi.var <- mean(expl.results.multi$suboptimality$var)
        algo.var$analysis$sub.multi.shock <- expl.results.multi$suboptimality$shock
        algo.var$analysis$sub.block.multi.shock <- mean(expl.results.multi$suboptimality$shock)
        algo.var$analysis$sub.multi.surp <- expl.results.multi$suboptimality$surp
        algo.var$analysis$sub.block.multi.surp <- mean(expl.results.multi$suboptimality$surp)
        algo.var$analysis$sub.multi.fam <- expl.results.multi$suboptimality$fam
        algo.var$analysis$sub.block.multi.fam <- mean(expl.results.multi$suboptimality$fam)
      } else {
        algo.var$analysis$sub.multi <- c(algo.var$analysis$sub.multi,expl.results.multi$suboptimality$total)
        algo.var$analysis$sub.block.multi <- c(algo.var$analysis$sub.block.multi,mean(expl.results.multi$suboptimality$total))
        algo.var$analysis$sub.multi.var <- c(algo.var$analysis$sub.multi.var,expl.results.multi$suboptimality$var)
        algo.var$analysis$sub.block.multi.var <- c(algo.var$analysis$sub.block.multi.var,mean(expl.results.multi$suboptimality$var))
        algo.var$analysis$sub.multi.shock <- c(algo.var$analysis$sub.multi.shock,expl.results.multi$suboptimality$shock)
        algo.var$analysis$sub.block.multi.shock <- c(algo.var$analysis$sub.block.multi.shock,mean(expl.results.multi$suboptimality$shock))
        algo.var$analysis$sub.multi.surp <- c(algo.var$analysis$sub.multi.surp,expl.results.multi$suboptimality$surp)
        algo.var$analysis$sub.block.multi.surp <- c(algo.var$analysis$sub.block.multi.surp,mean(expl.results.multi$suboptimality$surp))
        algo.var$analysis$sub.multi.fam <- c(algo.var$analysis$sub.multi.fam,expl.results.multi$suboptimality$fam)
        algo.var$analysis$sub.block.multi.fam <- c(algo.var$analysis$sub.block.multi.fam,mean(expl.results.multi$suboptimality$fam))
      }
    }

    #Build output message
    output.message <- c("block: ",i," of ",blocks," with the following scores:")
    if(!is.null(best.results)) output.message <- paste0(c(output.message, " best: ",round(mean(best.results$score),2)),collapse="")
    if(!is.null(cur.results)) output.message <- paste0(c(output.message, " cur: ",round(mean(cur.results$score),2)),collapse="")
    if(!is.null(expl.results.var)) output.message <- paste0(c(output.message, " expl.var: ",round(mean(expl.results.var$score),2)),collapse="")
    if(!is.null(expl.results.surp)) output.message <- paste0(c(output.message, " expl.surp: ",round(mean(expl.results.surp$score),2)),collapse="")
    if(!is.null(expl.results.fam)) output.message <- paste0(c(output.message, " expl.fam: ",round(mean(expl.results.fam$score),2)),collapse="")
    if(!is.null(expl.results.vs)) output.message <- paste0(c(output.message, " expl.vs: ",round(mean(expl.results.vs$score),2)),collapse="")
    if(!is.null(expl.results.multi)) output.message <- paste0(c(output.message, " expl.multi: ",round(mean(expl.results.multi$score),2)),collapse="")
    if(!is.null(expl.results.shock) && algo.par$action.policy=="exploration.path") output.message <- paste0(c(output.message, " expl.shock(",round(algo.var$expl.path.shock,3),"): ",round(mean(expl.results.shock$score),2)),collapse="")
    if(!is.null(expl.results.shock) && algo.par$action.policy=="epsilon.greedy") output.message <- paste0(c(output.message, " expl.shock(",round(algo.var$epsilon,3),"): ",round(mean(expl.results.shock$score),2)),collapse="")

    print(output.message)

    if(length(algo.var$memory)>algo.par$max.mem){
      algo.var$memory <- algo.var$memory[-c(sample(1:length(algo.var$memory),round(length(algo.var$memory)*algo.par$remove.memory)))]
    }

    if(!eval.only){
      #If Score is better with new model, change best model
      if(!is.null(cur.results) && !is.null(best.results)){
        if(mean(cur.results$score)>mean(best.results$score)){
          evaluator <- Update.Evaluator.QLearningPersExpPath(evaluator, new.model=evaluator$model.cur, which.update = "best", model.par=model.par)
          print("New best model chosen!")
        }
      } else {
        if(!is.null(cur.results)){
          evaluator <- Update.Evaluator.QLearningPersExpPath(evaluator, new.model=evaluator$model.cur, which.update = "best", model.par=model.par)
        }
      }

      #Train new cur.model
      replay.res <- Replay.QLearningPersExpPath(evaluator=evaluator, model.par=model.par, model.par.surp=model.par.surp, model.par.fam=model.par.fam, algo.par=algo.par, algo.var=algo.var, game.object=game.object)
      evaluator <- replay.res$evaluator
      algo.var <- replay.res$algo.var

      #End time measurement
      toc.res <- toc(quiet=TRUE)
      used.time <- toc.res$toc-toc.res$tic
      #Update time
      if(is.na(algo.var$analysis$time[1])){
        algo.var$analysis$time <- used.time
      } else {
        algo.var$analysis$time <- c(algo.var$analysis$time,used.time)
      }

      #Change Algovariables
      algo.var <- Change.Algo.Var.QLearningPersExpPath(algo.par, algo.var, cur.block=i, blocks=blocks)
    }

    #Save intermediate results in case of computer crash etc.
    if(is.null(out.file)){
      out.file <- "out.save"
    }

    #Currently only models of the same type are supported
    if((!is.null(model.par.surp)&&model.par$name!=model.par.surp$name)||(!is.null(model.par.fam)&&model.par$name!=model.par.fam$name)){
      stop("The surprise, familiarity and main models currently have to be of the same type (Neural Network, XGBoost, ...).")
    }

    if(!(model.par$name %in% c("Neural.Network.Basic", "RNN.Basic"))){ #Basic scenario, we do not need to worry about pointers
      save(evaluator, algo.var,file=out.file)
    } else {
      if(model.par$name=="Neural.Network.Basic"){
        file.sub.name <- "NN"
      } else {
        file.sub.name <- "RNN"
      }
      if(!is.null(evaluator$model.cur)&&("keras.engine.training.Model" %in% class(evaluator$model.cur))){
        cur.file.path <- paste0(getwd(),"/",format(Sys.time(), "%Y-%m-%d %H")," cur.",file.sub.name,"_save")
        evaluator$model.cur.out.file <- cur.file.path
        save_model_hdf5(object=evaluator$model.cur, filepath=cur.file.path, overwrite = TRUE, include_optimizer = TRUE)
      }
      if(!is.null(evaluator$model.best)&&("keras.engine.training.Model" %in% class(evaluator$model.best))){
        best.file.path <- paste0(getwd(),"/",format(Sys.time(), "%Y-%m-%d %H")," best.",file.sub.name,"_save")
        evaluator$model.best.out.file <- best.file.path
        save_model_hdf5(object=evaluator$model.best, filepath=best.file.path, overwrite = TRUE, include_optimizer = TRUE)
      }
      if(!is.null(evaluator$model.surp)&&("keras.engine.training.Model" %in% class(evaluator$model.surp))){
        surp.file.path <- paste0(getwd(),"/",format(Sys.time(), "%Y-%m-%d %H")," surp.",file.sub.name,"_save")
        evaluator$model.surp.out.file <- surp.file.path
        save_model_hdf5(object=evaluator$model.surp, filepath=surp.file.path, overwrite = TRUE, include_optimizer = TRUE)
      }
      if(!is.null(evaluator$model.fam)&&("keras.engine.training.Model" %in% class(evaluator$model.fam))){
        fam.file.path <- paste0(getwd(),"/",format(Sys.time(), "%Y-%m-%d %H")," fam.",file.sub.name,"_save")
        evaluator$model.fam.out.file <- fam.file.path
        save_model_hdf5(object=evaluator$model.fam, filepath=fam.file.path, overwrite = TRUE, include_optimizer = TRUE)
      }
      save(evaluator, algo.var,file=out.file)
    }
  }

  return(list(evaluator=evaluator, algo.var=algo.var))
}
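#Hedged usage sketch: the concrete model and game constructors come from other files of
#this package and are only placeholders here; see the documentation of
#Setup.QLearningPersExpPath and Initialise.QLearningPersExpPath for the exact arguments.
#  algo.par    <- Get.Def.Par.QLearningPersExpPath(setting="ThesisOpt.XGB")
#  game.object <- <game object of the chosen game>
#  model.par   <- <default parameters of the chosen model type>
#  evaluator   <- <evaluator set up via Setup/Initialise.QLearningPersExpPath>
#  res <- Train.QLearningPersExpPath(evaluator=evaluator, model.par=model.par,
#                                    algo.par=algo.par, algo.var=<algo.var>,
#                                    game.object=game.object, blocks=100)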


#' Q-values based on history of IPD
#'
#' May become deprecated if the IPD implementation changes. Only used to simplify analysis.
#'
#' @export
Q.on.hist.PD.QLearning <- function(model, model.par, game.object, hist.me, hist.other, hist.real.me="not given"){
  restore.point("Q.on.hist.PD.QLearning")

  if(length(hist.me)!=length(hist.other)){
    stop("both histories have to have the same ")
  }

  state <- list()
  state$round <- length(hist.me)+1
  state$me.last.see <- hist.me[length(hist.me)]
  state$other.last.see <- hist.other[length(hist.other)]
  state$game.finished <- FALSE

  #as seen
  hist.me.full <- rep(NA,game.object$game.pars$T.max)
  if(length(hist.me)>0) hist.me.full[1:length(hist.me)] <- hist.me

  hist.other.full <- rep(NA,game.object$game.pars$T.max)
  if(length(hist.other)>0) hist.other.full[1:length(hist.other)] <- hist.other

  state$history.see <- cbind(hist.me.full,hist.other.full)

  #real
  if(!identical(hist.real.me, "not given")){ #covers NULL and vector-valued real histories as well
    hist.real.me.full <- rep(NA,game.object$game.pars$T.max)
    if(length(hist.real.me)>0) hist.real.me.full[1:length(hist.real.me)] <- hist.real.me

    hist.real.other.full <- rep(NA,game.object$game.pars$T.max)

    state$history.real <- cbind(hist.real.me.full,hist.real.other.full)
  }

  x_train <- t(game.object$state.2.array(game.state=state, game.object=game.object))

  if(model.par$name=="RNN.Basic"){
    x_train[is.na(x_train)] <- model.par$mask.value
  }

  return(model.par$predict(model,model.par, x_train))
}
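#Hedged usage sketch (model, model.par and game.object are placeholders provided by the
#rest of the package; how the actions in the history vectors are encoded depends on the
#IPD game object):
#  Q.on.hist.PD.QLearning(model=evaluator$model.best, model.par=model.par,
#                         game.object=game.object,
#                         hist.me=c(1, 2), hist.other=c(1, 1))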