# R/LikGradapproxHess.general.R

#' Likelihood, gradient and Hessian for univariate transition intensity based models
#'
#' @param params Parameters vector.
#' @param data Dataset in proper format.
#' @param full.X Full design matrix.
#' @param MM List of necessary setup quantities.
#' @param pen.matr.S.lambda Penalty matrix multiplied by smoothing parameter lambda.
#' @param aggregated.provided Whether the data were provided in aggregated form (this argument may become obsolete in the future, since the original dataset can be seen as a special case of the aggregated one with \code{nrep = 1}).
#' @param do.gradient Whether or not to compute the gradient.
#' @param do.hessian Whether or not to compute the Hessian.
#' @param pmethod Method used to compute the transition probability matrix. See the help of \code{msm()} for further details.
#' @param death Whether the last state is an absorbing state.
#' @param Qmatr.diagnostics.list List of maximum absolute values of the Q matrices computed during model fitting.
#' @param verbose Whether to print out the progress being made in computing the likelihood, gradient and Hessian.
#' @param parallel Whether or not to use parallel computing (only for Windows users for now).
#' @param no_cores Number of cores used if parallel computing is chosen. The default is 2. If \code{NULL}, all available cores are used.
#'
#' @return Penalized likelihood, gradient and Hessian associated with the model at the given parameters, for use by the trust region algorithm.
#' @export
#'
#'
LikGradapproxHess.general = function(params, data = NULL, full.X = NULL, MM, pen.matr.S.lambda, aggregated.provided = FALSE,
                               do.gradient = TRUE, do.hessian = TRUE, pmethod = 'analytic', death,
                               Qmatr.diagnostics.list = NULL,
                               verbose = FALSE, parallel = FALSE, no_cores = 2){


  # Initialize
  l.par = 0                                             # log-likelihood
  G = rep(0, MM$l.params)                               # gradient
  H = matrix(0, ncol = MM$l.params, nrow = MM$l.params) # hessian

  if(is.null(MM$cens.state)) MM$cens.state = -99

  if(parallel){
    if(is.null(no_cores)) no_cores <- parallel::detectCores(logical = TRUE)  # logical = TRUE returns the number of available hardware threads; logical = FALSE would return only the number of physical cores
    if(!is.null(no_cores) && no_cores > parallel::detectCores(logical = TRUE)) stop('You can use up to ', parallel::detectCores(logical = TRUE), ' cores on this device. Please change the value provided for no_cores.')

    # Break the data in no_cores sections
    peeps = unique(data$`(id)`)
    part.lik = floor(length(peeps)/no_cores)
    part.lik.last = length(peeps)  - part.lik*(no_cores-1)
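    # Worked example (illustrative, not from the original code): with 10 subjects and no_cores = 3,
    # part.lik = floor(10/3) = 3 and part.lik.last = 10 - 3*2 = 4, so the chunks contain
    # subjects 1-3, 4-6 and 7-10, i.e. the last core takes the remainder.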

    sta = 1; sto = sta + part.lik - 1
    data.list = list(list(data[data$`(id)` %in% peeps[sta:sto], ]))
    data.list[[1]][[2]] = full.X[which(data$`(id)` %in% peeps[sta:sto]), ]

    for(i in 2:no_cores){
      sta = sto + 1
      sto = sta + ifelse(i < no_cores, part.lik, part.lik.last) - 1
      data.list[[i]] = list(data[data$`(id)` %in% peeps[sta:sto], ])
      data.list[[i]][[2]] = full.X[which(data$`(id)` %in% peeps[sta:sto]), ]
    }

    parallel.LikGradapproxHess = function(data.parallel) {
      LikGradapproxHess.general(params, data = data.parallel[[1]], full.X = data.parallel[[2]], MM = MM, pen.matr.S.lambda = matrix(0, nrow = nrow(pen.matr.S.lambda), ncol = ncol(pen.matr.S.lambda)), # *****
                          aggregated.provided = FALSE, do.gradient = do.gradient, do.hessian = do.hessian,
                          pmethod = pmethod, death = death,
                          Qmatr.diagnostics.list = Qmatr.diagnostics.list,
                          parallel = FALSE)
    }
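
    # Note: each worker receives a zero penalty matrix, so it returns the (negated) unpenalised
    # log-likelihood, gradient and Hessian of its chunk; these contributions are summed below,
    # the signs are flipped back, and the penalty S(lambda) is added only once at the end of
    # this function.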


    cl <- parallel::makeCluster(no_cores)
    # parallel::clusterEvalQ(cl, {library(flexmsm)})

    parallel::clusterExport(cl, varlist = c('LikGradapproxHess.general', 'Q.matr.setup.general', 'P.matr.comp', 'dP.matr.comp', 'd2P.matr.comp',
                                            'params', 'data', 'full.X', 'MM', 'pen.matr.S.lambda',
                                            'aggregated.provided', 'do.gradient', 'do.hessian',
                                            'pmethod', 'death',
                                            'Qmatr.diagnostics.list'), envir = environment())
    parallel.LikGradapproxHess <- parallel::parLapply(cl, data.list, parallel.LikGradapproxHess)
    parallel::stopCluster(cl)



    Q.diag.list.temp = c()
    for(i in 1:no_cores){
      l.par = l.par + parallel.LikGradapproxHess[[i]]$value
      G = G + parallel.LikGradapproxHess[[i]]$gradient
      H = H + parallel.LikGradapproxHess[[i]]$hessian
      Q.diag.list.temp = c(Q.diag.list.temp, parallel.LikGradapproxHess[[i]]$Qmatr.diagnostics.list[[length(parallel.LikGradapproxHess[[i]]$Qmatr.diagnostics.list)]])
    }
    Qmatr.diagnostics.list[[length(Qmatr.diagnostics.list)+1]] = Q.diag.list.temp

    l.par = -l.par; G = -G; H = -H # because we will be flipping the sign later, when we add the real final penalty



  } else { # **** ORIGINAL NOT PARALLELISED CODE ****



    Q.matr.object = Q.matr.setup.general(params[MM$pos.optparams], MM$nstates, full.X, MM$start.pos.par, MM$l.short.formula, MM$whereQ,
                                         firstD = do.gradient, secondD = FALSE, bound.eta = FALSE, # note: bound.eta was only for debugging purposes, don't touch this
                                         pos.optparams = MM$pos.optparams, pos.optparams2 = MM$pos.optparams2)
    Qmatr = Q.matr.object$Qmatr
    dQmatr = Q.matr.object$dQmatr
    # d2Qmatr = Q.matr.object$d2Qmatr
    comp.par.mapping = Q.matr.object$comp.par.mapping
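    # Qmatr is an nstates x nstates x nrow(data) array (one transition intensity matrix per data
    # row); dQmatr stacks its first derivatives with respect to the model parameters, giving an
    # nstates x nstates x l.params x nrow(data) array; comp.par.mapping maps parameter pairs onto
    # the compact storage of the second derivatives and is only needed by the commented-out d2Q
    # code further below.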


    # [03/01/2021] Introducing a check to verify whether pathological behaviour is occurring within the Q matrix and when this starts
    # to happen ------------------------------------------------------------------------- PART A -----------------------------------
    if(!is.null(Qmatr.diagnostics.list)) Qmatr.i.maxs = c()
    # ------------------------------------------------------------------------------------------------------------------------------

    # Set first id, Q and dQ (then only fish out new Q matrix and new dQ matrix when this changes) --- only id
    id.t = data$"(id)"[1]
    # *******************************************************************************************************


    for(i in 1:nrow(data)){ # perhaps change to while(i < nagg) where nagg is the number of rows in the aggregated dataset as done in Jackson lik.c code (on github)

      # # For checking the speed of this new implementation
      # print(paste(i, '/'  , nrow(data), sep = ''))

      if( (i == nrow(data) || data$"(id)"[i] != data$"(id)"[i+1]) ) next # i.e. skip the last observation for each individual

      if( data$"(id)"[i] != id.t ) id.t = data$"(id)"[i] # update id.t when it changes

      fromstate = data$"(fromstate)"[i]
      tostate = data$"(tostate)"[i]
      timelag = data$"(timelag)"[i]
      nrep = data$"(nrep)"[i]
      living.exact = data$"(living.exact)"[i]

      Qmatr.i   = Qmatr[,,i]
      dQmatr.i  = dQmatr[,,,i]
      # d2Qmatr.i = d2Qmatr[,,,i] #d2Qmatr[,,,,i]

      # [03/01/2021] Introducing a check to verify whether pathological behaviour is occurring within the Q matrix and when this starts
      # to happen ------------------------------------------------------------------------- PART B -----------------------------------

      if(!is.null(Qmatr.diagnostics.list)) Qmatr.i.maxs = c(Qmatr.i.maxs,  max(abs(Qmatr.i)))
      # --------------------------------------------------------------------------------------------------------------------------------

      if(pmethod == 'eigendecomp'){
        # *** EIGENDECOMPOSITION ***
        # Compute P matrix and extract eigendecomposition quantities
        # times.eig = c()
        # start.eig = Sys.time()
        P.comp.i = P.matr.comp(Qmatr.i, timelag)

        Pmatr.i = P.comp.i$Pmatr
        decomp.obj.i = P.comp.i$decomp.obj

        # Compute dP matrices (this is an array of dimension nstates x nstates x l.params)
        if(do.gradient) dPmatr.i = dP.matr.comp(nstates = MM$nstates, l.params = MM$l.params, timelag = timelag,
                                                dQmatr.i = dQmatr.i, method = 'eigendecomp', decomp.obj.i = decomp.obj.i)$dPmatr

        # # Compute d2P matrices (this is an array of dimension nstates x nstates x l.params x l.params)
        # if(do.hessian) d2Pmatr.i = d2P.matr.comp(nstates = MM$nstates, l.params = MM$l.params, timelag, dQmatr.i, d2Qmatr.i, method = 'eigendecomp',
        #                                          decomp.obj.i = decomp.obj.i, start.pos.par = MM$start.pos.par, comp.par.mapping = comp.par.mapping)

      }



      if(pmethod == 'analytic'){ # DO WE NEED COMP.PAR.MAPPING HERE TOO?? PROBABLY! REMEMBER TO CHECK THIS!!
        # *** ANALYTIC EXPRESSIONS ***
        start.anal = Sys.time()
        Pmatr.i = P.matr.comp(Qmatr = Qmatr.i, timelag = timelag, method = 'analytic')$Pmatr

        dPmatr.i = dP.matr.comp(nstates = MM$nstates, l.params = MM$l.params, timelag = timelag,
                                dQmatr.i = dQmatr.i, method = 'analytic',
                                Pmatr.i = Pmatr.i, Qmatr.i = Qmatr.i)$dPmatr

        # d2Pmatr.i = d2P.matr.comp(nstates = MM$nstates, l.params = MM$l.params, timelag = timelag,
        #                           dQmatr.i = dQmatr.i, d2Qmatr.i = d2Qmatr.i, start.pos.par = MM$start.pos.par,
        #                           method = 'analytic',
        #                           Qmatr.i = Qmatr.i, Pmatr.i = Pmatr.i, dPmatr.i = dPmatr.i)

        end.anal = Sys.time()
      }



      if(pmethod == 'scaling&squaring'){
        # *** SCALING AND SQUARING ***
        start.new = Sys.time()
        Pmatr.comp.i = P.matr.comp(Qmatr = Qmatr.i, timelag = timelag, method = 'scaling&squaring')

        Pmatr.i =  Pmatr.comp.i$Pmatr
        SS.obj.i = Pmatr.comp.i$SS.obj

        dPmatr.comp.i = dP.matr.comp(nstates = MM$nstates, l.params = MM$l.params, timelag = timelag,
                                     dQmatr.i = dQmatr.i, method = 'scaling&squaring', Qmatr.i = Qmatr.i, SS.obj.i = SS.obj.i)
        dPmatr.i.SS = dPmatr.comp.i$dPmatr
        SS.obj.i = dPmatr.comp.i$SS.obj

        # d2Pmatr.i.SS = d2P.matr.comp(nstates = MM$nstates, l.params = MM$l.params, timelag = timelag,
        #                              dQmatr.i = dQmatr.i, d2Qmatr.i = d2Qmatr.i, start.pos.par = MM$start.pos.par,
        #                              method = 'scaling&squaring',
        #                              Qmatr.i = Qmatr.i, SS.obj.i = SS.obj.i)
        end.new = Sys.time()

        dPmatr.i = dPmatr.i.SS
        # d2Pmatr.i = d2Pmatr.i.SS

      }


      # do.gradient = TRUE # might be useful for debugging purposes
      # do.hessian = TRUE # might be useful for debugging purposes

      if((tostate != MM$nstates & fromstate != MM$cens.state & tostate != MM$cens.state & !living.exact) | !death) { # standard (interval-censored) transition contribution; if death = FALSE all transitions are handled here

        # log-likelihood --------------------------
        l.par = l.par + log(Pmatr.i[fromstate, tostate]) * nrep

        # These are just checks used in debug
        if( is.nan(log(Pmatr.i[fromstate, tostate])) ) stop(paste('A NaN was produced at observation', i))
        if( -log(Pmatr.i[fromstate, tostate]) == Inf ) stop(paste('An Inf was produced at observation', i))
        # print(paste('i =', i, log(Pmatr.i[fromstate, tostate]) ))


        if(do.gradient){

          # gradient --------------------------------------------------------------------------------------
          G = G + dPmatr.i[fromstate, tostate, ]/Pmatr.i[fromstate, tostate] * nrep # exploiting array structure

        }


        if(do.hessian){

          # hessian -----------------------------------------------------------------------------------------
          # second.deriv.Ind.i = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
          prod.first.deriv.Ind.i = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
          for(k in 1:MM$l.params){
            for(l in k:MM$l.params){
              prod.first.deriv.Ind.i[k,l] = dPmatr.i[fromstate, tostate, k] * dPmatr.i[fromstate, tostate, l]
              # second.deriv.Ind.i[k,l] = d2Pmatr.i[fromstate, tostate, k, l]
            }
          }
          H = H + ( # second.deriv.Ind.i/Pmatr.i[fromstate, tostate]
                   - prod.first.deriv.Ind.i/Pmatr.i[fromstate, tostate]^2) * nrep
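          # Approximation note: the exact second-derivative term d2P/P (commented out above) is
          # dropped, so only minus the outer product of the first derivatives of P, scaled by
          # 1/P^2, is accumulated; only the upper triangle is filled here and the full symmetric
          # matrix is reconstructed after the loop.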

        }


      } else if (tostate == MM$nstates & death) { # absorbing state

        # log-likelihood -------------------------------------------------------------------------------------------
        exact.death.term = 0
        for(s in 1:(MM$nstates-1)) exact.death.term = exact.death.term + Pmatr.i[fromstate, s] * Qmatr.i[s, MM$nstates]
        l.par = l.par + log(exact.death.term) * nrep
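        # Exactly observed death: the contribution is nrep * log( sum over living states s of
        # P[fromstate, s](timelag) * Q[s, nstates] ), i.e. the subject moves to some living state s
        # over the interval and then jumps to the absorbing state exactly at the observed time.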

        if(do.gradient){

          # gradient -------------------------------------------------------------------------------------------------
          exact.death.term = 0
          exact.death.term.der = 0
          for(s in 1:(MM$nstates-1)){
            exact.death.term = exact.death.term + Pmatr.i[fromstate, s]*Qmatr.i[s, MM$nstates]
            exact.death.term.der = exact.death.term.der + (dPmatr.i[fromstate, s, ]*Qmatr.i[s, MM$nstates] + Pmatr.i[fromstate, s]*dQmatr.i[s, MM$nstates, ])
          }

          G = G + exact.death.term.der/exact.death.term * nrep

        }

        if(do.hessian){

          # hessian -----------------------------------------------------------------------------------------------------
          exact.death.term = 0
          single.term.1 = 0
          single.term.2 = 0
          # square.term = 0



          for(s in 1:(MM$nstates-1)){

            first.deriv.term.1 = rep(0, MM$l.params)
            first.deriv.term.2 = matrix(0, MM$l.params)
            # second.deriv.Ind.i = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
            # second.deriv.Q.Ind.i = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
            # prod.first.deriv.Ind.i.1 = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
            # prod.first.deriv.Ind.i.2 = matrix(0, ncol = MM$l.params, nrow = MM$l.params)

            comp.par = 1 # to properly map d2Q given its compact form

            for(k in 1:MM$l.params){

              first.deriv.term.1[k] = dPmatr.i[fromstate, s, k]*Qmatr.i[s, MM$nstates] + Pmatr.i[fromstate, s]*dQmatr.i[s, MM$nstates, k]

              # for(l in k:MM$l.params){ # only computation of upper triangle is needed (reconstruct full matrix from this later)

                # prod.first.deriv.Ind.i.1[k,l] = dPmatr.i[fromstate, s, k]*dQmatr.i[s, MM$nstates, l]
                # prod.first.deriv.Ind.i.2[k,l] = dQmatr.i[s, MM$nstates, k]*dPmatr.i[fromstate, s, l]

                # second.deriv.Ind.i[k,l] = d2Pmatr.i[fromstate, s, k, l]

                # # COMPACT IMPLEMENTATION (remember to uncomment comp.par as well) **************
                # if( is.null(comp.par.mapping) ){
                #   if( max(which(k - MM$start.pos.par >= 0)) == max(which(l - MM$start.pos.par >= 0)) ) { # ... but remember block-diagonal structure, this checks to which transition intensity the parameter belongs, if not same for r1 and r2 then d2Q is just matrix of zeroes
                #     second.deriv.Q.Ind.i[k,l] = d2Qmatr.i[s, MM$nstates, comp.par]
                #     comp.par = comp.par + 1
                #   } else {
                #     second.deriv.Q.Ind.i[k,l]s = 0
                #   }
                # } else {
                #
                #   if( !is.na(comp.par.mapping[k, l]) ){
                #     second.deriv.Q.Ind.i[k,l] = d2Qmatr.i[s, MM$nstates, comp.par.mapping[k, l]]
                #   } else {
                #     second.deriv.Q.Ind.i[k,l] = 0
                #   }
                #
                # }
                # # *********************************************************************************

              # }
            }

            exact.death.term = exact.death.term + Pmatr.i[fromstate, s]*Qmatr.i[s, MM$nstates]
            single.term.1 = single.term.1 + first.deriv.term.1
            # square.term = square.term + (second.deriv.Ind.i*Qmatr.i[s, MM$nstates] + prod.first.deriv.Ind.i.1 + prod.first.deriv.Ind.i.2 + Pmatr.i[fromstate, s]*second.deriv.Q.Ind.i)

          }
          matr.single.term.1 = matrix(rep(single.term.1, MM$l.params), ncol = MM$l.params, byrow = TRUE)
          matr.single.term.2 = t(matr.single.term.1)

          matr.single.term.1[lower.tri(matr.single.term.1, diag = FALSE)] = 0 # only upper triangle
          matr.single.term.2[lower.tri(matr.single.term.2, diag = FALSE)] = 0 # only upper triangle

          H = H + (#square.term/exact.death.term
            - matr.single.term.1*matr.single.term.2/exact.death.term^2) * nrep
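          # As in the standard case above, the approximate Hessian drops the second-derivative
          # term (square.term, commented out), keeping only minus the outer product of the first
          # derivative of the exact-death term divided by its square (upper triangle only).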

        }


      } else if (living.exact) { # exactly observed living state

        stop('The exactly observed living state is currently not supported when the approximation of the Hessian is computed. \nPlease use the analytic Hessian option instead or contact authors.')

        # l.par = l.par + ( Qmatr.i[fromstate, fromstate] * timelag + log(Qmatr[fromstate, tostate, i+1]) ) * nrep # last term is 'i+1' because the Q we multiply by is the one found in the arrival time, and since we are assuming left-cont piecewise const we need the Q from the following observation
        #
        # if(do.gradient) G = G + (dQmatr[fromstate, tostate, , i+1] -  Qmatr[fromstate, tostate, i+1] * dQmatr.i[fromstate, fromstate, ] *  timelag ) / Qmatr[fromstate, tostate, i+1] * nrep
        #
        # if(do.hessian){
        #
        #   prod.first.deriv.Ind.i.1 = prod.first.deriv.Ind.i.2 = prod.first.deriv.Ind.i.3 = prod.first.deriv.Ind.i.3 = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
        #   prod.first.deriv.Ind.i.4 = prod.first.deriv.Ind.i.5 = prod.first.deriv.Ind.i.6 = prod.first.deriv.Ind.i.7 = prod.first.deriv.Ind.i.3 = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
        #   second.deriv.Q.Ind.i = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
        #   second.deriv.Q.Ind.ip1 = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
        #
        #   comp.par = 1
        #
        #   for(k in 1:MM$l.params){
        #     for (l in k:MM$l.params){
        #       prod.first.deriv.Ind.i.1[k,l] =  dQmatr[fromstate, tostate, k, i+1] * dQmatr.i[fromstate, fromstate, l] * timelag
        #       prod.first.deriv.Ind.i.2[k,l] =  Qmatr[fromstate, tostate, k, i+1] * dQmatr.i[fromstate, fromstate, k] * dQmatr.i[fromstate, fromstate, l] * timelag^2
        #       prod.first.deriv.Ind.i.3[k,l] =  dQmatr[fromstate, tostate, l, i+1] * dQmatr.i[fromstate, fromstate, k] * timelag
        #
        #       prod.first.deriv.Ind.i.4[k,l] =  dQmatr[fromstate, tostate, k, i+1] * dQmatr[fromstate, tostate, l, i+1]
        #       prod.first.deriv.Ind.i.5[k,l] =  Qmatr[fromstate, tostate, i+1] * dQmatr[fromstate, tostate, k, i+1] * dQmatr.i[fromstate, fromstate, l] * timelag
        #       prod.first.deriv.Ind.i.6[k,l] =  Qmatr[fromstate, tostate, i+1] * dQmatr.i[fromstate, tostate, k] * dQmatr[fromstate, fromstate, l, i+1] * timelag
        #       prod.first.deriv.Ind.i.7[k,l] =  Qmatr[fromstate, tostate, i+1]^2 * dQmatr.i[fromstate, tostate, k] * dQmatr.i[fromstate, tostate, l] * timelag^2
        #
        #       # COMPACT IMPLEMENTATION (remember to uncomment comp.par as well) **************
        #       if( is.null(comp.par.mapping) ){
        #         if( max(which(k - MM$start.pos.par >= 0)) == max(which(l - MM$start.pos.par >= 0)) ) { # ... but remember block-diagonal structure, this checks to which transition intensity the parameter belongs, if not same for r1 and r2 then d2Q is just matrix of zeroes
        #           second.deriv.Q.Ind.i[k,l] = d2Qmatr.i[fromstate, fromstate, comp.par]
        #           second.deriv.Q.Ind.ip1[k,l] = d2Qmatr[fromstate, tostate, comp.par,i+1]
        #
        #           comp.par = comp.par + 1
        #         } else {
        #           second.deriv.Q.Ind.i[k,l] = 0
        #         }
        #       } else {
        #
        #         if( !is.na(comp.par.mapping[k, l]) ){
        #           second.deriv.Q.Ind.i[k,l] =   d2Qmatr.i[fromstate, fromstate, comp.par.mapping[k, l]]
        #           second.deriv.Q.Ind.ip1[k,l] = d2Qmatr[fromstate, tostate, comp.par.mapping[k, l],i+1]
        #
        #         } else {
        #           second.deriv.Q.Ind.i[k,l] = 0
        #           second.deriv.Q.Ind.ip1[k,l] = 0
        #         }
        #
        #       }
        #       # *********************************************************************************
        #     }
        #   }
        #
        #   H = H + (Qmatr[fromstate, tostate, i+1]^-1 * ( prod.first.deriv.Ind.i.1 + prod.first.deriv.Ind.i.2 + second.deriv.Q.Ind.ip1 + prod.first.deriv.Ind.i.3 + Qmatr[fromstate, tostate, i+1] * second.deriv.Q.Ind.i * timelag) + Qmatr[fromstate, tostate, i+1]^-2 * ( prod.first.deriv.Ind.i.4 + prod.first.deriv.Ind.i.5 + prod.first.deriv.Ind.i.6 + prod.first.deriv.Ind.i.7 ) ) * nrep
        #
        # }

      } else if (tostate == MM$cens.state) { # censored state

        stop('The censored state option is currently not supported when the approximation of the Hessian is computed. \nPlease use the analytic Hessian option instead or contact authors.')

        # # log-likelihood --------------------------
        # l.par = l.par + log( sum(Pmatr.i[fromstate, ]) ) * nrep
        #
        # # gradient --------------------------------------------------------------------------------------
        # if(do.gradient) G = G + apply(dPmatr.i[fromstate, , ], MARGIN = 2, FUN = sum)/sum(Pmatr.i[fromstate, ]) * nrep # exploiting array structure
        #
        #
        #
        # if(do.hessian){
        #
        #   # hessian -----------------------------------------------------------------------------------------
        #   second.deriv.Ind.i = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
        #   prod.first.deriv.Ind.i = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
        #   for(k in 1:MM$l.params){
        #     for(l in k:MM$l.params){
        #       prod.first.deriv.Ind.i[k,l] = sum(dPmatr.i[fromstate, , k]) * sum(dPmatr.i[fromstate, , l])
        #       second.deriv.Ind.i[k,l] = sum(d2Pmatr.i[fromstate, , k, l])
        #     }
        #   }
        #   H = H + (second.deriv.Ind.i/sum(Pmatr.i[fromstate, ]) - prod.first.deriv.Ind.i/sum(Pmatr.i[fromstate, ])^2) * nrep
        #
        # }

      } else if (fromstate == MM$cens.state) { # censored state starting point - not sure if this is correct !! keep an eye on it

        stop('The censored state option is currently not supported when the approximation of the Hessian is computed. \nPlease use the analytic Hessian option instead or contact authors.')

    #     # log-likelihood --------------------------
    #     l.par = l.par + log( sum(Pmatr.i[, tostate]) ) * nrep
    #
    #     # gradient --------------------------------------------------------------------------------------
    #     if(do.gradient) G = G + apply(dPmatr.i[, tostate, ], MARGIN = 2, FUN = sum)/sum(Pmatr.i[, tostate]) * nrep # exploiting array structure
    #
    #
    #
    #     if(do.hessian){
    #
    #       # hessian -----------------------------------------------------------------------------------------
    #       second.deriv.Ind.i = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
    #       prod.first.deriv.Ind.i = matrix(0, ncol = MM$l.params, nrow = MM$l.params)
    #       for(k in 1:MM$l.params){
    #         for(l in k:MM$l.params){
    #           prod.first.deriv.Ind.i[k,l] = sum(dPmatr.i[, tostate, k]) * sum(dPmatr.i[, tostate, l])
    #           second.deriv.Ind.i[k,l] = sum(d2Pmatr.i[, tostate, k, l])
    #         }
    #       }
    #       H = H + (second.deriv.Ind.i/sum(Pmatr.i[, tostate]) - prod.first.deriv.Ind.i/sum(Pmatr.i[, tostate])^2) * nrep
    #
    #     }
    #
      }
    }

  }



  # Obtain penalty terms
  S.h  <- pen.matr.S.lambda                      # penalty contribution to the Hessian: S(lambda)
  S.h1 <- 0.5*crossprod(params, S.h)%*%params    # penalty contribution to the log-likelihood: 0.5 * t(params) %*% S(lambda) %*% params
  S.h2 <- S.h%*%params                           # penalty contribution to the gradient: S(lambda) %*% params

  # Penalize log-likelihood
  lik.pen = -l.par + S.h1

  # Penalize gradient
  G.pen = -G + S.h2

  # Penalize hessian
  # H[lower.tri(H, diag = FALSE)] = H[upper.tri(H, diag = FALSE)] # reconstruct full matrix --- THIS IS WRONG !!!
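  # Only the upper triangle of H (including the diagonal) was accumulated in the main loop, so
  # adding the transpose yields the full symmetric matrix with a doubled diagonal, which is then
  # halved; in the parallel case the recursive calls already returned full matrices.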
  if(!parallel){ # no need as we already return the full matrix with parallel
    H = H + t(H)         # reconstruct full matrix
    diag(H) = diag(H)/2  # reconstruct full matrix
  }
  H.pen = -H + S.h



  # Q DIAGNOSTICS - may be useful
  if(!is.null(Qmatr.diagnostics.list) && !parallel){
    Qmatr.diagnostics.list[[length(Qmatr.diagnostics.list)+1]] = Qmatr.i.maxs
  }

  # --------------------------------------------------------------------------------------------------------------------------------


  if(verbose) print(paste(Sys.time(), '// lik.pen =', lik.pen, "// max abs grad =", round(max(abs(G.pen)), 5), '// min eigen =', round(min(eigen(H.pen)$values), 5)))



  list(value = lik.pen, gradient = G.pen, hessian = H.pen, # penalized log-lik, gradient and Hessian
       S.h = S.h, S.h1 = S.h1, S.h2 = S.h2,                # penalty matrices
       l = -l.par,                                         # unpenalized negative log-likelihood
       Qmatr.diagnostics.list = Qmatr.diagnostics.list     # containing maximum abs Q values, for diagnostics
  )

}
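
# Illustrative sketch (not part of the original source): the list returned above exposes the
# `value`, `gradient` and `hessian` components expected by a trust region optimiser such as
# trust::trust(), which minimises `value` by default. The objects `data.msm`, `X.design`,
# `MM.setup`, `S.lambda` and `theta.start` below are hypothetical placeholders for quantities
# produced by the package's setup routines; the call is kept commented out so that sourcing
# this file has no side effects.
#
# obj.fun = function(theta) LikGradapproxHess.general(theta, data = data.msm, full.X = X.design,
#                                                     MM = MM.setup, pen.matr.S.lambda = S.lambda,
#                                                     death = TRUE)
# fit = trust::trust(obj.fun, parinit = theta.start, rinit = 1, rmax = 100)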
