R/nls_multstart.R
In nls.multstart: Robust Non-Linear Regression using AIC Scores

#' Finds the best fit of non-linear model based on AIC score
#'
#' Finds the best estimated model using non-linear least squares regression
#' using nlsLM(). The best fit is determined using AIC scores.
#'
#' @param formula a non-linear model formula, with the response on the left of a
#'   ~ operator and an expression involving parameters on the right.
#' @param data (optional) data.frame, list or environment in which to evaluate
#'   the variables in \code{formula} and \code{modelweights}.
#' @param iter number of combinations of starting parameters which will be tried
#'   . If a single value is provided, then a shotgun/random-search/lhs approach
#'   will be used to sample starting parameters from a uniform distribution
#'   within the starting parameter bounds. If a vector of the same length as the
#'   number of parameters is provided, then a gridstart approach will be used to
#'   define each combination of that number of equally spaced intervals across
#'   each of the starting parameter bounds respectively. Thus, c(5,5,5) for
#'   three fitted parameters yields 125 model fits.  Supplying a vector for
#'   \code{iter} will override \code{convergence_count}.
#' @param start_lower lower boundaries for the start parameters. If missing,
#'   this will default to -1e+10.
#' @param start_upper upper boundaries for the start parameters. If missing,
#'   this will default to 1e+10.
#' @param supp_errors if \code{supp_errors = 'Y'}, then warning messages will be
#'   suppressed and no error messages from \code{\link[minpack.lm]{nlsLM}} will
#'   be shown, reducing the number of error messages printed while the model
#'   attempts to converge using poor starting parameters. We advise to only use
#'   \code{supp_errors = 'Y'} when confident in the bounds of your starting
#'   parameters.
#' @param convergence_count The number of counts that the winning model should
#'   be undefeated for before it is declared the winner. This argument defaults
#'   to 100. If specified as \code{FALSE}, then all of the iterations will be
#'   fitted, and the best model selected. Note that \code{convergence_count} can
#'   only be used with a shotgun/random-search approach, and not with a
#'   gridstart approach. This argument will be ignored if a gridstart approach
#'   is specified by a vector input for \code{iter}.
#' @param control specific control can be specified using
#'   \code{\link[minpack.lm]{nls.lm.control}}.
#' @param modelweights Optional model weights for the nls. If \code{data} is
#'   specified, then this argument should be the name of the numeric weights
#'   vector within the \code{data} object.
#' @param lhstype Method to use for Latin Hypercube Sampling using \code{\link[lhs]{lhs}}. Choice of
#' \code{"random"} (simple random lhs, fast), \code{"improved"} (lhs with
#' optimised euclidean distance between points, medium speed), \code{"maximin"}
#' (lhs with maximised minimum distance between points, medium speed), or
#' \code{"genetic"} (lhs optimised to the S optimilaity criterion using a
#' genetic algorithm, slow). If not set, a purely random (shotgun) approach is
#' taken. Only used if \code{iter} is a single number.
#' @param \dots Extra arguments to pass to \code{\link[minpack.lm]{nlsLM}} if
#'   necessary.
#' @return returns a nls object of the best estimated model fit.
#' @note Useful additional arguments for \code{\link[minpack.lm]{nlsLM}}
#'   include: \code{na.action = na.omit}, \code{lower/upper = c()} where these
#'   represent upper and lower boundaries for parameter estimates.
#' @author Daniel Padfield
#' @author Granville Matheson
#' @author Francis Windram
#' @seealso \code{\link[minpack.lm]{nlsLM}} for details on additional arguments
#'   to pass to the nlsLM function. \code{\link[lhs]{lhs}} for details of Latin Hypercube Sampling
#' @examples
#' # load in data
#'
#' data("Chlorella_TRC")
#' Chlorella_TRC_test <- Chlorella_TRC[Chlorella_TRC$curve_id == 1,]
#'
#' # run nls_multstart()
#'
#' # define the Sharpe-Schoolfield equation
#' schoolfield_high <- function(lnc, E, Eh, Th, temp, Tc) {
#'  Tc <- 273.15 + Tc
#'  k <- 8.62e-5
#'  boltzmann.term <- lnc + log(exp(E/k*(1/Tc - 1/temp)))
#'  inactivation.term <- log(1/(1 + exp(Eh/k*(1/Th - 1/temp))))
#'  return(boltzmann.term + inactivation.term)
#' }
#'
#' fits <- nls_multstart(ln.rate ~ schoolfield_high(lnc, E, Eh, Th, temp = K, Tc = 20),
#'                 data = Chlorella_TRC_test,
#'                 iter = 500,
#'                 start_lower = c(lnc=-10, E=0.1, Eh=0.5, Th=285),
#'                 start_upper = c(lnc=10, E=2, Eh=5, Th=330),
#'                 lower = c(lnc=-10, E=0, Eh=0, Th=0),
#'                 supp_errors = 'Y')
#'
#' @name nls_multstart
#' @export

# getting rid of undefined variables note
if(base::getRversion() >= "2.15.1")  utils::globalVariables(c("AICval", 'iteration', 'startpars'))

nls_multstart <-
  # arguments needed for nls_multstart ####
  function(formula, data = parent.frame(), iter, start_lower, start_upper,
           supp_errors = c("Y", "N"), convergence_count = 100, control,
           modelweights, lhstype=c("none", "random", "improved", "maximin", "genetic"), ...) {

    # set default values
    if (missing(supp_errors)) {
      supp_errors <- "N"
    }

    # create model ####
    formula <- stats::as.formula(formula)

    # define parameters to estimate and independent variable ####
    if (any(class(data) %in% c("data.frame", "list", "environment"))) {
      params_ind <- all.vars(formula[[3]])[all.vars(formula[[3]]) %in% names(data)]
      params_est <- all.vars(formula[[3]])[!all.vars(formula[[3]]) %in% names(data)]
      params_dep <- all.vars(formula[[2]])
    } else {
      cli::cli_abort(c("data should be a {.cls data.frame}, {.cls list} or an {.cls environment}",
      "x"="You've provided: {.cls {class(data)}}"))
    }

    # set up parameter boundaries ####

    if (missing(start_lower) || missing(start_upper)) {
      cli::cli_warn(c("!"="No boundaries specified for the starting values of sought parameters.",
                      " "="Default values of +/- {.val {1e+10}} will be used. This is likely to slow the process of finding the best model.",
                      " "="Continue with default values [Y/n]?"))
      r <- readline("? ")

      if (tolower(r) == "n") {
        cli::cli_abort("Please enter upper and lower parameter boundaries as start_lower and start_upper in the function arguments.")
      }
    }

    if (missing(start_lower)) start_lower <- rep(-10 ^ 10, length(params_est))
    if (missing(start_upper)) start_upper <- rep(10 ^ 10, length(params_est))

    if (length(start_lower) != length(params_est) || length(start_upper) != length(params_est)) {
      cli::cli_abort(c(
        "x"="There must be as many parameter starting bounds as there are parameters.",
        "",
        "*"="Parameters to estimate: {.val {params_est}}",
        "*"="Parameters supplied as input data: {.val {params_ind}}",
        "*"="Number of parameters in start_lower: {.val {length(start_lower)}}",
        "*"="Number of parameters in start_upper: {.val {length(start_upper)}}",
        "",
        "!"="If there are parameters listed as input data which should be estimated,",
        " "="this can be caused by variables in the environment/data with the same name as model parameters.",
        ""
      ))
    }

    params_bds <- data.frame(
      param = params_est,
      low.bds = unlist(start_lower),
      high.bds = unlist(start_upper),
      stringsAsFactors = FALSE
    )

    # nlsLM controls - this can stay the same, potential to be overridden
    if (missing(control)) {
      control <- minpack.lm::nls.lm.control(maxiter = 1000, ftol = .Machine$double.eps, ptol = .Machine$double.eps)
    }


    # transform input arguments
    silent <- ifelse(supp_errors == "Y", TRUE, FALSE)

    # if silent is TRUE, temporarily switch off warnings
    if(silent == TRUE){
      oo <- options(warn=-1)
      on.exit(options(oo))
    }

    if ("modelweights" %in% all.vars(formula)) {
      cli::cli_abort(c("x"="The variable name {.var modelweights} is reserved for model weights.","!"="Please change the name of this variable."))
    }

    if (missing(modelweights)) {
      data$modelweights <- rep(1, length(data[[params_dep]]))
    } else {
      data$modelweights <- eval(substitute(modelweights), data)
    }


    if (length(iter) == 1) {
      multistart_type <- "shotgun"
    } else if (length(iter) == nrow(params_bds)) {
      multistart_type <- "gridstart"
    } else {
      cli::cli_abort(c("x"="{.arg iter} should be a single number for the shotgun or lhs approaches and of the same length as the number of parameters for the gridstart approach.",
                       "*"="Number of parameters: {.val {nrow(params_bds)}}"))
    }

    if (multistart_type == "gridstart" && convergence_count != FALSE) {
      cli::cli_warn(c("!"="A gridstart approach cannot be applied with {.arg convergence_count}.","i"="Convergence count will be set to {.val {FALSE}}"))
      convergence_count <- FALSE
    }

    lhstype <- rlang::arg_match(lhstype)

    # set up start values ####

    ## SHOTGUN ##

    if (multistart_type == "shotgun") {
      if (lhstype == "none") {
        strt <- purrr::map2(params_bds$low.bds, params_bds$high.bds, ~runif(iter, .x, .y))
        names(strt) <- params_bds$param
        strt <- dplyr::bind_rows(strt)
      } else {
        lhs_mapper <- switch(lhstype,
                             random=lhs::randomLHS,  # ~400us for 500x5
                             improved=lhs::improvedLHS, # ~135ms
                             maximin=lhs::maximinLHS,  # ~135ms
                             genetic=lhs::geneticLHS,  # ~500ms
                             {cli::cli_warn(c("!"="Not a known lhs approach, defaulting to {.val {'random'}}"));lhs::randomLHS})
        lhs_map <- lhs_mapper(n=iter, k=nrow(params_bds))
        rmat <- apply(params_bds[,2:3], 1, function(x) {diff(range(x))})
        strt <- tibble::as_tibble(t(apply(lhs_map, 1, function(x) {(x * rmat) + params_bds$low.bds})))
        colnames(strt) <- params_bds$param
      }
    }


    ## GRIDSTART ##

    if (multistart_type == "gridstart") {
      params_bds$iter <- iter
      strt <- purrr::pmap(
        as.list(params_bds[, -1]),
        function(low.bds, high.bds, iter) seq(from = low.bds, to = high.bds, length.out = iter)
      )
      names(strt) <- params_bds$param
      strt <- tibble::as_tibble(expand.grid(strt))
    }


    # Fit nls model using LM optimisation across multiple starting values


    #### With convergence ####

    if (multistart_type == "shotgun" && convergence_count != FALSE) {

      # set count to 0
      count <- 0

      # create AIC vector
      stored_AIC <- Inf

      # create empty fit
      fit <- NULL
      fit_best <- NULL

      for (j in 1:nrow(strt)) {
        start.vals <- as.list(strt[j, ])

        # try and fit the model for every set of searching parameters

        try(
          fit <- minpack.lm::nlsLM(
            formula,
            start = start.vals,
            control = control,
            data = data,
            weights = modelweights, ...
          ),
          silent = silent
        )


        # if the AIC score of the next fit model is < the AIC of the stored fit fit, replace the fit
        # the output to ensure the best model is selected

        if (is.null(fit) && is.null(fit_best)) {
          count <- 0
        }
        else {
          count <- ifelse(stored_AIC <= stats::AIC(fit), count + 1, 0)
        }
        if (count == convergence_count) {
          break
        }

        if (!is.null(fit) && stored_AIC > stats::AIC(fit)) {
          stored_AIC <- stats::AIC(fit)
          fit_best <- fit
        }
      }
    }


    #### Without convergence_count ####

    if (convergence_count == FALSE) {
      strt$iteration <- 1:nrow(strt)

      allfits <- tidyr::nest(strt, startpars = -iteration)


      # create empty fit
      fit <- NULL
      fit_best <- NULL


      # fit_function saves only AIC values so as not to fill memory with model fits

      fit_aic <- function(startpars) {
        start.vals <- as.list(startpars[[1]])

        try(
          fit <- minpack.lm::nlsLM(
            formula,
            start = start.vals,
            control = control,
            data = data,
            weights = modelweights, ...
          ),
          silent = silent
        )

        AICval <- ifelse(!is.null(fit), stats::AIC(fit), Inf)

        return(AICval)
      }

      allfits <- dplyr::group_by(allfits, iteration)
      allfits <- dplyr::mutate(allfits, AICval = purrr::map_dbl(
        startpars,
        ~fit_aic(startpars)
      ))
      allfits <- dplyr::arrange(allfits, AICval)

      fit_best <- minpack.lm::nlsLM(
        formula,
        start = allfits$startpars[[1]],
        control = control,
        data = data,
        weights = modelweights, ...
      )
    }

    return(fit_best)
  }
Any scripts or data that you put into this service are public.
nls.multstart documentation built on April 3, 2025, 6:14 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
nls.multstart
Robust Non-Linear Regression using AIC Scores

R/nls_multstart.R
In nls.multstart: Robust Non-Linear Regression using AIC Scores

Try the nls.multstart package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

nls.multstart Robust Non-Linear Regression using AIC Scores

R/nls_multstart.R In nls.multstart: Robust Non-Linear Regression using AIC Scores

Try the nls.multstart package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

nls.multstart
Robust Non-Linear Regression using AIC Scores

R/nls_multstart.R
In nls.multstart: Robust Non-Linear Regression using AIC Scores