R/milags_functions.R

Defines functions get_theme cut_the_data smooth_data get_def_pars get_all_methods_lag calc_lag get_lag plot_lag_fit fit_max_infl_lag lag_biomass_incr fit_exp_lag plot_data make_grwoth_curve_df calc_baranyi_fit_lag get_init_pars_baranyi calc_lagistic_fit_lag get_init_pars_logistic calc_lag_fit_to_baranyi_with_lag calc_lag_fit_to_logistic_with_lag choose_lag_fit_algorithm_logistic choose_lag_fit_algorithm_baranyi compare_algorithms fit_exp_lag_to_curve get_n0

Documented in calc_baranyi_fit_lag calc_lag calc_lag_fit_to_baranyi_with_lag calc_lag_fit_to_logistic_with_lag calc_lagistic_fit_lag choose_lag_fit_algorithm_baranyi choose_lag_fit_algorithm_logistic compare_algorithms cut_the_data fit_exp_lag fit_exp_lag_to_curve fit_max_infl_lag get_all_methods_lag get_def_pars get_init_pars_baranyi get_init_pars_logistic get_lag get_n0 get_theme lag_biomass_incr make_grwoth_curve_df plot_data plot_lag_fit smooth_data

############################################# helper functions #########################################
#' get_n0
#'
#' Gets the initial biomass to relate to
#' @param biomass vector of biomass (chronologically ordered as in growth curve)
#' @param n0_method "first.observation" if the first point is taken as the initial biomass or
#' "minimal.observation" if the minimal biomass is taken is the initial point.
#' In "healthy" growth curves these options should be equivalent
#' but sometimes a drop in OD/biomass is observed at the beginning of a growth curve.
#' In this case it is not obvious what to assume the initial biomass is.
#' @import dplyr
#' @import ggplot2
#' @import minpack.lm
#' @import nlsMicrobio
#' @rawNamespace import(testthat, except = getNamespaceExports("testthat"))
#' @returns a value of the initial biomass (either the first observation or the minimum value depending on the parameter N0.method)
get_n0 <- function(biomass, n0_method) {
  # Get the initial biomass value
  if (n0_method == "first.observation") {
    n0_calc <- biomass[1]
  } else if (n0_method == "minimal.observation") {
    n0_calc <- min(biomass)
  }
  return(n0_calc)
}



#' fit_exp_lag_to_curve
#'
#' Fits the lag to one growth curve based on the basic tangent method
#' @param data a data frame with two required columns names: "time" and "biomass",
#' This is data from one growth curve only, one (mean) observation per time
#' @param n0 the initial biomass (a tangent line crossing N0 line will determine the lag)
#' @param tangent_method "local.regression" (if the tangent is fitted to a number of points around the maximal growth rate)
#' or "to.point" (if the tangent is fitted only to the point where the growth rate is maximal); defaults to "to.point"
#' @param curve_points if tangent_method = "local.regression" then curve_points is the number of points the line is fitted to;
#' defaults to 3 i.e. the point with the maximal uptake rate one point before and one point after
#' @returns line_slope: slope of the tangent line,
#' line_intercept: intercept of the tangent line,
#' lag: lag,
#' tangent_points: i..e a data frame of all points selected for fitting the line
fit_exp_lag_to_curve <- function(data, n0, tangent_method = "to.point", curve_points = 3) {
  data <- data %>% as.data.frame() %>%
    mutate(log_biomass = log(biomass),
           db = log_biomass - dplyr::lag(log_biomass,1),
           dt = time - dplyr::lag(time,1),
           time_diff = (time + dplyr::lag(time))/2,
           central_scheme_db = dplyr::lead(log_biomass) - dplyr::lag(log_biomass,1),
           central_scheme_dt = dplyr::lead(time) - dplyr::lag(time,1),
           db_over_dt = central_scheme_db/central_scheme_dt) #db/dt)
  log_n0 <- log(n0)
  # Get the maximum growth rate
  max_gr_rate <- max(data$db_over_dt, na.rm = TRUE)
  ind_max_diff <- which(data$db_over_dt == max_gr_rate)

  if (length(ind_max_diff) > 1) {
    warning("Multiple points with max derivative: taking the first one")
    ind_max_diff <- ind_max_diff[1]
  }
  if (tangent_method == "to.point") {
    # 1. take simply one maximal growth rate point and calculate a tangent line to it
    # by definition the slope of this line will be equal to its derivative i.e. the
    exp_gr_points <- data[ind_max_diff, ]
    time_max_diff <- exp_gr_points$time
    max_diff_log_biomass <- exp_gr_points$log_biomass
    # ax + b = y
    # a = max.growth.rate
    # we know one point (x,y) = (time.max.diff, log.biomass.at.max.diff)
    line_slope <- max_gr_rate
    line_intercept <- max_diff_log_biomass - line_slope*time_max_diff
  } else if (tangent_method == "local.regression") {
    # 2. Take N points around the maximal growth rate and fit a line,
    # linearly extrapolate
    around_points <- floor((curve_points-1)/2)
    exp_gr_points <- data[(ind_max_diff-around_points):(ind_max_diff+around_points),]
    mod <- lm(log_biomass ~ time, exp_gr_points)
    line_intercept <- unname(mod$coefficients[1])
    line_slope <- unname(mod$coefficients[2])
  }
  # at x=lag we have y = log.N0, so lag = (y-b)/a
  lag <- (log_n0 - line_intercept)/line_slope
  return(list(line_slope = line_slope, line_intercept = line_intercept, lag = lag,
              tangent_points = exp_gr_points %>% select(time, tangent_point = biomass)))
}




#' compare_algorithms
#'
#' Compares results of 3 objects obtained from running nls
#' @param nls_LM_no_bound first object resulting from running nls
#' @param nls_PORT second object resulting from running nls
#' @param nlsres_LM third object resulting from running nls
#' @returns the best fitting object (lowest Res.Sum Sq provided that all coefficients are nonnegative)
compare_algorithms <- function(nls_LM_no_bound, nls_PORT, nlsres_LM) {
  if (!all(is.na(nls_LM_no_bound))) {
    s <- summary(nls_LM_no_bound)
    if (all(as.numeric(s$coefficients[,1]) >= 0)) {
      nls_LM_no_bound <- nls_LM_no_bound
    } else {
      nls_LM_no_bound <- NA
    }
  }
  # first compare two bounded models
  if (!all(is.na(nls_PORT)) & !all(is.na(nlsres_LM))) {
    # if both bounded models are available compare them and choose the better one
    anova_res <- anova(nlsres_LM, nls_PORT)
    # in case multiple are same good take the first one
    better_model_idx <- which(anova_res$`Res.Sum Sq` == min(anova_res$`Res.Sum Sq`))[1]
    if (better_model_idx ==1) {nls_bounded <- nlsres_LM} else if (better_model_idx == 2) {nls_bounded <- nls_PORT}
  } else if (!all(is.na(nls_PORT))) {
    nls_bounded <- nls_PORT
  } else if (!all(is.na(nlsres_LM))) {
    nls_bounded <- nlsres_LM
  } else {
    nls_bounded <- NA
  }

  # the netter bounded model compare to unbounded
  if (!all(is.na(nls_bounded)) & !all(is.na(nls_LM_no_bound))) {
    # if both bounded models are available compare them and choose the better one
    anova_res <- anova(nls_bounded, nls_LM_no_bound)
    # in case multiple are same good take the first one
    better_model_idx <- which(anova_res$`Res.Sum Sq` == min(anova_res$`Res.Sum Sq`))[1]
    if (better_model_idx ==1) {nls_a <- nls_bounded} else if (better_model_idx == 2) {nls_a <- nls_LM_no_bound}
  } else if (!all(is.na(nls_bounded))) {
    nls_a <- nls_bounded
  } else if (!all(is.na(nls_LM_no_bound))) {
    nls_a <- nls_LM_no_bound
  } else {
    nls_a <- NA
  }
  return(nls_a)

}




#' choose_lag_fit_algorithm_baranyi
#'
#' Runs nlsLM/nls algorithms with three different parameter setups to fit the best Baranyi parameters to our data and chooses the best model
#' @param gr_curve data from one specific growth curve with the following columns: LOG10N, t
#' @param LOG10N0 init value for the LOG10N0 parameter
#' @param init_lag initial value for the lag
#' @param init_mumax initial value for the mumax parameter
#' @param init_LOG10Nmax initial value for the LOG10Nmax parameter
#' @param max_iter max. number of iterations
#' @param lower_bound lower bound for the bounded nls optimization;
#' @returns the best nls fitting object with parameters fitted to Baranyi model (lowest Res.Sum Sq provided that all coefficients are nonnegative)
choose_lag_fit_algorithm_baranyi <- function(gr_curve,
                                                     LOG10N0,
                                                     init_lag,
                                                     init_mumax,
                                                     init_LOG10Nmax,
                                                     max_iter,
                                                     lower_bound) {

  # choose best from LM and port
  # Sometimes the lower.bound argument makes the fit much worse.
  tryCatch(
    expr =
      {nlsres_LM <- nlsLM(formula = baranyi,
                        data = gr_curve,
                        start = list(lag=init_lag, mumax=init_mumax, LOG10N0 = LOG10N0, LOG10Nmax = init_LOG10Nmax),
                        control = nls.control(maxiter = max_iter),
                        lower = lower_bound)
      },
    error = function(cond) {
      # this operator assigns value outside the error environment
      nlsres_LM <<- NA
    })
  tryCatch(
    expr =
      {nls_LM_no_bound <- nlsLM(formula = baranyi,
                                 data = gr_curve,
                                 start = list(lag=init_lag, mumax=init_mumax, LOG10N0 = LOG10N0, LOG10Nmax = init_LOG10Nmax),
                                 control = nls.control(maxiter = max_iter))
      },
    error = function(cond) {
      nls_LM_no_bound <<- NA
    })
  tryCatch(
    expr =
      {nls_PORT <- nls(baranyi,
                        data = gr_curve,
                        start = list(lag=init_lag, mumax=init_mumax, LOG10N0 = LOG10N0, LOG10Nmax = init_LOG10Nmax),
                        algorithm = "port",
                        control = nls.control(maxiter = max_iter),
                        lower = lower_bound)
      },
    error = function(cond) {
      nls_PORT <<- NA
    })

  nls_a <- compare_algorithms(nls_LM_no_bound, nls_PORT, nlsres_LM)
  # consider the model without lower bounds only if the resulted estimates are above 0
  return(nls_a)

}



#' choose_lag_fit_algorithm_logistic
#'
#' Runs nlsLM/nls algorithms with three different parameter setups to fit the best Logistic model parameters to our data and chooses the best model
#' @param gr_curve data from one specific growth curve with the following columns: LOG10N, t
#' @param n0 the initial biomass
#' @param init_gr_rate initial value for the growth rate
#' @param init_K initial value for the saturation parameter K
#' @param init_lag initial value for the lag parameter
#' @param max_iter max. number of iterations; defaults to 100
#' @param lower_bound lower bound for the bounded nls optimization; defaults to 0
#' @returns the best nls fitting object with parameters fitted to logistic model (lowest Res.Sum Sq provided that all coefficients are nonnegative)
choose_lag_fit_algorithm_logistic <- function(gr_curve, n0,
                                                      init_gr_rate = init_gr_rate,
                                                      init_K = init_K,
                                                      init_lag = init_lag,
                                                      max_iter = 100,
                                                      lower_bound = c(0,0,0)) {
  # choose best from LM and port
  # Sometimes the lower.bound argument makes the fit much worse.
  tryCatch(
    expr =
      {nlsres_LM <- nlsLM(formula = biomass ~ n0 + (time >= lag)*n0*(-1+K*exp(gr_rate*(time-lag))/(K - n0 + n0*exp(gr_rate*(time - lag)))),
                        data = gr_curve,
                        start = list(gr_rate = init_gr_rate, K=init_K, lag = init_lag),
                        control = nls.control(maxiter = max_iter),
                        lower = lower_bound)
      },
    error = function(cond) {
      nlsres_LM <<- NA
    })
  tryCatch(
    expr =
      {nls_LM_no_bound <- nlsLM(formula = biomass ~ n0 + (time >= lag)*n0*(-1+K*exp(gr_rate*(time-lag))/(K - n0 + n0*exp(gr_rate*(time - lag)))),
                                 data = gr_curve,
                                 start = list(gr_rate = init_gr_rate, K=init_K, lag = init_lag),
                                 control = nls.control(maxiter = max_iter))
      },
    error = function(cond) {
      nls_LM_no_bound <<- NA
    })
  tryCatch(
    expr =
      {nls_PORT <- nls(biomass ~ n0 + (time >= lag)*n0*(-1+K*exp(gr_rate*(time-lag))/(K - n0 + n0*exp(gr_rate*(time - lag)))), gr_curve,
                        list(gr_rate = init_gr_rate, K=init_K, lag = init_lag),
                        algorithm = "port",
                        control = nls.control(maxiter = max_iter),
                        lower = lower_bound)},
    error = function(cond) {
      nls_PORT <<- NA
    })

  nls_a <- compare_algorithms(nls_LM_no_bound, nls_PORT, nlsres_LM)
  # consider the model without lower bounds only if the resulted estimates are above 0
  return(nls_a)

}



#' calc_lag_fit_to_logistic_with_lag
#'
#' Runs nlsLM/nls algorithm of the user's choice to fit the  Logistic model parameters to our data
#' @param gr_curve data from one specific growth curve with these two columns: time and biomass
#' @param n0 the initial biomass
#' @param init_gr_rate initial value for the growth rate
#' @param init_K initial value for the saturation parameter K
#' @param init_lag initial value for the lag parameter
#' @param algorithm defaults to "auto" which chooses between bounded and unbounded Levenberg-Marquardt method and the bounded port method
#' @param max_iter max. number of iterations; defaults to 100
#' @param lower_bound lower bound for the bounded nls optimization; defaults to 0
#' @returns lag and the nls fitting object with parameters fitted to logistic model
calc_lag_fit_to_logistic_with_lag <- function(gr_curve, n0,
                                                      init_gr_rate = init_gr_rate,
                                                      init_K = init_K,
                                                      init_lag = init_lag,
                                                      algorithm = "auto",# Levenberg-Marquardt", # or "port" for nls
                                                      max_iter = 100,
                                                      lower_bound = c(0,0,0)) {
  tryCatch(
    expr =
      {if (algorithm == "auto") {
        nls_a <- choose_lag_fit_algorithm_logistic(gr_curve, n0,
                                                            init_gr_rate = init_gr_rate, init_K = init_K, init_lag = init_lag,
                                                            max_iter = max_iter,
                                                            lower_bound = lower_bound)

        # nlsLM( is a more robust version of nls, using  Levenberg-Marquardt algorithm
      } else if (algorithm == "Levenberg-Marquardt") {
        nls_a <- nlsLM(formula = biomass ~ n0 + (time >= lag) * n0 * (-1 + K * exp(gr_rate * (time - lag)) / (K - n0 + n0 * exp(gr_rate * (time - lag)))),
                       data = gr_curve,
                       start = list(gr_rate = init_gr_rate, K = init_K, lag = init_lag),
                       control = nls.control(maxiter = max_iter))
        #lower= lower.bound)
      } else if (algorithm == "port") {
        nls_a <- nls(biomass ~ n0 + (time >= lag) * n0 * (-1 + K * exp(gr_rate * (time - lag)) / (K - n0 + n0 * exp(gr_rate * (time - lag)))), gr_curve,
                     list(gr_rate = init_gr_rate, K = init_K, lag = init_lag),
                     algorithm = "port",
                     control = nls.control(maxiter = max_iter))
        #lower= lower.bound)
      } else {
        nls_a <- nls(biomass ~ n0 + (time >= lag) * n0 * (-1 + K * exp(gr_rate * (time - lag)) / (K - n0 + n0 * exp(gr_rate * (time - lag)))), gr_curve,
                     list(gr_rate = init_gr_rate, K = init_K, lag = init_lag),
                     algorithm = algorithm,
                     control = nls.control(maxiter = max_iter))
      }
        lag_N <- coef(nls_a)[names(coef(nls_a)) == "lag"] %>% unname()
      },
    error = function(cond) {
      lag_N <<- NA
      nls_a <<- NA
      #print(cond)
    })
  return(list(lag_N = lag_N, nls_a = nls_a))
}



#' calc_lag_fit_to_baranyi_with_lag
#'
#' Runs nlsLM/nls algorithms with three different parameter setups to fit the best Logistic model parameters to our data and chooses the best model
#' @param gr_curve data from one specific growth curve with these two columns: time and biomass
#' @param LOG10N0 the decimal logarithm of initial biomass
#' @param init_lag initial value for the lag parameter
#' @param init_mumax initial value for the mumax parameter
#' @param init_LOG10Nmax initial value for the LOG10Nmax parameter
#' @param algorithm defaults to "auto" which chooses between bounded and unbounded Levenberg-Marquardt method and the bounded port method
#' @param max_iter max. number of itertaions; defaults to 100
#' @param lower_bound lower.bound for the bounded nls optimisation; defaults to 0
#' @returns lag and the nls fitting object with parameters fitted to logistic model
calc_lag_fit_to_baranyi_with_lag <- function(gr_curve, LOG10N0 = NULL, init_lag = NULL, init_mumax = NULL, init_LOG10Nmax = NULL, algorithm = "auto", max_iter = 100, lower_bound = c(0,0,0, 0)) {
  tryCatch(
    expr =
      {if (algorithm == "auto") {
        nls_a <- choose_lag_fit_algorithm_baranyi(gr_curve,
                                                           LOG10N0 = LOG10N0,
                                                           init_lag = init_lag,
                                                           init_mumax = init_mumax,
                                                           init_LOG10Nmax = init_LOG10Nmax,
                                                           max_iter = max_iter,
                                                           lower_bound = lower_bound)

        # nlsLM( is a more robust version of nls, using  Levenberg-Marquardt algorithm
      } else if (algorithm == "Levenberg-Marquardt") {
        nls_a <- nlsLM(baranyi, gr_curve,
                       list(lag=init_lag, mumax=init_mumax, LOG10N0 = LOG10N0, LOG10Nmax = init_LOG10Nmax),
                       control = nls.control(maxiter = max_iter))
        #lower= lower.bound)
      } else if (algorithm == "port") {
        nls_a <- nls(baranyi, gr_curve,
                     list(lag=init_lag, mumax=init_mumax, LOG10N0 = LOG10N0, LOG10Nmax = init_LOG10Nmax),
                     algorithm = "port",
                     control = nls.control(maxiter = max_iter))
        #lower= lower.bound)
      } else {
        nls_a <- nls(baranyi, gr_curve,
                     list(lag=init_lag, mumax=init_mumax, LOG10N0 = LOG10N0, LOG10Nmax = init_LOG10Nmax),
                     algorithm = algorithm,
                     control = nls.control(maxiter = max_iter))
      }
        lag_N <- coef(nls_a)[names(coef(nls_a)) == "lag"] %>% unname()
      },
    error = function(cond) {
      lag_N <<- NA
      nls_a <<- NA
    })
  return(list(lag_N = lag_N, nls_a = nls_a))
}



#' get_init_pars_logistic
#'
#' Finds reasonable approximation for logistic growth curve parameters (K, lag. growth rate) based on the growth curve and some initial values
#' These approximations will be used as the initial values for the proper optimization algorithm run later.
#' @param data_this_curve data from one specific growth curve with these two columns: time and biomass
#' @param this_n0 the initial biomass
#' @param init_K initial value for the saturation parameter K
#' @param init_lag initial value for the lag parameter
#' @param init_gr_rate initial value for the growth rate
#' @param min_b defaults to 0.2; mina and minb define where to look for exponential phase: it will be where the biomass is between min + (max-min)*(min_a TO min_b)
#' @param min_a defaults to 0.8
#' @returns list of parameters: init_K, init_lag, init_gr_rate,
get_init_pars_logistic = function(data_this_curve, this_n0, init_K, init_lag, init_gr_rate, min_b = 0.2, min_a = 0.8) {
  if (is.null(init_K)) {
    max_this_data <- max(data_this_curve$biomass, na.rm = TRUE)
    init_K <- max_this_data %>% as.numeric()
  }
  if (is.null(init_lag)) {
    init_lag <- calc_lag(data_this_curve, method = "tangent",pars = get_def_pars()) %>% pull(lag) %>% unique() %>% as.numeric()
  }

  if (is.null(init_gr_rate)) {
    data_this_curve_exp <- data_this_curve %>%
      mutate(
        max_biomass = max(data_this_curve$biomass),
        min_threshold = this_n0 + min_b*(max_biomass - this_n0),
        max_threshold = this_n0 + min_a*(max_biomass - this_n0)) %>%
      # take only the points that are in between min and max
      filter(biomass <= max_threshold & biomass >= min_threshold)
    data_this_curve_exp$logdata = log(data_this_curve_exp$biomass/this_n0)
    if (nrow(data_this_curve_exp %>% filter(!is.na(time) & !is.na(logdata)) > 0)) {
      mod = lm(logdata ~ time, data = data_this_curve_exp)
      # this growth rate is assuming an exponential model so it will be generally underestimated
      init_gr_rate = mod$coefficients[2] %>% unname()
      # we have real r = r(1-N/K) so let us take
      n_mid = median(data_this_curve_exp$biomass)
      init_gr_rate = init_gr_rate/(1-n_mid/init_K)
    } else {
      init_gr_rate = 0.1
    }
    #data_this_curve_exp$predicted = predict(mod, data_this_curve_exp)
  }
  return(list(init_K = init_K, init_lag = init_lag, init_gr_rate = init_gr_rate))
}



#' calc_lagistic_fit_lag
#'
#' Calculates lag based on fitting logistic model to data
#' @param data a data frame with two required columns names: "time" and "biomass",and one optional column: "curve_id"
#' This is data from may come from multiple growth curves
#' @param n0 a data frame describing initial biomass for each of the curves, i.e. it has two obligatory columns: "curve_id", "N0"
#' @param init_gr_rate initial value for the growth rate, defaults to NULL in which case it will be approximated based on the data
#' @param init_K initial value for the saturation parameter K, defaults to NULL in which case it will be approximated based on the data
#' @param init_lag initial value for the lag parameter, defaults to NULL in which case it will be approximated based on the data
#' @param algorithm eg. "auto", "Levenberg-Marquardt", "port"
#' @param max_iter Maximum number of iterations
#' @param return_all_params defaults to FALSE, TRUE if you also want to get K and growth.rate apart from lag
#' @param min_b defaults to 0.2; mina and minb define where to look for exponential phase: it will be where the biomass is between min + (max-min)*(lower.bound.for.gr TO upper.bound.for.gr)
#' @param min_a defaults to 0.8
#' @returns growth curve data with additional columns  ('lag', and predicted biomass 'predicted'), and the fitting object if return.all.params was set to TRUE
calc_lagistic_fit_lag <- function(data, n0, init_gr_rate = NULL, init_K = NULL, init_lag = NULL, algorithm, max_iter, return_all_params = FALSE,
                                      min_b = 0.2, min_a = 0.8) {
  if (!("curve_id" %in% names(data))) {
    data$curve_id = NA
  }

  i <- 0
  fiting_list <- list()
  data_new <- data %>% filter(FALSE) %>% mutate( time = numeric(0), biomass = numeric(0),curve_id = character(0), lag = numeric(0), log_info = character(0))
  for (this_curve_id in unique(data$curve_id)) {
    i <- i + 1
    data_this_curve <- data %>% filter(curve_id == this_curve_id) %>% select(time, biomass, curve_id)
    this_n0 <- n0 %>% filter(curve_id == this_curve_id) %>% pull(n0)
    init_pars <- get_init_pars_logistic(data_this_curve, this_n0, init_K, init_lag, init_gr_rate)
    this_fit_obj <- calc_lag_fit_to_logistic_with_lag(gr_curve = data_this_curve,
                                                                     n0 = this_n0,
                                                                     init_gr_rate = init_pars$init_gr_rate,
                                                                     init_K = init_pars$init_K,
                                                                     init_lag = init_pars$init_lag,
                                                                     algorithm = algorithm,
                                                                     max_iter = max_iter)
    data_this_curve <- data_this_curve %>%
      mutate(lag = round(this_fit_obj$lag_N, 1))
    data_this_curve$predicted = if (!any(is.na(this_fit_obj$nls_a))) {predict(this_fit_obj$nls_a, data_this_curve) } else {data_this_curve$predicted = NA}
    data_new <- rbind(data_new, data_this_curve)

    fiting_list[[i]] <- this_fit_obj$nls_a
  }

  if (!return_all_params) {
    return(data_new)
  } else {
    return(list(data_new = data_new, mod_fit = fiting_list))
  }
}



#' get_init_pars_baranyi
#'
#' Finds reasonable approximation for baranyi growth curve parameters (init_mumax, lag) based on the growth curve and some initial values
#' These approximations will be used as the initial values for the proper optimization algorithm run later.
#' @param data_this_curve data from one specific growth curve with these two columns: time and biomass
#' @param this_n0 the initial biomass
#' @param init_lag initial value for the lag parameter
#' @param init_gr_rate initial value for the growth rate
#' @param min_b defaults to 0.2; mina and minb define where to look for exponential phase: it will be where the biomass is between min + (max-min)*(mina TO minb)
#' @param min_a defaults to 0.8
#' @returns list of parameters: init_mumax, init_lag
get_init_pars_baranyi <- function(data_this_curve, this_n0, init_lag, init_gr_rate, min_b = 0.2, min_a = 0.8) {
  if (is.null(init_lag)) {
    init_lag <- calc_lag(data_this_curve, method = "tangent", pars = get_def_pars()) %>% pull(lag) %>% unique() %>% as.numeric()
  }

  if (is.null(init_gr_rate)) {
    data_this_curve_exp <- data_this_curve %>%
      mutate(
        max_biomass = max(biomass),
        min_threshold = this_n0 + min_b * (max_biomass - this_n0),
        max_threshold = this_n0 + min_a * (max_biomass - this_n0)) %>%
      # take only the points that are in between min and max
      filter(biomass <= max_threshold & biomass >= min_threshold)
    data_this_curve_exp$logdata <- log(data_this_curve_exp$biomass/this_n0)
    if (nrow(data_this_curve_exp %>% filter(!is.na(time) & !is.na(logdata))  > 0)) {
      mod <- lm(logdata ~ time, data = data_this_curve_exp)
      # this growth rate is assuming an exponential model so it will be generally underestimated
      init_gr_rate <- mod$coefficients[2] %>% unname()
      # we have real r = r(1-N/K) so let us take
      n_mid <- median(data_this_curve_exp$biomass)
      init_mumax <- init_gr_rate
    } else {
      init_mumax <- 0.1
    }
  } else {
    init_mumax <- init_gr_rate
  }
  return(list(init_mumax = init_mumax, init_lag = init_lag))
}



#' calc_baranyi_fit_lag
#'
#' Calculates lag based on fitting baranyi model to data
#' @param data a data frame with two required columns names: "time" and "biomass",and one optional column: "curve_id"
#' This is data from may come from multiple growth curves
#' @param n0 a data frame describing initial biomass for each of the curves, i.e. it has two obligatory columns: "curve_id", "N0"
#' @param init_lag initial value for the lag parameter, defaults to NULL in which case it will be approximated based on the data
#' @param init_gr_rate initial value for the growth rate, defaults to NULL in which case it will be approximated based on the data
#' @param algorithm eg. "auto", "Levenberg-Marquardt", "port", defaults to "auto"
#' @param max_iter Maximum number of iterations, defaults to 100
#' @returns growth curve data with additional columns  ('lag', and predicted biomass 'predicted')
calc_baranyi_fit_lag <- function(data, n0, init_lag = NULL, init_gr_rate = NULL, algorithm = "auto", max_iter = 100) {
  data_new <- data %>% filter(FALSE) %>% mutate(lag = numeric(0), predicted = numeric(0))
  for (this_curve_id in unique(data$curve_id)) {
    data_this_curve <- data %>% filter(curve_id == this_curve_id)
    this_n0 <- n0 %>% filter(curve_id == this_curve_id) %>% pull(n0)

    data_this_curve_for_model <- data_this_curve %>%
      mutate(LOG10N = log10(biomass), t = time) %>%
      select(LOG10N, t)
    init_LOG10N0 <- log10(n0 %>% filter(curve_id == this_curve_id) %>% pull(n0))
    init_LOG10Nmax <- max(data_this_curve_for_model$LOG10N)
    init_pars_baranyi <- get_init_pars_baranyi(data_this_curve, this_n0, init_lag, init_gr_rate)

    fit_obj_this_curve <- calc_lag_fit_to_baranyi_with_lag(gr_curve = data_this_curve_for_model,
                                                                          LOG10N0 = init_LOG10N0,
                                                                          init_lag = init_pars_baranyi$init_lag,
                                                                          init_mumax = init_pars_baranyi$init_mumax,
                                                                          init_LOG10Nmax = init_LOG10Nmax,
                                                                          algorithm = algorithm,
                                                                          max_iter = max_iter,
                                                                          lower_bound = c(0,0,0, 0))
    data_this_curve <- data_this_curve %>%
      mutate(lag = round(fit_obj_this_curve$lag_N,1))
    data_this_curve$predicted <- if (!any(is.na(fit_obj_this_curve$nls_a))) {10^(predict(fit_obj_this_curve$nls_a, data_this_curve)) } else {data_this_curve$predicted = NA}
    data_new = rbind(data_new, data_this_curve)
  }
  return(data_new)
}



############################################# exported functions #########################################
#' make_grwoth_curve_df
#'
#' Create a growth curve data frame that can be later passed to the lag clculation functions
#' @param time numeric vector of times when biomass was measured  (chronologically ordered as in growth curve)
#' @param biomass numeric vector of measured biomass values  (chronologically ordered as in growth curve)
#' @param curve_id character vector of growth curve identifiers (i.e. if there are multiple measurements done at the same time point, they should have different curve_id)
#' @returns a data frame representing growth curve data
#' @export
make_grwoth_curve_df <- function(time, biomass, curve_id = NULL) {

  if(!(is.numeric(time) | is.integer(time))) stop("Time must be a numeric vector")
  if(!(is.numeric(biomass) | is.integer(biomass))) stop("Biomass must be a numeric vector")
  length_time <- length(time)
  length_biomass <- length(biomass)

  if(length_time != length_biomass)
    stop( "time and biomass vectors must be of the same length")

  if(!is.null(curve_id) & length(curve_id) != length_biomass)
    stop("time and curve_id vectors must be of the same length")

  if (is.null(curve_id)) {
    df = data.frame(time = time, biomass = biomass)
  } else {
    df = data.frame(time = time, biomass = biomass, curve_id = curve_id)
  }
  return(df)
}



#' plot_data
#'
#' Plots the provided growth curve (one single growth curve) on logarithmic scale
#' @param data_new a data frame with two required columns names: "time" and "biomass"
#' @param log10_transform if to plot y axis (biomass) on log10 scale
#' @returns ggplot object with a growth curve
#' @export
plot_data <- function(data_new, log10_transform = TRUE) {
  if (log10_transform) {
  data_new <- data_new %>%
    mutate(biomass = log10(biomass))
  }
  g <- ggplot(data_new, aes(x = time, y = biomass))  +
    geom_line(col = "blue") +
    geom_point(col = "blue") +
    xlab("time") +
    theme(axis.text.y.right = element_text(colour="black"),
          axis.text.y = element_text(colour="blue"),
          axis.title.y = element_text(colour="blue"),
          axis.title.y.right = element_text(colour="black"))

  if (log10_transform) {
    g = g + ylab("Log10(biomass)")
  } else {
    g = g +ylab("Biomass")
  }
  return(g)

}



#' fit_exp_lag
#'
#' Fits the lag to multiple growth curves based on the basic tangent method
#' @param data a data frame with two required columns names: "time" and "biomass",and one optional column: "curve_id"
#' This is data from may come from multiple growth curves
#' @param tangent_method "local.regression" (if the tangent is fitted to a number of points around the maximal growth rate)
#' or "to.point" (if the tangent is fitted only to the point where the growth rate is maximal); defaults to "to.point"
#' @param n0 the initial biomass (a tangent line crossing N0 line will determine the lag)
#' @param curve_points if tangent_method = "local.regression" then curve_points is the number of points the line is fitted to;
#' defaults to 3 i.e. the point with the maximal uptake rate one point before and one point aftter
#' @returns growth curve data (as input) together with additional columns: lag, line.intercept and line.slope
#' @export
fit_exp_lag <- function(data, tangent_method, n0, curve_points = 3) {
  if (!("curve_id" %in% names(data))) {
    data$curve_id = NA
  }
  data_new <- data %>% filter(FALSE) %>% mutate(lag = numeric(0),line_intercept = numeric(0), line_slope = numeric(0))
  for (this_curve_id in unique(data$curve_id)) {
    data_this_curve <- data %>% filter(curve_id == this_curve_id)
    this_n0 <- n0 %>% filter(curve_id == this_curve_id) %>% pull(n0)
    lag_obj <- fit_exp_lag_to_curve(data_this_curve, this_n0, tangent_method, curve_points)
    data_this_curve <- data_this_curve %>%
      mutate(lag = round(lag_obj$lag,1)) %>%
      mutate(line_intercept = lag_obj$line_intercept,
             line_slope = lag_obj$line_slope) %>%
      left_join(lag_obj$tangent_points)
    data_new <- rbind(data_new, data_this_curve)
  }

  data_new$predicted <- NA
  # data.new has columns with lag, intercept, slope, log.N0
  return(data_new)
}



#' lag_biomass_incr
#'
#' Fits the lag to multiple growth curves based on the biomass increase method
# It Find where biomass increases by a predefined threshold from N0
#' @param data a data frame with two required columns names: "time" and "biomass",and one optional column: "curve_id"
#' This is data from may come from multiple growth curves
#' @param threshold A value of the biomass increase that we can surely associate with the end of the lag phase rather than random variation during the lag
#' @param n0 the initial biomass (lag will be defined as the time point where the difference between biomass and N0 reaches a predefined threshold)
#' @returns growth curve data (as input) together with additional columns: N0, increase.from.N0, lag
#' @export
lag_biomass_incr <- function(data, threshold, n0) {
  data_new <- data %>% filter(FALSE) %>% mutate(lag = numeric(0))
  for (this_curve_id in unique(data$curve_id)) {
    data_this_curve <- data %>%
      filter(curve_id == this_curve_id) %>%
      left_join(n0, by = "curve_id") %>%
      mutate(incr_from_n0 = biomass - n0)


    #find where second derivative is maximal
    threshold_diff <- which(data_this_curve$incr_from_n0 >= threshold)
    first_threshold_diff <- threshold_diff[1]
    lag_this_curve <- data_this_curve$time[first_threshold_diff]
    data_this_curve <- data_this_curve %>%
      mutate(
        lag = round(lag_this_curve,1))
    data_new <- rbind(data_new, data_this_curve)
  }
  return(data_new)
}



#' fit_max_infl_lag
#'
#' Fits the lag to multiple growth curves based on the max growth acceleration method
#' It finds where the second derivative is the largest
#' @param data a data frame with two required columns names: "time" and "biomass",and one optional column: "curve_id"
#' This is data from may come from multiple growth curves
#' @returns growth curve data (as input) together with additional columns: lag, log.biomass, time.diff, time.av, second.deriv.b, biomass.increase
#' @export
fit_max_infl_lag <- function(data) {
  #deriv <- function(t, x) diff(x) / diff(t)
  #middle_pts <- function(x) x[-1] - diff(x) / 2
  #second_deriv <- function(t,x) deriv(middle_pts(t), deriv(t, x))
  data_new <- data %>% filter(FALSE) %>% mutate(log_biomass = numeric(0), diff = numeric(0), lag = numeric(0))
  for (this_curve_id in unique(data$curve_id)) {
    data_this_curve <- data %>%
      filter(curve_id == this_curve_id) %>%
      arrange(time) %>%
      as.data.frame() %>%
      mutate(log_biomass = log(biomass),
             time_diff = mean(c(diff(time), diff(dplyr::lag(time))), na.rm = TRUE),
             time_av = (time + dplyr::lag(time))/2) %>%
      mutate(
        #second.deriv.b = c(NA,second_deriv(time, log.biomass), NA),
        # central scheme
        second_deriv_b = (dplyr::lead(log_biomass) + dplyr::lag(log_biomass) - 2 * log_biomass)/time_diff^2,
        # we only look at second derivative if we know the first derivative is positive!
        # In empirical data we sometimes see a decrease in biomass but we don;t want to look at the second derivative there!
        biomass_incr = dplyr::lead(log_biomass)  > dplyr::lag(log_biomass)
      )

    #find where second derivative is maximal
    max_second_deriv_b <- max(data_this_curve$second_deriv_b, na.rm=TRUE)
    # take first point when max derivative if there are multiple
    ind_max_second_deriv_b <- which(data_this_curve$second_deriv_b == max_second_deriv_b)[1]
    lag_this_curve <- data_this_curve$time[ind_max_second_deriv_b]
    data_this_curve <- data_this_curve %>%
      mutate(
        lag = round(lag_this_curve, 1))
    data_new <- rbind(data_new, data_this_curve)
  }
  return(data_new)
}



#' plot_lag_fit
#'
#' Plots the provided growth curve (one single growth curve) together with the calculated lag and and the rationale for lag calculation
#' @param data_new a data frame output by Calculate.Lag function: it needs to have the following columns: "time", "biomass", "tangent.point", "predicted.data", "threshold", "N0", "second.deriv.b", "line.intercept", "line.slope"
#' @param log10_transform if to plot y axis (biomass) on log10 scale
#' @param print_lag_info if set to "TRUE" prints the lag length on the graph
#' @returns ggplot object with a growth curve
#' @export
plot_lag_fit <- function(data_new, print_lag_info = TRUE, log10_transform = TRUE) {
  max_narm <- function(x) ifelse( !all(is.na(x)), max(x, na.rm=T), NA)
  min_narm <- function(x) ifelse( !all(is.na(x)), min(x, na.rm=T), NA)
  lag.method = unique(data_new$lag_calculation_method)

  data_new <- data_new %>%
    group_by(curve_id) %>%
    mutate(x_mid = mean(time),
           lag_info = paste0("Lag = ", round(lag, 3), "."),
           max_second_deriv_b = max_narm(second_deriv_b),
           min_second_deriv_b = min_narm(second_deriv_b)) %>%
    ungroup()

  if (log10_transform) {
    data_new <- data_new %>%
      mutate(
        tangent_point = log10(tangent_point),
        biomass = log10(biomass),
        predicted_data = log10(predicted_data),
        threshold = log10(threshold),
         n0 = log10(exp(log(n0))),
        line_intercept = line_intercept/log(10),
        line_slope = line_slope/log(10))
  }

  data_new <- data_new %>%
    mutate(
        y_max_for_curve = max(biomass),
        y_min_for_curve = min(biomass),
        text_y = 1.005*y_max_for_curve,
        second_deriv_b_scaled = (second_deriv_b - min_second_deriv_b)/(max_second_deriv_b - min_second_deriv_b)*(y_max_for_curve - y_min_for_curve) + y_min_for_curve
           #y.limit = 1.1*y.max.for.curve
    ) %>%
    ungroup() %>%
    mutate(min_N0 = min(biomass),
           max_N0 = max(biomass))


  max_time <- max(data_new$time)
  size_n0_line <- 1
  size_lag_line <- 1
  g <- ggplot(data_new, aes(x = time, y = biomass))  +
    geom_vline(aes(xintercept = lag), linewidth = size_lag_line, col = "red", linetype = "dashed") +
    geom_line(col = "blue") +
    xlab("time") +
    xlim(c(0, max_time)) +
    facet_grid(curve_id ~ lag_calculation_method, scales = "free_y") +
    theme_bw() +
    theme(axis.text.y.right = element_text(colour = "black"),
          axis.text.y = element_text(colour = "black"),
          axis.title.y = element_text(colour = "black"),
          axis.title.y.right = element_text(colour = "black"))

  if (print_lag_info) {
    g <- g +  geom_text(aes(x = x_mid, y = text_y, label = lag_info), size = 6, col = "red")
  }
  if (log10_transform) {
    g = g + ylab("log10(biomass)")
  } else {
    g = g + ylab("Biomass")
  }



  if (length(lag.method) == 1 & lag.method == "biomass increase") {
    g = g +
      geom_line(aes(y = threshold), col = "darkgreen", na.rm = TRUE)
  }
  if (length(lag.method) == 1 & lag.method == "tangent") {
    g = g +
      geom_hline(aes(yintercept = n0), linewidth = size_n0_line, col = "black", na.rm = TRUE) +
      geom_point(aes(y = tangent_point), col = "darkgreen", size = 2, na.rm = TRUE)
    # on non-log scale it doesn't make sense to plot the tangent line because it is tangent to the log of biomass
    if (log10_transform) {
      g = g +
        geom_abline(aes(intercept = line_intercept, slope = line_slope), col = "darkgreen", na.rm = TRUE)
    }
  }
  if (length(lag.method) == 1 & lag.method == "parameter fitting to a model" & any(!is.na(data_new$lag))) {
    g = g +
      geom_line(aes(y = predicted_data), col = "darkgreen", na.rm = TRUE)
  }
  if (length(lag.method) == 1 & lag.method == "max growth acceleration") {
    g = g +
      geom_line(aes(y = second_deriv_b_scaled), col = "darkgreen", alpha = 0.5, na.rm = TRUE)
  }

  return(g)
}



#' get_lag
#'
#' The most basic function that calculates lags based on growth curve data, selected method and parameters.
#' It uses calc_lag function and strips the results to only get lag parameter for each growth curve id.
#' @param data a data frame with two required columns names: "time" and "biomass",and one optional column: "curve_id"
#' This is data from may come from multiple growth curves
#' @param method method of lag calculation, choose one of the follwoing: "exponential", "biomass increase", "max growth acceleration", "parameter fitting to a model"
#' @param pars a list of parameters. Get.default.parameters function can be used to get the default ones. Otherwise create your onwn list with the following names:
#' - model: if method = "parameter fitting to a model" , one of the following models needs to be chosen: "logistic", "baranyi"
#' - n0_method: first.observation" if the first point is taken as the initial biomass or
#' "minimal.observation" if the minimal biomass is taken is the initial point.
#' In "healthy" growth curves these options should be equivalent
#' but sometimes a drop in OD/biomass is observed at the beginning of a growth curve.
#' In this case it is not obvious what to assume the initial biomass is.
#' - tangent_method "local.regression" (if the tangent is fitted to a number of points around the maximal growth rate)
#' or "to.point" (if the tangent is fitted only to the point where the growth rate is maximal); defaults to "to.point"
#' - threshold: A value of the biomass increase that we can surely associate with the end of the lag phase rather than random variation durinh the lag. Defaults to 10^2
#' - curve_points: if tangent.method = "local.regression" then curve_points is the number of points the line is fitted to;
#' defaults to 3 i.e. the point with the maximal uptake rate one point before and one point after
#' - init_gr_rate: if logistic model is fitted. Defaults to  NULL in which case the initial value will be based on the data
#' - init_lag: if a logistic model is fitted, Defaults to NULL in which case the initial value will be based on the data
#' - algorithm: if method = "parameter fitting to a model", nls algorithm to run the model fit; defaults to "auto" which will choose the best between bounded and unbounded "Levenberg-Marquardt" and bounded "port"
#' - max_iter =  if method = "parameter fitting to a model", the maximum number of nls iterations, defaults to  100
#' @returns lag per each curve_id
#' @export
get_lag <- function(data, method, pars) {
  data_extended = calc_lag(data, method, pars)
  lags_data = data_extended %>%
    group_by(curve_id) %>%
    summarise(lag = unique(lag)) %>%
    ungroup()
  return(lags_data)
}




#' calc_lag
#'
#' The main function that calculates lags based on growth curve data, selected method and parameters and returns an extended growth rate data frame (extended by multiple columns with parameters related to lag calculation)
#' @param data a data frame with two required columns names: "time" and "biomass",and one optional column: "curve_id"
#' This is data from may come from multiple growth curves
#' @param method method of lag calculation, choose one of the follwoing: "exponential", "biomass increase", "max growth acceleration", "parameter fitting to a model"
#' @param pars a list of parameters. Get.default.parameters function can be used to get the default ones. Otherwise create your onwn list with the following names:
#' - model: if method = "parameter fitting to a model" , one of the following models needs to be chosen: "logistic", "baranyi"
#' - n0_method: first.observation" if the first point is taken as the initial biomass or
#' "minimal.observation" if the minimal biomass is taken is the initial point.
#' In "healthy" growth curves these options should be equivalent
#' but sometimes a drop in OD/biomass is observed at the beginning of a growth curve.
#' In this case it is not obvious what to assume the initial biomass is.
#' - tangent_method "local.regression" (if the tangent is fitted to a number of points around the maximal growth rate)
#' or "to.point" (if the tangent is fitted only to the point where the growth rate is maximal); defaults to "to.point"
#' - threshold: A value of the biomass increase that we can surely associate with the end of the lag phase rather than random variation durinh the lag. Defaults to 10^2
#' - curve_points: if tangent.method = "local.regression" then curve_points is the number of points the line is fitted to;
#' defaults to 3 i.e. the point with the maximal uptake rate one point before and one point after
#' - init_gr_rate: if logistic model is fitted. Defaults to  NULL in which case the initial value will be based on the data
#' - init_lag: if a logistic model is fitted, Defaults to NULL in which case the initial value will be based on the data
#' - algorithm: if method = "parameter fitting to a model", nls algorithm to run the model fit; defaults to "auto" which will choose the best between bounded and unbounded "Levenberg-Marquardt" and bounded "port"
#' - max_iter =  if method = "parameter fitting to a model", the maximum number of nls iterations, defaults to  100
#' @returns growth curve data (time, biomass, curve_id) with the following additional columns:  log_biomass, lag, line_slope, line_intercept, lag_calc_method, predicted_data, diff, second_deriv_b, tangent_point, threshold
#' @export
calc_lag <- function(data, method, pars) {
  if (!("curve_id" %in% names(data))) {
    data$curve_id = "growth.curve"
  }

  n0 <- data %>%
    group_by(curve_id) %>%
    arrange(time) %>%
    summarise(n0 = get_n0(biomass, pars$n0_method)) %>%
    ungroup() %>%
    mutate(log_n0 = log(n0))


  if (method == "tangent") {
    sel_tangent_method <- pars$tangent_method
    data_new <- fit_exp_lag(data,
                            n0 = n0,
                            tangent_method = sel_tangent_method,
                            curve_points = pars$curve_points)
    data_new <- data_new %>%
      select(time, biomass, curve_id, lag, line_slope, line_intercept, tangent_point) %>%
      mutate(lag_calculation_method = "tangent",
             log_biomass = log(biomass),
             predicted_data = NA,
             diff = NA,
             second_deriv_b = NA,
             threshold = NA) %>%
      select(time, biomass, log_biomass, curve_id, lag, line_slope, line_intercept, lag_calculation_method, predicted_data, diff, second_deriv_b, tangent_point, threshold)


  } else if (method == "biomass increase") {
    sel_threshold <- pars$threshold
    data_new <- lag_biomass_incr(data,
                                 threshold = sel_threshold,
                                 n0 = n0)
    data_new <- data_new %>%
      select(time, biomass, curve_id, lag) %>%
      mutate(lag_calculation_method = "biomass increase",
             log_biomass = log(biomass),
             predicted_data = NA,
             second_deriv_b = NA,
             line_intercept = NA,
             line_slope = NA,
             tangent_point = NA,
             diff = NA) %>%
      left_join(n0, by = "curve_id") %>%
      mutate(threshold = n0 + sel_threshold) %>%
      select(time, biomass, log_biomass, curve_id, lag, line_slope, line_intercept, lag_calculation_method, predicted_data, diff, second_deriv_b, tangent_point, threshold)

  } else if (method == "max growth acceleration") {
    data_new <-  fit_max_infl_lag(data)
    data_new <- data_new %>%
      select(time, biomass, log_biomass, curve_id, lag, second_deriv_b) %>%
      mutate(lag_calculation_method = "max growth acceleration",
             line_intercept = NA,
             line_slope = NA,
             predicted_data = NA,
             diff = NA,
             tangent_point = NA,
             threshold= NA) %>%
      select(time, biomass, log_biomass, curve_id, lag, line_slope, line_intercept, lag_calculation_method, predicted_data, diff, second_deriv_b, tangent_point, threshold)

  } else if (method == "parameter fitting to a model") {
    sel_model <- pars$model
    if (sel_model == "logistic") {
      data_new <- calc_lagistic_fit_lag(data, n0,
                                        init_gr_rate = pars$init_gr_rate,
                                        init_K = pars$init_K,
                                        init_lag = pars$init_lag,
                                        algorithm = pars$algorithm,
                                        max_iter = pars$max_iter)

    } else if (sel_model == "baranyi") {
      data_new <- calc_baranyi_fit_lag(data,
                                       n0,
                                       init_lag = pars$init_lag,
                                       init_gr_rate = pars$init_gr_rate,
                                       algorithm = pars$algorithm,
                                       max_iter = pars$max_iter) %>%
        mutate(lag_calculation_method = "Fitting lagged baranyi")

    } else {
      stop("model not implemented")
    }
    data_new <- data_new %>%
      select(time, biomass, curve_id, lag, predicted_data = predicted) %>%
      mutate(lag_calculation_method = "parameter fitting to a model",
             log_biomass = log(biomass),
             diff = NA,
             second_deriv_b = NA,
             line_intercept = NA,
             line_slope = NA,
             tangent_point = NA,
             threshold = NA) %>%
      select(time, biomass, log_biomass, curve_id, lag, line_slope, line_intercept, lag_calculation_method, predicted_data, diff, second_deriv_b, tangent_point, threshold)

  } else {
    stop("method not implemented")
  }
  data_new$lag[data_new$lag < 0] = NA
  data_new <- data_new %>%
    left_join(n0)# %>%
  #data.new$lag[data.new$lag < 0] = 0
  return(data_new)
}


#' get_all_methods_lag
#'
#' Runs the main function that calculates lags based on growth curve data based on all possible methods.
#' @param data a data frame with two required columns names: "time" and "biomass",and one optional column: "curve_id"
#' This is data from may come from multiple growth curves
#' @param biomass_incr_threshold A value of the biomass increase that we can surely associate with the end of the lag phase rather than random variation during the lag.
#' Needs to be set specifically to avoid unconscious use of the value set by default. If set to NULL, the value from pars will be taken
#' @param pars a list of parameters. defaults to the ones set by get_def_pars function. Otherwise create your own list with the following names:
#' - model: if method = "parameter fitting to a model" , one of the following models needs to be chosen: "logistic", "baranyi"
#' - n0_method: first.observation" if the first point is taken as the initial biomass or
#' "minimal.observation" if the minimal biomass is taken is the initial point.
#' In "healthy" growth curves these options should be equivalent
#' but sometimes a drop in OD/biomass is observed at the beginning of a growth curve.
#' In this case it is not obvious what to assume the initial biomass is.
#' - tangent.method "local.regression" (if the tangent is fitted to a number of points around the maximal growth rate)
#' or "to.point" (if the tangent is fitted only to the point where the growth rate is maximal); defaults to "to.point"
#' - threshold: A value of the biomass increase that we can surely associate with the end of the lag phase rather than random variation during the lag. Defaults to 10^2
#' - curve_points: if tangent_method = "local.regression" then curve_points is the number of points the line is fitted to;
#' defaults to 3 i.e. the point with the maximal uptake rate one point before and one point after
#' - init_growth.rate: if logistic model is fitted. Defaults to  NULL in which case the initial value will be based on the data
#' - init_lag: if a logistic model is fitted, Defaults to NULL in which case the initial value will be based on the data
#' - algorithm: if method = "parameter fitting to a model", nls algorithm to run the model fit; defaults to "auto" which will choose the best between bounded and unbounded "Levenberg-Marquardt" and bounded "port"
#' - max_iter =  if method = "parameter fitting to a model", the maximum number of nls iterations, defaults to  100
#' @returns growth curve data (time, biomass, curve_id) with the column: lag_calculation_method, and with the following additional columns:  log_biomass, lag, line_slope, line_intercept, lag_calculation_method, predicted_data, diff, second_deriv_b, tangent_point, threshold
#' Note that each growth curve will appear
#' @export
get_all_methods_lag <- function(data, biomass_incr_threshold, pars = NULL) {
  if (is.null(pars)) {
    pars <- get_def_pars()
  }
  if (is.null(biomass_incr_threshold))  {
    biomass_incr_threshold <- pars$threshold
  }
  pars_logistic <- pars
  pars_logistic$model <- "logistic"
  data_new_logistic <- calc_lag(data = data,
                                    method = "parameter fitting to a model",
                                    pars = pars_logistic) %>%
    mutate(lag_calculation_method = "par. fitting to logistic model")

  pars_baranyi <- pars
  pars_baranyi$model <- "baranyi"
  data_new_baranyi <- calc_lag(data = data,
                                   method = "parameter fitting to a model",
                                   pars = pars_baranyi) %>%
    mutate(lag_calculation_method = "par. fitting to baranyi model")

  data_new_max_infl <- calc_lag(data = data,
                                    method = "max growth acceleration",
                                    pars = pars) %>%
    mutate(lag_calculation_method = "max growth acceleration")

  pars_to_point <- pars
  pars_to_point$tangent_method <- "to.point"
  data_new_exp <- calc_lag(data = data,
                                method = "tangent",
                                pars = pars_to_point
  ) %>%
    mutate(lag_calculation_method = "tangent to max growth point")

  pars_local_regr <- pars
  pars_local_regr$tangent_method <- "local.regression"
  data_new_exp2 <- calc_lag(data = data,
                                 method = "tangent",
                                 pars = pars_local_regr)%>%
    mutate(lag_calculation_method = "tangent to max growth line")

  pars$threshold <- biomass_incr_threshold
  data_new_biominc <-  calc_lag(data = data,
                                    method = "biomass increase",
                                    pars = pars)%>%
    mutate(lag_calculation_method = "biomass increase")


  data_all_with_lag <- data_new_max_infl %>%
    rbind(data_new_exp) %>%
    rbind(data_new_exp2) %>%
    rbind(data_new_biominc) %>%
    rbind(data_new_baranyi) %>%
    rbind(data_new_logistic)

  data_all_with_lag$lag[data_all_with_lag$lag < 0] = NA
  return(data_all_with_lag)
}




#' get_def_pars
#' Set defaults parameters used by calc_lag function
#' @returns list of parameters
#' @export
get_def_pars <- function() {
  pars <- list(model = "logistic",
              n0_method = "first.observation",
              tangent_method = "local.regression",
              threshold = 10^2,
              curve_points = 3,
              init_gr_rate = NULL,#0.1,
              init_lag = NULL, #3.5,
              algorithm = "auto",#"default", "Levenberg-Marquardt",#
              max_iter = 100)
  return(pars)
}



#' smooth_data
#' Smoothens growth curves data
#' @param data a data frame with two required columns names: "time" and "biomass",and one optional column: "curve_id"
#' This is data from may come from multiple growth curves
#' @param smooth_kind kind used for the smooth functions, defaults to "3RS3R"
#' @returns smoothened data
#' @export
smooth_data <- function(data, smooth_kind = "3RS3R") {
  if (!("curve_id" %in% names(data))) {
    data$curve_id <- "growth.curve"
  }
  data_smooth <- data  %>% filter(FALSE)
  for (this_curve_id in unique(data$curve_id)) {
    data_this_curve <- data %>% filter(curve_id == this_curve_id) %>%
      mutate(biomass_smooth = smooth(biomass, kind = smooth_kind)) %>%
      select(time, biomass = biomass_smooth, curve_id)
    data_smooth <- rbind(data_smooth, data_this_curve)
  }
  return(data_smooth)
}



#' cut_the_data
#' Subsets the data frame containing only the observations up to the specified maximum time
#' @param data a data frame with two required columns names: "time" and "biomass",and one optional column: "curve_id"
#' This is data from may come from multiple growth curves
#' @param max_time max. time at which we want to cut the growth curve data
#' @returns cut data
#' @export
cut_the_data <- function(data, max_time) {
  data_short <- data %>% filter(time <= max_time)
  return(data_short)
}



#' get_theme
#'
#' This function sets a ggplot theme without grid.
#' The theme removes the major and minor grid lines, sets a white background with a gray border and adjusts the text size.
#' @param text_size defaults to 12
#' @returns a ggplot theme
#' @export
get_theme <- function(text_size = 12) {
  my_theme <- theme(
    panel.grid.major = element_blank(),#element_line(colour = "black", size = 0.05),
    panel.grid.minor = element_blank(),#element_line(colour = "black", size = 0.05),
    panel.background = element_rect(fill = "white",
                                    colour = "gray",
                                    size = 0.5, linetype = "solid"),
    text = element_text(size = text_size),
    panel.border = element_rect(linetype = "solid", fill = NA, colour = "black")
  )
  return(my_theme)
}

Try the miLAG package in your browser

Any scripts or data that you put into this service are public.

miLAG documentation built on April 3, 2025, 8:09 p.m.