R/likelihood_data.R

Defines functions ld_dim_g3stock ld_dim_tag ld_dim_age ld_dim_length

parse_levels <- function (lvls, var_name) {
    m <- suppressWarnings(as.numeric(lvls))
    if (!anyNA(m)) return(data.frame(
        names = lvls,
        lower_incl = TRUE,
        lower_bound = m,
        upper_bound = c(tail(m, -1), Inf),  # NB: No data about final bound, assume open-ended
        upper_incl = FALSE,
        open_ended_upper = TRUE,
        stringsAsFactors = FALSE))

    m <- regmatches(lvls, regexec('^(\\[|\\()(.*),(.*)(\\]|\\))', lvls))
    if (all(vapply(m, length, numeric(1)) == 5)) return(data.frame(
        names = lvls,
        lower_incl = vapply(m, function (mm) identical(mm[[2]], '['), logical(1)),
        lower_bound = vapply(m, function (mm) as.numeric(mm[[3]]), numeric(1)),
        upper_bound = vapply(m, function (mm) as.numeric(mm[[4]]), numeric(1)),
        upper_incl = vapply(m, function (mm) identical(mm[[5]], ']'), logical(1)),
        open_ended_upper = is.infinite(as.numeric(tail(m, 1)[[1]][[4]])),
        stringsAsFactors = FALSE))

    m <- regmatches(lvls, regexec('^(.+):(.+)$', lvls))
    if (all(vapply(m, length, numeric(1)) == 3)) return(data.frame(
        names = lvls,
        lower_incl = TRUE,
        lower_bound = vapply(m, function (mm) as.numeric(mm[[2]]), numeric(1)),
        upper_bound = vapply(m, function (mm) as.numeric(mm[[3]]), numeric(1)),
        upper_incl = FALSE,
        open_ended_upper = is.infinite(as.numeric(tail(m, 1)[[1]][[3]])),
        stringsAsFactors = FALSE))

    stop("Unknown form of ", var_name, " levels, see ?cut for formatting: ", paste(lvls, collapse = ", "))
}

g3l_likelihood_data <- function (nll_name, data, missing_val = 0, area_group = NULL, model_history = "", all_stocks = list(), all_fleets = list(), all_predators = list()) {
    # vector of col names, will cross them off as we go
    handled_columns <- structure(as.list(seq_along(names(data))), names = names(data))

    # Work out time dimension, but don't add it just yet
    if ('year' %in% names(data)) {
        # NB: Let g3s_time_convert() worry about if the step column is there or not
        #     Suppress warnings from tibbles that it might be missing
        data$time <- g3s_time_convert(data$year, suppressWarnings(data$step))
        handled_columns$year <- NULL
        handled_columns$step <- NULL
    } else if ('time' %in% names(data)) {  # Convert our time=1999-01 strings back
        data$time <- g3s_time_convert(data$time)
        handled_columns$time <- NULL
    } else {
        stop("Data must contain a year column")
    }

    modelstock <- g3_storage(paste(nll_name, "model", sep = "_"))
    obsstock <- g3_storage(paste(nll_name, "obs", sep = "_"))
    maps <- list()

    # Turn incoming data into stocks with correct dimensions
    d <- ld_dim_length(data)
    if (!is.null(d[[1]])) {
        modelstock <- copydim(modelstock, d[[1]])
        obsstock <- copydim(obsstock, d[[1]])
        data$length <- d[[2]]
        handled_columns$length <- NULL
    } else {
        # Stocks currently have to have a length vector, even if it only has one element
        d[[1]] <- g3s_length(g3_storage("x"), c(0))
        modelstock <- copydim(modelstock, d[[1]])
        obsstock <- copydim(obsstock, d[[1]])
    }

    # Add early time dimension for surveyindices
    if (identical(model_history, 'early')) {
        modelstock <- g3s_time(
            modelstock,
            sort(unique(data$time)))
        obsstock <- g3s_time(
            obsstock,
            sort(unique(data$time)))
        data$time <- g3s_time_labels(data$time)
    }

    d <- ld_dim_age(data)
    if (!is.null(d[[1]])) {
        modelstock <- copydim(modelstock, d[[1]])
        obsstock <- copydim(obsstock, d[[1]])
        data$age <- d[[2]]
        handled_columns$age <- NULL
    }

    d <- ld_dim_tag(data)
    if (!is.null(d[[1]])) {
        modelstock <- copydim(modelstock, d[[1]])
        obsstock <- copydim(obsstock, d[[1]])
        data$tag <- d[[2]]
        handled_columns$tag <- NULL
    }

    d <- ld_dim_g3stock(data, "stock", all_stocks)
    if (!is.null(d[[1]])) {
        modelstock <- copydim(modelstock, d[[1]])
        # NB: We intersect obbstock onto modelstock, so we should copy that rather than the current stock
        obsstock <- g3s_manual(
            obsstock,
            var_base_name = names(d[[1]]$dim),
            dimnames = d$groups,
            intersect_idx_f = as.symbol( gsub("stock__", paste0(modelstock$name, "__"), d[[1]]$iter_ss, fixed = TRUE) ))
        # NB: g3stock doesn't modify data, so don't bother with d[[2]]
        handled_columns[[names(d[[1]]$dim)]] <- NULL
        maps[["stock"]] <- d$map
    }

    d <- ld_dim_length(data, col_name = 'predator_length')
    if (!is.null(d[[1]])) {
        modelstock <- copydim(modelstock, d[[1]], prefix = 'predator_')
        obsstock <- copydim(obsstock, d[[1]], prefix = 'predator_')
        data$predator_length <- d[[2]]
        handled_columns$predator_length <- NULL
    }

    d <- ld_dim_age(data, col_name = 'predator_age')
    if (!is.null(d[[1]])) {
        modelstock <- copydim(modelstock, d[[1]], prefix = 'predator_')
        obsstock <- copydim(obsstock, d[[1]], prefix = 'predator_')
        data$predator_age <- d[[2]]
        handled_columns$predator_age <- NULL
    }

    d <- ld_dim_tag(data, col_name = 'predator_tag')
    if (!is.null(d[[1]])) {
        modelstock <- copydim(modelstock, d[[1]], prefix = 'predator_')
        obsstock <- copydim(obsstock, d[[1]], prefix = 'predator_')
        data$predator_tag <- d[[2]]
        handled_columns$predator_tag <- NULL
    }

    d <- ld_dim_g3stock(data, "predator", all_predators)
    if (!is.null(d[[1]])) {
        modelstock <- copydim(modelstock, d[[1]])
        # NB: We intersect obbstock onto modelstock, so we should copy that rather than the current stock
        obsstock <- g3s_manual(
            obsstock,
            var_base_name = names(d[[1]]$dim),
            dimnames = d$groups,
            intersect_idx_f = as.symbol( gsub("stock__", paste0(modelstock$name, "__"), d[[1]]$iter_ss, fixed = TRUE) ))
        # NB: g3stock doesn't modify data, so don't bother with d[[2]]
        handled_columns[[names(d[[1]]$dim)]] <- NULL
        maps[["predator"]] <- d$map
    }

    d <- ld_dim_g3stock(data, "fleet", all_fleets)
    if (!is.null(d[[1]])) {
        modelstock <- copydim(modelstock, d[[1]])
        # NB: We intersect obbstock onto modelstock, so we should copy that rather than the current stock
        obsstock <- g3s_manual(
            obsstock,
            var_base_name = names(d[[1]]$dim),
            dimnames = d$groups,
            intersect_idx_f = as.symbol( gsub("stock__", paste0(modelstock$name, "__"), d[[1]]$iter_ss, fixed = TRUE) ))
        # NB: g3stock doesn't modify data, so don't bother with d[[2]]
        handled_columns[[names(d[[1]]$dim)]] <- NULL
        maps[["fleet"]] <- d$map
    }

    # Add time dimension if it's supposed to be last
    if (!identical(model_history, 'early')) {
        # NB: We always add time to observations, whereas only when explicitly requested for model
        if (identical(model_history, 'late')) {
            modelstock <- g3s_time(
                modelstock,
                sort(unique(data$time)))
        }
        obsstock <- g3s_time(
            obsstock,
            sort(unique(data$time)))
        data$time <- g3s_time_labels(data$time)
    }

    # NB: area has to be last, so we can sum for the entire area/time
    if ('area' %in% names(data)) {
        # NB: Ignore MFDB attributes on purpose, we're interested in the aggregated areas here
        used_areas <- as.character(unique(data$area))
        if (is.null(area_group)) {
            # If no area grouping provided, assume area_group are integers already
            if (suppressWarnings(anyNA(as.integer(used_areas)))) {
                stop("Areas in data don't have integer names, but areas not provided")
            }
            area_group <- structure(
                as.integer(used_areas),
                names = used_areas)
        } else {
            # Filter area_group by what we actually need
            area_group <- area_group[names(area_group) %in% used_areas]
        }
        area_group <- area_group[order(names(area_group))]  # Dimension order should match data
        modelstock <- g3s_areagroup(modelstock, area_group)
        obsstock <- g3s_areagroup(obsstock, area_group)
        handled_columns$area <- NULL
    }

    # Generate full table based on stock
    full_table <- as.data.frame.table(
        g3_stock_instance(obsstock, 0),
        stringsAsFactors = TRUE)
    # Use freq column to preserve ordering of output
    full_table$Freq <- seq_len(nrow(full_table))
    full_table <- merge(full_table, data, all.x = TRUE)

    # Assume everything other column is an observation
    obs_arrays <- lapply(names(handled_columns), function (col_name) {
        if (identical(missing_val, 'stop')) {
            if (any(is.na(full_table[[col_name]]))) stop("Missing values in data")
        } else {
            # Fill in missing values with given value
            full_table[[col_name]][is.na(full_table[[col_name]])] <- missing_val
        }
        return(array(full_table[[col_name]][order(full_table$Freq)],
            dim = obsstock$dim,
            dimnames = obsstock$dimnames ))
    })
    if (length(obs_arrays) == 0) stop("Should be at least one observation in likelihood data")

    # Rename old names to instance names we expect later
    names(obs_arrays) <- gsub("^weight$", "wgt", gsub("^number$", "num", names(handled_columns)))

    return(list(
        modelstock = modelstock,
        obsstock = obsstock,
        done_aggregating_f = if ('step' %in% names(data)) ~TRUE else ~cur_step_final,
        maps = maps,
        obs_arrays = obs_arrays,
        nll_name = nll_name))
}

ld_dim_length <- function(data, col_name = 'length') {
    mfdb_min_bound <- function (x) { if (is.null(attr(x, 'min'))) x[[1]] else attr(x, 'min') }
    mfdb_max_bound <- function (x) { if (is.null(attr(x, 'max'))) tail(x, 1) else attr(x, 'max') }
    data_col <- data[[col_name]]

    if (is.null(data_col)) {
        # No length dimension
        stock <- NULL
    } else if (!is.null(attr(data, col_name, exact = TRUE))) {
        length_groups <- attr(data, col_name, exact = TRUE)

        # Make sure length groups are contiguous
        if (!isTRUE(all.equal(
                unname(head(vapply(length_groups, mfdb_max_bound, numeric(1)), -1)),
                unname(tail(vapply(length_groups, mfdb_min_bound, numeric(1)), -1))))) {
            stop("Gaps in length groups are not supported")
        }

        # Form length groups using lower bound from all groups
        length_vec <- vapply(length_groups, mfdb_min_bound, numeric(1))

        open_ended_upper <- isTRUE(attr(length_groups[[length(length_groups)]], 'max_open_ended'))
        if (!open_ended_upper) {
            # Not open ended, so final bound should be max of last item
            length_vec <- c(
                length_vec,
                mfdb_max_bound(length_groups[[length(length_groups)]]))
        }

        if (isTRUE(attr(length_groups[[1]], 'min_open_ended'))) {
            # Lower bound open-ended, so set first lengthgroup to start at 0
            length_vec[[1]] <- 0
        }

        # We want to use our own names, so remove MFDB's
        names(length_vec) <- NULL

        # Convert data$length to use our naming
        data_col <- factor(data_col, levels = names(length_groups))

        stock <- g3s_length(g3_storage("x"), length_vec, open_ended = open_ended_upper)
    } else {
        # Force length to be a factor if not already
        if (!is.factor(data_col)) {
            # Make sure levels are ordered according to cut strings
            lvls <- parse_levels(unique(data_col), col_name)
            lvls <- lvls[with(lvls, order(lower_bound, upper_bound)), 'names']
            data_col <- factor(data_col, levels = lvls)
        }

        lvls <- parse_levels(levels(data_col), col_name)
        open_ended_upper <- lvls$open_ended_upper[[1]]

        length_vec <- if (open_ended_upper) lvls$lower_bound else c(lvls$lower_bound, tail(lvls$upper_bound, 1))

        if (any(!lvls$lower_incl) || any(lvls$upper_incl)) {
            stop("length intervals should be inclusive-lower, i.e. cut(..., right=FALSE): ", paste(lvls$names, collapse = ", "))
        }
        if (!isTRUE(all.equal(tail(lvls$lower_bound, -1), head(lvls$upper_bound, -1)))) {
            stop("Gaps in length groups are not supported: ", paste(lvls$names, collapse = ", "))
        }

        stock <- g3s_length(g3_storage("x"), length_vec, open_ended = open_ended_upper)
    }

    if (!is.null(data_col)) levels(data_col) <- stock$dimnames$length
    return(list(stock, data_col))
}

ld_dim_age <- function(data, col_name = 'age') {
    mfdb_eval <- function (x) { if (is.call(x)) eval(x) else x }
    data_col <- data[[col_name]]

    if (is.null(data_col)) {
        # No age dimension
        stock <- NULL
    } else if (!is.null(attr(data, col_name, exact = TRUE))) {
        age_groups <- attr(data, col_name, exact = TRUE)
        age_groups <- lapply(age_groups, mfdb_eval)  # Convert seq(2, 4) back to 2,3,4

        # We want to use our own names, so remove MFDB's
        stock <- g3s_agegroup(g3_storage("x"), unname(age_groups))

        # Convert data_col to use our naming
        data_col <- factor(data_col, levels = names(age_groups))
        levels(data_col) <- stock$dimnames[[col_name]]
    } else if (is.numeric(data_col)) {
        # Numeric age columns don't need grouping
        age_groups <- seq(min(data_col), max(data_col))

        stock <- g3s_age(g3_storage("x"), min(data_col), max(data_col))
        # Convert age data to use our naming
        data_col <- factor(
            data_col,
            levels = age_groups,
            labels = stock$dimnames[[col_name]])
    } else {
        if (!is.factor(data_col)) {
            # Make sure levels are ordered according to cut strings
            lvls <- parse_levels(unique(data_col))
            lvls <- lvls[with(lvls, order(lower_bound, upper_bound)), 'names']
            data_col <- factor(data_col, levels = lvls)
        }
        lvls <- parse_levels(levels(data_col), "age")

        if (is.infinite(tail(lvls$upper_bound, 1))) {
            # No support for infinite upper bound, bodge
            lvls$upper_bound[[length(lvls$upper_bound)]] <-
                lvls$lower_bound[[length(lvls$lower_bound)]] +
                1  # NB: It's not going to be upper-inclusive, so will subtract one at next step
        }

        # Account for lower_incl / upper_incl
        lvls$lower_bound <- ifelse(!lvls$lower_incl, lvls$lower_bound + 1, lvls$lower_bound)
        lvls$upper_bound <- ifelse(!lvls$upper_incl, lvls$upper_bound - 1, lvls$upper_bound)
        age_groups <- lapply(seq_len(nrow(lvls)), function (i) seq(lvls[i, "lower_bound"], lvls[i, "upper_bound"]))
        # NB: We never set the original names on age_groups

        stock <- g3s_agegroup(g3_storage("x"), age_groups)
        levels(data_col) <- stock$dimnames[["age"]]  # NB: [[col_name]] isn't the right choice here, as it won't be prefixed
    }
    return(list( stock, data_col ))
}

ld_dim_tag <- function(data, col_name = 'tag') {
    data_col <- data[[col_name]]

    if (is.null(data_col)) {
        # No tag dimension
        stock <- NULL
    } else {
        if (is.factor(data_col)) {
            tag_ids <- structure(
                seq_along(levels(data_col)),
                names = levels(data_col))
        } else {
            tag_ids <- as.integer(unique(data_col))
        }
        stock <- g3s_tag(g3_storage("x"), tag_ids, force_untagged = FALSE)
    }
    return(list( stock, data_col ))
}

ld_dim_g3stock <- function(data, col_name = "stock", all_stocks) {
    col_name_re <- paste0(col_name, "_re")
    if (col_name %in% names(data) && col_name_re %in% names(data)) {
        stop("Don't support both ", col_name, " and ", col_name_re)
    }

    if (col_name %in% names(data)) {
        actual_col <- col_name
        # Unique stock string groupings, in order
        stock_groups <- as.character(data[[col_name]][!duplicated(data[[col_name]])])

        # stock_map: list of stock$name --> index of stock_groups it should be added to
        # Start off with everything mapping to NULL
        stock_map <- structure(
            rep(list(NULL), length(all_stocks)),
            names = vapply(all_stocks, function (s) s$name, character(1)))

        # For 1..(max name parts) and all stocks...
        for (n in seq_len(max(vapply(all_stocks, function (s) length(s$name_parts), integer(1))))) {
            for (i in seq_along(all_stocks)) {
                s <- all_stocks[[i]]

                # Get all (n)-long combinations of (s)' name parts. fish_imm_f --> c("fish_f", "imm_f", ...)
                if (n > length(s$name_parts)) next
                name_combn <- apply(utils::combn(s$name_parts, n), 2, function (x) paste(x, collapse = "_"))

                # If any one of these matches a stock_group, assign this stock to that string
                # NB: We do shortest first, so longer matches will override shorter ones
                matches <- which(stock_groups %in% name_combn)
                if (length(matches) > 0) stock_map[[i]] <- head(matches, 1)
            }
        }

        unused_groups <- setdiff(
            seq_along(stock_groups),
            unique(unlist(stock_map)) )
        if (length(unused_groups) > 0) {
            stop(col_name, " groups matched no ", col_name, " in likelihood data: ", paste(stock_groups[unused_groups], collapse = ", "))
        }
    } else if (col_name_re %in% names(data)) {
        actual_col <- col_name_re
        # Start off with everything mapping to NULL
        stock_map <- structure(
            rep(list(NULL), length(all_stocks)),
            names = vapply(all_stocks, function (s) s$name, character(1)))

        # For each regex, find all matches and map to that index
        stock_groups <- as.character(data[[col_name_re]][!duplicated(data[[col_name_re]])])
        for (i in rev(seq_along(stock_groups))) {  # NB: Reverse so first ones have precedence
            stock_map[grep(stock_groups[[i]], names(stock_map))] <- i
        }

        unused_regexes <- setdiff(
            seq_along(stock_groups),
            unique(unlist(stock_map)) )
        if (length(unused_regexes) > 0) {
            stop("stock_re regexes matched no stocks in likelihood data: ", paste(stock_groups[unused_regexes], collapse = ", "))
        }
    } else {
        # No relevant column --> nothing to do
        return(list(NULL))
    }

    # Turn stock_map mapping into formula
    intersect_idx_f <- list_to_stock_switch(
        # Wrap each value of stock_map with g3_idx(i)
        # NB: -1 will cause g3s_manual() to ignore this intersect
        sapply(stock_map, function (i) substitute(g3_idx(i), list(i = if (is.null(i)) -1L else i)) ),
        # Variable in g3l_distribution() this should be looking for
       if (col_name == "stock") "stock" else "predstock" )

    # Make diagnostic fleet map with names for g3_distribution_preview()
    map <- vapply(
        stock_map,
        function (i) if (is.null(i)) as.character(NA) else stock_groups[[i]],
        character(1) )

    return(list(
        g3s_manual(g3_storage("x"), actual_col, stock_groups, intersect_idx_f),
        NULL,  # This is data_col in other methods
        groups = stock_groups,
        map = map ))
}

# Copy a single dimension from (new_stock) atop (old_stock), renaming dimension by adding (prefix)
copydim <- function (inner_stock, new_stock, prefix = '') {
     old_dim <- names(new_stock$dim)[[1]]
     new_dim <- paste0(prefix, old_dim)

     sym_list <- list()
     # Prefix any instance of symbols (sl) in code / character vector (in_c) with (prefix)
     add_prefix <- function (in_c, sl) {
         repl_fn <- function (x) {
             sym <- as.character(if (is.symbol(x)) x else x[[1]])
             # Add our prefix to stock__ names
             sym <- gsub('^stock__', paste0('stock__', prefix), sym)
             # If something is already renamed, (e.g. x__agegroup), swap in new name
             sym <- gsub(paste0('^\\Q', new_stock$name, '\\E__'), paste0(inner_stock$name, '__', prefix), sym)
             # If "old_dim", add to the beginning
             if (sym == old_dim) sym <- paste0(prefix, sym)

             if (is.character(x)) x[[1]] <- sym  # NB: When renaming environment names
             else if (is.symbol(x)) x <- as.symbol(sym)
             else if (is.call(x)) {
                 for (i in seq_along(x)) {
                     # Replace function name, recurse into arguments
                     x[[i]] <- if (i == 1) as.symbol(sym) else add_prefix(x[[i]], sl)
                 }
             }

             return(x)
         }

         if (is.character(in_c)) return(vapply(in_c, repl_fn, character(1)))

         # Form call_replace(in_c, sl[[1]] = repl_fn, sl[[2]] = repl_fn, ...)
         args <- c(
             list(in_c),
             rep(list(repl_fn), length(sl)) )
         names(args) <- c("", sl)
         do.call(call_replace, args, quote = TRUE)
     }

     # Arguments to call_replace to add prefix to everything in repl_fn
     sym_list <- c(
         names(new_stock$env),
         if (nzchar(prefix)) old_dim else NULL,
         new_stock$iter_ss[[old_dim]])

     # Add prefix to references to symbols in (sym_list)
     repl_env_fn <- function (env) {
         out <- as.list(env)
         names(out) <- add_prefix(names(out), sym_list)

         # Recurse into any environments contained by this one
         for (n in names(out)) {
             if (rlang::is_formula(attr(out[[n]], "g3_global_init_val"))) {
                 # Add things defined here to the list of things we should be prefixing
                 sym_list <- c(
                     sym_list,
                     names(environment(out[[n]])),
                     names(environment(attr(out[[n]], "g3_global_init_val"))) )
                 out[[n]] <- structure(
                     call_to_formula(
                         add_prefix(rlang::f_rhs(out[[n]]), sym_list),
                         repl_env_fn(environment(out[[n]])) ),
                     g3_global_init_val = call_to_formula(
                         add_prefix(rlang::f_rhs( attr(out[[n]], "g3_global_init_val") ), sym_list),
                         repl_env_fn(environment(attr(out[[n]], "g3_global_init_val"))) ))
             } else if (rlang::is_formula(out[[n]])) {
                 sym_list <- c(sym_list, names(environment(out[[n]])))
                 out[[n]] <- call_to_formula(
                     add_prefix(rlang::f_rhs(out[[n]]), sym_list),
                     repl_env_fn(environment(out[[n]])) )
             }
         }
         return(out)
     }
     new_env <- repl_env_fn(new_stock$env)

     inner_stock$dim[[new_dim]] <- new_stock$dim[[old_dim]]
     inner_stock$dimnames[[new_dim]] <- new_stock$dimnames[[old_dim]]
     inner_stock$iterate[[new_dim]] <- add_prefix(new_stock$iterate[[old_dim]], sym_list)
     inner_stock$iter_ss[[new_dim]] <- add_prefix(new_stock$iter_ss[[old_dim]], sym_list)
     inner_stock$intersect[[new_dim]] <- add_prefix(new_stock$intersect[[old_dim]], sym_list)
     inner_stock$interact[[new_dim]] <- add_prefix(new_stock$interact[[old_dim]], sym_list)
     inner_stock$with[[new_dim]] <- add_prefix(new_stock$with[[old_dim]], sym_list)
     inner_stock$env <- as.environment(c(as.list(inner_stock$env), new_env))
     # NB: Leave name_parts, name as-is
     return(inner_stock)
}
gadget-framework/gadget3 documentation built on June 13, 2025, 5:06 a.m.