R/biodivprobgen_utilities.R

#===============================================================================

                        #  biodivprobgen_utilities.R

#===============================================================================

#  2015 04 27 - BTL
#       Created by extracting functions from generateSetCoverProblem.R.

#===============================================================================

#-------------------------------------------------------------------------------

#' Safe version of R sample function
#'
#' 2015 04 08 - BTL
#'  I just got bitten very badly by the incredibly annoying behavior of R's
#'  sample() function, so here is a replacement function that I need to
#'  use everywhere now.
#'  When I called sample with a vector that sometimes had length n=1,
#'  it sampled from 1:n instead of returning the single value.
#'  This majorly screwed all kinds of things in a very subtle, very hard
#'  to find way.
#'
#-------------------------------------------------------------------------------

#' @param x Either a vector of one or more elements from which to choose, or a positive integer.
#' @param ... Arguments that can be passed on to base::sample()
#'
#' @return Returns a value that is probably an integer but is definitely not null
#' @export

#-------------------------------------------------------------------------------

safe_sample = function (x,...) { if (length (x) == 1) x else sample (x,...) }

#===============================================================================

#-------------------------------------------------------------------------------

#' Return NA for any given value whose length is 0
#'
#' This is originally built to be used on each element of a list inside an
#' lapply where you're trying to get rid of NULLs, numeric(0)s, etc. in a
#' list to be converted to a data frame.
#'
#-------------------------------------------------------------------------------

#' @param value any value
#'
#' @return NA or the input argument
#' @export

#-------------------------------------------------------------------------------

fix_0_length_list_elements <- function (value)
    {
        #  length == 0 should handle NULL, logical(0), numeric(0), etc.
        #  These are all things that as.data.frame (list) chokes on.
    if (length (value) == 0)
        return (NA) else return (value)
    }

#===============================================================================

#-------------------------------------------------------------------------------

#' Convert list to a new list with 0 length values replaced by NA
#'
#-------------------------------------------------------------------------------

#' @param a_list a list
#'
#' @return the input list with its 0 length values replaced by NA values and
#' non-0 length values left as they were
#' @export

#-------------------------------------------------------------------------------

list_with_0_length_vals_replaced_by_NA <- function (a_list)
    {
    if (! is.list (a_list))
        stop_bdpg ("list_with_0_length_vals_replaced_by_NA: argument is not a list.")

    return (lapply (a_list, fix_0_length_list_elements))
    }

#===============================================================================

#-------------------------------------------------------------------------------

#' Convert list to data frame with 0 length values replaced by NA
#'
#-------------------------------------------------------------------------------

#' @param a_list a list
#'
#' @return a data frame whose columns correspond to elements of the list and
#' any 0 length data value in the input list has been replaced with NA
#' @export

#-------------------------------------------------------------------------------

list_as_data_frame_with_0_length_vals_replaced_by_NA <- function (a_list)
    {
    return (as.data.frame (list_with_0_length_vals_replaced_by_NA (a_list)))
    }

#===============================================================================

#-------------------------------------------------------------------------------

#' Get parameter value that should be integer and return 1 if value is null
#'
#-------------------------------------------------------------------------------

#' @param value a value that is probably a number, but could be anything
#'
#' @return Returns a value that is probably an integer but is definitely not null
#' @export

#-------------------------------------------------------------------------------

value_or_1_if_null <- function (value)
    {
    if (is.null (value)) 1 else value
    }

#===============================================================================

#-------------------------------------------------------------------------------

#' Get parameter value that should be boolean and return FALSE if value is null
#'
#-------------------------------------------------------------------------------

#' @param value a value that is probably a boolean, but could be anything
#'
#' @return Returns a value that is probably a boolean but is definitely not null
#' @export

#-------------------------------------------------------------------------------

value_or_FALSE_if_null <- function (value)
    {
    if (is.null (value)) FALSE else value
    }

#===============================================================================

#' Write results to files
#'
#' This function is used two different ways.
#' It's called when the program quits because there are too many species
#' or it's called when the program runs successfully.
#'
#-------------------------------------------------------------------------------

#' @param csv_outfile_name  Name of the csv file to write results to, not
#'        including csv extension but not including path
#' @param results_df  data frame containing the results
#' @param run_ID  integer ID of a run
#' @param out_dir  character string telling what directory to put the results
#'                 file in
#' @param tzar_run_id_field_name character string containing name of tzar run ID
#' field in output file
#'
#' @return Returns nothing.

#-------------------------------------------------------------------------------

write_results_to_files <- function (csv_outfile_name,
                                    results_df,
                                    run_ID,    #parameters,
                                    out_dir,
                                    tzar_run_id_field_name
                                    # ,
                                    # cur_result_row=1    #  Added 2016 03 28 - BTL.
                                   )
    {
        #  Write the results out to 2 separate and nearly identical files.
        #  The only difference between the two files is that the run ID in
        #  one of them is always set to 0 and in the other, it's the correct
        #  current run ID.  This is done to make it easier to automatically
        #  compare the output csv files of different runs when the only thing
        #  that should be different between the two runs is the run ID.
        #  Having different run IDs causes diff or any similar comparison to
        #  think that the run outputs don't match.  If they both have 0 run ID,
        #  then diff's output will correctly flag whether there are differences
        #  in the outputs.
        #
        #  File names currently in project.yaml:
        #      summary_filename: $$output_path$$prob_diff_results.csv
        #      summary_without_run_id_filename: $$output_path$$prob_diff_results_with_0_run_id.csv

            #  Build file name and path for auxiliary file where run_id is
            #  zeroed out.
    csv_outfile_name_base = tools::file_path_sans_ext (csv_outfile_name)
    csv_outfile_name_ext  = tools::file_ext (csv_outfile_name)    #  Probably "csv" but just being cautious here...
    csv_outfile_name_with_0_run_id = paste0 (csv_outfile_name_base,
                                             "_with_0_run_id.",
                                             csv_outfile_name_ext)

    summary_WITHOUT_run_id_path = file.path (out_dir, csv_outfile_name_with_0_run_id)
                                             ##parameters$summary_without_run_id_filename)

    results_df[[tzar_run_id_field_name]] = 0
    write.csv (results_df, file = summary_WITHOUT_run_id_path, row.names = FALSE)

    summary_WITH_run_id_path = file.path (out_dir, csv_outfile_name)
                                          ##parameters$summary_filename)
    results_df[[tzar_run_id_field_name]] = run_ID    #parameters$run_id
    write.csv (results_df, file = summary_WITH_run_id_path, row.names = FALSE)
    }

#===============================================================================

get_default_identical_PU_costs <- function (num_PUs, cost = 1)
    {
    return (rep (cost, num_PUs))
    }

#===============================================================================

#-------------------------------------------------------------------------------

#' See if there are any duplicate links
#'
#' I'm not certain, but I think that the original Xu algorithm doesn't allow
#' duplicate links.  I don't think that they would invalidate the solution,
#' but I suspect that they could make the problem easier by giving more
#' reasons to choose a pair of planning units.  Consequently, I'm looking for
#' duplicate links.
#'
#'  Note that you can only uniquely decode an edge list from an occurrence
#'  matrix if the species only occurs on 2 patches, i.e., the underlying
#'  assumption in the Xu problem generator.
#'  However, the only reason I'm building this routine now is to run it
#'  on the Xu benchmark problems to see if those allow any duplicate
#'  edges, i.e., more than one species occurring on the same pair of
#'  patches.
#'
#-------------------------------------------------------------------------------

#' @inheritParams std_param_defns
#'
#' @return Returns an edge list, a two column integer matrix of
#'     node IDs with one row for each edge and columns for the 2 ends of
#'     the edge; quits if any duplicate edges are found.

#-------------------------------------------------------------------------------

see_if_there_are_any_duplicate_links = function (occ_matrix, num_spp)
    {
    num_PU_spp_pairs = sum (occ_matrix)

    edge_list = matrix (0,
                        nrow = num_spp,
                        ncol = 2,
                        byrow = TRUE)

    colnames (edge_list) = c("from_node", "to_node")

    num_spp_warnings = 0
    num_spp_on_no_patches = 0
    rows_to_delete = list()

    for (cur_spp_row in 1:num_spp)
        {
        cur_spp_occurs_on_patches = which (occ_matrix [cur_spp_row,] == 1)
        num_patches_for_cur_spp = length (cur_spp_occurs_on_patches)

        if ((num_patches_for_cur_spp != 2) & (num_patches_for_cur_spp != 0))
            {
            num_spp_warnings = num_spp_warnings + 1
            cat ("\nWARNING ", num_spp_warnings, ": spp ", cur_spp_row, " is on ",
                 num_patches_for_cur_spp, " patches, PU_IDs ", sep='')
            print (cur_spp_occurs_on_patches)
            rows_to_delete [[length (rows_to_delete) + 1]] = cur_spp_row

            } else
            {
            if (num_patches_for_cur_spp == 0)
                {
                num_spp_on_no_patches = num_spp_on_no_patches + 1
                rows_to_delete [[length (rows_to_delete) + 1]] = cur_spp_row

                } else
                {
                edge_list [cur_spp_row, "from_node"] = cur_spp_occurs_on_patches [1]
                edge_list [cur_spp_row, "to_node"] = cur_spp_occurs_on_patches [2]

                }  #  end else - spp is on exactly 2 patches
            }  #  end else - spp is on either 0 or 2 patches
        }  #  end for - all spp rows

    if (num_spp_warnings > 0)
        cat ("\n----->  num_spp_warnings = ", num_spp_warnings, sep='')

    if (num_spp_on_no_patches > 0)
        {
        cat ("\n----->  num_spp_on_no_patches = ", num_spp_on_no_patches, sep='')
        }

    if (length (rows_to_delete) > 0)
        {
        rows_to_delete = unlist (rows_to_delete)
        cat ("\n----->  rows_to_delete = ", sep='')
        print (rows_to_delete)

        edge_list = edge_list [- rows_to_delete,,drop=FALSE]
        }

    row_nums_of_duplicates = which (duplicated (edge_list))
    num_duplicates = length (row_nums_of_duplicates)
    cat ("\n\nindices (duplicates) = ", row_nums_of_duplicates, "\n")
    cat ("\nduplicates = \n")
    print (edge_list[row_nums_of_duplicates,,drop=FALSE])
    cat ("\n\nnumber of duplicates = ", num_duplicates, "\n", sep='')

    if (num_duplicates > 0)
        {
        stop_bdpg (paste0 ("\n\nERROR: ", num_duplicates,
                      " duplicate species in the Xu benchmark file."))
        }

    return (edge_list)
    }

#===============================================================================

#-------------------------------------------------------------------------------

#' Build PU spp pair indices from occ matrix
#'
#' Build planning unit vs. species indices from occupancy matrix.
#'
#-------------------------------------------------------------------------------

#' @param occ_matrix matrix
#'
#' @return Returns PU_spp_pair_indices data frame

#-------------------------------------------------------------------------------

build_PU_spp_pair_indices_from_occ_matrix <- function (occ_matrix)
    {
    #------------------------------------------------------------------
    #  2018 01 21 - BTL
    #  NOTE: I've just replaced a couple of nested for loops here
    #  that built PU_spp_pair_indices in a way that contained the
    #  same values as are returned now, but in a different order.
    #  I don't think it will matter downstream of this since I think
    #  the data from here gets copied into another structure and
    #  sorted at some point and I don't think anything else is paying
    #  any attention to the order between here and the sort.
    #  However, for the moment, I want to flag this here in case
    #  something strange starts happening elsewhere and this might
    #  be an indirect cause.
    #------------------------------------------------------------------

    occupied_row_col_pairs = which (occ_matrix > 0, arr.ind=TRUE)
    PU_spp_pair_indices = data.frame (PU_ID  = occupied_row_col_pairs [,2],
                                      spp_ID = occupied_row_col_pairs [,1])

    return (PU_spp_pair_indices)
    }

#===============================================================================
#     Compute what fraction of species meet their representation targets.
#===============================================================================

#-------------------------------------------------------------------------------

find_indices_of_spp_with_unmet_rep = function (spp_occ_matrix,
                                               candidate_solution_PU_IDs,
                                               num_spp,
                                               spp_rep_targets
                                               )
    {
    spp_rep_fracs = compute_rep_fraction (spp_occ_matrix,
                                          candidate_solution_PU_IDs,
                                          spp_rep_targets
                                          )

    return (which (spp_rep_fracs < 1))
    }

#===============================================================================

#-------------------------------------------------------------------------------

compute_frac_spp_covered =
        function (spp_occ_matrix,
                  candidate_solution_PU_IDs,
                  num_spp,
                  spp_rep_targets
                  )
    {
    indices_of_spp_with_unmet_rep =
        find_indices_of_spp_with_unmet_rep (spp_occ_matrix,
                                            candidate_solution_PU_IDs,
                                            num_spp,
                                            spp_rep_targets
                                            )

    return (1 - (length (indices_of_spp_with_unmet_rep) / num_spp))
    }

#===============================================================================

#' Add missing PUs to PU_Count data frame
#'
#'  When error is added to the input data, it sometimes results in
#'  planning units that appear to have no species on them.  This function
#'  adds these planning units back into the PU_Count data frame so that the
#'  apparent problem has the same number of planning units as the correct
#'  problem.
#'
#'  When the pu_spp_pair_indices are written out in marxan's input
#'  format, there is no record of the empty planning units in those
#'  pairs since each pair is a planning unit ID followed by the ID
#'  of a species on that planning unit.
#'
#'  Consequently, marxan's output solutions will have fewer planning
#'  units than the problem generator generated and you will get size
#'  warnings (that should be errors) when comparing them to things
#'  like nodes$dependent_set_member.
#'  For example, here is the error that showed this was happening:
#'
#'       Error in marxan_best_df_sorted$SOLUTION - nodes$dependent_set_member :
#'       (converted from warning) longer object length is not a multiple of shorter object length
#'
#'  To fix this, you need to add entries for each of the missing PUs.
#'
#' NOTE:  There could also be correct problems that have some patches with no
#' species on them.
#'
#' NOTE:  Not sure if matters, but I had commented in here that the old name
#' for this function was: add_missing_PUs_to_marxan_solutions().  Probably
#' don't need to track that anymoe, but will leave it here for the moment.

#-------------------------------------------------------------------------------

#' @inheritParams std_param_defns
#'
#' @return Returns \code{marxan solution} data frame that includes all planning
#'     unit IDs from the correct problem rather than only the IDs that occurred
#'     in the apparent problem (see description of \code{marxan_solution} in
#'     input argument list for this function)

#-------------------------------------------------------------------------------

add_missing_PU_rows_to_PU_Count_dataframe = function (marxan_solution,
                                                all_correct_node_IDs,
                                                PU_col_name,
                                                presences_col_name)
    {
        #  Marxan solutions are data frames with one row for each planning unit.
        #  They have 2 columns, one for the planning unit IDs and the other
        #  for the count or indicator of presence/absence.
        #  The second column usually contains 0 or 1 to indicate presence
        #  or absence of that PU in the marxan solution.
        #  However, in the case of marxan's summed solution, the second
        #  column contains the number of iterations (restarts) where that
        #  planning unit appeared in marxan's solution.
        #
        #  Search for the missing planning unit IDs, then add one line
        #  to the table for each missing planning unit ID.
        #  Set the presences field for each of those lines to be 0.

    missing_PU_IDs = setdiff (all_correct_node_IDs, marxan_solution [ , PU_col_name])
    num_missing_PU_IDs = length (missing_PU_IDs)

    if (num_missing_PU_IDs > 0)
        {
        missing_rows = matrix (c(missing_PU_IDs, rep(0,num_missing_PU_IDs)),
                               nrow=num_missing_PU_IDs,
                               ncol=2,
                               dimnames=list(NULL,c(PU_col_name,presences_col_name)))

        marxan_solution = rbind (marxan_solution, missing_rows)
        }

    return (marxan_solution)
    }

#===============================================================================

#-------------------------------------------------------------------------------

        #  Count the number of species on each PU.
        #  The initial count may have to be cleaned up because
        #  some nodes may not appear in the PU_spp_pair_indices
        #  table if they have no species occurrences on them.
        #  For example, if false negative errors were added to a
        #  Xu problem to simulate detectability issues, nodes that had
        #  species on them in the initial problem definition may
        #  no longer have any species on them and not appear in the table.
        #  Downstream processes may expect those nodes to still appear
        #  in the list of counts but have a zero count, so you need
        #  to add them back into the table.

clean_up_final_link_counts_for_each_node <- function (PU_spp_pair_indices,
                                                      all_correct_node_IDs,
                                                      PU_col_name,
                                                      presences_col_name)
    {
    final_link_counts_for_each_node_without_missing_rows =
                            plyr::count (PU_spp_pair_indices, vars=PU_col_name)

    final_link_counts_for_each_node =
        add_missing_PU_rows_to_PU_Count_dataframe (final_link_counts_for_each_node_without_missing_rows,
                                             all_correct_node_IDs,
                                             PU_col_name, presences_col_name)

    return (final_link_counts_for_each_node)
    }

#===============================================================================

#-------------------------------------------------------------------------------

#' Compute final PU and species counts and plot degree and abundance distributions.
#'
#-------------------------------------------------------------------------------

#' @inheritParams std_param_defns
#'
#' @return final_link_counts_for_each_node, i.e., species count for each node, i.e., 2 column data frame of PU_IDs vs. number of species on corresponding PU
#'
#' @details
#'  Count the number of species on each PU.
#'  If a PU has no species on it, it won't necessarily be in the
#'  PU_spp_pair_indices table, so this routine adds the PU back into the table
#'  with a zero count.
#'
#'  This means that even though PU_spp_pair_indices could be either a correct
#'  or an apparent set of PU-spp pairs, all_correct_node_IDs HAS
# ' to be the full set of CORRECT node IDs.
#' @examples
#' \dontrun{
#' tot_num_nodes = 6
#' PU_spp_pair_indices = data.frame (PU_ID=1:tot_num_nodes, spp_ID=c(1,1,2,2))
#' all_correct_node_IDs = 1:tot_num_nodes
#' final_link_counts =
#'     clean_up_final_link_counts_for_each_node (PU_spp_pair_indices,
#'                                               all_correct_node_IDs,
#'                                               "PU_ID",
#'                                               "freq")
#'          }
#'
#-------------------------------------------------------------------------------

summarize_and_plot_graph_and_distribution_structure_information =
    function (PU_spp_pair_indices,  #  either correct or apparent
              cor_or_app_label,     #  either correct or apparent
              all_correct_node_IDs,  #  MUST BE correct only
              plot_output_dir,
              spp_col_name,
              PU_col_name,
              presences_col_name)
    {
        #  Count the number of PUs each species occurs on.
        #  For the Xu problem generator, all species should occur on
        #  exactly 2
cat("\nstarting summarize_and_plot_graph_and_distribution_structure_information()")

    final_node_counts_for_each_link =
            plyr::count (PU_spp_pair_indices, vars=spp_col_name)
cat("\njust after count()")

        #  Count the number of species on each PU.
        #  If a PU has no species on it, it won't necessarily be in the
        #  PU_spp_pair_indices table, so add the PU back into the table
        #  with a zero count.
        #
        #  So, even though PU_spp_pair_indices could be either a correct
        #  or an apparent set of PU-spp pairs, all_correct_node_IDs HAS
        #  to be the full set of CORRECT node IDs.

    final_link_counts_for_each_node =
            clean_up_final_link_counts_for_each_node (PU_spp_pair_indices,
                                                      all_correct_node_IDs,
                                                      PU_col_name,
                                                      presences_col_name)
cat("\njust after clean_up_final_link_counts_for_each_node()")

    plot_degree_and_abundance_dists_for_node_graph (
                                final_link_counts_for_each_node,
                                final_node_counts_for_each_link,
                                PU_col_name,
                                plot_output_dir,
                                cor_or_app_label,
                                spp_col_name)
cat("\njust after plot_degree_and_abundance_dists_for_node_graph()")


    return (final_link_counts_for_each_node)
    }

#===============================================================================

#' Save bd problem to disk
#'
#' After a problem is generated, its R representation is saved to disk so
#' that it can be archived and re-used in future experiments without having
#' to regenerate the problem.  Saving it is also useful for reproducibility
#' in that it allows re-creation of exactly the problem used in an experiment.
#'
#-------------------------------------------------------------------------------

#' @details
#' Writes to a file whose name contains:
#' \itemize{
#'  \item{the UUID of the problem}
#'  \item{whether it's a basic problem or a wrapped problem}
#'  \item{whether it's a correct or an apparent problem}
#' }
#'
#-------------------------------------------------------------------------------

#' @inheritParams std_param_defns
#'
#' @return Returns input object with its checksum slot filled

#-------------------------------------------------------------------------------

save_rsprob <- function (rsprob, exp_root_dir)
    {
    base_outdir = get_RSprob_path_topdir (rsprob, exp_root_dir)
    rsprob      = save_obj_with_checksum (rsprob,
                                          #saved_rsprob_filename,
                                          base_outdir)

    return (rsprob)
    }


#===============================================================================

load_saved_obj_from_file <- function (path_to_file)
    {
    return (readRDS (path_to_file))
    }

#===============================================================================

#' Strip Trailing Slash Off Of Path If There Is One
#'
#' Tzar puts a slash on the end of the output path and this causes problems
#' when using it to build full paths to file names.  This function is
#' primarily here to strip that off but can be used on any path.
#'
#-------------------------------------------------------------------------------

#' @param path character string containing a path that may or may not end in a
#'     slash
#'
#' @return character string containing original path argument with trailing
#'     slash removed if there was one, otherwise, same as original
#' @export

#-------------------------------------------------------------------------------

strip_trailing_slash <- function (path)
    {
    last_char = stringr::str_sub (path, nchar (path), nchar (path))

        #  Originally, this looked for the platform-specific file separator,
        #  but that may cause a problem, since tzar and R both seem to always
        #  use a slash and then translate back and forth to the
        #  platform-specific only when actually talking to the OS.
        #  So, changing back to just checking for a slash.

    if (last_char == "/")                                   #.Platform$file.sep)
        path = stringr::str_sub (path,1,nchar(path)-1)

    return (path)
    }

#===============================================================================

#'  Imitate unix touch function to create an empty file.
#'
#'  I couldn't find an existing function for this, but did find a
#'  stack overflow question that suggested using write.table() on an
#'  empty table, so that's what I've done.
#'
#-------------------------------------------------------------------------------

#' @param file_path_to_touch character string
#'
#' @return Returns nothing
#' @export

#-------------------------------------------------------------------------------

touch <- function (file_path_to_touch)
    {
    file_path_to_touch = normalizePath (file_path_to_touch, mustWork=FALSE)
#    cat ("\nfile_path_to_touch = '", file_path_to_touch, "'\n")
    write.table (data.frame(),
                 file = file_path_to_touch,
                 col.names=FALSE)
    }

#===============================================================================

#-------------------------------------------------------------------------------

#' Validate input as boolean and replace with default on empty input if desired
#'
#' This function is intended to be a more flexible replacement for calling
#' is.logical().  In particular, it makes it possible to replace empty values
#' with a default value (e.g., FALSE) and to define whether NULL and/or NA
#' are treated as empty values.  It also allows control over whether numeric
#' values are considered boolean by this validation function.
#' This is sometimes useful when validating an input option where a numeric
#' value would be suggestive of a mistake in the input file.
#'
#' Note that even if they are set to TRUE, the flags treat_NULL_as_empty and
#' treat_NA_as_empty are ignored if def_on_empty is FALSE.

#-------------------------------------------------------------------------------

#' @param var_value the value to be checked to see if it's a boolean
#' @param def_on_empty boolean flag indicating whether to return a default
#'     value instead of the input value when the input value is empty (where
#'     empty is defined by other flags below)
#' @param def a TRUE or FALSE default value to return instead of the input value
#'     when a default is requested
#' @param treat_NULL_as_empty a boolean flag set to TRUE if a NULL input is to
#'     be treated as an empty input; FALSE otherwise
#' @param treat_NA_as_empty a boolean flag set to TRUE if an NA input is to
#'     be treated as an empty input; FALSE otherwise
#' @param allow_num a boolean flag set to TRUE if a numeric input is to be
#'     allowed and returned as FALSE if 0 and TRUE if non-zero
#'
#' @return Returns the input value if it is boolean or, returns a specified
#'     boolean if other arguments force a valid converted or default boolean
#'     to return; otherwise, throws an error
#'
#' @export
#'
#' @examples
#' x <- TRUE
#' vb (x)
#' vb (-150, allow_num = TRUE)
#' vb (0, allow_num = TRUE)
#' vb (NULL, def_on_empty = TRUE)
#' vb (NULL, def_on_empty = TRUE, def = TRUE)
#' vb (NA, def_on_empty = TRUE, def = TRUE, treat_NA_as_empty = TRUE)
#' vb (NA, def_on_empty = TRUE, treat_NA_as_empty = TRUE)
#' vb (NULL, def_on_empty = TRUE, def = 10, treat_NULL_as_empty = TRUE, allow_num = TRUE)
#' \dontrun{
#'                 #  These all generate errors
#' vb (1)                     #  error - not boolean & allow_num not set to TRUE
#' vb (0)
#' vb (1000)
#' vb ("aString")                        #  error - not boolean
#' vb (NULL)                             #  error - def_on_empty not set
#' vb (NA)            #  error - def_on_empty not set, treat_NA_as_empty not set
#' vb (NULL, treat_NULL_as_empty = TRUE) #  error - def_on_empty not set
#' vb (NA, treat_NA_as_empty = TRUE)     #  error - def_on_empty not set to TRUE
#'         #  error - non-numeric default, but allow_num not TRUE
#' vb (NA, def_on_empty = TRUE, def = 10, treat_NA_as_empty = TRUE)
#'          }

#-------------------------------------------------------------------------------

vb <- function (var_value, def_on_empty = FALSE, def = FALSE,
                treat_NULL_as_empty = TRUE,
                treat_NA_as_empty = FALSE,
                allow_num = FALSE)
    {
        #  A little extra logic is required here when building error messages
        #  to keep error messages from being misleading when a bad input value
        #  invokes the use of a caller-provided default value and that value
        #  is also bad.
        #  Without the extra logic, the bad default value is reported as the
        #  name of the input value in the error message instead of giving
        #  the name of the original input variable.

    var_name = deparse (substitute (var_value))  #  Get var_name arg as string
    err_string_lead = "Value"

    err_string_lead_for_def = "Default value"
    if (def_on_empty &&
            ((treat_NULL_as_empty && is.null (var_value))
                    ||
#  Getting an error message when I use is.na() and have warnings set to errors.
#  is.na() documentation says:
#  anyNA(NULL) is false: is.na(NULL) is logical(0) with a warning.
#                 (treat_NA_as_empty && is.na (var_value))))
             (treat_NA_as_empty && anyNA (var_value))))
        {
        var_value = def
        err_string_lead = err_string_lead_for_def
        }

    if (is.numeric (var_value))
        {
        if (allow_num) var_value = (var_value != 0)  #  Set 0 FALSE, non-0 TRUE
        if (err_string_lead == err_string_lead_for_def)
            {
            err_string_lead = "Default numeric value"
            }
        else
            err_string_lead = "Numeric value"
        }

    if (!is.logical (var_value) ||
        (anyNA (var_value) && (!def_on_empty || !treat_NA_as_empty)))
        {
        stop_bdpg (paste0 (err_string_lead, " '", var_value,
                      "' used for input variable ", var_name,
                      " must be boolean"))
        }

    return (var_value)
    }

#===============================================================================

#-------------------------------------------------------------------------------

#' Validate input as numeric and replace with default on empty input if desired
#'
#' This function is intended to be a more flexible replacement for calling
#' is.numeric().  In particular, it makes it possible to replace empty values
#' with a default value (e.g., 0) and to define whether NULL and/or NA
#' are treated as empty values.  It also allows checking whether the input
#' (or a resulting default) value falls in a given range.  If no range is
#' specified, then any numeric value is allowed.  It also allows specification
#' of whether the bounds represent an open, closed, or semi-closed interval
#' through the bounds_type argument.  That argument is a 2 character string
#' composed of any combination of 'i' and 'e' to designate whether the bounds
#' are inclusive or exclusive.  The first character of the string is for the
#' lower bound and the second is for the upper bound, e.g., if the string is
#' "ei", then the function will check whether the value is strictly > the
#' lower bound and <= to the upper bound.
#'
#' Note that even if they are set to TRUE, the flags treat_NULL_as_empty and
#' treat_NA_as_empty are ignored if def_on_empty is FALSE.

#-------------------------------------------------------------------------------

#' @param var_value the value to be checked to see if it's a numeric in range
#' @param range_lo the lower bound of the range to see if input value falls in
#' @param range_hi the upper bound of the range to see if input value falls in
#' @param bounds_types a 2 character string indicating whether the lower and
#'     upper bounds are inclusive or exclusive bounds, with 'i' meaning
#'     inclusive and 'e' meaning exclusive; legal strings are "ii", "ie", "ei",
#'     and "ee"
#' @param def_on_empty boolean flag indicating whether to return a default
#'     value instead of the input value when the input value is empty (where
#'     empty is defined by other flags below)
#' @param def a TRUE or FALSE default value to return instead of the input value
#'     when a default is requested
#' @param treat_NULL_as_empty a boolean flag set to TRUE if a NULL input is to
#'     be treated as an empty input; FALSE otherwise
#' @param treat_NA_as_empty a boolean flag set to TRUE if an NA input is to
#'     be treated as an empty input; FALSE otherwise
#'
#' @return Returns the input value if it is numeric and in range or, returns a
#'     specified numeric value if other arguments force a valid default
#'     value to return; otherwise, throws an error
#'
#' @export
#'
#' @examples
#' x <- 0.7
#' vn (x)
#' vn (x, range_lo = 0, range_hi = 1)
#' vn (x, range_lo = 0.7, range_hi = 1, bounds_types = "ie")
#' vn (100, range_hi = 100)
#' vn (NULL, def_on_empty = TRUE, def = 0)
#' vn (NULL, def_on_empty = TRUE)
#' vn (NA, def_on_empty = TRUE, def = -999)
#' vn (NA, range_hi = 100, bounds_types = "ee",
#'     def_on_empty = TRUE, def=15, treat_NA_as_empty = TRUE)
#' \dontrun{
#' vn (1000, range_hi = 100, bounds_types = "ii")                       #  error
#' vn (0.7, range_lo = 0.7, bounds_types = "ei")                        #  error
#' vn (NULL)                                                            #  error
#' vn (NA)                                                              #  error
#' vn (NA, range_lo = -10, range_hi = 10, bounds_types = "ee",
#'     def_on_empty = TRUE, def = 15)                                   #  error
#' vn (NULL, range_lo = -10, range_hi = 10, bounds_types = "ee",
#'     def_on_empty = TRUE, def = 15, treat_NULL_as_empty = TRUE)       #  error
#'          }

#-------------------------------------------------------------------------------

vn <- function (var_value,
                range_lo=-Inf, range_hi=Inf, bounds_types = "ii",
                def_on_empty = FALSE,
                def = 0,
                treat_NULL_as_empty = TRUE,
                treat_NA_as_empty = TRUE)
    {
        #--------------------------------------------------------
        #  Get var_name arg as string to use in error messages.
        #--------------------------------------------------------

    var_name = deparse (substitute (var_value))
    err_string_lead = "Validating"

        #-------------------------------------------------------------------
        #  If caller wants to replace empty input with a default value,
        #  and the input is empty, then go ahead and make the replacement.
        #  Because the default value itself might not be numeric or
        #  in range, continue on to check that value as you would a
        #  value that was passed in normally.
        #-------------------------------------------------------------------

    if (def_on_empty &&
            ((treat_NULL_as_empty && is.null (var_value))
                    ||
             (treat_NA_as_empty && anyNA (var_value))))
        {
        var_value = def
        err_string_lead = "Default value"
        }

        #-----------------------------------------------------------------
        #  Check the value that will be returned to see if it's numeric,
        #  regardless of whether it is the value that was passed in or
        #  the default value.
        #-----------------------------------------------------------------

    if (! is.numeric (var_value))
        stop_bdpg (paste0 (err_string_lead, " '", var_value,
                      "' used for input variable ", var_name,
                      ", must be numeric"))

        #-----------------------------------------------------------------
        #  Make sure that the range hi and lo are themselves numeric and
        #  are in the proper order to specify a range.
        #-----------------------------------------------------------------

    if (! is.numeric (range_lo))
        stop_bdpg (paste0 (err_string_lead, " '", var_value,
                      "' used for input variable ", var_name,
                      ", range_lo = '", range_lo, "' must be numeric"))

    if (! is.numeric (range_hi))
        stop_bdpg (paste0 (err_string_lead, " '", var_value,
                      "' used for input variable ", var_name,
                      ", range_hi = '", range_hi, "' must be numeric"))

    if (range_lo > range_hi)
        stop_bdpg (paste0 (err_string_lead, " '", var_value,
                      "' used for input variable ", var_name,
                      ", range_lo = '", range_lo, "' must be <= ",
                      "range_hi = '", range_hi, "'"))

        #----------------------------------------------------------------------
        #  Check that the range bounds are specified correctly and that the
        #  var_value to be returned from the function falls within the bounds.
        #
        #  The upper and lower bounds can be either exclusive or inclusive
        #  bounds. This is specified by a 2 character string with the first
        #  character for the lower bound and the second character for the
        #  upper bound.  For each bound, the specifier is an "i" if the bound
        #  is inclusive and an "e" if it's exclusive, e.g., if a value must
        #  be >= to the lower bound and strictly less than the upper bound,
        #  the bounds_type would be "ie".
        #----------------------------------------------------------------------

    if (! is.character (bounds_types))
        stop_bdpg (paste0 (err_string_lead, " '", var_value,
                      "' used for input variable ", var_name,
                      ", bounds_types = '", range_hi, "' must be a string"))

    if (nchar (bounds_types) != 2)
        stop_bdpg (paste0 (err_string_lead, " '", var_value,
                      "' used for input variable ", var_name,
                      ", bounds_types = '", range_hi, "' must be a 2 character string"))

    lower_bound_type = substring (bounds_types, 1, 1)
    if (lower_bound_type == "i")
        {
        if (var_value < range_lo)
            {
            stop_bdpg (paste0 (err_string_lead, " '", var_value,
                          "' used for input variable ", var_name,
                          ", must be >= ", "range_lo = '", range_lo, "'"))
            }
        } else if (lower_bound_type == "e")
        {
        if (var_value <= range_lo)
            {
            stop_bdpg (paste0 (err_string_lead, " '", var_value,
                          "' used for input variable ", var_name,
                          ", must be > ", "range_lo = '", range_lo, "'"))
            }
        } else  #  bounds_type NOT "i" or "e"
        {
        stop_bdpg (paste0 (err_string_lead, " '", var_value,
                      "' used for input variable ", var_name,
                      ", lower_bound_type = '", lower_bound_type, "' must be 'i' or 'e'"))
        }

    upper_bound_type = substring (bounds_types, 2, 2)
    if (upper_bound_type == "i")
        {
        if (var_value > range_hi)
            {
            stop_bdpg (paste0 (err_string_lead, " '", var_value,
                          "' used for input variable ", var_name,
                          ", must be <= ", "range_hi = '", range_hi, "'"))
            }
        } else if (upper_bound_type == "e")
        {
        if (var_value >= range_hi)
            {
            stop_bdpg (paste0 (err_string_lead, " '", var_value,
                          "' used for input variable ", var_name,
                          ", must be < ", "range_hi = '", range_hi, "'"))
            }
        } else  #  bounds_type NOT "i" or "e"
        {
        stop_bdpg (paste0 (err_string_lead, " '", var_value,
                      "' used for input variable ", var_name,
                      ", upper_bound_type = '", upper_bound_type, "' must be 'i' or 'e'"))
        }

    return (var_value)
    }

#===============================================================================

browser_bdpg <- function (browser_on = TRUE,
                          replacement_string = "browser() would execute here.\n")
    {
    if (browser_on) browser() else cat (replacement_string)
    }



#-------------------------------------------------------------------------------

#' Wrapper for stop function to allow extra controls over browsing etc.
#'
#' When stop() is called from code inside tzar emulation, it sometimes
#' crashes abruptly without writing the error message to the console and
#' with no clue about where the crash happened.  This function writes the
#' stop message to the console before calling stop.
#' In addition, it looks at two different flags to see whether it should
#' call browser() before calling stop().
#'
#' Printing the stop message before calling stop will cause
#' the message to be printed twice when not under tzar emulation, however,
#' that's not a big deal compared to not seeing the message at all under
#' emulation.
#'
#' The two flags that the function considers when deciding whether to call
#' browser() are browse_on_crash and bdpg.emulating_tzar.
#'
#' The function
#' first looks to see if a variable called browse_on_crash is visible to it
#' (e.g., has been defined as a global variable).  If it exists, then
#' whatever value it has determines whether browser() is called, i.e., if
#' TRUE, then browser() is called and if FALSE, it's not called.
#'
#' If browse_on_crash does not exist, then the function will check the global
#' options list using the getOption() call to see if a flag has been
#' set there called "bdpg.emulating_tzar".  If it exists and is set to TRUE,
#' then browser() will be called.  If it exists and is set to FALSE or if
#' it does not exist in the options list, then browser() will not be called.
#'
#' There is also an argument called "browser_on" that will probably never be
#' used except during testing of the function.  It's meant to flag whether to
#' actually call browser() when the flags say to do it or whether instead,
#' it should write a replacement string indicating that this is where the
#' browser would be called.  This is just to make it easier to test the
#' function's logic without having to interact with the browser.

#-------------------------------------------------------------------------------

#' @param msg a character string containing the error message to be passed
#'     to the stop function
#' @param browser_on boolean indicating whether to call the browser before
#'     calling stop; TRUE implies browser should be called when the function's
#'     logic says to call the browser and FALSE implies that a replacement
#'     string should be written to the console when the logic says to call
#'     the browser
#'
#' @return No return value
#' @export

#-------------------------------------------------------------------------------

stop_bdpg <- function (msg="Stopping...", browser_on = TRUE)
    {
        #  Write the message that will be passed to stop() on the console
        #  since stop's version is not getting output anywhere when
        #  running under tzar emulation.
        #  This will cause the message to be printed twice when not under
        #  tzar emulation, but that's not a big deal compared to not seeing
        #  the message at all under emulation.

    cat ("\n\nIn stop_bdpg: ", msg, "\n")

        #  If any value is defined for browse_on_crash,
        #  whatever that value is, it overrides anything else.
        #  Generally, it will NOT be defined, but this logic
        #  makes it possible to turn off the tzar-related option below
        #  for a single run or group of runs if you want.

    if (!exists ("browse_on_crash"))
        {
            #  If browse_on_crash variable is not defined, then
            #  check to see if running under tzar emulation.
            #  If so, then do bring up the browser since you're
            #  probably debugging.
            #  If you're not doing tzar emulation, then you're
            #  probably doing normal runs and don't want to
            #  bring up the browser on a crash.

        if (getOption ("bdpg.emulating_tzar", default=FALSE))
            {
            browse_on_crash = TRUE

            } else browse_on_crash = FALSE
        }

    if (browse_on_crash) browser_bdpg (browser_on)

    stop (msg)
    }

#===============================================================================
langfob/bdpg documentation built on Dec. 8, 2022, 5:33 a.m.