R/gen_lognormal_overlay.R

#===============================================================================
#
#                         gen_lognormal_overlay.R
#
#  Functions to generate a vector of counts of PUs to be occupied by a set
#  of species so that the distribution of counts is roughly lognormal.
#  The intention is to add these species to the species already generated
#  by the Xu problem generator so that the resulting combined distribution
#  is roughly lognormal.
#
#  When the distributions are specified here, they are to be given the number
#  of species that have been generated by the Xu problem generator as a
#  count of the number of species to occur on exactly 2 PUs.  The generator
#  here will try to build a lognormal distributions of counts that contains
#  that given number of species occurring on exactly 2 PUs.  Then, when this
#  distribution is returned to the problem generator, that code can add
#  occurrences for all of the other species in the distribution that occur
#  on some number of PUs other than 2 PUs and use the existing Xu species
#  for the ones generated here as occurring on exactly 2 PUs.
#
#  This generator is guaranteed not to return less than the given number
#  of "species on exactly 2 PUs" species.  However, it's possible that
#  it will return more than that.  This is not a problem though, because
#  these extra species that occur on exactly 2 PUs can be treated like any
#  of the other species that occur on some other PU count than 2.
#
#-------------------------------------------------------------------------------
#
#  Various Rmd files of notes generated for exploring lognormal to build the
#  functions in this file are in:
#      .../ProblemDifficulty/ProbDiff_Notes/LognormalAbundanceDistributionBuilding
#
#-------------------------------------------------------------------------------
#
#  History
#   - 2016 04 05 - Created - BTL.
#
#===============================================================================
#===============================================================================

#  Hack to quiet CHECK command for data frame column names that CHECK flags
#  as "no visible binding for global variable".
#  This is the officially sanctioned hack for doing this, e.g., see
#  https://github.com/STAT545-UBC/Discussion/issues/451
#  https://github.com/tidyverse/magrittr/issues/29
#  http://r.789695.n4.nabble.com/globalVariables-td4593980.html

if(getRversion() >= "2.15.1")  utils::globalVariables(c("x"))

#===============================================================================

#-------------------------------------------------------------------------------

#            plot_hist_and_normal_curve_for_sampled_lognormal_data ()
#
#  Function to plot a histogram for a sampled lognormal distribution and
#  then lay over it, the curve of the normal distribution that it is
#  approximating.  (Since it's a lognormal distribution, taking the
#  log of the sampled points should produce a normal distribution.)
#
#-------------------------------------------------------------------------------

plot_hist_and_normal_curve_for_sampled_lognormal_data =
                                function (rounded_abundances, meanlog, sdlog)
    {
        #  Plot histogram of sampled lognormal data.

    hist (log (rounded_abundances), freq=FALSE, breaks=20)

        #  Overlay a curve for the normal distribution
        #  that the sample should approximate.

    curve (dnorm (x, mean=meanlog, sd=sdlog), add=TRUE, lwd=2)

    plot (sort(rounded_abundances, decreasing=TRUE))

    }  #  end function - plot_hist_and_normal_curve_for_sampled_lognormal_data ()

#===============================================================================

#-------------------------------------------------------------------------------

#' Generate rounded abundances
#'
#'  Function to generate a sampled lognormal distribution whose values are
#'  rounded to produce a vector of counts of the number of PUs that each
#'  species will occupy.
#'
#'  No zero counts are allowed, so any species that was assigned a 0 count in
#'  the lognormal generation is removed from the returned vector.
#'  This means that the number of species returned may be less than the value
#'  of tot_num_spp_after_wrapping specified in the inputs.
#'
#-------------------------------------------------------------------------------

#' @param tot_num_spp_after_wrapping integer
#' @param meanlog numeric
#' @param sdlog numeric
#' @param add_one_to_abundances boolean
#' @param plot_rounded_abundances boolean
#'
#' @return Returns a vector with one count for each species

#-------------------------------------------------------------------------------

gen_rounded_abundances = function (tot_num_spp_after_wrapping, meanlog, sdlog,
                                   add_one_to_abundances,
                                   plot_rounded_abundances=FALSE)
    {
        #  Note that "abundance" is used here to mean the number of PUs
        #  that a species occurs on.
    abundances                   = rlnorm (tot_num_spp_after_wrapping, meanlog = meanlog, sdlog = sdlog)
    rounded_abundances           = round (abundances)    #  Returns a vector.
#browser()
        #---------------------------------------------------------------------
        #  rlnorm() often generates a huge number of species that occur on
        #  just 1 patch, which will probably make the problems much easier
        #  since every patch containing a species that only occurs once will
        #  have to be included in the solution.
        #  So, give the option to add one to all of the patch counts as a
        #  way of shifting the distribution slightly.
        #---------------------------------------------------------------------

    if (add_one_to_abundances)
        rounded_abundances = rounded_abundances + 1

    num_spp_on_exactly_2_patches = length (which (rounded_abundances == 2))
    max_abundance_ct             = max (rounded_abundances)

    abundance_data = list()
    abundance_data$rounded_abundances           = rounded_abundances
    abundance_data$num_spp_on_exactly_2_patches = num_spp_on_exactly_2_patches
    abundance_data$max_abundance_ct             = max_abundance_ct

    if (plot_rounded_abundances)
        plot_hist_and_normal_curve_for_sampled_lognormal_data (rounded_abundances,
                                                               meanlog,
                                                               sdlog)

    return (abundance_data)

    }  #  end function - gen_rounded_abundances ()

#===============================================================================
#
#-------------------------------------------------------------------------------

#                           gen_lognormal ()
#
#  No longer used?
#
#  Function to repeatedly generate a sampled lognormal distribution
#  whose values are within specified bounds in terms of the largest
#  element value (max_max_abundance) and the maximum allowed number
#  of individual elements whose value is exactly 2 (max_frac_spp_on_2_PUs).
#

#-------------------------------------------------------------------------------

gen_lognormal = function (tot_num_spp_after_wrapping,
                          max_frac_spp_on_2_PUs, max_max_abundance,
                          min_meanlog = 0.5, max_meanlog = 1.5,
                          min_sdlog = 0.5, max_sdlog = 1.0,
                          add_one_to_abundances,
                          max_iterations
                          )
    {
    cat ("\nAt start of gen_lognormal()",
         ": max_frac_spp_on_2_PUs = ", max_frac_spp_on_2_PUs,
         ", max_max_abundance = ", max_max_abundance,
         "\n")

    frac_of_spp_that_are_on_2_PUs = .Machine$double.xmax    #1000000    #Inf
    max_abundance_ct             = .Machine$double.xmax    #1000000  #Inf
    cur_iteration                = 0

        #-----------------------------------------------------------------------
        #  Draw abundance distributions to try to find a distribution where:
        #       - frac_of_spp_that_are_on_2_PUs <= max_frac_spp_on_2_PUs
        #       AND
        #       - max_abundance_ct <= max_abundance_ct
        #  May not be possible, so also set a limit on the number of iterations
        #  of trying.
        #-----------------------------------------------------------------------

    while ((frac_of_spp_that_are_on_2_PUs > max_frac_spp_on_2_PUs) |
           (max_abundance_ct > max_max_abundance))
        {
        cur_iteration = cur_iteration + 1

        cat ("\n\n================================================================================",
             "\n\niteration: ", cur_iteration, "\n\n")

        if (cur_iteration > max_iterations)
            stop_bdpg (paste0 ("\n\ngen_lognormal() exceeded max_iterations = ",
                  max_iterations,
                  " without finding a distribution satisfying: \n",
                  "\tfrac_of_spp_that_are_on_2_PUs <= max_frac_spp_on_2_PUs = ",
                  max_frac_spp_on_2_PUs,
                  "\tmax_abundance_ct <= max_abundance_ct = ", max_abundance_ct,
                  "\n\n"
                  ))

            #  Randomly draw a new meanlog and sdlog and try them as the basis
            #  of a lognormal distribution.

        meanlog = runif (1, min=min_meanlog, max=max_meanlog)
        sdlog   = runif (1, min=min_sdlog,   max=max_sdlog)
        abundance_data = gen_rounded_abundances (tot_num_spp_after_wrapping,
                                                 meanlog, sdlog,
                                                 add_one_to_abundances,
                                                 plot_rounded_abundances=FALSE)

        rounded_abundances           = abundance_data$rounded_abundances
        num_spp_on_exactly_2_patches = abundance_data$num_spp_on_exactly_2_patches
        max_abundance_ct             = abundance_data$max_abundance_ct

        sorted_rounded_abundances = sort (rounded_abundances, decreasing=TRUE)    #  Returns a vector.
        cat ("\n\nsorted_rounded_abundances = \n")
        print (sorted_rounded_abundances)


    #  2016 06 21 - BTL
    #  Have moved this trimming of 0's into a new function trim_abundances()
    #  that takes care of trimming in a more general way that also allows
    #  trimming of 1's and trimming of species that are too abundant.

        # nonexistant_spp = which (rounded_abundances == 0)
        # num_spp_on_0_patches = length (nonexistant_spp)
        #
        # frac_of_spp_that_are_on_2_PUs =
        #     num_spp_on_exactly_2_patches / (tot_num_spp_after_wrapping - num_spp_on_0_patches)
        #
        # cat ("\n\nnum_spp_on_exactly_2_patches = ", num_spp_on_exactly_2_patches,
        #      "\nnum_spp_on_0_patches = ", num_spp_on_0_patches,
        #      "\nfrac_of_spp_that_are_on_2_PUs = ", frac_of_spp_that_are_on_2_PUs,
        #      "\nmax_abundance_ct = ", max_abundance_ct,
        #      "\nnonexistant_spp = \n")
        # print (nonexistant_spp)
        # cat ("\n")

        }  #  end while - good distribution not found

    cat ("\n\n")

        #  2016 06 21 - BTL
        #  see comment above about trimming...
    # if (num_spp_on_0_patches > 0)
    #     {
    #     rounded_abundances_without_nonexistant_spp = rounded_abundances [-nonexistant_spp]
    #     } else
    #     {
    #     rounded_abundances_without_nonexistant_spp = rounded_abundances
    #     }
    # return (rounded_abundances_without_nonexistant_spp)

    return (rounded_abundances)

    }  #  end function - gen_lognormal ()

#===============================================================================
#
calculate_mu <- function (num_PUs_per_spp_ie_rarity, num_spp_with_given_num_PUs, sigma, plusOrMinus1)
    {
    cat ("\n\nIn calculate_mu():  num_PUs_per_spp_ie_rarity = ", num_PUs_per_spp_ie_rarity,
             ", num_spp_with_given_num_PUs = ", num_spp_with_given_num_PUs,
             ", sigma = ", sigma,
             ", plusOrMinus1 = ", plusOrMinus1,
         "\n\n")

    error_string = ""

    if (num_PUs_per_spp_ie_rarity < 1)
            error_string <-
                paste0 (error_string, "num_PUs_per_spp_ie_rarity = ", num_PUs_per_spp_ie_rarity,
                        ".  Must be >= 1.\n")

    if (num_spp_with_given_num_PUs < 1)
            error_string <-
                paste0 (error_string, "num_spp_with_given_num_PUs = ",
                        num_spp_with_given_num_PUs,
                        ".  Must be >= 1.\n")

    sigma_lower_bound <-
        0.398942 / (num_spp_with_given_num_PUs * num_PUs_per_spp_ie_rarity)

    if (sigma <= sigma_lower_bound)
            error_string <-
                paste0 (error_string, "sigma = ", sigma,
                        ".  Must be > sigma_lower_bound = ", sigma_lower_bound,
                        "\n")

    if (!((plusOrMinus1 == 1) | (plusOrMinus1 == -1)))
            error_string <-
                paste0 (error_string, "plusOrMinus1 = ",
                        plusOrMinus1, ".  Must = 1 or -1.\n")

    if (nchar (error_string) > 0)  stop_bdpg (error_string) else
        {
        return (log (num_PUs_per_spp_ie_rarity) +
                    plusOrMinus1 * 1.41421 *
                    sqrt ( (sigma^2) *
                           log (2.50663 *
                                num_spp_with_given_num_PUs *
                                num_PUs_per_spp_ie_rarity*sigma
                               )
                          )
                )
        }
    }

#===============================================================================
#
#                               EF ()
#
#  Evaluation function (EF) to be used by an optimizer to search for a
#  (meanlog, sdlog) pair that generates a lognormal distribution as close as
#  possible to the specified number of species on 2 PUs and largest abundance
#  count.
#
#  The constraints are as follows:
#
#       - number of species occurring on exactly 2 PUs must be >= to the
#         number given in min_num_spp_on_2_PUs.  It is not allowed to be
#         less than that min.
#           - The minimum used is the number of species in the original
#             Xu problem.
#           - The distance is calculated as:
#                   (num_spp_on_exactly_2_patches - min_num_spp_on_2_PUs)
#                   -----------------------------------------------------
#                               min_num_spp_on_2_PUs
#
#       - the species in the generated lognormal distribution that occurs on
#         the highest number of patches is
#           - less than or equal to the specified max_max_abundance value
#           - as close as possible to the target_max_abundance_ct
#               - It is allowed to be less than, greater than, or equal to the
#                 specified target, but it is penalized proportionally to
#                 its distance from that target.
#           - The distance is calculated as:
#                   abs (max_abundance_ct - target_max_abundance_ct)
#                   ------------------------------------------------
#                               target_max_abundance_ct
#
#  The EF values each of the two constraints equally by minimizing the
#  maximum of the two values, so it can't get a good score by focusing
#  on just one of the criteria.
#
#
    #--------------------------------------------------------------------
    #  It may be useful to keep track of the values visited during the
    #  optimization so that they can be saved for possible later use as
    #  part of a big lookup table in place of running an optimizer.
    #  So, allow specifying an output file (which will later by specified
    #  in the yaml file) and the column headings for the file.
    #  If the value given for the outfile is NULL, however, the values
    #  will not be written out.
    #--------------------------------------------------------------------

#===============================================================================

#-------------------------------------------------------------------------------

#' Evaluation Function to be passed to optim()
#'
#'Note that the "Local Variable Structures and examples" section below was
#'not labelled as belonging to a function "EF".  It was labelled as belonging
#'to a function "fn".  I'm assuming that the variables actually belong to EF
#'because "fn" is the argument name that EF gets assigned to when passed to
#'optim().
#'
#-------------------------------------------------------------------------------

#' @param seed_value integer
#' @param tot_num_spp_after_wrapping integer
#' @param min_num_spp_on_2_PUs integer
#' @param max_max_abundance integer
#' @param target_max_abundance_ct integer
#' @param add_one_to_abundances boolean
#' @param outfile character string
#' @param mean_sd_pair 2 element numeric vector containing the mean and
#'     standard deviation to use for the lognormal
#'
#' @return Returns numeric score from evaluating the given lognormal; the
#'     closer to 0 the better
#' @export

#-------------------------------------------------------------------------------

EF <- function (

seed_value,
               tot_num_spp_after_wrapping,
               min_num_spp_on_2_PUs,
               max_max_abundance,
               target_max_abundance_ct,
               add_one_to_abundances,
               outfile,
               mean_sd_pair)
    {
    cat ("\n\nStarting EF:",
"\n    seed_value              = ", seed_value,
         "\n    tot_num_spp_after_wrapping     = ", tot_num_spp_after_wrapping,
         "\n    min_num_spp_on_2_PUs    = ", min_num_spp_on_2_PUs,
         "\n    max_max_abundance       = ", max_max_abundance,
         "\n    target_max_abundance_ct = ", target_max_abundance_ct,
         "\n    add_one_to_abundances   = ", add_one_to_abundances,
         "\n    mean_sd_pair [1]        = ", mean_sd_pair [1],
         "\n    mean_sd_pair [2]        = ", mean_sd_pair [2],
         "\n", sep='')

#set.seed (seed_value)
    abundance_data = gen_rounded_abundances (tot_num_spp_after_wrapping,
                                             mean_sd_pair [1],
                                             mean_sd_pair [2],
                                             add_one_to_abundances,
                                             plot_rounded_abundances=FALSE)

    num_spp_on_exactly_2_patches = abundance_data$num_spp_on_exactly_2_patches
    max_abundance_ct             = abundance_data$max_abundance_ct

#     cat ("\n\n    num_spp_on_exactly_2_patches        = ", num_spp_on_exactly_2_patches, sep='')
#     cat ("\n    max_abundance_ct                    = ", max_abundance_ct, sep='')
#     cat ("\n    target_max_abundance_ct             = ", target_max_abundance_ct, sep='')

    warn_on_constraint_violation = TRUE
    if (warn_on_constraint_violation)
        {
        if (num_spp_on_exactly_2_patches < min_num_spp_on_2_PUs)
            {
            cat ("\n\n    EF returning large value due to constraint violation ",
                 "in lognormal generator: ",
                 "\n    num_spp_on_exactly_2_patches (", num_spp_on_exactly_2_patches,
                 ") < min_num_spp_on_2_PUs (", min_num_spp_on_2_PUs, ")\n", sep='')
            }

        if (max_abundance_ct > max_max_abundance)
            {
            cat ("\n\n    EF returning large value due to constraint violation ",
                 "in lognormal generator: ",
                 "\n    max_abundance_ct (", max_abundance_ct,
                 ") > max_max_abundance (", max_max_abundance, ")\n", sep='')
            }
        }

    double_diff_penalty_exponent = 5    #  Should be an input option.
    double_diff = abs (num_spp_on_exactly_2_patches - min_num_spp_on_2_PUs)

    if (num_spp_on_exactly_2_patches < min_num_spp_on_2_PUs)  #  More heavily penalize undershooting num on 2 patches
          {
                #  Adding 1 to the double_diff because if double_diff has
                #  an absolute value of 1, raising it to a power will not
                #  change its value.

          double_diff_penalty = (double_diff + 1) ^ double_diff_penalty_exponent
          double_diff = double_diff + double_diff_penalty
          }

    max_abundance_penalty_exponent = 2    #  Should be an input option.
    max_abundance_diff = abs (max_abundance_ct - target_max_abundance_ct)
    if (max_abundance_ct > max_max_abundance)
          {
          max_abundance_penalty =
              (max_abundance_ct - max_max_abundance) ^
              max_abundance_penalty_exponent
          max_abundance_diff = max_abundance_diff + max_abundance_penalty
        }

    double_frac        = double_diff / min_num_spp_on_2_PUs
    max_abundance_frac = max_abundance_diff / target_max_abundance_ct

#         cat ("\n    double_frac        = ", double_frac, sep='')
#         cat ("\n    max_abundance_frac = ", max_abundance_frac, sep='')

    score = max (double_frac, max_abundance_frac)


#     if ((num_spp_on_exactly_2_patches < min_num_spp_on_2_PUs) |
#         (max_abundance_ct > max_max_abundance))
#         {
#         score = .Machine$double.xmax   #1000000    #Inf
#
#         }


#     if ((num_spp_on_exactly_2_patches < min_num_spp_on_2_PUs)  |
#         (max_abundance_ct > max_max_abundance))
#         {
#         score = .Machine$double.xmax   #1000000    #Inf
#
#         } else
#         {
#         double_diff        = num_spp_on_exactly_2_patches - min_num_spp_on_2_PUs
#         max_abundance_diff = abs (max_abundance_ct - target_max_abundance_ct)
#
# #         cat ("\n    double_diff        = ", double_diff, sep='')
# #         cat ("\n    max_abundance_diff = ", max_abundance_diff, sep='')
#
#         double_frac        = double_diff / min_num_spp_on_2_PUs
#         max_abundance_frac = max_abundance_diff / target_max_abundance_ct
#
# #         cat ("\n    double_frac        = ", double_frac, sep='')
# #         cat ("\n    max_abundance_frac = ", max_abundance_frac, sep='')
#
#         score = max (double_frac, max_abundance_frac)
#         }

        #  Keep track of search iteration number.
        #  Doing this through a global variable since it's hard or impossible
        #  to update its value on each EF call inside the generic R optimizer
        #  function.
#    EF_num <<- EF_num + 1
    EF_num = getOption ("bdpg.EF_num") + 1
    options (bdpg.EF_num = EF_num)

        #  If echoing EF value to an output file, do it now.
    if (! is.null (outfile))
        {
        out_record = paste (EF_num,
                            score,
                            mean_sd_pair [1],
                            mean_sd_pair [2],
                            double_frac,
                            max_abundance_frac,
                            min_num_spp_on_2_PUs,
                            target_max_abundance_ct,
                            max_max_abundance,
                    seed_value,
                            tot_num_spp_after_wrapping,
                            add_one_to_abundances,
                            sep=',')

        cat (out_record, file=outfile, sep="\n", append=TRUE)
        }

    cat ("\nEF RETVAL ", EF_num, ": ", score, sep='')

    return (score)

    }  #  end function - EF ()

#===============================================================================

#-------------------------------------------------------------------------------

#' Search for approximating lognormal
#'
#' Function to search for a lognormal that comes close to meeting the
#'  constraints specified in the argument list, i.e.,
#'  \enumerate{
#'      \item{to generate roughly the given number of species but never more
#'          than the given number (tot_num_spp_after_wrapping)}
#'      \item{to come close to the given number of species that occur on
#'          exactly 2 PUs without ever falling below that minimum value
#'          (min_num_spp_on_2_PUs)}
#'      \item{to have the most abundant generated species have roughly
#'          the target value (target_max_abundance_ct), but never exceed
#'          the given max value (max_max_abundance).}
#'}
#'
#-------------------------------------------------------------------------------

#' @param seed_value random seed to pass to optim function
#' @param tot_num_spp_after_wrapping number of species to generate in distribution
#' @param min_num_spp_on_2_PUs minimum number of species that occur on exactly 2 patches in distribution
#' @param max_max_abundance largest maximum abundance allowed in distribution
#' @param target_max_abundance_ct desired maximum abundance in distribution
#' @param initial_meanlog initial value of lognormal's meanlog in search for distribution
#' @param initial_sdlog initial value of lognormal's sdlog in search for distribution
#' @param max_iterations maximum number of iterations allowed for search
#' @param add_one_to_abundances boolean flag to indicate whether to add 1 to all abundances
#' @param outfile output file
#'
#' @return Returns the best lognormal found in the given maximum number
#'  of search iterations (max_iterations).  The value returned is a structure
#'  containing the following information about the best lognormal curve found:
#'       $meanlog - the meanlog used in specifying the curve, e.g., 1.035326
#'       $sdlog   - the sdlog used in specifying the curve, e.g., 0.9283901
#'       $score   - the EF score for the curve, i.e., how well does it fit the
#'                  given constraints (where smaller is better), e.g., 0.1
#'       $abundance_data - a structure containing specifics of the curve, e.g.,
#'           $abundance_data$rounded_abundances - a vector of the number of PUs
#'                                                each species occurs on, e.g.,
#'   [1]  2  4  3  2  7  3  8 15  5  5  9  6 10  4 13  4  7  4  6  6  6  2  2 14  3  9  4  5  5
#'  [30]  2  2  2  2  1  2  3  2  2  4  3  5  3  3 10 17  4 15  3  6  2  5 28  6  4  2 10  2  3
#'  [59]  8 23  4  2  5  2  4  4  3  8  8  3  6  2  2  1  4  3  1  5  3 16  2  6  9  3  2  5  7
#'  [88]  4  3  4  6  2  4  3  3  6  2  3  5  3
#'           $abundance_data$num_spp_on_exactly_2_patches - number of species
#'                                                          that occur on exactly
#'                                                          2 PUs, e.g., 22
#'           $abundance_data$max_abundance_ct - number of PUs that the most
#'                                              abundant species occurs on,
#'                                              e.g., 28
#' @export
#'
#-------------------------------------------------------------------------------
#  Other search algorithm possibilities:
#
#         Pragmatic Programming Techniques
#         Monday, January 14, 2013
#         Optimization in R
#         http://horicky.blogspot.com.au/2013/01/optimization-in-r.html
#
#         Optimising a Noisy Objective Function
#         http://www.exegetic.biz/blog/2013/07/optimising-a-noisy-objective-function/
#         Posted by Andrew Collier on 16 July 2013.
#
#         DEoptim {DEoptim}
#         http://www.inside-r.org/packages/cran/DEoptim/docs/DEoptim
#         Differential Evolution Optimization
#         Package:
#         DEoptim
#         Version:
#         2.2-3
#         Description
#         Performs evolutionary global optimization via the Differential Evolution algorithm.
#         Usage
#         DEoptim(fn, lower, upper, control = DEoptim.control(), ..., fnMap=NULL)
#-------------------------------------------------------------------------------

search_for_approximating_lognormal <- function (
                                        seed_value,
                                                tot_num_spp_after_wrapping,
                                                min_num_spp_on_2_PUs,
                                                max_max_abundance,
                                                target_max_abundance_ct,
                                                initial_meanlog,
                                                initial_sdlog,
                                                max_iterations,
                                                add_one_to_abundances,
                                                outfile
                                                )
    {

    cat ("\n\nStart of search_for_approximating_lognormal:",
"\n    seed_value = ", seed_value,
         "\n    tot_num_spp_after_wrapping = ", tot_num_spp_after_wrapping,
         "\n    min_num_spp_on_2_PUs = ", min_num_spp_on_2_PUs,
         "\n    max_max_abundance = ", max_max_abundance,
         "\n    target_max_abundance_ct = ", target_max_abundance_ct,
         "\n    initial_meanlog = ", initial_meanlog,
         "\n    initial_sdlog = ", initial_sdlog,
         "\n    max_iterations = ", max_iterations,
         "\n    add_one_to_abundances = ", add_one_to_abundances,
         "\n    outfile = ", outfile,
         "\n\n"
         )

        #  If echoing EF value to an output file,
        #  write the column headers to the outfile.

    if (! is.null (outfile))
        {
        cat (paste0 ("\nEF_num,score,meanlog,sdlog,double_frac,",
                     "max_abundance_frac,min_num_spp_on_2_PUs,",
                     "target_max_abundance_ct,max_max_abundance,",
                     "seed_value,tot_num_spp_after_wrapping,add_one_to_abundance",
                     sep=''),
             file=outfile, sep="\n")
        }

        #-------------------------------------------------------------------
        #  Run the optimizer.
        #  Hints on how to use optim() derived from text and code at:
        #      http://machinelearningmastery.com/convex-optimization-in-r/
        #-------------------------------------------------------------------

    result <- optim (par = c(initial_meanlog, initial_sdlog),
                     EF,
                     control=c(maxit=max_iterations),
                seed_value = seed_value,
                     tot_num_spp_after_wrapping = tot_num_spp_after_wrapping,
                     min_num_spp_on_2_PUs = min_num_spp_on_2_PUs,
                     max_max_abundance = max_max_abundance,
                     target_max_abundance_ct = target_max_abundance_ct,
                     add_one_to_abundances = add_one_to_abundances,
                     outfile = outfile
                     )

#set.seed (seed_value)
    abundance_data = gen_rounded_abundances (tot_num_spp_after_wrapping,
                                             result$par [1],
                                             result$par [2],
                                             add_one_to_abundances)

    lognormal_result = list (meanlog = result$par [1],
                             sdlog = result$par [2],
                             score = result$value,
                             abundance_data = abundance_data
                            )


    cat ("\n\n=============>  FINAL RESULT  <=============\n\n")
    cat ("\nscore = ", result$value,
         ", meanlog = ", result$par [1], ", sdlog = ", result$par [2],
         "\nmin_num_spp_on_2_PUs = ", min_num_spp_on_2_PUs,
         ", abundance_data$num_spp_on_exactly_2_patches = ",
         abundance_data$num_spp_on_exactly_2_patches,
         "\ntarget_max_abundance_ct = ", target_max_abundance_ct,
         ", max_max = ", max_max_abundance,
         ", abundance_data$max_abundance_ct = ",
         abundance_data$max_abundance_ct,
         "\n\n", sep='')

    show_abundance_data = FALSE    #  Should be an input option or gone altogether...
    if (show_abundance_data)
        {
        print (lognormal_result)
        cat ("\n")
        }

    return (lognormal_result)

    }  #  end function - search_for_approximating_lognormal ()

#===============================================================================

#-------------------------------------------------------------------------------

#' Find lognormal to wrap around Xu problem
#'
#' Uses optimizer to search for parameters of a lognormal distribution that
#' will have at least as many species occurring on exactly 2 patches as the
#' given Xu problem and meet the other curve shape criteria given in the
#' argument list.
#'
#-------------------------------------------------------------------------------

#' @param Xu_bdprob a Xu_bd_problem object
#' @param desired_Xu_spp_frac_of_all_spp numeric
#' @param solution_frac_of_landscape numeric
#' @param desired_max_abundance_frac numeric
#' @param seed_value_for_search integer
#' @param max_search_iterations integer
#' @param add_one_to_lognormal_abundances boolean
#' @param search_outfile_name character string
#' @inheritParams std_param_defns
#'
#' @return Returns numeric vector of abundances

#-------------------------------------------------------------------------------

find_lognormal_to_wrap_around_Xu = function (Xu_bdprob, parameters,
                                             desired_Xu_spp_frac_of_all_spp,
                                             solution_frac_of_landscape,
                                             desired_max_abundance_frac,
                    seed_value_for_search,
                                             max_search_iterations,
                                             add_one_to_lognormal_abundances,
                                             search_outfile_name)
    {
    cat ("\n\n>>>>>>>>>>>>>>>>>>>>>>  ABOUT TO find_lognormal_to_wrap_around_Xu()  <<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n")

    #---------------------------------------------------------------------------

        #  Parameters related to and/or derived from the Xu problem.

    Xu_tot_num_spp       = Xu_bdprob@num_spp    #  all spp in Xu are on 2 PUs
    Xu_tot_num_PUs       = Xu_bdprob@num_PUs    #  num dep nodes + num indep nodes

    Xu_nodes             = Xu_bdprob@nodes
    Xu_dep_set           = get_dependent_node_IDs (Xu_nodes)
    num_Xu_dep_set_nodes = length (Xu_dep_set)

    #---------------------------------------------------------------------------

        #  Derived control parameters.

    min_num_spp_on_2_PUs       = Xu_tot_num_spp  #  Try for all 2's occurring only in the dependent set

    tot_num_PUs_in_landscape = round (num_Xu_dep_set_nodes / solution_frac_of_landscape)
    if (tot_num_PUs_in_landscape < Xu_tot_num_PUs)
        {
        warning (paste0 ("\n\ntot_num_PUs_in_landscape (",
                         tot_num_PUs_in_landscape, ") < Xu_tot_num_PUs (",
                         Xu_tot_num_PUs, ").  ",
#                         "\nResetting tot_num_PUs_in_landscape to ",
#                         "Xu_tot_num_PUs, ",
#                         "\nwhich resets solution_frac_of_landscape from ",
#                         solution_frac_of_landscape, " to ",
#                         num_Xu_dep_set_nodes / Xu_tot_num_PUs,
                         "\n\n"
                         ))
#        solution_frac_of_landscape = num_Xu_dep_set_nodes / Xu_tot_num_PUs
        }

        #-----------------------------------------------------------------------
        #  2018 12 26 - BTL
        #  Replacing num_spp_to_generate with tot_num_spp_after_wrapping
        #  because num_spp_to_generate is ambiguous, i.e., is it the number
        #  of new species to add in the wrap or is it the total number of
        #  species in the final wrapped distribution including the original
        #  Xu species?  The correct answer is the total number of species
        #  in the final wrap, but the code that was here doesn't do that.
        #
        #  I supposedly fixed a bug in this back on 2018 11 08 in commit
        #  de06807e "Fix bug in computing num_spp_to_generate for wrap".
        #  Instead of fixing it, I made it calculate the number of new species
        #  to add in the wrap (i.e., excluding the original Xu species),
        #  however, it needs to be the total number of species in the final
        #  wrapped distribution.  This led to fewer species in the wrapped
        #  distribution, which still worked in the sense that it didn't
        #  usuall crash the wrapping code, but it made the fraction of Xu
        #  species in the final distribution slightly off.
        #  However, when the desired_Xu_spp_frac_of_all_spp was 50%, it did
        #  crash the wrapping code because it meant that the lognormal search
        #  was trying to fit a lognormal that had exactly the same number of
        #  total points as points on exactly 2 PUs.
        #
        #  So, I'm changing the name to make it clearer exactly what the
        #  aim of this computation is and changing the computation itself so
        #  that it gets to that aim instead of the incorrect aim that was
        #  introduced in the November "bug fix".  Hopefully, this time it
        #  really IS a fix.  To that end, I'll also add a validation test
        #  after the wrap is built and in that test, make sure that the
        #  desired fraction is met.
        #-----------------------------------------------------------------------
        #    OLD (INCORRECT) CODE that calculated num new spp to add:
        #        num_spp_to_generate =
        #            round (Xu_tot_num_spp * (1 - desired_Xu_spp_frac_of_all_spp) /
        #                                    desired_Xu_spp_frac_of_all_spp)
        #-----------------------------------------------------------------------
        #    CORRECT CODE:
    tot_num_spp_after_wrapping = round (Xu_tot_num_spp / desired_Xu_spp_frac_of_all_spp)
        #-----------------------------------------------------------------------

    max_abundance_frac         = min (1.0,
                                      max (0,
                                           desired_Xu_spp_frac_of_all_spp,
                                           desired_max_abundance_frac))
    target_max_abundance_ct    = round (tot_num_PUs_in_landscape *
                                        max_abundance_frac)  #  Desired max number of patches to allow any species to occur on.
    max_max_abundance_ct       = tot_num_PUs_in_landscape  #  Max number of patches to allow any species to occur on.

    #---------------------------------------------------------------------------

        #  Try to find a good starting point for the search algorithm.
        #
        #  Calculate some initial start points for the optimization based on
        #  choosing an sdlog value and solving for mu given that sdlog.
        #  Hopefully, this will get you to a start point that is somewhere
        #  near a decent solution.
        #  If the calculated value is illegal (which is checked by the
        #  calculate_mu() routine) or if the optimization fails or returns
        #  a large EF value, then try some other combinations of sigma and
        #  either the bottom rarity (i.e., spp on exactly 2 PUs) or the
        #  top rarity (i.e., spp on the max allowed abundance ct or less).
        #
        #  AT THE MOMENT, I'M JUST TRYING ONE OF THESE TO SEE IF THINGS RUN.
        #  2016 05 28 - BTL.

    num_PUs_per_spp_ie_rarity <- 2
    num_spp_with_given_num_PUs <- min_num_spp_on_2_PUs
    sigma <- 0.1
    plusOrMinus1 <- 1
    mu <- calculate_mu (num_PUs_per_spp_ie_rarity,
                        num_spp_with_given_num_PUs,
                        sigma,
                        plusOrMinus1)

    initial_meanlog_for_search <- mu
    initial_sdlog_for_search <- sigma

    #---------------------------------------------------------------------------

        #  Need to do a couple of quick sanity checks on those parameters
        #  to make sure that they can't generate nonsense or a crash?
        #  Just a dummy placeholder routine until I know what needs checking.

    do_sanity_checks ()

    #---------------------------------------------------------------------------

        #  Ready to search for a useful lognormal distribution, i.e.,
        #  one that approximately fits the desired attributes passed in.

#    EF_num <<- 0
    options (bdpg.EF_num = 0)

    lognormal_search_results =
        search_for_approximating_lognormal (

seed_value_for_search,
                                            tot_num_spp_after_wrapping,
                                            min_num_spp_on_2_PUs,
                                            max_max_abundance_ct,
                                            target_max_abundance_ct,
                                            initial_meanlog_for_search,
                                            initial_sdlog_for_search,
                                            max_search_iterations,
                                            add_one_to_lognormal_abundances,
                                            search_outfile_name
                                            )

    rounded_abundances = lognormal_search_results$abundance_data$rounded_abundances
#browser()
    plot (sort(rounded_abundances, decreasing = TRUE))
#max_abund = max (rounded_abundances)
#normalized_abundances_xxx = rounded_abundances / max_abund
#plot (normalized_abundances_xxx)

        #-------------------------------------------------------------------
        #  Make sure that the in the final wrapped distribution,
        #  the original Xu species make up the desired fraction of all
        #  the species together (since there was a bug in that before).
        #  Creating some intermediate variables here that arent' strictly
        #  necessary but make it easier to test and to generate error msg.
        #-------------------------------------------------------------------

    tot_num_spp_in_wrapped_spp_set = length (rounded_abundances)
    Xu_spp_frac_of_full_wrapped_spp_set =
        Xu_tot_num_spp / tot_num_spp_in_wrapped_spp_set

    if (abs (Xu_spp_frac_of_full_wrapped_spp_set -
             desired_Xu_spp_frac_of_all_spp) > 0.000001)
        {
        stop_bdpg (paste0 ("\nXu_spp_frac_of_full_wrapped_spp_set (",
                           Xu_spp_frac_of_full_wrapped_spp_set,
                           ") must equal desired_Xu_spp_frac_of_all_spp (",
                           desired_Xu_spp_frac_of_all_spp, ").\n"))
        }

    return (rounded_abundances)

    }  #  end function - find_lognormal_to_wrap_around_Xu ()

#===============================================================================
langfob/bdpg documentation built on Dec. 8, 2022, 5:33 a.m.