R/flower_simulator.R

Defines functions spatial_flowering_grid spatial_flowering_sampler generate_random_spatial_gradient generate_clustered_points

#######################################
# Methods to simulated large scale flowering observations
#######################################

#' @description   Generate x,y points in a clumped fashion. 
#' The number of clump centers is a poisson dist. with lambda=5.
#' Optionally set the the percentage of points n which
#' will be drawn from a spatially uniform distribution. This results
#' in clumped points that are not "too" clumped. 
#' 
#' @param n numeric The number of points to return
#' @param xlimits vector Lower and upper bounds of the x axis
#' @param ylimits vector Lower and upper bounds of the y axis
#' @param uniform_random_percent numeric Percent of samples to be drawn from a spatially uniform, as
#'                                       opposed to clustered, distribution.
#'                                       
#' @return data.frame Two column data.frame of x,y locations 
generate_clustered_points = function(n,
                                     xlimits,
                                     ylimits,
                                     uniform_random_percent=0.2){
  uniform_n = floor(n * uniform_random_percent)
  clumped_n = n - uniform_n
  point_pattern = spatstat::rThomas(5, 0.05, clumped_n*1.5)
  
  # Spatstat will generate a random number of points, so from
  # those choose the n points requested.
  number_of_random_points = length(point_pattern$x)
  if(number_of_random_points < clumped_n){
    # If not enough random points were generated by spatstat try again
    return(generate_clustered_points(n=n, 
                                     xlimits=xlimits,
                                     ylimits=ylimits, 
                                     uniform_random_percent = uniform_random_percent))
  } else {
    uniform_x = runif(n=uniform_n, min=0, max=1)
    uniform_y = runif(n=uniform_n, min=0, max=1)
    random_selection = sample(1:number_of_random_points, size = clumped_n,replace = FALSE)
    x = c(point_pattern$x[random_selection], uniform_x)
    y = c(point_pattern$y[random_selection], uniform_y)
    return(dplyr::tibble(x=x, y=y))
  }
}

#' Generate a non-linear spatial gradient
#' Random draws from a spatial model.
#' x and y are coordinates and thus should be equal length.
#' returns a vector of the same length with values between 0-1 representing
#' a spatially correlated random field
#' 
#' @param x vector Coordinates of sample locations on the x axis
#' @param y vector Coordinates of sample locatiosn on the y axis
#' @param seed numeric random seed to create reproducable results
#' 
#' @return vector A vector of length(x/y) representing the underlying random surface (from 0-1) 
#'                of the respective point.
generate_random_spatial_gradient = function(x,y,seed=NULL){
  # The random surface generated depends on the seed as well as the unique set
  # of points used in predict(variogram_model,...). This functions needs to be
  # able to reproduce the same random surfrance regardless of the points used.
  # Thus it uses the fixed_grid, which combined with the same seed will produce
  # the same random surface. The points of interest (x,y in the arguments) are
  # then calculated from the nearest neighbor of the fixed grid.
  if(!is.null(seed)){
    set.seed(seed)
  }
  fixed_grid = expand.grid(x=seq(0,1,0.01),
                           y=seq(0,1,0.01))
  
  varigram_model = gstat::gstat(formula=z~1, locations=~x+y, dummy=T, beta=0.5, nmax=20,
                                model=gstat::vgm(psill=0.025,model="Gau",range=0.6, nugget = 0))
  p = predict(varigram_model, newdata=fixed_grid, nsim=1)$sim1
  p = (p-min(p)) / (max(p) - min(p)) # scale to 0-1
  
  # in spatstat::nnwhich() all the coordinates, both from the fixed_grid and the sample
  # coordinates x,y need to be input all at once. coordinate_sources differentiates them. 
  coordinate_sources = factor(c(rep('grid_point',nrow(fixed_grid)),rep('sample_point',length(x))))
  
  nearest_point = spatstat::nnwhich(c(fixed_grid$x,x),c(fixed_grid$y,y), by=coordinate_sources)
  # 2 columns returned. One for the nearest grid point, one for the nearest 
  # sample point. 
  if(colnames(nearest_point)[1]!='grid_point'){stop('grid point vs data point alignment not working')}
  is_sample_point = coordinate_sources=='sample_point'
  
  return(p[nearest_point[is_sample_point,1]])
}

#' @description Simulate large scale phenology observations.
#'
#' @param n integer The number of samples to return.
#' @param x 
#' @param y
#' @param sample_type string Either 'po' (default) for presence only, or 'pa' for presence/absence
#' @param fraction_present numeric Between 0-1, the fraction of presence observations. Ignored if sample_type is 'po'.
#' @param distribution_type string The underlying flowering distribution to simulate. 'a' for a normal derived curve, 'b' for
#'                                  a beta derived curve, 'c' for a uniform derived curve. 
#' @param xlimits vector Lower and upper bounds of the x axis spatial domain
#' @param ylimits vector Lower and upper bounds of the y axis spatial domin
#' @param start_doy integer The start day of year (doy) of flowering in the simulation
#' @param flowering_length integer The length of the flowering season
#' @param flowering_gradient numeric  The strenght of the flowering gradient
#' @param spatial_gradient_type string Either 'linear', where the spatial gradient is a function of the y axis, or
#'                                     'non-linear' where it is a function of a random surface. In both cases the day
#'                                     of flowering is shifted by flowering_gradient * beta. Where beta is the y axis
#'                                     for 'linear', or the underlying random surface (scaled 0-1) for 'non-linear'
#' @param clustering bool Whether random samples should be clustered around 1-several points. See generate_clustered_points()
#' @param clustering_uniform_random_percent numeric Percent of samples to be drawn from a spatially uniform, as
#'                                                  opposed to clustered, distribution.
#' @param seed numeric Seed to allow reproducable results
#' 
#' @return data.frame A 5 column data.frame with columns x,y (the coordinates), doy (day of year of the observation),
#'                    flower_present (either 0 or 1. Only 1 if sample_type='po'), and true_start_doy (the true simulated start day of the observation)
#'
spatial_flowering_sampler = function(n,
                                     x=NULL,
                                     y=NULL,
                                     sample_type='po',
                                     # po: presence only, pa: presence and absence
                                     fraction_present=0.8,
                                     distribution_type='a',
                                     xlimits = c(0,1),
                                     ylimits = c(0,1),
                                     start_doy = 90,
                                     flowering_length = 30,
                                     flowering_gradient = 10/0.1,
                                     spatial_gradient_type = 'linear',
                                     # 'linear': gradient is a functions of the y axis, 'non-linear': gradient is a 
                                     # spatially random.
                                     clustering=FALSE,
                                     clustering_uniform_random_percent=0.2,
                                     seed=NULL){
  if(!is.null(seed)){
    set.seed(seed)
  }
  
  if(start_doy + flowering_length>365){
    stop('start_doy plus flowering_length cannot exceed 365')
  }
  
  if(clustering) {
    clustered_points = generate_clustered_points(n=n,
                                                 xlimits=xlimits,ylimits=ylimits,
                                                 uniform_random_percent=clustering_uniform_random_percent)
    x = clustered_points$x
    y = clustered_points$y
  } else if(is.null(x) && is.null(y)) {
    # Generate spatially uniform points      
    x=runif(n=n, min=xlimits[1], max=xlimits[2])
    y=runif(n=n, min=ylimits[1], max=ylimits[2])
  } else {
    # User user supplied points
    if(length(x)!=length(y)){stop('x and y must have the same length')}
    if(length(x)!=n){stop('x and y must be the same as n')}
  }
  
  # make weights for each julian day corresponding to the chance of observing an open flower
  # corresponds to flower abundance
  doy_probabilites = rep(0, 365)
  if(distribution_type=='a'){
    doy_probabilites[start_doy:(start_doy+flowering_length-1)] = dnorm(seq(from=-3,to=3, length.out = flowering_length))
  } else if(distribution_type=='b'){
    doy_probabilites[start_doy:(start_doy+flowering_length-1)] = dbeta(seq(from=0,to=1, length.out = flowering_length), shape1=2, shape2=5)
  } else if(distribution_type=='c'){
    stop('type c not implemented yet')
  }
  
  # presences and absences included
  if(sample_type=='pa'){
    num_presence = ceiling(n*fraction_present)
    num_absence  = n - num_presence
    
    sampled_doy =  sample(1:365, size=num_presence, replace=T, prob = doy_probabilites)
    sampled_doy =  c(sampled_doy,sample(1:365, size=num_absence, replace=T, prob = max(doy_probabilites) - doy_probabilites))
    flower_present = c(rep.int(1,num_presence),rep.int(0,num_absence))
    # only presences
  } else if(sample_type=='po'){
    sampled_doy = sample(1:365, size=n, replace=T, prob = doy_probabilites)
    flower_present = rep.int(1,n)
  } else {
    stop(paste0('unknown sample type: ',sample_type))
  }
  
  true_start_doy = rep.int(start_doy, n)
  
  if(spatial_gradient_type == 'linear'){
    sampled_doy = sampled_doy + round(y*flowering_gradient, 0)
    true_start_doy = true_start_doy + round(y * flowering_gradient, 0)
    
  } else if(spatial_gradient_type == 'non-linear'){
    non_linear_gradient = generate_random_spatial_gradient(x=x,y=y, seed=seed)
    sampled_doy = sampled_doy + round(non_linear_gradient*flowering_gradient, 0) 
    true_start_doy = true_start_doy + round(non_linear_gradient * flowering_gradient, 0)
    
  } else {
    stop(paste0('unknown spatial gradient: ',spatial_gradient_type))
  }
  
  # Sanity check
  if(any(sampled_doy < 1)){
    stop('negative sample doy generated')
  }
  if(any(sampled_doy>365)){
    warning('sampled doy > 365 generated, consider setting more realistic parameters')
  }
  if(any(true_start_doy < 1)){
    stop('negative true_start_doy generated')
  }
  if(any(true_start_doy>365)){
    warning('true_start_doy > 365 generated, consider setting more realistic parameters')
  }
  
  
  return(dplyr::tibble(x=x,y=y,doy=sampled_doy,flower_present=flower_present, true_start_doy=true_start_doy))
}

#' @description Generate an evenly spaced x,y grid of a flowering gradient. All arguments match those used in 
#'              spatial_flowering_sampler(). This is usefull for visualising the underlying phenology of a simulated
#'              phenology run from spatial_flowering_sampler().
#'              
#' @param xlimits vector Lower and upper bounds of the x axis spatial domain
#' @param ylimits vector Lower and upper bounds of the y axis spatial domin
#' @param cell_size numeric The spacing between points within the x,y domain
#' @param start_doy integer The start day of year (doy) of flowering in the simulation
#' @param flowering_length integer The length of the flowering season
#' @param flowering_gradient numeric  The strenght of the flowering gradient
#' @param spatial_gradient_type string Either 'linear', where the spatial gradient is a function of the y axis, or
#'                                     'non-linear' where it is a function of a random surface. In both cases the day
#'                                     of flowering is shifted by flowering_gradient * beta. Where beta is the y axis
#'                                     for 'linear', or the underlying random surface (scaled 0-1) for 'non-linear'
#' @param seed numeric Seed to allow reproducable results
#' 
#' @return data.frame A 3 column data.frame consisting of x,y (coordinates), and onset (the true simulated start day of all points)

#TODO: Example here showing usage with ggplot::geom_raster()

spatial_flowering_grid = function(xlimits = c(0,1),
                                  ylimits = c(0,1),
                                  cell_size = 0.05,
                                  start_doy = 180,
                                  flowering_length = 30,
                                  flowering_gradient = 10/0.1,
                                  spatial_gradient_type = 'linear',
                                  seed=NULL){
  
  full_grid = expand.grid(x=seq(xlimits[1], xlimits[2], by=cell_size),
                          y=seq(ylimits[1], ylimits[2], by=cell_size))
  
  if(spatial_gradient_type == 'linear'){
    full_grid$onset = start_doy + round(full_grid$y * flowering_gradient, 0)
    
  } else if(spatial_gradient_type == 'non-linear'){
    if(is.null(seed)){stop('Seed must be set if generating non-linear spatial gradient')}
    non_linear_gradient = generate_random_spatial_gradient(x=full_grid$x,y=full_grid$y, seed=seed)
    full_grid$onset = start_doy + round(non_linear_gradient * flowering_gradient, 0)
    
  } else {
    stop(paste0('unknown spatial gradient: ',spatial_gradient_type))
  }
  
  return(full_grid)
}
sdtaylor/flowergrids documentation built on Sept. 15, 2021, 1:09 p.m.