In akeyel/behlpr: Functions to run my biodiversity analysis

Introduction to vignette

FILL IN PLEASE!!! Note to self: Probably will want to split into three vignettes from an organization standpoint

Index to analyses

1) Simulated data analysis to test analysis tools

2) Proof-of-concept analysis to get things working - simplified inputs - illustrates approach a) Species Distribution Model b) SpatialDemography Model c) Comparison

3) Main analysis a) Run Species Distribution Model b) Run SpatialDemography Model c) Compare statistical model with process model

# Set up common information for all data runs
spath = "C:/docs/beplants/Scripts/"
in.path = "C:/docs/beplants/drafts/BE_data_paper/R_inputs/"
out.path = "C:/docs/beplants/drafts/BE_data_paper/R_outputs/"
DispPath = sprintf("%sdispersal_tables/", spath) # Path to dispersal tables

#install.packages(be_hlpr) #Supporting package needs to be installed in order to run the below code

# Load packages
library(spatialdemography)
setwd(spath)

if (dev.mode == 0){
  library(be_hlpr)
}else{
  source("be_hlpr/R/be_hlpr.r")
  source("be_hlpr/R/be_sdm_hlpr.r")
  source("be_hlpr/R/be_spdem_hlpr.r")
  library(myr) #Make this a package dependency?
  }

1) Simulated data analysis to test analysis tools

Here, we examine a 2 x 2 landscape (see Keyel et al. in review) to examine the analysis tools used here. We assumed that the initial deterministic spatialdemography model run defined reality.

We then evaluated the ability of the species distribution model (SDM) to identify meaningful response traits. We then re-ran the simulation model using the estimated response traits (from the SDM), and compared this to the "reality" established by the initial model run.

Concerns with this approach: The SDM will have an incredibly small sample size to work from, given a 2 x 2 landscape Possible solutions: - can increase landscape size if necessary The metacommunity model will have an advantage, as "reality" was derived from a model run. - can do an example the other way - have the species distribution model define "reality" and then examine model performance.

But let's see how it performs first, before I over-analyze things. Also, the main purpose of this simulation is to check that the code works, not to produce biologically interesting results. And neither of the above should interfere with checking the technical implementation of the script!

sim.path = sprintf("%ssimulation/", in.path)
ipath = sprintf("%sinputs/", sim.path)
lpath = sprintf("%slandscape/", sim.path)
opath = sprintf("%ssimulation/", out.path)
if (!file.exists(opath)){ dir.create(opath)}
scale.vec = "landscape" # Set scale of evaluation
sp.locs = sprintf("%s/inputs/locations_spdem_ex1.csv", sim.path)

## Generate "Reality"
run.name = "simulation"
run.path = opath #**# see if this fixes my problem (even if the results won't be in sd_runs)
file.ending = "spdem_ex1"

# Set up species file and initial locations of species
spfile = sprintf("%s/Species/Species_spdem_ex1.csv", sim.path)

spdem = do.spatial.demography(run.name, run.path, spfile, sp.locs, ipath, file.ending, lpath, DisPath, opath, scale.vec)

## Run species distribution model

# Set up basic settings
sdm.type = 1 # 1 = simple, 2 = with biomod2, 3 = with previous data

# set up species data: extract SDM input from metacommunity output
sp.data = spdem.extract(sp.locs) 

# Extract land use data 
lu.path = "C:/docs/beplants/drafts/BE_data_paper/R_inputs/simulation/landscape/landcover_scn1.csv"
#NOTE: this won't capture the response to the herbicide. And is currently based on the first timestep. Needs fixing if more than one timestep is desired (which at the moment is not the case!). Also note that the landuse file uses changesteps instead of timesteps, so a more complicated mapping would be needed. I leave that to another programmer, or another day.

#herb.path = etwas
predictors.vec = c("landuse-sim") # ,"herbicide-sim"
predictors.paths = c(lu.path) # , herb.path
out.info = setup.predictors(sp.data, predictors.vec, predictors.paths)

my.data = out.info[[1]]
my.index = out.info[[2]]

# Only select the last land use column #**# Automated solution would be helpful here - this is done manually using knowledge.
col.index = rep(0,ncol(my.data))
col.index[ncol(my.data)] = 1 

# Do basic descriptive analysis of data
#**# Think critically about what descriptive info you want and the implications for your analysis
#do.descriptives(my.data, my.index) #**# NEEDS CODING

# Run SDM analysis
sdm.out = do.sdm(my.data, my.index, col.index, sdm.type)
validation.diagnostics.lst = sdm.out[[1]]
thresholds = sdm.out[[2]] #**# Need a way to make this more general if we start comparing linear models too. #**# Think about saving thresholds to file, to make it easier to save them!

# Visually examine quality of sdm results for all species
check.sdm.quality(validation.diagnostics.lst, "Land Use")

# Visualize SDM analysis
#visualize.sdm.main(sdm.out)

# Create table output of results for further analysis
model.type = "SDM-simulation-threshold"
model.name = "landuse"
results.table = sprintf("%sResults.csv", opath)
sp.names = names(sp.data[2:length(sp.data)]) # Drop plotyear column, but retain all other species names.
my.sdm.results = output.results.to.table(model.type, model.name, results.table, sp.names, validation.diagnostics.lst, do.append = FALSE) 


## Run SpatialDemography model
# NOTE: Some inputs used below were defined above.

run.path = sprintf("%s/spdem_sdm/", opath)
run.name = "spdem_sdm"
file.ending = "spdem_sdm"
inputs.ending = "spdem_ex1"

# Set up species file & initial locations outside of function, then provide as input
spbase = sprintf("%s/Species/Species_base.csv", sim.path)
spfile = sprintf("%s/Species/Species_sdm.csv", sim.path)

prepare.spfile(spbase, spfile, thresholds)

spdem.out = do.spatial.demography(run.name, run.path, spfile, sp.locs, ipath, file.ending, lpath, DispPath, opath,  scale.vec, inputs.ending)

# Data from last model run (#**# improve acquisition of paths)
v.path = "C:/docs/beplants/drafts/BE_data_paper/R_outputs/simulation/spdem_sdm/Example1/"
m.path = "C:/docs/beplants/drafts/BE_data_paper/R_outputs/simulation/Example1/"
validation.file = sprintf("%sSpeciesData.csv", v.path)
model.file = sprintf("%sSpeciesData.csv", m.path)
validation.timestep = 12
model.timestep = 12

n.sp = 3
# Compare model-produced results to observed data
validation.diagnostics.lst = evaluate.spatialdemography(validation.file, model.file, validation.timestep, model.timestep, n.sp)

# Write results to table
model.type = "metacommunity"
model.name = "landuse"
sp.names = c("sp1","sp2","sp3")
my.spdem.results = output.results.to.table(model.type, model.name, results.table, sp.names, validation.diagnostics.lst)

# Examine results table
combined.results = read.csv(results.table)
combined.results

2) Proof-of-concept analysis

dpath = "C:/docs/beplants/datasets/"
opath = sprintf("%s/proof_of_concept/", out.path)

### Run Species Distribution model

# Set up basic settings
#out.pdf = "C:/docs/beplants/Scripts/be_sdm/y2009.pdf"
sdm.type = 1 # 1 = simple, 2 = with biomod2, 3 = with previous data
#drop.rare = 0

# Set up species data
sp.file = sprintf("%s/combined/SpeciesDataBE.csv", dpath)
sp.lookup.file = sprintf("%s/combined/species_lookup.csv", dpath)
sp.data = read.sp.data(sp.file, sp.lookup.file)
#head(sp.data) # comes out organized with a column for every species, with a single plotyear column as ID.

# Drop many species to speed up analysis (can optimize later)
sp.data = sp.data[ ,1:5]

# Set paths to predictor variables
# Potential predictor variables: c("climate", "landuse", "biotic", "isolation")
lu.path = sprintf("%sBExIS/landuse/16029.csv", dpath)

# Run analysis just for land use
predictors.vec = c("landuse")
predictors.paths = c(lu.path)
out.info = setup.predictors(sp.data, predictors.vec, predictors.paths)
my.data = out.info[[1]]
my.index = out.info[[2]]
#col.index = out.info[[3]] #**# THis has yet to be coded
col.index = rep(0,ncol(my.data))
col.index[(ncol(my.data) - 3):ncol(my.data)] = 1 # Try to just select the last 4 land use columns. #**# This is a hacky patch & requires a much better coding solution

# Do basic descriptive analysis of data
#**# Think critically about what descriptive info you want and the implications for your analysis
#do.descriptives(my.data, my.index)

# Drop rare species (if desirable)
#if (drop.rare == 1){
#  stop("This has not been retested since last modification")
#  out.info = drop.rare.species(my.data, my.index, col.index)
#  my.data = out.info[[1]]
#  my.index = out.info[[2]]
#  #col.index = out.info[[3]] #**# this has yet to be coded  
#  }

# Run SDM analysis
sdm.out = do.sdm(my.data, my.index, col.index, sdm.type)
validation.diagnostics.lst = sdm.out[[1]]
thresholds = sdm.out[[2]] #**# Need a way to make this more general if we start comparing linear models too.

# Visually examine quality of sdm results for all species
check.sdm.quality(validation.diagnostics.lst, "Land Use")

# Visualize SDM analysis
#visualize.sdm.main(sdm.out)

# Create table output of results for further analysis
model.type = "SDM-threshold"
model.name = "landuse"
results.table = sprintf("%sResults.csv", opath)
sp.names = names(sp.data[2:length(sp.data)]) # Drop plotyear column, but retain all other species names.
my.sdm.results = output.results.to.table(model.type, model.name, results.table, sp.names, validation.diagnostics.lst) 

### Run SpatialDemography

#Set up paths for SpatialDemography
template.dir = sprintf("%sinput_templates/", in.path) # Path to templates to provide the basis for the analyses
lpath = "C:/docs/beplants/drafts/BE_data_paper/R_inputs/input_templates/landscape/"
file.ending = "BE"  

# Set up species file
sp.base = sprintf("%s/Species/spfile_base.csv", template.dir) 
spfile = sprintf("%s/Species/spfile_spdem.csv", template.dir) 
prepare.spfile(sp.base, spfile, thresholds)

# Set up basic parameters for simulation
start.year = 2009 #**# Arbitrarily chosen starting year
pool.type = "plot" # region & all are other anticipated options
landscape.extent = 5 #**# can try different landscape sizes
n.sim = 1
n.plots = 150
n.sp = ncol(sp.dat) - 1 # Remove plotyear from count

# Restrict data set to starting year #**# Note: species pools should be set up prior to this step
spdem.out = spdem.prep(my.data, start.year, col.index) # Initialize a new data object (should I just use my.data?)
sp.dat = spdem.out[[1]]
land.dat = spdem.out[[2]]

# get number of cells in landscape
n.cells = landscape.extent ^ 2 

# get position of focal cell
focal.cell.position = floor(median(seq(1,n.cells))) # median of the sequence gets the middle number. Floor ensures that it is a whole number.

# Set up model validation data
#**# Non-optimal approach - convert my.data to a SpeciesData.csv format to allow me to use an existing function.
model.timestep = 2010
actual.presences.lst = convert.sp.data(sp.data, model.timestep) # Create the model object based on sp.data

# Create a list to hold results from ALL simulations
sim.lst = list()
# loop through simulations (note: for pool.type = plot, it should be fairly deterministic?)
for (k in 1:n.sim){

  predicted.presences.lst = rep(list(NA), n.sp)
  # Loop through plots in the exploratories
  for (p in 1:n.plots){


    # Restrict data set to a specific plot
    if (nrow(sp.dat) != n.plots) { stop("Species dataset does not match plot dataset. Something is wrong.")}
    if (nrow(land.dat) != n.plots) {stop("Landscape data dataset does not match plot dataset. Something is wrong.")}
    plot = sp.dat$plots[p]
    sp.plot.dat = sp.dat[sp.dat$plots == plot, ]
    land.plot.dat = land.dat[land.dat$plots == plot, ]
    # Set up landscape configuration (hacky way for now, will come back to this)
    # Currently using fixed values which may or may not correpsond to the focal cell. This stupid, because we need the value for the focal cell. In addition to species pool, we need an environmental pool!). Maybe I should just use LUI and not worry about mowing, grazing, and fertilization to start with??
    # customize to focal plot 
    fert.plot = land.plot.dat[[2]]
    mowing.plot = land.plot.dat[[3]]
    grazing.plot = land.plot.dat[[4]]
    lui.plot = land.plot.dat[[5]]

    lui = rep(lui.plot, n.cells) 
    fert = rep(fert.plot, n.cells) 
    mowing = rep(mowing.plot, n.cells) 
    grazing = rep(grazing.plot, n.cells)
    landscape = list(lui, fert, mowing, grazing) #**# make this correspond to an input for spatialdemography

    # Create landscape layer files (can you just input landscape objects to spatialdemography? I think so???)
    lpath = landscape # Hidden functionality that needs better documentation: this can also be the landscape itself, if properly formatted!

    # Set up environmental layers
    #**# NEED TO FIX ENVIRONMENTAL LAYERS TO CORRESPOND TO THOSE THAT HAVE BEEN ASSESSED FOR THRESHOLDS
    #prepare.env.lyrs(STUFF) #**# Needs scripting

    #Run name & run path
    run.path = sprintf("%splot_%s_sim_%s", opath, p, k)
    run.name = sprintf("plot_%s_sim_%s", p, k)

    # Set up locations file
    sp.locs = sprintf("%s/locations_%s_%s.csv", template.dir, pool.type, Digit(n.sim, 4))
    prepare.sp.locations(sp.locs, sp.plot.dat, pool.type, n.cells) # Create sp.locs file

    scale.vec = "landscape" #c("landscape", "focal cell") #**# Figure out how to add your focal cell to this!

    # Run main model
    spdem.out = do.spatial.demography(run.name, run.path, spfile, sp.locs,template.dir, file.ending, lpath, DisPath, opath, scale.vec)    

    # Extract result from focal cell & append to list
    output.file = sprintf("%sLanduse/SpeciesData.csv", run.path)
    validation.timestep = 2 #**# Don't run it for 6 timesteps then!
    predicted.presences.lst = extract.cell.values(predicted.presences.lst, output.file, focal.cell.position, validation.timestep)
    }

  # Compare model-produced results to observed data
  validation.diagnostics.lst = evaluate.spatialdemography.v2(predicted.presences.lst, actual.presences.lst, n.sp)

  # Add this runs accuracy stats to the running total.
  sim.lst = append(sim.lst, list(validation.diagnostics.lst))

  }

# Convert validation.diagnostics from multiple simulations into an average result & output the average result
validation.diagnostics.lst = summarize.validation(sim.lst, n.sp)

# Write average accuracy to file
model.type = "metacommunity-agg-mean"
model.name = "landuse"
sp.names = names(sp.data)[2:ncol(sp.data)] # Exclude plotyear column
my.spdem.results = output.results.to.table(model.type, model.name, results.table, sp.names, validation.diagnostics.lst)

# Examine results table
combined.results = read.csv(results.table)
combined.results