###############################################################################
# DivE - Diversity Estimator
# Authors: Daniel Laydon, Aaron Sim, Charles Bangham, Becca Asquith
# Date: 03 Feb 2020
# Version: 1.2
#
# Contents:
# A. SUBSAMPLE AND RAREFACTION DATA GENERATION
# 1. Input formatting function
# 2a-2b. Subsample sizes generation functions
# 3a-3h. Rarefaction and diversity data generation functions (incl. summary)
# 4a. Curvature function
# B. CURVE FITTING
# 1a-1n. Supporting fitting functions
# 2a-2b. Fitting functions
# 3a-3e. Fitting output functions (incl. summary)
# C. SCORING AND MODEL COMPARISON
# 1a-1f. Rounding functions
# 2a. Scoring single model
# 3a-3d. Scoring single model output
# 4a. Comparison of models
# 5a-5d. Comparison of models output
# 6a. Combining separate runs
# 7a. Diversity estimate for different population size
###############################################################################
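# NOTE: The fitting routines below use modFit() and modCost() from the FME
# package. If this file is sourced as a standalone script (rather than used via
# the installed package), load FME first:
# library(FME)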
######################## A. SUBSAMPLE DATA GENERATION #########################
######################## A1. Function to format input #########################
# Two input formats are accepted:
# i. data frame with two columns: species ID and species abundance (number of individuals)
# ii. one-dimensional data frame, matrix or vector listing the species ID of each individual
FormatInput <- function(x, ...)
{
if (is.data.frame(x) && (dim(x)[2] == 2)) {
as.character(rep(x[,1], x[,2]))
} else if (is.vector(x)) {
as.character(x)
} else if ((is.data.frame(x) && dim(x)[2] == 1) || (is.matrix(x) && dim(x)[2] == 1)) {
as.character(x[,1])
} else {
stop("Incorrect sample input format - reformat and try again")
}
}
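# Illustrative usage (hypothetical data). Both calls below return the same
# character vector containing one species ID per individual:
# abund.df <- data.frame(species = c("A", "B", "C"), count = c(3, 2, 1))
# FormatInput(abund.df)                         # "A" "A" "A" "B" "B" "C"
# FormatInput(c("A", "A", "A", "B", "B", "C"))  # identical result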
############## A2a. Function to generate vector of subsample lengths #############
# Inputs: Main sample, desired number of subsamples (or a vector of subsample sizes to use directly)
# By default produces a vector of six equally spaced subsample sizes, sorted largest first
GenSubsampLengths <- function(main.samp, N_Subsamp)
{
if ((length(N_Subsamp))!=1) {
sort(N_Subsamp, decreasing=TRUE)
} else {
SampInt <- length(main.samp)/N_Subsamp
sort(round(seq(from=SampInt, to=length(main.samp), by=SampInt)), decreasing=TRUE)
}
}
############### A2b. Generate subsample lengths function #################
DivSampleNum <- function(ms, n=6)
{
if ((length(n)==1) && (n<2)) {
stop('Number of nested subsamples (subsizes) must be 2 or greater')
}
main.samp <- FormatInput(ms)
GenSubsampLengths(main.samp=main.samp, N_Subsamp=n)
}
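# Illustrative usage (hypothetical sample "my.sample"): nested subsample sizes,
# largest first; for a main sample of 600 individuals and n = 6 this gives
# 600 500 400 300 200 100.
# DivSampleNum(my.sample, n = 6)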
############ A3a. Function to generate vector of rarefaction datapoints ############
# Inputs: Any sample, desired interval between rarefaction points, maximum data size of subsample, minimum data size of subsample
GenRarefacLengths <- function(samp, N_Rarefac, Max_Rarefac=length(FormatInput(samp)), Min_Rarefac=1)
{
RarefacInt <- N_Rarefac
out <- round(seq(from=Min_Rarefac, to=Max_Rarefac, by=RarefacInt))
if (out[length(out)]==Max_Rarefac)
{
return (out)
} else
{
out <- c(out, Max_Rarefac)
return (out)
}
}
############ A3b. Function to determine diversity of a (sub)sample ###########
# Inputs: Any vector
DivCount <- function(samp) {length(unique(samp))}
##### A3c. Function to generate the sample diversity values at rarefaction datapoints for fitting ####
# Inputs: Sample, interval between rarefaction datapoints, minimum data size of subsample, number of resampling iterations (B), maximum data size of subsample
GenRarefacDiv <- function(samp, N_Rarefac, Min_Rarefac=1, B, Max_Rarefac=length(FormatInput(samp)))
{
l <- length(samp)
RarefacLengths <- GenRarefacLengths(samp, N_Rarefac, Max_Rarefac, Min_Rarefac)
s <- length(RarefacLengths)
OrderVec <- as.vector(apply(matrix(sample(1:(l*B)), nrow=B, ncol=l), 1, order))
ShuffleMat <- matrix(samp[OrderVec], nrow=B, ncol=l, byrow=TRUE)
RarefacDivMean <- rep(1, s)
RarefacDivSD <- rep(0, s)
for (i in (2:s))
{
temp <- apply(ShuffleMat[,1:RarefacLengths[i]], 1, DivCount)
RarefacDivMean[i] <- mean(temp)
RarefacDivSD[i] <- sd(temp)
}
list(RarefacXAxis=RarefacLengths,
RarefacYAxis=RarefacDivMean,
div_sd=RarefacDivSD,
NResamples=B)
}
############# A3d. Generate subsamples/diversity function #############
DivSubsamples <- function(mainsamp, nrf, minrarefac=1, maxrarefac=length(FormatInput(mainsamp)), NResamples=1000)
{
samp <- FormatInput(mainsamp)
xyvals <- GenRarefacDiv(samp=samp, N_Rarefac=nrf, Min_Rarefac=minrarefac, B=NResamples, Max_Rarefac=maxrarefac)
xyvals$call <- match.call()
class(xyvals) <- "DivSubsamples"
xyvals
}
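# Illustrative usage (hypothetical sample "my.sample"): rarefaction data for the
# full sample, with a rarefaction point every 10 individuals and 500 resamples.
# dss.main <- DivSubsamples(mainsamp = my.sample, nrf = 10, NResamples = 500)
# summary(dss.main)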
################ A3e. Print method - subsample/diversity data ################
print.DivSubsamples <- function(x, ...)
{
cat("RarefacXAxis:\n")
print(x$RarefacXAxis)
cat("\nRarefacYAxis (Mean diversity values):\n")
print(x$RarefacYAxis)
}
################ A3f. Summary method: subsample/diversity data ################
summary.DivSubsamples <- function(object, ...)
{
TotDiv <- object$RarefacYAxis[length(object$RarefacYAxis)]
TotRarefac <- length(object$RarefacXAxis)
SampSize <- object$RarefacXAxis[TotRarefac]
N_Iter <- object$NResamples
ave.sdpc <- mean(object$div_sd/object$RarefacYAxis)
TAB <- cbind(Subsample.size=SampSize,
Subsample.diversity=TotDiv,
No.of.rarefac.points=TotRarefac,
Iterations=N_Iter,
Ave.StdErr=ave.sdpc)
dss.sum <- list(call=object$call, rsum=TAB)
class(dss.sum) <- "summary.DivSubsamples"
dss.sum
}
############ A3g. Print summary method - subsample/diversity data ############
print.summary.DivSubsamples <- function(x, ...)
{
cat("Call:\n")
print(x$call)
cat("\nSubsample data summary:\n")
print(x$rsum)
}
############ A3h. Method to extract the relevant nested subset of a DivSubsamples object ############
# Inputs: divsubsample object, size of nested subsample desired
DivSubsamples_Nested <- function(dss, maxsamp)
{
DSStemp <- dss
EndIndex <- which.min(abs(maxsamp-dss$RarefacXAxis))
DSStemp$RarefacXAxis <- dss$RarefacXAxis[1:EndIndex]
DSStemp$RarefacYAxis <- dss$RarefacYAxis[1:EndIndex]
DSStemp$div_sd <- dss$div_sd[1:EndIndex]
return (DSStemp)
}
############ A4a. Method to calculate the curvature of a rarefaction curve ############
# Inputs: DivSubsamples object, or a list of DivSubsamples objects from which a single curve is selected
# The curvature is the area between the rarefaction curve and the chord from the origin to its endpoint,
# normalised by the area under that chord: ~0 for a near-linear curve, ~1 for a strongly saturating one
Curvature <- function(dss)
{
if ((length(dss) > 1) && inherits(dss[[1]], "DivSubsamples"))
{
SubSizes = c()
for (Sub in 1:length(dss))
SubSizes[Sub] = tail(dss[[Sub]]$RarefacXAxis,1)
LargestSub = which.max(SubSizes) # index of the curve with the largest subsample size
dss = dss[[LargestSub]]
}
x_val <- dss$RarefacXAxis
y_val <- dss$RarefacYAxis
x_end <- tail(x_val, 1)
y_end <- tail(y_val, 1)
AnB <- 0.5*x_end*y_end
x_vec <- c(x_val[1], diff(x_val))
y_vec_left <- c(0,y_val)[1:(length(y_val))]
y_vec_right <- y_val[1:length(y_val)]
y_vec <- (y_vec_right + y_vec_left)/2
A <- sum(x_vec*y_vec) - AnB
output <- A/AnB
return (output)
}
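# Illustrative usage (hypothetical object from DivSubsamples above):
# Curvature(dss.main)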
############################# B. CURVE FITTING ################################
####### B1a. Function to predict the diversity values from subsample x-values according to a given model #######
# Input: Model function, rarefaction x-axis values, model parameters
# Output: dataframe of x and y axis plotting values
PredDiv <- function(model, rarefac, params)
{
model.param <- as.list(params)
div.temp <- model(rarefac, model.param)
if (any(is.na(div.temp))) div.temp <- rep(Inf, length(rarefac)) # NaN/NA predictions force an infinite ssr
data.frame(RarefacXAxis=rarefac, RarefacYAxis=div.temp)
}
####### B1b. Function to calculate the residuals (modCost) #######
# Input: Model function, model parameters, DivSubsamples object
ModelCost <- function(model, params, dss)
{
xypred <- PredDiv(model, dss$RarefacXAxis, params)
xyactual <- data.frame(RarefacXAxis=dss$RarefacXAxis, RarefacYAxis=dss$RarefacYAxis)
modCost(xypred, xyactual, x="RarefacXAxis")
}
####### B1c. Function to calculate the discrepancy as the mean of pointwise percentage error #######
# Input: Model function, model parameters, DivSubsamples object
ModelCostAbs <- function(model, params, dss)
{
ypred <- PredDiv(model, dss$RarefacXAxis, params)$RarefacYAxis
yactual <- dss$RarefacYAxis
mean(abs((ypred-yactual)/yactual))
}
####### B1d. Function to convert parameter ranges to vectors #######
# Input: model.set, Param.ranges (possibly truncated)
ConvertRanges <- function(model.set, param.ranges)
{
BoundsLower <- list()
BoundsUpper <- list()
for (mod in 1:length(model.set))
{
BoundsLower[[names(model.set)[mod]]] = param.ranges[[mod]]["Lower",]
BoundsUpper[[names(model.set)[mod]]] = param.ranges[[mod]]["Upper",]
}
list(lower=BoundsLower, upper=BoundsUpper)
}
####### B1e. Function to perform a global Fit #######
# Input: Cost function, matrix of candidate initial parameter sets, lower parameter bounds, upper parameter bounds, max. no. of iterations, minimum variance (varleft)
GlobalFit <- function(func, init.list, lower.bd, upper.bd, numit, varleft)
{
cat("Choosing optimal initial parameters for global fit", "\n")
init.param <- (lower.bd + upper.bd)/2
cost.lowest <- func(init.param)$model
for (i in 1:dim(init.list)[1])
if (!anyNA(init.list[i,]) && all(init.list[i,] >= lower.bd) && all(init.list[i,] <= upper.bd)) # skip candidate rows containing NA (e.g. from failed earlier fits)
{
cost.temp <- func(init.list[i,])$model
if (cost.temp < cost.lowest)
{
cost.lowest <- cost.temp
init.param <- init.list[i,]
}
}
cat("Performing global fit", "\n")
modFit(f = func, p = init.param, lower=lower.bd, upper=upper.bd, # "Pseudo" Global Fit
method = "Pseudo", control = c(numiter = numit, varleft=varleft, verbose=TRUE))
}
####### B1f. Function to perform a local Fit #######
# Input: Cost function, initial parameters, lower parameter bounds, upper parameter bounds
LocalFit <- function(func, init.param, lower.bd, upper.bd)
{
cat("Performing local fit", "\n")
localfit <- modFit(f = func, p = init.param, lower=lower.bd, upper=upper.bd, # Local Fit
method = "Marq")
return(localfit)
}
####### B1g. Function to perform a combined global-local Fit #######
# Input: Function, initial parameters, lower parameter bounds, upper parameter bounds, Max no. of iterations, minimum variance
CombinedFit <- function(func, init.param, lower.bd, upper.bd, numit, varleft)
{
g.fit <- GlobalFit(func, init.param, lower.bd, upper.bd, numit, varleft)
LocalFit(func, g.fit$par, lower.bd, upper.bd)
}
####### B1h. Function to convert function "funk" to function of a single variable #######
# Input: function, parameters
SingleVar <- function(funk, params)
{
fn = function(x)
{
funk(x, params)
}
return(fn)
}
####### B1i. Function to find intersections between curves #######
# Input: model function, parameters 1, parameters 2, vector of x-values
# NOTE: the sp/rgeos-based intersection code is commented out below, so this function
# currently returns NULL and Simm() treats the two curves as non-crossing
Crossings <- function(model, param1, param2, xvals)
{
# yvals1 <- model(xvals, param1)
# yvals2 <- model(xvals, param2)
# s1 <- try(SpatialLines(list(Lines(list(Line(cbind(xvals , yvals1))) , ID=1))), silent = TRUE)
# s2 <- try(SpatialLines(list(Lines(list(Line(cbind(xvals , yvals2))) , ID=1))), silent = TRUE)
#
# temp.int <- gIntersection(s1, s2)
# int.len <- length(temp.int)
#
# if (int.len != 0)
# row.names(temp.int) <- seq(1,int.len)
#
# intersections <- try(as.data.frame(temp.int), silent=TRUE)
intersections <- NULL
intersections
}
####### B1j. Function to evaluate the area between two fitted curves #######
# Input: model function, parameters 1, parameters 2, lower x-limit, upper x-limit, number of points
Simm <- function(model, param1, param2, lowerlimit, upperlimit, tot.length)
{
intersections <- Crossings(model, param1, param2, seq(lowerlimit, upperlimit, length.out=tot.length))
# No intersections
if (length(intersections) == 0 | inherits(intersections, "try-error"))
{
dist <- abs(as.numeric(as.character(integrate(SingleVar(model, param1), lower = lowerlimit, upper = upperlimit))[1]) -
as.numeric(as.character(integrate(SingleVar(model, param2), lower = lowerlimit, upper = upperlimit))[1]))
return(dist)
} else if (length(intersections) != 0 & !inherits(intersections, "try-error"))
{
# with intersections
limits <- c(lowerlimit)
for (i in (1:(dim(intersections)[1]))) limits <- c(limits, intersections[i,1])
limits <- c(limits, upperlimit)
dist <- 0
for(interval in 1:(dim(intersections)[1]+1))
{
dist <- dist + abs(as.numeric(as.character(integrate(SingleVar(model, param1), lower=limits[interval],
upper=limits[interval+1]))[1]) -
as.numeric(as.character(integrate(SingleVar(model, param2), lower=limits[interval],
upper=limits[interval+1]))[1]))
}
return(dist)
}
}
####### B1k. Function to evaluate the area under the reference curve #######
# Input: function, parameters1, lower x-limit, upper x-limit
RefArea <- function(model, param1, lowerlimit, upperlimit)
{
total.area <- abs(as.numeric(as.character(integrate(SingleVar(model, param1), lower = lowerlimit, upper = upperlimit))[1]))
return(total.area)
}
####### B1l. Function to evaluate the mean squared distances between curves #######
# Input: matrix of pairwise distances (areas) between curves
MSRcurves <- function(dist)
{
return(sum(dist*dist)/(2*nrow(dist)))
}
####### B1m. Function to evaluate the monotonicity of the fitted curve #######
# Input: Model, parameters, minimum datapoint, maximum datapoint, number of points to test
IsMonotonic <- function(model, params, min.samp, max.samp, tot.length)
{
xvals <- seq(min.samp, max.samp, length.out=tot.length)
all(diff(model(xvals, params))>=0)
}
####### B1n. Function to evaluate the negativity of the 2nd derivative of the fitted curve #######
# Input: Model, parameters, minimum data size, maximum data size, number of points to test
IsSlowing <- function(model, params, min.samp, max.samp, tot.length)
{
xvals <- seq(min.samp, max.samp, length.out=tot.length)
all(round(diff(diff(model(xvals, params))), 5)<=0)
}
####### B2a. Fit a single sample to a single model #######
FitSample <- function(model, init.param, lower.bd, upper.bd, dss, numit, varleft)
{
func <- function(p)
{
ModelCost(model, p, dss)
}
CombinedFit(func, init.param, lower.bd, upper.bd, numit, varleft)
}
####### B2b. Fit all samples to a single model #######
# Input: SS = vector of subsample sizes, single-model list, matrix of initial parameters, parameter ranges,
# list of DivSubsamples objects, numit, varleft, total population size estimate, formatted main-sample vector (v1),
# minimum sample size (for criteria 3 & 4)
FitAllSubs <- function(SS, model.list, init.param, param.range, dSS, numit, varleft, tot.pop, v1, minsampsize)
{
###### Extract model and name ######
model <- model.list[[1]]
model.name <- names(model.list)
###### Create lower and upper boundaries ######
lower.bd <- param.range[1,]
upper.bd <- param.range[2,]
###### Create storage matrices ######
# Fitted parameter estimates
param.mat <- matrix(nrow=length(SS), ncol=ncol(init.param))
rownames(param.mat) <- as.character(SS)
colnames(param.mat) <- names(init.param[1,])
# Criteria 1: Sum of squares
ssr.mat <- matrix(nrow=length(SS), ncol=1)
rownames(ssr.mat) <- as.character(SS)
colnames(ssr.mat) <- c("Sum_of_squares")
# Criteria 1: Mean squared residual
ms.mat <- matrix(nrow=length(SS), ncol=1)
rownames(ms.mat) <- as.character(SS)
colnames(ms.mat) <- c("Mean_squared_residual")
# Criteria 1: Discrepancy - mean abs error
discrep.mat <- matrix(nrow=length(SS), ncol=1)
rownames(discrep.mat) <- as.character(SS)
colnames(discrep.mat) <- c("Mean_absolute_error")
# Criteria 2: Local diversity prediction
local.mat <- matrix(nrow=length(SS), ncol=1)
rownames(local.mat) <- as.character(SS)
colnames(local.mat) <- c("Predicted_local_diversity")
# Criteria 2: Global diversity prediction
global.mat <- matrix(nrow=length(SS), ncol=1)
rownames(global.mat) <- as.character(SS)
colnames(global.mat) <- c("Predicted_global_diversity")
# Criteria 2: Accuracy to observed dataset diversity
AccuracyToObserved.mat <- matrix(nrow=length(SS), ncol=1)
rownames(AccuracyToObserved.mat) <- as.character(SS)
colnames(AccuracyToObserved.mat) <- c("Accuracy_to_observed_dataset_diversity")
# Criteria 3: Similarity of curves (local and global)
sim.local.mat <- matrix(rep(0, length(SS)*length(SS)), nrow=length(SS), ncol=length(SS))
rownames(sim.local.mat) <- as.character(SS)
colnames(sim.local.mat) <- as.character(SS)
sim.global.mat <- matrix(rep(0, length(SS)*length(SS)), nrow=length(SS), ncol=length(SS))
rownames(sim.global.mat) <- as.character(SS)
colnames(sim.global.mat) <- as.character(SS)
# Criteria 3: Mean squared distance between curves (local and global)
msr.curve.mat <- matrix(nrow=2, ncol=1)
rownames(msr.curve.mat) <- c("local", "global")
colnames(msr.curve.mat) <- c("MSR_between_subsample_curves")
# Criteria 3: Distance from reference curve (local and global)
sim.local.ref <- matrix(nrow=length(SS)-1, ncol=1)
rownames(sim.local.ref) <- as.character(SS[2:length(SS)])
colnames(sim.local.ref) <- c("Distance_from_local_reference_curve")
sim.global.ref <- matrix(nrow=length(SS)-1, ncol=1)
rownames(sim.global.ref) <- as.character(SS[2:length(SS)])
colnames(sim.global.ref) <- c("Distance_from_global_reference_curve")
# Criteria 4: Plausibility (Monotonicity and decreasing 2nd derivative)
monotonic.local.mat <- matrix(nrow=length(SS), ncol=1)
monotonic.global.mat <- matrix(nrow=length(SS), ncol=1)
slowing.local.mat <- matrix(nrow=length(SS), ncol=1)
slowing.global.mat <- matrix(nrow=length(SS), ncol=1)
plausibility.mat <- matrix(nrow=length(SS), ncol=3)
rownames(monotonic.local.mat) <- as.character(SS)
rownames(monotonic.global.mat) <- as.character(SS)
rownames(slowing.local.mat) <- as.character(SS)
rownames(slowing.global.mat) <- as.character(SS)
rownames(plausibility.mat) <- as.character(SS)
colnames(monotonic.local.mat) <- c("Is monotonic (local)")
colnames(monotonic.global.mat) <- c("Is monotonic (global)")
colnames(slowing.local.mat) <- c("Has_decreasing_2nd_derivative (local)")
colnames(slowing.global.mat) <- c("Has_decreasing_2nd_derivative (global)")
colnames(plausibility.mat) <- c("Is plausible", "Monotonically increasing", "Slowing")
###### Loop through different samples in SS #######
for (b in 1:length(SS))
{
cat("Performing fitting routine for sample ", b, "\n")
if (b > 1 && !inherits(fit, "try-error")) init.param <- rbind(init.param, fit$par) # add the previous subsample's fitted values to the candidate initial parameters
fit <- try(FitSample(model, init.param, lower.bd, upper.bd, dSS[[b]], numit, varleft), silent=TRUE)
if(inherits(fit, "try-error")) next
param.mat[b,] <- fit$par # Assign parameter values
ssr.mat[b,] <- fit$ssr # Assign sum of squared residuals
ms.mat[b,] <- fit$ms # Assign mean squared residuals
discrep.mat[b,] <- ModelCostAbs(model, fit$par, dSS[[b]]) # Assign discrepancy mean absolute residuals (in percentages)
local.mat[b,] <- model(length(v1), fit$par) # Assign local diversity prediction
global.mat[b,] <- model(tot.pop, fit$par) # Assign global diversity prediction
AccuracyToObserved.mat[b,] <- (model(length(v1), fit$par) - length(unique(v1)))/length(unique(v1)) # Assign % accuracy of local diversity prediction
# Assign plausibility assessments
monotonic.local.mat[b,] <- IsMonotonic(model, fit$par, min.samp=minsampsize, max.samp=length(v1), tot.length=length(v1)-minsampsize+1)
monotonic.global.mat[b,] <- IsMonotonic(model, fit$par, min.samp=minsampsize, max.samp=tot.pop, tot.length=min(tot.pop-minsampsize+1,2000))
slowing.local.mat[b,] <- IsSlowing(model, fit$par, min.samp=minsampsize, max.samp=length(v1), tot.length=length(v1)-minsampsize+1)
slowing.global.mat[b,] <- IsSlowing(model, fit$par, min.samp=minsampsize, max.samp=tot.pop, tot.length=min(tot.pop-minsampsize+1,2000))
plausibility.mat[b,2] <- ((monotonic.local.mat[b,])*(monotonic.global.mat[b,]))==1
plausibility.mat[b,3] <- (slowing.global.mat[b,])==1
plausibility.mat[b,1] <- (plausibility.mat[b,2]*plausibility.mat[b,3])==1
}
# Assign similarity measures (local and global)
norm.fac.local <- try(RefArea(model, param.mat[1,], minsampsize, length(v1)), silent=TRUE)
norm.fac.global <- try(RefArea(model, param.mat[1,], minsampsize, tot.pop), silent=TRUE)
for (b in 1:length(SS))
for (c in (b:length(SS)))
if (b==c)
{
sim.local.mat[b,c] <- 0
sim.global.mat[b,c] <- 0
} else
{
local.tmp.mat <- try(Simm(model, param.mat[b,], param.mat[c,], lowerlimit=minsampsize,
upperlimit=length(v1), tot.length=length(v1)-minsampsize+1), silent=TRUE)
global.tmp.mat <- try(Simm(model, param.mat[b,], param.mat[c,], lowerlimit=minsampsize,
upperlimit=tot.pop, tot.length=tot.pop-minsampsize+1), silent=TRUE)
if(!inherits(local.tmp.mat, "try-error") & !inherits(global.tmp.mat, "try-error"))
{
sim.local.mat[b,c] <- local.tmp.mat
sim.global.mat[b,c] <- global.tmp.mat
}
}
sim.local.mat <- sim.local.mat + t(sim.local.mat)
sim.global.mat <- sim.global.mat + t(sim.global.mat)
sim.local.mat.tmp <- try(sim.local.mat/norm.fac.local, silent=TRUE)
sim.global.mat.tmp <- try(sim.global.mat/norm.fac.global, silent=TRUE)
if (!inherits(sim.local.mat.tmp, "try-error") & !inherits(sim.global.mat.tmp, "try-error"))
{
sim.local.mat <- sim.local.mat.tmp
sim.global.mat <- sim.global.mat.tmp
}
for (b in 2:length(SS))
{
sim.local.ref[b-1,] <- sim.local.mat[b,1]
sim.global.ref[b-1,] <- sim.global.mat[b,1]
}
# Assign msr between sample curves
msr.curve.mat[1,1] <- MSRcurves(sim.local.mat)
msr.curve.mat[2,1] <- MSRcurves(sim.global.mat)
list(param=param.mat, ssr=ssr.mat, ms=ms.mat, discrep=discrep.mat, local=local.mat, global=global.mat, AccuracyToObserved=AccuracyToObserved.mat,
subsamplesizes=SS, datapoints=dSS, modelname=names(model.list), numparam=ncol(param.mat), sampvar=msr.curve.mat,
mono.local=monotonic.local.mat, mono.global=monotonic.global.mat, slowing.local=slowing.local.mat,
slowing.global=slowing.global.mat, plausibility=plausibility.mat, dist.local=sim.local.mat, dist.global=sim.global.mat,
local.ref.dist=sim.local.ref, global.ref.dist=sim.global.ref, popsize=tot.pop, themodel=model)
}
####### B3a. Fit single model function #######
FitSingleMod <- function(model.list, init.param, param.range, main.samp, tot.pop=(100*(DivSampleNum(main.samp,2)[1])),
numit=10^5, varleft=1e-8, data.default=TRUE, subsizes=6, dssamps=list(),
nrf=1, minrarefac=1, NResamples=1000, minplaus=10, fitloops=2)
{
v1 <- FormatInput(main.samp) # Convert samp to a vector of length=main sample length and with species ids
if (data.default)
{
cat("Creating subsamples and rarefaction data", "\n")
SS <- DivSampleNum(main.samp, subsizes)
# Create divsubsample object for each sample
samp <- v1
dSS <- list()
main.dss <- DivSubsamples(mainsamp=samp, nrf=nrf, minrarefac=minrarefac, maxrarefac=SS[1])
dSS[[1]] <- main.dss
for (b in (2:length(SS)))
{
max.ss <- SS[b]
dss_temp <- DivSubsamples_Nested(main.dss, max.ss)
dSS[[b]] <- dss_temp
}
} else
{
SS <- subsizes # largest element of subsizes must not exceed length(v1)
# Create divsubsample object for each sample
dSS <- dssamps
}
# Fit the samples
cat("Fitting loop 1", "\n")
fitsingle <- FitAllSubs(SS, model.list, init.param, param.range, dSS, numit, varleft, tot.pop, v1, minplaus)
if (fitloops > 1)
for (c in (2:fitloops))
{
cat("Fitting loop", c, "\n")
init.param <- rbind(init.param, fitsingle$param) # seed the next loop with the previous loop's fitted parameters
fitsingle <- FitAllSubs(SS, model.list, init.param, param.range, dSS, numit, varleft, tot.pop, v1, minplaus)
}
fitsingle$call <- match.call()
class(fitsingle) <- "FitSingleMod"
fitsingle
}
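# Illustrative usage (assumes the package's bundled model list and parameter
# seed/range objects, here called ModelSet, ParamSeeds and ParamRanges, and a
# hypothetical sample "my.sample"):
# fsm <- FitSingleMod(model.list = ModelSet[1], init.param = ParamSeeds[[1]],
#                     param.range = ParamRanges[[1]], main.samp = my.sample)
# summary(fsm)
# plot(fsm, range = "local")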
####### B3b. Print method - fit of single model #######
print.FitSingleMod <- function(x, ...)
{
cat("Fitting results for model:", x$modelname, "\n")
cat("Model parameters ($param):\n")
print(x$param)
cat("\nSubsample sizes ($subsamplesizes):\n")
print(x$subsamplesizes)
cat("\nMeasures for discrepancies between rarefaction data and model fits ($discrep, $ssr, $ms):\n")
print(cbind(x$discrep, x$ssr, x$ms))
cat("\nDiversity predictions and the local prediction accuracy ($AccuracyToObserved, $local, $global):\n")
print(cbind(x$AccuracyToObserved, x$local, x$global))
cat("\nSimilarity measure: normalised area between curve and largest-sample curve ($local.ref.dist):\n")
print(x$local.ref.dist)
cat("\nPlausibility measures: monotonicity and decreasing 2nd derivative ($plausibility):\n")
print(x$plausibility)
}
####### B3c. Summary method: fit of single model #######
summary.FitSingleMod <- function(object, ...)
{
tot.subsamp <- length(object$datapoints)
num.rf <- length((object$datapoints[[1]])$RarefacXAxis)
fitted.model <- object$modelname
n.param <- object$numparam
TAB <- cbind(No.of.subsamples=tot.subsamp,
No.of.rarefaction.points=num.rf,
Fitted.model.name=fitted.model,
No.of.model.parameters=n.param)
fit.sum <- list(call=object$call,
rsum=TAB)
class(fit.sum) <- "summary.FitSingleMod"
fit.sum
}
####### B3d. Print summary method - fit of single model #######
print.summary.FitSingleMod <- function(x, ...)
{
cat("Call:\n")
print(x$call)
cat("\nSingle model fit setup summary:\n")
print(x$rsum)
}
####### B3e. Plot method - fit of single model #######
plot.FitSingleMod <- function(x, range="local", ...)
{
points.colours <- c("red", "green", "brown", "orange", "blue", "yellow")
# Define plot limits
ylim.list <- c()
for (i in 1:length(x$subsamplesizes))
{
# Define fitted curve
func1 <- function(z) {
x$themodel(z, x$param[i,])
}
if (range=="local") xvalue <- x$datapoints[[i]]$RarefacXAxis[length(x$datapoints[[i]]$RarefacXAxis)] else xvalue <- x$popsize
yvalue <- x$datapoints[[i]]$RarefacYAxis[length(x$datapoints[[i]]$RarefacYAxis)]
ypred <- func1(xvalue)
max.y <- max(yvalue, ypred)
ylim.list[i] <- max.y
}
ylim.choose <- max(ylim.list)
# Plot main sample diversity
Sample_number <- x$datapoints[[1]]$RarefacXAxis[length(x$datapoints[[1]]$RarefacXAxis)]
Species_count <- x$datapoints[[1]]$RarefacYAxis[length(x$datapoints[[1]]$RarefacXAxis)]
if (range=="local")
{
xupper <- Sample_number*1.1
yupper <- ylim.choose*1.1
} else if (range=="global")
{
xupper <- x$popsize*1.1
yupper <- ylim.choose*1.1
}
plot(x=Sample_number, y=Species_count, pch=20, col="purple", cex=3, xlim=c(0,xupper),
ylim=c(0, yupper), ...)
# Plot curves and points
for (j in 1:length(x$subsamplesizes))
{
# Plot the points
points(x=x$datapoints[[j]]$RarefacXAxis,
y=x$datapoints[[j]]$RarefacYAxis,
pch=4, col=points.colours[j])
# Define the fitted function
func1 <- function(z) {
x$themodel(z, x$param[j,])
}
# Plot the fitted function
if (range=="local")
{
xval <- seq(0, Sample_number, 0.1)
yval <- func1(xval)
} else if (range=="global")
{
xval <- seq(0, x$popsize, 1)
yval <- func1(xval)
}
lines(x=xval, y=yval, lty=1, col=points.colours[j], lwd=2)
}
# Purpledot
points(x=Sample_number, y=Species_count, pch=20, col="purple", cex=3)
}
####################### C. SCORING AND MODEL COMPARISON #######################
####### C1a. Whole number check #######
# Input: Single number
# Output: TRUE/FALSE
IsWholeNumber <- function(x, tol = .Machine$double.eps^0.5)
{
return(abs(x - round(x)) < tol)
}
####### C1b. Binning function - Single number, absolute divisor #######
# Input: Number, divisor d
# Output: Bin index of |x|, i.e. ceiling(|x|/d); zero is mapped to 1 so that the minimum score is 1
BinFunkSingleNumber <- function(x,d)
{
if (is.na(x)) {
return(x)
} else if ( x == 0 ) { ## need this line as otherwise 0 value gets 0 score, want minimum score to be 1, regardless of divisor.
return(1)
} else {
X = abs(x)
res = X%%d
if(IsWholeNumber(X/d)) {
return(( ((X-res)/d) ) )
} else {
return( ( ((X-res)/d) + 1 ) )
}
}
}
####### C1c. Binning function - Vector of numbers, absolute divisor #######
# Input: Vector of numbers, divisor
# Output: Vector of bin indices (each entry of xx binned via BinFunkSingleNumber)
BinFunk <- function(xx,dd)
{
return(sapply(xx , BinFunkSingleNumber, d = dd))
}
####### C1d. Binning function - Single number, percentage precision #######
# Input: Number, precision
# Output: Number rounded towards zero to the specified precision
Bfunprop = function(a, percent)
{
if (is.na(a)) {
return(a)
} else {
if (a >= 0) {
return( floor((1/percent) * a)/(1/percent) )
}
if (a < 0) {
return( ceiling((1/percent) * a)/(1/percent) )
}
}
}
####### C1e. Binning function - Vector of numbers, percentage precision #######
# Input: Vector of numbers, precision
# Output: Vector of numbers rounded towards zero to the specified precision (returned unchanged if the precision is 0)
BBprop=function(aa,percentwecareabout)
{
if(percentwecareabout == 0) {
return(aa)
} else {
return(sapply(aa , Bfunprop, percent = percentwecareabout))
}
}
####### C1f. Geometric mean function #######
# Input: Top estimates
# Output: Geometric mean
GeoMean <- function(x)
{
if (any(is.na(x))) {warning('global diversity values are unreliable due to one or more poor model fits amongst top models')}
return(exp(mean(log(x), na.rm=TRUE)))
}
####### C2a. Scoring function - Single model #######
# Input: FitsingleMod, Sample aggregation method ("Equal", "Proportional", "InvProportional"),
# Criteria rounding values c(fit, accuracy, local similarity), plausibility penalty score
# Output: Single model scores, aggregated over subsamples, for four criteria - 1. Fit (discrepancy),
# 2. Accuracy, 3. Local similarity, 4. Plausibility (monotonicity and decreasing 2nd derivative combined)
SingleModScore <- function(fsm, precision.lv = c(0.0001, 0.005, 0.005), plaus.pen=500)
{
nS <- length(fsm$subsamplesizes) # No. of samples
samp.wts <- rep(1/nS, nS)
## Un-aggregated scores
fit.scores <- BinFunk(fsm$discrep, precision.lv[1])
accuracy.scores <- BinFunk(fsm$AccuracyToObserved, precision.lv[2])
similarity.scores <- BinFunk(fsm$local.ref.dist, precision.lv[3])
plausible.scores <- rep(500, nS)
# Monotonic and plausibility scores penalties
for (i in 1:nS) {
if (is.na(fsm$plausibility[i,1]) || fsm$plausibility[i,1]==FALSE) {
plausible.scores <- plaus.pen
}
}
## Aggregated scores
fit.agg <- sum(fit.scores*samp.wts)
accuracy.agg <- sum(accuracy.scores*samp.wts)
similarity.agg <- sum(similarity.scores*samp.wts[2:length(samp.wts)])/sum(samp.wts[2:length(samp.wts)])
plausible.agg <- sum(plausible.scores*samp.wts)
list(fit=fit.agg, accuracy=accuracy.agg, similarity=similarity.agg, plausibility=plausible.agg,
binsize=precision.lv, plausibility.penalty=plaus.pen,
modname=fsm$modelname)
}
####### C3a. Score single model function #######
ScoreSingleMod <- function(fsm, precision.lv = c(0.0001, 0.005, 0.005), plaus.pen=500)
{
scoresingle <- SingleModScore(fsm, precision.lv, plaus.pen)
scoresingle$call <- match.call()
class(scoresingle) <- "ScoreSingleMod"
scoresingle
}
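# Illustrative usage (hypothetical FitSingleMod object "fsm" from above):
# ssm <- ScoreSingleMod(fsm)
# print(ssm)
# summary(ssm)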
####### C3b. Print method - score of single model #######
print.ScoreSingleMod <- function(x, ...)
{
cat("Scoring results ($[colnames]):\n")
score.output <- matrix(c(x$fit, x$accuracy, x$similarity, x$plausibility), nrow=1, ncol=4)
rownames(score.output) <- as.character(x$modname)
colnames(score.output) <- c("discrepancy", "accuracy", "similarity", "plausibility")
print(score.output)
}
####### C3c. Summary method: score of single model #######
summary.ScoreSingleMod <- function(object, ...)
{
fitted.model <- object$modname
binsize <- object$binsize
plauspen <- object$plausibility.penalty
TAB <- cbind(Model.name=fitted.model,
precision.fit=binsize[1],
precision.accuracy=binsize[2],
precision.similarity=binsize[3],
Plausibility.penalty=plauspen)
score <- list(call=object$call,
rsum=TAB)
class(score) <- "summary.ScoreSingleMod"
score
}
####### C3d. Print summary method - score of single model #######
print.summary.ScoreSingleMod <- function(x, ...)
{
cat("Call:\n")
print(x$call)
cat("\nSingle model scoring setup summary:\n")
print(x$rsum)
}
####### C4a. Combine attributes of a single model #######
# Input: ScoreSingleMod object, Criteria weights (zero=exclude from scoring)
# Output: Single model score
CombineCriteria <- function(ssm, crit.wts=c(1.0, 1.0, 1.0, 1.0))
{
vect <- c(ssm$fit, ssm$accuracy, ssm$similarity, ssm$plausibility)
vect <- sum((vect*crit.wts)/sum(crit.wts))
return(vect)
}
####### C4b. Fit, score and compare multiple models #######
# Input: List of models, List of initial parameters, List of Lower bounds, List of upper bounds,
# main.samp, tot.pop, numit, varleft, subsizes, dssamps, nrf, minrarefac, NResamples, minplaus,
# precision.lv, plaus.pen, crit.wts, fitloops, numpred
# Output: List of i) model scores ii) fitted single-model objects (fmm) iii) criteria score matrix (ssm)
# iv) diversity estimate with upper/lower bounds v) original model list vi) number of top-scoring models used (m)
MultipleScoring <- function(models, init.params, param.ranges, main.samp, tot.pop, numit, varleft,
subsizes, dssamps, nrf, minrarefac, NResamples, minplaus, precision.lv, plaus.pen, crit.wts,
fitloops, numpred)
{
num.mod <- length(models)
fmm <- list()
ssm <- matrix(rep(NA, num.mod*4), nrow=num.mod, ncol=4)
colnames(ssm) <- c("fit", "accuracy", "similarity", "plausibility")
mod.score <- matrix(rep(NA, num.mod), nrow=num.mod, ncol=1)
mod.rownames <- as.character(rep(NA, num.mod))
# Create the samples and divsubsample object
if (length(dssamps)==0)
{
cat("Creating subsamples and rarefaction data", "\n")
mas.dss <- list() # Master list of DivSubsamples objects
mas.SS <- DivSampleNum(main.samp, subsizes) # Master vector of subsample sizes
samp <- FormatInput(main.samp) # Format main sample
dss.main <- DivSubsamples(mainsamp=samp, nrf=nrf, minrarefac=minrarefac, maxrarefac=mas.SS[1], NResamples=NResamples)
mas.dss[[1]] <- dss.main
for (b in (2:length(mas.SS))) # Create master list of DivSubsamples
{
max.ss <- mas.SS[b]
dss <- DivSubsamples_Nested(dss.main, max.ss)
mas.dss[[b]] <- dss
}
} else
{
cat("Loading predefined subsamples", "\n")
mas.SS <- DivSampleNum(main.samp, subsizes)
samp <- FormatInput(main.samp) # Format main sample
mas.dss <- dssamps
}
timeremain = "tbd"
ptm <- proc.time()[3]
# Loop through the different models
for (i in (1:num.mod))
{
# Fit model
cat("Fitting model", i, "of", length(models),"(Est. time remaining:", timeremain, "mins)", "\n")
fsm.temp <- FitSingleMod(model.list=models[i], init.param=init.params[[i]], param.range=param.ranges[[i]],
main.samp, data.default=FALSE, tot.pop= tot.pop, numit=numit, varleft=varleft,
subsizes=mas.SS, dssamps=mas.dss, nrf=nrf, minrarefac=minrarefac, NResamples=NResamples,
minplaus=minplaus, fitloops=fitloops)
# Score model criteria
cat("Scoring model", i, "\n")
ssm.temp <- try(ScoreSingleMod(fsm=fsm.temp, precision.lv = precision.lv, plaus.pen=plaus.pen), silent=TRUE)
if(inherits(ssm.temp, "try-error")) next
# Aggregate criteria score
cat("Aggregating scoring for model", i, "\n")
mod.score.temp <- CombineCriteria(ssm=ssm.temp, crit.wts=crit.wts)
fmm[[fsm.temp$modelname]] <- fsm.temp
ssm[i,1] <- ssm.temp$fit
ssm[i,2] <- ssm.temp$accuracy
ssm[i,3] <- ssm.temp$similarity
ssm[i,4] <- ssm.temp$plausibility
mod.score[i,] <- mod.score.temp
mod.rownames[i] <- fsm.temp$modelname
# Calculate time
timeremain <- round(((proc.time()[3] - ptm)*(num.mod - i)/i)/60)
}
rownames(mod.score) <- mod.rownames
colnames(mod.score) <- "Combined score"
rownames(ssm) <- mod.rownames
# calculate estimate
num.models <- length(fmm)
m <- min(numpred, num.models)
topx_scores <- sort(unique(mod.score))[1:m]
lenx <- length(topx_scores)
topx_index <- which(mod.score %in% topx_scores)
predx.vector <- c()
for (i in 1:lenx)
{
tmp <- ((fmm[[topx_index[i]]])$global)[1,]
predx.vector <- c(predx.vector, tmp)
}
predx <- GeoMean(predx.vector)
predx_high <- max(predx.vector)
predx_low <- min(predx.vector)
list(model.scores=mod.score, fmm=fmm, ssm=ssm, estimate=predx, upper_estimate=predx_high, lower_estimate=predx_low, models=models, m=m)
}
####### C5a. Fit/score multiple models function #######
DiveMaster <- function(models, init.params, param.ranges, main.samp, tot.pop=(100*(DivSampleNum(main.samp,2)[1])), numit=10^5, varleft=1e-8,
subsizes=6, dssamps=list(), nrf=1, minrarefac=1, NResamples=1000, minplaus=10,
precision.lv=c(0.0001, 0.005, 0.005), plaus.pen=500,
crit.wts=c(1.0, 1.0, 1.0, 1.0), fitloops=2, numpred=5)
{
mainfit <- MultipleScoring(models, init.params, param.ranges, main.samp, tot.pop, numit, varleft,
subsizes, dssamps, nrf, minrarefac, NResamples, minplaus, precision.lv, plaus.pen, crit.wts, fitloops, numpred)
mainfit$call <- match.call()
class(mainfit) <- "DiveMaster"
mainfit
}
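# Illustrative usage (assumes the bundled ModelSet/ParamSeeds/ParamRanges objects
# and a hypothetical sample "my.sample"; tot.pop is the assumed size of the total
# population to which diversity is extrapolated):
# dm <- DiveMaster(models = ModelSet[1:5], init.params = ParamSeeds[1:5],
#                  param.ranges = ParamRanges[1:5], main.samp = my.sample,
#                  tot.pop = 1e6)
# summary(dm)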
####### C5b. Print method - score of multiple models #######
print.DiveMaster <- function(x, ...)
{
cat("Model score ($model.scores):\n")
print(x$model.scores)
cat("\nPopulation diversity estimate ($estimate):\n")
print(x$estimate)
}
####### C5c. Summary method: fit/scores of multiple models #######
summary.DiveMaster <- function(object, ...)
{
num.models <- length(object$fmm)
best.index <- which.min(object$model.scores)
best.model <- (object$fmm[[best.index]])$modelname
div.pred.from.top.model <- ((object$fmm[[best.index]])$global)[1,]
TAB <- data.frame(Number_of_models_fitted=as.numeric(num.models),
Best_scoring_model=best.model,
Estimated_global_diversity=as.numeric(object$estimate),
Diversity_upperbound=as.numeric(object$upper_estimate),
Diversity_lowerbound=as.numeric(object$lower_estimate))
res <- list(call=object$call,
rsum=TAB)
class(res) <- "summary.DiveMaster"
res
}
####### C5d. Print summary method - fits/scores of multiple models #######
print.summary.DiveMaster <- function(x, ...)
{
cat("Call:\n")
print(x$call)
cat("\nMultiple Model fitting and scoring summary:\n")
print(x$rsum)
}
####### C6. Combining several DiveMaster objects into a single DiveMaster object #######
# Input: List of DiveMaster objects
# Output: Single combined DiveMaster object
CombDM <- function(dmlist)
{
l <- length(dmlist) # number of DiveMaster objects
out <- dmlist[[1]]
if (l>1)
{
for (i in 2:l)
{
out$model.scores <- rbind(out$model.scores, dmlist[[i]]$model.scores)
out$ssm <- rbind(out$ssm, dmlist[[i]]$ssm)
out$fmm <- c(out$fmm, dmlist[[i]]$fmm)
out$models <- c(out$models, dmlist[[i]]$models)
out$m <- min(out$m, dmlist[[i]]$m)
}
# Recompute estimate
num.models <- length(out$fmm)
m <- min(out$m, num.models)
topx_scores <- sort(unique(out$model.scores))[1:m]
lenx <- length(topx_scores)
topx_index <- which(out$model.scores %in% topx_scores)
predx.vector <- c()
for (j in 1:lenx)
{
tmp <- ((out$fmm[[topx_index[j]]])$global)[1,]
predx.vector <- c(predx.vector, tmp)
}
predx <- GeoMean(predx.vector)
predx_high <- max(predx.vector)
predx_low <- min(predx.vector)
out$estimate <- predx
out$upper_estimate <- predx_high
out$lower_estimate <- predx_low
}
class(out) <- "DiveMaster"
return(out)
}
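# Illustrative usage (hypothetical DiveMaster objects dm1 and dm2, e.g. from runs
# over different subsets of models): combine them and recompute the estimate.
# dm.all <- CombDM(list(dm1, dm2))
# summary(dm.all)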
####### C7. Give a diversity estimate for a different population size based on top-x score models #######
# Input: DiveMaster object, population size, Number of top scores models to include in estimate
# Output: Single population estimate
PopDiversity = function (dm, popsize, TopX = NULL)
{
if (is.null(TopX)) m <- dm$m else
{
if (TopX <= length(dm$fmm)) m <- TopX else
{
m <- min(5, length(dm$fmm))
warning("Number of models specified is greater then number of models fitted")
}
}
topx_scores <- sort(unique(dm$model.scores))[1:m]
lenx <- length(topx_scores)
topx_index <- which(dm$model.scores %in% topx_scores)
predx.vector <- c()
for (i in 1:lenx)
{
tmp <- dm$models[[topx_index[i]]](popsize, dm$fmm[[topx_index[i]]]$param[1, ])
predx.vector <- c(predx.vector, tmp)
}
predx <- GeoMean(predx.vector)
predx_high <- max(predx.vector)
predx_low <- min(predx.vector)
return(list(estimate = predx, upper_estimate = predx_high,
lower_estimate = predx_low))
}
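# Illustrative usage (hypothetical DiveMaster object "dm"): diversity estimate
# extrapolated to a population of 5e5 individuals, using the top 3 scoring models.
# PopDiversity(dm, popsize = 5e5, TopX = 3)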