NEMO: What the Package Does (One Line, Title Case)

Documented in Dfunabc

#' @title Distance of observed taxa versus sample distribution via ABC-rejection
#'
#' @param gradient A numeric vector representing the gradient along which the taxa has been observed (e.g. concentration of nutrients).
#' @param taxa A character vector with the names of all taxa needs to be the same length at the gradient.
#' @param dist Assumed distribution for the data generating model. This can either be a lognormal (lnrom; default) or gamma (gamma) distribution.
#' @param pr1dist An argument that indicates what prior the location parameter takes "lnorm", "gamma" or "norm".
#' for the gamma distribution this is not the scale but rate parameter, defaulting "lnorm" to scale 1.
#' @param par1 An numeric argument that indicates which indicates what the scale parameter should be.
#' @param pr2dist An argument that indicates what prior the scale parameter takes for "lnorm", "gamma" or "norm".
#' for the gamma distribution this is not the scale but rate parameter, defaulting "gamma" with rate 1.
#' @param par2 An numeric argument that indicates which indicates what the scale parameter should be.
#' @param qtol The value for the quantile tolerance. A lower value selects simulated parameters that fall closer to the observed parameters. Default is 0.01 and indicates that the closest 1 percent is accepted and the rest rejected.
#' @param nsim Number of simulations used.
#' @param xlab An argument representing the name of the gradient.
#' @param size Size of the points in plot "A". Default is 6 but might be to large.
#' @param print.progress Prints the progress of the number of simulations completed (length of the priors). By default is TRUE
#' @param timing Prints the time that was needed to complete the simulations. By default is TRUE.
#' @param penalty The argument indicates whether the intervals of D and SP need to be adjusted based on the observed shift in loc linear adjustment of the distributional parameters.
#'
#' @details The function returns a list of 4 objects: a data frame with the Taxa with D an S, a data frame with S for all taxa,
#' a plotted figure (ggplot2) and a data frame with the simulate, priors and distance of the accepted values of each taxa.
#'
#' @export Dfunabc
#'
#' @examples
#'  \dontrun{
#'  library(readr)
#' df <- read_csv(url("https://raw.githubusercontent.com/snwikaij/Data/main/Aquatic_Botany_Kaijser_et_al._2019.csv"))
#'
#' #Place Cl (Median) in a vector
#'  gradient = df$Mediaan
#'
#' #Place taxon names in a vector
#' taxa     = df$Soort
#'
#' #Simulate model (may take up to 5-10 min)
#' results <- Dfunabc(df$Mediaan, df$Soort,
#' dist = "lnorm", pr1dist = "lnorm", par1 = 0.5,
#' nsim = 200000, qtol= 0.005, size=4)
#'
#' #Display the results
#' results$Plot}
Dfunabc  <- function(gradient, taxa,
                     dist ="lnorm",
                     pr1dist = "lnorm", par1 = 1,
                     pr2dist = "gamma", par2 = 1,
                     nsim = 200000, qtol= 0.005,
                     D_min = 0, xlab = "Gradient", size = 6,
                     print.progress = T, timing = T,
                     penalty = F){

  #Start timing
  if(timing == T){stime <- Sys.time()}

  if(qtol <= 0){stop("Quantile tolerance cannot be <= 0")}
  if(qtol >= 0.05){warning("Quantile tolerance >= 0.05 is considered `high`")}
  if(nsim <= 0){stop("Number of simulation cannot be <=0")}
  if(nsim <= 50000){warning("Number of simulations <= 50000 is considered 'low")}
  if(D_min < 0 | D_min >= 1){stop("Minimal D cannot be negative or higher than 1")}
  if(D_min > 0){warning(paste("The mimimal D included in calculation of the mean DP for positive and negative
  is set to", D_min,".", "If this was intentional the this warning can be ignored."))}

  #Combine environmental gradient and character vector
    df     <- data.frame(gradient=gradient, Taxa=as.character(taxa))
    df     <- na.omit(df)
    df     <- df[!df$Taxa %in% names(table(df$Taxa)[table(df$Taxa) == 1]),]

  #ABC-rejection function with quantile tolerance qtol
    spqtol <- function(gr, pr1dist, par1, pr2dist, par2, dist, nsim, qtol){

      llists       <- list()
      priorpar1    <- rep(NA, nsim)
      priorpar2    <- rep(NA, nsim)
      simpar1      <- rep(NA, nsim)
      simpar2      <- rep(NA, nsim)
      n            <- length(gr)
      obs          <- c(mean(gr), sd(gr))

      lognorm <- function(mu, si){

        meanlog <- log(mu)-0.5*log((si/mu)^2+1)
        sdlog   <- sqrt(log((si/mu)^2+1))

        return(c(meanlog, sdlog))}
      gamma   <- function(mu, si){

        scale <- si^2/mu
        shape <- mu/scale

        return(c(shape, scale))}

      for (p in 1:nsim){

        if(pr1dist == "lnorm"){
          par    <- lognorm(obs[1], obs[2])
          prior1 <- rlnorm(1, par[1], par1)}
        else if(pr1dist == "gamma"){
          prior1 <- rgamma(1, shape = obs[1], rate = par1)}
        else if(pr1dist == "norm"){
          prior1 <- rnorm(1, obs[1], par1)}
        else{stop("Select `lnorm`, `gamma` or `norm` for pr1dist")}

        if(pr2dist == "lnorm"){
          par    <- lognorm(obs[1], obs[2])
          prior2 <- rlnorm(1, par[2], par2)}
        else if(pr2dist == "gamma"){
          prior2 <- rgamma(1, shape = obs[1], rate = par2)}
        else if(pr2dist == "norm"){
          prior2 <- rnorm(1, obs[1], obs[2])}
        else{stop("Select `lnorm`, `gamma` or `norm` for pr2dist")}

        if(dist == "lnorm"){
          par           <- lognorm(prior1, prior2)
          simsamp       <- rlnorm(n, par[1], par[2])}
        else if(dist == "gamma"){
          par           <- gamma(prior1, prior2)
          simsamp       <- rgamma(n, shape = par[1], scale = par[2])}
        else if(dist == "norm"){
          simsamp       <- rnorm(n, prior1, prior2)}
        else{stop("Select `lnorm` or `gamma` or `norm` for the data generating model")}

        llists[[p]]   <- simsamp
        simpar1[p]    <- mean(simsamp)
        simpar2[p]    <- sd(simsamp)
        priorpar1[p]  <- prior1
        priorpar2[p]  <- prior2
      }

      #Statistics
      statistics            <- data.frame(simulate1=simpar1, simulate2=simpar2, prior1=priorpar1,  prior2=priorpar2)

      #Distance (Epsilon)
      statistics$distance   <- rowSums(sqrt(sweep(statistics[,1:2], 2, obs)^2))

      #Select closest quantile qtol
      parameters             <- statistics[order(statistics$distance),][1:c(nsim*qtol),]

      parameters                 <- cbind.data.frame(as.data.frame(matrix(ncol=2, nrow=nrow(parameters))), parameters)
      colnames(parameters)[1:2]  <- paste0(rep("LMadjusted",2),1:2)
      nr <- 1
      for(c in 1:2){
          si <- c+2
          pr <- c+4
          parameters[,nr]   <- mean(parameters[,pr])+resid(lm(parameters[,pr]~parameters[,si]))
          nr <-nr+1}

      allsim      <- list()
      for(l in rownames(parameters)){
        allsim[[l]] <- llists[[as.numeric(l)]]}

      allgr       <- do.call(cbind, allsim)
      allgr       <- tidyr::gather(as.data.frame(allgr))
      allgr$key   <- plyr::mapvalues(allgr$key, from=unique(allgr$key), to=1:length(unique(allgr$key)))

      return(list(simulates=allgr, parameters=parameters))}

  #Cumulative normalization function
  cus_fun   <- function(z){

    z[,2]   <- as.numeric(z[,2])
    z       <- z[order(z[,2]),]
    z$V3    <- cumsum(1:nrow(z))/max(cumsum(1:nrow(z)))

    return(z)}

    looplist      <- list()

  #Split dataset of species
    splitdf     <- split(df[,1], df[,2])
    w           <- sapply(splitdf, as.numeric)
    y           <- list()
    t           <- list()

    print("Task 1/2: Applying ABC to taxa")
    print("")
  #Start the ABC loop
  for(i in names(w)){

    if(print.progress == T){print(i)}

     k         <- spqtol(w[[i]], pr1dist=pr1dist, par1=par1, pr2dist=pr2dist, par2=par2, dist=dist, nsim=nsim, qtol=qtol)
     y[[i]]    <- cbind(i, k[[1]])
     t[[i]]    <- k[[2]]}

  #Number of accepted simulates
     acc       <- round(nsim*qtol)

    looplist  <- list()

    print("")
    print(paste("Task 2/2: Comparing", acc, "curves"))
  #Start comparison of curves
  for(i in 1:acc){

      if(print.progress == T){print(i)}

      s              <- lapply(y, function(y) y[y$key==i,])
      z              <- do.call(rbind, s)
      z              <- z[-2]

      r              <- cus_fun(z)
      colnames(r)    <- paste0(rep("V", 3),1:3)
      splitdfspec    <- split(r, r$V1)
      q              <- lapply(splitdfspec, function(x) cbind(x[,1:2], cumsum(1:dim(x)[1])/max(cumsum(1:dim(x)[1]))))

      lpdf  <- data.frame(Taxa=rep(NA, length(q)), Distance=rep(NA, length(q)), Split=rep(NA, length(q)))
      count <- 0
      for(l in names(q)){
        count        <- count+1
        s            <- q[[l]]
        rownames(s)  <-NULL
        n            <- r[r$V1==l,]
        Dl           <- n$V3-s[,3]
        m            <- abs(Dl)
        D            <- max(m)

        lpdf[count,] <- cbind(l, D*sign(Dl[which.max(m)]), s$V2[which.max(m)])}

      looplist[[i]]  <- lpdf}

  #Combine dataset and extract quantiles
  f                  <- do.call(rbind, looplist)
  res                <- aggregate(data=f, .~Taxa, function(x) c(mean(as.numeric(x)), quantile(as.numeric(x), c(.025, 0.975))))
  results            <- cbind.data.frame(res$Taxa, res$Distance, res$Split)
  colnames(results)  <- c("Taxa", "D", "D1", "D2", "S", "S1", "S2")
  results            <- cbind.data.frame(results[c("Taxa")], apply(results[-1], 2, as.numeric))
  results            <- cbind.data.frame(results, do.call(rbind, tapply(df$gradient, df$Taxa, quantile)))

  #Create pos and neg deviance points
  if(D_min > 0){cutD <- D_min}else{cutD <- 0}
  pos <- as.numeric(f$Split[f$Taxa %in% results$Taxa[results$D >= cutD]])
  neg <- as.numeric(f$Split[f$Taxa %in% results$Taxa[results$D < -cutD]])

  #Create data of grouping
  pr       <- df$gradient[df$Taxa %in% results$Taxa[results$D >= cutD]]
  nr       <- df$gradient[df$Taxa %in% results$Taxa[results$D < -cutD]]
  grouping <- data.frame(response=c(rep("Negative", length(nr)), rep("Positive", length(pr))), values=c(nr, pr))

  #Extract quantiles of all species and group
  d                  <- rbind.data.frame(quantile(neg, c(.025, .975)), c(quantile(pos, c(.025, .975))))
  d                  <- setNames(cbind(c("Negative", "Positive"), c(mean(neg), mean(pos)), d), c("Group", "S", "S1", "S2"))
  rownames(d)        <- NULL
  d$Group            <- factor(d$Group, levels = c("Positive", "Negative"))

  #Apply penalty although unsubstantiated by evidence
  if(penalty == T){
    penalty1   <- as.data.frame(t(sapply(t, function(x) quantile(x$prior1, c(.025, 0.975))/quantile(x$LMadjusted1, c(.025, 0.975))))*t(sapply(t, function(x) quantile(x$prior2, c(.025, 0.975))/quantile(x$LMadjusted2, c(.025, 0.975)))))
    results$S1 <- results$S1/penalty1[,1]
    results$S2 <- results$S2*penalty1[,2]
    results$D1 <- results$D1/penalty1[,1]
    results$D2 <- results$D2*penalty1[,2]
    d$S1       <- d$S1/mean(penalty1[,1])
    d$S2       <- d$S2/mean(penalty1[,2])}

  #Plot all species
  pl1 <- ggplot(results, aes(x=S, y=reorder(Taxa, -S)))+
    ylab("")+xlab("")+
    geom_errorbarh(aes(xmin = S1, xmax = S2, height = 0))+
    geom_point(aes(fill = results$D), fill = ifelse(results$D > 0, "white", "black"), pch=21, size = abs(results$D)*size)+
    theme_classic()+
    theme(axis.line = element_line(colour = "black"),
          axis.text.y = element_text(colour = "black"),
          axis.title.y = element_text(colour = "black"),
          legend.position = "none")

  #Plot only positive and negative response
  pl2 <- ggplot(d, aes(x=S, y=Group))+
    ylab("")+xlab(xlab)+
    xlim(ggplot_build(pl1)$layout$panel_scales_x[[1]]$range$range)+
    geom_errorbarh(aes(xmin = S1, xmax = S2, height = 0))+
    geom_point(fill = c("black", "white"), pch=21, size=6)+
    theme_classic()+
    theme(axis.line = element_line(colour = "black"),
          axis.text.y = element_text(colour = "black"),
          axis.title.y = element_text(colour = "black"),
          legend.position = "none")

  plot <- cowplot::plot_grid(pl1,pl2,labels = "AUTO", align = "v", ncol=1,
                             rel_heights = c(1.6, 0.4))

  #Stop timing
  if(timing == T){etime <- Sys.time()
  print(etime-stime)}

  return(list(Taxa=results, Binary=d, Plot=plot, parameters=t, distance=do.call(rbind,t)[,7], grouping=grouping))}