R/write_pyrate.R
In CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections

Documented in write_pyrate

#' Create Input Files for PyRate
#' 
#' Creates the input necessary to run Pyrate, based on a data.frame with fossil
#' ages (as derived e.g. from clean_fossils) and a vector of the
#' extinction status for each sample. Creates files in the working directory!
#' 
#' The replicate option allows the user to generate several replicates of the
#' data set in a single input file, each time re-drawing the ages of the
#' occurrences at random from uniform distributions with boundaries MinT and
#' MaxT. The replicates can be analysed in different runs (see PyRate command
#' -j) and combining the results of these replicates is a way to account for
#' the uncertainty of the true ages of the fossil occurrences. Examples:
#' replicates=1 (default, generates 1 data set), replicates=10 (generates 10
#' random replicates of the data set).
#' 
#' @param taxon character string. The column with the taxon name. 
#' Default = \dQuote{accepted_name}.
#' @param status a vector of character strings of length \code{nrow(x)}.
#' Indicating for each record \dQuote{extinct} or \dQuote{extant}.
#' @param trait a numeric vector of length \code{nrow(x)}. Indicating trait
#' values for each record. Optional.  Default = NULL.
#' @param fname a character string. The prefix to use for the output files.
#' @param path a character string. giving the absolute path to write the output
#' files. Default is the working directory.
#' @param replicates a numerical. The number of replicates for the randomized
#' age generation. See details. Default = 1.
#' @param cutoff a numerical. Specify a threshold to exclude fossil occurrences
#' with a high temporal uncertainty, i.e. with a wide temporal range between
#' min_age and max_age. Examples: cutoff=NULL (default; all occurrences are
#' kept in the data set) cutoff=5 (all occurrences with a temporal range of 5
#' Myr or higher are excluded from the data set)
#' @param random logical. Specify whether to take a random age (between MinT
#' and MaxT) for each occurrence or the midpoint age. Note that this option
#' defaults to TRUE if several replicates are generated (i.e. replicates > 1).
#' Examples: random = TRUE (default) random = FALSE (use midpoint ages)
#' @inheritParams cf_age
#' 
#' @return PyRate input files in the working directory.
#' 
#' @note See \url{https://github.com/dsilvestro/PyRate/wiki} for more details
#' and tutorials on PyRate and PyRate input.
#' 
#' @keywords Fossil
#' @examples
#' 
#' minages <- runif(250, 0, 65)
#' exmpl <- data.frame(accepted_name = sample(letters, size = 250, replace = TRUE),
#'                     lng = runif(250, min = 42, max = 51),
#'                     lat = runif(250, min = -26, max = -11),
#'                     min_ma = minages,
#'                     max_ma = minages + runif(250, 0.1, 65))
#' 
#' #a vector with the status for each record, 
#' #make sure species are only classified as either extinct or extant, 
#' #otherwise the function will drop an error
#' 
#' status <- sample(c("extinct", "extant"), size = nrow(exmpl), replace = TRUE)
#' 
#' #or from a list of species
#' status <- sample(c("extinct", "extant"), size = length(letters), replace = TRUE)
#' names(status) <- letters
#' status <- status[exmpl$accepted_name]
#' 
#' \dontrun{
#' write_pyrate(x = exmpl,fname = "test", status = status)
#' }
#' 
#' @export
#' @importFrom stats runif 
#' @family fossils
#' @importFrom utils write.table
write_pyrate <- function(x,
                        status,
                        fname, 
                        taxon = "accepted_name", 
                        min_age = "min_ma", 
                        max_age = "max_ma",
                        trait = NULL, 
                        path = getwd(), 
                        replicates = 1,
                        cutoff = NULL, 
                        random = TRUE) {
  if (missing(fname)) {
    stop("'fname' missing with no default")
  }

  if (missing(status)) {
    stop("'status' missing with no default.")
  }

  # adapt input data
  dat1 <- data.frame(species = x[, taxon], 
                     Status = status, 
                     min_age = x[, min_age], 
                     max_age = x[, max_age])

  if (length(trait) > 0) {
    dat1 <- data.frame(dat1, trait)
  }

  outfile <- paste(path, "/", fname, "_PyRate.py", sep = "")

  dat1[, 1] <- gsub("[[:blank:]]{1,}", "_", dat1[, 1])

  if (replicates > 1) {
    random <- TRUE
  }

  if (any(is.na(dat1[, 1:4]))) {
    print(c(which(is.na(dat1[, 1])), which(is.na(dat1[, 2])), 
            which(is.na(dat1[, 3])), which(is.na(dat1[, 4]))))
    stop(paste("the input file contains missing data",
               "in species names, status or ages\n"))
  }

  if (!is.null(cutoff)) {
    dat <- dat1[!(dat1[, 4] - dat1[, 3] >= cutoff), ]
    cat("\n\nExcluded ", 
        100 - round(100 * dim(dat)[1] / dim(dat1)[1]), 
        "% occurrences")
    hist(dat1[, 4] - dat1[, 3])
  } else {
    dat <- dat1
  }

  if (length(dat) == 5) {
    colnames(dat) <- c("Species", "Status", "min_age", "max_age", "trait")
  } else {
    colnames(dat) <- c("Species", "Status", "min_age", "max_age")
  }

  dat$new_age <- "NA"
  splist <- unique(dat[, c(1, 2)])[order(unique(dat[, c(1, 2)][, 1])), ]


  if (any(is.element(splist$Species[splist$Status == "extant"], 
                     splist$Species[splist$Status == "extinct"]))) {
    print(intersect(splist$Species[splist$Status == "extant"], 
                    splist$Species[splist$Status == "extinct"]))
    stop("at least one species is listed as both extinct and extant\n")
  }

  cat("#!/usr/bin/env python", "from numpy import * ", "",
    file = outfile,
    sep = "\n"
  )

  for (j in 1:replicates) {
    times <- list()
    cat("\nreplicate", j)

    dat[dat$min_age == 0, 3] <- 0.001

    if (isTRUE(random)) {
      dat$new_age <- round(stats::runif(length(dat[, 1]), 
                                        min = apply(dat[,3:4], FUN = min, 1), 
                                        max = apply(dat[, 3:4], FUN = max, 1)),
      digits = 6
      )
    } else {
      for (i in seq_along(dat[, 1])) {
        dat$new_age[i] <- mean(c(dat[i, 3], dat[i, 4]))
      }
    }

    dat2 <- subset(dat, select = c("Species", "new_age"))
    taxa <- sort(unique(dat2$Species))

    for (n in seq_along(taxa)) {
      times[[n]] <- dat2$new_age[dat2$Species == taxa[n]]
      if (toupper(splist$Status[splist$Species == taxa[n]]) == 
          toupper("extant")) {
        times[[n]] <- append(times[[n]], "0", after = length(times[[n]]))
      }
    }

    dat3 <- matrix(data = NA, 
                   nrow = length(times), 
                   ncol = max(vapply(times, length, FUN.VALUE = numeric(1))))
    rownames(dat3) <- taxa

    for (p in seq_along(times)) {
      dat3[p, seq_along(times[[p]])] <- times[[p]]
    }

    cat(noquote(sprintf("\ndata_%s=[", j)), file = outfile, append = TRUE)

    for (n in seq_len(length(taxa) - 1)) {
      rec <- paste(dat3[n, !is.na(dat3[n, ])], collapse = ",")
      cat(noquote(sprintf("array([%s]),", rec)),
        file = outfile, append = TRUE,
        sep = "\n"
      )
    }

    n <- n + 1
    rec <- paste(dat3[n, !is.na(dat3[n, ])], collapse = ",")
    cat(noquote(sprintf("array([%s])", rec)),
      file = outfile, append = TRUE,
      sep = "\n"
    )

    cat("]", "", file = outfile, append = TRUE, sep = "\n")
  }


  data_sets <- ""
  names <- ""

  if (replicates > 1) {
    for (j in 1:(replicates - 1)) {
      data_sets <- paste(data_sets, noquote(sprintf("data_%s,", j)))
      names <- paste(names, noquote(sprintf(" '%s_%s',", fname, j)))
    }

    data_sets <- paste(data_sets, noquote(sprintf("data_%s", j + 1)))
    names <- paste(names, noquote(sprintf(" '%s_%s',", fname, j + 1)))
  } else {
    data_sets <- "data_1"
    names <- noquote(sprintf(" '%s_1'", fname))
  }

  cat(noquote(sprintf("d=[%s]", data_sets)), noquote(sprintf(
    "names=[%s]",
    names
  )), "def get_data(i): return d[i]", "def get_out_name(i): return  names[i]",
  file = outfile, append = TRUE, sep = "\n"
  )


  tax_names <- paste(taxa, collapse = "','")
  cat(noquote(sprintf("taxa_names=['%s']", tax_names)), 
      "def get_taxa_names(): return taxa_names",
    file = outfile, append = TRUE, sep = "\n"
  )


  if ("trait" %in% colnames(dat)) {
    datBM <- dat[, 1]
    splist$Trait <- NA
    for (n in seq_along(splist[, 1])) {
      splist$Trait[n] <- mean(dat$trait[datBM == splist[n, 1]], na.rm = TRUE)
    }
    s1 <- "\ntrait1=array(["
    BM <- gsub("NaN|NA", "nan", toString(splist$Trait))
    s2 <- "])\ntraits=[trait1]\ndef get_continuous(i): return traits[i]"
    STR <- paste(s1, BM, s2)
    cat(STR, file = outfile, append = TRUE, sep = "\n")
  }

  splistout <- paste(path, "/", fname, "_TaxonList.txt", sep = "")
  lookup <- as.data.frame(taxa)
  lookup$status <- "extinct"

  write.table(splist, 
              file = splistout, 
              sep = "\t", 
              row.names = FALSE, 
              quote = FALSE)
  cat("\n\nPyRate input file was saved in: ", sprintf("%s", outfile), "\n\n")
}

Any scripts or data that you put into this service are public.

CoordinateCleaner documentation built on Oct. 25, 2023, 1:08 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

CoordinateCleaner
Automated Cleaning of Occurrence Records from Biological Collections

R/write_pyrate.R
In CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections

Defines functions write_pyrate

Documented in write_pyrate

Try the CoordinateCleaner package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CoordinateCleaner Automated Cleaning of Occurrence Records from Biological Collections

R/write_pyrate.R In CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections

Defines functions write_pyrate

Documented in write_pyrate

Try the CoordinateCleaner package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CoordinateCleaner
Automated Cleaning of Occurrence Records from Biological Collections

R/write_pyrate.R
In CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections