R/load.R
In marinespeed: Benchmark Data Sets and Functions for Marine Species Distribution Modelling

#' List species
#'
#' \code{list_species} returns a dataframe with all the species names and their
#' WoRMS identifier (\code{aphia_id}).
#'
#' @usage list_species()
#'
#' @details If the file with the list of species is not present on the hard disk
#'   then it is downloaded and stored in the data directory. The data directory
#'   can be set with \code{options(marinespeed_datadir = ".")}. By default data
#'   is downloaded to \code{tempdir()}.
#'
#' @examples
#' species <- list_species()
#' head(species)
#'
#' @export
#' @seealso \code{\link{lapply_kfold_species}}, \code{\link{lapply_species}},
#'   \code{\link{get_fold_data}}, \code{\link{get_occurrences}},
#'   \code{\link{species_info}}
list_species <- function() {
  csv2rds(get_file("species.csv.gz"))
}

#' Species information
#'
#' \code{species_info} returns a dataframe with species information.
#'
#' @usage species_info()
#'
#' @details Information about the taxonomy from the
#'   World Register of Marine Species (WoRMS), information on the covered
#'   latitudinal zones and the level of sample selection bias.
#'
#' @examples
#' info <- species_info()
#' unique(info$kingdom)
#' colnames(info)
#' @export
#' @seealso \code{\link{list_species}}
species_info <- function() {
  csv2rds(get_file("traits.csv.gz"))
}

#' Get occurrence records
#'
#' \code{get_occurrences} returns a data.frame with all occurrence records for
#' one or more species, first columns are species, longitude and latitude
#' followed by environmental data columns.
#'
#' @usage get_occurrences(species, raw = FALSE)
#'
#' @param species dataframe or character vector. Dataframe like returned by
#'   \code{\link{list_species}} or the names of the species.
#' @param raw logical. If \code{FALSE} then 25 square kilometer grid and manual
#'   outlier filtered occurrence records are returned.
#'
#' @return Dataframe with as columns: species, longitude, latitude and the
#'   environmental variable columns.
#'
#' @examples \dontrun{
#' abalistes_stellatus <- get_occurrences("Abalistes stellatus")
#'
#' species <- list_species()
#' first10 <- get_occurrences(species[1:10,])
#' }
#' @export
#' @seealso \code{\link{lapply_species}}, \code{\link{get_fold_data}},
#'   \code{\link{list_species}}, \code{\link{get_background}}
get_occurrences <- function(species = NULL, raw = FALSE) {
  if(raw) {
    dir <- get_file("occurrences_raw.zip")
  } else {
    dir <- get_file("occurrences.zip")
  }
  paths <- list.files(dir, pattern = "[.]csv[.]gz", full.names = TRUE)
  if(!is.null(species)) {
    species <- get_species_names(species)

    filenames <- basename(paths)
    filespecies <- sub("\\s?[0-9]*[.]csv[.]gz", "", filenames)
    paths <- paths[filespecies %in% species]

    if(length(paths) == 0) {
      warning("No occurrence files found for the provided species")
    }
  }
  do.call("rbind", lapply(paths, function(p) { csv2rds(p) }))
}

#' Get fold data
#'
#' \code{get_fold_data} returns a list of training and test occurrence and
#' background data fold(s) for one or more species.
#'
#' @usage get_fold_data(species, fold_type, k)
#'
#' @param species dataframe or character vector. Row from the dataframe returned
#'   by \code{\link{list_species}} or the name of the species.
#' @param fold_type character. Type of partitioning you want to use, default is
#'   \code{"disc"}.
#' @param k integer vector. Numbers of the folds you want to get data for, if
#'   you want all folds use \code{1:5}, which is the default.
#'
#' @details The different \code{fold_type} are:
#'
#'   \code{"disc"}: 5-fold disc partitioning of occurrences with pairwise
#'   distance sampled and buffer filtered random background points, equivalent
#'   to calling \code{\link{kfold_occurrence_background}} with
#'   \code{occurrence_fold_type = "disc", k = 5, pwd_sample = TRUE,
#'   background_buffer = 200*1000}
#'
#'   \code{"grid_4"} and \code{"grid_9"}: 4-fold and 9-fold grid partitioning of
#'   occurrences with pairwise distance sampled and buffer filtered random
#'   background points, equivalent to calling
#'   \code{\link{kfold_occurrence_background}} with \code{occurrence_fold_type =
#'   "grid", k = 4, pwd_sample = TRUE, background_buffer = 200*1000}
#'
#'   \code{"random"}: 5-fold random partitioning of occurrences and random
#'   background points, equivalent to calling
#'   \code{\link{kfold_occurrence_background}} with \code{occurrence_fold_type =
#'   "random", k = 5, pwd_sample = FALSE, background_buffer = 0}
#'
#'   \code{"targetgroup"}: same way of partitioning as the \code{"random"} folds
#'   but instead of random background points, a random subset of all occurrences
#'   points was used creating a targetgroup background points set which has the
#'   same sampling bias as the entire dataset.
#'
#' @return A 5 element list with fold data filled in for all \code{k}. Fold data
#'   consists of a list with 4 elements: \code{occurrence_training},
#'   \code{occurrence_test}, \code{background_training} and
#'   \code{background_test}.
#'
#' @examples \dontrun{
#' aba_folds <- get_fold_data("Abalistes stellatus", "random", k = 1:5)
#' k1 <- aba_folds[[1]]
#' k1$occurrence_training
#' k1$occurrence_test
#' k1$background_training
#' k1$background_test
#' }
#' @export
#' @seealso \code{\link{list_species}} \code{\link{lapply_kfold_species}}
#'   \code{\link{lapply_species}} \code{\link{kfold_data}}
get_fold_data <- function(species, fold_type, k = NULL) {
  if(NROW(species) > 1) {
    ## otherwise might get into memory problems
    stop("get_fold_data expects only 1 species")
  }
  max_k <- 5
  max_k <- ifelse(fold_type == "grid_4", 4, max_k)
  max_k <- ifelse(fold_type == "grid_9", 9, max_k)
  if(is.null(k)) {
    k <- 1:max_k
  }
  if(min(k) < 1 || max(k) > max_k) {
    stop(paste0("k should be between 1 and ", max_k))
  }
  species <- get_species_names(species)
  occurrences <- get_occurrences(species)
  folds <- get_folds(fold_type)
  if(fold_type == "targetgroup") {
    bg <- get_background("targetgroup")
  } else {
    bg <- get_background("random")
  }
  klist <- lapply(1:max_k, function(t) NULL)
  for (fold in k) {
    occ_train <- kfold_data(species, occurrences, folds$species, fold, training = TRUE)
    occ_test <- kfold_data(species, occurrences, folds$species, fold, training = FALSE)
    bg_train <- kfold_data(species, bg, folds$background, fold, training = TRUE)
    bg_test <- kfold_data(species, bg, folds$background, fold, training = FALSE)
    bg_train$species <- rep("background", nrow(bg_train))
    bg_test$species <- rep("background", nrow(bg_test))
    klist[[fold]] <- list(occurrence_training=occ_train, occurrence_test=occ_test,
                        background_training=bg_train, background_test=bg_test)
  }
  # }
  klist
}

#' Get MarineSPEED version
#'
#' \code{get_version} returns the currently used MarineSPEED version, this can
#' be changed by setting \code{options(marinespeed_version="<version
#' information>")}.
#'
#' @usage get_version()
#'
#' @return Character with the current version ("V1") or another version if the
#'   \code{marinespeed_version} has been set.
#'
#' @examples
#' print(get_version())
#'
#' @export
get_version <- function() {
  v <- getOption("marinespeed_version")
  if(is.null(v)) {
    v <- "v1"
  }
  v
}

get_datadir <- function() {
  datadir <- getOption("marinespeed_datadir")
  if(is.null(datadir)) {
    datadir <- file.path(tempdir(), "marinespeed")
    warning("file.path(tempdir(), \"marinespeed\") will be used as datadir, set options(marinespeed_datadir=\"<directory>\") to avoid re-downloading the data in every session")
  }
  datadir <- file.path(datadir, get_version())
  if(!dir.exists(datadir)) {
    dir.create(datadir, recursive = TRUE)
  }
  datadir
}

get_file <- function(filename) {
  datadir <- get_datadir()

  outfile <- file.path(datadir, filename)
  outfile_nozip <- file.path(datadir,sub("[.]zip$", "", filename))
  if(!file.exists(outfile) && !dir.exists(outfile_nozip)) {
    root <- paste0("http://marinespeed.samuelbosch.com/", get_version(), "/")
    ok <- -1
    tryCatch({
      ok <- utils::download.file(paste0(root, filename), outfile, mode="wb")
    }, finally = {
      if(ok != 0 && file.exists(outfile)) {
        file.remove(outfile)
      }
    })
    if(grepl("[.]zip$", filename)) {
      unzip(outfile, exdir = outfile_nozip)
      file.remove(outfile)
    }
  }
  normalizePath(outfile_nozip)
}

get_species_names <- function(species) {
  if(NCOL(species) >= 2 && all(c("species", "aphia_id") %in% colnames(species))) {
    species <- species[,"species"]
  }
  if(!is.character(species) && !is.factor(species)){
    stop("invalid species input")
  }
  as.character(species)
}

csv2rds <- function(file, extension = ".rds") {
  rds_file <- sub("[.]csv[.]gz$", extension, file)
  data <- NULL
  if(file.exists(rds_file)) { ## cache as rds (faster)
    data <- readRDS(rds_file)
  }
  if(extension == ".rds" && (is.null(data) || getOption("stringsAsFactors") != is.factor(data[1,1]))) {
    data <- read.csv(file)
    saveRDS(data, rds_file)
  } else if (is.null(data) && grepl("cv_folds_bit[.]rds$", rds_file)) {
    folds <- read.csv(file)
    data <- structure(list(), class = "marinespeed_folds")

    species <- folds[,1]
    for(sp in unique(species)) {
      w <- which(species == sp)
      data[["species"]][[sp]] <- c(min(w), max(w))
    }
    for(ki in 2:ncol(folds)) {
      if(any(is.na(folds[,ki]))) {
        data[[paste0(colnames(folds)[ki], "_NOTNA")]] <- !bit::as.bit(is.na(folds[,ki]))
      }
      data[[colnames(folds)[ki]]] <- bit::as.bit(folds[,ki])
    }
    saveRDS(data, rds_file)
  } else if(is.null(data)) {
    stop("unsupported")
  }
  data
}

#' Get folds
#'
#' \code{get_folds} returns the different pre-generated folds information. To
#' get the fold data for a species see also \code{\link{get_fold_data}}.
#'
#' @usage get_folds(type = "disc")
#'
#' @param type character. The type of partitioning you want to load.
#'
#' @details The different supported \code{type} are:
#'
#'   \code{"disc"}: 5-fold disc partitioning of occurrences with pairwise
#'   distance sampled and buffer filtered random background points, equivalent
#'   to calling \code{\link{kfold_occurrence_background}} with
#'   \code{occurrence_fold_type = "disc", k = 5, pwd_sample = TRUE,
#'   background_buffer = 200*1000}
#'
#'   \code{"grid_4"} and \code{"grid_9"}: 4-fold and 9-fold grid partitioning of
#'   occurrences with pairwise distance sampled and buffer filtered random
#'   background points, equivalent to calling
#'   \code{\link{kfold_occurrence_background}} with \code{occurrence_fold_type =
#'   "grid", k = 4, pwd_sample = TRUE, background_buffer = 200*1000}
#'
#'   \code{"random"}: 5-fold random partitioning of occurrences and random
#'   background points, equivalent to calling
#'   \code{\link{kfold_occurrence_background}} with \code{occurrence_fold_type =
#'   "random", k = 5, pwd_sample = FALSE, background_buffer = 0}
#'
#'   \code{"targetgroup"}: same way of partitioning as the \code{"random"} folds
#'   but instead of random background points, a random subset of all occurrences
#'   points was used creating a targetgroup background points set which has the
#'   same sampling bias as the entire dataset.
#'
#' @return A list with two entries \code{"background"} and \code{"species"},
#'   each entry is a dataframe with species name column and 5 fold columns as
#'   created by \code{\link{kfold_occurrence_background}}
#'
#' @examples \dontrun{
#' folds <- get_folds("random")
#'
#' abalistes <- "Abalistes stellatus"
#' occ <- get_occurrences(abalistes)
#' bg <- get_background("random")
#'
#' occ_train <- kfold_data(abalistes, occ, folds$species, k=1, training=TRUE)
#' occ_test <- kfold_data(abalistes, occ, folds$species, k=1, training=FALSE)
#' bg_train <- kfold_data(abalistes, bg, folds$background, k=1, training=TRUE)
#' bg_test <- kfold_data(abalistes, bg, folds$background, k=1, training=FALSE)
#' }
#' @export
#' @seealso \code{\link{lapply_kfold_species}} \code{\link{get_fold_data}}
#'   \code{\link{get_occurrences}} \code{\link{get_background}}
#'   \code{\link{kfold_data}}
get_folds <- function(type = "disc") {
  type <- as.character(type)
  if(type == "disc") {
    bg <- "pseudodisc_background_5cv_folds"
    species <- "pseudodisc_species_5cv_folds"
  } else if(type == "grid_4") {
    bg <- "grid_background_4cv_folds"
    species <- "grid_species_4cv_folds"
  } else if(type == "grid_9") {
    bg <- "grid_background_9cv_folds"
    species <- "grid_species_9cv_folds"
  } else if(type == "random") {
    bg <- "random_background_5cv_folds"
    species <- "random_species_5cv_folds"
  } else if(type == "targetgroup") {
    bg <- "targetgroup_background_5cv_folds"
    species <- "random_species_5cv_folds"
  } else {
    stop("fold_type not supported")
  }
  extension <- getOption("marinespeed_folds_extension")
  extension <- ifelse(is.null(extension), "_bit.rds", extension)
  bg <- get_file(paste0(bg, extension))
  species <- get_file(paste0(species, extension))
  if(extension == ".csv.gz") {
    bg_folds <- csv2rds(bg)
    species_folds <- csv2rds(species)
  } else {
    bg_folds <- readRDS(bg)
    species_folds <- readRDS(species)
  }

  list(background = bg_folds, species = species_folds)
}

#' Get background data
#'
#' \code{get_background} returns pre-generated background data.
#'
#' @usage get_background(type)
#'
#' @param type character. Either \code{"random"} or \code{"targetgroup"}.
#'
#' @details The targetgroup background was created by subsampling an average of
#'   37 occurrence records (20000 in total) from each species in the dataset
#'   providing in essence the same sampling bias as the entire dataset.
#'
#' @return A dataframe with all background points.
#'
#' @examples \dontrun{
#' random_bg <- get_background("random")
#' plot(random_bg[,2:3], pch=".", main="random background")
#'
#' targetgroup_bg <- get_background("targetgroup")
#' plot(targetgroup_bg[,2:3], pch=".", main="targetgroup background")
#' }
#' @export
#' @seealso \code{\link{get_fold_data}} \code{\link{get_occurrences}}
#'   \code{\link{get_folds}}
get_background <- function(type) {
  if(!(as.character(type) %in% c("random", "targetgroup"))) {
    stop("background type not recognized")
  }
  read.csv(get_file(paste0(as.character(type), "_background.csv.gz")))
}
Any scripts or data that you put into this service are public.
marinespeed documentation built on May 1, 2019, 10:26 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
marinespeed
Benchmark Data Sets and Functions for Marine Species Distribution Modelling

R/load.R
In marinespeed: Benchmark Data Sets and Functions for Marine Species Distribution Modelling

Try the marinespeed package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

marinespeed Benchmark Data Sets and Functions for Marine Species Distribution Modelling

R/load.R In marinespeed: Benchmark Data Sets and Functions for Marine Species Distribution Modelling

Try the marinespeed package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

marinespeed
Benchmark Data Sets and Functions for Marine Species Distribution Modelling

R/load.R
In marinespeed: Benchmark Data Sets and Functions for Marine Species Distribution Modelling