R/load.R

Defines functions list_species species_info get_occurrences get_fold_data get_version get_datadir get_file get_species_names csv2rds get_folds get_background

Documented in get_background get_fold_data get_folds get_occurrences get_version list_species species_info

#' List species
#'
#' \code{list_species} returns a dataframe with all the species names and their
#' WoRMS identifier (\code{aphia_id}).
#'
#' @usage list_species()
#'
#' @details If the file with the list of species is not present on the hard disk
#'   then it is downloaded and stored in the data directory. The data directory
#'   can be set with \code{options(marinespeed_datadir = ".")}. By default data
#'   is downloaded to \code{tempdir()}.
#'
#' @examples
#' species <- list_species()
#' head(species)
#'
#' @export
#' @seealso \code{\link{lapply_kfold_species}}, \code{\link{lapply_species}},
#'   \code{\link{get_fold_data}}, \code{\link{get_occurrences}},
#'   \code{\link{species_info}}
list_species <- function() {
  csv2rds(get_file("species.csv.gz"))
}

#' Species information
#'
#' \code{species_info} returns a dataframe with species information.
#'
#' @usage species_info()
#'
#' @details Information about the taxonomy from the
#'   World Register of Marine Species (WoRMS), information on the covered
#'   latitudinal zones and the level of sample selection bias.
#'
#' @examples
#' info <- species_info()
#' unique(info$kingdom)
#' colnames(info)
#' @export
#' @seealso \code{\link{list_species}}
species_info <- function() {
  csv2rds(get_file("traits.csv.gz"))
}

#' Get occurrence records
#'
#' \code{get_occurrences} returns a data.frame with all occurrence records for
#' one or more species, first columns are species, longitude and latitude
#' followed by environmental data columns.
#'
#' @usage get_occurrences(species, raw = FALSE)
#'
#' @param species dataframe or character vector. Dataframe like returned by
#'   \code{\link{list_species}} or the names of the species.
#' @param raw logical. If \code{FALSE} then 25 square kilometer grid and manual
#'   outlier filtered occurrence records are returned.
#'
#' @return Dataframe with as columns: species, longitude, latitude and the
#'   environmental variable columns.
#'
#' @examples \dontrun{
#' abalistes_stellatus <- get_occurrences("Abalistes stellatus")
#'
#' species <- list_species()
#' first10 <- get_occurrences(species[1:10,])
#' }
#' @export
#' @seealso \code{\link{lapply_species}}, \code{\link{get_fold_data}},
#'   \code{\link{list_species}}, \code{\link{get_background}}
get_occurrences <- function(species = NULL, raw = FALSE) {
  if(raw) {
    dir <- get_file("occurrences_raw.zip")
  } else {
    dir <- get_file("occurrences.zip")
  }
  paths <- list.files(dir, pattern = "[.]csv[.]gz", full.names = TRUE)
  if(!is.null(species)) {
    species <- get_species_names(species)

    filenames <- basename(paths)
    filespecies <- sub("\\s?[0-9]*[.]csv[.]gz", "", filenames)
    paths <- paths[filespecies %in% species]

    if(length(paths) == 0) {
      warning("No occurrence files found for the provided species")
    }
  }
  do.call("rbind", lapply(paths, function(p) { csv2rds(p) }))
}

#' Get fold data
#'
#' \code{get_fold_data} returns a list of training and test occurrence and
#' background data fold(s) for one or more species.
#'
#' @usage get_fold_data(species, fold_type, k)
#'
#' @param species dataframe or character vector. Row from the dataframe returned
#'   by \code{\link{list_species}} or the name of the species.
#' @param fold_type character. Type of partitioning you want to use, default is
#'   \code{"disc"}.
#' @param k integer vector. Numbers of the folds you want to get data for, if
#'   you want all folds use \code{1:5}, which is the default.
#'
#' @details The different \code{fold_type} are:
#'
#'   \code{"disc"}: 5-fold disc partitioning of occurrences with pairwise
#'   distance sampled and buffer filtered random background points, equivalent
#'   to calling \code{\link{kfold_occurrence_background}} with
#'   \code{occurrence_fold_type = "disc", k = 5, pwd_sample = TRUE,
#'   background_buffer = 200*1000}
#'
#'   \code{"grid_4"} and \code{"grid_9"}: 4-fold and 9-fold grid partitioning of
#'   occurrences with pairwise distance sampled and buffer filtered random
#'   background points, equivalent to calling
#'   \code{\link{kfold_occurrence_background}} with \code{occurrence_fold_type =
#'   "grid", k = 4, pwd_sample = TRUE, background_buffer = 200*1000}
#'
#'   \code{"random"}: 5-fold random partitioning of occurrences and random
#'   background points, equivalent to calling
#'   \code{\link{kfold_occurrence_background}} with \code{occurrence_fold_type =
#'   "random", k = 5, pwd_sample = FALSE, background_buffer = 0}
#'
#'   \code{"targetgroup"}: same way of partitioning as the \code{"random"} folds
#'   but instead of random background points, a random subset of all occurrences
#'   points was used creating a targetgroup background points set which has the
#'   same sampling bias as the entire dataset.
#'
#' @return A 5 element list with fold data filled in for all \code{k}. Fold data
#'   consists of a list with 4 elements: \code{occurrence_training},
#'   \code{occurrence_test}, \code{background_training} and
#'   \code{background_test}.
#'
#' @examples \dontrun{
#' aba_folds <- get_fold_data("Abalistes stellatus", "random", k = 1:5)
#' k1 <- aba_folds[[1]]
#' k1$occurrence_training
#' k1$occurrence_test
#' k1$background_training
#' k1$background_test
#' }
#' @export
#' @seealso \code{\link{list_species}} \code{\link{lapply_kfold_species}}
#'   \code{\link{lapply_species}} \code{\link{kfold_data}}
get_fold_data <- function(species, fold_type, k = NULL) {
  if(NROW(species) > 1) {
    ## otherwise might get into memory problems
    stop("get_fold_data expects only 1 species")
  }
  max_k <- 5
  max_k <- ifelse(fold_type == "grid_4", 4, max_k)
  max_k <- ifelse(fold_type == "grid_9", 9, max_k)
  if(is.null(k)) {
    k <- 1:max_k
  }
  if(min(k) < 1 || max(k) > max_k) {
    stop(paste0("k should be between 1 and ", max_k))
  }
  species <- get_species_names(species)
  occurrences <- get_occurrences(species)
  folds <- get_folds(fold_type)
  if(fold_type == "targetgroup") {
    bg <- get_background("targetgroup")
  } else {
    bg <- get_background("random")
  }
  klist <- lapply(1:max_k, function(t) NULL)
  for (fold in k) {
    occ_train <- kfold_data(species, occurrences, folds$species, fold, training = TRUE)
    occ_test <- kfold_data(species, occurrences, folds$species, fold, training = FALSE)
    bg_train <- kfold_data(species, bg, folds$background, fold, training = TRUE)
    bg_test <- kfold_data(species, bg, folds$background, fold, training = FALSE)
    bg_train$species <- rep("background", nrow(bg_train))
    bg_test$species <- rep("background", nrow(bg_test))
    klist[[fold]] <- list(occurrence_training=occ_train, occurrence_test=occ_test,
                        background_training=bg_train, background_test=bg_test)
  }
  # }
  klist
}

#' Get MarineSPEED version
#'
#' \code{get_version} returns the currently used MarineSPEED version, this can
#' be changed by setting \code{options(marinespeed_version="<version
#' information>")}.
#'
#' @usage get_version()
#'
#' @return Character with the current version ("V1") or another version if the
#'   \code{marinespeed_version} has been set.
#'
#' @examples
#' print(get_version())
#'
#' @export
get_version <- function() {
  v <- getOption("marinespeed_version")
  if(is.null(v)) {
    v <- "v1"
  }
  v
}

get_datadir <- function() {
  datadir <- getOption("marinespeed_datadir")
  if(is.null(datadir)) {
    datadir <- file.path(tempdir(), "marinespeed")
    print("file.path(tempdir(), \"marinespeed\") will be used as datadir, set options(marinespeed_datadir=\"<directory>\") to avoid re-downloading the data in every session")
  }
  datadir <- file.path(datadir, get_version())
  if(!dir.exists(datadir)) {
    dir.create(datadir, recursive = TRUE)
  }
  datadir
}

get_file <- function(filename) {
  datadir <- get_datadir()

  outfile <- file.path(datadir, filename)
  outfile_nozip <- file.path(datadir,sub("[.]zip$", "", filename))
  if(!file.exists(outfile) && !dir.exists(outfile_nozip)) {
    root <- paste0("http://marinespeed.samuelbosch.com/", get_version(), "/")
    ok <- -1
    tryCatch({
      suppressWarnings({ ok <- utils::download.file(paste0(root, filename), outfile, mode="wb") })
    }, finally = {
      if(ok != 0) {
        print(paste("failed to download", filename))
      }
      if(ok != 0 && file.exists(outfile)) {
        file.remove(outfile)
      }
    })
    if(grepl("[.]zip$", filename)) {
      unzip(outfile, exdir = outfile_nozip)
      file.remove(outfile)
    }
  }
  normalizePath(outfile_nozip)
}

get_species_names <- function(species) {
  if(NCOL(species) >= 2 && all(c("species", "aphia_id") %in% colnames(species))) {
    species <- species[,"species"]
  }
  if(!is.character(species) && !is.factor(species)){
    stop("invalid species input")
  }
  as.character(species)
}

csv2rds <- function(file, extension = ".rds") {
  rds_file <- sub("[.]csv[.]gz$", extension, file)
  data <- NULL
  if(file.exists(rds_file)) { ## cache as rds (faster)
    data <- readRDS(rds_file)
  }
  if(extension == ".rds" && (is.null(data) || getOption("stringsAsFactors") != is.factor(data[1,1]))) {
    data <- read.csv(file)
    saveRDS(data, rds_file)
  } else if (is.null(data) && grepl("cv_folds_bit[.]rds$", rds_file)) {
    folds <- read.csv(file)
    data <- structure(list(), class = "marinespeed_folds")

    species <- folds[,1]
    for(sp in unique(species)) {
      w <- which(species == sp)
      data[["species"]][[sp]] <- c(min(w), max(w))
    }
    for(ki in 2:ncol(folds)) {
      if(any(is.na(folds[,ki]))) {
        data[[paste0(colnames(folds)[ki], "_NOTNA")]] <- !bit::as.bit(is.na(folds[,ki]))
      }
      data[[colnames(folds)[ki]]] <- bit::as.bit(folds[,ki])
    }
    saveRDS(data, rds_file)
  } else if(is.null(data)) {
    stop("unsupported")
  }
  data
}

#' Get folds
#'
#' \code{get_folds} returns the different pre-generated folds information. To
#' get the fold data for a species see also \code{\link{get_fold_data}}.
#'
#' @usage get_folds(type = "disc")
#'
#' @param type character. The type of partitioning you want to load.
#'
#' @details The different supported \code{type} are:
#'
#'   \code{"disc"}: 5-fold disc partitioning of occurrences with pairwise
#'   distance sampled and buffer filtered random background points, equivalent
#'   to calling \code{\link{kfold_occurrence_background}} with
#'   \code{occurrence_fold_type = "disc", k = 5, pwd_sample = TRUE,
#'   background_buffer = 200*1000}
#'
#'   \code{"grid_4"} and \code{"grid_9"}: 4-fold and 9-fold grid partitioning of
#'   occurrences with pairwise distance sampled and buffer filtered random
#'   background points, equivalent to calling
#'   \code{\link{kfold_occurrence_background}} with \code{occurrence_fold_type =
#'   "grid", k = 4, pwd_sample = TRUE, background_buffer = 200*1000}
#'
#'   \code{"random"}: 5-fold random partitioning of occurrences and random
#'   background points, equivalent to calling
#'   \code{\link{kfold_occurrence_background}} with \code{occurrence_fold_type =
#'   "random", k = 5, pwd_sample = FALSE, background_buffer = 0}
#'
#'   \code{"targetgroup"}: same way of partitioning as the \code{"random"} folds
#'   but instead of random background points, a random subset of all occurrences
#'   points was used creating a targetgroup background points set which has the
#'   same sampling bias as the entire dataset.
#'
#' @return A list with two entries \code{"background"} and \code{"species"},
#'   each entry is a dataframe with species name column and 5 fold columns as
#'   created by \code{\link{kfold_occurrence_background}}
#'
#' @examples \dontrun{
#' folds <- get_folds("random")
#'
#' abalistes <- "Abalistes stellatus"
#' occ <- get_occurrences(abalistes)
#' bg <- get_background("random")
#'
#' occ_train <- kfold_data(abalistes, occ, folds$species, k=1, training=TRUE)
#' occ_test <- kfold_data(abalistes, occ, folds$species, k=1, training=FALSE)
#' bg_train <- kfold_data(abalistes, bg, folds$background, k=1, training=TRUE)
#' bg_test <- kfold_data(abalistes, bg, folds$background, k=1, training=FALSE)
#' }
#' @export
#' @seealso \code{\link{lapply_kfold_species}} \code{\link{get_fold_data}}
#'   \code{\link{get_occurrences}} \code{\link{get_background}}
#'   \code{\link{kfold_data}}
get_folds <- function(type = "disc") {
  type <- as.character(type)
  if(type == "disc") {
    bg <- "pseudodisc_background_5cv_folds"
    species <- "pseudodisc_species_5cv_folds"
  } else if(type == "grid_4") {
    bg <- "grid_background_4cv_folds"
    species <- "grid_species_4cv_folds"
  } else if(type == "grid_9") {
    bg <- "grid_background_9cv_folds"
    species <- "grid_species_9cv_folds"
  } else if(type == "random") {
    bg <- "random_background_5cv_folds"
    species <- "random_species_5cv_folds"
  } else if(type == "targetgroup") {
    bg <- "targetgroup_background_5cv_folds"
    species <- "random_species_5cv_folds"
  } else {
    stop("fold_type not supported")
  }
  extension <- getOption("marinespeed_folds_extension")
  extension <- ifelse(is.null(extension), "_bit.rds", extension)
  bg <- get_file(paste0(bg, extension))
  species <- get_file(paste0(species, extension))
  if(extension == ".csv.gz") {
    bg_folds <- csv2rds(bg)
    species_folds <- csv2rds(species)
  } else {
    bg_folds <- readRDS(bg)
    species_folds <- readRDS(species)
  }

  list(background = bg_folds, species = species_folds)
}

#' Get background data
#'
#' \code{get_background} returns pre-generated background data.
#'
#' @usage get_background(type)
#'
#' @param type character. Either \code{"random"} or \code{"targetgroup"}.
#'
#' @details The targetgroup background was created by subsampling an average of
#'   37 occurrence records (20000 in total) from each species in the dataset
#'   providing in essence the same sampling bias as the entire dataset.
#'
#' @return A dataframe with all background points.
#'
#' @examples \dontrun{
#' random_bg <- get_background("random")
#' plot(random_bg[,2:3], pch=".", main="random background")
#'
#' targetgroup_bg <- get_background("targetgroup")
#' plot(targetgroup_bg[,2:3], pch=".", main="targetgroup background")
#' }
#' @export
#' @seealso \code{\link{get_fold_data}} \code{\link{get_occurrences}}
#'   \code{\link{get_folds}}
get_background <- function(type) {
  if(!(as.character(type) %in% c("random", "targetgroup"))) {
    stop("background type not recognized")
  }
  read.csv(get_file(paste0(as.character(type), "_background.csv.gz")))
}
samuelbosch/marinespeed documentation built on Dec. 17, 2019, 11:47 p.m.