#' Load Presence-Background Data
#'
#' Downloads occurrence records for a species from the Atlas of Living
#' Australia (via \code{ALA4R}) and from GBIF (via \code{spocc}), removes
#' records flagged as having generalised or withheld coordinates, merges the
#' two sources into a common column layout, and performs basic coordinate
#' cleaning (deduplication, missing/out-of-range removal).
#'
#' @param species Scientific name of the species. Must be a properly formed
#'   name (no "sp. " placeholder, no quote characters); a bracketed suffix
#'   (assumed to denote a subpopulation) is stripped automatically.
#' @param region Currently unused; reserved for future spatial subsetting.
#'   Defaults to \code{"all"}.
#' @param save.map Logical; whether to save an HTML map of the records.
#'   Currently unused because the mapping step is disabled.
#' @param map.directory Directory in which the map file would be saved.
#'   Currently unused because the mapping step is disabled.
#' @param email Email address registered with ALA; required by the ALA4R
#'   'offline' download method.
#'
#' @return A list with elements \code{raw.ala.data} (filtered ALA records),
#'   \code{raw.gbif.data} (filtered GBIF records), \code{processed.data}
#'   (merged and cleaned data frame), and \code{rounding.comment}
#'   (a note if duplicated longitudes/latitudes suggest rounding, else NA).
#' @export
#'
#' @examples
# test <- occ(query = "Bertmainius colonus", from = c("gbif","ala","inat","ecoengine","vertnet","idigbio"))
# testALA <- occurrences(taxon = "text:\"Bertmainius colonus\"", download_reason_id = 5, method="offline", email = "tianxiaoh@student.unimelb.edu.au")
load_pres_bg_data <- function(species,
                              region = "all", #currently unused - reserved for spatial subsetting
                              save.map = TRUE, #currently unused - mapping step is disabled below
                              map.directory = ".", #currently unused - mapping step is disabled below
                              email #needed for ALA4R 'offline' download
                              ){
  #######################
  ### Name Processing ###
  #######################
  ## Reject malformed scientific names: "sp. " placeholders and embedded
  ## quotes would break the 'text:"..."' query built for ALA below.
  if(any(grepl("sp. ", species, fixed = TRUE),
         grepl("'", species, fixed = TRUE),
         grepl("\"", species, fixed = TRUE)
         #add any other exceptions here
         )){
    stop("Not run: scientific name not properly formed!")
  }
  ## Strip a bracketed suffix - assumed to denote a subpopulation,
  ## not a taxonomic notation - then squish leftover whitespace.
  species <- stringr::str_remove(species, "\\(.*\\)")
  species <- stringr::str_squish(species)
  ####################
  ### Data Getting ###
  ####################
  ## ALA records via ALA4R (email is needed for the 'offline' download path)
  occ_ala <- ALA4R::occurrences(taxon = sprintf('text:"%s"', species),
                                download_reason_id = 5,
                                email = email)
  ala_data <- occ_ala$data
  if(nrow(ala_data) > 0){
    ala_data$eventDate <- lubridate::as_date(ala_data$eventDate)
  }
  ## GBIF records via spocc
  ## (guild-specific databases - inat, ebird, vertnet, etc. - ignored for now)
  occ_spocc <- spocc::occ(query = species,
                          from = "gbif",
                          limit = 100000)
  gbif_data <- occ_spocc$gbif$data[[1]]
  if(nrow(gbif_data) > 0){
    gbif_data$eventDate <- lubridate::as_date(gbif_data$eventDate)
  }
  ## If neither search returned data, terminate function
  if(nrow(ala_data) == 0 && nrow(gbif_data) == 0){
    stop("Not run: no records found")
  }
  ## Remove generalised (deliberately fuzzed) coordinates, as far as possible.
  ## ALA flags these in dataGeneralisedDuringProcessing; GBIF sometimes in
  ## dataGeneralizations and/or informationWithheld. Neither flag is
  ## guaranteed to be present or complete (the GBIF column is also used for
  ## taxonomic comments), so this is best-effort. Matching is case-insensitive
  ## so both "Generalised" and "generalised" spellings are caught.
  if("dataGeneralisedDuringProcessing" %in% colnames(ala_data)){
    ala_data <- ala_data[!grepl("generalised",
                                ala_data$dataGeneralisedDuringProcessing,
                                ignore.case = TRUE), ]
  }
  if("dataGeneralizations" %in% colnames(gbif_data)){
    gbif_data <- gbif_data[!grepl("generalised",
                                  gbif_data$dataGeneralizations,
                                  ignore.case = TRUE), ]
  }
  if("informationWithheld" %in% colnames(gbif_data)){
    gbif_data <- gbif_data[is.na(gbif_data$informationWithheld), ]
  }
  ## Merging ALA and GBIF
  ### Pad any missing source columns with NA so the merge below cannot fail
  for(ALA_col in c("eventDate",
                   "basisOfRecord",
                   "locality",
                   "institution",
                   "collection",
                   "coordinateUncertaintyInMetres")){
    if(!(ALA_col %in% colnames(ala_data))){
      ala_data[ , ALA_col] <- rep(NA, nrow(ala_data))
    }
  }
  for(GBIF_col in c("eventDate",
                    "basisOfRecord",
                    "locality",
                    "institutionCode",
                    "collectionCode",
                    "coordinateUncertaintyInMeters")){
    if(!(GBIF_col %in% colnames(gbif_data))){
      gbif_data[ , GBIF_col] <- rep(NA, nrow(gbif_data))
    }
  }
  ### Recast each non-empty source into a shared layout, then stack them.
  ### (We assume the search returned only the correct species - revisit later.)
  ala_std <- NULL
  if(nrow(ala_data) > 0){
    ala_std <- data.frame("Origin" = "ALA",
                          "Species" = species,
                          #ALA originals may be GDA94 long/lat, but are processed to WGS84
                          "Longitude" = ala_data$longitudeOriginal,
                          "Latitude" = ala_data$latitudeOriginal,
                          #date kept for duplicate processing
                          "Date" = ala_data$eventDate,
                          "Basis.of.Record" = ala_data$basisOfRecord,
                          "Locality" = ala_data$locality,
                          "Institute" = ala_data$institution,
                          "Collection" = ala_data$collection,
                          "Coordinate.Uncertainty.in.Metres" = ala_data$coordinateUncertaintyInMetres,
                          stringsAsFactors = FALSE)
  }
  gbif_std <- NULL
  if(nrow(gbif_data) > 0){
    gbif_std <- data.frame("Origin" = "GBIF",
                           "Species" = species,
                           "Longitude" = gbif_data$longitude,
                           "Latitude" = gbif_data$latitude,
                           "Date" = gbif_data$eventDate,
                           "Basis.of.Record" = gbif_data$basisOfRecord,
                           "Locality" = gbif_data$locality,
                           "Institute" = gbif_data$institutionCode,
                           "Collection" = gbif_data$collectionCode,
                           "Coordinate.Uncertainty.in.Metres" = gbif_data$coordinateUncertaintyInMeters,
                           stringsAsFactors = FALSE)
  }
  ## rbind(NULL, x) is x, so this works whichever source is empty
  merged_df <- rbind(ala_std, gbif_std)
  merged_df <- cbind("ID" = seq_len(nrow(merged_df)), merged_df)
  #####################
  ### Data Cleaning ###
  #####################
  merged_df$Longitude <- as.numeric(merged_df$Longitude)
  merged_df$Latitude <- as.numeric(merged_df$Latitude)
  ## Drop records missing either coordinate.
  ## BUGFIX: this previously used `|` (keeping rows with one missing
  ## coordinate), which then produced NA rows in the range filter below.
  merged_df <- merged_df[!is.na(merged_df$Longitude) &
                           !is.na(merged_df$Latitude), ]
  ## Remove spatial duplicates (other duplicate types may matter, think later)
  merged_df <- merged_df[!duplicated(merged_df[ , c("Longitude", "Latitude")]), ]
  ## Get rid of unusable long lat vals
  ### (Roozbeh says can save some data here will look into it later)
  merged_df <- merged_df[merged_df$Longitude > -180 &
                           merged_df$Longitude < 180 &
                           merged_df$Latitude > -90 &
                           merged_df$Latitude < 90, ]
  ## Check if any record left
  if(nrow(merged_df) == 0){
    stop("Not run: no data with legitimate coordinates found")
  }
  ## (CoordinateCleaner::clean_coordinates step - capitals/centroids/equal/
  ## gbif/institutions/seas/zeros tests - is disabled for now: the urban test
  ## kept raising proj4string errors and the outlier settings need tweaking.)
  ## Duplicated longitudes or latitudes can signal coordinate rounding
  suspect.rounding <- if(anyDuplicated(merged_df$Longitude) > 0 ||
                           anyDuplicated(merged_df$Latitude) > 0){
    "duplicate long/lat found - suspect rounding"
  } else {
    NA
  }
  ## (Interactive mapview plotting/saving of records is disabled for now; when
  ## re-enabled it should honour save.map and map.directory, and only map
  ## datasets with <= 1000 records.)
  return(list(raw.ala.data = ala_data,
              raw.gbif.data = gbif_data,
              processed.data = merged_df,
              rounding.comment = suspect.rounding))
}
# #test run
# test_run <- load_pres_bg_data("Atrichornis rufescens", email = "tianxiaoh@student.unimelb.edu.au", guild = "Birds")
# load_pres_bg_data("Atrichornis rufescens",
# email = "davidpw@student.unimelb.edu.au")
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.