R/distributionData.R

Defines functions parse_obis_date .getOccNumber_GBIF .getOccurrence_GBIF .getOccurrence_OBIS .getOccurrence points.occ_df plot.occ_df rbind.occ_df print.occ_df getDistributionRecords

Documented in getDistributionRecords

#' Get distribution records from on-line databases
#'
#' @param sp The scientific name of the species
#' @param limit The maximum number of records, NULL gets all records.
#' @param source The data source (currently only "gbif" and "obis" are supported.)
#' @param year Range of years to be included in the search.
#' @param verbose boolean, do you want to know what's happening?
#' @param ... May be used later
#'
#' @return A tibble with species ocurrences.
#' @export
#'
#' @examples
#' getDistributionRecords(sp="Manta birostris", limit=300)
getDistributionRecords = function(sp, limit=NULL, source="gbif", year=NULL, 
                                  verbose=FALSE, ...) {
  
  out = list()
  for(isource in source) {
    message(sprintf("\nDownloading occurrence data from %s.\n", isource))
    out[[isource]] = .getOccurrence(sp=sp, source=isource, limit=limit, verbose=verbose, 
                 year=year)
  }
  out = do.call(rbind, out)
  check = setdiff(names(out), "provider") # check for duplicated up to provider.
  ind = duplicated(as.data.frame(out[, check]))
  ndup = sum(ind, na.rm=TRUE)
  if(ndup>0) message(sprintf("\nDetecting up to %d duplicated records, check the dataset before use.", ndup))
  return(out)
  
}


# S3 methods --------------------------------------------------------------

#' @export
print.occ_df = function(x, n=NULL, ...) {
  
  attList = attributes(x)
  spHead = sprintf("\nDistribution records of %s\n\n", unique(attList$sciName))
  
  cat(spHead)
  
  tibble:::print.tbl_df(x, n=n, ...)
  
  cat("\nCite this dataset as:\n")
  cat(paste(attList$reference, collapse=""))
  
  return(invisible(NULL))
  
}

#' @export
rbind.occ_df = function(..., deparse.level=1) {
  
  out = rbind.data.frame(..., deparse.level=deparse.level, 
                         stringsAsFactors=FALSE)
  
  attr(out, "sp")        = sapply(list(...), attr, which="sp")
  attr(out, "author")    = sapply(list(...), attr, which="author")
  attr(out, "sciName")   = sapply(list(...), attr, which="sciName")
  attr(out, "accessed")  = sapply(list(...), attr, which="accessed")
  attr(out, "source")    = sapply(list(...), attr, which="source")
  attr(out, "org")       = sapply(list(...), attr, which="org")
  attr(out, "web")       = sapply(list(...), attr, which="web")
  attr(out, "reference") = sapply(list(...), attr, which="reference")
  
  return(out)
  
}

#' @export
'[.occ_df' = function(x, i, j, drop = FALSE) {
  
  attList = attributes(x)
  # out = NextMethod("[")
  class(x) =  tail(class(x), -1)
  out = tibble:::'[.tbl_df'(x, i, j, drop)
  attList$row.names = attr(out, "row.names")
  attList$names = attr(out, "names")
  attributes(out) = attList
  return(out)
}

#' @exportS3Method plot occ_df
plot.occ_df = function(x, y=NULL, col="blue", xlim=NULL, ylim=NULL, domain="world", center=0, 
                       hires=FALSE, land.col="darkolivegreen4", sea.col="aliceblue", 
                       boundaries.col = "black", grid.col="white", grid=TRUE,
                       cex=0.25, pch=19, main=NULL, add=FALSE, axes=TRUE, 
                       border=!axes, asp=NA, axs="i", xaxs=axs, yaxs=axs, cex.axis=0.75, 
                       interior=FALSE, fill=TRUE, countries=FALSE, nx=NULL, ny=nx, ...) {
  
  plot.map(x=x, y=y, xlim=xlim, ylim=ylim, domain=domain, center=center, 
           hires=hires, land.col=land.col, sea.col=sea.col, 
           boundaries.col = boundaries.col, grid.col=grid.col, grid=grid,
           cex=cex, pch=pch, main=main, add=add, axes=axes, 
           border=border, asp=asp, axs=axs, xaxs=axs, yaxs=axs, cex.axis=cex.axis, 
           interior=interior, fill=fill, countries=countries, nx=nx, ny=ny, col=col, ...)
  
  return(invisible(NULL))
  
}

#' @export
points.occ_df = function(x, col="red", pch=19, cex=0.25, ...) {
  
  points(x=x$lon, y=x$lat, col=col, pch=pch, cex=cex, ...)
  
  return(invisible(NULL))
  
}

# head.occ_df = function (x, n = 6L, ...) {
#     # class(x) =  tail(class(x), -1)
#     stopifnot(length(n) == 1L)
#     n <- if (n < 0L) 
#       max(nrow(x) + n, 0L)
#     else min(n, nrow(x))
#     out = x[seq_len(n), , drop = FALSE]
#     # class(out) = c("occ_df", class(x))
#     return(out)
# }
# 
# tail.occ_df = function (x, n = 6L, ...) {
#   class(x) =  tail(class(x), -1)
#   stopifnot(length(n) == 1L)
#   nrx <- nrow(x)
#   n <- if (n < 0L) 
#     max(nrx + n, 0L)
#   else min(n, nrx)
#   x[seq.int(to = nrx, length.out = n), , drop = FALSE]
# }

# Internal ----------------------------------------------------------------

.getOccurrence = function(source, sp, limit=NULL, year=NULL, verbose=FALSE) {
  
  out = switch(source, 
               gbif = .getOccurrence_GBIF(sp=sp, limit=limit, year=year, 
                                          verbose=verbose),
               obis = .getOccurrence_OBIS(sp=sp, limit=limit, year=year, 
                                          verbose=verbose)
  )
  return(out)
  
}

.getOccurrence_OBIS = function(sp, limit=NULL, year=NULL, verbose=FALSE) {
  
  if(!requireNamespace("robis", quietly = TRUE)) 
    stop("You need to install the 'robis' package (from github).")
  
  # fixing issue with 'curl' 
  op = get("has_internet_via_proxy", environment(curl::has_internet))
  # check for internet
  np = !is.null(curl::nslookup("r-project.org", error = FALSE))
  assign("has_internet_via_proxy", np, environment(curl::has_internet))
  # restore old value, to use within functions.
  on.exit(assign("has_internet_via_proxy", op, environment(curl::has_internet)))
  
  
  if(length(sp)!=1) stop("You must provide only one species name.")
  sp = check_taxon(sp)
  vars = c("decimalLongitude", "decimalLatitude", "year", "month", "day", 
           "depth", "lifeStage", "basisOfRecord")
  
  startdate = NULL
  enddate   = NULL
  
  if(!is.null(year)) {
    if(!is.numeric(year)) stop("Argument 'year' must be numeric")
    year = na.omit(year)
    year = range(c(year, floor(min(year)), ceiling(max(year))), na.rm=TRUE)
    startdate = as.Date(sprintf("%d-01-01", year[1]))
    enddate   = as.Date(sprintf("%d-12-31", year[2]))
  }
  
  
  dat = robis::occurrence(sp, verbose=verbose, limit=limit,
                          startdate=startdate, enddate=enddate)
  if(nrow(dat)<1) {
  msg = sprintf("\nRetrieved %d records of %d (%0.2f%%)\n", 0, 0, 100)
  cat(msg)
    return(NULL)
  }
  
  eventDate = parse_obis_date(dat$eventDate)
  dat$year  = eventDate$year
  dat$month = eventDate$month
  dat$day   = eventDate$day
  
  missing_vars = vars[which(!(vars %in% names(dat)))]
  if(length(missing_vars) > 0) dat[, missing_vars] = NA
  
  out = dat[, vars]
  names(out)[1:2] = c("lon", "lat")
  n = nrow(out)
  out = out[complete.cases(out[, c("year", "month", "day")]), ]
  n = n - nrow(out)
  if(n>0) message(sprintf("\nRemoved %d records without date information.", n))
  nc = nchar(out$basisOfRecord)
  out$basisOfRecord[nc<2] = "Unknown"
  out$basisOfRecord = toCamel(out$basisOfRecord, split="_")
  out$basisOfRecord[out$basisOfRecord=="Observation"] = "Occurrence" # why?
  out$lifeStage[out$lifeStage=="Unknown"] = NA
  out = as_tibble(out)
  out = tibble::remove_rownames(out)
  out$basisOfRecord = as.factor(out$basisOfRecord)
  out$provider = "OBIS"
  spName = names(which.max(table(dat$scientificName)))
  spAuthor = names(which.max(table(dat$scientificNameAuthorship)))
  
  attr(out, "sp") = spName
  attr(out, "author") = spAuthor
  attr(out, "sciName") = sprintf("%s %s", spName, spAuthor)
  attr(out, "accessed") = lubridate::today()
  attr(out, "source") = "OBIS"
  attr(out, "org") = "Ocean Biogeographic Information System. Intergovernmental Oceanographic Commission of UNESCO"
  attr(out, "web") = "www.iobis.org"
  attr(out, "all_vars") = names(dat)
  att = attributes(out)
  attr(out, "reference") = sprintf("%s (%d) Distribution records of %s %s [Dataset]. Available: %s. %s. Accessed: %s)\n", 
                                   att$source, year(att$accessed), att$sp, att$author, att$org, att$web, att$accessed)
  class(out) = c("occ_df", class(out))
  
  return(out)
  
}

.getOccurrence_GBIF = function(sp, limit=NULL, year=NULL, verbose=FALSE) {
  
  if(!requireNamespace("rgbif", quietly = TRUE)) 
    stop("You need to install the 'rgbif' package.")
  
  if(length(sp)!=1) stop("You must provide only one species name.")
  sp = check_taxon(sp)
  vars = c("decimalLongitude", "decimalLatitude", "year", "month", "day", 
           "depth", "lifeStage", "basisOfRecord")
  
  oyear = year
  
  if(!is.null(year)) {
    if(!is.numeric(year)) stop("Argument 'year' must be numeric")
    year = na.omit(year)
    year = range(c(year, floor(min(year)), ceiling(max(year))), na.rm=TRUE)
    year = sprintf("%d,%d", year[1], year[2])
  }
  
  tmp = rgbif::occ_search(scientificName = sp, limit=0, year=year, hasCoordinate=TRUE)
  ntot = tmp$meta$count
  if(ntot==0) {
    msg = sprintf("\nRetrieved %d records of %d (%0.2f%%)\n", 0, 0, 100)
    cat(msg)
    return(NULL)
  }

  nrec = if(is.null(limit)) ntot else min(limit, ntot)  
  
  if(nrec >= 1e5) {
    
    ncount = .getOccNumber_GBIF(sp, year=oyear)
    split = attr(ncount, "split")
    split[1] = split[1] - 1 
    nquery = length(split) - 1
    
    output = list()
    for(i in seq_len(nquery)) {
      
      iyear = split[i + c(0, 1)] + c(1, 0)
      msg = sprintf("Retrieving records from %d to %d...", iyear[1], iyear[2])
      message(msg)
      iyear = sprintf("%d,%d", iyear[1], iyear[2])
      tmp = rgbif::occ_search(scientificName = sp, limit=1e5, year=iyear, hasCoordinate=TRUE)$data
      msg = sprintf("\tRetrieved %d records (%0.2f%%).\n", nrow(tmp), 100*nrow(tmp)/ntot)
      message(msg)
     
      tmp = tmp[, c(vars, "scientificName")]
      output[[i]] = tmp
      
    }
    
    dat = do.call(rbind, output)
    
  } else {
    
    dat = rgbif::occ_search(scientificName = sp, limit=nrec)$data
    msg = sprintf("\nRetrieved %d records of %d (%0.2f%%)\n", nrec, ntot, 100*nrec/ntot)
    message(msg)
    
  }

  missing_vars = vars[which(!(vars %in% names(dat)))]
  if(length(missing_vars) > 0) dat[, missing_vars] = NA
  
  out = dat[, vars]
  names(out)[1:2] = c("lon", "lat")
  n = nrow(out)
  out = out[complete.cases(out[, c("year", "month", "day")]), ]
  n = n - nrow(out)
  if(n>0) message(sprintf("Removed %d records without date information.", n))
  out$basisOfRecord = toCamel(out$basisOfRecord, split="_")
  out$basisOfRecord[out$basisOfRecord=="Observation"] = "Occurrence" # why?
  out$lifeStage[out$lifeStage=="Unknown"] = NA # why?
  out = tibble::remove_rownames(out)
  out$basisOfRecord = as.factor(out$basisOfRecord)
  out$provider = "GBIF"
  
  tmp = unlist(strsplit(names(which.max(table(dat$scientificName))), split=" "))
  spName = paste(tmp[1:2], collapse=" ")
  spAuthor = paste(tail(tmp, -2), collapse=" ")
  
  attr(out, "sp") = spName
  attr(out, "author") = spAuthor
  attr(out, "sciName") = sprintf("%s %s", spName, spAuthor)
  attr(out, "accessed") = lubridate::today()
  attr(out, "source") = "GBIF"
  attr(out, "org") = "GBIF. The Global Biodiversity Information Facility"
  attr(out, "web") = "www.gbif.org"
  attr(out, "all_vars") = names(dat)
  att = attributes(out)
  attr(out, "reference") = sprintf("%s (%d) Distribution records of %s %s [Dataset]. Available: %s. %s. Accessed: %s)\n", 
                                   att$source, year(att$accessed), att$sp, att$author, att$org, att$web, att$accessed)
  
  
  class(out) = c("occ_df", class(out))
  
  return(out)
} 


# Count occurrences -------------------------------------------------------

.getOccNumber_GBIF = function(sp, year=NULL, hasCoordinate=TRUE) {
  
  if(!requireNamespace("rgbif", quietly = TRUE)) 
    stop("You need to install the 'rgbif' package.")
  
  if(length(sp)!=1) stop("You must provide only one species name.")
  sp = check_taxon(sp)
  
  if(is.null(year)) year = c(1900, lubridate::year(lubridate::today()))
  if(!is.numeric(year)) stop("Argument 'year' must be numeric")
  year = na.omit(year)
  year = range(c(year, floor(min(year)), ceiling(max(year))), na.rm=TRUE)
  year = seq(from=year[1], to=year[2])
  
  out = rep(NA_integer_, length=length(year))
  
  for(i in seq_along(year)) {
    out[i] = rgbif::occ_search(scientificName = sp, limit=0, year=year[i], hasCoordinate=TRUE)$meta$count
  }
  names(out) = year
  
  tot = sum(out, na.rm=TRUE)
  
  nquery = ceiling(tot/1e5) + 1
  crit = tot/nquery
  ctot = cumsum(out)
  
  split = numeric(nquery+1)
  split[1] = 1
  split[nquery+1] = length(year)
  for(i in seq_len(nquery-1)) split[i+1] = which.max(ctot > (crit*i)) - 1
  split = year[split]
  
  attr(out, "split") = split
  
  return(out)
  
}


# Internal functions ------------------------------------------------------


parse_obis_date = function(x) {
  
  x[x=="0000-00-00"] = NA
  x = substr(x, start=1, stop=10)
  eventDate = suppressWarnings(lubridate::parse_date_time(x, "Ymd"))
  
  isna_day = which(is.na(eventDate) & !is.na(x))
  xmiss = suppressWarnings(lubridate::parse_date_time(substr(x[isna_day], start=1, stop=7), "Ym"))
  eventDate[isna_day] = xmiss
  
  isna_month = which(is.na(eventDate) & !is.na(x))
  xmiss = suppressWarnings(lubridate::parse_date_time(substr(x[isna_month], start=1, stop=4), "Y"))
  eventDate[isna_month] = xmiss
  
  year  = year(eventDate)
  month = month(eventDate)
  day   = day(eventDate)
  day[isna_day] = NA
  month[isna_month] = NA
  
  return(list(eventDate=eventDate, year=year, month=month, day=day))
  
}
roliveros-ramos/kali documentation built on Jan. 14, 2023, 4:30 a.m.