inst/doc/Cleaning_PBDB_fossils_with_CoordinateCleaner.R

## ----options, echo = FALSE----------------------------------------------------
knitr::opts_chunk$set(eval = FALSE)

## -----------------------------------------------------------------------------
#  install.packages("devtools")
#  library(devtools)
#  
#  install_github("ropensci/CoordinateCleaner")

## -----------------------------------------------------------------------------
#  library(dplyr)
#  library(ggplot2)
#  library(CoordinateCleaner)
#  library(countrycode)
#  library(paleobioDB)

## -----------------------------------------------------------------------------
#  #load data
#  dat <- paleobioDB::pbdb_occurrences(base_name = "Magnoliopsida",
#                                      vocab = "pbdb", limit = 5000,
#                          show = c("coords", "phylo", "attr", "loc", "time", "rem"))
#  dat <- dat %>% mutate(lng = as.numeric(lng),
#                 lat = as.numeric(lat),
#                 early_age = as.numeric(early_age),
#                 late_age = as.numeric(late_age))
#  rownames(dat) <- NULL

## -----------------------------------------------------------------------------
#  #plot data to get an overview
#  wm <- borders("world", colour = "gray50", fill = "gray50")
#  dat %>%
#    ggplot(aes(x = lng, y = lat)) +
#    coord_fixed() +
#    wm +
#    geom_point(
#               colour = "darkred",
#               size = 0.5) +
#    theme_bw()

## -----------------------------------------------------------------------------
#  cl <- cc_val(dat, lat = "lat", lon = "lng")

## -----------------------------------------------------------------------------
#  cl <- cc_equ(cl, lat = "lat", lon = "lng")

## -----------------------------------------------------------------------------
#  fl <- cc_equ(dat, value = "flagged", lat = "lat", lon = "lng")
#  ## Testing equal lat/lon
#  ## Flagged 0 records.
#  
#  # extract and check the flagged records
#  fl_rec <- dat[!fl,]
#  head(fl_rec)
#  ##  [1] occurrence_no  record_type    collection_no  taxon_name     taxon_rank     taxon_no       matched_name   matched_rank
#  ##  [9] matched_no     early_interval late_interval  early_age      late_age       reference_no   lng            lat
#  ## [17] class          class_no       phylum         phylum_no      cc             state          geogscale      early_age.1
#  ## [25] late_age.1     cx_int_no      early_int_no   late_int_no    genus          genus_no       family         family_no
#  ## [33] order          order_no       county         reid_no
#  ## <0 rows> (or 0-length row.names)

## -----------------------------------------------------------------------------
#  fl <- cc_cen(cl, lat = "lat", lon = "lng", value = "flagged")
#  ## Testing country centroids
#  ## Flagged 6 records.
#  fl_rec <- cl[!fl, ]
#  unique(fl_rec$cc)
#  ## [1] JP
#  ## Levels: NZ US CN IR KP AU UK FR JP DE CA RU KE ZM EG CD ZA TZ UG ET MX IT
#  cl <- cl[fl, ]

## -----------------------------------------------------------------------------
#  #adapt country code to ISO3, for country test
#  cs_ma <- "GBR"
#  names(cs_ma) <- "UK"
#  cl$cc_iso3 <- countrycode(cl$cc, origin = "iso2c", destination = "iso3c", custom_match = cs_ma)
#  
#  cl <- cc_coun(cl, lat = "lat", lon = "lng", iso3 = "cc_iso3")
#  ## Testing country identity
#  ## Removed 234 records.

## -----------------------------------------------------------------------------
#  cl <- cc_inst(cl, lat = "lat", lon = "lng")
#  ## Testing biodiversity institutions
#  ## Removed 0 records.
#  cl <- cc_gbif(cl, lat = "lat", lon = "lng")
#  ## Testing GBIF headquarters, flagging records around Copenhagen
#  ## Removed 0 records.

## -----------------------------------------------------------------------------
#  cl <- cc_zero(cl, lat = "lat", lon = "lng")
#  ## Testing zero coordinates
#  ## Removed 0 records.

## -----------------------------------------------------------------------------
#  cl <- cl[!is.na(cl$late_age),]
#  cl <- cl[!is.na(cl$early_age),]
#  cl <- cf_equal(cl, min_age = "late_age", max_age = "early_age")
#  ## Testing age validity
#  ## Removed 0 records.

## -----------------------------------------------------------------------------
#  rang <- cl$early_age - cl$late_age
#  hist(rang, breaks = 40, xlab = "Date range [max age - min age]", main = "")

## -----------------------------------------------------------------------------
#  # Outlier dataset
#  cl <- cf_range(cl, taxon = "", min_age = "late_age", max_age = "early_age")
#  ## Testing temporal range outliers on dataset level
#  ## Removed 57 records.
#  
#  # Outlier per taxon
#  cl <- cf_range(cl, taxon = "taxon_name", min_age = "late_age", max_age = "early_age")
#  ## Testing temporal range outliers on taxon level
#  ## Removed 86 records.
#  
#  # Absolute age limit
#  cl <- cf_range(cl, taxon = "taxon_name", min_age = "late_age",
#                 max_age = "early_age", method = "time", max_range = 35)
#  ## Testing temporal range outliers on taxon level
#  ## Removed 1 records.
#  
#  rang <- cl$early_age - cl$late_age
#  hist(rang, breaks = 40, xlab = "Date range [max age - min age]", main = "")

## -----------------------------------------------------------------------------
#  # Outlier dataset
#  cl <- cf_outl(cl, taxon = "", lat = "lat", lon = "lng",
#                min_age = "late_age", max_age = "early_age")
#  ## Testing spatio-temporal outliers on dataset level
#  ## Removed 256 records.
#  
#  # Outlier taxon
#  cl <- cf_outl(cl, taxon = "taxon_name", lat = "lat", lon = "lng",
#                min_age = "late_age", max_age = "early_age")
#  ## Testing spatio-temporal outliers on taxon level
#  ## Removed 30 records.

## -----------------------------------------------------------------------------
#  nrow(dat) - nrow(cl)

## -----------------------------------------------------------------------------
#  #adapt country code to ISO3, for country test
#  cs_ma <- "GBR"
#  names(cs_ma) <- "UK"
#  dat$cc <- countrycode(dat$cc, origin = "iso2c", destination = "iso3c", custom_match = cs_ma)
#  
#  cl <- dat %>%
#    cc_val(lat = "lat", lon = "lng") %>%
#    cc_equ(lat = "lat", lon = "lng") %>%
#    cc_cen(lat = "lat", lon = "lng") %>%
#    cc_coun(lat = "lat", lon = "lng", iso3 = "cc") %>%
#    cc_gbif(lat = "lat", lon = "lng") %>%
#    cc_inst(lat = "lat", lon = "lng") %>%
#    cc_zero(lat = "lat", lon = "lng") %>%
#    cf_equal(min_age = "late_age", max_age = "early_age") %>%
#    cf_range(
#      taxon = "taxon_name",
#      lat = "lat",
#      lon = "lng",
#      min_age = "late_age",
#      max_age = "early_age"
#    ) %>%
#    cf_outl(
#      taxon = "taxon_name",
#      lat = "lat",
#      lon = "lng",
#      min_age = "late_age",
#      max_age = "early_age"
#    )

## -----------------------------------------------------------------------------
#  #run automated testing
#  flags <- clean_fossils(x = dat,
#                         lat = "lat",
#                         lon = "lng",
#                         taxon = "taxon_name",
#                         min_age = "late_age", max_age = "early_age",
#                         value = "spatialvalid")
#  
#  head(flags)
#  cl <- dat[flags$.summary,] #the cleaned records
#  fl_rec <- dat[!flags$.summary,] # the flagged records for verification

## -----------------------------------------------------------------------------
#  #1. This looks OK
#  table(cl$phylum)
#  ##
#  ## Spermatophyta
#  ##          2220
#  
#  #2. Taxonomic level of identification
#  table(cl$taxon_rank)
#  ##
#  ##  class   genus species
#  ##    371     502    1347
#  

## -----------------------------------------------------------------------------
#  cl <- cl %>%
#    filter(taxon_rank %in% c("species", "genus"))

## -----------------------------------------------------------------------------
#  table(cl$geogscale)

## -----------------------------------------------------------------------------
#  #minimum ages
#  tail(table(cl$late_age))
#  ##
#  ##  63.3    66  70.6  93.5  93.9 100.5
#  ##    53   138     8     3     3    21
#  
#  ggplot(cl)+
#    geom_histogram(aes(x = late_age))
#  ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## -----------------------------------------------------------------------------
#  
#  #maximum ages
#  tail(table(cl$early_age))
#  ##
#  ##  70.6  83.5  99.6 100.5 105.3   113
#  ##   126     9     3    11     3    21
#  
#  ggplot(cl) +
#    geom_histogram(aes(x = early_age))
#  ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## -----------------------------------------------------------------------------
#  # replace  blanks in taxon names
#  cl$taxon_name <- gsub("[[:blank:]]{1,}","_", cl$taxon_name)
#  
#  #simulated current status, soley for demonstration purposes, replace with your own data
#  mock_status <- data.frame(taxon_name = unique(cl$taxon_name),
#                            status = sample(c("extinct", "extant"),
#                                            size = length(unique(cl$taxon_name)),
#                                            replace = TRUE))
#  
#  #add current status to fossils
#  cl2 <- inner_join(cl, mock_status, by = "taxon_name")
#  
#  #Write PyRate input to disk
#  write_pyrate(cl, fname = "paleobioDB_angiosperms", status = cl2$status,
#              taxon = "taxon_name", min_age = "late_age", max_age = "early_age")

Try the CoordinateCleaner package in your browser

Any scripts or data that you put into this service are public.

CoordinateCleaner documentation built on Oct. 25, 2023, 1:08 a.m.