scrubr

# Global chunk options applied to every code chunk in this document.
knitr::opts_chunk$set(
  # Layout: merge source and output into one block; prefix output with "#>".
  collapse = TRUE,
  comment = "#>",
  # Noise: hide warnings and messages emitted by the examples.
  message = FALSE,
  warning = FALSE
)

R-check cran checks codecov.io rstudio mirror downloads cran version

Clean Biological Occurrence Records

Clean data using the following use cases (checkmarks indicate that functions exist, though they are not necessarily complete):

A note about examples: We think that a piping workflow with %>% makes code easier to build up and easier to understand. However, some examples are also shown without the pipe to demonstrate traditional usage.

Install

Stable CRAN version

# Install the released version of scrubr from CRAN.
install.packages("scrubr")

Development version

# Install the development version from GitHub (needs the remotes package).
remotes::install_github("ropensci/scrubr")

# Attach scrubr so its functions can be called without a prefix.
library(scrubr)

Coordinate based cleaning

# Load the sample data set used by the coordinate examples.
# NOTE(review): the snippets that follow reference an object named
# `sample_data_1`; confirm that loading "sampledata1" provides it.
data("sampledata1")

Remove impossible coordinates (using sample data included in the package)

# Equivalent call without the pipe:
#   coord_impossible(dframe(sample_data_1))
dframe(sample_data_1) %>%
  coord_impossible()

Remove incomplete coordinates

# Equivalent call without the pipe:
#   coord_incomplete(dframe(sample_data_1))
dframe(sample_data_1) %>%
  coord_incomplete()

Remove unlikely coordinates (e.g., those at 0,0)

# Equivalent call without the pipe:
#   coord_unlikely(dframe(sample_data_1))
dframe(sample_data_1) %>%
  coord_unlikely()

Do all three

# Chain the three coordinate cleaners; each step receives the output of
# the previous one.
sample_data_1 %>%
  dframe() %>%
  coord_impossible() %>%
  coord_incomplete() %>%
  coord_unlikely()

Don't drop bad data

# Compare the number of rows returned with `drop = TRUE` ...
dframe(sample_data_1) %>%
  coord_incomplete(drop = TRUE) %>%
  NROW()

# ... and with `drop = FALSE`.
dframe(sample_data_1) %>%
  coord_incomplete(drop = FALSE) %>%
  NROW()

Deduplicate

# Work with the first 20 sample records.
smalldf <- sample_data_1[seq_len(20), ]

# Append a copy of record 10 so the data contains a duplicate, then reset
# the row names so the appended row is simply row 21.
smalldf <- rbind(smalldf, smalldf[10, ])
row.names(smalldf) <- NULL

# Change the key of the copy so it is not an exact duplicate.
smalldf[21, "key"] <- 1088954555

# Row count before deduplication.
NROW(smalldf)

dp <- smalldf %>%
  dframe() %>%
  dedup()

# Row count after deduplication.
NROW(dp)

# Inspect the "dups" attribute attached to the result (presumably the
# records identified as duplicates).
attr(dp, "dups")

Dates

Standardize/convert dates

df <- sample_data_1

# Equivalent call without the pipe:
#   date_standardize(dframe(df), "%d%b%Y")
dframe(df) %>%
  date_standardize("%d%b%Y")

Drop records without dates

# Row count of the input data (`df` is assigned in the previous example).
NROW(df)

# Row count after passing the data through date_missing().
dframe(df) %>%
  date_missing() %>%
  NROW()

Create date field from other fields

# Build a date field from the `year`, `month` and `day` columns.
sample_data_2 %>%
  dframe() %>%
  date_create(year, month, day)

Ecoregion

Filter by FAO areas

# WKT polygon passed to GBIF as the search geometry.
wkt <- "POLYGON((72.2 38.5,-173.6 38.5,-173.6 -41.5,72.2 -41.5,72.2 38.5))"

# Look up the GBIF backbone key for the species, then fetch up to 300
# occurrence records that have coordinates and match the geometry.
manta_ray <- rgbif::name_backbone("Mobula alfredi")$usageKey
res <- rgbif::occ_data(
  manta_ray,
  geometry = wkt,
  limit = 300,
  hasCoordinate = TRUE
)

# Map every returned occurrence (CRS set to EPSG:4326).
dat <- res$data %>%
  sf::st_as_sf(coords = c("decimalLongitude", "decimalLatitude")) %>%
  sf::st_set_crs(4326)
mapview::mapview(dat)

# Filter with the FAO dataset and the "OCEAN:Indian" region, then discard
# rows whose longitude is NA before building the spatial object.
tmp <- dframe(res$data) %>%
  eco_region(dataset = "fao", region = "OCEAN:Indian")
tmp <- tmp[!is.na(tmp$decimalLongitude), ]

# Map the filtered occurrences.
tmp2 <- tmp %>%
  sf::st_as_sf(coords = c("decimalLongitude", "decimalLatitude")) %>%
  sf::st_set_crs(4326)
mapview::mapview(tmp2)

Meta



ropenscilabs/scrubr documentation built on Sept. 12, 2022, 4:10 p.m.