Nothing
## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----example-data-------------------------------------------------------------
library(vectra)
obs_path <- tempfile(fileext = ".vtr")
sites_path <- tempfile(fileext = ".vtr")
ref_path <- tempfile(fileext = ".vtr")
observations <- data.frame(
site_id = c(1, 1, 2, 2, 3, 3, 4, 4),
species = c("Quercus robur", "Pinus sylvestris",
"Quercus robur", "Fagus sylvatica",
"Pinus sylvestris", "Betula pendula",
"Fagus sylvatica", "Acer pseudoplatanus"),
date = c("2024-06-01", "2024-06-01", "2024-06-15",
"2024-06-15", "2024-07-01", "2024-07-01",
"2024-07-10", "2024-07-10"),
count = c(12, 5, 8, 3, 7, 14, 6, 2),
stringsAsFactors = FALSE
)
sites <- data.frame(
site_id = c(1, 2, 3, 5),
latitude = c(48.21, 47.07, 46.62, 48.85),
longitude = c(16.37, 15.44, 14.31, 13.02),
habitat = c("forest", "wetland", "grassland", "forest"),
stringsAsFactors = FALSE
)
reference <- data.frame(
species_name = c("Quercus robur", "Pinus sylvestris",
"Fagus sylvatica", "Betula pendula",
"Alnus glutinosa"),
family = c("Fagaceae", "Pinaceae", "Fagaceae",
"Betulaceae", "Betulaceae"),
conservation_status = c("LC", "LC", "LC", "LC", "LC"),
stringsAsFactors = FALSE
)
write_vtr(observations, obs_path)
write_vtr(sites, sites_path)
write_vtr(reference, ref_path)
## ----by-same-name-------------------------------------------------------------
left_join(tbl(obs_path), tbl(sites_path), by = "site_id") |>
collect()
## ----by-named-----------------------------------------------------------------
inner_join(
tbl(obs_path),
tbl(ref_path),
by = c("species" = "species_name")
) |> collect()
## ----by-natural---------------------------------------------------------------
left_join(tbl(obs_path), tbl(sites_path)) |>
collect()
## ----multi-key-setup----------------------------------------------------------
survey1_path <- tempfile(fileext = ".vtr")
survey2_path <- tempfile(fileext = ".vtr")
write_vtr(data.frame(
site_id = c(1, 1, 2, 2),
year = c(2023, 2024, 2023, 2024),
richness = c(5, 7, 3, 4)
), survey1_path)
write_vtr(data.frame(
site_id = c(1, 2, 2),
year = c(2024, 2023, 2024),
temperature = c(18.2, 16.1, 17.5)
), survey2_path)
inner_join(
tbl(survey1_path), tbl(survey2_path),
by = c("site_id", "year")
) |> collect()
## ----left-join----------------------------------------------------------------
enriched <- left_join(
tbl(obs_path), tbl(sites_path), by = "site_id"
) |> collect()
enriched
## ----left-join-suffix---------------------------------------------------------
extra_path <- tempfile(fileext = ".vtr")
write_vtr(data.frame(
site_id = c(1, 2, 3),
count = c(100, 200, 300)
), extra_path)
left_join(
tbl(obs_path), tbl(extra_path),
by = "site_id", suffix = c("_obs", "_site")
) |> collect()
## ----inner-join---------------------------------------------------------------
inner_join(
tbl(obs_path), tbl(sites_path), by = "site_id"
) |> collect()
## ----right-join---------------------------------------------------------------
right_join(
tbl(obs_path), tbl(sites_path), by = "site_id"
) |> collect()
## ----full-join----------------------------------------------------------------
full_join(
tbl(obs_path), tbl(sites_path), by = "site_id"
) |> collect()
## ----semi-join-setup----------------------------------------------------------
targets_path <- tempfile(fileext = ".vtr")
write_vtr(data.frame(
species = c("Quercus robur", "Betula pendula"),
stringsAsFactors = FALSE
), targets_path)
## ----semi-join----------------------------------------------------------------
semi_join(
tbl(obs_path), tbl(targets_path), by = "species"
) |> collect()
## ----anti-join----------------------------------------------------------------
anti_join(
tbl(obs_path),
tbl(ref_path),
by = c("species" = "species_name")
) |> collect()
## ----anti-join-reverse--------------------------------------------------------
anti_join(
tbl(ref_path),
tbl(obs_path),
by = c("species_name" = "species")
) |> collect()
## ----cross-join---------------------------------------------------------------
years_path <- tempfile(fileext = ".vtr")
write_vtr(data.frame(year = c(2022, 2023, 2024)), years_path)
grid <- cross_join(tbl(sites_path), tbl(years_path))
grid
## ----fuzzy-setup--------------------------------------------------------------
messy_path <- tempfile(fileext = ".vtr")
clean_path <- tempfile(fileext = ".vtr")
write_vtr(data.frame(
obs_id = c(1, 2, 3, 4, 5),
name = c("Quercus robar", "Pinus sylvestris L.",
"Fagus silvatica", "Betula pendla",
"Acer pseudoplatanus"),
stringsAsFactors = FALSE
), messy_path)
write_vtr(data.frame(
species_name = c("Quercus robur", "Pinus sylvestris",
"Fagus sylvatica", "Betula pendula",
"Acer pseudoplatanus"),
family = c("Fagaceae", "Pinaceae", "Fagaceae",
"Betulaceae", "Sapindaceae"),
stringsAsFactors = FALSE
), clean_path)
## ----fuzzy-join---------------------------------------------------------------
fuzzy_join(
tbl(messy_path),
tbl(clean_path),
by = c("name" = "species_name"),
method = "dl",
max_dist = 0.2
) |> collect()
## ----fuzzy-block-setup--------------------------------------------------------
messy2_path <- tempfile(fileext = ".vtr")
clean2_path <- tempfile(fileext = ".vtr")
write_vtr(data.frame(
genus = c("Quercus", "Pinus", "Fagus", "Betula"),
name = c("Quercus robar", "Pinus sylvestris L.",
"Fagus silvatica", "Betula pendla"),
stringsAsFactors = FALSE
), messy2_path)
write_vtr(data.frame(
genus = c("Quercus", "Pinus", "Fagus",
"Betula", "Alnus"),
species_name = c("Quercus robur", "Pinus sylvestris",
"Fagus sylvatica", "Betula pendula",
"Alnus glutinosa"),
stringsAsFactors = FALSE
), clean2_path)
## ----fuzzy-block--------------------------------------------------------------
fuzzy_join(
tbl(messy2_path),
tbl(clean2_path),
by = c("name" = "species_name"),
method = "dl",
max_dist = 0.25,
block_by = c("genus" = "genus")
) |> collect()
## ----multi-key-data-----------------------------------------------------------
counts_path <- tempfile(fileext = ".vtr")
effort_path <- tempfile(fileext = ".vtr")
write_vtr(data.frame(
site_id = c(1, 1, 2, 2, 3),
year = c(2023, 2024, 2023, 2024, 2024),
n_species = c(12, 15, 8, 10, 20)
), counts_path)
write_vtr(data.frame(
site_id = c(1, 1, 2, 3, 3),
year = c(2023, 2024, 2024, 2023, 2024),
hours = c(4.5, 5.0, 3.0, 6.0, 4.0)
), effort_path)
## ----multi-key-join-----------------------------------------------------------
left_join(
tbl(counts_path), tbl(effort_path),
by = c("site_id", "year")
) |> collect()
## ----multi-key-named----------------------------------------------------------
effort2_path <- tempfile(fileext = ".vtr")
write_vtr(data.frame(
loc = c(1, 1, 2, 3, 3),
survey_year = c(2023, 2024, 2024, 2023, 2024),
hours = c(4.5, 5.0, 3.0, 6.0, 4.0)
), effort2_path)
left_join(
tbl(counts_path), tbl(effort2_path),
by = c("site_id" = "loc", "year" = "survey_year")
) |> collect()
## ----coercion-----------------------------------------------------------------
int_path <- tempfile(fileext = ".vtr")
dbl_path <- tempfile(fileext = ".vtr")
write_vtr(data.frame(id = c(1L, 2L, 3L), x = c(10, 20, 30)), int_path)
write_vtr(data.frame(id = c(1.0, 2.0, 4.0), y = c(100, 200, 400)), dbl_path)
inner_join(tbl(int_path), tbl(dbl_path), by = "id") |> collect()
## ----na-keys------------------------------------------------------------------
na_left <- tempfile(fileext = ".vtr")
na_right <- tempfile(fileext = ".vtr")
write_vtr(data.frame(
id = c(1, NA, 3), x = c(10, 20, 30)
), na_left)
write_vtr(data.frame(
id = c(1, NA, 4), y = c(100, 200, 400)
), na_right)
left_join(tbl(na_left), tbl(na_right), by = "id") |> collect()
## ----guidance-enrich----------------------------------------------------------
# Attach habitat type to every observation
left_join(tbl(obs_path), tbl(sites_path), by = "site_id") |>
select(site_id, species, habitat) |>
collect()
## ----guidance-preprocess------------------------------------------------------
# Preprocessing before an exact join
tbl(obs_path) |>
mutate(species_clean = trimws(tolower(species))) |>
select(site_id, species_clean, count) |>
collect()
## ----cleanup, include = FALSE-------------------------------------------------
unlink(c(obs_path, sites_path, ref_path, targets_path,
extra_path, survey1_path, survey2_path,
years_path, messy_path, clean_path,
messy2_path, clean2_path,
counts_path, effort_path, effort2_path,
int_path, dbl_path, na_left, na_right))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.