Nothing
## ----include = FALSE----------------------------------------------------------------------------------------------------------------------
# Global knitr chunk defaults: collapse source and output into one block and
# prefix printed output with "#>"
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# Widen the print width so tibbles show more columns; the previous option
# values are kept in `old` so they can be restored at the end of the script
old <- options(width = 140)

# Read the code chunks stored in the vascan_url.R script found in the
# extdata folder of the dwctaxon package
knitr::read_chunk(
  path = system.file("extdata", "vascan_url.R", package = "dwctaxon")
)
## ----setup, message = FALSE---------------------------------------------------------------------------------------------------------------
library(dwctaxon)
library(readr)
library(tibble)
library(dplyr)
## ----download-setup-----------------------------------------------------------------------------------------------------------------------
# Paths used for downloading and unpacking the archive; all of them are
# located inside the R session's temporary directory.
# file.path() is used instead of pasting "/" by hand to build the paths.
# - Specify temporary folder for downloading data
temp_dir <- tempdir()
# - Set name of zip file
temp_zip <- file.path(temp_dir, "dwca-vascan.zip")
# - Set name of unzipped folder
temp_unzip <- file.path(temp_dir, "dwca-vascan")
## ----set-url------------------------------------------------------------------------------------------------------------------------------
# URL of the zip archive to fetch. It is checked with safe_to_download() and
# then passed to download.file() further down in this script.
# NOTE(review): `v=37.12` looks like a pinned archive version -- confirm.
vascan_url <- "https://data.canadensys.net/ipt/archive.do?r=vascan&v=37.12"
## ----echo = FALSE, results = "asis"-------------------------------------------------------------------------------------------------------
# Stop rendering early when the archive cannot be fetched.
# `vascan_url` is the value assigned in the set-url chunk and is the same URL
# that the download-unzip chunk passes to download.file()
if (!dwctaxon:::safe_to_download(vascan_url)) {
  # Write the notice as chunk output (the chunk uses results = "asis")
  cat(
    "Vignette rendering stopped. The zip file (",
    vascan_url,
    ") could not be downloaded. Check your internet connection and the URL.",
    sep = ""
  )
  knitr::knit_exit()
}
## ----download-unzip-----------------------------------------------------------------------------------------------------------------------
# Fetch the archive, writing the zip file in binary mode
download.file(vascan_url, destfile = temp_zip, mode = "wb")
# Extract the archive into its own folder
unzip(zipfile = temp_zip, exdir = temp_unzip)
# List what was extracted (the contents of the Darwin Core Archive)
list.files(path = temp_unzip)
## ----load-data----------------------------------------------------------------------------------------------------------------------------
# Read the tab-separated taxon table from the unzipped archive.
# file.path() builds the path instead of pasting "/" by hand.
vascan <- read_tsv(file.path(temp_unzip, "taxon.txt"))
# Take a peek at the data
vascan
## ----validation, error = TRUE-------------------------------------------------------------------------------------------------------------
# Run the validation on the data as downloaded. The chunk is evaluated with
# error = TRUE, so an error raised here is shown instead of halting the render.
dct_validate(vascan)
## ----validation-summary-------------------------------------------------------------------------------------------------------------------
# Validate again, this time with on_fail = "summary", and keep the result
# for inspection.
# NOTE(review): later chunks treat the result as a data frame with `check`
# and `error` columns -- confirm against the dct_validate() documentation.
validation_res <- dct_validate(vascan, on_fail = "summary")
validation_res
## ----summary-analysis---------------------------------------------------------------------------------------------------------------------
# Tally how many rows fall under each combination of check and error message
count(validation_res, check, error)
## ----summary-analysis-hide, show = FALSE, echo = FALSE------------------------------------------------------------------------------------
# Values computed in this hidden chunk (echo = FALSE). They are not used by
# any other code visible in this script; presumably the vignette's inline
# text refers to them -- confirm against the .Rmd source.

# Number of rows per check / error message combination
validation_res_sum <- count(validation_res, check, error)

# Number of distinct check / error combinations, spelled out in English
n_error_types <- english::english(nrow(validation_res_sum))

# Row count for the invalid `id` column name error, spelled out in English
n_bad_cols <- english::english(
  validation_res_sum %>%
    filter(error == "Invalid column names detected: id") %>%
    pull(n)
)

# Row count for the duplicated scientificName error (kept as a number)
n_bad_sci_name <- validation_res_sum %>%
  filter(error == "scientificName detected with duplicated value") %>%
  pull(n)
## ----check-sci-name-dups------------------------------------------------------------------------------------------------------------------
# Pull out the rows whose error reports a duplicated scientificName and sort
# them by name so that matching names sit next to each other
dup_names <- validation_res %>%
  filter(
    grepl(pattern = "scientificName detected with duplicated value", x = error)
  ) %>%
  arrange(scientificName)
dup_names
## ----check-sci-name-dups-orig-------------------------------------------------------------------------------------------------------------
# Look up the original records for the duplicated names by matching on
# taxonID, then keep only the first 6 columns of the result
dup_names %>%
  select(taxonID) %>%
  inner_join(vascan, by = "taxonID") %>%
  select(1:6)
## ----inspect-id---------------------------------------------------------------------------------------------------------------------------
# Show the `id` column on its own
select(vascan, id)
# Count how many different values it holds
n_distinct(vascan[["id"]])
## ----fix-data-----------------------------------------------------------------------------------------------------------------------------
# Keep only the first row for each scientificName, dropping later repeats
vascan_fixed <- filter(vascan, !duplicated(scientificName))
## ----validation-2-------------------------------------------------------------------------------------------------------------------------
# Re-run the validation on the de-duplicated data, passing "id" through
# extra_cols -- presumably so the non-standard `id` column is accepted
# (see the "Invalid column names detected: id" message); confirm in the
# dct_validate() documentation
dct_validate(vascan_fixed, extra_cols = "id")
## ----include = FALSE----------------------------------------------------------
# Reset options
# (restores the values saved in `old` when the print width was set to 140)
options(old)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.