# inst/doc/basic_data_cleaning.R

## ----include = FALSE----------------------------------------------------------
# Default knitr chunk options for the rendered vignette: collapsed output,
# "#>" as the output prefix, and a 7 x 6 figure size.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>",
                      fig.width = 7, fig.height = 6)

## ----results='hide', message=FALSE, warning=FALSE-----------------------------
# Attach the packages used throughout this script
library(kuenm2)
library(terra)

# Show the working directory currently in use
getwd()

# To work from another directory, uncomment the next line and edit the path
# setwd("YOUR/DIRECTORY")

# Keep a copy of the writable graphical parameters so they can be restored
# once the multi-panel plots further down are done
original_par <- par(no.readonly = TRUE)

## ----Import occurrence data---------------------------------------------------
# Load the example occurrence records distributed with kuenm2
data("occ_data_noclean", package = "kuenm2")

# Inspect the columns and their types
str(occ_data_noclean)

## ----Load variables-----------------------------------------------------------
# Read the example raster layers distributed with kuenm2
var <- terra::rast(
  system.file("extdata", "Current_variables.tif", package = "kuenm2")
)

# Retain a single layer (bio_1) for the examples that follow
var <- var[["bio_1"]]

# Map the selected layer
terra::plot(var)

## ----visualize----------------------------------------------------------------
# Visualize occurrences on one variable
## Build an extent covering both the layer and the records, so that records
## falling outside the layer are still drawn and all errors can be seen
vext <- terra::ext(var)  # extent of layer

# Coordinate ranges of the records. Columns are selected by name, as in the
# rest of this script, and vapply() avoids coercing the data.frame to a matrix.
# Result: 2 x 2 matrix (row 1 = min, row 2 = max; columns "x" and "y").
pext <- vapply(occ_data_noclean[c("x", "y")], range, numeric(2), na.rm = TRUE)

# Combined extent as (xmin, xmax, ymin, ymax), with 1 added as a margin
allext <- terra::ext(c(
  min(pext[1, "x"], vext[1]), max(pext[2, "x"], vext[2]),
  min(pext[1, "y"], vext[3]), max(pext[2, "y"], vext[4])
)) + 1

# plotting records on the variable
terra::plot(var, ext = allext, main = "Bio 1")
points(occ_data_noclean[, c("x", "y")])

## ----remove missing-----------------------------------------------------------
# Drop records with missing (NA) or empty values
# (columns = NULL presumably means "check every column" -- see ?remove_missing)
mis <- remove_missing(
  data = occ_data_noclean,
  columns = NULL,
  remove_na = TRUE,
  remove_empty = TRUE
)

# Row counts before and after this step
nrow(occ_data_noclean)
nrow(mis)

## ----remove duplicates--------------------------------------------------------
# Drop exact duplicate records, keeping every column in the output
mis_dup <- remove_duplicates(
  data = mis,
  columns = NULL,
  keep_all_columns = TRUE
)

# Row counts before and after this step
nrow(mis)
nrow(mis_dup)

## ----remove 00----------------------------------------------------------------
# Drop records whose x and y coordinates are both 0
# NOTE(review): the function name is spelled "corrdinates" as called here;
# presumably this matches the package export -- confirm before "fixing" it
mis_dup_00 <- remove_corrdinates_00(
  data = mis_dup,
  x = "x",
  y = "y"
)

# Row counts before and after this step
nrow(mis_dup)
nrow(mis_dup_00)

## ----filter decimal-----------------------------------------------------------
# Drop records whose coordinates have low decimal precision
# (threshold set with decimal_precision = 2)
mis_dup_00_dec <- filter_decimal_precision(
  data = mis_dup_00,
  x = "x",
  y = "y",
  decimal_precision = 2
)

# Row counts before and after this step
nrow(mis_dup_00)
nrow(mis_dup_00_dec)

## ----all basic----------------------------------------------------------------
# Run every basic cleaning step in a single call
clean_init <- initial_cleaning(
  data = occ_data_noclean,
  species = "species",
  x = "x",
  y = "y",
  remove_na = TRUE,
  remove_empty = TRUE,
  remove_duplicates = TRUE,
  by_decimal_precision = TRUE,
  decimal_precision = 2
)

# Row counts
nrow(occ_data_noclean)  # original data
nrow(clean_init)  # data after all basic cleaning steps

# Visual comparison on a 2 x 2 panel layout (three panels are drawn)
par(mfrow = c(2, 2))

## Panel 1: initial records over the combined extent
terra::plot(var, ext = allext, main = "Initial data")
points(occ_data_noclean[, c("x", "y")])

## Panel 2: records after basic cleaning, same combined extent
terra::plot(var, ext = allext, main = "After basic cleaning")
points(clean_init[, c("x", "y")])

## Panel 3: records after basic cleaning, layer's own extent
terra::plot(var, main = "After basic cleaning (zoom)")
points(clean_init[, c("x", "y")])

## ----cell duplicates----------------------------------------------------------
# Drop records that share a raster cell (pixel) with another record
celldup <- remove_cell_duplicates(
  data = clean_init,
  x = "x",
  y = "y",
  raster_layer = var
)

# Row counts
nrow(clean_init)  # data after all basic cleaning steps
nrow(celldup)  # plus removing cell duplicates

## ----move records-------------------------------------------------------------
# Relocate records to the closest valid pixel, within the distance limit
moved <- move_2closest_cell(
  data = celldup,
  x = "x",
  y = "y",
  raster_layer = var,
  move_limit_distance = 10
)

# Row counts
nrow(celldup)  # basic cleaning and no cell duplicates
# rows whose condition is anything other than "Not_moved"
nrow(moved[moved$condition != "Not_moved", ])

## ----all advanced-------------------------------------------------------------
# Run the advanced cleaning steps in a single call: remove cell duplicates
# (cell_duplicates = TRUE) and move records to valid pixels
# (move_points_inside = TRUE)
clean_data <- advanced_cleaning(data = clean_init, x = "x", y = "y",
                                raster_layer = var, cell_duplicates = TRUE,
                                move_points_inside = TRUE,
                                move_limit_distance = 10)

# exclude records flagged "Not_moved" and keep only the first three columns
clean_data <- clean_data[clean_data$condition != "Not_moved", 1:3]

# quick check
nrow(occ_data_noclean)  # original data
nrow(clean_init)  # data after all basic cleaning steps
nrow(clean_data)  # data after basic and advanced cleaning steps

# a final plot to check (3 x 2 panel layout)
par(mfrow = c(3, 2))

## initial data
terra::plot(var, ext = allext, main = "Initial")
points(occ_data_noclean[, c("x", "y")])

## data after basic cleaning steps
terra::plot(var, ext = allext, main = "Basic cleaning")
points(clean_init[, c("x", "y")])

terra::plot(var, main = "Basic cleaning (zoom)")
points(clean_init[, c("x", "y")])

## data after advanced cleaning steps
terra::plot(var, main = "Final data")
points(clean_data[, c("x", "y")])

## zoom to a particular area, initial data
## (axis limits are given in ascending order: min, max)
terra::plot(var, xlim = c(-50, -48), ylim = c(-26, -25),
            main = "Initial (zoom +)")
points(occ_data_noclean[, c("x", "y")])

## zoom to a particular area, final data
terra::plot(var, xlim = c(-50, -48), ylim = c(-26, -25),
            main = "Final (zoom +)")
points(clean_data[, c("x", "y")])

## ----par_reset----------------------------------------------------------------
# Restore the graphical parameters saved in `original_par`
par(original_par)

## ----save data, eval=FALSE----------------------------------------------------
# # Save as CSV
# write.csv(clean_data, file = "Clean_data.csv", row.names = FALSE)
# 
# # Save as RDS
# saveRDS(clean_data, file = "Clean_data.rds")
# 

# Try the kuenm2 package in your browser

# Any scripts or data that you put into this service are public.

# kuenm2 documentation built on April 21, 2026, 1:07 a.m.