knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Download GBIF data from gbif.org

#download.file(url="http://api.gbif.org/v1/occurrence/download/request/0000275-171219132708484.zip",
#              destfile="/home/mbiber/data/0000275-171219132708484.zip")
#unzip()

Create SQLite database

#' Load data.table library
library(data.table)

#' Set file directory
#filedir <- "E:/Data"
filedir <- "/home/mbiber/data"

# Specify file path
#gbif_file <- paste0(filedir, "/0002839-170627171947987.csv") #Old version
gbif_file <- paste0(filedir, "/0000275-171219132708484.csv")

#' Read GBIF Data into R, has 437 GB of size, so only read specify subset of rows
gbif_data <- data.table::fread(gbif_file, nrow=10)

#' See first 5 data entries
head(gbif_data)
#' See column names
colnames(gbif_data)

#' Get colClasses
(colClass_gbif <- as.character(rmngb::colClasses(gbif_data)))
#' Define eventdate as date class
colClass_gbif[25] <- "date"

#' Only select certain columns
gbif_data <- data.table::fread(gbif_file,
                    select=c(1:10,14,15,17:31,40,44), nrow=10)

#' Get column names
colnames_gbif <- colnames(gbif_data)

#' Read only 1000 entries at a time and only required columns
gbif_data <- data.table::fread(gbif_file,
                    select=c(1:10,14,15,17:31,40,44), 
                    col.names=colnames_gbif, 
                    colClasses=colClass_gbif,
                    skip=0, nrow=1000, data.table=FALSE)

#' Turn Date column into date format
gbif_data$eventdate <- as.Date(gbif_data$eventdate)

#' Specify date and datetime columns for Database
library(dplyr, quiet=TRUE)
date_cols <- gbif_data %>% 
  select_if(lubridate::is.Date) %>% 
  colnames()
gbif_data[ , date_cols] <- as.character.Date(gbif_data[ , date_cols])

#' Create gbif database (chunk by chunk)
con <- src_sqlite(paste0(filedir, "/gbif_database.sqlite"), create=TRUE)

Add data chunks to database

#' Connect to database
library(DBI)
con <- dbConnect(RSQLite::SQLite(), dbname = paste0(filedir, "/gbif_database.sqlite"))

# write this first batch of lines to SQLITE table, 
# converting dates to string representation
#dbWriteTable(con, "gbif", gbif_data, overwrite=TRUE)

# Function that appends new sections to the table
append_to_sqlite <- function(x, pos) {
  x <- as.data.frame(x)
  #x[ , date_cols] <- as.character.Date(x[ , date_cols])
  dbWriteTable(con, "gbif", x, append = TRUE)
}

# readr chunk functionality
library(readr)
read_delim_chunked(file=gbif_file, skip=377805260, callback=append_to_sqlite, delim=",",
                 chunk_size = 100000, 
                 col_names = colnames_gbif, 
                   col_types = cols(
                     gbifid = col_integer(),
                     datasetkey = col_character(),
                     occurrenceid = col_logical(),
                     kingdom = col_character(),
                     phylum = col_character(),
                     class = col_character(),
                     order = col_character(),
                     family = col_character(),
                     genus = col_character(),
                     species = col_character(),
                     countrycode = col_character(),
                     locality = col_character(),
                     decimallatitude = col_character(),
                     decimallongitude = col_character(),
                     coordinateuncertaintyinmeters = col_logical(),
                     coordinateprecision = col_logical(),
                     elevation = col_logical(),
                     elevationaccuracy = col_logical(),
                     depth = col_logical(),
                     depthaccuracy = col_logical(),
                     eventdate = col_character(),
                     day = col_integer(),
                     month = col_integer(),
                     year = col_integer(),
                     taxonkey = col_integer(),
                     specieskey = col_integer(),
                     basisofrecord = col_character(),
                     typestatus = col_logical(),
                     issue = col_character()
                   ))

#' Disconnect database
dbDisconnect(con)

Create indexes

#' Connect to database
con <- dbConnect(RSQLite::SQLite(), dbname = paste0(filedir, "/gbif_database.sqlite"))

#' Create indexes
db_create_indexes(con=con, table="gbif", indexes = list("gbifid", "species", "decimallatitude", 
                                              "decimallongitude", "eventdate", 
                                              "taxonkey", "specieskey"))

#' See if table really exists
dbListTables(con)

#' Now that we have copied the data, we can use tbl() to take a reference to it:
gbif_db <- dplyr::tbl(con, "gbif")
head(gbif_db)

#' Disconnect from database
dbDisconnect(con)


RS-eco/climateNiche documentation built on Feb. 21, 2023, 5:25 a.m.