knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
Download GBIF data from gbif.org
#download.file(url="http://api.gbif.org/v1/occurrence/download/request/0000275-171219132708484.zip", # destfile="/home/mbiber/data/0000275-171219132708484.zip") #unzip()
Create SQLite database
#' Load data.table library library(data.table) #' Set file directory #filedir <- "E:/Data" filedir <- "/home/mbiber/data" # Specify file path #gbif_file <- paste0(filedir, "/0002839-170627171947987.csv") #Old version gbif_file <- paste0(filedir, "/0000275-171219132708484.csv") #' Read GBIF Data into R, has 437 GB of size, so only read specify subset of rows gbif_data <- data.table::fread(gbif_file, nrow=10) #' See first 5 data entries head(gbif_data) #' See column names colnames(gbif_data) #' Get colClasses (colClass_gbif <- as.character(rmngb::colClasses(gbif_data))) #' Define eventdate as date class colClass_gbif[25] <- "date" #' Only select certain columns gbif_data <- data.table::fread(gbif_file, select=c(1:10,14,15,17:31,40,44), nrow=10) #' Get column names colnames_gbif <- colnames(gbif_data) #' Read only 1000 entries at a time and only required columns gbif_data <- data.table::fread(gbif_file, select=c(1:10,14,15,17:31,40,44), col.names=colnames_gbif, colClasses=colClass_gbif, skip=0, nrow=1000, data.table=FALSE) #' Turn Date column into date format gbif_data$eventdate <- as.Date(gbif_data$eventdate) #' Specify date and datetime columns for Database library(dplyr, quiet=TRUE) date_cols <- gbif_data %>% select_if(lubridate::is.Date) %>% colnames() gbif_data[ , date_cols] <- as.character.Date(gbif_data[ , date_cols]) #' Create gbif database (chunk by chunk) con <- src_sqlite(paste0(filedir, "/gbif_database.sqlite"), create=TRUE)
Add data chunks to database
#' Connect to database library(DBI) con <- dbConnect(RSQLite::SQLite(), dbname = paste0(filedir, "/gbif_database.sqlite")) # write this first batch of lines to SQLITE table, # converting dates to string representation #dbWriteTable(con, "gbif", gbif_data, overwrite=TRUE) # Function that appends new sections to the table append_to_sqlite <- function(x, pos) { x <- as.data.frame(x) #x[ , date_cols] <- as.character.Date(x[ , date_cols]) dbWriteTable(con, "gbif", x, append = TRUE) } # readr chunk functionality library(readr) read_delim_chunked(file=gbif_file, skip=377805260, callback=append_to_sqlite, delim=",", chunk_size = 100000, col_names = colnames_gbif, col_types = cols( gbifid = col_integer(), datasetkey = col_character(), occurrenceid = col_logical(), kingdom = col_character(), phylum = col_character(), class = col_character(), order = col_character(), family = col_character(), genus = col_character(), species = col_character(), countrycode = col_character(), locality = col_character(), decimallatitude = col_character(), decimallongitude = col_character(), coordinateuncertaintyinmeters = col_logical(), coordinateprecision = col_logical(), elevation = col_logical(), elevationaccuracy = col_logical(), depth = col_logical(), depthaccuracy = col_logical(), eventdate = col_character(), day = col_integer(), month = col_integer(), year = col_integer(), taxonkey = col_integer(), specieskey = col_integer(), basisofrecord = col_character(), typestatus = col_logical(), issue = col_character() )) #' Disconnect database dbDisconnect(con)
Create indexes
#' Connect to database con <- dbConnect(RSQLite::SQLite(), dbname = paste0(filedir, "/gbif_database.sqlite")) #' Create indexes db_create_indexes(con=con, table="gbif", indexes = list("gbifid", "species", "decimallatitude", "decimallongitude", "eventdate", "taxonkey", "specieskey")) #' See if table really exists dbListTables(con) #' Now that we have copied the data, we can use tbl() to take a reference to it: gbif_db <- dplyr::tbl(con, "gbif") head(gbif_db) #' Disconnect from database dbDisconnect(con)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.