R/load_gbif_dwc.R
In gde: GBIF Dataset Explorer

Documented in load_gbif_dwc

#' Open GBIF download from zip file
#' 
#' Function that loads the data in a zip file downloaded from GBIF
#'
#' @param zipfile Path to the zipfile
#' @param tmpdir Temporary path for extracted files
#' @param page_title Title to show on the Shiny app.
#' @export
#' @import shiny
#' @import dplyr
#' @import readr
#' @import RSQLite
#' @importFrom jsonlite fromJSON
#' @importFrom stringr fixed
#' @importFrom jsonlite prettify
#' @importFrom stringr str_replace str_replace_all
#' @importFrom XML xmlToList
#' @importFrom progress progress_bar
#' @importFrom R.utils countLines
#' @importFrom R.utils isDirectory
#' @importFrom httr GET
#' @importFrom httr content
#' @importFrom data.table fread
#' @importFrom data.table setnames
#' @importFrom utils unzip
#' 
#' @examples
#' \dontrun{
#' data_loaded <- load_gbif_dwc(zipfile = "####.zip", tmpdir = "tmp", page_title = "GBIF Dataset 1")
#' }


load_gbif_dwc <- function(zipfile = NA, tmpdir = "tmp", page_title = "GBIF Data"){
  
  if (is.na(zipfile) || file.exists(zipfile) == FALSE){
    stop("zipfile was not set")
  }
  
  if (file.exists(zipfile) == FALSE){
    stop("zipfile not found")
  }
  
  if (file.exists("gbif.sqlite3")){
    unlink("gbif.sqlite3")
  }
  
  if (is.na(tmpdir)){
    cat("\n\n tmpdir not set, creating tmp")
    dir.create("tmp")
  }
  
  if (isDirectory(tmpdir) == FALSE){
    cat("\n\n tmpdir does not exists, creating tmp")
    dir.create("tmp")
  }
  
  gbif_db <- dbConnect(RSQLite::SQLite(), "gbif.sqlite3")
  
  #Save title
  n <- RSQLite::dbExecute(gbif_db, paste0("CREATE TABLE metadata(id, data)"))
  n <- RSQLite::dbExecute(gbif_db, paste0("INSERT INTO metadata (id, data) VALUES ('title', '", page_title, "');"))
  
  
  occ_file <- paste0(tmpdir, "/occurrence.txt")
  ver_file <- paste0(tmpdir, "/verbatim.txt")
  multi_file <- paste0(tmpdir, "/multimedia.txt")
  dataset_xml_path <- paste0(tmpdir, "/dataset/")
  metadata_path <- paste0(tmpdir, "/metadata.xml")

  
  #Extract file in the command line
  cat("\n Extracting files from zip...\n  Please wait...\n")
  #system2("unzip", args = c("-u", "-o", "-d", tmpdir, zipfile))
  unzip(zipfile, exdir = tmpdir, overwrite = TRUE)
  
  #Occurrence table----
  occ_cols <- data.table::fread(input = occ_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE, encoding = "UTF-8", quote = "", nrows = 1)
  
  occ_row1 <- data.table::fread(input = occ_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE, encoding = "UTF-8", quote = "", nrows = 1, skip = 10)
  gbif_cols_q <- paste0(occ_cols, collapse = " text, ")
  gbif_cols_q <- tolower(stringr::str_replace(gbif_cols_q, "gbifID text", "gbifid TEXT PRIMARY KEY"))
  gbif_cols_q <- paste0(gbif_cols_q, " text, ignorerow BOOLEAN DEFAULT 'f'")
  #Replace sql keywords
  gbif_cols_q <- stringr::str_replace(gbif_cols_q, ", group ", ", _group ")
  gbif_cols_q <- stringr::str_replace(gbif_cols_q, ", order", ", _order")
  gbif_cols_q <- stringr::str_replace(gbif_cols_q, "references", "_references")
  
  n <- RSQLite::dbExecute(gbif_db, paste0("CREATE TABLE bade_gbif_occ(", gbif_cols_q, ")"))
  
  #Verbatim table ----
  ver_cols <- data.table::fread(input = ver_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE, encoding = "UTF-8", quote = "", nrows = 1)
  ver_row1 <- data.table::fread(input = ver_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE, encoding = "UTF-8", quote = "", nrows = 1, skip = 10)
  
  ver_cols_q <- paste0(ver_cols, collapse = " text , ")
  ver_cols_q <- tolower(stringr::str_replace(ver_cols_q, "gbifID text", "gbifid TEXT PRIMARY KEY"))
  #Replace sql keywords
  ver_cols_q <- stringr::str_replace(ver_cols_q, "references", "_references")
  ver_cols_q <- stringr::str_replace(ver_cols_q, ", group", ", _group")
  ver_cols_q <- stringr::str_replace(ver_cols_q, ", order", ", _order")
  
  n <- RSQLite::dbExecute(gbif_db, paste0("CREATE TABLE bade_gbif_verbatim(", ver_cols_q, " text)"))
  
  
  #multimedia table ----
  mm_cols <- data.table::fread(input = multi_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE, encoding = "UTF-8", quote = "", nrows = 1)
  mm_row1 <- data.table::fread(input = multi_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE, encoding = "UTF-8", quote = "", nrows = 1, skip = 10)
  
  mm_cols_q <- paste0(mm_cols, collapse = " text, ")
  mm_cols_q <- tolower(stringr::str_replace(mm_cols_q, "gbifID text", "gbifid TEXT PRIMARY KEY"))
  #Replace sql keywords
  mm_cols_q <- stringr::str_replace(mm_cols_q, "references", "_references")
  
  n <- RSQLite::dbExecute(gbif_db, paste0("CREATE TABLE bade_gbif_multimedia(", mm_cols_q, " text)"))
  
  
  #Datasets table ----
  datasets_xml <- list.files(dataset_xml_path, pattern = "*.xml", full.names = TRUE)
  no_datasets <- length(datasets_xml)
  
  n <- RSQLite::dbExecute(gbif_db, 'CREATE TABLE bade_gbif_datasets(datasetKey text PRIMARY KEY, title text, institution text);')
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX ds_datasetKey ON bade_gbif_datasets(datasetKey);')
  
  pb <- progress::progress_bar$new(
    format = " Loading datasets [:bar] :percent in :elapsed",
    total = no_datasets, clear = FALSE, width= 100)
  
  for (i in 1:no_datasets){
    pb$tick()
    meta_file <- XML::xmlToList(datasets_xml[i])
    datasetKey <- stringr::str_replace(basename(datasets_xml[i]), ".xml", "")
    datasetTitle <- stringr::str_replace_all(meta_file$dataset$title, "'", "''")
    datasetInst <- stringr::str_replace_all(meta_file$dataset$creator$organizationName, "'", "''")
    insert_query <- paste0("INSERT INTO bade_gbif_datasets (datasetKey, title, institution) VALUES ('", datasetKey, "', '", datasetTitle, "', '", datasetInst, "');")
    n <- RSQLite::dbExecute(gbif_db, insert_query)
  }
  
  
  #how big?
  no_lines <- R.utils::countLines(occ_file)[1]
  
  #how many steps?
  no_rows <- 20000
  no_steps <- floor(no_lines/no_rows)
  
  if (no_steps ==0){
    no_steps <- 1
  }
  #occ_cols
  
  #Progress bar
  pb <- progress::progress_bar$new(
    format = " Loading occurrence and verbatim tables [:bar] :percent eta: :eta",
    total = no_steps, clear = FALSE, width= 100)
  
  for (i in 1:no_steps){
    pb$tick()
    
    if (i == 1){
      gbif_data <- data.table::fread(input = occ_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE, encoding = "UTF-8", quote = "", nrows = (no_rows - 1), skip = 1, colClasses = "character")
      verbatim_data <- data.table::fread(input = ver_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE, encoding = "UTF-8", quote = "", nrows = (no_rows - 1), skip = 1, colClasses = "character")
    }else{
      skip_rows <- i * no_rows
      gbif_data <- data.table::fread(input = occ_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE, encoding = "UTF-8", quote = "", nrows = no_rows, skip = skip_rows, colClasses = "character")
      verbatim_data <- data.table::fread(input = ver_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE, encoding = "UTF-8", quote = "", nrows = no_rows, skip = skip_rows, colClasses = "character")
    }
    
    while (dim(gbif_data)[2] != dim(occ_cols)[2]){
      gbif_data <- cbind(gbif_data, NA)
    }
    
    names(gbif_data) <- tolower(unlist(occ_cols))
    names(gbif_data)[names(gbif_data) == 'order'] <- '_order'
    names(gbif_data)[names(gbif_data) == 'group'] <- '_group'
    names(gbif_data)[names(gbif_data) == 'references'] <- '_references'

    names(verbatim_data) <- tolower(unlist(ver_cols))
    names(verbatim_data)[names(verbatim_data) == 'order'] <- '_order'
    names(verbatim_data)[names(verbatim_data) == 'group'] <- '_group'
    names(verbatim_data)[names(verbatim_data) == 'references'] <- '_references'
    
    #write rows
    RSQLite::dbWriteTable(gbif_db, "bade_gbif_occ", gbif_data, append = TRUE)
    RSQLite::dbWriteTable(gbif_db, "bade_gbif_verbatim", verbatim_data, append = TRUE)
  }
  
  rm(gbif_data)
  rm(verbatim_data)
  
  #Indices
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX gbifID ON bade_gbif_occ(gbifID);')
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX issue ON bade_gbif_occ(issue);')
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX gb_datasetKey ON bade_gbif_occ(datasetID);')
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX basisOfRecord ON bade_gbif_occ(basisOfRecord);')
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX scientificName ON bade_gbif_occ(scientificName);')
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX ignorerow ON bade_gbif_occ(ignorerow);')
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX verbatim_gbifID ON bade_gbif_verbatim(gbifID);')
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX multimedia_gbifID ON bade_gbif_multimedia(gbifID);')
  
  
  cat("\n Cataloging issues...")
  
  n <- RSQLite::dbExecute(gbif_db, "CREATE TABLE bade_gbif_issues(id serial PRIMARY KEY, gbifID text, issue text);")
  issue_list <- RSQLite::dbGetQuery(gbif_db, "SELECT DISTINCT issue FROM bade_gbif_occ WHERE issue != ''")
  
  issues_list <- data.frame(matrix(ncol = 1, nrow = 0, data = NA))
  for (i in 1:dim(issue_list)[1]){
    a <- strsplit(issue_list[i,1], ";")
    for (j in 1:length(a[[1]])){
      issues_list <- c(issues_list, a[[1]][j])
    }
  }
  
  distinct_issues <- unique(unlist(issues_list))
  
  pb <- progress::progress_bar$new(
    format = " Generating list of issues... [:bar] :percent eta: :eta",
    total = length(distinct_issues), clear = FALSE, width = 100)
  
  for (i in 1:length(distinct_issues)) {
    pb$tick()
    RSQLite::dbExecute(gbif_db, paste0("INSERT INTO bade_gbif_issues (gbifID, issue) SELECT gbifid, '", distinct_issues[i], "' FROM bade_gbif_occ WHERE issue LIKE '%", distinct_issues[i], "%'"))
  }
  
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX issue_issue ON bade_gbif_issues(issue);')
  n <- RSQLite::dbExecute(gbif_db, 'CREATE INDEX gbifID_issue ON bade_gbif_issues(gbifid);')
  
  fields <- c(tolower(unlist(occ_cols)))
  
  pb <- progress::progress_bar$new(
    format = " Generating field indices... [:bar] :percent eta: :eta",
    total = length(fields), clear = FALSE, width = 100)
  
  #Columns to skip
  skip_cols <- c("occurrenceremarks", "gbifid")
  
  for (f in seq(1, length(fields))){
    
    if (fields[f] %in% skip_cols){
      next
    }
    
    this_field <- stringr::str_replace(fields[f], stringr::fixed("group"), "_group")
    this_field <- stringr::str_replace(this_field, "island_group", "islandgroup")
    this_field <- stringr::str_replace(this_field, stringr::fixed("orderkey"), "orderkey_")
    this_field <- stringr::str_replace(this_field, "order", "_order")
    this_field <- stringr::str_replace(this_field, stringr::fixed("_orderkey_"), "orderkey")
    this_field <- stringr::str_replace(this_field, stringr::fixed("references"), "_references")
    this_field <- stringr::str_replace(this_field, stringr::fixed("associated_references"), "associatedreferences")
    this_field <- stringr::str_replace(this_field, stringr::fixed("geo_referencesources"), "georeferencesources")
    this_field <- stringr::str_replace(this_field, stringr::fixed("identification_references"), "identificationreferences")
    
    
    n <- RSQLite::dbExecute(gbif_db, paste0("CREATE INDEX IF NOT EXISTS bade_gbif_occ_", fields[f], "_idx ON bade_gbif_occ(", this_field, ")"))
  }
  
  
  #Field statistics----
  n <- RSQLite::dbExecute(gbif_db, "CREATE TABLE bade_gbif_issue_stats (field_name text, not_null_vals text, no_rows_distinct text)")
  
  fields <- RSQLite::dbGetQuery(gbif_db, "select * from pragma_table_info('bade_gbif_occ') as tblInfo;")
  
  fields <- dplyr::filter(fields, name != 'ignorerow')
  fields <- dplyr::filter(fields, name != 'gbifid')
  
  no_rows_total <- RSQLite::dbGetQuery(gbif_db, "SELECT count(*) from bade_gbif_occ")
  
  pb <- progress::progress_bar$new(
    format = " Calculating field statistics... [:bar] :percent eta: :eta",
    total = dim(fields)[1], clear = FALSE, width = 100)
  
  for (f in seq(1, dim(fields)[1])){
    pb$tick()
    this_field <- fields$name[f]
    
    
    no_rows_null_q <- paste0("SELECT count(*) as no_rows from bade_gbif_occ WHERE ", this_field, " IS NULL OR ", this_field, " = ''")
    no_rows_null <- RSQLite::dbGetQuery(gbif_db, no_rows_null_q)

    no_rows_notnull_pc1 <- round(((no_rows_total - no_rows_null)/no_rows_total) * 100, 2)
    no_rows_notnull_pc <- paste0(no_rows_notnull_pc1, " %")
    
    not_null <- paste0("<div class=\"progress\" style=\"background-color: #ffc107;\"><div class=\"progress-bar bg-success\" role=\"progressbar\" style=\"width: ", no_rows_notnull_pc1, "%; background-color: #28a745;\" aria-valuenow=\"", no_rows_notnull_pc1, "\" aria-valuemin=\"0\" aria-valuemax=\"100\" title=\"", no_rows_notnull_pc1, "\">", no_rows_notnull_pc1, "%</div></div>")
    
    no_rows_distinct <- RSQLite::dbGetQuery(gbif_db, paste0("SELECT count(DISTINCT ", this_field, ") as distinct_vals from bade_gbif_occ"))
    
    n <- RSQLite::dbExecute(gbif_db, paste0("INSERT INTO bade_gbif_issue_stats (field_name, not_null_vals, no_rows_distinct) VALUES ('", this_field, "', '", not_null, "', '", no_rows_distinct, "')"))
  }
  

  #Download metadata
  dl_meta_file <- XML::xmlToList(metadata_path)

  this_doi <- dl_meta_file$additionalMetadata$metadata$`gbif`$citation$.attrs
  gbif_key <- dl_meta_file$dataset$alternateIdentifier
  metadata_json <- paste0("http://api.gbif.org/v1/occurrence/download/", gbif_key)
  gbif_metadata <- httr::content(httr::GET(metadata_json), as="text", encoding = "UTF-8")
  
  n <- RSQLite::dbExecute(gbif_db, paste0("CREATE TABLE bade_gbif_metadata AS SELECT '", gbif_metadata, "' AS metadata_json"))
  
  dbDisconnect(gbif_db)
  
  #delete files
  system2("rm", args = c(paste0(tmpdir, "/*.txt")))
  system2("rm", args = c(paste0(tmpdir, "/*.xml")))
  system2("rm", args = c("-r", paste0(tmpdir, "/dataset")))
  
  cat("\n Done! The database is ready.\n\n Launch using launchApp()\n\n")
}