R/finbif_occurrence_load.R

Defines functions split_col split_coord_euref_col split_coord_col split_dt_col split_taxa_col expand_lite_cols open_tsv_connection nlines infer_file_vars paste_col convert_col_type unlist_col bind_facts spread_facts select_facts deselect op_unzip rd_read dt_read any_issues add_nas get_zip new_vars fix_issue_vars localise_enums attempt_read read_finbif_tsv finbif_occurrence_load

Documented in finbif_occurrence_load

#' Load FinBIF occurrence records from a file
#'
#' Load occurrence data from a file as a `data.frame`.
#'
#' @aliases fb_occurrence_load
#'
#' @param file Character or Integer. Either the path to a Zip archive or
#'   tabular data file that has been downloaded from "laji.fi", a URI
#'   linking to such a data file (e.g.,
#'   [https://tun.fi/HBF.49381](https://tun.fi/HBF.49381)) or an integer
#'   representing the URI (i.e., `49381`).
#' @param select Character vector. Variables to return. If not specified, a
#'   default set of commonly used variables will be used. Use `"default_vars"`
#'   as a shortcut for this set. Variables can be deselected by prepending a `-`
#'   to the variable name. If only deselects are specified the default set of
#'   variables without the deselection will be returned. Use `"all"` to select
#'   all available variables in the file.
#' @param n Integer. How many records to import. Negative and other invalid
#'   values are ignored causing all records to be imported.
#' @param write_file Character. Path to write downloaded zip file to if `file`
#'   refers to a URI. Will be ignored if `getOption("finbif_cache_path")` is not
#'   `NULL` and will use the cache path instead.
#' @param dt Logical. If package, `data.table`, is available return a
#'   `data.table` object rather than a `data.frame`.
#' @param keep_tsv Logical. Whether to keep the TSV file if `file` is a ZIP
#'   archive or represents a URI. Is ignored if `file` is already a TSV. If
#'   `TRUE` the tsv file will be kept in the same directory as the ZIP archive.
#' @param facts List. A named list of "facts" to extract from supplementary
#'   "fact" files in a local or online FinBIF data archive. Names can include
#'   one or more of `"record"`, `"event"` or `"document"`. Elements of the list
#'   are character vectors of the "facts" to be extracted and then joined to the
#'   return value.
#' @param type_convert_facts Logical. Should facts be converted from character
#'   to numeric or integer data where applicable?
#' @param drop_facts_na Logical. Should missing or "all `NA`" facts be dropped?
#'   Any value other than a length one logical vector with the value of TRUE
#'   will be interpreted as FALSE. Argument is ignored if `drop_na` is TRUE for
#'   all variables explicitly or via recycling. To only drop some
#'   missing/`NA`-data facts use `drop_na` argument.
#' @param locale Character. One of the supported two-letter ISO 639-1 language
#'   codes. Current supported languages are English, Finnish and Swedish. For
#'   data where more than one language is available the language denoted by
#'   `locale` will be preferred while falling back to the other languages in the
#'   order indicated above.
#' @param skip Integer. The number of lines of the data file to skip before
#'   beginning to read data (not including the header).
#' @inheritParams finbif_occurrence
#' @return A `data.frame`, or if `count_only =  TRUE` an integer.
#' @examples \dontrun{
#'
#' # Get occurrence data
#' finbif_occurrence_load(49381)
#'
#' }
#' @export
finbif_occurrence_load <- function(
  file,
  select = NULL,
  n = -1,
  count_only = FALSE,
  quiet = getOption("finbif_hide_progress"),
  cache = getOption("finbif_use_cache"),
  dwc = FALSE,
  date_time_method = NULL,
  tzone = getOption("finbif_tz"),
  write_file = tempfile(),
  dt = NA,
  keep_tsv = FALSE,
  facts = list(),
  type_convert_facts = TRUE,
  drop_na = FALSE,
  drop_facts_na = drop_na,
  locale = getOption("finbif_locale"),
  skip = 0
) {

  fb_records_obj <- list(
    file = file,
    n = as.integer(n),
    count_only = count_only,
    quiet = quiet,
    cache = cache,
    write_file = write_file,
    dt = dt,
    keep_tsv = keep_tsv,
    skip = skip,
    facts = "none"
  )

  all_cols <- "all" %in% select
  var_names <- sysdata(list(which = "var_names"))
  var_type <- col_type_string(dwc)
  col_names <- var_names[[var_type]]
  deselect <- character()

  if (all_cols) {
    deselect <- grep("^-", select, value = TRUE)
    deselect <- gsub("-", "", deselect)
    deselect <- match(deselect, col_names)
    deselect <- row.names(var_names[deselect, ])
    select <- "default_vars"
  }

  fb_records_obj[["aggregate"]] <- "none"
  fb_records_obj[["select"]] <- select
  fb_records_obj[["include_facts"]] <- FALSE
  fb_records_obj[["var_type"]] <- var_type

  defer_errors({
    select <- infer_selection(fb_records_obj)
  })

  fact_types <- vapply(facts, length, 0L) > 0L
  fact_types <- which(fact_types)
  fact_types <- names(fact_types)

  select[["deselect"]] <- deselect
  select[["all"]] <- all_cols
  select[["type"]] <- var_type
  select[["facts"]] <- fact_types

  fb_records_obj[["select"]] <- select
  fb_occurrence_df <- read_finbif_tsv(fb_records_obj)

  if (count_only) {
    return(fb_occurrence_df)
  }

  file_vars <- attr(fb_occurrence_df, "file_vars", TRUE)
  attr(file_vars, "var_type") <- var_type

  select[["lite"]] <- attr(file_vars, "lite", TRUE)
  attr(fb_occurrence_df, "locale") <- locale

  cache <- c(cache, getOption("finbif_use_cache_metadata"))
  attr(fb_occurrence_df, "cache") <- cache[1:2]

  fb_occurrence_df <- localise_enums(fb_occurrence_df)

  select <- select_facts(select)
  attr(fb_occurrence_df, "select") <- select

  fb_occurrence_df <- new_vars(fb_occurrence_df)

  record_id <- file_vars[["translated_var"]] == "record_id"
  record_id <- file_vars[record_id, ]
  record_id <- rownames(record_id)

  event_id <- file_vars[["translated_var"]] == "event_id"
  event_id <- file_vars[event_id, ]
  event_id <- rownames(event_id)

  document_id <- file_vars[["translated_var"]] == "document_id"
  document_id <- file_vars[document_id, ]
  document_id <- rownames(document_id)

  record_id_col <- fb_occurrence_df[[record_id]]

  fb_occurrence_df <- expand_lite_cols(fb_occurrence_df)

  df_names <- names(fb_occurrence_df)
  nms <- file_vars[df_names, var_type]
  na_nms <- is.na(nms)
  names(fb_occurrence_df) <- ifelse(na_nms, df_names, nms)

  select_user <- select[["user"]]

  fb_occurrence_df <- structure(
    fb_occurrence_df,
    select_user = select_user,
    column_names = select_user,
    aggregate = "none",
    dwc = dwc,
    date_time_method = date_time_method,
    tzone = tzone,
    locale = locale,
    include_new_cols = !all_cols,
    record_id = record_id_col
  )

  fb_occurrence_df <- compute_vars_from_id(fb_occurrence_df)
  fb_occurrence_df <- compute_abundance(fb_occurrence_df)
  fb_occurrence_df <- compute_citation(fb_occurrence_df)
  fb_occurrence_df <- compute_coordinate_uncertainty(fb_occurrence_df)
  fb_occurrence_df <- compute_scientific_name(fb_occurrence_df)
  fb_occurrence_df <- add_nas(fb_occurrence_df)

  select[["user"]] <- names(fb_occurrence_df)
  n_recs <- attr(fb_occurrence_df, "nrow", TRUE)

  if (!all_cols) {
    select[["user"]] <- select_user

    datetime_obj <- list(date_time_method = date_time_method, n = n_recs)
    datetime_obj <- det_datetime_method(datetime_obj)

    fb_occurrence_df <- structure(
      fb_occurrence_df,
      select_user = select_user,
      column_names = select_user,
      aggregate = "none",
      dwc = dwc,
      date_time = select[["date_time_selected"]],
      date_time_method = datetime_obj[["date_time_method"]],
      tzone = tzone
    )

    fb_occurrence_df <- date_times(fb_occurrence_df)
    fb_occurrence_df <- compute_date_time(fb_occurrence_df)
    fb_occurrence_df <- compute_duration(fb_occurrence_df)
    fb_occurrence_df <- compute_iso8601(fb_occurrence_df)
    fb_occurrence_df <- any_issues(fb_occurrence_df)

    df_names <- names(fb_occurrence_df)
    nrows <- nrow(fb_occurrence_df)
    na <- rep_len(NA, nrows)

    for (extra_var in setdiff(select_user, df_names)) {
      type <- var_names[col_names == extra_var, "type"]
      fb_occurrence_df[[extra_var]] <- cast_to_type(na, type)
    }

  } else {
    select_user_keep <- !duplicated(select[["user"]])
    select[["user"]] <- select[["user"]][select_user_keep]
    fb_occurrence_df <- fb_occurrence_df[, select_user_keep, drop = FALSE]
  }

  attr(fb_occurrence_df, "file_cols") <- NULL
  attr(fb_occurrence_df, "file_vars") <- NULL
  class <- class(fb_occurrence_df)

  fb_occurrence_df <- structure(
    fb_occurrence_df,
    class = c("finbif_occ", class),
    nrec_dnld = n_recs,
    nrec_avl = n_recs,
    url = attr(fb_occurrence_df, "url", TRUE),
    time = "??",
    dwc = dwc,
    record_id = record_id_col
  )

  for (ftype in select[["facts"]]) {
    stopifnot("Invalid fact type" = ftype %in% c("record", "event", "document"))

    fb_records_obj[["select"]] <- list(
      all = TRUE,
      deselect = character(),
      type = "translated_var"
    )
    fb_records_obj[["n"]] <- -1L
    fb_records_obj[["facts"]] <- ftype
    fb_records_obj[["skip"]] <- 0L

    id <- switch(
      ftype,
      record = file_vars[[record_id, var_type]],
      event = file_vars[[event_id, var_type]],
      document = file_vars[[document_id, var_type]]
    )

    facts_df <- structure(
      read_finbif_tsv(fb_records_obj),
      facts = facts[[ftype]],
      fact_type = ftype,
      id = id,
      type_convert_facts = type_convert_facts,
      drop_facts_na = drop_facts_na
    )
    facts_df <- spread_facts(facts_df)

    select_user <- select[["user"]]

    df_names_user <- names(fb_occurrence_df[select_user])
    df_names_facts <- names(facts_df)
    df_names_facts <- setdiff(df_names_facts, id)

    select[["user"]] <- c(df_names_user, df_names_facts)
    attr(fb_occurrence_df, "facts_df") <- facts_df
    fb_occurrence_df <- bind_facts(fb_occurrence_df)
  }

  select_user <- name_chr_vec(select[["user"]])
  fb_occurrence_df <- fb_occurrence_df[, select_user, drop = FALSE]

  attr(fb_occurrence_df, "column_names") <- select_user
  attr(fb_occurrence_df, "drop_na") <- drop_na

  drop_na_col(fb_occurrence_df)
}

#' @noRd
#' @importFrom utils read.delim unzip
read_finbif_tsv <- function(fb_occurrence_obj) {
  file <- as.character(fb_occurrence_obj[["file"]])
  ptrn <- "^https?://.+?/HBF\\."

  if (grepl(ptrn, file)) {
    file <- sub(ptrn, "", file)
  }

  tsv <- basename(file)
  tsv <- gsub("zip", "tsv", tsv)

  facts <- fb_occurrence_obj[["facts"]]

  tsv_prefix <- switch(
    facts,
    none = "rows_",
    record = "unit_facts_",
    event = "gathering_facts_",
    document = "document_facts_"
  )

  valid_facts <- facts %in% c("none", "record", "event", "document")

  stopifnot(
    "Facts can only be of types: record, event and/or document" = valid_facts
  )

  file <- gsub("rows_", tsv_prefix, file)
  fb_occurrence_obj[["file"]] <- file
  fb_occurrence_obj[["tsv"]] <-  paste0(tsv_prefix, tsv)

  if (grepl("^[0-9]*$", file)) {
    fb_occurrence_obj[["tsv"]] <- sprintf("%sHBF.%s.tsv", tsv_prefix, file)
    finbif_dl_url <- getOption("finbif_dl_url")
    fb_occurrence_obj[["url"]] <- sprintf("%s/HBF.%s", finbif_dl_url, file)
    fb_occurrence_obj <- get_zip(fb_occurrence_obj)
    file <- fb_occurrence_obj[["file"]]
  }

  fb_occurrence_obj[["is_dwc"]] <- FALSE

  if (grepl("\\.zip$", file)) {
    files <- utils::unzip(path.expand(file), list = TRUE, unzip = op_unzip())

    if (any(grepl("meta\\.xml$", files[["Name"]]))) {
      fb_occurrence_obj[["is_dwc"]] <- TRUE
      fb_occurrence_obj[["tsv"]] <- switch(
        facts,
        none = "occurrences.txt",
        record = "facts/occurrence_facts.txt",
        event = "facts/event_facts.txt",
        document = "facts/parent_event_facts.txt"
      )
    }
  }

  df <- attempt_read(fb_occurrence_obj)

  if (fb_occurrence_obj[["count_only"]]) {
    return(df)
  }

  attr(df, "is_dwc") <- fb_occurrence_obj[["is_dwc"]]
  attr(df, "url") <- file

  if (identical(fb_occurrence_obj[["n"]], -1L)) {
    attr(df, "nrow") <- nrow(df)
  }

  df
}

#' @noRd
attempt_read <- function(fb_occurrence_obj) {
  use_dt <- fb_occurrence_obj[["dt"]]

  if (is.na(use_dt)) {
    use_dt <- TRUE
    fb_occurrence_obj[["dt"]] <- FALSE
  }

  fb_occurrence_obj[["nrows"]] <- nlines(fb_occurrence_obj)

  if (fb_occurrence_obj[["count_only"]]) {
    ans <- fb_occurrence_obj[["nrows"]]
  } else {
    if (use_dt && has_pkgs("data.table")) {
      input <- as.character(fb_occurrence_obj[["file"]])
      input_list <- list(input = input, tsv = fb_occurrence_obj[["tsv"]])
      input_list <- list(zip = input_list)

      if (grepl("\\.tsv$", input)) {
        fb_occurrence_obj[["keep_tsv"]] <- FALSE
        input_list <- list(input = input)
      }

      fb_occurrence_obj[["dt_args"]] <- input_list
      ans <- dt_read(fb_occurrence_obj)

    } else {
      ans <- rd_read(fb_occurrence_obj)
    }

    attr(ans, "nrow") <- fb_occurrence_obj[["nrows"]]
  }

  ans
}

#' @noRd
localise_enums <- function(df) {
  file_vars <- attr(df, "file_vars", TRUE)

  if (attr(df, "is_dwc", TRUE)) {
    row.names(file_vars) <- make.unique(file_vars[["dwc"]])
  }

  field_var_names <- row.names(file_vars)

  labels_obj <- list(
    var_names = file_vars,
    locale = attr(df, "locale", TRUE),
    cache = attr(df, "cache", TRUE)
  )

  for (nm in names(df)) {
    if (nm %in% field_var_names && isTRUE(file_vars[[nm, "localised"]])) {
      labels_obj[["labels"]] <- df[[nm]]
      labels_obj[["col"]] <- nm
      df[[nm]] <- localise_labels(labels_obj)
    }
  }

  df
}

#' @noRd
fix_issue_vars <- function(x) {
  type <- c("Time", "Location")

  for (i in c("Issue", "Source", "Message")) {
    for (j in 1:2) {
      issue <- sprintf("Issue.%s.%s", i, j)
      issue_type <- sprintf("%sIssue.%s", type[[j]], i)
      x <- sub(issue, issue_type, x)
    }
  }

  x
}

#' @noRd
new_vars <- function(df) {
  file_vars <- attr(df, "file_vars", TRUE)
  nss <- file_vars[["superseeded"]] == "FALSE"
  ss <- rownames(file_vars[!nss, ])
  if (attr(df, "is_dwc", TRUE)) ss <- file_vars[!nss, "dwc"]

  file_cols <- attr(df, "file_cols", TRUE)

  if (is.null(file_cols)) {
    file_cols <- names(df)
    attr(df, "file_cols") <- file_cols
  }

  ss <- intersect(ss, file_cols)
  var_names <- sysdata(list(which = "var_names"))
  select <- attr(df, "select", TRUE)

  ds <- select[["deselect"]]
  ds <- file_vars[["translated_var"]] %in% var_names[ds, "translated_var"]
  nms <- row.names(file_vars[nss & !ds, ])
  file_cols <- c(file_cols, file_vars[ss, "superseeded"])

  if (!select[["all"]] && nrow(df) > 0L) {
    df[setdiff(nms, file_cols)] <- NA
  }

  df
}

#' @noRd
#' @importFrom digest digest
#' @importFrom httr RETRY progress write_disk
get_zip <- function(fb_occurrenc_obj) {
  write_file <- fb_occurrenc_obj[["write_file"]]
  url <- fb_occurrenc_obj[["url"]]

  if (fb_occurrenc_obj[["cache"]] > 0) {
    hash <- sub(":\\d+", "", url)
    hash <- digest::digest(hash)
    fcp <- getOption("finbif_cache_path")

    if (is.null(fcp)) {
      cache_file <- get_cache(hash)

      if (!is.null(cache_file)) {
        fb_occurrenc_obj[["file"]] <- cache_file
        return(fb_occurrenc_obj)
      }

      on.exit({
        if (!is.null(write_file)) {
          cache_obj <- list(data = write_file, hash = hash, timeout = Inf)
          set_cache(cache_obj)
        }
      })
    } else if (is.character(fcp)) {
      file_name <- paste0("finbif_dwnld_cache_file_", hash)
      write_file <- file.path(fcp, file_name)

      if (file.exists(write_file)) {
        fb_occurrenc_obj[["file"]] <- write_file
        return(fb_occurrenc_obj)
      }

    } else {
      stop("Database cache cannot be used for FinBIF downloads.", call. = TRUE)
    }

  }

  progress <- NULL

  if (!fb_occurrenc_obj[["quiet"]]) {
    progress <- httr::progress()
  }

  allow <- getOption("finbif_allow_query")
  stopifnot("Request not cached and option:finbif_allow_query = FALSE" = allow)
  Sys.sleep(1 / getOption("finbif_rate_limit"))

  query <- list()
  auth <- Sys.getenv("FINBIF_RESTRICTED_FILE_ACCESS_TOKEN")

  if (!identical(auth, "")) {
    query <- list(personToken = auth)
  }

  resp <- httr::RETRY(
    "GET",
    url,
    httr::write_disk(write_file, overwrite = TRUE),
    progress,
    query = query,
    times =  getOption("finbif_retry_times"),
    pause_base = getOption("finbif_retry_pause_base"),
    pause_cap = getOption("finbif_retry_pause_cap"),
    pause_min = getOption("finbif_retry_pause_min"),
    quiet = fb_occurrenc_obj[["quiet"]],
    terminate_on = 404L
  )

  fs <- file.size(write_file)
  fl <- Sys.getenv("FINBIF_FILE_SIZE_LIMIT")
  fl <- as.integer(fl)

  if (isTRUE(fs > fl)) {
    stop("File download too large; err_name: too_large", call. = FALSE)
  }

  code <- resp[["status_code"]]

  if (!identical(code, 200L)) {
    msg <- sprintf("File request failed [%s]; err_name: request_failed", code)
    stop(msg, call. = FALSE)
  }

  fb_occurrenc_obj[["file"]] <- write_file
  fb_occurrenc_obj

}

#' @noRd
add_nas <- function(df) {
  dwc <- attr(df, "dwc", TRUE)
  var_type <- col_type_string(dwc)
  file_vars <- attr(df, "file_vars", TRUE)
  file_var_type <- file_vars[[var_type]]
  vnames <- sysdata(list(which = "var_names"))
  vnames_type <- vnames[[var_type]]

  for (nm in names(df)) {
    if (all_na(df[[nm]])) {
      ind <- file_var_type == nm & file_vars[["superseeded"]] == "FALSE"

      if (any(ind)) {
        df[[nm]] <- cast_to_type(df[[nm]], file_vars[ind, "type"])
      } else if (nm %in% vnames_type) {
        df[[nm]] <- cast_to_type(df[[nm]], vnames[vnames_type == nm, "type"])
      }

    }
  }

  df
}

#' @noRd
any_issues <- function(df) {
  dwc <- attr(df, "dwc", TRUE)
  vtype <- col_type_string(dwc)
  vnms <- sysdata(list(which = "var_names"))
  issues <- vnms[["unit.quality.documentGatheringUnitQualityIssues", vtype]]

  if (issues %in% attr(df, "select_user", TRUE) && !issues %in% names(df)) {
    issue <- logical()

    if (nrow(df) > 0L) {
      issue <- FALSE
      issue_cols <- c(
        "unit.quality.issue.issue",
        "gathering.quality.issue.issue",
        "gathering.quality.timeIssue.issue",
        "gathering.quality.locationIssue.issue"
      )
      has_an_issue_col <- FALSE

      for (i in issue_cols) {
        issue_col_nm <- vnms[[i, vtype]]
        issue_col <- df[[issue_col_nm]]
        has_issue_col <- length(issue_col) > 0L

        if (has_issue_col) {
          issue <- issue | !is.na(issue_col)
        }

        has_an_issue_col <- has_an_issue_col || has_issue_col
      }

      if (!has_an_issue_col) {
        issue <- NA
      }

    }

    df[[issues]] <- issue
  }

  df
}

#' @noRd
#' @importFrom utils unzip
dt_read <- function(fb_occurrence_obj) {
  skip <- fb_occurrence_obj[["skip"]]

  args <- list(
    nrows = 0,
    showProgress = !fb_occurrence_obj[["quiet"]],
    data.table = fb_occurrence_obj[["dt"]],
    na.strings = "",
    quote = "",
    sep = "\t",
    fill = TRUE,
    check.names = FALSE,
    header = TRUE,
    skip = 0L
  )
  args <- c(fb_occurrence_obj[["dt_args"]], args)

  if ("zip" %in% names(args)) {
    unzip <- op_unzip()
    zip_input <- path.expand(args[[c("zip", "input")]])
    zip_tsv <- args[[c("zip", "tsv")]]
    dir <- dirname(zip_input)
    args_input <- sprintf("%s/%s", dir, zip_tsv)
    args[["input"]] <- args_input

    if (!file.exists(args_input)) {
      utils::unzip(zip_input, files = zip_tsv, exdir = dir, unzip = unzip)

      if (!fb_occurrence_obj[["keep_tsv"]]) {
        on.exit(unlink(args_input))
      }

    }

    args[["zip"]] <- NULL
  }

  cols <- sysdata(list(which = "cite_file_vars"))
  cols <- rownames(cols)

  if (file.exists(args[["input"]])) {
    cols <- do.call(data.table::fread, args)
    cols <- names(cols)
  }

  cols <- make.names(cols)
  cols <- make.unique(cols)
  cols <- fix_issue_vars(cols)

  file_vars <- infer_file_vars(cols)
  if (fb_occurrence_obj[["is_dwc"]]) {
    row.names(file_vars) <- make.unique(file_vars[["dwc"]])
  }

  select <- fb_occurrence_obj[["select"]]
  select[["file_vars"]] <- file_vars

  if (attr(file_vars, "lite", TRUE)) {
    args[["quote"]] <- "\""
  }

  args_select <- !cols %in% deselect(select)
  args[["select"]] <- which(args_select)

  args[["nrows"]] <- as.double(fb_occurrence_obj[["n"]])
  args[["check.names"]] <- TRUE

  skip_n <- 1

  if (fb_occurrence_obj[["is_dwc"]] && fb_occurrence_obj[["facts"]] == "none") {
    skip_n <- 3
  }

  args[["skip"]] <- skip + skip_n
  args[["header"]] <- FALSE

  df <- structure(
    vector("list", length(args[["select"]])), class = "data.frame"
  )

  if (file.exists(args[["input"]]) && fb_occurrence_obj[["nrows"]] > 0L) {
    df <- do.call(data.table::fread, args)
  }

  names(df) <- cols[args_select]
  classes <- file_vars[cols, "type"]
  classes <- classes[args_select]
  na_classes <- is.na(classes)
  classes <- ifelse(na_classes, "character", classes)

  for (i in seq_along(df)) {
    df[[i]] <- cast_to_type(df[[i]], classes[[i]])
  }

  attr(df, "file_vars") <- file_vars
  attr(df, "file_cols") <- cols

  df
}

#' @noRd
#' @importFrom utils read.delim unzip
rd_read <- function(fb_occurrence_obj) {
  file <- fb_occurrence_obj[["file"]]
  tsv <- fb_occurrence_obj[["tsv"]]

  if (fb_occurrence_obj[["keep_tsv"]] && !grepl("\\.tsv$", file)) {
    unzip <- op_unzip()
    dir <- dirname(file)
    utils::unzip(file, tsv, exdir = dir, unzip = unzip)
  }

  connection_obj <- list(file = file, tsv = tsv, mode = "")
  con <- open_tsv_connection(connection_obj)
  quote <- ""

  df <- utils::read.delim(
    con,
    nrows = 1L,
    na.strings = "",
    quote = quote,
    skip = 0L
  )

  df_names <- names(df)
  cols <- fix_issue_vars(df_names)

  file_vars <- infer_file_vars(cols)
  if (fb_occurrence_obj[["is_dwc"]]) {
    row.names(file_vars) <- make.unique(file_vars[["dwc"]])
  }

  if (attr(file_vars, "lite", TRUE)) {
    quote <- "\""
  }

  select <- fb_occurrence_obj[["select"]]
  select[["file_vars"]] <- file_vars

  n <- as.integer(fb_occurrence_obj[["n"]])

  skip_n <- 1

  if (fb_occurrence_obj[["is_dwc"]] && fb_occurrence_obj[["facts"]] == "none") {
    skip_n <- 3
  }

  no_rows <- identical(fb_occurrence_obj[["nrows"]], 0L)

  if (identical(n, 0L) || inherits(con, "textConnection") || no_rows) {
    df <- df[0L, ]
  } else {
    df <- utils::read.delim(
      open_tsv_connection(connection_obj),
      header = FALSE,
      quote = quote,
      na.strings = "",
      nrows = n,
      skip = fb_occurrence_obj[["skip"]] + skip_n
    )
    classes <- file_vars[cols, "type"]
    na_classes <- is.na(classes)
    classes <- ifelse(na_classes, "character", classes)

    for (i in seq_along(df)) {
      df[[i]] <- cast_to_type(df[[i]], classes[[i]])
    }
  }

  idx <- !cols %in% deselect(select)
  df <- df[idx]
  names(df) <- cols[idx]
  attr(df, "file_vars") <- file_vars

  df
}

#' @noRd
op_unzip <- function() {
  unzip <- "internal"
  op_unzip <- getOption("unzip")

  if (!is.null(op_unzip) && !identical(op_unzip, "")) {
    unzip <- op_unzip
  }

  unzip
}

#' @noRd
deselect <- function(select) {
  file_vars <- select[["file_vars"]]
  type <- select[["type"]]
  deselect <- select[["deselect"]]
  var_names <- sysdata(list(which = "var_names"))
  ind <- file_vars[[type]] %in% var_names[deselect, type]
  row.names(file_vars[ind, ])
}

#' @noRd
select_facts <- function(select) {
  if (select[["lite"]]) {
    select[["facts"]] <- NULL
  }

  select
}

#' @noRd
spread_facts <-  function(facts) {
  select <- attr(facts, "facts", TRUE)
  type <- attr(facts, "fact_type", TRUE)
  id <- attr(facts, "id", TRUE)
  type_convert_facts <- attr(facts, "type_convert_facts", TRUE)
  drop_facts_na <- attr(facts, "drop_facts_na", TRUE)

  if (identical(nrow(facts), 0L)) {
    facts <- data.frame(
      Parent = NA_character_,
      Fact = NA_character_,
      Value = NA_character_,
      IntValue = NA_character_,
      DecimalValue = NA_character_
    )
  }

  names(facts)[1:3] <- c("Parent", "Fact", "Value")

  missing_facts <- character()
  select_facts <- facts[["Fact"]]
  ind <- match(select, select_facts)
  fact_names <- names(facts)
  id_col <- fact_names == "Parent"
  fact_names[id_col] <- id
  names(facts) <- fact_names
  na_ind <- is.na(ind)

  if (any(na_ind)) {
    missing_facts <- select[na_ind]
    warning <- paste(missing_facts, collapse = ", ")
    warning(
      "Selected fact(s) - ",
      warning,
      " - could not be found in dataset",
      call. = FALSE
    )

    missing_facts <- missing_facts[!isTRUE(drop_facts_na)]
  }

  if (!all(na_ind)) {
    select <- select[!na_ind]
    facts <- facts[select_facts %in% select, ]
    facts[["Fact"]] <- paste(type, "fact_", facts[["Fact"]], sep = "_")
    facts <- tapply(facts[["Value"]], facts[c("Fact", id)], c, simplify = FALSE)
    fact_dimnames <- dimnames(facts)

    selected_fact_nms <- paste(type, "fact_", select, sep = "_")
    fact_nms <- intersect(selected_fact_nms, fact_dimnames[["Fact"]])
    colnames <- c(id, fact_nms)
    ncols <- length(colnames)
    fact_list <- vector("list", ncols)
    names(fact_list) <- colnames
    ids <- fact_dimnames[[id]]
    fact_list[[id]] <- ids

    for (i in fact_nms) {
      fact_i <- facts[i, ]
      fact_i <- unname(fact_i)
      fact_i[vapply(fact_i, is.null, NA)] <- NA
      fact_i <- unlist_col(fact_i)

      if (type_convert_facts) {
        fact_i <- convert_col_type(fact_i)
      }

      fact_list[[i]] <- fact_i
    }

    facts <- structure(
      fact_list, class = "data.frame", row.names = seq_along(ids)
    )
  } else {
    facts <- facts[, id_col, drop = FALSE]
  }

  for (mf in missing_facts) {
    facts[[paste(type, "fact_", mf, sep = "_")]] <- NA_character_
  }

  attr(facts, "id") <- id
  unique(facts)
}

#' @noRd
bind_facts <- function(x) {
  facts <- attr(x, "facts_df", TRUE)
  id <- attr(facts, "id", TRUE)
  stopifnot("Cannot bind facts. ID column missing from data" = id %in% names(x))

  matches <- match(x[[id]], facts[[id]])
  facts[[id]] <- NULL
  facts <- facts[matches, , drop = FALSE]
  ans <- cbind(x, facts)
  attr <- attributes(x)
  attr[["names"]] <- names(ans)
  attributes(ans) <- attr
  ans
}

#' @noRd
unlist_col <- function(col) {
  col_unlisted <- unlist(col)
  col_len <- length(col)
  col_unlisted_len <- length(col_unlisted)

  if (identical(col_len, col_unlisted_len)) {
    col_unlisted
  } else {
    col
  }

}

#' @noRd
convert_col_type <- function(col) {

  if (is.list(col)) {
    col <- vapply(col, paste_col, "")
  }

  col[col == ""] <- NA_character_
  nws <- trimws(col)
  nws_no_na <- nws[!is.na(col)]
  num <- grepl("^[-+]?[0-9]*[\\.,]?[0-9]+([eE][-+]?[0-9]+)?$", nws_no_na)

  if (all(num)) {
    col <- as.numeric(nws)
    int <- !grepl("[\\.,]", nws_no_na)

    if (all(int)) {
      col <- as.integer(nws)
    }

  }

  col
}

#' @noRd
paste_col <- function(x) {
  x[is.na(x)] <- ""
  paste(x, collapse = ", ")
}

#' @noRd
infer_file_vars <- function(cols) {
  if (length(cols) < 65L && !"fact" %in% tolower(cols)) {
    file_vars <- sysdata(list(which = "lite_download_file_vars"))

    locale <- lapply(file_vars, intersect, cols)
    locale <- vapply(locale, length, 0L)
    locale <- locale == max(locale)
    locale <- which(locale)

    locale_length <- length(locale)
    one_locale <- identical(locale_length, 1L)

    stopifnot(
      "Field names incompatible with this {finbif} package version" = one_locale
    )

    locale_nms <- names(locale)
    locale <- locale_nms[[1L]]
    rownames(file_vars) <- file_vars[[locale]]

    attr(file_vars, "lite") <- TRUE
    attr(file_vars, "locale") <- locale

  } else {
    file_vars <- sysdata(list(which = "cite_file_vars"))
    attr(file_vars, "lite") <- FALSE
    attr(file_vars, "locale") <- "none"
  }

  file_vars
}

#' @noRd
nlines <- function(fb_occurrence_obj) {
  connection_obj <- list(
    file = fb_occurrence_obj[["file"]],
    tsv = fb_occurrence_obj[["tsv"]],
    mode = "rb"
  )

  con <- open_tsv_connection(connection_obj)
  on.exit(close(con))

  n <- -1L

  if (fb_occurrence_obj[["is_dwc"]]) n <- -3L

  cond <- !inherits(con, "textConnection")

  while (cond) {
    chunk <- readBin(con, "raw", 65536L)
    chunk_10 <- chunk == as.raw(10L)
    n <- n + sum(chunk_10)
    empty <- raw(0L)
    cond <- !identical(chunk, empty)
  }

  n
}

#' @noRd
#' @importFrom utils unzip
open_tsv_connection <- function(connection_obj) {
  file <- connection_obj[["file"]]
  mode <- connection_obj[["mode"]]
  tsv <- connection_obj[["tsv"]]

  nchars <- nchar(file)
  ext <- substring(file, nchars - 3L, nchars)

  if (identical(ext, ".tsv")) {
    file(file, mode)
  } else if (tsv %in% utils::unzip(file, list = TRUE)[["Name"]]) {
    unz(file, tsv, mode)
  } else {
    vars <- sysdata(list(which = "cite_file_vars"))
    vars <- rownames(vars)
    vars <- paste0(vars, collapse = "\t")
    textConnection(vars)
  }

}

#' @noRd
expand_lite_cols <- function(df) {
  select <- attr(df, "select", TRUE)

  if (!select[["all"]]) {
    formatted_taxon_name <- c(
      "scientific_name_interpreted",
      "common_name_english",
      "common_name_finnish",
      "common_name_swedish"
    )

    formatted_date_time <- c(
      "date_start",
      "date_end",
      "hour_start",
      "hour_end",
      "minute_start",
      "minute_end"
    )

    coordinates_euref <- c(
      "lat_min_euref",
      "lat_max_euref",
      "lon_min_euref",
      "lon_max_euref"
    )

    coordinates_1_ykj <- c("lon_1_ykj", "lat_1_ykj")
    coordinates_10_ykj <- c("lon_10_ykj", "lat_10_ykj")
    coordinates_1_center_ykj <- c("lon_1_center_ykj", "lat_1_center_ykj")
    coordinates_10_center_ykj <- c("lon_10_center_ykj", "lat_10_center_ykj")

    df_names <- names(df)
    file_vars <- attr(df, "file_vars", TRUE)
    locale <- attr(file_vars, "locale", TRUE)

    cols <- c(
      "formatted_taxon_name",
      "formatted_date_time",
      "coordinates_euref",
      "coordinates_1_ykj",
      "coordinates_10_ykj",
      "coordinates_1_center_ykj",
      "coordinates_10_center_ykj"
    )

    translated_vars <- file_vars[["translated_var"]]

    for (col in which(translated_vars %in% cols)) {
      col_nm <- rownames(file_vars[col, ])
      df_col <- df[[col_nm]]
      df_col_na <- is.na(df_col)
      has_col_nm <- col_nm %in% df_names && !all(df_col_na)

      if (has_col_nm) {
        attr(df_col, "locale") <- locale
        translated_var <- file_vars[[col, "translated_var"]]

        type <- switch(
          translated_var,
          formatted_taxon_name = "taxon",
          formatted_date_time = "date_time",
          coordinates_euref = "coordinates_euref",
          "coords"
        )

        split_cols <- switch(
          type,
          taxon = split_taxa_col(df_col),
          date_time = split_dt_col(df_col),
          coordinates_euref = split_coord_euref_col(df_col),
          coords = split_coord_col(df_col),
        )

        new_cols <- switch(
          translated_var,
          formatted_taxon_name = formatted_taxon_name,
          formatted_date_time = formatted_date_time,
          coordinates_euref = coordinates_euref,
          coordinates_1_ykj = coordinates_1_ykj,
          coordinates_10_ykj = coordinates_10_ykj,
          coordinates_1_center_ykj = coordinates_1_center_ykj,
          coordinates_10_center_ykj = coordinates_10_center_ykj
        )
        new_cols <- rownames(file_vars[translated_vars %in% new_cols, ])

        for (i in seq_along(new_cols)) {
          col <- new_cols[[i]]
          dfi <- df[[col]]
          na_dfi <- is.na(dfi)
          no_col <- is.null(dfi) || all(na_dfi)

          if (no_col) {
            df[[col]] <- split_cols[[i]]
          }

        }
      }
    }
  }

  df
}

#' @noRd
split_taxa_col <- function(col) {
  col_list <- list(col, n = 2L, split = " \u2014 ")
  split_cols <- split_col(col_list)
  col_list2 <- list(split_cols[[2L]], n = 2L, split = " \\(|\\)")

  common_names <- split_col(col_list2)
  common_names1 <- common_names[[1L]]
  common_names2 <- common_names[[2L]]
  common_names_na <- is.na(common_names1)

  locale <- attr(col, "locale", TRUE)
  common_names1 <- ifelse(common_names_na, locale, common_names1)

  split_cols <- list(scientific_name = split_cols[[1L]])

  for (loc in c("en", "fi", "sv")) {
    ind <- common_names1 == loc
    col_loc <- NA_character_
    col_loc[ind] <- common_names2[ind]
    split_cols[[loc]] <- col_loc
  }

  split_cols
}

#' @noRd
split_dt_col <- function(col) {
  col <- list(col, n = 2L, split = " - |/")
  col <- split_col(col)
  col <- lapply(col, list, n = 2L, split = " \\[|\\]")
  col <- lapply(col, split_col)

  dates <- lapply(col, "[[", 2L)
  dates1 <- dates[[1L]]
  dates2 <- dates[[2L]]
  dates_na <- is.na(dates1)

  times <- lapply(col, "[[", 1L)
  times <- lapply(times, list, n = 2L, split = "-")
  times <- lapply(times, split_col)

  times1 <- times[[1L]]
  times1 <- times1[[1L]]
  times2 <- times[[2L]]

  start_times <- list(times2[[2L]], n = 2L, split = ":")
  start_times <- split_col(start_times)

  times_na <- is.na(times1)

  end_times <- ifelse(times_na, times2[[1L]], times1)
  end_times <- list(end_times, n = 2L, split = ":")
  end_times <- split_col(end_times)

  list(
    date_start = dates2,
    date_end = ifelse(dates_na, dates2, dates1),
    hour_start = as.integer(start_times[[2L]]),
    hour_end =  as.integer(end_times[[2L]]),
    minute_start = as.integer(start_times[[1L]]),
    minute_end = as.integer(end_times[[1L]])
  )
}

#' @noRd
split_coord_col <- function(col) {
  col <- list(col, n = 2L, split = ":")
  split_cols <- split_col(col)
  lapply(split_cols, as.numeric)
}

#' @noRd
split_coord_euref_col <- function(col) {
  col <- list(col, n = 4L, split = "\\D")
  split_cols <- split_col(col)
  split_cols <- lapply(split_cols, as.numeric)
  rev(split_cols)
}

#' @noRd
split_col <- function(split_obj) {
  col <- as.character(split_obj[[1L]])
  sq <- seq_len(split_obj[["n"]])

  split_cols <- strsplit(col, split_obj[["split"]])
  split_cols <- lapply(split_cols, c, NA_character_)
  split_cols <- lapply(split_cols, "[", sq)
  split_cols <- lapply(split_cols, rev)
  split_cols <- do.call(rbind, split_cols)

  apply(split_cols, 2L, c, simplify = FALSE)
}

Try the finbif package in your browser

Any scripts or data that you put into this service are public.

finbif documentation built on Jan. 27, 2026, 9:06 a.m.