data-raw/load.DML.R

## load.DML.R: Download and process DML oxygen isotope data
##
## Author: Thomas Münch (thomas.muench@awi.de), Alfred-Wegener-Institut, 2018
##

library(usethis)

# 'pangaear' package for easy download from PANGAEA repository
library(pangaear)

# PANGAEA doi's of the DML data sets
doi <- c("10.1594/PANGAEA.104889",
         "10.1594/PANGAEA.104890",
         "10.1594/PANGAEA.104880",
         "10.1594/PANGAEA.104891",
         "10.1594/PANGAEA.104879",
         "10.1594/PANGAEA.104892",
         "10.1594/PANGAEA.104893",
         "10.1594/PANGAEA.104878",
         "10.1594/PANGAEA.104887",
         "10.1594/PANGAEA.104886",
         "10.1594/PANGAEA.104885",
         "10.1594/PANGAEA.104884",
         "10.1594/PANGAEA.104882",
         "10.1594/PANGAEA.104881",
         "10.1594/PANGAEA.104888")

# Names of the respective data sets
names <- c(paste("FB98",
                 c("04", "05", "07", "08", "09", "10",
                   "11", "13", "14", "15", "16", "17"),
                 sep = ""),
           "B31", "B32", "B33")

# Download and process entire DML data
dml1 <- load.DML(doi = doi, names = names,
                 setStart = 1994,
                 setEnd = c(rep(1801, 12), rep(1000, 3)),
                 fillNA = TRUE,
                 clearCache = TRUE, verbose = TRUE)

# Split into the two DML data sets used in the paper
dml2 <- structure(dml1[13 : 15], info = attr(dml1, "info"))
dml1[13 : 15] <- lapply(dml1[13 : 15], function(x){x[1 : length(dml1[[1]])]})

dml <- list(dml1 = dml1, dml2 = dml2)

usethis::use_data(dml, overwrite = TRUE)


#-------------------------------------------------------------------------------
# utility function

#' load.DML: Download and process DML data
#'
#' Download DML isotope data from the PANGAEA repository and process them for
#' the spectral analyses.
#'
#' @param doi vector of PANGAEA doi's ("10.1594/PANGAEA.xxxxxx").
#' @param names optional character vector of the names of the data sets.
#' @param setStart vector of start dates for each data set; if `NULL` the
#'   original start date is used. If only one date is given, the length of
#'   `setStart` is recycled to match the number of data sets.
#' @param setEnd vector of end dates for each data set; if `NULL` the original
#'   end date is used. If only one date is given, the length of `setEnd` is
#'   recycled to match the number of data sets.
#' @param fillNA Shall missing values be linearly interpolated? Defaults to
#'   `FALSE`.
#' @param clearCache Shall the local pangaea data cache (the downloaded data
#'   text files) be deleted? Defaults to `FALSE`.
#' @param verbose if `TRUE`, messages on the download and processing are
#'   printed.
#'
#' @return a list with the oxygen isotope data for the data sets.
#'
#' @author Thomas Münch
#'
load.DML <- function(doi,
                     names = NULL,
                     setStart = NULL,
                     setEnd = NULL,
                     fillNA = FALSE,
                     clearCache = FALSE,
                     verbose = FALSE) {

  if (length(setStart) == 1) {
    setStart <- rep(setStart, length(doi))
  }
  if (length(setEnd) == 1) {
    setEnd <- rep(setEnd, length(doi))
  }

  # Download data from PANGAEA
  data <- list()
  for (i in 1 : length(doi)) {

    tmp <- pangaear::pg_data(doi = doi[i], mssgs = verbose)
    
    # Keep age (yr AD) and d18O values only
    data[[i]] <- as.data.frame(tmp[[1]]$data)[, c(3, 6)]

    # Adjust time frame of record
    i.start <- ifelse(is.null(setStart[i]),
                      1, match(setStart[i], data[[i]][, 1]))
    i.end <- ifelse(is.null(setEnd[i]),
                    nrow(data[[i]]), match(setEnd[i], data[[i]][, 1]))

    data[[i]] <- data[[i]][i.start : i.end, ]

  }
  
  # Set names for the data set if provided
  if (!is.null(names)) {
    names(data) <- names
  }

  # Interpolate missing values if desired
  if (fillNA) {

    # Print number of interpolated vs. total number of data points
    if (verbose) {
      
      na.amount <- sapply(data, function(df) {
        sum(is.na(df[, 2]))
      })

      print(sprintf("Total # data points: %2.0f",
                    length(unlist(data)) / 2))
      print(sprintf("Data points interpolated: %2.0f.",
                    sum(na.amount)))

    }

    data <- lapply(data, function(df) {
      df[, 2] <- approx(df[, 1], df[, 2], df[, 1])$y
      return(df)
    })
    
  }

  # Keep only d18O values
  data <- lapply(data, function(df) {df[, 2]})
  
  # Clear the local pangaea data cache if desired
  if (clearCache) {
    pangaear::pg_cache$delete_all()
  }

  # Provide meta info as attributes
  attr(data, "info") <-
    sprintf("Downloaded and processed on: %s.", Sys.time())

  return(data)

}
EarthSystemDiagnostics/proxysnr documentation built on June 9, 2025, 11:58 a.m.