quantarcticR: Access to Quantarctica Data Sets

Documented in qa_dataset qa_datasets

#' Retrieve details of a Quantarctica data set
#'
#' @param name string: the name of the data set
#' @param refresh_cache numeric: 0 = do not overwrite existing files, 1 = overwrite if the remote file is newer than the local copy, 2 = always overwrite existing files
#' @param cache_directory string: the cache directory to use. As for the \code{path} parameter to the \code{\link{qa_cache_dir}} function
#' @param verbose logical: show progress messages?
#'
#' @return A tibble
#' @examples
#' \dontrun{
#'   dsx <- qa_dataset("ALBMAP Bed/bathymetry elevation (5km)")
#' }
#' @export
qa_dataset <- function(name, cache_directory = qa_cache_dir(), refresh_cache = 0, verbose = FALSE) {
    cache_directory <- resolve_cache_dir(cache_directory) ## convert "session" or "persistent" to actual paths, if needed
    ## find name in datasets index
    lx <- dataset_detail(name, cache_path = cache_directory, refresh_cache = refresh_cache, verbose = verbose)
    path <- dirname(lx$datasource) ## worst case, we'll grab everything in this directory
    ## refine what we download, if possible
    if (grepl("\\.shp$", tolower(lx$datasource)) && identical(lx$type, "shapefile")) {
        ## for shapefiles, only download files matching the same name (ignoring file extension) as the .shp file
        accept_download <- paste0(fs::path_ext_remove(basename(lx$datasource)), "\\.[^\\.]+$")
        ade <- character()
    } else if (grepl("\\.(tif[f]?|jp2)$", tolower(lx$datasource)) && identical(lx$type, "raster")) {
        ## if it's a tif or jp2 file, then we only need that one .tif file
        accept_download <- basename(lx$datasource)
        ## note that this will also download e.g. filename.tif.aux.xml files, but that's OK
        ade <- character()
    } else {
        accept_download <- bowerbird::bb_rget_default_downloads()
        ade <- "(jp2|vrt|ovr|jpg|jgw|cpg|dbf|prj|qix|shp|shx|xml)$"
    }
    ## the only other type (i.e. dataset$main_file extension) is .vrt
    ## these are virtual files that point to other files, and we can't know what they are without downloading the .vrt file
    ## but we just assume that the required files are kept in the same directory as the .vrt file

    ## need to know how many levels there are in the directory hierarchy of the mirror so that we can set the cut_dirs appropriately
    ## e.g. no_host = TRUE and cut_dirs = 1 will drop the hostname/Quantarctica3 part of the directory
    ## or no_host = TRUE and cut_dirs = 2 will drop the hostname/gis/Quantarctica3 parts of the directory
    my_cut_dirs <- length(fs::path_split(httr::parse_url(qa_mirror())$path)[[1]])
    bb <- bb_source(name = lx$layername,
                    id = paste0("Quantarctica: ", lx$layername),
                    description = "Quantarctica data",
                    doc_url = "http://quantarctica.npolar.no/",
                    citation = paste0("Matsuoka, K., Skoglund, A., & Roth, G. (2018). Quantarctica ", lx$layername, ". Norwegian Polar Institute. https://doi.org/10.21334/npolar.2018.8516e961"),
                    source_url = sub("[/\\]+$", "/", paste0(qa_mirror(), path, "/")), ## ensure trailing sep
                    license = "CC-BY 4.0 International",
                    method = list("bb_handler_rget", level = 1, no_host = TRUE, cut_dirs = my_cut_dirs, accept_download = accept_download, accept_download_extra = ade)
                    ##collection_size = tryCatch(as.numeric(lx$download_size)/1024^3, error = function(e) NA_real_)
                    ##data_group = "Topography")
                    )
    ## add the full path to the main file of this data set
    lx$main_file <- file.path(cache_directory, lx$datasource)
    lx$bb_source <- bb
    class(lx) <- c("qa_dataset", class(lx))
    lx
}

#' @method print qa_dataset
#' @export
print.qa_dataset <- function(x, ...) {
    ## as a placeholder, we'll just print the dataset object as a data.frame, but hide the bb_source component
    temp <- x
    message("This Quantarctica dataset is made available under a ", temp$bb_source$license, " license.\n")
    message("If the abstract provides a specific citation for this dataset, please use the citation in the abstract: \n", temp$abstract, "\n\n" )
    message("Otherwise cite as: ", temp$bb_source$citation)
    class(temp) <- setdiff(class(temp), "qa_dataset")
    tibble::glimpse(temp)
}

#' Available Quantarctica data sets
#'
#' @param cache_directory string: the cache directory to use. As for the \code{path} parameter to the \code{\link{qa_cache_dir}} function
#' @param refresh_cache numeric: as for \code{qa_get}
#' @param verbose logical: show progress messages?
#'
#' @return A tibble with columns \code{layername}, \code{main_file}, \code{type}, \code{cached}, and \code{download_size}
#'
#' @seealso \code{\link{qa_get}}
#'
#' @examples
#' \dontrun{
#'   qa_datasets()
#' }
#'
#' @export
qa_datasets <- function(cache_directory = qa_cache_dir(), refresh_cache = 0, verbose = FALSE) {
    cache_directory <- resolve_cache_dir(cache_directory) ## convert "session" or "persistent" to actual paths, if needed
    lxs <- dataset_index(cache_path = cache_directory, refresh_cache = refresh_cache, verbose = verbose)
    if (!is.null(lxs)) {
        lxs$cached <- vapply(lxs$datasource, file.exists, FUN.VALUE = TRUE, USE.NAMES = FALSE)
        ## rename datasource to main_file for consistency with qa_dataset
        names(lxs)[names(lxs) == "datasource"] <- "main_file"
        ## add download_size information, which has been pre-cached in the layer_sizes internal data object
        lxs <- merge(lxs, layer_sizes, all.x = TRUE, sort = FALSE)
        lxs$download_size <- fs::as_fs_bytes(lxs$download_size)
        as_tibble(lxs)
    } else {
        warning("something went wrong")
        NULL
    }
}

## internal function to get dataset index
## cache_path must be an actual path, not "session" or "persistent"

dataset_index <- function(cache_path, refresh_cache = 0, verbose = FALSE) {
    index_file <- fetch_dataset_index(cache_path = cache_path, refresh_cache = refresh_cache, verbose = verbose)
    lx <- xml2::read_xml(index_file)
    lxs <- as_tibble(do.call(rbind, lapply(xml2::xml_find_all(lx, ".//layer-tree-layer"), get_layer_details)))
    lxs <- setNames(lxs, c("layername", "datasource"))
    ## clean bad sources
    for (i in seq_along(lxs$datasource)) {
        if (!grepl("\\.[a-z0-9]$", lxs$datasource[i])) {
            lxs$datasource[i] <- strsplit(lxs$datasource[i], "\\|")[[1]][1]
        }
    }
    lxs$datasource <- sub("^.*Quantarctica3/", "", lxs$datasource)
    lxs$type <- type_from_filename(lxs$datasource)
    lxs$datasource <- sub("^\\./", "", lxs$datasource) ## strip leading ./ on path
    lxs$datasource <- file.path(cache_path, lxs$datasource)
    ## remove duplicate entries: there are three. See https://github.com/SCAR-sandpit/quantarcticR/issues/14
    lxs[!duplicated(lxs$layername), ]
}
get_layer_details <- function(z) as.data.frame(as.list(xml2::xml_attrs(z))[c("name", "source")], stringsAsFactors = FALSE)

dataset_detail <- function(name, cache_path, refresh_cache = 0, verbose = FALSE) {
    index_file <- fetch_dataset_index(cache_path = cache_path, refresh_cache = refresh_cache, verbose = verbose)
    lx <- xml2::read_xml(index_file)
    ## if we did not need to pare out duplicate layer names, we could just do:
    ##dx <- xml2::xml_find_all(lx, paste0("//projectlayers/maplayer[layername = '", name, "']"))
    ##if (length(dx) < 1) {
    ##    ## try case-insensitive match on name
    ##    dx <- xml2::xml_find_all(lx, paste0("//projectlayers/maplayer[lower-case(@layername) = '", tolower(name), "']"))
    ##}
    ## but for now, need to get rid of duplicates (until these are removed from the index file by the Quantarctica maintainers)
    lx <- xml2::xml_find_all(lx, "//projectlayers/maplayer")
    lyrs <- sapply(lx, function(z) xml2::xml_text(xml2::xml_child(z, search = "layername")))
    idx <- !is.na(lyrs) & !duplicated(lyrs) & lyrs == name
    if (sum(idx) < 1) {
        ## try case-insensitive match on name
        idx <- !is.na(lyrs) & !duplicated(lyrs) & tolower(lyrs) == tolower(name)
    }
    dx <- lx[idx]

    if (length(dx) < 1) {
        stop("no matching data set found")
    } else if (length(dx) > 1) {
        stop("multiple matching data sets found")
    } else {
        xmll <- xml2::as_list(dx)
        dx <- clean_layer(xmll)
        dx$palette <- list(get_layer_palette(xmll))
        dx$type <- type_from_filename(dx$datasource)
        ## add download_size information, which has been pre-cached in the layer_sizes internal data object
        szidx <- which(layer_sizes$layername == dx$layername)
        this_size <- if (length(szidx) == 1) layer_sizes$download_size[szidx] else NA
        dx$download_size <- fs::as_fs_bytes(this_size)
        dx
    }
}

## internal function to infer type (shapefile or raster or unknown) from the file extension
type_from_filename <- function(fname) {
    lxs_type <- rep("unknown", length(fname))
    lxs_type[grepl("shp$", fname, ignore.case = TRUE)] <- "shapefile"
    lxs_type[grepl("(tif|jp2|vrt)$", fname, ignore.case = TRUE)] <- "raster"
    lxs_type
}

## internal function to extract the colour paletter from the rasterrenderrer XML element
## xmll is the layer XML converted to a list with xml2::as_list(layer_xml)
get_layer_palette <- function(xmll) {
    if (length(xmll) == 1) xmll <- xmll[[1]]
    ptype <- attr(xmll$pipe$rasterrenderer, "type")
    crtype <- attr(xmll$pipe$rasterrenderer$rastershader$colorrampshader, "colorRampType")
    df_from_list_attrs <- function(ll, accept_names) {
        out <- lapply(ll, function(z) as.data.frame(attributes(z), stringsAsFactors = FALSE))
        if (!missing(accept_names) && !is.null(accept_names)) out <- out[names(out) %in% accept_names]
        do.call(rbind, out)
    }
    ## ptype                  crtype
    ## hillshade             NULL
    ## multibandcolor        NULL
    ## paletted              NULL
    ## singlebandgray        NULL
    ## singlebandpseudocolor INTERPOLATED
    ## singlebandpseudocolor EXACT
    ## singlebandpseudocolor DISCRETE
    if (is.null(ptype) || !ptype %in% c("hillshade", "multibandcolor", "paletted", "singlebandgray", "singlebandpseudocolor")) return(NULL)
    pal <- NULL
    if (ptype == "singlebandpseudocolor") {
        if (crtype %in% c("INTERPOLATED", "EXACT", "DISCRETE")) {
            pal <- df_from_list_attrs(xmll$pipe$rasterrenderer$rastershader$colorrampshader, accept_names = "item")
            pal$value <- as.numeric(pal$value)
        } else {
            warning("don't know how to handle color ramp type '", crtype, "', ignoring")
        }
    } else if (ptype == "singlebandgray") {
        if (!identical(attr(xmll$pipe$rasterrenderer, "gradient"), "BlackToWhite")) {
            warning("unexpected gradient type for singlebandgray palette, ignoring")
        } else {
            ##alpha <- tryCatch(as.numeric(attr(xmll$pipe$rasterrenderer, "opacity")), error = function(e) NA_real_)
            ##if (is.na(alpha) || alpha <= 0 || alpha > 1) alpha <- 1
            pal <- data.frame(color = grDevices::colorRampPalette(c("black", "white"))(51))
        }
    } else if (ptype == "paletted") {
        ## e.g. BEDMAP2 Lake Vostok mask
        pal <- df_from_list_attrs(xmll$pipe$rasterrenderer$colorPalette)
        pal$value <- as.numeric(pal$value)
    } else if (ptype == "multibandcolor") {
        ## these are RGB rasters with 3 layers (see e.g. "USGS/NASA subantarctic Landsat (15m)")
        ## no palette, plot with e.g. raster::plotRGB
    } else if (ptype == "hillshade") {
        ## ignore hillshade for now
        ## see e.g. pipe$rasterrenderer attributes "azimuth", "zfactor", "multidirection", "angle"
    } else {
        warning("don't know how to handle palette of type '", ptype, "', ignoring")
    }
    pal
}


## cache_path must be an actual path, not "session" or "persistent"
fetch_dataset_index <- function(cache_path, refresh_cache = 0, verbose = FALSE) {
    index_file <- file.path(cache_path, qa_index_filename())
    if (file.exists(index_file) && refresh_cache < 1) return(index_file) ## don't re-fetch if not needed
    if (!dir.exists(dirname(index_file))) tryCatch(dir.create(dirname(index_file), recursive = TRUE), error = function(e) stop("Could not create cache directory: ", dirname(index_file)))
    ## as of quantarctica 3.2, the index file is gzipped
    res <- bb_rget(url = paste0(qa_mirror(), sub("qgs$", "qgz", qa_index_filename())), force_local_filename = sub("qgs$", "qgz", index_file), use_url_directory = FALSE, verbose = verbose, clobber = refresh_cache)
    ## gunzip
    if (isTRUE(res$ok)) {
        tryCatch({
            utils::unzip(zipfile = res$files[[1]]$file[1], exdir = cache_path, overwrite = TRUE)
        }, error = function(e) {
            stop("could not unzip qgz file")
        })
    }
    if (file.exists(index_file)) {
        index_file
    } else {
        stop("could not retrieve dataset index file")
    }
}

## internal function to clean layer data
clean_layer <- function(layer) {
    l <- as_tibble(t(unlist(layer)))
    ld <- l[,c("layername", "datasource")]
    if (!grepl("\\.[a-z0-9]$", ld$datasource)) {
        ld$datasource <- strsplit(ld$datasource, "\\|")[[1]][1]
    }
    ld$datasource <- sub("^\\./", "", ld$datasource) ## strip leading ./ on path
    ld$layer_attributes <- ifelse(any(grepl("pipe", names(layer))),
                                  list(unlist(c(lapply(layer$pipe, attributes), attributes(layer)[-1]))),
                                  list(attributes(layer)[-1])
    )
    ld$srs_attributes <- list(l[c("srs.spatialrefsys.proj4", "srs.spatialrefsys.srsid",
                                  "srs.spatialrefsys.authid", "srs.spatialrefsys.description")])
    ld$provider <- if ("provider" %in% names(l)) l$provider else NA_character_
    ld$abstract <- if ("abstract" %in% names(l)) l$abstract else NA_character_
    if ("extent" %in% names(layer)) {
        ext <- unlist(layer$extent)
        class(ext) <- "numeric" ## from char to numeric
    } else {
        ext <- NULL
    }
    ld$extent <- list(ext)
    ld
}

## internal function that defines the index file name, just so that if in the future it changes we don't have to
## change a lot of hard-coded references to it
qa_index_filename <- function() "Quantarctica32.qgs"