# to do
# month and year selection options
# a way to estimate the potential size of the data frame to ask the user of they still want to read it in
# the above works right now for the lonlat case. Get it working for the curvilinear case
# need an option for cacheing results potentially...
# a smarter way to handle nas in the data frame, as some values might be na but others not
# add drop_na option
# also, add a check for the number of horizontal grids. Shouldn't read the netcdf if there is more than 1 GB(ish)
#' @title Read a list of netcdf file to a nested data frame. Experimental
#'
#' @description This reads a list of netcdf files to a data frame. This function is strict and currently only works when the files are of identifical format, and it will fail for netcdf files with multiple times. Experimental feature, likely to change.
#' @param ff_list Files to read. A character vector.
#' @param vars A list of variables you want to read in. Character vector. Everything is read in if this is empty.
#' @param cdo_output Do you want to show the cdo output? Set to TRUE in case you want to troubleshoot errors.
#' @param dim_check The number of data points in the final data frame that will ask to continue. Set to NULL if you don't want to check.
#' @export
nc_read_list <- function(ff_list, vars = NULL, cdo_output = FALSE, dim_check = 15e7) {
init_dir <- getwd()
on.exit(setwd(init_dir))
if (!is.character(ff_list)) {
stop("error: ff_list is not a character vector")
}
for (ff in ff_list) {
if (!file_valid(ff)) {
stop(stringr::str_glue("error: {ff} does not exist or is not netcdf"))
}
}
# let's loop through the files
result <- list()
tracker <- 1
# set grid_checked to false. Once we have found the grid and stored it we don't need to calculate it again. It is then reset to TRUE
grid_checked <- FALSE
for (ff_file in ff_list) {
ff <- ff_file
if (grid_checked == FALSE) {
if (as.integer(system(stringr::str_glue("cdo ngrids {ff}"), intern = TRUE, ignore.stderr = TRUE)) > 1) {
warning("warning: there is more than one horizontal grid in the netcdf file. Please check output for errors!")
}
# check that the vars given are actually in the file
if (!is.null(vars)) {
var_list <- stringr::str_flatten(nc_variables(ff), collapse = " ")
for (vv in vars) {
if (vv %in% nc_variables(ff) == FALSE) {
stop(stringr::str_glue("variable {vv} does not appear to be in the file. Available variables are {var_list}"))
}
}
}
# It is possible the files are not cdo compatible.
# Right now the package attempts to make them cdo compatible. Is there a smarter way round this for reading?
# Note: in read_list this only needs to happen once
delete_copy <- FALSE
if (!cdo_compatible(ff)) {
delete_copy <- TRUE
temp_dir <- random_temp()
file.copy(ff, stringr::str_c(temp_dir, "/raw.nc"), overwrite = TRUE)
# remove anything from the temporary folder to make sure there are no clashes etc.
setwd(temp_dir)
if (getwd() == init_dir) {
stop("error: there was a problem changing the directory")
}
if (getwd() != temp_dir) {
stop("error: there was a problem changing the directory")
}
temp_dir <- stringr::str_c(temp_dir, "/")
if (file.exists("dummy.nc")) {
file.remove("dummy.nc")
}
# We need to change ff to the name of the copied file so that the following code works
ff <- "raw.nc"
# If the file is not cdo compatible, we'll need to attempt to add coordinate variables...
add_missing_grid(ff, vars = vars)
# If this does not make the file cdo compatible, then we need to throw an error
if (!cdo_compatible("raw.nc")) {
stop("error: file is not cdo compatible, even after trying to fix the coordinates")
}
}
grid_details <- system(stringr::str_glue("cdo griddes {ff}"), intern = TRUE, ignore.stderr = TRUE)
grid_details <- grid_details %>%
dplyr::as_tibble()
grid_details <- grid_details %>%
dplyr::mutate(Total = 0) %>%
dplyr::mutate(Total = ifelse(startsWith(value, "yunits"), 1, Total)) %>%
dplyr::mutate(Total = cumsum(Total)) %>%
dplyr::filter(Total < 1)
grid_type <- grid_details %>%
dplyr::filter(startsWith(value, "gridtype")) %>%
tidyr::separate(value, into = c("ignore", "grid")) %>%
dplyr::select(grid) %>%
dplyr::pull(grid)
grid_details <- grid_details %>%
dplyr::filter(startsWith(value, "xname") | startsWith(value, "yname")) %>%
tidyr::separate(value, into = c("ignore", "variable"), sep = "=") %>%
dplyr::mutate(ignore = stringr::str_replace_all(ignore, " ", "")) %>%
dplyr::mutate(variable = stringr::str_replace_all(variable, " ", "")) %>%
tidyr::spread(ignore, variable)
lon_name <- stringr::str_replace_all(grid_details$xname, " ", "")
lat_name <- stringr::str_replace_all(grid_details$yname, " ", "")
depths <- system(stringr::str_glue("cdo showlevel {ff}"), intern = TRUE, ignore.stderr = (cdo_output == FALSE)) %>%
stringr::str_split(" ") %>%
.[[1]] %>%
as.numeric()
depths <- depths[complete.cases(depths)]
times <- NULL
}
# now, pull in the longitudes and latitudes...
nc_raw <- ncdf4::nc_open(ff)
nc_lon <- ncdf4::ncvar_get(nc_raw, lon_name)
nc_lat <- ncdf4::ncvar_get(nc_raw, lat_name)
# this is coded on the assumption that when there is only one depth and time, those dimensions will be collapsed to nothing
# this should be a valid assumption, and works on all netcdf files tested
if (grid_checked == FALSE) {
if ("curvilinear" %nin% grid_type & "unstructured" %nin% grid_type) {
# we need to estimate the data frame size for the curvilinear grid case
if (dim_check < length(nc_lon) * length(nc_lat) * length(depths) * length(times) * (2 + (length(depths) > 1) + (length(times) > 1))) {
choice <- readline(prompt = "This file is potentially large. Do you want to continue? (y/n): ")
print(choice)
if (choice != "y") {
(return("file is too big to read in"))
}
}
nc_grid <- eval(parse(text = stringr::str_c(
"expand.grid(Longitude = nc_lon, Latitude = nc_lat",
ifelse(length(depths) > 1, ",Depth = depths", ""),
ifelse(length(times) > 1, ",Time = times", ""),
")"
)))
} else {
# right now, this is relatively simplistic. Only uses row numbers. Could be smarter....
if (dim_check < length(nc_lon)) {
choice <- readline(prompt = "This file is potentially large. Do you want to continue? (y/n): ")
print(choice)
if (choice != "y") {
(return("file is too big to read in"))
}
}
if (length(depths) < 2 & length(times) < 2) {
nc_grid <- dplyr::tibble(Longitude = as.numeric(nc_lon), Latitude = as.numeric(nc_lat))
} else {
nc_grid <- dplyr::tibble(Longitude = as.numeric(nc_lon), Latitude = as.numeric(nc_lat))
dt_grid <- eval(parse(text = stringr::str_c(
"expand.grid(",
ifelse(length(depths) > 1, "Depth = depths", ""),
ifelse(length(depths) > 1 & length(times) > 1, ",", ""),
ifelse(length(times) > 1, "Time = times", ""),
")"
)))
nc_grid <- dt_grid %>%
dplyr::mutate(data = purrr::map(
1,
function(x) return(nc_grid)
)) %>%
tidyr::unnest()
}
}
grid_checked <- TRUE
}
# if no vars are supplied, use the variables in the netcdf file
if (is.null(vars)) {
vars <- nc_variables(ff)
}
nc_grid <- nc_grid %>%
dplyr::select(names(.)[names(.) %in% c("Longitude", "Latitude", "Depth")])
for (vv in vars) {
nc_var <- ncdf4::ncvar_get(nc_raw, vv)
nc_grid$var <- nc_var %>% as.numeric()
names(nc_grid)[ncol(nc_grid)] <- vv
}
nc_grid <- nc_grid %>%
# tidyr::drop_na() %>%
dplyr::as_tibble()
ncdf4::nc_close(nc_raw)
result[[tracker]] <- nc_grid
tracker <- tracker + 1
}
if (delete_copy) {
file.remove(stringr::str_glue(temp_dir, ff))
}
result <- tibble::enframe(result) %>%
dplyr::rename(File = name, data = value) %>%
dplyr::mutate(File = ff_list)
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.