Nothing
#' function for converting netcdfs to long data tables.
#'
#' @description The function converts netcdfs into long data.tables.
#' Be aware that the data table can be much larger in memory, especially if you have many dimension variables.
#'
#' @param nc Either a character string with the name of the .nc file (including path), or an object of type ncdf4.
#' @param vars Which variables should be read from the netcdf? Either a character vector of variable names, or an
#' integer vector of variable indices. If this is NULL, all variables are read.
#' @param verbose Either 0, 1 or 2. How much information should be printed?
#' The default (2) is to print the entire netcdf information (as output by \code{ncdf4::nc_open}), 1 just prints the units for all variables, 0 (or any other input)
#' prints nothing.
#' @param trymerge logical. If TRUE, a single data table containing all variables is returned, else a list of data
#' tables, one for each variable. The latter is much more memory efficient if you have multiple variables depending
#' on different dimensions.
#' @param subset_list A named list for reading only subsets of the data. Currently only 'rectangle subsetting' is provided, i.e. you can provide two limit values for each dimension and everything between
#' will be read. The names of the pages of subset_list must correspond to the names of dimension variables in the netcdf, and each page should contain a (two-element-)range vector.
#' For example, subsetting a global dataset to just East Africa could look like this: subset_list = list(latitude = c(-15,25),longitude = c(20,55)).
#' Non-rectangular subsetting during reading a netcdf seems to be difficult, see ncvar_get. Every dimension variable not named in subset_list is read entirely.
#' @param keep_nas Should missing values be kept? If FALSE (the default), missing values are not included in the returned data table.
#' If this is set to TRUE, the data table is constructed from the full data-cube (meaning its number of rows is the product of the length of the dimension variables, even if many coordinates
#' have missing data). This makes the returned data table potentially much larger and is almost never an advantage. It is only allowed, because it can make complex bookkeeping tasks easier
#' (specifically upscaling many CHIRPS-netcdfs with the same coordinates while saving the upscaling weights in a matrix).
#'
#' @return A data table if \code{trymerge == TRUE} or else a list of data tables.
#'
#' @import ncdf4
#'
#' @examples
#' # filename of example-netcdf file:
#' fn = system.file("extdata", "example.nc", package="SeaVal")
#'
#' dt = netcdf_to_dt(fn)
#' print(dt)
#'
#'
#' @export
netcdf_to_dt = function(nc, vars = NULL,
verbose = 2,
trymerge = TRUE,
subset_list = NULL,
keep_nas = FALSE)
{
if(is.character(nc))
{
if(!file.exists(nc)) stop(paste0('The file ',nc,' does not exist.\nRemember that you need to include the path to the directory the file is located in.'))
nc = ncdf4::nc_open(nc)
on.exit(ncdf4::nc_close(nc))
}
if(verbose == 2) print(nc)
# convert vars to numeric vector indexing the variables you want to extract:
if(is.null(vars))
{
vars = 1:nc$nvars
} else if(is.character(vars))
{
vars = match(vars,names(nc$var))
}
dt_list = list()
if(!is.null(subset_list))
{
subset_list = lapply(subset_list,range) # such that each page has 2 elements, important to allow subsetting like month = 10 or lon = 1:50
which_dims_subsetted = match(names(subset_list),names(nc$dim))
for(ii in which_dims_subsetted)
{
dim_name = nc$dim[[ii]]$name
vals = nc$dim[[ii]]$vals
read_inds = which(vals %between% subset_list[[dim_name]])
if(length(read_inds) == 0) {
warning(paste0("Dimvar ",dim_name," has no values within the subset range [",paste(subset_list[[dim_name]],collapse = ", "),"]"))
return(data.table())
}
subset_list[[dim_name]] = list(start = min(read_inds),count = max(read_inds) - min(read_inds) + 1)
}
}
units = NULL
for(var in vars)
{
legend_list = list() # we have to store which variables have a legend-attribute (which indicates that they originally contained characters).
v = nc$var[[var]]
if(v$ndims == 0)
{
warning(paste0('The variable ',v$name,' has no dimensions and is skipped.'))
next
}
if('legend' %in% names(ncatt_get(nc, varid = v$name))) {
legend_list = c(legend_list,ncatt_get(nc, varid = v$name, attname = 'legend')$value)
names(legend_list[[length(legend_list)]]) = v$name
}
units = c(units, paste0(v$name,': ',v$units))
# subsetting:
start = rep(1,v$ndims)
count = rep(-1,v$ndims)
for(ii in 1:length(v$dim))
{
if(v$dim[[ii]]$name %in% names(subset_list))
{
start[ii] = subset_list[[v$dim[[ii]]$name]]$start
count[ii] = subset_list[[v$dim[[ii]]$name]]$count
}
}
dim_lengths = v$varsize
dim_lengths = count * (count != -1) + v$varsize * (count == -1)
if(prod(dim_lengths) > 2^31) stop('The resulting data.table (for variable ',v$name,') is too long! Try subsetting.')
dt_temp = NULL
# generate data.table with dimension variables: Do NOT use nvar_get here because that fails when dimvarid is missing :-P
for(i in 1:v$ndims)
{
if('legend' %in% names(ncatt_get(nc, varid = v$dim[[i]]$name))) {
legend_list = c(legend_list,ncatt_get(nc, varid = v$dim[[i]]$name, attname = 'legend')$value)
names(legend_list[[length(legend_list)]]) = v$dim[[i]]$name
}
### vectorize dimension entries: ###
if(count[i] == -1)
{
# we need to first repeat using times = {the product of lengths of 'later' dimension vectors}...
dimension_vector = rep(v$dim[[i]]$vals,times = prod(dim_lengths[(i+1):(v$ndims + 1)],na.rm = T))
#... and then to repeat this using each = {the product of lengths of 'earlier' dimension vectors}:
dimension_vector = rep(dimension_vector,each = prod(dim_lengths[0:(i-1)],na.rm = T))
} else {
read_inds = start[i] : (start[i] + count[i] - 1)
# we need to first repeat using times = {the product of lengths of 'later' dimension vectors}...
dimension_vector = rep(v$dim[[i]]$vals[read_inds],times = prod(dim_lengths[(i+1):(v$ndims + 1)],na.rm = T))
#... and then to repeat this using each = {the product of lengths of 'earlier' dimension vectors}:
dimension_vector = rep(dimension_vector,each = prod(dim_lengths[0:(i-1)],na.rm = T))
}
dt_ttemp = data.table(dimension_vector)
setnames(dt_ttemp,v$dim[[i]]$name)
dt_temp = data.table(dt_temp, dt_ttemp)
}
# add variable values:
dt_ttemp = data.table(as.vector(ncvar_get(nc,varid = v$name,start = start,count = count)))
setnames(dt_ttemp,v$name)
dt_temp = data.table(dt_temp, dt_ttemp)
for(i in seq_along(legend_list)){
temp = strsplit(legend_list[[i]],split = ', ')
temp = strsplit(unlist(temp),split = ' = ')
inds = c()
char_values = c()
for(j in seq_along(temp)){
inds = c(inds,as.numeric(temp[[j]][1]))
char_values = c(char_values,temp[[j]][2])
}
dt_temp[,(names(legend_list[[i]])) := char_values[match(get(names(legend_list[[i]])),inds)]]
}
if(keep_nas) dt_list = c(dt_list,list(dt_temp))
if(!keep_nas) dt_list = c(dt_list,list(dt_temp[!is.na(get(v$name))]))
}
# merge list into data.table:
if(trymerge)
{
if(length(dt_list) == 1)
{
dt_list = rbindlist(dt_list) # for some reason unlist(,recursive = FALSE) does not work here and also unlists the data table
} else
{
dt0 = dt_list[[1]]
for(i in 2:length(dt_list))
{
if(length(intersect(names(dt0),names(dt_list[[i]]))) == 0)
{
stop('Your file has variables with disjoint dimensions, which should not be stored in a single data table. Either set trymerge to FALSE or select variables with overlapping dimensions in vars.' )
}
dt0 = merge(dt0,dt_list[[i]],by = intersect(names(dt0),names(dt_list[[i]])), all = T)
}
dt_list = dt0
}
}
# print units of dimension variables:
if(verbose == 1)
{
for(i in 1:length(nc$dim))
{
units = c(units, paste0(names(nc$dim)[i],': ',nc$dim[[i]]$units))
}
catout = paste0(c('Units:',units),sep = '',collapse = "\n")
cat(catout)
}
return(dt_list)
}
#' Write a netcdf from a long data table
#'
#'
#' @description This function writes a netcdf from a long data table, the usual data format in SeaVal.
#' If not specified, it guesses (based on column names) which columns contain dimension variables and which contain variables.
#' The function currently does not support writing netcdfs with multiple variables that have different sets of dimension variables!
#'
#' It allows to store character columns in netcdfs (essentially labelling them as integers and storing a legend).
#' This legend is automatically interpreted when the netcdf is read with [netcdf_to_dt()], but is also humanly readable.
#'
#' @param dt a data.table
#' @param vars names of columns in dt containing variables. If this is NULL, the function guesses and asks for confirmation.
#' @param units character vector containing the units for vars (in the same order). If this is NULL (default), the user is prompted for input.
#' @param dim_vars names of columns in dt containing dimension variables. If this is NULL, the function guesses and asks for confirmation.
#' @param dim_var_units character vector containing the units for dim_vars (in the same order). If this is NULL (default), the user is prompted for input (except for lon/lat).
#' @param nc_out File name (including path) of the netcdf to write.
#' @param check If check is TRUE, the function asks the user whether an existing file should be overwritten, and whether the correct dimvars have been guessed.
#' @param description For adding a global attribute 'Description' as a string.
#'
#' @return none.
#'
#' @import ncdf4
#'
#' @examples
#' example_dt = data.table(lon = 1:3, month = 4:6, prec = 7:9)
#' file_name = tempfile()
#' dt_to_netcdf(dt = example_dt, nc_out = file_name,
#' vars = "prec", units = "mm",
#' dim_vars = c("lon","month"), dim_var_units = c("degree longitude","month"))
#'
#'
#'
#' @export
dt_to_netcdf = function(dt,nc_out,
vars = NULL,
units = NULL,
dim_vars = dimvars(dt), dim_var_units = NULL,
check = interactive(),
description = NULL)
{
if(check & file.exists(nc_out))
{
test = readline(prompt = paste0('The netcdf file already exists. Do you want to overwrite? [y/n]'))
if(test == 'y') {
invisible(file.remove(nc_out))
}else(stop('aborted.'))
}
if(is.null(vars))
{
vars = setdiff(names(dt),dim_vars)
if(check)
{
message(paste("You didn't provide information which columns of dt are dim_vars and which are variables.",
"My guess would be as follows:",
paste0("variables: ",paste(vars,collapse = ', ')),
paste0("dim_vars: ",paste(dim_vars,collapse = ', ')),
"Is that ok?",sep = "\n"))
test = menu(c('yes','no'))
if(test != 1) stop('please provide vars and dim_vars manually. See ?dt_to_netcdf for details.')
}
}
# convert character columns to integers and record their levels
char_colnames = names(dt)[as.logical(dt[,lapply(.SD, is.character)])]
att_list_for_char_cols = list()
for(i in seq_along(char_colnames)){
dt[,char_colnames[i] := as.factor(get(char_colnames[i]))]
att_list_for_char_cols[[i]] = dt[,levels(get(char_colnames[i]))]
dt[,char_colnames[i] := as.integer(get(char_colnames[i]))]
}
# get data into correct order:
setkeyv(dt,rev(dim_vars))
# expand data.table by NAs to contain all values in a 'rectangular' array, and get dimension variables:
dim_list = list()
dim_vars_ncdf = list()
for(ii in seq_along(dim_vars))
{
dv = dim_vars[ii]
vals = sort(unique(dt[,get(dv)]))
temp = list(vals)
names(temp) = dv
### get unit for dimvar ###
if(!is.null(dim_var_units)){
unit = dim_var_units[ii]
} else {
if(dv %in% c('lon','Lon')) {unit = 'degree longitude'
}else if(dv %in% c('lat','Lat')) {unit = 'degree latitude'
}else if(dv %in% c('country','member','month','season','system','year')) {unit = dv
}else
{
unit = readline(prompt = paste0('Enter the unit of the dimension variable ',dv,':\n'))
}
}
dim_var = ncdf4::ncdim_def(name = dv,units = unit,vals = vals)
dim_vars_ncdf = c(dim_vars_ncdf,list(dim_var))
dim_list = c(dim_list,temp)
}
# get the 'rectangular' dt:
expand_dt = as.data.table(expand.grid(dim_list))
dt_new = merge(dt,expand_dt,by = key(dt),all = TRUE) # dt is keyed by dimension variables
# define the variables of the netcdf:
vars_ncdf = list()
for(ii in seq_along(vars))
{
v = vars[ii]
if(!v %in% names(dt_new)) stop(paste0(v,' is not a column name in dt.'))
vals = dt_new[,get(v)]
if(is.null(units))
{
unit = readline(prompt = paste0('Enter the unit of the variable ',v,':\n'))
} else {
unit = units[ii]
}
ncvar = ncdf4::ncvar_def(name = v,
units = unit,
dim = dim_vars_ncdf,
missval = NA)
vars_ncdf = c(vars_ncdf,list(ncvar))
}
# write the netcdf file:
nc = ncdf4::nc_create(filename = nc_out,vars = c(vars_ncdf))
on.exit(ncdf4::nc_close(nc))
for(ii in seq_along(vars))
{
v = vars[ii]
values = dt_new[,get(v)]
ncdf4::ncvar_put(nc, varid = vars[ii],vals = values)
}
for(i in seq_along(char_colnames)){
templist = att_list_for_char_cols[[i]]
legend_string = paste(1:length(templist),'=',templist,collapse = ', ')
ncatt_put(nc, varid = char_colnames[i], attname = 'legend', attval = legend_string)
# revert conversion of the column in dt:
dt[,char_colnames[i] := att_list_for_char_cols[[i]][get(char_colnames[i])]]
}
if(!is.null(description))
{
ncdf4::ncatt_put(nc,varid = 0,attname = 'Description',attval = description)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.