Nothing
# Internal stacking method used by both neon_read and neon_store
neon_stack <- function(files,
keep_filename = FALSE,
sensor_metadata = TRUE,
altrep = FALSE,
progress = TRUE,
vroom_progress = FALSE,
...){
## Stack H5 Files (eddy-covariance only)
if(any(grepl("[.]h5$", files))){
stack_eddy(files, progress = progress, ...)
## Stack sensor data
} else if(is_sensor_data(files) && sensor_metadata){
df <- vroom_each(files,
progress = progress,
altrep = altrep,
vroom_progress = vroom_progress,
...)
add_sensor_columns(df)
## Stack observational data, with filename as a column
} else if(keep_filename) {
## Just keeps files names as an additional column in stacked data
vroom_each(files,
progress = progress,
altrep = altrep,
vroom_progress = vroom_progress,
...)
## Faster stacking of observational data, no additional meta columns
} else {
## Usually much much faster if we can do this one
vroom_many(files,
progress = progress,
altrep = altrep,
vroom_progress = vroom_progress,
...)
}
}
add_sensor_columns <- function(df){
filename_meta <- neon_filename_parser(df$file)
df$domainID <- filename_meta$DOM
df$siteID <- filename_meta$SITE
df$horizontalPosition <- filename_meta$HOR
df$verticalPosition <- filename_meta$VER
df$publicationDate <- as.POSIXct(filename_meta$GENTIME,
format = "%Y%m%dT%H%M%OS")
df
}
## read each file in separately and then stack them.
## include file name as additional id column
vroom_each <- function(files,
progress = TRUE,
altrep = FALSE,
vroom_progress = FALSE,
...){
if(progress){
pb <- progress::progress_bar$new(
format = paste(" reading files",
"[:bar] :percent in :elapsed, eta: :eta"),
total = length(files),
clear = FALSE,
width = 80)
}
groups <- lapply(files,
function(x){
if(progress) pb$tick()
out <- vroom::vroom(x, guess_max = 1e6,
altrep = altrep,
progress = vroom_progress,
show_col_types = FALSE,
...)
out$file <- basename(x)
out
})
suppressWarnings({
df <- ragged_bind(groups)
na_bool_to_char(df)
})
}
## vroom can read in a list of files, but only if columns are consistent
## So this attempts vroom over a list of files, but falls back on vroom_ragged
vroom_many <- function(files,
altrep = FALSE,
progress = FALSE,
vroom_progress = FALSE,
...){
df <- tryCatch(vroom::vroom(files,
guess_max = 1e6,
altrep = altrep,
progress = vroom_progress,
show_col_types = FALSE,
...),
error = function(e) vroom_ragged(files,
guess_max = 1e6,
altrep = altrep,
vroom_progress = FALSE,
show_col_types = FALSE,
...),
finally = NULL)
na_bool_to_char(df)
}
## Apply vroom over files that share a common schema.
vroom_ragged <- function(files,
altrep = FALSE,
vroom_progress = FALSE,
show_col_types = FALSE,
...){
options(readr.show_col_types = FALSE)
options(vroom.show_col_types = FALSE)
## We read the guess_max line of every file to determine schema
schema <- lapply(files,
vroom::vroom,
n_max = 1e6,
altrep = altrep,
progress = FALSE,
show_col_types = FALSE,
guess_max = 1e6,
#...
)
## Now, we read in tables in groups of matching schema,
## filling in additional columns as in bind_rows.
col_schemas <- lapply(schema, colnames)
u_schemas <- unique(col_schemas)
tbl_list <- vector("list", length=length(u_schemas))
all_cols <- unique(unlist(u_schemas))
i <- 1
for(s in u_schemas){
## select tables that have matching schemas
index <- vapply(col_schemas, identical, logical(1L), s)
col_types <- vroom::spec(schema[index][[1]])
## Read in all those tables
tbl <- vroom::vroom(files[index],
altrep = altrep,
progress = vroom_progress,
col_types = col_types)
## append any columns missing from all_cols set
missing <- all_cols[ !(all_cols %in% colnames(tbl)) ]
tbl[ missing ] <- NA
tbl_list[[i]] <- tbl
i <- i+1
}
do.call(rbind, tbl_list)
}
## vs do.call(rbind, x), dplyr::bind_rows will:
## - Handle differing numbers of columns
## - Handle type-coercion correctly (logical vs Date -> Date, not double)
## - Run about 15x faster and use much less RAM
## A base-R version of (recent versions of) dplyr::bind_rows,
## which can handle varying numbers of columns
ragged_bind <- function(x){
#x <- x[!is.null(x)]
#x <- x[vapply(x, nrow, 1) > 0]
col_schemas <- lapply(x, colnames)
col_types <- lapply(x, function(df) lapply(df, function(x) class(x)[[1]]))
u_schemas <- unique(col_schemas)
all_cols <- unique(unlist(u_schemas))
consensus_type <- vapply(all_cols, function(col)
type_hierarchy(unique(lapply(col_types, `[[`, col))),
""
)
for(i in seq_along(x)){
## append any columns missing from all_cols set
missing <- all_cols[ !(all_cols %in% colnames(x[[i]])) ]
x[[i]][ missing ] <- NA
x[[i]] <- set_schema_type(x[[i]], consensus_type)
}
do.call(rbind, x)
}
## Enforce column typing according to this consensus ranking
type_hierarchy <- function(x){
out <- max(ordered(x,
c("logical",
"integer",
"numeric",
"character",
"Date",
"POSIXct")), na.rm = TRUE)
as.character(out)
}
set_schema_type <- function(df, col_types){
if(!all(colnames(df) %in% names(col_types))){
stop("some column names not found in type list")
}
df_types <- vapply(df, function(x) class(x)[[1]], "")
wrong <- which(df_types != col_types[colnames(df)])
for(i in wrong){
## What do we do about non-coerce-able types?
if(is.logical(df[[i]]) && !all(is.na(df[[i]]))){
warning("cannot align column type", df_types[[i]],
"with", col_types[[i]])
}
df[[i]] <- as_type(df[[i]], col_types[[i]])
}
df
}
## bc methods::as(NA, "Date") fail!
as_type <- function(x, type){
switch(type,
"Date" = as.Date(x),
"POSIXct" = as.POSIXct(x),
"character" = as.character(x),
"numeric" = as.numeric(x),
"integer" = as.integer(x),
"logical" = as.logical(x),
x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.