#' List all CMIP5 files in a directory tree
#'
#' List all CMIP5 NetCDF files (\code{*.nc}) in a directory tree, parsing
#' their filenames for information like experiment, model, and variable names.
#'
#' @param path string root of directory tree
#' @param recursive logical. Should the listing recurse into directories?
#' @return A data.frame of character columns, one row per file, containing the
#' following parsed from file names (or \code{NULL}, with a warning, if no
#' correctly named NetCDF files are found):
#' \item{path}{Directory containing the file}
#' \item{filename}{File name, without directory and without the '.nc' extension}
#' \item{variable}{File variable}
#' \item{domain}{File domain}
#' \item{model}{Model that produced this file}
#' \item{experiment}{File experiment}
#' \item{ensemble}{File ensemble}
#' \item{time}{year (and often month) range of file; empty string for files
#' whose name has no time component (e.g. areacella)}
#' \item{size}{File size in kilobytes, as a string with a 'K' suffix}
#' @details Files whose names do not split (on '_') into 5 or 6 identifiers
#' are dropped with a warning. Rows for 5-identifier ('fixed') files are
#' listed before rows for 6-identifier ('temporal') files.
#'
#' For more information on CMIP5 filename structure and data description,
#' see \url{http://cmip-pcmdi.llnl.gov/cmip5/data_description.html}
#' @export
#' @examples
#' getFileInfo()
#' getFileInfo('.', recursive=FALSE)
#' @seealso \code{\link{checkTimePeriod}}
getFileInfo <- function(path='.', recursive=TRUE) {

    # Sanity checks
    assert_that(is.dir(path))
    assert_that(is.readable(path))
    assert_that(is.flag(recursive))

    # Pull all nc files from the directory. The dot is escaped and the
    # pattern anchored so that only a literal '.nc' extension matches
    # (a bare 'nc$' would also pick up e.g. 'notes.sync').
    fullFile <- list.files(path=path, pattern='\\.nc$',
                           full.names=TRUE, recursive=recursive)

    # Check that there are nc files to process
    if(length(fullFile) == 0) {
        warning('No NetCDF files found')
        return(NULL)
    }

    # Pull the file name w/o directory and take off the '.nc'
    shortFile <- sub('\\.nc$', '', basename(fullFile))

    # Split out the various components of the file name based on CMIP5
    # ...file naming convention: variable_domain_model_experiment_ensemble_time
    # ...or variable_domain_model_experiment_ensemble. The first example
    # ...contains 6 string identifiers while the second contains 5.
    fileInfo <- strsplit(shortFile, split='_', fixed=TRUE)

    # Number of string identifiers in each file name
    infoSize <- vapply(fileInfo, length, integer(1))

    # Remove the files which do not have expected number of identifiers
    valid <- infoSize %in% c(5, 6)
    if(!all(valid)) {
        # Collapse explicitly: warning() pastes its arguments with no
        # separator, which would run multiple paths together.
        warning('Unexpected (not correctly formatted) files. Cutting the following files from the list: ',
                paste(fullFile[!valid], collapse=', '))
        fullFile <- fullFile[valid]
        shortFile <- shortFile[valid]
        fileInfo <- fileInfo[valid]
        infoSize <- infoSize[valid]
        if(length(fullFile) == 0) {
            warning('No files to process.')
            return(NULL)
        }
    }

    # Keep the established row order: 'fixed' files (5 identifiers, example:
    # areacella) first, then 'temporal' files (6 identifiers, example: tas).
    rowOrder <- c(which(infoSize == 5), which(infoSize == 6))
    fullFile <- fullFile[rowOrder]
    shortFile <- shortFile[rowOrder]
    fileInfo <- fileInfo[rowOrder]

    # Fixed files have no time component; pad with '' so that every file
    # contributes exactly 6 fields, then stack into an n x 6 matrix.
    padded <- lapply(fileInfo, function(parts) {
        if(length(parts) == 5) c(parts, '') else parts
    })
    fields <- do.call(rbind, padded)

    # File size in whole kilobytes (file.info is vectorized over files)
    sizeInfo <- paste0(round(file.info(fullFile)$size / 1024), 'K')

    # Put everything together
    data.frame(path=dirname(fullFile),
               filename=shortFile,
               variable=fields[, 1],
               domain=fields[, 2],
               model=fields[, 3],
               experiment=fields[, 4],
               ensemble=fields[, 5],
               time=fields[, 6],
               size=sizeInfo,
               stringsAsFactors=FALSE)
} # getFileInfo
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.