#' @rdname CMIP5Files_info
#' @export
#'
#' @examples
#' library(CMIP5tools)
#' library(magrittr)
#' data(files_short)
#'
#' ensemble <- extract_ensemble(files_short)
extract_ensemble <- function(files){
basename(files) %>% str_extract("(?<=_)r\\d.*(?=_\\d{6,8})") # %>% gsub("_gn$", "", .)
}
#' @rdname CMIP5Files_info
#' @export
#' @examples
#' model <- extract_model(files_short)
extract_model <- function(files){
basename(files) %>% str_extract("[[:alnum:]-]*(?=_his|_rcp|_ssp|_piControl)")
# basename(files) %>% str_extract("(?<=day_|Day_|mon_|Mon).*(?=_his|_rcp|_piControl)")
}
#' get CMIP5 scenario and model from file path
#' @name get_CMIP5
#' @keywords internal
#' NULL
#' @rdname get_CMIP5
#' @export
get_scenario <- function(file){
basename(file) %>% str_extract("[a-z,A-Z,0-9]*(?=_r\\d)")
}
#' @rdname get_CMIP5
#' @export
get_scenario2 <- function(file){
basename(file) %>% str_extract("^[a-z,A-Z,0-9]*(?=_)")
}
#' @rdname get_CMIP5
#' @export
get_model <- function(file, prefix = '_', postfix = "_") {
pattern <- sprintf("(?<=%s).*(?=%s)", prefix, postfix)
str_extract(basename(file), pattern)
}
#' @rdname get_CMIP5
#' @export
get_freq <- function(file) {
str_extract(basename(file), "(?<=_)[:alpha:]{4,6}")
}
#' @rdname get_CMIP5
#' @export
get_period <- function(scenario = "historical", period = NULL) {
is_rcp <- str_detect(scenario, "RCP")
is_his <- str_detect(scenario, "his")
if (is.null(period)) {
if (is_his) {
period = c(1850, 2012) # Historical
}
}
duration = 200
if (scenario == "historical") {
period = c(1850, 2012) # Historical
}
if (is_rcp) {
period = c(2006, 2100) # RCP
} else if (scenario == "piControl") {
# duration <- NULL
period <- NULL
}
listk(period, duration)
}
#' @name CMIP5Files_info
#' @title summary CMIP5 files information
#'
#' @description
#' \itemize{
#' \item [extract_ensemble]: Extract CMIP5 ensemle name from file names
#' \item [extract_model]: Extract CMIP5 model name from file names
#'
#' \item [CMIP5Files_info]: Extract CMIP5 information from file names
#' \item [CMIP5Files_filter]: Filter corresponding duration or period files.
#' Dates has been adjust from 12-01 to 01-01 for `start_adj`. Duplicated dates
#' are also removed.
#' \item [CMIP5Files_summary]: Get the start and end information of every model
#' }
NULL
#' @param files CMIP5 nc files, full name path.
#'
#' @rdname CMIP5Files_info
#' @examples
#' CMIP5Files_info(files_short)
#'
#' @export
CMIP5Files_info <- function(files){
files_short <- basename(files)
model <- extract_model(files)
ensemble <- extract_ensemble(files_short)
freq <- get_freq(files_short)
# get begin date and end date of all files
info <- str_extract_all(files_short, '[0-9]{6,8}') %>%
do.call(rbind, .) %>% as.data.table() %>%
set_names(c('start', 'end'))
is_month <- nchar(info$start[1]) == 6
if (is_month) { info %<>% map(~paste0(., "01"))}
info <- llply(info, ymd) %>% as.data.table()
info %<>% cbind(model, ensemble, freq, .)
info[, `:=`(start_adj = start,
end_adj = end,
year_start = year(start),
year_end = year(end))]
info[, `:=`(len = year_end - year_start + 1)]
info$file <- files
# filename already includes model, scenario and ensemble info.
info <- info[order(basename(file))] %>% cbind(Id =seq_along(model), .)
info
}
# Find the nearest duration's year data of 'piControl'
filter_duration <- function(d, duration = 200){
year_max <- max(d$year_end)
year_min <- min(d$year_start)
if ((year_max - year_min + 1) > duration){
if (d$model[1] == "BNU-ESM") {
# 修改filter_duration的策略,从左侧优先选够200y
# from left
year_max0 <- year_min + duration - 1
dnew <- subset(d, year_start <= year_max0)
dnew$end_adj[nrow(dnew)] %<>% pmin(make_date(year_max0, 12, 31))
} else {
# from right
year_min0 <- year_max - duration + 1 # great than the min year
dnew <- subset(d, year_end >= year_min0)
# adjust year_start
dnew$start_adj[1] %<>% pmax(make_date(year_min0, 1, 1))
}
dnew %<>% mutate(len = year(end_adj) - year(start_adj) + 1)
dnew
} else {
d
}
}
#' @param duration length of year.
#' @param period starting and ending year, e.g. `c(1850, 2005)`. If
#' provided, duration will be overwritten.
#' If duration and period are `null` at the same time, no filter applied.
#' @param check_dupli Boolean. If true, duplicated date will be removed by [check_dfile()].
#' @param verbose
#' * echo nc file missing info, if `verbose >= 1`
#' * echo duplicated info, if `verbose >= 2`
#'
#' @rdname CMIP5Files_info
#' @importFrom stringr str_extract str_extract_all
#'
#' @return d_files A data.table with colnames of
#' `'Id', 'model', 'kind', 'start', 'end', 'year_start', 'year_start_adj',
#' 'year_end', 'file'`
#'
#' @note Only one scenario per time.
#' @seealso [check_dfile()], [rm_duplicate()]
#'
#' @examples
#' # filter CMIP5 files
#' CMIP5Files_filter(files_short, duration = 200)
#' @export
CMIP5Files_filter <- function(files, duration = 200, period = NULL, check_dupli = TRUE,
verbose = 1)
{
info <- CMIP5Files_info(files)
# If duration and period are null at the same time, no filter applied.
if (is.null(duration) && is.null(period)){
d_files <- info
} else {
if (is.null(period)){
d_files <- ddply(info, .(model, ensemble, freq), filter_duration, duration = duration) %>% data.table()
} else {
d_files <- info[year_end >= period[1] & year_start <= period[2]]
d_files$start_adj %<>% pmax(make_date(period[1], 1, 1))
d_files$end_adj %<>% pmin(make_date(period[2], 12, 31))
d_files[, len := year(end_adj) - year(start_adj) +1]
}
}
rm_dupli <- function(d_file){
if (!is.data.table(d_file)) d_file %<>% as.data.table()
model <- d_file$model %>% check_str_null()
ensemble <- d_file$ensemble %>% check_str_null()
d_file2 <- check_dfile(d_file, verbose)
# note that some model begins from 12-01
date_start <- d_file2$start_adj[1]
if (month(date_start) == 12) {
date_start %<>% add_1month()
d_file2$start_adj[1] <- date_start
warn(sprintf("[m] date adjust from 12-01 to 01-01! [%s, %s]\n",
model, ensemble))
}
# check summary here
info <- CMIP5Files_summary(d_file2)
nmiss <- sum(info$len_adj) - sum(d_file2$len)
if(nmiss > 0) {
if (verbose >= 1) {
str_miss <- d_file2[, missInfo.MonthDate(start, end)]
cat("=========================================\n")
warn(sprintf('[missing] %2d years: %s in [%s, %s]\n',
nmiss, str_miss, model, ensemble))
print(info)
cat("-----------------------------------------\n")
print(d_file2[1:pmin(10, nrow(d_file2)), 1:10])
}
}
d_file2
}
if (check_dupli) {
scenario <- d_files[1, str_extract(basename(file),
sprintf("(?<=%s_).*(?=_%s)", model, ensemble))]
fmt <- ifelse (grepl("rcp|RCP", scenario), "=== %-6s ===", " %-10s ") %>%
sprintf("===========================%s===========================\n", .)
if (verbose > 0) cat(bold(sprintf(fmt, scenario)))
## check duplicated date
d_files <- ddply(d_files, .(model, ensemble, freq), rm_dupli) %>% data.table()
}
return(d_files)
}
#' @param d An object returned by `CMIP5Files_filter` or
#' `CMIP5Files_info` or a data.frame at least with the columns of `
#' 'model', 'year_start_adj', 'year_end', 'file'`.
#'
#' @rdname CMIP5Files_info
#' @examples
#' # Get the summary infomation of CMIP5Files_filter or CMIP5Files_info
#' CMIP5Files_filter(files_short) %>% CMIP5Files_summary()
#'
#' @export
CMIP5Files_summary <- function(d){
summary <- function(di){
res <- di[, .(
start = min(start),
end = max(end),
start_adj = min(start_adj),
end_adj = max(end_adj),
year_start = min(year_start),
year_end = max(year_end),
n = .N
), .(model, ensemble)]
res[, `:=`(
len = year_end - year_start + 1,
len_adj = year(end_adj) - year(start_adj) + 1)]
res[order(toupper(model)), ]
}
if (is.data.frame(d)){
# data.table
summary(d)
} else {
map(d, summary)
}
}
# #' read threshold or prehist from RDS
# #' @export
# read_RDS <- function(indir = "OUTPUT/china/", prefix = '_', postfix = "\\.RDS"){
# files <- dir(indir, "*.RDS", full.names = T)
# pattern <- sprintf("(?<=%s).*(?=%s)", prefix, postfix)
# models <- basename(files) %>% str_extract(pattern)
# files %<>% set_names(models)
# llply(files, readRDS, .progress = "text")
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.