R/CMIP5Files.R

Defines functions CMIP5Files_info get_period get_freq get_model get_scenario2 get_scenario extract_model extract_ensemble

Documented in CMIP5Files_info extract_ensemble extract_model get_freq get_model get_period get_scenario get_scenario2

#' @rdname CMIP5Files_info
#' @export
#' 
#' @examples
#' library(CMIP5tools)
#' library(magrittr)
#' data(files_short)
#' 
#' ensemble <- extract_ensemble(files_short)
extract_ensemble <- function(files){
    basename(files) %>% str_extract("(?<=_)r\\d.*(?=_\\d{6,8})") # %>% gsub("_gn$", "", .) 
}


#' @rdname CMIP5Files_info
#' @export
#' @examples
#' model <- extract_model(files_short)
extract_model <- function(files){
    basename(files) %>% str_extract("[[:alnum:]-]*(?=_his|_rcp|_ssp|_piControl)")
    # basename(files) %>% str_extract("(?<=day_|Day_|mon_|Mon).*(?=_his|_rcp|_piControl)")
}


#' get CMIP5 scenario and model from file path
#' @name get_CMIP5
#' @keywords internal
#' NULL

#' @rdname get_CMIP5
#' @export
get_scenario <- function(file){
    basename(file) %>% str_extract("[a-z,A-Z,0-9]*(?=_r\\d)")
}

#' @rdname get_CMIP5
#' @export
get_scenario2 <- function(file){
    basename(file) %>% str_extract("^[a-z,A-Z,0-9]*(?=_)")
}

#' @rdname get_CMIP5
#' @export
get_model <- function(file, prefix = '_', postfix = "_") {
    pattern <- sprintf("(?<=%s).*(?=%s)", prefix, postfix)
    str_extract(basename(file), pattern)
}

#' @rdname get_CMIP5
#' @export
get_freq <- function(file) {
    str_extract(basename(file), "(?<=_)[:alpha:]{4,6}")
}

#' @rdname get_CMIP5
#' @export
get_period <- function(scenario = "historical", period = NULL) {
    is_rcp <- str_detect(scenario, "RCP")
    is_his <- str_detect(scenario, "his")

    if (is.null(period)) {
        if (is_his) {
            period = c(1850, 2012) # Historical
        }
    }

    duration = 200
    if (scenario == "historical") {
        period = c(1850, 2012) # Historical
    }
    if (is_rcp) {
        period = c(2006, 2100) # RCP
    } else if (scenario == "piControl") {
        # duration <- NULL
        period <- NULL
    }
    listk(period, duration)
}

#' @name CMIP5Files_info
#' @title summary CMIP5 files information
#' 
#' @description
#' \itemize{
#'   \item [extract_ensemble]:   Extract CMIP5 ensemle name from file names
#'   \item [extract_model]:      Extract CMIP5 model name from file names
#' 
#'   \item [CMIP5Files_info]:    Extract CMIP5 information from file names
#'   \item [CMIP5Files_filter]:  Filter corresponding duration or period files. 
#' Dates has been adjust from 12-01 to 01-01 for `start_adj`. Duplicated dates 
#' are also removed.
#'   \item [CMIP5Files_summary]: Get the start and end information of every model
#' }
NULL

#' @param files CMIP5 nc files, full name path.
#' 
#' @rdname CMIP5Files_info
#' @examples
#' CMIP5Files_info(files_short)
#' 
#' @export
CMIP5Files_info <- function(files){
    files_short <- basename(files)
    model     <- extract_model(files)
    ensemble  <- extract_ensemble(files_short)
    freq      <- get_freq(files_short)

    # get begin date and end date of all files
    info <- str_extract_all(files_short, '[0-9]{6,8}') %>% 
        do.call(rbind, .) %>% as.data.table() %>% 
        set_names(c('start', 'end'))
    
    is_month <- nchar(info$start[1]) == 6
    if (is_month) { info %<>% map(~paste0(., "01"))}
    
    info <- llply(info, ymd) %>% as.data.table()
    
    info %<>% cbind(model, ensemble, freq, .)
    info[, `:=`(start_adj  = start, 
                end_adj    = end, 
                year_start = year(start), 
                year_end   = year(end))]
    info[, `:=`(len = year_end - year_start + 1)]
    info$file <- files

    # filename already includes model, scenario and ensemble info.
    info <- info[order(basename(file))] %>% cbind(Id =seq_along(model), .)    
    info
}


# Find the nearest duration's year data of 'piControl'
filter_duration <- function(d, duration = 200){
    year_max <- max(d$year_end)
    year_min <- min(d$year_start)
    
    if ((year_max - year_min + 1) > duration){
        if (d$model[1] == "BNU-ESM") {
            # 修改filter_duration的策略,从左侧优先选够200y
            # from left
            year_max0 <- year_min + duration - 1
            dnew <- subset(d, year_start <= year_max0)
            dnew$end_adj[nrow(dnew)] %<>% pmin(make_date(year_max0, 12, 31))
        } else {
            # from right
            year_min0 <- year_max - duration + 1 # great than the min year
            dnew <- subset(d, year_end >= year_min0)
            # adjust year_start
            dnew$start_adj[1] %<>% pmax(make_date(year_min0, 1, 1))
        }
        dnew %<>% mutate(len = year(end_adj) - year(start_adj) + 1)
        dnew
    } else {
        d
    }
}

#' @param duration length of year.
#' @param period starting and ending year, e.g. `c(1850, 2005)`. If 
#' provided, duration will be overwritten.
#' If duration and period are `null` at the same time, no filter applied.
#' @param check_dupli Boolean. If true, duplicated date will be removed by [check_dfile()].
#' @param verbose 
#' * echo nc file missing info, if `verbose >= 1`
#' * echo duplicated info, if `verbose >= 2`
#' 
#' @rdname CMIP5Files_info
#' @importFrom stringr str_extract str_extract_all
#' 
#' @return d_files A data.table with colnames of 
#' `'Id', 'model', 'kind', 'start', 'end', 'year_start', 'year_start_adj', 
#' 'year_end', 'file'`
#' 
#' @note Only one scenario per time.
#' @seealso [check_dfile()], [rm_duplicate()]
#' 
#' @examples
#' # filter CMIP5 files
#' CMIP5Files_filter(files_short, duration = 200)
#' @export
CMIP5Files_filter <- function(files, duration = 200, period = NULL, check_dupli = TRUE, 
    verbose = 1)
{
    info <- CMIP5Files_info(files)

    # If duration and period are null at the same time, no filter applied.
    if (is.null(duration) && is.null(period)){
        d_files <- info
    } else {
        if (is.null(period)){
            d_files  <- ddply(info, .(model, ensemble, freq), filter_duration, duration = duration) %>% data.table()
        } else {
            d_files <- info[year_end >= period[1] & year_start <= period[2]]
            d_files$start_adj %<>% pmax(make_date(period[1], 1, 1))
            d_files$end_adj   %<>% pmin(make_date(period[2], 12, 31))
            d_files[, len := year(end_adj) - year(start_adj) +1]
        }
    }
    
    rm_dupli <- function(d_file){
        if (!is.data.table(d_file)) d_file %<>% as.data.table()
            
        model <- d_file$model %>% check_str_null()
        ensemble <- d_file$ensemble %>% check_str_null()

        d_file2 <- check_dfile(d_file, verbose)
        # note that some model begins from 12-01
        date_start <- d_file2$start_adj[1]

        if (month(date_start) == 12) {
            date_start %<>% add_1month()
            d_file2$start_adj[1] <- date_start
            warn(sprintf("[m] date adjust from 12-01 to 01-01! [%s, %s]\n", 
                            model, ensemble))
        }
        # check summary here
        info <- CMIP5Files_summary(d_file2)
        nmiss <- sum(info$len_adj) - sum(d_file2$len)
        if(nmiss > 0) {
            if (verbose >= 1) {
                str_miss <- d_file2[, missInfo.MonthDate(start, end)]
                cat("=========================================\n")
                warn(sprintf('[missing] %2d years: %s in [%s, %s]\n', 
                             nmiss, str_miss, model, ensemble))
                print(info)
                cat("-----------------------------------------\n")    
                print(d_file2[1:pmin(10, nrow(d_file2)), 1:10])
            }
        }
        d_file2
    }

    if (check_dupli) {
        scenario <- d_files[1, str_extract(basename(file), 
            sprintf("(?<=%s_).*(?=_%s)", model, ensemble))]
        fmt <- ifelse (grepl("rcp|RCP", scenario), "=== %-6s ===", " %-10s ") %>% 
            sprintf("===========================%s===========================\n", .)
        if (verbose > 0) cat(bold(sprintf(fmt, scenario)))
        ## check duplicated date
        d_files <- ddply(d_files, .(model, ensemble, freq), rm_dupli) %>% data.table()
    }
    return(d_files)
}

#' @param d An object returned by `CMIP5Files_filter` or 
#' `CMIP5Files_info` or a data.frame at least with the columns of `
#' 'model', 'year_start_adj', 'year_end', 'file'`.
#' 
#' @rdname CMIP5Files_info
#' @examples
#' # Get the summary infomation of CMIP5Files_filter or CMIP5Files_info
#' CMIP5Files_filter(files_short) %>% CMIP5Files_summary()
#' 
#' @export
CMIP5Files_summary <- function(d){
    summary <- function(di){
        res <- di[, .(
            start = min(start), 
            end = max(end), 
            start_adj  = min(start_adj),
            end_adj    = max(end_adj),
            year_start = min(year_start), 
            year_end   = max(year_end), 
            n = .N
            ), .(model, ensemble)]
        res[, `:=`(
            len     = year_end - year_start + 1, 
            len_adj = year(end_adj) - year(start_adj) + 1)]
        res[order(toupper(model)), ]
    }

    if (is.data.frame(d)){
        # data.table
        summary(d)
    } else {
        map(d, summary)
    }
}

# #' read threshold or prehist from RDS
# #' @export
# read_RDS <- function(indir = "OUTPUT/china/", prefix = '_', postfix = "\\.RDS"){
#     files   <- dir(indir, "*.RDS", full.names = T)
#     pattern <- sprintf("(?<=%s).*(?=%s)", prefix, postfix)

#     models  <- basename(files) %>% str_extract(pattern)
#     files %<>% set_names(models)
    
#     llply(files, readRDS, .progress = "text")
# }
kongdd/CMIP5tools documentation built on Dec. 17, 2020, 11:03 a.m.