R/urls_CMIP5_ceda.R

Defines functions tidy_filelist split_scenario fix_host write_urls_ceda urls_CMIP5_ceda show_file_size

# Parse the CEDA CMIP5 file listing and cache the result as an `.rda` file.
#
# The listing can be downloaded from
# ftp://ftp.ceda.ac.uk/badc/cmip5/data/cmip5/metadata/fileList_20150706.txt
#
# Two objects are saved into `outfile`:
# - `df`  : columns `size_mb` (5th column of the listing divided by 1024^2;
#           presumably bytes -> Mb, confirm against the listing format) and
#           `file` (9th column of the listing, the file path).
# - `info`: one row per file with a row index `I` followed by the path
#           components named in `varnames` below.
#
# @param file    Path of the listing, e.g. "fileList_20150706.txt".
# @param outfile Path of the `.rda` file to write. The default keeps the
#                previously hard-coded location.
tidy_filelist <- function(file, outfile = "filelist_CMIP5.rda") {
    df <- fread(file, select = c(5, 9), nrows = Inf,
                col.names = c("size_mb", "file"))
    df[, size_mb := size_mb / 1024^2]

    # `strsplit()` breaks every path into its components; base `split()` would
    # only group the whole vector by the single factor level "/".
    info <- strsplit(df$file, "/", fixed = TRUE) %>%
        do.call(rbind, .) %>%
        data.table() %>%
        .[, -9] # the 9th path component is not kept

    varnames <- c("output", "org", "model", "scenario", "time_scale", "part",
                  "time_scale2", "ensemble", "variable", "file")
    colnames(info) <- varnames

    # `I` links every row of `info` back to the matching row of `df`
    info <- cbind(I = seq_len(nrow(info)), info)
    save(info, df, file = outfile)
}

# Break a file table into a list of tables, one element per value of its
# `scenario` column.
split_scenario <- function(d) {
    by_scenario <- d$scenario
    split(d, by_scenario)
}

# Turn the relative paths in `d$file` into full urls by prepending `host`.
#
# @param d    Table with a character column `file`.
# @param host Prefix put in front of every path; defaults to the CEDA CMIP5
#             ftp root.
# @return `d` with `file` replaced by the prefixed paths. A table without any
#         rows is returned unchanged.
fix_host <- function(d, host = "ftp://ftp.ceda.ac.uk/badc/cmip5/data/cmip5/") {
    # paste0(host, character(0)) yields a length-1 string, which cannot be
    # assigned into a zero-row table, so empty input is passed through as is.
    if (length(d$file) > 0) {
        d$file <- paste0(host, d$file)
    }
    d
}


#' @importFrom readr write_lines
# Write the urls of one scenario to a text file.
#
# The first line of `outfile` is a `#` comment holding the scenario name
# (taken from the first row of `d`) and the summed size in Gb. The urls follow,
# one per line; with `show.filesize = TRUE` every url is followed by a
# `# <size>Mb` comment line.
#
# @param d             data.table with columns `scenario`, `file`, `size_mb`.
# @param outfile       Path of the text file to write; its directory is passed
#                      to `check_dir()` first.
# @param show.filesize Whether to add the size of every file as a comment.
write_urls_ceda <- function(d, outfile, show.filesize = FALSE) {
    check_dir(dirname(outfile))

    total_gb <- round(sum(d$size_mb) / 1024, 2)
    header <- sprintf("# [%s] %+6s Gb", d$scenario[1], total_gb)
    write_lines(header, outfile)

    entries <- if (show.filesize) {
        d[, sprintf("%s\n# %4.1fMb", file, size_mb)]
    } else {
        d$file
    }
    write_lines(entries, outfile, append = TRUE)
}

#' ftp urls of CMIP5 nc files from CEDA
#' 
#' For RCP and piControl, only r1i1p1 ensemble returned.
#' 
#' @param df data.frame of ceda CMIP5 filelist.
#' - `file`   : relative path of nc file
#' - `size_mb`: file size
#' @param info file detailed information
#' @param varname which variable to extract
#' @param TIMESCALE filter time_scale2 in the `info`. e.g. `"Amon", "day", ...` 
#' @param PART one or more of `"aerosol", "atmos", "land", "landIce", "ocean",
#' "ocnBgchem", "seaIce"`. `NULL` (default) applies no filter on `part`.
#' @param scenarios which scenarios to be extracted
#' @param save boolean. Whether save urls to txt files?
#' @param outdir output directory
#' 
#' @return A list named by scenario. Every element is the subset of `info`
#' for that scenario, with `file` holding the full url and `size_mb` the file
#' size. The size of every scenario is also printed by [show_file_size()].
#' 
#' @references
#' 1. ftp://ftp.ceda.ac.uk/badc/cmip5/data/cmip5/
#' 2. ftp://ftp.ceda.ac.uk/badc/cmip5/data/cmip5/metadata/fileList_20150706.txt
#' 
#' @example man/examples/ex-urls_CMIP5.R
#' @export
urls_CMIP5_ceda <- function(df, info, varname = "tasmax",
    TIMESCALE = "Amon",
    PART = NULL, 
    scenarios = c("piControl", "historical", "historicalGHG", "historicalMisc", 
                  "historicalNat", "rcp26", "rcp45", "rcp60", "rcp85"), 
    save   = TRUE, 
    outdir = ".")
{
    info_sel <- info[variable == varname & 
                        time_scale2 %in% TIMESCALE & 
                        scenario    %in% scenarios, ]
    # `%in%` keeps this filter consistent with the ones above and also works
    # when several values are passed (`==` would recycle PART along the rows).
    if (!is.null(PART)) info_sel <- info_sel[part %in% PART, ]
    
    # fill into real url: `I` is the row of `df` every `info` row came from
    info_sel$file    <- df$file[info_sel$I]
    info_sel$size_mb <- df$size_mb[info_sel$I]
    
    # ignore: cfDay, cfMon
    lst <- lapply(split(info_sel, info_sel$scenario), function(d) {
        d <- d[urls_filter(d$file), ]
        # a scenario with no file left has nothing to prefix with the host
        if (nrow(d) > 0) fix_host(d) else d
    })

    if (save) {
        for (scenario in names(lst)) {
            outfile <- file.path(outdir, sprintf("%s_%s.txt", scenario, varname))
            write_urls_ceda(lst[[scenario]], outfile)
        }
    }
    # print size_mb
    show_file_size(lst)
    lst
}

#' Print the download size of every scenario
#' 
#' For every named element of `lst` the sum of its `size_mb` column is
#' converted to Gb (divided by 1024, rounded to one decimal) and printed,
#' followed by a line with the rounded total of those values.
#' 
#' @param lst Object returned by [urls_CMIP5_ceda()]
#' 
#' @export
show_file_size <- function(lst) {
    scenario_names <- names(lst)
    sizes <- numeric(length(scenario_names))

    for (k in seq_along(scenario_names)) {
        size_gb <- round(sum(lst[[k]]$size_mb) / 1024, 1)
        label <- green(sprintf("%+6s", size_gb))
        cat(sprintf("[%-13s] %s Gb\n", scenario_names[k], label))
        sizes[k] <- size_gb
    }

    total_gb <- round(sum(sizes))
    cat(sprintf("[Total] %s Gb\n", total_gb))
}
kongdd/CMIP5tools documentation built on Dec. 17, 2020, 11:03 a.m.