R/retrieve.ESGF.R

Defines functions meta.ESGF retrieve.ESGF

Documented in meta.ESGF retrieve.ESGF

#' Retrieve CMIP data directly from the Earth System Grid Federation (ESGF)
#' https://earthsystemcog.org/projects/cog/esgf_search_restful_api
#' meta.ESGF returns a data.frame with the model metadata and the OpenDAP URL that 
#' can be used with retrieve.ESGF. The function \code{retrieve.ESGF} is a wraparound
#' for \code{retrieve} that reads several files belonging to the same model and run.
#' Also see <https://pcmdi.llnl.gov/CMIP6/Guide/dataUsers.html> for more documentation
#'
#' @import ncdf4
#'
#' @seealso retrieve meta.ESGF
#'
#' @param param Name of parameter
#' @param verbose Logical value defaulting to FALSE. If FALSE, do not display
#' comments (silent mode). If TRUE, displays extra information on progress.
#' @param url The base URL for ESGF
#' @param n Number of models to read - NULL reads everything
#' @param expid Name of the MIP experiment ('historical' for historical run)
#' @param mip CMIP5 or CMIP6
#' @param freq Frequency of data 
#' @return A data.frame for \code{meta.ESGF} and a "zoo" "field" object with additional attributes used for further
#' processing for +code{retrieve.ESGF}.
#'
#' @examples 
#' \dontrun{
#' meta <- meta.ESGF(n=3)
#' X <- retrieve.ESGF(im=3,lon=c(-30,40),lat=c(50,70),meta=meta)
#' map(X)
#'}
#' @export retrieve.ESGF
retrieve.ESGF <- function(im=1,meta=NULL,verbose=FALSE,...) { 
  if (is.null(meta)) meta <- meta.ESGF(verbose=verbose,...)
  if (inherits(im,c('numeric','integer'))) model <- as.character(meta$model[im]) else
    if (is.character(im)) { 
      if (verbose) print(im)
      ## Assume the shape '<model>_<expid>_<ensid>'
      im <- strsplit(gsub('_',' ',im),' ')
      model <- im[[1]][1]; expid <- im[[1]][2]; ensid <- im[[1]][3]
      im <- intersect( grep(model,meta$model), grep(ensid,meta$member.id) )[1]
      model <- meta$model[im]
      if (verbose) print(c(im,model,ensid))
      if (is.na(im)) return(NULL)
    }
  ripf <- as.character(meta$member.id[im])
  jm <- (1:length(meta$model))[is.element(as.character(meta$model),model) &
                               is.element(as.character(meta$member.id),ripf)]
  for (j in jm) { 
    print(as.character(meta$period[j]))
    opendap <- as.character(meta$OpenDap[j])
    if (verbose) print(opendap)
    x <- try(retrieve(file=opendap,param=attr(meta,'variable'),verbose=verbose,...))
    if ( (!inherits(x,"try-error")) & (length(jm)>1) ) { 
      if (j == jm[1]) X <- x else X <- c(zoo(X),zoo(x))
      X <- attrcp(x,X); class(X) <- class(x)
    } else if ( (length(jm)==1) | (inherits(x,"try-error")) ) X <- x 
  }
  attr(X,'model_id') <- model
  attr(X,'ripf_id') <- ripf
  invisible(X)
}

#' Read metadata from ESGF
#'
#' @seealso retrieve.ESGF
#'
#' @param url URL
#' @param mip CMIP5 or CMIP6
#' @param param meteorological variable name, e.g., "tas"
#' @param freq temporal frequency, e.g., "mon"=monthly
#' @param expid scenario (ssp for CMIP6, rcp for CMIP5)
#' @param verbose if TRUE print progress
#' @param n number of datasets
#'
#' @export meta.ESGF
meta.ESGF <- function(url="https://esgf-data.dkrz.de/esg-search/search/",mip="CMIP6",param="tas",
                      freq="mon",expid="ssp585",verbose=FALSE,n=NULL) {
  if (verbose) print('meta.ESFG - this function uses the jsonlite package to read metadata from ESGF')

  ## Check if the JSON library is installed
  if (!requireNamespace("jsonlite", quietly = TRUE)) {
    stop("Package 'jsonlite' needed to use 'meta.ESGF'. Please install it.")
  } else {
    
    ## Get number of available datasets
    ### Facet switches: CMIP5 | CMIP6 | else
    facet_cmip <- switch(mip,"CMIP5" = "project", "CMIP6" = "mip_era", "project") 
    facet_var  <- switch(mip,"CMIP5" = "variable", "CMIP6" = "variable_id")
    facet_freq <- switch(mip,"CMIP5" = "time_frequency", "CMIP6" = "frequency")
    facet_exp  <- switch(mip,"CMIP5" = "experiment", "CMIP6" = "experiment_id")
    facet_mod  <- switch(mip,"CMIP5" = "model", "CMIP6" = "source_id")
    facet_ens  <- switch(mip,"CMIP5" = "ensemble", "CMIP6" = "member_id")
    facet_ins  <- switch(mip,"CMIP5" = "institute", "CMIP6" = "institution_id")

    search_string <- paste("type=Dataset&replica=false&latest=true&",facet_cmip,"=",mip,"&",facet_var,"=",param,
                         "&",facet_freq,"=",freq,"&",facet_exp,"=",expid,"&format=application%2Fsolr%2Bjson",sep="")
    search_hist_string <- paste("type=Dataset&replica=false&latest=true&",facet_cmip,"=",mip,"&",facet_var,"=",param, 
                           "&",facet_freq,"=",freq,"&",facet_exp,"=",'historical',"&format=application%2Fsolr%2Bjson",sep ="")
    
    if (verbose) print(search_string)

    ## Get number of available datasets
    nof_datasets <- jsonlite::fromJSON(paste(url,"?",search_string,sep=""))$response$numFound

    if (!is.null(n)) nof_datasets <- n
    print(paste('Found',nof_datasets,'datasets'))
    if (nof_datasets==0) {
      print('Please check the search URL (CMIP, experiment, variable, facets, etc.):')
      print(paste(url,search_string,sep=""))      
      print('If all looks OK, maybe the connection is down - try again later!')
      return(NULL)
    }
    
    ## Query ESGF server
    ESGF_query <- jsonlite::fromJSON(paste(url,"?limit=",nof_datasets,"&",search_string,sep=""))
    ESGF_hist_query <- jsonlite::fromJSON(paste(url,"?limit=",nof_datasets,"&",search_hist_string,sep=""))
    
    if (verbose) str(ESGF_query,max.level = 2)
    
    ## List files in each dataset and link to files from the corresponding historical dataset
    #nof_datasets <- 7 ## test
    results <- list()
    for (i in 1:nof_datasets) {
      nof_files <- jsonlite::fromJSON(paste(url,"?limit=0&type=File&replica=false&latest=true&",facet_cmip,"=",mip,"&",facet_var,"=",param,"&",facet_freq,"=",freq,"&",facet_exp,"=",expid,"&",facet_mod,"=",ESGF_query$response$docs[i,facet_mod],"&",facet_ens,"=",ESGF_query$response$docs[i,facet_ens],"&",facet_ins,"=",ESGF_query$response$docs[i,facet_ins],"&format=application%2Fsolr%2Bjson",sep=""))$response$numFound
      
      ESGF_file_query <- jsonlite::fromJSON(paste(url,"?limit=",nof_files,"&type=File&replica=false&latest=true&",facet_cmip,"=",mip,"&",facet_var,"=",param,"&",facet_freq,"=",freq,"&",facet_exp,"=",expid,"&",facet_mod,"=",ESGF_query$response$docs[i,facet_mod],"&",facet_ens,"=",ESGF_query$response$docs[i,facet_ens],"&",facet_ins,"=",ESGF_query$response$docs[i,facet_ins],"&format=application%2Fsolr%2Bjson",sep=""))
      
      nof_hist_files <- jsonlite::fromJSON(paste(url,"?limit=0&type=File&replica=false&latest=true&",facet_cmip,"=",mip,"&",facet_var,"=",param,"&",facet_freq,"=",freq,"&",facet_exp,"=historical&",facet_mod,"=",ESGF_hist_query$response$docs[i,facet_mod],"&",facet_ens,"=",ESGF_hist_query$response$docs[i,facet_ens],"&",facet_ins,"=",ESGF_hist_query$response$docs[i,facet_ins],"&format=application%2Fsolr%2Bjson",sep=""))$response$numFound
      
      ESGF_hist_file_query <- jsonlite::fromJSON(paste(url,"?limit=", nof_hist_files, "&type=File&replica=false&latest=true&", facet_cmip, "=", mip, "&", facet_var, "=", param,"&", facet_freq, "=", freq, "&", facet_exp,"=historical&", facet_mod, "=", ESGF_hist_query$response$docs[i,facet_mod],"&", facet_ens, "=", ESGF_hist_query$response$docs[i,facet_ens], "&", facet_ins, "=", ESGF_hist_query$response$docs[i,facet_ins], "&format=application%2Fsolr%2Bjson",sep = ""))
      
      if (verbose) {
        print(paste("Found", nof_files, " scenario file(s) and ", nof_hist_files, " historical file(s) in dataset", i))
        print(paste(rep("=",nchar(ESGF_query$response$docs$id[i])+9),collapse=""))
        print(paste("Dataset:",ESGF_query$response$docs$id[i]))
        print("Files:")
        print(ESGF_file_query$response$docs$title)
        print(paste("Historical dataset:",ESGF_hist_file_query$response$docs$dataset_id[1]))
        print("Files:")
        print(ESGF_hist_file_query$response$docs$title)
        print(paste(rep("=",nchar(ESGF_query$response$docs$id[i])+9),collapse=""))
      } else cat('.')
      
      ic <- as.character(i); if (i < 100) ic <- paste('0',ic,sep=''); if (i < 10) ic <- paste('0',ic,sep='')
      results[[paste('dataset.query:',ic,sep='_')]] <- ESGF_query$response$docs[i,]
      
      for (j in 1:nof_files) {
        jc <- as.character(j); if (j < 100) jc <- paste('0',jc,sep=''); if (j < 10) jc <- paste('0',jc,sep='')
        
        results[[paste('file.query:',ic,jc,sep='_')]] <- ESGF_file_query$response$docs[j,]
        
        opendap_idx <- grep("OPENDAP",ESGF_file_query$response$docs$url[[j]])
        http_idx <- grep("HTTPServer",ESGF_file_query$response$docs$url[[j]])
        http_url <- unlist(strsplit(ESGF_file_query$response$docs$url[[j]][http_idx],"|",fixed=TRUE))[1]
        #results[[paste('http',ic,jc,sep='_')]] <- http_url
        results[[paste('https',ic,jc,sep='_')]] <- http_url  ## REB 2021-04-12 - modification
        # if (verbose) print(http_url)
        opendap_url <- unlist(strsplit(ESGF_file_query$response$docs$url[[j]][opendap_idx],"|",fixed=TRUE))[1]
        # if (verbose) print(opendap_url)
        results[[paste('OpenDAP',ic,jc,sep='_')]] <- gsub("http://","https://",gsub(".nc.html",".nc",opendap_url))
        results[[paste('grid',ic,jc,sep='_')]] <- ESGF_file_query$response$docs$grid[[j]]
        results[[paste('member_id',ic,jc,sep='_')]] <- ESGF_file_query$response$docs[[j,facet_ens]]
        results[[paste('source_id',ic,jc,sep='_')]] <- ESGF_file_query$response$docs[[j,facet_mod]]
        results[[paste('type',ic,jc,sep='_')]] <- ESGF_file_query$response$docs$source_type[[j]]
        results[[paste('title',ic,jc,sep='_')]] <- ESGF_file_query$response$docs$title[[j]]
        results[[paste('timestamp',ic,jc,sep='_')]] <- ESGF_file_query$response$docs$timestamp[[j]]
        
        #Set facets that are not available in CMIP5 to NA
        if (is.null(results[[paste('grid',ic,jc,sep='_')]])) results[[paste('grid',ic,jc,sep='_')]] <- "NA"
        if (is.null(results[[paste('type',ic,jc,sep='_')]])) results[[paste('type',ic,jc,sep='_')]] <- "NA"
      }
      
      if (nof_hist_files == 0)
      {
        if (verbose) cat('\n','No historical dataset found for',ESGF_query$response$docs$id[i])
      } else  {
        for (j in 1:nof_hist_files) {
          jc <- as.character(j+nof_files); if (j+nof_files < 100) jc <- paste('0',jc,sep=''); if (j+nof_files < 10) jc <- paste('0',jc,sep='')
          
          results[[paste('file.query:',ic,jc,sep='_')]] <- ESGF_hist_file_query$response$docs[j,]
          
          opendap_idx <- grep("OPENDAP",ESGF_hist_file_query$response$docs$url[[j]])
          http_idx <- grep("HTTPServer",ESGF_hist_file_query$response$docs$url[[j]])
          http_url <- unlist(strsplit(ESGF_hist_file_query$response$docs$url[[j]][http_idx],"|",fixed=TRUE))[1]
          #results[[paste('http',ic,jc,sep='_')]] <- http_url
          results[[paste('https',ic,jc,sep='_')]] <- http_url  ## REB 2021-04-12 - modification
          # if (verbose) print(http_url)
          opendap_url <- unlist(strsplit(ESGF_hist_file_query$response$docs$url[[j]][opendap_idx],"|",fixed=TRUE))[1]
          # if (verbose) print(opendap_url)
          results[[paste('OpenDAP',ic,jc,sep='_')]] <- gsub("http://","https://",gsub(".nc.html",".nc",opendap_url))
          results[[paste('grid',ic,jc,sep='_')]] <- ESGF_hist_file_query$response$docs$grid[[j]]
          results[[paste('member_id',ic,jc,sep='_')]] <- ESGF_hist_file_query$response$docs[[j,facet_ens]]
          results[[paste('source_id',ic,jc,sep='_')]] <- ESGF_hist_file_query$response$docs[[j,facet_mod]]
          results[[paste('type',ic,jc,sep='_')]] <- ESGF_hist_file_query$response$docs$source_type[[j]]
          results[[paste('title',ic,jc,sep='_')]] <- ESGF_hist_file_query$response$docs$title[[j]]
          results[[paste('timestamp',ic,jc,sep='_')]] <- ESGF_hist_file_query$response$docs$timestamp[[j]]
          
          #Set facets that are not available in CMIP5 to NA
          if (is.null(results[[paste('grid',ic,jc,sep='_')]])) results[[paste('grid',ic,jc,sep='_')]] <- "NA"
          if (is.null(results[[paste('type',ic,jc,sep='_')]])) results[[paste('type',ic,jc,sep='_')]] <- "NA"
        }
      }
    }
    
    elements <- names(results)
    opendap <- grep('OpenDAP',names(results))
    #http <- grep('http',names(results))
    http <- grep('https',names(results))  ## REB 2021-04-12: modification
    mem <- grep('member_id',names(results))
    grid <- grep('grid',names(results))
    model <- grep('source_id',names(results))
    type <- grep('type',names(results))
    title <- grep('title',names(results))
    timestamp <- grep('timestamp',names(results))
    ## KMP 2022-11-18: changed period extraction to be more flexible (for sub-daily frequencies)                 
    #period <- substr(as.character(results[title]),nchar(as.character(results[title]))-15,                       
    #                 nchar(as.character(results[title]))-3)                                                     
    period <- sapply(as.character(results[title]),
                     function(x) {
                       i <- regexpr('[0-9]{8,14}-[0-9]{8,14}',x)
                       return(substr(x, i, i+attr(i,"match.length")-1)) })

    meta <- data.frame(OpenDap=as.character(results[opendap]),http=as.character(results[http]),
                       member.id=as.character(results[mem]),grid=as.character(results[grid]),
                       model=as.character(results[model]),type=as.character(results[type]),
                       title=as.character(results[title]), period=as.character(period),
                       timestamp=as.character(results[timestamp]))
    
    attr(meta,'variable') <- param
    attr(meta,'expid') <- expid
    attr(meta,'file.query.data') <- results[grep('file.query',names(results))]
    attr(meta,'dataset.query.data') <- results[grep('dataset.query',names(results))]
    attr(meta,'history') <- history.stamp(meta)
    
    return(meta)
  }
}
metno/esd documentation built on April 29, 2024, 3:34 p.m.