R/extract_data.R

Defines functions extract_data

Documented in extract_data

#' @title Extract data values from SDMX XML 
#' @description Extracts the data values from the SDMX XML data file
#' @param xml_lf an input XML leaf with data series from an SDMX XML file to extract the value and its dimensions from it
#' @param keep_flags a logical value if to extract the observation status (flag) information from the XML file. The default value is \code{FALSE}
#' @param stringsAsFactors a logical value. If \code{TRUE} the columns are converted to factors. The default is \code{FALSE}, 
#'        in this case the strings are returned as characters.
#' @param bulk a logical value with default value \code{TRUE} if the input SDMX XML file is from the bulk download facility containing all the observations. 
#'        If the input file has pre-filtered values then the value \code{FALSE} should be used.  
#' @param check_toc if the data file was downloaded using the URL from the TOC or not. The default is FALSE means not the TOC link is used.         
#' @export 
#' @details It is a sub-function to use in the \code{\link{get_eurostat_data}} and the \code{\link{get_eurostat_raw}} functions.
#' @return a data frame containing the values of an SDMX node: the dimensions, value and the optional flag(s)
#' @examples 
#' \dontshow{
#' if (parallel::detectCores()<=2){
#'    options(restatapi_cores=1)
#' }else{
#'    options(restatapi_cores=2)
#' }
#' }
#' \donttest{
#' id<-"agr_r_milkpr"
#' toc<-get_eurostat_toc()
#' bulk_url<-toc$downloadLink.sdmx[toc$code==id]
#' options(timeout=2)
#' if (!(is.null(bulk_url))&(!(length(bulk_url)==0))){
#'   temp<-tempfile()
#'   tryCatch({download.file(bulk_url,temp)
#'   sdmx_xml<-xml2::read_xml(unzip(temp,paste0(id,".sdmx.xml"),exdir=tempdir()))
#'   xml_leafs<-xml2::xml_find_all(sdmx_xml,".//data:Series")
#'   extract_data(xml_leafs[1])
#'   unlink(file.path(tempdir(),paste0(id,".sdmx.xml")))})
#'   unlink(temp)
#' }
#' options(timeout=60)
#' }
#' 

extract_data<-function(xml_lf,keep_flags=FALSE,stringsAsFactors=FALSE,bulk=TRUE,check_toc=FALSE){
  if (getOption("restatapi_verbose",FALSE))  {message("extract_data - API version:",get("rav",envir=restatapi::.restatapi_env))}
  rav<-get("rav",envir=restatapi::.restatapi_env)
  prefix<-switch(rav,"1"="generic:","2"="g:")
  if(bulk){
    if (Sys.info()[['sysname']]=='Windows'){
      xml_lf<-gsub(prefix,"",xml_lf)
      xml_lf<-xml2::as_xml_document(xml_lf)
    } 
    bd<-t(as.data.frame(xml2::xml_attrs(xml_lf),stringsAsFactors=FALSE))
    rownames(bd)<-NULL
    dv<-xml2::xml_attrs(xml2::xml_children(xml_lf))
    if (keep_flags){
      flagc<-switch(rav,"1"="OBS_STATUS","2"="OBS_FLAG")
      if (check_toc) {flagc<-"OBS_STATUS"}
      cn<-c("TIME_PERIOD","OBS_VALUE",flagc)
    } else {
      cn<-c("TIME_PERIOD","OBS_VALUE")
    }
    df<-do.call(rbind, lapply(lapply(dv, unlist),"[", cn))
    colnames(df)<-cn
    out<-data.frame(bd,df,stringsAsFactors=FALSE)  
  } else {
    # if (Sys.info()[['sysname']]=='Windows'){
    #   xml_lf<-xml2::as_xml_document(xml_lf)
    # } 
    tmp<-as.data.frame(xml2::xml_attrs(xml2::xml_children(xml2::xml_find_all(xml_lf,paste0(".//",prefix,"SeriesKey")))),stringsAsFactors=FALSE)
    bd<-as.data.frame(tmp[2,],stringsAsFactors=FALSE)
    colnames(bd)<-tmp[1,]
    rownames(bd)<-NULL
    obs<-xml2::xml_find_all(xml_lf,paste0(".//",prefix,"Obs"))
    cn<-c("obsTime","obsValue") 
    df<-data.table::rbindlist(lapply(obs, function(x) {dr<-as.data.frame(t(unlist(xml2::xml_attrs(xml2::xml_children(x)))),stringsAsFactors=FALSE)
                                                rownames(dr)<-NULL
                                                if (ncol(dr)<2) {dr$nc<-""}
                                                colnames(dr)<-cn
                                                if ((keep_flags)){
                                                  f<-xml2::xml_attr(xml2::xml_children(xml2::xml_children(x)),"value")
                                                  if(length(f)==0){
                                                    dr$OBS_STATUS<-""
                                                  } else{
                                                    f<-gsub("na","",f)
                                                    dr$OBS_STATUS<-paste0(f,collapse="")
                                                  }
                                                }
                                                return(as.data.frame(dr,stringsAsFactors=FALSE))
      }), fill=TRUE)
    if (any(!grepl("\\d:\\d",df$obsValue))) df$obsValue<-as.numeric(df$obsValue)
    # df$obsValue<-as.numeric(df$obsValue)
    df$obsValue[df$obsValue=="NaN"]<-NA
    if (keep_flags){
      out<-data.frame(bd,df,stringsAsFactors=stringsAsFactors)  
    } else{
      out<-data.frame(bd,df[,1:2],stringsAsFactors=stringsAsFactors)
    }
  }
  return(out)
}

Try the restatapi package in your browser

Any scripts or data that you put into this service are public.

restatapi documentation built on Sept. 19, 2023, 5:08 p.m.