R/dspl.r
In googlePublicData: Working with Google's 'Public Data Explorer' DSPL Metadata Files

.getFilesNames <- function(path, sep=";") {
################################################################################
# DESCRIPTION:
# As a function of a path and a file extension, gets all the file names there.
################################################################################  
  # Checking the sep
  pattern <- paste0("[.]",switch(
    sep,
    "\t" = "tab",
    ";" = "csv",
    "," = "csv",
    xls = "xls",
    xlsx = "xlsx"), "$")
  
  # Listing files
  files <- list.files(path=path, pattern = pattern, full.names=TRUE)
  
  nfiles <- length(files)
  if (nfiles==0) {
    stop(nfiles, ' files found.\nCannot\' continue') 
  }
  else {
    message(nfiles, ' files found...')
  }
  return(files)
}



#' Generates a dataframe used to complement a DSPL bundle
#' 
#' Parsing \emph{csv}, \emph{tab} or \emph{xls(x)} files at a specific
#' directory path, genMore info generates a dataframe used to complete a DSPL
#' bundle with a more complete concepts definition including description, url,
#' etc..
#' 
#' If there isn't any output defined (\code{NA}) the function returns a
#' dataframe containing concepts as observations. Using this, the user may add
#' more descriptive info about concepts. In turn it writes a tab file with the
#' dataframe described above. The user may recycle this file writing ``append''
#' in the \code{action} argument.
#' 
#' @param path String. Path to the folder where the tables are saved.
#' @param encoding The encoding of the files to be parsed.
#' @param sep The separation character of the tables in the 'path' folder.
#' Currently supports introducing the following arguments: ``,'' or ``;'' (for
#' .csv files), ``\\t'' (for .tab files) and ``xls'' or ``xlsx'' (for Microsoft's
#' excel files).
#' @param output If defined, the place where to save the dataframe as tab file.
#' Otherwise it returns a data frame object.
#' @param action Tells the function what to do if there's a copy of the file.
#' Available actions are ``merge'' and ``replace''.
#' @param dec String. Decimal point.
#' @return If no \code{output} defined, \code{genMoreInfo} returns a dataframe
#' with the following columns.  \item{id}{XML id of the concept
#' (autogenerated)} \item{label}{The label of the concept (autogenerated)}
#' \item{description}{A brief description of the concept} \item{topic}{The
#' topic of the concept} \item{url}{A URL for the concept where, for example,
#' to get more info} \item{totalName}{A total name as specified by DSPL language
#' (works for dimensional concepts)} \item{pluralName}{A total name as
#' specified by DSPL language (works for dimensional concepts)}
#' @author George G. Vega Yon 
#' @references Google Public Data Explorer: \url{http://publicdata.google.com}
#' @keywords IO
#' @examples
#' 
#' # Getting the path where all the datasets are
#' path <- system.file("dspl-tutorial", package="googlePublicData")
#' info <- genMoreInfo(path) # This is a dataframe
#' 
#' # Setting the 5th concept as topic "Demographics"
#' info[5, "topic"] <- "Demographics"
#' 
#' # Generating the dspl file
#' ans <- dspl(path, moreinfo = info)
#' ans
#' 
#'   \dontrun{
#'     # Parsing some xlsx files at "my stats folder" to gen a "moreinfo" dataframe
#'     INFO <- genMoreInfo(path="my stats folder/", sep="xls")
#'     
#'     # Rows 1 to 10 are about "Poverty" and rows 11 to 20 about "Education"
#'     # So we fill the "topic" column with it.
#'     INFO$topic[1:10] <- "Poverty"
#'     INFO$topic[11:20] <- "Education"
#'     
#'     # Finally, we build the DSPL ZIP including more info
#'     dspl(path="my stats folder/", sep="xls", moreinfo=INFO)
#'     
#'   }
#' @export 
#' 
genMoreInfo <- function(path, encoding=getOption("encoding"), sep=";", 
                        output=NA, action="merge", dec=".") {
################################################################################
# DESCRIPTION:
# Reads .csv and .xls(x) files, outputs a descriptive dataframe of the data and
# builds a config file.
################################################################################  
  
  oldopt <- options()$stringsAsFactors
  options(stringsAsFactors=FALSE)
  on.exit(options(stringsAsFactors = oldopt))
  
  # Checks if the path exists
  .checkPath(path, "input")
  
  # Generates the filelist acording to an specific sep
  files <- .getFilesNames(path, sep)
  
  # Reads and analices the files
  x <- seekTables(files=files, encoding=encoding, sep=sep, dec=dec)
    
  # Extracts the unique list of variables
  x <- unique(
    # subset(x,subset=type != 'date' & is.dim.tab==F, 
           # select=c(id, label))
    x[
      with(x, type!='date' & !is.dim.tab),
      colnames(x) %in% c('id','label'),
      FALSE]
    )
  
  x <- data.frame(cbind(x, description=NA, topic=NA, url=NA, totalName=NA, 
                        pluralName=NA), stringsAsFactors=F)
  
  # Prints the result
  if (is.na(output)) {
    return(x) 
  } 
  
  # Case of mergin
  if (action == 'merge') {
    target.exists <- file.exists(output)
    
    if (target.exists) {
      # Merge of the tables
      x0 <- read.table(file=output, na.strings='NA', sep='\t')
      x <- subset(x, !(x$id %in% x0$id))
      x <- rbind(x0,x)
    }
    else {
      warning('The file ',output,'doesn\'t exists. It will be created')
    }
    ERR<-try(write.table(x, file=output, quote=F, na="NA", sep='\t'))
    if (class(ERR)!='try-error') {
      message("DSPL Configfile written correctly at\n",normalizePath(output))
    }
    else {
      stop("An error has occurred during the file writing at:\n",normalizePath(output))
    }
  }
  
  # Case of replacing
  else if (action %in% c('replace','merge')) {
    ERR<-try(write.table(x, file=output, quote=F, na="NA", sep='\t'))
    if (class(ERR)!='try-error') {
      message("DSPL Configfile written correctly at:\n",normalizePath(output))
    }
    else {
      stop("An error has occurred during the file writing at:\n",normalizePath(output))
    }
  }
}

seekTables <- function(files, encoding, sep, output = NA, replace = T, dec) {
################################################################################
# DESCRIPTION:
# Reads .csv and .xls(x) files, exports them as csv and outputs a descriptive da
# taframe. Also determinates which field is dim or metric.
################################################################################  

  # Timeframe metrics
  metrics <- matrix(c(
    'dia','day','semana','week','trimestre','quarter', 'mes','month','ano', 
    'year', 'year','year','month','month'), ncol = 2, byrow=T)

  FUN <- function(x,y,z) {           
           # In the case of csv, tab 
           if (!(sep %in% c("xls", "xlsx"))) {
             
             data <- utils::read.table(
               x, sep=sep, strip.white=T, encoding=y, fill=T,
               dec=z, header=T
             )
             
           } else {
              data <- readxl::read_excel(x, col_names = T, sheet = 1)
           }
           
           cols <- colnames(data)
           
           cols <- gsub(".", " ", cols, fixed = T)
           cols <- gsub("^[[:space:]]*|[[:space:]]*$", "", cols)
           cols <- gsub("[^[:graph:]][[:space:]]*"," ", cols)
           
           colnames(data) <- cols
           
           fnames <- gsub("\\.[[:alpha:]]*$","",x)
           fnames <- gsub(".*(/|\\\\)", "", fnames)

           fnames <- rep(fnames, length(cols))
           
           # Builds descriptive matrix
           
           var <- data.frame(
             id=.cleantext(cols),
             label=cols,
             type=.fixType(unlist(lapply(data, typeof))),
             slice=fnames
           )
           
           # Creates a new column of metric vs dimm
           var <- cbind(var, concept.type='metric')
           
           var[var[,1] %in% metrics[,1], 5] <- 'dimension' # If time
           var[var[,3] %in% c('string'),5] <- 'dimension' # If string
           var[var[,1] %in% c('longitud','latitud','colour'),5] <- NA # If string
           var[var[,1] %in% metrics[,1], 3] <- 'date' # If date
            
           # Identifies which dataset is a dimension dataset
           var <- cbind(var, is.dim.tab = F)
           if (all(var[,3] != 'date')) {var['is.dim.tab'] <- T}
                
           # Replaces the date-time colnames for those proper to PDE
           for (i in 1:NROW(metrics)) {
             cols <- gsub(metrics[i,1], metrics[i,2],.cleantext(cols), fixed = T)
            }
           
           var['id'] <- cols
           
           # In the case of output, it creates a new folder
           if (!is.na(output)) {
             colnames(data) <- cols
             
             # Sorts the data acording to dimensional concepts (not date)
             ord <- var[var[,5]=='dimension' & var[,3] != 'date',1]
             if (length(ord)!=0) data <- data[do.call(order,data[ord]),]
             
             # Writes the data into csv files
             con <- file(paste(output,'/',var[1,4],'.csv',sep=''), encoding="UTF-8")
             
             write.table(
               x         = data,
               file      = con,
               na        = '',
               sep       = ',',
               quote     = FALSE,
               row.names = FALSE,
               dec       = '.'
               )

             message(
               gsub(".*(/|\\\\)","",x)," analyzed correctly, ordered by ", 
               paste(utils::head(ord), collapse = ", "), ifelse(length(ord)>6, "...", ""),
               " and exported as csv"
               )
           }
           else {
             message(x," analyzed correctly")
           }
           return(var)
         }
  
  # Puts it all into a single matrrx
  vars <- do.call('rbind', lapply(files, FUN, y=encoding, z=dec))
  
  # Identifies where are the correspondant tables for each dimension  
  vars <- cbind(vars, dim.tab.ref = NA)
  for (i in 1:NROW(vars)) {
    if (vars$concept.type[i] == 'dimension' & vars$type[i] != 'date' & 
      vars$is.dim.tab[i] != 'TRUE') {
      delta <- try(vars[vars[,1]==vars[i,1] & vars[,6] == 'TRUE',4])
      
      if (length(delta) == 0) {
        stop('The dimension ', vars$label[i],' needs a code (key) tab')
      } else {
        vars[i,7] <- try(delta)
      }
    }
  }
  
  # Setting which concepts extends geo:location
  
  geo <-vars[with(vars, is.dim.tab == TRUE), colnames(vars) %in% c("id", "slice"), FALSE]
  geo <- by(
    geo, geo$slice, 
    function(x) all(c('longitude', 'latitude') %in% unlist(x['id']))
  )
  
  geo <- names(which(geo))
  vars <- cbind(vars, extends.geo=FALSE)
  vars$extends.geo[which(vars$dim.tab.ref %in% geo)] <- TRUE
  
  return(vars)
}

.getMoreInfo <- function(source,target, encoding="unknown", id='id', label='label') {
################################################################################
# Reads from a tab file generated by genMoreInfo as a complement info to concepts
################################################################################  
  
  # Partial fix (should work, need to see
  # http://stackoverflow.com/questions/23475309/in-r-is-it-possible-to-suppress-note-no-visible-binding-for-global-variable)
  # type <- is.dim.tab <- id <- label <- freq <- NULL
  
  if (length(source)) {
    
    # If a file, reads the moreinfo file
    if (!inherits(source, "data.frame")) {
      
      oldopt <- options()$stringsAsFactors
      options(stringsAsFactors=FALSE)
      on.exit(options(stringsAsFactors = oldopt))
      
      source <- utils::read.table(file=source, header=TRUE, sep='\t', na.strings='NA',
                             encoding=encoding)
    }
    
    # Cleans up the content
    source$topicid <- .cleantext(source$topic)
    
    # Checks if there are at least moreinfo concepts as path concepts, where path
    # concepts are those captured from the files at pde(path=...)
    ntarget <- NROW(target)
    nsource <- NROW(source)
    if (ntarget < nsource) {
      warning('The number of moreinfo concepts (',nsource,') vs concepts found at path (',
              ntarget,') differ. Concepts at moreinfo won\'t appear at the metadata.')
    }
    else if (ntarget > nsource) {
      stop('Can\'t continue, there are more concepts at the path than at moreinfo')
    }
    
    # Merges source and target
    source <- source[which(source[[id]] %in% target$id),] # subset(source, id %in% target$id)
    target$label <- source[[label]] # source$label
    source <- source[,colnames(source)!=label,FALSE] #subset(source, select=-label)
    target <- merge(target, source, by=c('id'))
  }
  return(target)
}

.fixType <- function(x) {
################################################################################
# Transforms the datatypes in function of allowed datatypes for DSPL
################################################################################
  replace <- matrix(c('logical', 'integer', 'double', 'complex', 'character',
                      'boolean', 'integer', 'float', 'float', 'string'), ncol =2)
  for (i in 1:5) x <- as.character(gsub(replace[i,1], replace[i,2],x, fixed=T))
  return(x)
}

.cleantext <- function(x) {
################################################################################
# Adapts labels to IDs
################################################################################  
  
  sym <- matrix(c("$", "money", "??", "grad", "#", "n", "%", "pcent",
                  " ", "_", ".", "", ",", "_", ";", "_", ":", "_",
                  "(","_",")","_"), 
                ncol = 2, byrow = TRUE)
  
  x <- tolower(x)
  
  x <- iconv(x, to="ASCII//TRANSLIT")
  
  for (i in 1:NROW(sym)) {
    x <- gsub(sym[i,1], sym[i,2], x, fixed = TRUE)
  }
  
  # Extracts 
  x <- 
    
  return(x)
}

.addInfo <- function(nodename,values,parent,lang) {
################################################################################
# Function to add information and provider nodes y multiple languages
################################################################################  
  XML::newXMLNode(nodename, parent=parent, sapply(values, function(x) {
    XML::newXMLNode('value',attrs=c('xml:lang'=lang[which(values==x)]), x,
               suppressNamespaceWarning=TRUE)}))
}

.addTopics <- function(nodename, values, parent, lang) {
################################################################################
# Function to create and populate the topics
################################################################################  
  values <- unique(values)
  values <- subset(values, !is.na(values[,1]))
  colnames(values) <- c('label','id')
  
  fun <- function(x, ...) {apply(x, MARGIN = 1,...)}
  
  fun(values, FUN=
    function(x) {
      tempnode0 <- XML::newXMLNode(nodename, parent=parent, attrs=c(x['id']))
      tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
      tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
                 x['label'], suppressNamespaceWarning=T)
    }
  )
}

.addConcepts <- function(val,parent,lang) {
################################################################################
# Function to create and populate the concepts
################################################################################  
  colnames(val)[3] <- 'ref'
  colnames(val)[7] <- 'geo'
  
#   if (NCOL(val) > 1) {
#     fun <- function(x, ...) {apply(x, MARGIN = 1,...)}
#   } else {
#     fun <- function(x, FUN) {FUN(x)}
#   }
#   
#   fun(val, .addConcepts_sub)
  if (ncol(val)>1) apply(val,.addConcepts_sub,MARGIN = 1,parent=parent,lang=lang)
  else .addConcepts(val,parent=parent,lang=lang)
}

.addConcepts_sub <- function(x,parent,lang) {
  # function(x) {
    x['is.dim.tab'] <- gsub(' ','',x['is.dim.tab'])
    x['geo'] <- gsub(' ','',x['geo'])
    
    if (x["ref"] == 'string') {ATT <- c(x['id'], extends='entity:entity')}
    if (x['ref'] == 'string' && x['geo']=='TRUE') {ATT <- c(x['id'], extends='geo:location')}
    if (x['ref'] != 'string') {ATT <- c(x['id'])}
    
    # in the case of not being a dimensional concept
    if (x["concept.type"]!="dimension") {
      tempnode0 <- XML::newXMLNode('concept', attrs=ATT, parent=parent)
      
      # Starts Adding info
      tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
      
      tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
      
      # Adds a description
      if (!is.na(x['description'])) {
        description <- as.character(x['description'])
        tempnode3 <- XML::newXMLNode('description', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode3, attrs=c('xml:lang'=lang[1]), description)
      }
      
      # URL node
      if (!is.na(x['url'])) {
        url <- as.character(x['url'])
        tempnode4 <- XML::newXMLNode('url', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode4, attrs=c('xml:lang'=lang[1]), url)
      }
      
      # Here should start the multilanguage loop
      XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
                 suppressNamespaceWarning=T, x['label'])
      
      # Adds a topic category
      if (!is.na(x['topicid'])) {
        topicref <- as.character(x['topicid'])
        XML::newXMLNode('topic', parent=tempnode0, attrs=c(ref=topicref))
      }
      
      # Adds the data type specification
      XML::newXMLNode('type', attrs=c(x['ref']), parent=tempnode0)
      
      # in the case of being a dimensional concept
    } else {
      tempnode0 <- XML::newXMLNode('concept', attrs=ATT, parent=parent)
      
      # Starts adding info
      tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
      
      # Name Node
      tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
                 suppressNamespaceWarning=T, x['label'])
      
      # Description node
      if (!is.na(x['description'])) {
        description <- as.character(x['description'])
        tempnode3 <- XML::newXMLNode('description', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode3, attrs=c('xml:lang'=lang[1]), description)
      }
      
      # URL node
      if (!is.na(x['url'])) {
        url <- as.character(x['url'])
        tempnode4 <- XML::newXMLNode('url', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode4, attrs=c('xml:lang'=lang[1]), url)
      }
      
      # Plural name node
      if (!is.na(x['totalName'])) {
        pluralName <- as.character(x['pluralName'])
        tempnode6 <- XML::newXMLNode('pluralName', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode6, attrs=c('xml:lang'=lang[1]), pluralName)
      }        
      
      # Total name node
      if (!is.na(x['totalName'])) {
        totalName <- as.character(x['totalName'])
        tempnode5 <- XML::newXMLNode('totalName', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode5, attrs=c('xml:lang'=lang[1]),totalName)
      }
      
      XML::newXMLNode('type', parent=tempnode0, attrs=c(x['ref']))
      XML::newXMLNode('table', parent=tempnode0, attrs=
                   c(ref=paste(x['dim.tab.ref'],'_table',sep='')))
      
    }
  # }
}

.addSlices <- function(tableid, sliceatt, parent) {
################################################################################
# Function to create and populate the slices
################################################################################
  colnames(sliceatt)[1] <- 'concept'
  by(data=sliceatt, INDICES=tableid,FUN=
    function(x) {
      XML::newXMLNode(name='slice', attrs=c(id=paste(x$slice[1],'_slice',sep='')),
                 parent=parent, apply(x, MARGIN = 1,FUN=
                   function(z){
                     #z <- as.character(z)
                     # In the case of dates-time
                     if (z['type'] == 'date') {
                       XML::newXMLNode(name=z['concept.type'], 
                                  attrs=c(concept=paste('time:',z['concept'],sep='')))
                       # Otherwise
                     } else {
                       XML::newXMLNode(name=z['concept.type'], attrs=c(z['concept']))
                     }}), XML::newXMLNode('table', attrs=c(ref=paste(x$slice[1],'_table',sep=''))))
    }
    )
}

.addTables <- function(tableid, tableatt, parent, format) {
################################################################################
# Function to create and populate the tables
################################################################################
  by(data=tableatt, INDICES=tableid,FUN=
    function(x) {
      XML::newXMLNode(name='table', attrs=c(id=paste(x$slice[1],'_table',sep='')),parent=
        parent, apply(X=x, 
                      MARGIN = 1, FUN=
                        function(z){
                          if (z['type'] == 'date') {
                            XML::newXMLNode(name='column', attrs=c(z['id'], z['type'], format=format))  
                          } else {
                            XML::newXMLNode(name='column', attrs=c(z['id'], z['type']))
                          }}), XML::newXMLNode(name='data', XML::newXMLNode('file', attrs=c(format=
                            'csv', encoding='utf8'),paste(x$slice[1],'.csv',sep='')))
                 )
    }
     )
}



#' Builds Dataset Publication Language (DSPL) metadata file
#' 
#' Parsing \emph{csv}, \emph{tab} or \emph{xls(x)} files at a specific
#' directory path, dspl generates a complete DSPL file.  If an output string is
#' specified, the function generates the complete ZIP (DSPL file plus csv
#' files) ready to be uploaded to Google Public Data Explorer.
#' 
#' If there isn't any output defined the function returns a list of class
#' \code{dspl} that among its contents has a xml object (DSPL file); otherwise,
#' if an output is defined, the results consists on two things, an already ZIP
#' file containing a all the necessary to be uploaded at
#' \url{publicdata.google.com} (a collection of csv files and the XML DSPL
#' written file) and a message (character object).
#' 
#' Internally, the parsing process consists on the following steps:
#' \enumerate{
#' \item Loading the data, 
#' \item Generating each column corresponding id, 
#' \item Identifying the data types, 
#' \item Building concepts, 
#' \item Identifying dimensional concepts and distinguishing between categorical,
#' geographical and time dimensions, and
#' \item Executing internal checks.
#' }
#' 
#' In order to properly load the zip file (DSPL file plus CSV data files), the
#' function executes a series of internal checks upon the data structure. The
#' detailed list:
#' \itemize{
#' \item \strong{Slices with the same dimensions}: DSPL
#' requires that each slice represents one dimensional cut, this is, there
#' should not be more than one data table with the same dimensions.
#' \item \strong{Duplicated concepts}: As a result of multiple data types, e.g a single
#' concept (statistic) as integer in one table and float in other, \code{dspl}
#' may get confused, so during the parsing process, if there is a chance, it
#' collapses duplicated concepts into only one concept and assigns it the
#' common data type (float).
#' \item \strong{Correct time format definition}: Using \code{\link{checkTimeFormat}}
#' ensures that the time format specified is
#' compatible with DSPL.
#' }
#' 
#' @param path String. Path to the folder where the tables (csv|tab|xls) are
#' at.
#' @param output String, optional. Path to the output ZIP file.
#' @param replace Logical. If \code{output} ZIP file is defined exists,
#' \code{dspl} replaces it.
#' @param targetNamespace String. As DSPL documentation states ``Provides a URI
#' that identifies your dataset. This URI is not required to point to an actual
#' resource, but it's a good idea to have the URI resolve to a document
#' describing your content or dataset''.
#' @param timeFormat String. The corresponding time format of the collection.
#' Should be specified accordingly to joda-time format. See the Details section
#' for more information.
#' @param lang A list of strings of the languages supported by the dataset.
#' Could be only one.
#' @param name List of strings. The name of the dataset as defined accordingly
#' to the \code{lang} list.
#' @param description List of strings. Description of the dataset. It also
#' supports multiple description as the \code{name}
#' @param url The corresponding URL for the dataset.
#' @param providerName List of strings. The data provider name.
#' @param providerURL List of strings. The data provider website url.
#' @param sep The separation character of the tables in the 'path' folder.
#' Currently supports introducing the following arguments: ``,'' or ``;'' (for
#' .csv files), ``\\t'' (for .tab files) and ``xls'' or ``xlsx'' (for Microsoft's
#' excel files).
#' @param dec String. Decimal point.
#' @param encoding The char encoding of the input tables. Currently ignored for
#' Microsoft excel files.
#' @param moreinfo A special tab file generated by the function
#' \code{\link{genMoreInfo}} that contains a dataframe of the dataset concepts
#' with more specifications such as description, topic, url, etc.
#' @return If there isn't any \code{output} defined, \code{dspl} returns list
#' of \code{\link{class}} "\code{dspl}".
#' 
#' An object of class "\code{dspl}" is a list containing:
#' \item{dspl}{A character string containing the DSPL XML document as defined
#' by the \code{\link[XML:saveXML]{saveXML}} function.} 
#' \item{concepts.by.table}{A data frame object of concepts stored by table.} 
#' \item{dimtabs}{A data frame containing dimensional tables.} 
#' \item{slices}{A data frame of slices.} 
#' \item{concepts}{A data frame of concepts (all of them).}
#' \item{dimensions}{A data frame of dimensional concepts.}
#' \item{statistics}{A matrix of statistics.}
#' 
#' otherwise the function will build a ZIP file as specified in the output
#' containing the CSV and DSPL (XML) files.
#' @author George G. Vega Yon 
#' @references \itemize{ \item Google Public Data Explorer Tutorial:
#' \url{https://developers.google.com/public-data/docs/tutorial} }
#' @keywords IO
#' @examples
#' 
#' demo(dspl)
#'   
#' 
#' @export 
#' @name dspl
#' @aliases GooglePublicData
NULL

#' @export
#' @rdname dspl
dspl <- function(
################################################################################
# DESCRIPTION:
# Based on an specific folder directory, the function seeks for files that match
# the specified extension (csv, tab, xls, xlsx), reads the column names, guesses
# the datatype, analyces data structure and outputs a dspl metadata file includi
# ng the corresponding csv files.
#
# VARIABLES:
# - path: Full path to the folder where the tables are saved.
# - Output: FUll path to the output folder (pde will create a subfolder call r_dspl).
# - replace: In the case of defining output, replaces the files.
# - targetNamespace:
# - timeFormat: The corresponding time format of the collection.
# - lang: A list of the languages supported by the dataset.
# - name: The name of the dataset.
# - description: The dataset description.
# - url: The corresponding URL for the dataset.
# - providerNAme
# - providerURL
# - sep: The extension of the tables in the 'path' folder.
# - encoding: The char encoding of the input tables.
################################################################################  
  path,
  output = NA,
  replace = F,
  targetNamespace = "",
  timeFormat = "yyyy",
  lang = c("es", "en"),
  name = NA,
  description = NA,
  url = NA,
  providerName = NA,
  providerURL = NA,
  sep = ";",
  dec=".",
  encoding = getOption("encoding"),
  moreinfo = NULL
  ) {
  
  oldopt <- options()$stringsAsFactors
  options(stringsAsFactors=FALSE)
  on.exit(options(stringsAsFactors = oldopt))
  
  # Initial checks
  description <- ifelse(!is.na(description),description,'No description')
  name <- ifelse(!is.na(name),name,'No name')
  providerName <- ifelse(!is.na(providerName),providerName,'No provider')
  
  # Checking if output path is Ok
  if (!is.na(output)) {
    temp.path <- tempdir()
    .checkPath(path, "input")
  }
  else temp.path <- NA
  if (length(moreinfo)) {
    if (!inherits(moreinfo,"data.frame"))
      .checkPath(moreinfo, "input")
  }
  
  # Checking timeFormat
  timeOk <- checkTimeFormat(timeFormat)
  if (!timeOk) stop("Undefined \'joda-time\' definition ", timeFormat,
                    "\nFor more information checkout\n",
                    "http://joda-time.sourceforge.net/api-release/org/joda/time/format/DateTimeFormat.html")
  # Gets the filenames
  files <- .getFilesNames(path, sep)
    
  # Variables Lists and datatypes
  vars <- seekTables(files=files, encoding=encoding, sep=sep, 
                     output=temp.path, replace=replace, dec=dec)
  
  dims <- vars[
    with(vars, concept.type=='dimension'),
    colnames(vars) %in% c('id', 'slice', 'concept.type'), FALSE
    ]
  #subset(vars, concept.type=='dimension', select=c(id, slice, concept.type)) 
  
  # Identifying if there is any duplicated slice
  with(dims, .checkSlices(dims=id, by=slice))
  
  vars <- .checkDuplConcepts(concepts=vars)
  
  # Creates a unique concept list
  varConcepts <- unique(
    # subset(vars,subset=type != 'date' & is.dim.tab==F, select=-slice) 
    vars[
      with(vars, type!='date' & is.dim.tab==FALSE),
      colnames(vars)!='slice',FALSE
      ]
    )
  
  # Checks if there is a moreinfo file
  varConcepts <- .getMoreInfo(source=moreinfo, target=varConcepts, "")
  
  # Armado de xml
  archXML <- XML::newXMLDoc()
  dspl <- XML::newXMLNode(name='dspl', doc=archXML, attrs=c(
    targetNamespace=targetNamespace),
    namespaceDefinitions = 'http://schemas.google.com/dspl/2010')
  
  
  # Definiciones dspl
  imports <- c('quantity', 'entity', 'geo', 'time', 'unit')
  
  sapply(imports,
         function(x) {
           XML::newXMLNamespace(node=dspl, prefix=x,namespace=paste(
             'http://www.google.com/publicdata/dataset/google/',x,sep=''))
         })
  # Concepts import lines
  XML::newXMLCommentNode('Concepts imports', parent=dspl)
  
  imports <- paste(
    "http://www.google.com/publicdata/dataset/google/",
    imports, sep = '')
  
  sapply(X = imports,
         FUN = function(x) XML::newXMLNode(attrs=c(namespace=x), name = 'import',
                                      parent = dspl))
  # INFO
  XML::newXMLCommentNode('Info lines', parent=dspl)
  info <- XML::newXMLNode('info', parent = dspl)
  .addInfo('name', name, info, lang)
  .addInfo('description', description, info, lang)
  if (!is.na(url)) XML::newXMLNode('url', XML::newXMLNode('value', url), 
                              parent = info)
  
  # PROVIDER
  XML::newXMLCommentNode('Data Provider', parent=dspl)
  provider <- XML::newXMLNode('provider', parent = dspl)
  .addInfo('name', providerName, provider, lang)
  if (!is.na(providerURL)) XML::newXMLNode('url', XML::newXMLNode('value', providerURL), 
                                      parent = provider)
  
  # TOPICS
  if ("topicid" %in% colnames(varConcepts)) {
    test <- !all(is.na(varConcepts$topicid))
  }
  else {
    test <- F
  }
  if (test) {
    XML::newXMLCommentNode('Topics definition', parent=dspl)
    topics <- XML::newXMLNode('topics', parent=dspl)
    .addTopics('topic', varConcepts[c('topic', 'topicid')], topics, lang)
  }
    
  # CONCEPTS
  XML::newXMLCommentNode('Concepts Definitions', parent=dspl)
  concepts <- XML::newXMLNode('concepts', parent = dspl)
  .addConcepts(varConcepts,concepts, lang)
  
  # SLICES
  XML::newXMLCommentNode('Slices Definitions', parent=dspl)
  slices <- XML::newXMLNode('slices', parent = dspl)
  .addSlices(
    tableid  = vars[with(vars, !is.dim.tab),, FALSE][['slice']],
    sliceatt = vars[with(vars, !is.dim.tab),, FALSE], # subset(vars, is.dim.tab != T),
    parent   = slices
    )
  
  # TABLES
  XML::newXMLCommentNode('Tables Definitios', parent=dspl)
  tables <- XML::newXMLNode('tables', parent = dspl)
  .addTables(tableid=vars$slice,tableatt=vars,parent=tables, format=timeFormat)  
  
  # Building ouput
  .dimtabs  <- unique(vars[with(vars, is.dim.tab),,FALSE][['slice']]) #subset(vars, subset=is.dim.tab, select=slice))
  .slices   <- unique(vars[['slice']]) #subset(vars, select=slice))
  .concepts <- unique(vars[['label']]) #subset(vars, select=label))
  .dims <- unique(  
#     subset(
#       vars, 
#       subset=is.dim.tab & !(label %in% c('name', 'latitude','longitude','colour')),
#       select=label
#       )
    vars[ 
      with(vars, is.dim.tab & !(label %in% c('name', 'latitude','longitude','colour'))),
      colnames(vars)=='label',
      FALSE
    ]) 
  
  lapply(c(.dimtabs, .slices, .concepts, .dims), function(x) names(x) <- 'Name')
  
  pde.statistics <- matrix(
    c(
      NROW(.slices),
      NROW(.concepts),
      NROW(.dims)
    ), ncol=3)
  
  colnames(pde.statistics) <- c('slices','concepts','dimensions')
  
  result <- structure(
    .Data=
      list(
        XML::saveXML(archXML, encoding="UTF-8"), vars, .dimtabs, .slices, .concepts,
        .dims, pde.statistics
        ),
    .Names = c(
      'dspl', 'concepts.by.table', 'dimtabs', 'slices', 'concepts', 'dimensions',
      'statistics'),
    class = "dspl"
    )

  # If an output file is specified, it writes it on it
  if (is.na(output)) {
    return(result)
  
  } else {
    path <- paste(temp.path,'/metadata.xml',sep='')
    print.dspl(x=result, path=path, replace=replace, quiet=T)
    
    # Zipping the files
    tozip <- list.files(temp.path, full.names=T, pattern="csv$|xml$")
    utils::zip(output ,tozip,flags='-r9jm')
    
    message("Metadata created successfully at:\n", normalizePath(output))
  }
}

#' @export
#' @rdname dspl
new_dspl <- dspl
Any scripts or data that you put into this service are public.
googlePublicData documentation built on May 2, 2019, 3:45 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
googlePublicData
Working with Google's 'Public Data Explorer' DSPL Metadata Files

R/dspl.r
In googlePublicData: Working with Google's 'Public Data Explorer' DSPL Metadata Files

Try the googlePublicData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

googlePublicData Working with Google's 'Public Data Explorer' DSPL Metadata Files

R/dspl.r In googlePublicData: Working with Google's 'Public Data Explorer' DSPL Metadata Files

Try the googlePublicData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

googlePublicData
Working with Google's 'Public Data Explorer' DSPL Metadata Files

R/dspl.r
In googlePublicData: Working with Google's 'Public Data Explorer' DSPL Metadata Files