R/dspl.r

.getFilesNames <- function(path, sep=";") {
################################################################################
# DESCRIPTION:
# As a function of a path and a file extension, gets all the file names there.
################################################################################  
  # Checking the sep
  pattern <- paste0("[.]",switch(
    sep,
    "\t" = "tab",
    ";" = "csv",
    "," = "csv",
    xls = "xls",
    xlsx = "xlsx"), "$")
  
  # Listing files
  files <- list.files(path=path, pattern = pattern, full.names=TRUE)
  
  nfiles <- length(files)
  if (nfiles==0) {
    stop(nfiles, ' files found.\nCannot\' continue') 
  }
  else {
    message(nfiles, ' files found...')
  }
  return(files)
}



#' Generates a dataframe used to complement a DSPL bundle
#' 
#' Parsing \emph{csv}, \emph{tab} or \emph{xls(x)} files at a specific
#' directory path, genMore info generates a dataframe used to complete a DSPL
#' bundle with a more complete concepts definition including description, url,
#' etc..
#' 
#' If there isn't any output defined (\code{NA}) the function returns a
#' dataframe containing concepts as observations. Using this, the user may add
#' more descriptive info about concepts. In turn it writes a tab file with the
#' dataframe described above. The user may recycle this file writing ``append''
#' in the \code{action} argument.
#' 
#' @param path String. Path to the folder where the tables are saved.
#' @param encoding The encoding of the files to be parsed.
#' @param sep The separation character of the tables in the 'path' folder.
#' Currently supports introducing the following arguments: ``,'' or ``;'' (for
#' .csv files), ``\\t'' (for .tab files) and ``xls'' or ``xlsx'' (for Microsoft's
#' excel files).
#' @param output If defined, the place where to save the dataframe as tab file.
#' Otherwise it returns a data frame object.
#' @param action Tells the function what to do if there's a copy of the file.
#' Available actions are ``merge'' and ``replace''.
#' @param dec String. Decimal point.
#' @return If no \code{output} defined, \code{genMoreInfo} returns a dataframe
#' with the following columns.  \item{id}{XML id of the concept
#' (autogenerated)} \item{label}{The label of the concept (autogenerated)}
#' \item{description}{A brief description of the concept} \item{topic}{The
#' topic of the concept} \item{url}{A URL for the concept where, for example,
#' to get more info} \item{totalName}{A total name as specified by DSPL language
#' (works for dimensional concepts)} \item{pluralName}{A total name as
#' specified by DSPL language (works for dimensional concepts)}
#' @author George G. Vega Yon 
#' @references Google Public Data Explorer: \url{http://publicdata.google.com}
#' @keywords IO
#' @examples
#' 
#' # Getting the path where all the datasets are
#' path <- system.file("dspl-tutorial", package="googlePublicData")
#' info <- genMoreInfo(path) # This is a dataframe
#' 
#' # Setting the 5th concept as topic "Demographics"
#' info[5, "topic"] <- "Demographics"
#' 
#' # Generating the dspl file
#' ans <- dspl(path, moreinfo = info)
#' ans
#' 
#'   \dontrun{
#'     # Parsing some xlsx files at "my stats folder" to gen a "moreinfo" dataframe
#'     INFO <- genMoreInfo(path="my stats folder/", sep="xls")
#'     
#'     # Rows 1 to 10 are about "Poverty" and rows 11 to 20 about "Education"
#'     # So we fill the "topic" column with it.
#'     INFO$topic[1:10] <- "Poverty"
#'     INFO$topic[11:20] <- "Education"
#'     
#'     # Finally, we build the DSPL ZIP including more info
#'     dspl(path="my stats folder/", sep="xls", moreinfo=INFO)
#'     
#'   }
#' @export 
#' 
genMoreInfo <- function(path, encoding=getOption("encoding"), sep=";", 
                        output=NA, action="merge", dec=".") {
################################################################################
# DESCRIPTION:
# Reads .csv and .xls(x) files, outputs a descriptive dataframe of the data and
# builds a config file.
################################################################################  
  
  oldopt <- options()$stringsAsFactors
  options(stringsAsFactors=FALSE)
  on.exit(options(stringsAsFactors = oldopt))
  
  # Checks if the path exists
  .checkPath(path, "input")
  
  # Generates the filelist acording to an specific sep
  files <- .getFilesNames(path, sep)
  
  # Reads and analices the files
  x <- seekTables(files=files, encoding=encoding, sep=sep, dec=dec)
    
  # Extracts the unique list of variables
  x <- unique(
    # subset(x,subset=type != 'date' & is.dim.tab==F, 
           # select=c(id, label))
    x[
      with(x, type!='date' & !is.dim.tab),
      colnames(x) %in% c('id','label'),
      FALSE]
    )
  
  x <- data.frame(cbind(x, description=NA, topic=NA, url=NA, totalName=NA, 
                        pluralName=NA), stringsAsFactors=F)
  
  # Prints the result
  if (is.na(output)) {
    return(x) 
  } 
  
  # Case of mergin
  if (action == 'merge') {
    target.exists <- file.exists(output)
    
    if (target.exists) {
      # Merge of the tables
      x0 <- read.table(file=output, na.strings='NA', sep='\t')
      x <- subset(x, !(x$id %in% x0$id))
      x <- rbind(x0,x)
    }
    else {
      warning('The file ',output,'doesn\'t exists. It will be created')
    }
    ERR<-try(write.table(x, file=output, quote=F, na="NA", sep='\t'))
    if (class(ERR)!='try-error') {
      message("DSPL Configfile written correctly at\n",normalizePath(output))
    }
    else {
      stop("An error has occurred during the file writing at:\n",normalizePath(output))
    }
  }
  
  # Case of replacing
  else if (action %in% c('replace','merge')) {
    ERR<-try(write.table(x, file=output, quote=F, na="NA", sep='\t'))
    if (class(ERR)!='try-error') {
      message("DSPL Configfile written correctly at:\n",normalizePath(output))
    }
    else {
      stop("An error has occurred during the file writing at:\n",normalizePath(output))
    }
  }
}

seekTables <- function(files, encoding, sep, output = NA, replace = T, dec) {
################################################################################
# DESCRIPTION:
# Reads .csv and .xls(x) files, exports them as csv and outputs a descriptive da
# taframe. Also determinates which field is dim or metric.
################################################################################  

  # Timeframe metrics
  metrics <- matrix(c(
    'dia','day','semana','week','trimestre','quarter', 'mes','month','ano', 
    'year', 'year','year','month','month'), ncol = 2, byrow=T)

  FUN <- function(x,y,z) {           
           # In the case of csv, tab 
           if (!(sep %in% c("xls", "xlsx"))) {
             
             data <- utils::read.table(
               x, sep=sep, strip.white=T, encoding=y, fill=T,
               dec=z, header=T
             )
             
           } else {
              data <- readxl::read_excel(x, col_names = T, sheet = 1)
           }
           
           cols <- colnames(data)
           
           cols <- gsub(".", " ", cols, fixed = T)
           cols <- gsub("^[[:space:]]*|[[:space:]]*$", "", cols)
           cols <- gsub("[^[:graph:]][[:space:]]*"," ", cols)
           
           colnames(data) <- cols
           
           fnames <- gsub("\\.[[:alpha:]]*$","",x)
           fnames <- gsub(".*(/|\\\\)", "", fnames)

           fnames <- rep(fnames, length(cols))
           
           # Builds descriptive matrix
           
           var <- data.frame(
             id=.cleantext(cols),
             label=cols,
             type=.fixType(unlist(lapply(data, typeof))),
             slice=fnames
           )
           
           # Creates a new column of metric vs dimm
           var <- cbind(var, concept.type='metric')
           
           var[var[,1] %in% metrics[,1], 5] <- 'dimension' # If time
           var[var[,3] %in% c('string'),5] <- 'dimension' # If string
           var[var[,1] %in% c('longitud','latitud','colour'),5] <- NA # If string
           var[var[,1] %in% metrics[,1], 3] <- 'date' # If date
            
           # Identifies which dataset is a dimension dataset
           var <- cbind(var, is.dim.tab = F)
           if (all(var[,3] != 'date')) {var['is.dim.tab'] <- T}
                
           # Replaces the date-time colnames for those proper to PDE
           for (i in 1:NROW(metrics)) {
             cols <- gsub(metrics[i,1], metrics[i,2],.cleantext(cols), fixed = T)
            }
           
           var['id'] <- cols
           
           # In the case of output, it creates a new folder
           if (!is.na(output)) {
             colnames(data) <- cols
             
             # Sorts the data acording to dimensional concepts (not date)
             ord <- var[var[,5]=='dimension' & var[,3] != 'date',1]
             if (length(ord)!=0) data <- data[do.call(order,data[ord]),]
             
             # Writes the data into csv files
             con <- file(paste(output,'/',var[1,4],'.csv',sep=''), encoding="UTF-8")
             
             write.table(
               x         = data,
               file      = con,
               na        = '',
               sep       = ',',
               quote     = FALSE,
               row.names = FALSE,
               dec       = '.'
               )

             message(
               gsub(".*(/|\\\\)","",x)," analyzed correctly, ordered by ", 
               paste(utils::head(ord), collapse = ", "), ifelse(length(ord)>6, "...", ""),
               " and exported as csv"
               )
           }
           else {
             message(x," analyzed correctly")
           }
           return(var)
         }
  
  # Puts it all into a single matrrx
  vars <- do.call('rbind', lapply(files, FUN, y=encoding, z=dec))
  
  # Identifies where are the correspondant tables for each dimension  
  vars <- cbind(vars, dim.tab.ref = NA)
  for (i in 1:NROW(vars)) {
    if (vars$concept.type[i] == 'dimension' & vars$type[i] != 'date' & 
      vars$is.dim.tab[i] != 'TRUE') {
      delta <- try(vars[vars[,1]==vars[i,1] & vars[,6] == 'TRUE',4])
      
      if (length(delta) == 0) {
        stop('The dimension ', vars$label[i],' needs a code (key) tab')
      } else {
        vars[i,7] <- try(delta)
      }
    }
  }
  
  # Setting which concepts extends geo:location
  
  geo <-vars[with(vars, is.dim.tab == TRUE), colnames(vars) %in% c("id", "slice"), FALSE]
  geo <- by(
    geo, geo$slice, 
    function(x) all(c('longitude', 'latitude') %in% unlist(x['id']))
  )
  
  geo <- names(which(geo))
  vars <- cbind(vars, extends.geo=FALSE)
  vars$extends.geo[which(vars$dim.tab.ref %in% geo)] <- TRUE
  
  return(vars)
}

.getMoreInfo <- function(source,target, encoding="unknown", id='id', label='label') {
################################################################################
# Reads from a tab file generated by genMoreInfo as a complement info to concepts
################################################################################  
  
  # Partial fix (should work, need to see
  # http://stackoverflow.com/questions/23475309/in-r-is-it-possible-to-suppress-note-no-visible-binding-for-global-variable)
  # type <- is.dim.tab <- id <- label <- freq <- NULL
  
  if (length(source)) {
    
    # If a file, reads the moreinfo file
    if (!inherits(source, "data.frame")) {
      
      oldopt <- options()$stringsAsFactors
      options(stringsAsFactors=FALSE)
      on.exit(options(stringsAsFactors = oldopt))
      
      source <- utils::read.table(file=source, header=TRUE, sep='\t', na.strings='NA',
                             encoding=encoding)
    }
    
    # Cleans up the content
    source$topicid <- .cleantext(source$topic)
    
    # Checks if there are at least moreinfo concepts as path concepts, where path
    # concepts are those captured from the files at pde(path=...)
    ntarget <- NROW(target)
    nsource <- NROW(source)
    if (ntarget < nsource) {
      warning('The number of moreinfo concepts (',nsource,') vs concepts found at path (',
              ntarget,') differ. Concepts at moreinfo won\'t appear at the metadata.')
    }
    else if (ntarget > nsource) {
      stop('Can\'t continue, there are more concepts at the path than at moreinfo')
    }
    
    # Merges source and target
    source <- source[which(source[[id]] %in% target$id),] # subset(source, id %in% target$id)
    target$label <- source[[label]] # source$label
    source <- source[,colnames(source)!=label,FALSE] #subset(source, select=-label)
    target <- merge(target, source, by=c('id'))
  }
  return(target)
}

.fixType <- function(x) {
################################################################################
# Transforms the datatypes in function of allowed datatypes for DSPL
################################################################################
  replace <- matrix(c('logical', 'integer', 'double', 'complex', 'character',
                      'boolean', 'integer', 'float', 'float', 'string'), ncol =2)
  for (i in 1:5) x <- as.character(gsub(replace[i,1], replace[i,2],x, fixed=T))
  return(x)
}

.cleantext <- function(x) {
################################################################################
# Adapts labels to IDs
################################################################################  
  
  sym <- matrix(c("$", "money", "??", "grad", "#", "n", "%", "pcent",
                  " ", "_", ".", "", ",", "_", ";", "_", ":", "_",
                  "(","_",")","_"), 
                ncol = 2, byrow = TRUE)
  
  x <- tolower(x)
  
  x <- iconv(x, to="ASCII//TRANSLIT")
  
  for (i in 1:NROW(sym)) {
    x <- gsub(sym[i,1], sym[i,2], x, fixed = TRUE)
  }
  
  # Extracts 
  x <- 
    
  return(x)
}

.addInfo <- function(nodename,values,parent,lang) {
################################################################################
# Function to add information and provider nodes y multiple languages
################################################################################  
  XML::newXMLNode(nodename, parent=parent, sapply(values, function(x) {
    XML::newXMLNode('value',attrs=c('xml:lang'=lang[which(values==x)]), x,
               suppressNamespaceWarning=TRUE)}))
}

.addTopics <- function(nodename, values, parent, lang) {
################################################################################
# Function to create and populate the topics
################################################################################  
  values <- unique(values)
  values <- subset(values, !is.na(values[,1]))
  colnames(values) <- c('label','id')
  
  fun <- function(x, ...) {apply(x, MARGIN = 1,...)}
  
  fun(values, FUN=
    function(x) {
      tempnode0 <- XML::newXMLNode(nodename, parent=parent, attrs=c(x['id']))
      tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
      tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
                 x['label'], suppressNamespaceWarning=T)
    }
  )
}

.addConcepts <- function(val,parent,lang) {
################################################################################
# Function to create and populate the concepts
################################################################################  
  colnames(val)[3] <- 'ref'
  colnames(val)[7] <- 'geo'
  
#   if (NCOL(val) > 1) {
#     fun <- function(x, ...) {apply(x, MARGIN = 1,...)}
#   } else {
#     fun <- function(x, FUN) {FUN(x)}
#   }
#   
#   fun(val, .addConcepts_sub)
  if (ncol(val)>1) apply(val,.addConcepts_sub,MARGIN = 1,parent=parent,lang=lang)
  else .addConcepts(val,parent=parent,lang=lang)
}

.addConcepts_sub <- function(x,parent,lang) {
  # function(x) {
    x['is.dim.tab'] <- gsub(' ','',x['is.dim.tab'])
    x['geo'] <- gsub(' ','',x['geo'])
    
    if (x["ref"] == 'string') {ATT <- c(x['id'], extends='entity:entity')}
    if (x['ref'] == 'string' && x['geo']=='TRUE') {ATT <- c(x['id'], extends='geo:location')}
    if (x['ref'] != 'string') {ATT <- c(x['id'])}
    
    # in the case of not being a dimensional concept
    if (x["concept.type"]!="dimension") {
      tempnode0 <- XML::newXMLNode('concept', attrs=ATT, parent=parent)
      
      # Starts Adding info
      tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
      
      tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
      
      # Adds a description
      if (!is.na(x['description'])) {
        description <- as.character(x['description'])
        tempnode3 <- XML::newXMLNode('description', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode3, attrs=c('xml:lang'=lang[1]), description)
      }
      
      # URL node
      if (!is.na(x['url'])) {
        url <- as.character(x['url'])
        tempnode4 <- XML::newXMLNode('url', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode4, attrs=c('xml:lang'=lang[1]), url)
      }
      
      # Here should start the multilanguage loop
      XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
                 suppressNamespaceWarning=T, x['label'])
      
      # Adds a topic category
      if (!is.na(x['topicid'])) {
        topicref <- as.character(x['topicid'])
        XML::newXMLNode('topic', parent=tempnode0, attrs=c(ref=topicref))
      }
      
      # Adds the data type specification
      XML::newXMLNode('type', attrs=c(x['ref']), parent=tempnode0)
      
      # in the case of being a dimensional concept
    } else {
      tempnode0 <- XML::newXMLNode('concept', attrs=ATT, parent=parent)
      
      # Starts adding info
      tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
      
      # Name Node
      tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
                 suppressNamespaceWarning=T, x['label'])
      
      # Description node
      if (!is.na(x['description'])) {
        description <- as.character(x['description'])
        tempnode3 <- XML::newXMLNode('description', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode3, attrs=c('xml:lang'=lang[1]), description)
      }
      
      # URL node
      if (!is.na(x['url'])) {
        url <- as.character(x['url'])
        tempnode4 <- XML::newXMLNode('url', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode4, attrs=c('xml:lang'=lang[1]), url)
      }
      
      # Plural name node
      if (!is.na(x['totalName'])) {
        pluralName <- as.character(x['pluralName'])
        tempnode6 <- XML::newXMLNode('pluralName', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode6, attrs=c('xml:lang'=lang[1]), pluralName)
      }        
      
      # Total name node
      if (!is.na(x['totalName'])) {
        totalName <- as.character(x['totalName'])
        tempnode5 <- XML::newXMLNode('totalName', parent=tempnode1)
        XML::newXMLNode('value', parent=tempnode5, attrs=c('xml:lang'=lang[1]),totalName)
      }
      
      XML::newXMLNode('type', parent=tempnode0, attrs=c(x['ref']))
      XML::newXMLNode('table', parent=tempnode0, attrs=
                   c(ref=paste(x['dim.tab.ref'],'_table',sep='')))
      
    }
  # }
}

.addSlices <- function(tableid, sliceatt, parent) {
################################################################################
# Function to create and populate the slices
################################################################################
  colnames(sliceatt)[1] <- 'concept'
  by(data=sliceatt, INDICES=tableid,FUN=
    function(x) {
      XML::newXMLNode(name='slice', attrs=c(id=paste(x$slice[1],'_slice',sep='')),
                 parent=parent, apply(x, MARGIN = 1,FUN=
                   function(z){
                     #z <- as.character(z)
                     # In the case of dates-time
                     if (z['type'] == 'date') {
                       XML::newXMLNode(name=z['concept.type'], 
                                  attrs=c(concept=paste('time:',z['concept'],sep='')))
                       # Otherwise
                     } else {
                       XML::newXMLNode(name=z['concept.type'], attrs=c(z['concept']))
                     }}), XML::newXMLNode('table', attrs=c(ref=paste(x$slice[1],'_table',sep=''))))
    }
    )
}

.addTables <- function(tableid, tableatt, parent, format) {
################################################################################
# Function to create and populate the tables
################################################################################
  by(data=tableatt, INDICES=tableid,FUN=
    function(x) {
      XML::newXMLNode(name='table', attrs=c(id=paste(x$slice[1],'_table',sep='')),parent=
        parent, apply(X=x, 
                      MARGIN = 1, FUN=
                        function(z){
                          if (z['type'] == 'date') {
                            XML::newXMLNode(name='column', attrs=c(z['id'], z['type'], format=format))  
                          } else {
                            XML::newXMLNode(name='column', attrs=c(z['id'], z['type']))
                          }}), XML::newXMLNode(name='data', XML::newXMLNode('file', attrs=c(format=
                            'csv', encoding='utf8'),paste(x$slice[1],'.csv',sep='')))
                 )
    }
     )
}



#' Builds Dataset Publication Language (DSPL) metadata file
#' 
#' Parsing \emph{csv}, \emph{tab} or \emph{xls(x)} files at a specific
#' directory path, dspl generates a complete DSPL file.  If an output string is
#' specified, the function generates the complete ZIP (DSPL file plus csv
#' files) ready to be uploaded to Google Public Data Explorer.
#' 
#' If there isn't any output defined the function returns a list of class
#' \code{dspl} that among its contents has a xml object (DSPL file); otherwise,
#' if an output is defined, the results consists on two things, an already ZIP
#' file containing a all the necessary to be uploaded at
#' \url{publicdata.google.com} (a collection of csv files and the XML DSPL
#' written file) and a message (character object).
#' 
#' Internally, the parsing process consists on the following steps:
#' \enumerate{
#' \item Loading the data, 
#' \item Generating each column corresponding id, 
#' \item Identifying the data types, 
#' \item Building concepts, 
#' \item Identifying dimensional concepts and distinguishing between categorical,
#' geographical and time dimensions, and
#' \item Executing internal checks.
#' }
#' 
#' In order to properly load the zip file (DSPL file plus CSV data files), the
#' function executes a series of internal checks upon the data structure. The
#' detailed list:
#' \itemize{
#' \item \strong{Slices with the same dimensions}: DSPL
#' requires that each slice represents one dimensional cut, this is, there
#' should not be more than one data table with the same dimensions.
#' \item \strong{Duplicated concepts}: As a result of multiple data types, e.g a single
#' concept (statistic) as integer in one table and float in other, \code{dspl}
#' may get confused, so during the parsing process, if there is a chance, it
#' collapses duplicated concepts into only one concept and assigns it the
#' common data type (float).
#' \item \strong{Correct time format definition}: Using \code{\link{checkTimeFormat}}
#' ensures that the time format specified is
#' compatible with DSPL.
#' }
#' 
#' @param path String. Path to the folder where the tables (csv|tab|xls) are
#' at.
#' @param output String, optional. Path to the output ZIP file.
#' @param replace Logical. If \code{output} ZIP file is defined exists,
#' \code{dspl} replaces it.
#' @param targetNamespace String. As DSPL documentation states ``Provides a URI
#' that identifies your dataset. This URI is not required to point to an actual
#' resource, but it's a good idea to have the URI resolve to a document
#' describing your content or dataset''.
#' @param timeFormat String. The corresponding time format of the collection.
#' Should be specified accordingly to joda-time format. See the Details section
#' for more information.
#' @param lang A list of strings of the languages supported by the dataset.
#' Could be only one.
#' @param name List of strings. The name of the dataset as defined accordingly
#' to the \code{lang} list.
#' @param description List of strings. Description of the dataset. It also
#' supports multiple description as the \code{name}
#' @param url The corresponding URL for the dataset.
#' @param providerName List of strings. The data provider name.
#' @param providerURL List of strings. The data provider website url.
#' @param sep The separation character of the tables in the 'path' folder.
#' Currently supports introducing the following arguments: ``,'' or ``;'' (for
#' .csv files), ``\\t'' (for .tab files) and ``xls'' or ``xlsx'' (for Microsoft's
#' excel files).
#' @param dec String. Decimal point.
#' @param encoding The char encoding of the input tables. Currently ignored for
#' Microsoft excel files.
#' @param moreinfo A special tab file generated by the function
#' \code{\link{genMoreInfo}} that contains a dataframe of the dataset concepts
#' with more specifications such as description, topic, url, etc.
#' @return If there isn't any \code{output} defined, \code{dspl} returns list
#' of \code{\link{class}} "\code{dspl}".
#' 
#' An object of class "\code{dspl}" is a list containing:
#' \item{dspl}{A character string containing the DSPL XML document as defined
#' by the \code{\link[XML:saveXML]{saveXML}} function.} 
#' \item{concepts.by.table}{A data frame object of concepts stored by table.} 
#' \item{dimtabs}{A data frame containing dimensional tables.} 
#' \item{slices}{A data frame of slices.} 
#' \item{concepts}{A data frame of concepts (all of them).}
#' \item{dimensions}{A data frame of dimensional concepts.}
#' \item{statistics}{A matrix of statistics.}
#' 
#' otherwise the function will build a ZIP file as specified in the output
#' containing the CSV and DSPL (XML) files.
#' @author George G. Vega Yon 
#' @references \itemize{ \item Google Public Data Explorer Tutorial:
#' \url{https://developers.google.com/public-data/docs/tutorial} }
#' @keywords IO
#' @examples
#' 
#' demo(dspl)
#'   
#' 
#' @export 
#' @name dspl
#' @aliases GooglePublicData
NULL

#' @export
#' @rdname dspl
dspl <- function(
################################################################################
# DESCRIPTION:
# Based on an specific folder directory, the function seeks for files that match
# the specified extension (csv, tab, xls, xlsx), reads the column names, guesses
# the datatype, analyces data structure and outputs a dspl metadata file includi
# ng the corresponding csv files.
#
# VARIABLES:
# - path: Full path to the folder where the tables are saved.
# - Output: FUll path to the output folder (pde will create a subfolder call r_dspl).
# - replace: In the case of defining output, replaces the files.
# - targetNamespace:
# - timeFormat: The corresponding time format of the collection.
# - lang: A list of the languages supported by the dataset.
# - name: The name of the dataset.
# - description: The dataset description.
# - url: The corresponding URL for the dataset.
# - providerNAme
# - providerURL
# - sep: The extension of the tables in the 'path' folder.
# - encoding: The char encoding of the input tables.
################################################################################  
  path,
  output = NA,
  replace = F,
  targetNamespace = "",
  timeFormat = "yyyy",
  lang = c("es", "en"),
  name = NA,
  description = NA,
  url = NA,
  providerName = NA,
  providerURL = NA,
  sep = ";",
  dec=".",
  encoding = getOption("encoding"),
  moreinfo = NULL
  ) {
  
  oldopt <- options()$stringsAsFactors
  options(stringsAsFactors=FALSE)
  on.exit(options(stringsAsFactors = oldopt))
  
  # Initial checks
  description <- ifelse(!is.na(description),description,'No description')
  name <- ifelse(!is.na(name),name,'No name')
  providerName <- ifelse(!is.na(providerName),providerName,'No provider')
  
  # Checking if output path is Ok
  if (!is.na(output)) {
    temp.path <- tempdir()
    .checkPath(path, "input")
  }
  else temp.path <- NA
  if (length(moreinfo)) {
    if (!inherits(moreinfo,"data.frame"))
      .checkPath(moreinfo, "input")
  }
  
  # Checking timeFormat
  timeOk <- checkTimeFormat(timeFormat)
  if (!timeOk) stop("Undefined \'joda-time\' definition ", timeFormat,
                    "\nFor more information checkout\n",
                    "http://joda-time.sourceforge.net/api-release/org/joda/time/format/DateTimeFormat.html")
  # Gets the filenames
  files <- .getFilesNames(path, sep)
    
  # Variables Lists and datatypes
  vars <- seekTables(files=files, encoding=encoding, sep=sep, 
                     output=temp.path, replace=replace, dec=dec)
  
  dims <- vars[
    with(vars, concept.type=='dimension'),
    colnames(vars) %in% c('id', 'slice', 'concept.type'), FALSE
    ]
  #subset(vars, concept.type=='dimension', select=c(id, slice, concept.type)) 
  
  # Identifying if there is any duplicated slice
  with(dims, .checkSlices(dims=id, by=slice))
  
  vars <- .checkDuplConcepts(concepts=vars)
  
  # Creates a unique concept list
  varConcepts <- unique(
    # subset(vars,subset=type != 'date' & is.dim.tab==F, select=-slice) 
    vars[
      with(vars, type!='date' & is.dim.tab==FALSE),
      colnames(vars)!='slice',FALSE
      ]
    )
  
  # Checks if there is a moreinfo file
  varConcepts <- .getMoreInfo(source=moreinfo, target=varConcepts, "")
  
  # Armado de xml
  archXML <- XML::newXMLDoc()
  dspl <- XML::newXMLNode(name='dspl', doc=archXML, attrs=c(
    targetNamespace=targetNamespace),
    namespaceDefinitions = 'http://schemas.google.com/dspl/2010')
  
  
  # Definiciones dspl
  imports <- c('quantity', 'entity', 'geo', 'time', 'unit')
  
  sapply(imports,
         function(x) {
           XML::newXMLNamespace(node=dspl, prefix=x,namespace=paste(
             'http://www.google.com/publicdata/dataset/google/',x,sep=''))
         })
  # Concepts import lines
  XML::newXMLCommentNode('Concepts imports', parent=dspl)
  
  imports <- paste(
    "http://www.google.com/publicdata/dataset/google/",
    imports, sep = '')
  
  sapply(X = imports,
         FUN = function(x) XML::newXMLNode(attrs=c(namespace=x), name = 'import',
                                      parent = dspl))
  # INFO
  XML::newXMLCommentNode('Info lines', parent=dspl)
  info <- XML::newXMLNode('info', parent = dspl)
  .addInfo('name', name, info, lang)
  .addInfo('description', description, info, lang)
  if (!is.na(url)) XML::newXMLNode('url', XML::newXMLNode('value', url), 
                              parent = info)
  
  # PROVIDER
  XML::newXMLCommentNode('Data Provider', parent=dspl)
  provider <- XML::newXMLNode('provider', parent = dspl)
  .addInfo('name', providerName, provider, lang)
  if (!is.na(providerURL)) XML::newXMLNode('url', XML::newXMLNode('value', providerURL), 
                                      parent = provider)
  
  # TOPICS
  if ("topicid" %in% colnames(varConcepts)) {
    test <- !all(is.na(varConcepts$topicid))
  }
  else {
    test <- F
  }
  if (test) {
    XML::newXMLCommentNode('Topics definition', parent=dspl)
    topics <- XML::newXMLNode('topics', parent=dspl)
    .addTopics('topic', varConcepts[c('topic', 'topicid')], topics, lang)
  }
    
  # CONCEPTS
  XML::newXMLCommentNode('Concepts Definitions', parent=dspl)
  concepts <- XML::newXMLNode('concepts', parent = dspl)
  .addConcepts(varConcepts,concepts, lang)
  
  # SLICES
  XML::newXMLCommentNode('Slices Definitions', parent=dspl)
  slices <- XML::newXMLNode('slices', parent = dspl)
  .addSlices(
    tableid  = vars[with(vars, !is.dim.tab),, FALSE][['slice']],
    sliceatt = vars[with(vars, !is.dim.tab),, FALSE], # subset(vars, is.dim.tab != T),
    parent   = slices
    )
  
  # TABLES
  XML::newXMLCommentNode('Tables Definitios', parent=dspl)
  tables <- XML::newXMLNode('tables', parent = dspl)
  .addTables(tableid=vars$slice,tableatt=vars,parent=tables, format=timeFormat)  
  
  # Building ouput
  .dimtabs  <- unique(vars[with(vars, is.dim.tab),,FALSE][['slice']]) #subset(vars, subset=is.dim.tab, select=slice))
  .slices   <- unique(vars[['slice']]) #subset(vars, select=slice))
  .concepts <- unique(vars[['label']]) #subset(vars, select=label))
  .dims <- unique(  
#     subset(
#       vars, 
#       subset=is.dim.tab & !(label %in% c('name', 'latitude','longitude','colour')),
#       select=label
#       )
    vars[ 
      with(vars, is.dim.tab & !(label %in% c('name', 'latitude','longitude','colour'))),
      colnames(vars)=='label',
      FALSE
    ]) 
  
  lapply(c(.dimtabs, .slices, .concepts, .dims), function(x) names(x) <- 'Name')
  
  pde.statistics <- matrix(
    c(
      NROW(.slices),
      NROW(.concepts),
      NROW(.dims)
    ), ncol=3)
  
  colnames(pde.statistics) <- c('slices','concepts','dimensions')
  
  result <- structure(
    .Data=
      list(
        XML::saveXML(archXML, encoding="UTF-8"), vars, .dimtabs, .slices, .concepts,
        .dims, pde.statistics
        ),
    .Names = c(
      'dspl', 'concepts.by.table', 'dimtabs', 'slices', 'concepts', 'dimensions',
      'statistics'),
    class = "dspl"
    )

  # If an output file is specified, it writes it on it
  if (is.na(output)) {
    return(result)
  
  } else {
    path <- paste(temp.path,'/metadata.xml',sep='')
    print.dspl(x=result, path=path, replace=replace, quiet=T)
    
    # Zipping the files
    tozip <- list.files(temp.path, full.names=T, pattern="csv$|xml$")
    utils::zip(output ,tozip,flags='-r9jm')
    
    message("Metadata created successfully at:\n", normalizePath(output))
  }
}

#' @export
#' @rdname dspl
new_dspl <- dspl

Try the googlePublicData package in your browser

Any scripts or data that you put into this service are public.

googlePublicData documentation built on May 2, 2019, 3:45 a.m.