.getFilesNames <- function(path, sep=";") {
################################################################################
# DESCRIPTION:
# As a function of a path and a file extension, gets all the file names there.
#
# ARGUMENTS:
#   path - directory to scan.
#   sep  - separator/extension selector: "\t" (tab files), ";" or "," (csv
#          files), "xls" or "xlsx" (excel files).
#
# RETURNS: full paths of the matching files; stops when none are found or when
# `sep` is not one of the supported values.
################################################################################
  # Maps the separator argument onto the file extension to search for
  ext <- switch(
    sep,
    "\t"  = "tab",
    ";"   = "csv",
    ","   = "csv",
    xls   = "xls",
    xlsx  = "xlsx")
  # switch() returns NULL on no match, which previously produced the
  # meaningless pattern "[.]$"; fail loudly instead.
  if (is.null(ext))
    stop("Unsupported `sep` argument: ", sep,
         "\nValid values are \"\\t\", \";\", \",\", \"xls\" and \"xlsx\".")
  pattern <- paste0("[.]", ext, "$")
  # Listing files
  files <- list.files(path=path, pattern = pattern, full.names=TRUE)
  nfiles <- length(files)
  if (nfiles == 0) {
    stop(nfiles, ' files found.\nCannot continue')
  }
  else {
    message(nfiles, ' files found...')
  }
  return(files)
}
#' Generates a dataframe used to complement a DSPL bundle
#'
#' Parsing \emph{csv}, \emph{tab} or \emph{xls(x)} files at a specific
#' directory path, genMoreInfo generates a dataframe used to complete a DSPL
#' bundle with a more complete concepts definition including description, url,
#' etc..
#'
#' If there isn't any output defined (\code{NA}) the function returns a
#' dataframe containing concepts as observations. Using this, the user may add
#' more descriptive info about concepts. In turn it writes a tab file with the
#' dataframe described above. The user may recycle this file by passing ``merge''
#' in the \code{action} argument.
#'
#' @param path String. Path to the folder where the tables are saved.
#' @param encoding The encoding of the files to be parsed.
#' @param sep The separation character of the tables in the 'path' folder.
#' Currently supports introducing the following arguments: ``,'' or ``;'' (for
#' .csv files), ``\\t'' (for .tab files) and ``xls'' or ``xlsx'' (for Microsoft's
#' excel files).
#' @param output If defined, the place where to save the dataframe as tab file.
#' Otherwise it returns a data frame object.
#' @param action Tells the function what to do if there's a copy of the file.
#' Available actions are ``merge'' and ``replace''.
#' @param dec String. Decimal point.
#' @return If no \code{output} defined, \code{genMoreInfo} returns a dataframe
#' with the following columns. \item{id}{XML id of the concept
#' (autogenerated)} \item{label}{The label of the concept (autogenerated)}
#' \item{description}{A brief description of the concept} \item{topic}{The
#' topic of the concept} \item{url}{A URL for the concept where, for example,
#' to get more info} \item{totalName}{A total name as specified by DSPL language
#' (works for dimensional concepts)} \item{pluralName}{A plural name as
#' specified by DSPL language (works for dimensional concepts)}
#' @author George G. Vega Yon
#' @references Google Public Data Explorer: \url{http://publicdata.google.com}
#' @keywords IO
#' @examples
#'
#' # Getting the path where all the datasets are
#' path <- system.file("dspl-tutorial", package="googlePublicData")
#' info <- genMoreInfo(path) # This is a dataframe
#'
#' # Setting the 5th concept as topic "Demographics"
#' info[5, "topic"] <- "Demographics"
#'
#' # Generating the dspl file
#' ans <- dspl(path, moreinfo = info)
#' ans
#'
#' \dontrun{
#' # Parsing some xlsx files at "my stats folder" to gen a "moreinfo" dataframe
#' INFO <- genMoreInfo(path="my stats folder/", sep="xls")
#'
#' # Rows 1 to 10 are about "Poverty" and rows 11 to 20 about "Education"
#' # So we fill the "topic" column with it.
#' INFO$topic[1:10] <- "Poverty"
#' INFO$topic[11:20] <- "Education"
#'
#' # Finally, we build the DSPL ZIP including more info
#' dspl(path="my stats folder/", sep="xls", moreinfo=INFO)
#'
#' }
#' @export
#'
genMoreInfo <- function(path, encoding=getOption("encoding"), sep=";",
                        output=NA, action="merge", dec=".") {
################################################################################
# DESCRIPTION:
# Reads .csv and .xls(x) files, outputs a descriptive dataframe of the data and
# builds a config file.
#
# Returns the concepts data.frame when `output` is NA; otherwise writes it as a
# tab file at `output` ('merge' keeps rows already present there, 'replace'
# overwrites the file) and returns invisibly.
################################################################################
  # Temporarily force stringsAsFactors=FALSE, restoring the option on exit
  oldopt <- options()$stringsAsFactors
  options(stringsAsFactors=FALSE)
  on.exit(options(stringsAsFactors = oldopt))
  # Checks if the path exists
  .checkPath(path, "input")
  # Generates the file list according to the specific sep
  files <- .getFilesNames(path, sep)
  # Reads and analyzes the files
  x <- seekTables(files=files, encoding=encoding, sep=sep, dec=dec)
  # Unique (id, label) pairs of every non-date, non-dimension-table concept
  x <- unique(
    x[
      with(x, type != 'date' & !is.dim.tab),
      colnames(x) %in% c('id', 'label'),
      FALSE]
  )
  # Empty descriptive columns for the user to fill in
  x <- data.frame(cbind(x, description=NA, topic=NA, url=NA, totalName=NA,
                        pluralName=NA), stringsAsFactors=FALSE)
  # With no output defined, return the data frame itself
  if (is.na(output)) {
    return(x)
  }
  # Local helper: writes `x` as a tab file at `output`, reporting the result.
  # (Previously this logic was duplicated across the merge/replace branches.)
  writeConfig <- function(x) {
    ERR <- try(write.table(x, file=output, quote=FALSE, na="NA", sep='\t'))
    if (!inherits(ERR, 'try-error')) {
      message("DSPL Configfile written correctly at:\n", normalizePath(output))
    }
    else {
      stop("An error has occurred during the file writing at:\n",
           normalizePath(output))
    }
  }
  if (action == 'merge') {
    if (file.exists(output)) {
      # Keeps the rows already present in the target file, appending only the
      # concepts whose id is new
      x0 <- read.table(file=output, na.strings='NA', sep='\t')
      x <- subset(x, !(x$id %in% x0$id))
      x <- rbind(x0, x)
    }
    else {
      warning('The file ', output, ' doesn\'t exist. It will be created')
    }
    writeConfig(x)
  }
  else if (action == 'replace') {
    # BUG FIX: this branch previously tested `action %in% c('replace','merge')`
    # where 'merge' was unreachable (already handled above)
    writeConfig(x)
  }
  else {
    # Previously an unknown action silently wrote nothing
    stop('Invalid action \'', action,
         '\'. Available actions are \'merge\' and \'replace\'.')
  }
}
# Reads every input table, optionally re-exports each one as a UTF-8 csv, and
# returns one data.frame describing every column found across all files
# (concept id, label, DSPL type, slice, metric/dimension role, dimension-table
# flags and geo flags).
#
# Arguments:
#   files    - character vector of file paths (as built by .getFilesNames).
#   encoding - character encoding passed to read.table (not used for xls/xlsx).
#   sep      - field separator, or "xls"/"xlsx" to read through readxl.
#   output   - if not NA, directory where each table is re-written as csv.
#   replace  - NOTE(review): not referenced inside this function; apparently
#              kept for interface compatibility with callers -- confirm.
#   dec      - decimal separator for read.table.
seekTables <- function(files, encoding, sep, output = NA, replace = T, dec) {
  ################################################################################
  # DESCRIPTION:
  # Reads .csv and .xls(x) files, exports them as csv and outputs a descriptive da
  # taframe. Also determinates which field is dim or metric.
  ################################################################################
  # Timeframe metrics: column 1 holds the column ids recognized as time
  # dimensions (Spanish and English), column 2 their canonical DSPL time names
  metrics <- matrix(c(
    'dia','day','semana','week','trimestre','quarter', 'mes','month','ano',
    'year', 'year','year','month','month'), ncol = 2, byrow=T)
  # Per-file worker: x = file path, y = encoding, z = decimal separator
  FUN <- function(x,y,z) {
    # In the case of csv, tab
    if (!(sep %in% c("xls", "xlsx"))) {
      data <- utils::read.table(
        x, sep=sep, strip.white=T, encoding=y, fill=T,
        dec=z, header=T
      )
    } else {
      data <- readxl::read_excel(x, col_names = T, sheet = 1)
    }
    # Normalizes column names: dots become spaces, surrounding and non-printing
    # whitespace is stripped
    cols <- colnames(data)
    cols <- gsub(".", " ", cols, fixed = T)
    cols <- gsub("^[[:space:]]*|[[:space:]]*$", "", cols)
    cols <- gsub("[^[:graph:]][[:space:]]*"," ", cols)
    colnames(data) <- cols
    # File name without extension nor directory part, used as the slice id
    fnames <- gsub("\\.[[:alpha:]]*$","",x)
    fnames <- gsub(".*(/|\\\\)", "", fnames)
    fnames <- rep(fnames, length(cols))
    # Builds descriptive matrix: one row per column of the input table
    var <- data.frame(
      id=.cleantext(cols),
      label=cols,
      type=.fixType(unlist(lapply(data, typeof))),
      slice=fnames
    )
    # Creates a new column (5) classifying each concept as metric vs dimension
    var <- cbind(var, concept.type='metric')
    var[var[,1] %in% metrics[,1], 5] <- 'dimension' # If time
    var[var[,3] %in% c('string'),5] <- 'dimension' # If string
    var[var[,1] %in% c('longitud','latitud','colour'),5] <- NA # Geo/colour helper columns: not concepts by themselves
    var[var[,1] %in% metrics[,1], 3] <- 'date' # If date
    # Identifies which dataset is a dimension dataset (no date column at all)
    var <- cbind(var, is.dim.tab = F)
    if (all(var[,3] != 'date')) {var['is.dim.tab'] <- T}
    # Replaces the date-time colnames for those proper to PDE
    for (i in 1:NROW(metrics)) {
      cols <- gsub(metrics[i,1], metrics[i,2],.cleantext(cols), fixed = T)
    }
    var['id'] <- cols
    # When an output directory is defined, re-export the table as a UTF-8 csv
    if (!is.na(output)) {
      colnames(data) <- cols
      # Sorts the data acording to dimensional concepts (not date)
      ord <- var[var[,5]=='dimension' & var[,3] != 'date',1]
      if (length(ord)!=0) data <- data[do.call(order,data[ord]),]
      # Writes the data into csv files
      con <- file(paste(output,'/',var[1,4],'.csv',sep=''), encoding="UTF-8")
      write.table(
        x = data,
        file = con,
        na = '',
        sep = ',',
        quote = FALSE,
        row.names = FALSE,
        dec = '.'
      )
      message(
        gsub(".*(/|\\\\)","",x)," analyzed correctly, ordered by ",
        paste(utils::head(ord), collapse = ", "), ifelse(length(ord)>6, "...", ""),
        " and exported as csv"
      )
    }
    else {
      message(x," analyzed correctly")
    }
    return(var)
  }
  # Stacks the per-file descriptions into a single data.frame
  vars <- do.call('rbind', lapply(files, FUN, y=encoding, z=dec))
  # For every non-date dimension column, locate the dimension (key) table that
  # defines it; column 7 (dim.tab.ref) stores the slice name of that table
  vars <- cbind(vars, dim.tab.ref = NA)
  for (i in 1:NROW(vars)) {
    if (vars$concept.type[i] == 'dimension' & vars$type[i] != 'date' &
        vars$is.dim.tab[i] != 'TRUE') {
      delta <- try(vars[vars[,1]==vars[i,1] & vars[,6] == 'TRUE',4])
      if (length(delta) == 0) {
        stop('The dimension ', vars$label[i],' needs a code (key) tab')
      } else {
        vars[i,7] <- try(delta)
      }
    }
  }
  # Setting which concepts extend geo:location: a dimension table qualifies
  # when it carries both 'longitude' and 'latitude' columns
  geo <-vars[with(vars, is.dim.tab == TRUE), colnames(vars) %in% c("id", "slice"), FALSE]
  geo <- by(
    geo, geo$slice,
    function(x) all(c('longitude', 'latitude') %in% unlist(x['id']))
  )
  geo <- names(which(geo))
  vars <- cbind(vars, extends.geo=FALSE)
  vars$extends.geo[which(vars$dim.tab.ref %in% geo)] <- TRUE
  return(vars)
}
# Complements the concepts data.frame `target` with the extra descriptive
# columns stored in a "moreinfo" table (either a data.frame or the path of a
# tab file written by genMoreInfo).
#
# Arguments:
#   source   - data.frame or tab-file path with the extended concept info;
#              NULL/empty skips the merge and returns `target` untouched.
#   target   - concepts data.frame to complement (must carry an `id` column).
#   encoding - encoding used when `source` is read from disk.
#   id,label - names of the columns in `source` holding concept id and label.
.getMoreInfo <- function(source, target, encoding = "unknown", id = 'id',
                         label = 'label') {
  ##############################################################################
  # Reads from a tab file generated by genMoreInfo as complement info of the
  # concepts captured from the data files.
  ##############################################################################
  # Nothing to merge: hand the target back unchanged
  if (!length(source)) return(target)
  # When a path was given, load the tab file into a data.frame
  if (!inherits(source, "data.frame")) {
    saved.saf <- options()$stringsAsFactors
    options(stringsAsFactors = FALSE)
    on.exit(options(stringsAsFactors = saved.saf))
    source <- utils::read.table(file = source, header = TRUE, sep = '\t',
                                na.strings = 'NA', encoding = encoding)
  }
  # Derives the topic ids from the (free text) topic labels
  source$topicid <- .cleantext(source$topic)
  # Sanity check: moreinfo must cover every concept captured from the path,
  # where path concepts are those found by pde(path=...)
  n.target <- NROW(target)
  n.source <- NROW(source)
  if (n.target < n.source) {
    warning('The number of moreinfo concepts (', n.source,
            ') vs concepts found at path (', n.target,
            ') differ. Concepts at moreinfo won\'t appear at the metadata.')
  } else if (n.target > n.source) {
    stop('Can\'t continue, there are more concepts at the path than at moreinfo')
  }
  # Keep only the moreinfo rows matching target ids, carry the labels across,
  # then merge the remaining descriptive columns by concept id
  source <- source[which(source[[id]] %in% target$id), ]
  target$label <- source[[label]]
  source <- source[, colnames(source) != label, FALSE]
  merge(target, source, by = c('id'))
}
.fixType <- function(x) {
################################################################################
# Transforms the datatypes in function of allowed datatypes for DSPL.
# Input: character vector of R typeof() results; output: same-length character
# vector with DSPL type names (unknown types pass through unchanged).
################################################################################
  # R typeof() -> DSPL type lookup (was a local named `replace`, shadowing
  # base::replace, with a hard-coded 1:5 loop bound)
  type.map <- matrix(c('logical', 'integer', 'double', 'complex', 'character',
                       'boolean', 'integer', 'float', 'float', 'string'),
                     ncol = 2)
  for (i in seq_len(nrow(type.map)))
    x <- as.character(gsub(type.map[i, 1], type.map[i, 2], x, fixed = TRUE))
  return(x)
}
.cleantext <- function(x) {
################################################################################
# Adapts labels to IDs: lowercases, transliterates to ASCII and replaces
# symbols/punctuation so the result is a safe DSPL identifier.
################################################################################
  # Symbol -> replacement table, applied in order (literal matches).
  # NOTE(review): the "??" entry looks like a mis-encoded degree-type symbol
  # (it maps to "grad") -- confirm against the original source.
  sym <- matrix(c("$", "money", "??", "grad", "#", "n", "%", "pcent",
                  " ", "_", ".", "", ",", "_", ";", "_", ":", "_",
                  "(", "_", ")", "_"),
                ncol = 2, byrow = TRUE)
  x <- tolower(x)
  x <- iconv(x, to = "ASCII//TRANSLIT")
  for (i in seq_len(NROW(sym))) {
    x <- gsub(sym[i, 1], sym[i, 2], x, fixed = TRUE)
  }
  # BUG FIX: removed a dangling `x <-` that preceded return(x); it only worked
  # by accident (parsed as `x <- return(x)`).
  return(x)
}
# Adds an information node (e.g. 'name', 'description') under `parent`, with
# one <value xml:lang="..."> child per entry of `values`.
#
# Arguments:
#   nodename - name of the node to create.
#   values   - character vector of values, one per language.
#   parent   - XML node the new node hangs from.
#   lang     - character vector of language codes, parallel to `values`.
#
# NOTE(review): lang[which(values == x)] assumes all values are distinct;
# duplicated values would select several language codes at once -- confirm.
.addInfo <- function(nodename,values,parent,lang) {
  ################################################################################
  # Function to add information and provider nodes in multiple languages
  ################################################################################
  XML::newXMLNode(nodename, parent=parent, sapply(values, function(x) {
    XML::newXMLNode('value',attrs=c('xml:lang'=lang[which(values==x)]), x,
                    suppressNamespaceWarning=TRUE)}))
}
# Creates one topic node per unique non-NA topic under `parent`, each with an
# <info><name><value> chain carrying the topic label in the first language.
#
# Arguments:
#   nodename - name of the node to create (the caller passes 'topic').
#   values   - two-column data.frame: topic label and topic id (renamed to
#              'label'/'id' below).
#   parent   - XML node the topics hang from.
#   lang     - character vector of language codes; only lang[1] is used here.
.addTopics <- function(nodename, values, parent, lang) {
  ################################################################################
  # Function to create and populate the topics
  ################################################################################
  values <- unique(values)
  values <- subset(values, !is.na(values[,1]))
  colnames(values) <- c('label','id')
  # Row-wise application over the (label, id) pairs
  fun <- function(x, ...) {apply(x, MARGIN = 1,...)}
  fun(values, FUN=
    function(x) {
      tempnode0 <- XML::newXMLNode(nodename, parent=parent, attrs=c(x['id']))
      tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
      tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
        x['label'], suppressNamespaceWarning=T)
    }
  )
}
# Creates the <concept> nodes for every concept in `val`.
#
# Arguments:
#   val    - concepts data.frame; column 3 is the data type (renamed 'ref') and
#            column 7 the extends-geo flag (renamed 'geo').
#   parent - XML node the concepts hang from.
#   lang   - character vector of language codes (lang[1] is used downstream).
.addConcepts <- function(val,parent,lang) {
  ################################################################################
  # Function to create and populate the concepts
  ################################################################################
  colnames(val)[3] <- 'ref'
  colnames(val)[7] <- 'geo'
  # One <concept> node per row. BUG FIX: the single-row fallback previously
  # called .addConcepts() itself, recursing forever; it must hand the row
  # straight to .addConcepts_sub(). Also use NCOL (safe on 1-column inputs).
  if (NCOL(val) > 1)
    apply(val, .addConcepts_sub, MARGIN = 1, parent=parent, lang=lang)
  else
    .addConcepts_sub(val, parent=parent, lang=lang)
}
# Builds the XML <concept> node for a single concept: one row of the concepts
# data.frame, received from apply() as a named character vector.
#
# Arguments:
#   x      - named character vector with at least: id, label, ref (data type),
#            concept.type, is.dim.tab, geo, and optionally description, url,
#            topicid, pluralName, totalName, dim.tab.ref.
#   parent - XML node the concept hangs from.
#   lang   - character vector of language codes; only lang[1] is used.
.addConcepts_sub <- function(x,parent,lang) {
  # apply() coerces rows to character and may pad values with spaces
  x['is.dim.tab'] <- gsub(' ','',x['is.dim.tab'])
  x['geo'] <- gsub(' ','',x['geo'])
  # Attribute set: string concepts extend entity:entity, or geo:location when
  # the geo flag is set; other types carry the id alone
  if (x["ref"] == 'string') {ATT <- c(x['id'], extends='entity:entity')}
  if (x['ref'] == 'string' && x['geo']=='TRUE') {ATT <- c(x['id'], extends='geo:location')}
  if (x['ref'] != 'string') {ATT <- c(x['id'])}
  # in the case of not being a dimensional concept
  if (x["concept.type"]!="dimension") {
    tempnode0 <- XML::newXMLNode('concept', attrs=ATT, parent=parent)
    # Starts Adding info
    tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
    tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
    # Adds a description
    if (!is.na(x['description'])) {
      description <- as.character(x['description'])
      tempnode3 <- XML::newXMLNode('description', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode3, attrs=c('xml:lang'=lang[1]), description)
    }
    # URL node
    if (!is.na(x['url'])) {
      url <- as.character(x['url'])
      tempnode4 <- XML::newXMLNode('url', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode4, attrs=c('xml:lang'=lang[1]), url)
    }
    # Here should start the multilanguage loop
    XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
      suppressNamespaceWarning=T, x['label'])
    # Adds a topic category
    if (!is.na(x['topicid'])) {
      topicref <- as.character(x['topicid'])
      XML::newXMLNode('topic', parent=tempnode0, attrs=c(ref=topicref))
    }
    # Adds the data type specification
    XML::newXMLNode('type', attrs=c(x['ref']), parent=tempnode0)
  # in the case of being a dimensional concept
  } else {
    tempnode0 <- XML::newXMLNode('concept', attrs=ATT, parent=parent)
    # Starts adding info
    tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
    # Name Node
    tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
    XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
      suppressNamespaceWarning=T, x['label'])
    # Description node
    if (!is.na(x['description'])) {
      description <- as.character(x['description'])
      tempnode3 <- XML::newXMLNode('description', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode3, attrs=c('xml:lang'=lang[1]), description)
    }
    # URL node
    if (!is.na(x['url'])) {
      url <- as.character(x['url'])
      tempnode4 <- XML::newXMLNode('url', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode4, attrs=c('xml:lang'=lang[1]), url)
    }
    # Plural name node. BUG FIX: the guard previously tested x['totalName']
    # (copy-paste from the block below) while emitting the pluralName value.
    if (!is.na(x['pluralName'])) {
      pluralName <- as.character(x['pluralName'])
      tempnode6 <- XML::newXMLNode('pluralName', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode6, attrs=c('xml:lang'=lang[1]), pluralName)
    }
    # Total name node
    if (!is.na(x['totalName'])) {
      totalName <- as.character(x['totalName'])
      tempnode5 <- XML::newXMLNode('totalName', parent=tempnode1)
      XML::newXMLNode('value', parent=tempnode5, attrs=c('xml:lang'=lang[1]),totalName)
    }
    # Dimensional concepts declare their type and backing (key) table
    XML::newXMLNode('type', parent=tempnode0, attrs=c(x['ref']))
    XML::newXMLNode('table', parent=tempnode0, attrs=
      c(ref=paste(x['dim.tab.ref'],'_table',sep='')))
  }
}
# Creates one <slice> node per data table (grouping `sliceatt` by `tableid`),
# declaring each column as a dimension or metric and referencing the backing
# <table> node.
#
# Arguments:
#   tableid  - vector of table/slice ids used to split `sliceatt`.
#   sliceatt - data.frame of column attributes; column 1 holds the concept id.
#   parent   - XML node the slices hang from.
.addSlices <- function(tableid, sliceatt, parent) {
  ################################################################################
  # Function to create and populate the slices
  ################################################################################
  colnames(sliceatt)[1] <- 'concept'
  by(data=sliceatt, INDICES=tableid,FUN=
    function(x) {
      XML::newXMLNode(name='slice', attrs=c(id=paste(x$slice[1],'_slice',sep='')),
        parent=parent, apply(x, MARGIN = 1,FUN=
          function(z){
            #z <- as.character(z)
            # Date columns are referenced through the time: namespace
            if (z['type'] == 'date') {
              XML::newXMLNode(name=z['concept.type'],
                attrs=c(concept=paste('time:',z['concept'],sep='')))
            # Otherwise
            } else {
              XML::newXMLNode(name=z['concept.type'], attrs=c(z['concept']))
            }}), XML::newXMLNode('table', attrs=c(ref=paste(x$slice[1],'_table',sep=''))))
    }
  )
}
# Creates one <table> node per data table (grouping `tableatt` by `tableid`):
# a <column> per variable (date columns additionally carry the time format)
# plus a <data><file> node pointing to the exported csv.
#
# Arguments:
#   tableid  - vector of table ids used to split `tableatt`.
#   tableatt - data.frame of column attributes (id, type, slice, ...).
#   parent   - XML node the tables hang from.
#   format   - joda-time format string applied to date columns.
.addTables <- function(tableid, tableatt, parent, format) {
  ################################################################################
  # Function to create and populate the tables
  ################################################################################
  by(data=tableatt, INDICES=tableid,FUN=
    function(x) {
      XML::newXMLNode(name='table', attrs=c(id=paste(x$slice[1],'_table',sep='')),parent=
        parent, apply(X=x,
          MARGIN = 1, FUN=
          function(z){
            if (z['type'] == 'date') {
              XML::newXMLNode(name='column', attrs=c(z['id'], z['type'], format=format))
            } else {
              XML::newXMLNode(name='column', attrs=c(z['id'], z['type']))
            }}), XML::newXMLNode(name='data', XML::newXMLNode('file', attrs=c(format=
              'csv', encoding='utf8'),paste(x$slice[1],'.csv',sep='')))
      )
    }
  )
}
#' Builds Dataset Publication Language (DSPL) metadata file
#'
#' Parsing \emph{csv}, \emph{tab} or \emph{xls(x)} files at a specific
#' directory path, dspl generates a complete DSPL file. If an output string is
#' specified, the function generates the complete ZIP (DSPL file plus csv
#' files) ready to be uploaded to Google Public Data Explorer.
#'
#' If there isn't any output defined the function returns a list of class
#' \code{dspl} that among its contents has a xml object (DSPL file); otherwise,
#' if an output is defined, the results consists on two things, an already ZIP
#' file containing all that is necessary to be uploaded at
#' \url{publicdata.google.com} (a collection of csv files and the XML DSPL
#' written file) and a message (character object).
#'
#' Internally, the parsing process consists on the following steps:
#' \enumerate{
#' \item Loading the data,
#' \item Generating each column corresponding id,
#' \item Identifying the data types,
#' \item Building concepts,
#' \item Identifying dimensional concepts and distinguishing between categorical,
#' geographical and time dimensions, and
#' \item Executing internal checks.
#' }
#'
#' In order to properly load the zip file (DSPL file plus CSV data files), the
#' function executes a series of internal checks upon the data structure. The
#' detailed list:
#' \itemize{
#' \item \strong{Slices with the same dimensions}: DSPL
#' requires that each slice represents one dimensional cut, this is, there
#' should not be more than one data table with the same dimensions.
#' \item \strong{Duplicated concepts}: As a result of multiple data types, e.g a single
#' concept (statistic) as integer in one table and float in other, \code{dspl}
#' may get confused, so during the parsing process, if there is a chance, it
#' collapses duplicated concepts into only one concept and assigns it the
#' common data type (float).
#' \item \strong{Correct time format definition}: Using \code{\link{checkTimeFormat}}
#' ensures that the time format specified is
#' compatible with DSPL.
#' }
#'
#' @param path String. Path to the folder where the tables (csv|tab|xls) are
#' at.
#' @param output String, optional. Path to the output ZIP file.
#' @param replace Logical. If the \code{output} ZIP file is defined and already
#' exists, \code{dspl} replaces it.
#' @param targetNamespace String. As DSPL documentation states ``Provides a URI
#' that identifies your dataset. This URI is not required to point to an actual
#' resource, but it's a good idea to have the URI resolve to a document
#' describing your content or dataset''.
#' @param timeFormat String. The corresponding time format of the collection.
#' Should be specified accordingly to joda-time format. See the Details section
#' for more information.
#' @param lang A list of strings of the languages supported by the dataset.
#' Could be only one.
#' @param name List of strings. The name of the dataset as defined accordingly
#' to the \code{lang} list.
#' @param description List of strings. Description of the dataset. It also
#' supports multiple description as the \code{name}
#' @param url The corresponding URL for the dataset.
#' @param providerName List of strings. The data provider name.
#' @param providerURL List of strings. The data provider website url.
#' @param sep The separation character of the tables in the 'path' folder.
#' Currently supports introducing the following arguments: ``,'' or ``;'' (for
#' .csv files), ``\\t'' (for .tab files) and ``xls'' or ``xlsx'' (for Microsoft's
#' excel files).
#' @param dec String. Decimal point.
#' @param encoding The char encoding of the input tables. Currently ignored for
#' Microsoft excel files.
#' @param moreinfo A special tab file generated by the function
#' \code{\link{genMoreInfo}} that contains a dataframe of the dataset concepts
#' with more specifications such as description, topic, url, etc.
#' @return If there isn't any \code{output} defined, \code{dspl} returns list
#' of \code{\link{class}} "\code{dspl}".
#'
#' An object of class "\code{dspl}" is a list containing:
#' \item{dspl}{A character string containing the DSPL XML document as defined
#' by the \code{\link[XML:saveXML]{saveXML}} function.}
#' \item{concepts.by.table}{A data frame object of concepts stored by table.}
#' \item{dimtabs}{A data frame containing dimensional tables.}
#' \item{slices}{A data frame of slices.}
#' \item{concepts}{A data frame of concepts (all of them).}
#' \item{dimensions}{A data frame of dimensional concepts.}
#' \item{statistics}{A matrix of statistics.}
#'
#' otherwise the function will build a ZIP file as specified in the output
#' containing the CSV and DSPL (XML) files.
#' @author George G. Vega Yon
#' @references \itemize{ \item Google Public Data Explorer Tutorial:
#' \url{https://developers.google.com/public-data/docs/tutorial} }
#' @keywords IO
#' @examples
#'
#' demo(dspl)
#'
#'
#' @export
#' @name dspl
#' @aliases GooglePublicData
NULL
#' @export
#' @rdname dspl
dspl <- function(
################################################################################
# DESCRIPTION:
# Based on a specific folder directory, the function seeks for files that match
# the specified extension (csv, tab, xls, xlsx), reads the column names, guesses
# the datatype, analyzes the data structure and outputs a dspl metadata file
# including the corresponding csv files.
#
# VARIABLES:
# - path: Full path to the folder where the tables are saved.
# - output: Full path to the output ZIP file (csv files are staged in tempdir).
# - replace: In the case of defining output, replaces the files.
# - targetNamespace: URI identifying the dataset (see the DSPL documentation).
# - timeFormat: The corresponding time format of the collection (joda-time).
# - lang: A list of the languages supported by the dataset.
# - name: The name of the dataset.
# - description: The dataset description.
# - url: The corresponding URL for the dataset.
# - providerName: The data provider name.
# - providerURL: The data provider website URL.
# - sep: The separator/extension of the tables in the 'path' folder.
# - encoding: The char encoding of the input tables.
# - moreinfo: Optional data.frame or tab file written by genMoreInfo().
################################################################################
  path,
  output = NA,
  replace = F,
  targetNamespace = "",
  timeFormat = "yyyy",
  lang = c("es", "en"),
  name = NA,
  description = NA,
  url = NA,
  providerName = NA,
  providerURL = NA,
  sep = ";",
  dec=".",
  encoding = getOption("encoding"),
  moreinfo = NULL
  ) {
  # Temporarily force stringsAsFactors=FALSE, restoring the option on exit
  oldopt <- options()$stringsAsFactors
  options(stringsAsFactors=FALSE)
  on.exit(options(stringsAsFactors = oldopt))
  # Initial checks: fall back to placeholder metadata when not supplied
  description <- ifelse(!is.na(description),description,'No description')
  name <- ifelse(!is.na(name),name,'No name')
  providerName <- ifelse(!is.na(providerName),providerName,'No provider')
  # Checking if output path is Ok; csv files are staged in a temp dir first
  if (!is.na(output)) {
    temp.path <- tempdir()
    .checkPath(path, "input")
  }
  else temp.path <- NA
  # moreinfo may be a data.frame or a path to a tab file
  if (length(moreinfo)) {
    if (!inherits(moreinfo,"data.frame"))
      .checkPath(moreinfo, "input")
  }
  # Checking timeFormat against the joda-time specification
  timeOk <- checkTimeFormat(timeFormat)
  if (!timeOk) stop("Undefined \'joda-time\' definition ", timeFormat,
    "\nFor more information checkout\n",
    "http://joda-time.sourceforge.net/api-release/org/joda/time/format/DateTimeFormat.html")
  # Gets the filenames
  files <- .getFilesNames(path, sep)
  # Variables Lists and datatypes (also stages csv exports when output is set)
  vars <- seekTables(files=files, encoding=encoding, sep=sep,
    output=temp.path, replace=replace, dec=dec)
  dims <- vars[
    with(vars, concept.type=='dimension'),
    colnames(vars) %in% c('id', 'slice', 'concept.type'), FALSE
  ]
  #subset(vars, concept.type=='dimension', select=c(id, slice, concept.type))
  # Identifying if there is any duplicated slice
  with(dims, .checkSlices(dims=id, by=slice))
  vars <- .checkDuplConcepts(concepts=vars)
  # Creates a unique concept list (dates and dimension tables excluded)
  varConcepts <- unique(
    # subset(vars,subset=type != 'date' & is.dim.tab==F, select=-slice)
    vars[
      with(vars, type!='date' & is.dim.tab==FALSE),
      colnames(vars)!='slice',FALSE
    ]
  )
  # Checks if there is a moreinfo file
  varConcepts <- .getMoreInfo(source=moreinfo, target=varConcepts, "")
  # XML document assembly
  archXML <- XML::newXMLDoc()
  dspl <- XML::newXMLNode(name='dspl', doc=archXML, attrs=c(
    targetNamespace=targetNamespace),
    namespaceDefinitions = 'http://schemas.google.com/dspl/2010')
  # Standard Google DSPL namespace definitions
  imports <- c('quantity', 'entity', 'geo', 'time', 'unit')
  sapply(imports,
    function(x) {
      XML::newXMLNamespace(node=dspl, prefix=x,namespace=paste(
        'http://www.google.com/publicdata/dataset/google/',x,sep=''))
    })
  # Concepts import lines
  XML::newXMLCommentNode('Concepts imports', parent=dspl)
  imports <- paste(
    "http://www.google.com/publicdata/dataset/google/",
    imports, sep = '')
  sapply(X = imports,
    FUN = function(x) XML::newXMLNode(attrs=c(namespace=x), name = 'import',
      parent = dspl))
  # INFO
  XML::newXMLCommentNode('Info lines', parent=dspl)
  info <- XML::newXMLNode('info', parent = dspl)
  .addInfo('name', name, info, lang)
  .addInfo('description', description, info, lang)
  if (!is.na(url)) XML::newXMLNode('url', XML::newXMLNode('value', url),
    parent = info)
  # PROVIDER
  XML::newXMLCommentNode('Data Provider', parent=dspl)
  provider <- XML::newXMLNode('provider', parent = dspl)
  .addInfo('name', providerName, provider, lang)
  if (!is.na(providerURL)) XML::newXMLNode('url', XML::newXMLNode('value', providerURL),
    parent = provider)
  # TOPICS (only when moreinfo supplied at least one non-NA topic)
  if ("topicid" %in% colnames(varConcepts)) {
    test <- !all(is.na(varConcepts$topicid))
  }
  else {
    test <- F
  }
  if (test) {
    XML::newXMLCommentNode('Topics definition', parent=dspl)
    topics <- XML::newXMLNode('topics', parent=dspl)
    .addTopics('topic', varConcepts[c('topic', 'topicid')], topics, lang)
  }
  # CONCEPTS
  XML::newXMLCommentNode('Concepts Definitions', parent=dspl)
  concepts <- XML::newXMLNode('concepts', parent = dspl)
  .addConcepts(varConcepts,concepts, lang)
  # SLICES (dimension tables excluded)
  XML::newXMLCommentNode('Slices Definitions', parent=dspl)
  slices <- XML::newXMLNode('slices', parent = dspl)
  .addSlices(
    tableid = vars[with(vars, !is.dim.tab),, FALSE][['slice']],
    sliceatt = vars[with(vars, !is.dim.tab),, FALSE], # subset(vars, is.dim.tab != T),
    parent = slices
  )
  # TABLES
  XML::newXMLCommentNode('Tables Definitios', parent=dspl)
  tables <- XML::newXMLNode('tables', parent = dspl)
  .addTables(tableid=vars$slice,tableatt=vars,parent=tables, format=timeFormat)
  # Building the output summaries returned alongside the XML
  .dimtabs <- unique(vars[with(vars, is.dim.tab),,FALSE][['slice']]) #subset(vars, subset=is.dim.tab, select=slice))
  .slices <- unique(vars[['slice']]) #subset(vars, select=slice))
  .concepts <- unique(vars[['label']]) #subset(vars, select=label))
  .dims <- unique(
    vars[
      with(vars, is.dim.tab & !(label %in% c('name', 'latitude','longitude','colour'))),
      colnames(vars)=='label',
      FALSE
    ])
  # NOTE(review): this lapply() assigns names inside the anonymous function's
  # scope only, so it has no visible effect on the objects -- confirm intent.
  lapply(c(.dimtabs, .slices, .concepts, .dims), function(x) names(x) <- 'Name')
  pde.statistics <- matrix(
    c(
      NROW(.slices),
      NROW(.concepts),
      NROW(.dims)
    ), ncol=3)
  colnames(pde.statistics) <- c('slices','concepts','dimensions')
  result <- structure(
    .Data=
      list(
        XML::saveXML(archXML, encoding="UTF-8"), vars, .dimtabs, .slices, .concepts,
        .dims, pde.statistics
      ),
    .Names = c(
      'dspl', 'concepts.by.table', 'dimtabs', 'slices', 'concepts', 'dimensions',
      'statistics'),
    class = "dspl"
  )
  # If an output file is specified, it writes it on it
  if (is.na(output)) {
    return(result)
  } else {
    path <- paste(temp.path,'/metadata.xml',sep='')
    print.dspl(x=result, path=path, replace=replace, quiet=T)
    # Zipping the staged csv + xml files into the requested output ZIP
    tozip <- list.files(temp.path, full.names=T, pattern="csv$|xml$")
    utils::zip(output ,tozip,flags='-r9jm')
    message("Metadata created successfully at:\n", normalizePath(output))
  }
}
# Backwards-compatible alias: `new_dspl()` is identical to `dspl()`.
#' @export
#' @rdname dspl
new_dspl <- dspl
# NOTE(review): removed stray website-embed footer text ("Add the following
# code to your website... Embedding Snippets.") that was accidentally scraped
# into this source file; it was not valid R code.