Nothing
.getFilesNames <- function(path, sep=";") {
################################################################################
# DESCRIPTION:
# As a function of a path and a file extension, gets all the file names there.
################################################################################
# Checking the sep
pattern <- paste0("[.]",switch(
sep,
"\t" = "tab",
";" = "csv",
"," = "csv",
xls = "xls",
xlsx = "xlsx"), "$")
# Listing files
files <- list.files(path=path, pattern = pattern, full.names=TRUE)
nfiles <- length(files)
if (nfiles==0) {
stop(nfiles, ' files found.\nCannot\' continue')
}
else {
message(nfiles, ' files found...')
}
return(files)
}
#' Generates a dataframe used to complement a DSPL bundle
#'
#' Parsing \emph{csv}, \emph{tab} or \emph{xls(x)} files at a specific
#' directory path, genMore info generates a dataframe used to complete a DSPL
#' bundle with a more complete concepts definition including description, url,
#' etc..
#'
#' If there isn't any output defined (\code{NA}) the function returns a
#' dataframe containing concepts as observations. Using this, the user may add
#' more descriptive info about concepts. In turn it writes a tab file with the
#' dataframe described above. The user may recycle this file writing ``append''
#' in the \code{action} argument.
#'
#' @param path String. Path to the folder where the tables are saved.
#' @param encoding The encoding of the files to be parsed.
#' @param sep The separation character of the tables in the 'path' folder.
#' Currently supports introducing the following arguments: ``,'' or ``;'' (for
#' .csv files), ``\\t'' (for .tab files) and ``xls'' or ``xlsx'' (for Microsoft's
#' excel files).
#' @param output If defined, the place where to save the dataframe as tab file.
#' Otherwise it returns a data frame object.
#' @param action Tells the function what to do if there's a copy of the file.
#' Available actions are ``merge'' and ``replace''.
#' @param dec String. Decimal point.
#' @return If no \code{output} defined, \code{genMoreInfo} returns a dataframe
#' with the following columns. \item{id}{XML id of the concept
#' (autogenerated)} \item{label}{The label of the concept (autogenerated)}
#' \item{description}{A brief description of the concept} \item{topic}{The
#' topic of the concept} \item{url}{A URL for the concept where, for example,
#' to get more info} \item{totalName}{A total name as specified by DSPL language
#' (works for dimensional concepts)} \item{pluralName}{A total name as
#' specified by DSPL language (works for dimensional concepts)}
#' @author George G. Vega Yon
#' @references Google Public Data Explorer: \url{http://publicdata.google.com}
#' @keywords IO
#' @examples
#'
#' # Getting the path where all the datasets are
#' path <- system.file("dspl-tutorial", package="googlePublicData")
#' info <- genMoreInfo(path) # This is a dataframe
#'
#' # Setting the 5th concept as topic "Demographics"
#' info[5, "topic"] <- "Demographics"
#'
#' # Generating the dspl file
#' ans <- dspl(path, moreinfo = info)
#' ans
#'
#' \dontrun{
#' # Parsing some xlsx files at "my stats folder" to gen a "moreinfo" dataframe
#' INFO <- genMoreInfo(path="my stats folder/", sep="xls")
#'
#' # Rows 1 to 10 are about "Poverty" and rows 11 to 20 about "Education"
#' # So we fill the "topic" column with it.
#' INFO$topic[1:10] <- "Poverty"
#' INFO$topic[11:20] <- "Education"
#'
#' # Finally, we build the DSPL ZIP including more info
#' dspl(path="my stats folder/", sep="xls", moreinfo=INFO)
#'
#' }
#' @export
#'
genMoreInfo <- function(path, encoding=getOption("encoding"), sep=";",
output=NA, action="merge", dec=".") {
################################################################################
# DESCRIPTION:
# Reads .csv and .xls(x) files, outputs a descriptive dataframe of the data and
# builds a config file.
################################################################################
oldopt <- options()$stringsAsFactors
options(stringsAsFactors=FALSE)
on.exit(options(stringsAsFactors = oldopt))
# Checks if the path exists
.checkPath(path, "input")
# Generates the filelist acording to an specific sep
files <- .getFilesNames(path, sep)
# Reads and analices the files
x <- seekTables(files=files, encoding=encoding, sep=sep, dec=dec)
# Extracts the unique list of variables
x <- unique(
# subset(x,subset=type != 'date' & is.dim.tab==F,
# select=c(id, label))
x[
with(x, type!='date' & !is.dim.tab),
colnames(x) %in% c('id','label'),
FALSE]
)
x <- data.frame(cbind(x, description=NA, topic=NA, url=NA, totalName=NA,
pluralName=NA), stringsAsFactors=F)
# Prints the result
if (is.na(output)) {
return(x)
}
# Case of mergin
if (action == 'merge') {
target.exists <- file.exists(output)
if (target.exists) {
# Merge of the tables
x0 <- read.table(file=output, na.strings='NA', sep='\t')
x <- subset(x, !(x$id %in% x0$id))
x <- rbind(x0,x)
}
else {
warning('The file ',output,'doesn\'t exists. It will be created')
}
ERR<-try(write.table(x, file=output, quote=F, na="NA", sep='\t'))
if (class(ERR)!='try-error') {
message("DSPL Configfile written correctly at\n",normalizePath(output))
}
else {
stop("An error has occurred during the file writing at:\n",normalizePath(output))
}
}
# Case of replacing
else if (action %in% c('replace','merge')) {
ERR<-try(write.table(x, file=output, quote=F, na="NA", sep='\t'))
if (class(ERR)!='try-error') {
message("DSPL Configfile written correctly at:\n",normalizePath(output))
}
else {
stop("An error has occurred during the file writing at:\n",normalizePath(output))
}
}
}
seekTables <- function(files, encoding, sep, output = NA, replace = T, dec) {
################################################################################
# DESCRIPTION:
# Reads .csv and .xls(x) files, exports them as csv and outputs a descriptive da
# taframe. Also determinates which field is dim or metric.
################################################################################
# Timeframe metrics
metrics <- matrix(c(
'dia','day','semana','week','trimestre','quarter', 'mes','month','ano',
'year', 'year','year','month','month'), ncol = 2, byrow=T)
FUN <- function(x,y,z) {
# In the case of csv, tab
if (!(sep %in% c("xls", "xlsx"))) {
data <- utils::read.table(
x, sep=sep, strip.white=T, encoding=y, fill=T,
dec=z, header=T
)
} else {
data <- readxl::read_excel(x, col_names = T, sheet = 1)
}
cols <- colnames(data)
cols <- gsub(".", " ", cols, fixed = T)
cols <- gsub("^[[:space:]]*|[[:space:]]*$", "", cols)
cols <- gsub("[^[:graph:]][[:space:]]*"," ", cols)
colnames(data) <- cols
fnames <- gsub("\\.[[:alpha:]]*$","",x)
fnames <- gsub(".*(/|\\\\)", "", fnames)
fnames <- rep(fnames, length(cols))
# Builds descriptive matrix
var <- data.frame(
id=.cleantext(cols),
label=cols,
type=.fixType(unlist(lapply(data, typeof))),
slice=fnames
)
# Creates a new column of metric vs dimm
var <- cbind(var, concept.type='metric')
var[var[,1] %in% metrics[,1], 5] <- 'dimension' # If time
var[var[,3] %in% c('string'),5] <- 'dimension' # If string
var[var[,1] %in% c('longitud','latitud','colour'),5] <- NA # If string
var[var[,1] %in% metrics[,1], 3] <- 'date' # If date
# Identifies which dataset is a dimension dataset
var <- cbind(var, is.dim.tab = F)
if (all(var[,3] != 'date')) {var['is.dim.tab'] <- T}
# Replaces the date-time colnames for those proper to PDE
for (i in 1:NROW(metrics)) {
cols <- gsub(metrics[i,1], metrics[i,2],.cleantext(cols), fixed = T)
}
var['id'] <- cols
# In the case of output, it creates a new folder
if (!is.na(output)) {
colnames(data) <- cols
# Sorts the data acording to dimensional concepts (not date)
ord <- var[var[,5]=='dimension' & var[,3] != 'date',1]
if (length(ord)!=0) data <- data[do.call(order,data[ord]),]
# Writes the data into csv files
con <- file(paste(output,'/',var[1,4],'.csv',sep=''), encoding="UTF-8")
write.table(
x = data,
file = con,
na = '',
sep = ',',
quote = FALSE,
row.names = FALSE,
dec = '.'
)
message(
gsub(".*(/|\\\\)","",x)," analyzed correctly, ordered by ",
paste(utils::head(ord), collapse = ", "), ifelse(length(ord)>6, "...", ""),
" and exported as csv"
)
}
else {
message(x," analyzed correctly")
}
return(var)
}
# Puts it all into a single matrrx
vars <- do.call('rbind', lapply(files, FUN, y=encoding, z=dec))
# Identifies where are the correspondant tables for each dimension
vars <- cbind(vars, dim.tab.ref = NA)
for (i in 1:NROW(vars)) {
if (vars$concept.type[i] == 'dimension' & vars$type[i] != 'date' &
vars$is.dim.tab[i] != 'TRUE') {
delta <- try(vars[vars[,1]==vars[i,1] & vars[,6] == 'TRUE',4])
if (length(delta) == 0) {
stop('The dimension ', vars$label[i],' needs a code (key) tab')
} else {
vars[i,7] <- try(delta)
}
}
}
# Setting which concepts extends geo:location
geo <-vars[with(vars, is.dim.tab == TRUE), colnames(vars) %in% c("id", "slice"), FALSE]
geo <- by(
geo, geo$slice,
function(x) all(c('longitude', 'latitude') %in% unlist(x['id']))
)
geo <- names(which(geo))
vars <- cbind(vars, extends.geo=FALSE)
vars$extends.geo[which(vars$dim.tab.ref %in% geo)] <- TRUE
return(vars)
}
.getMoreInfo <- function(source,target, encoding="unknown", id='id', label='label') {
################################################################################
# Reads from a tab file generated by genMoreInfo as a complement info to concepts
################################################################################
# Partial fix (should work, need to see
# http://stackoverflow.com/questions/23475309/in-r-is-it-possible-to-suppress-note-no-visible-binding-for-global-variable)
# type <- is.dim.tab <- id <- label <- freq <- NULL
if (length(source)) {
# If a file, reads the moreinfo file
if (!inherits(source, "data.frame")) {
oldopt <- options()$stringsAsFactors
options(stringsAsFactors=FALSE)
on.exit(options(stringsAsFactors = oldopt))
source <- utils::read.table(file=source, header=TRUE, sep='\t', na.strings='NA',
encoding=encoding)
}
# Cleans up the content
source$topicid <- .cleantext(source$topic)
# Checks if there are at least moreinfo concepts as path concepts, where path
# concepts are those captured from the files at pde(path=...)
ntarget <- NROW(target)
nsource <- NROW(source)
if (ntarget < nsource) {
warning('The number of moreinfo concepts (',nsource,') vs concepts found at path (',
ntarget,') differ. Concepts at moreinfo won\'t appear at the metadata.')
}
else if (ntarget > nsource) {
stop('Can\'t continue, there are more concepts at the path than at moreinfo')
}
# Merges source and target
source <- source[which(source[[id]] %in% target$id),] # subset(source, id %in% target$id)
target$label <- source[[label]] # source$label
source <- source[,colnames(source)!=label,FALSE] #subset(source, select=-label)
target <- merge(target, source, by=c('id'))
}
return(target)
}
.fixType <- function(x) {
################################################################################
# Transforms the datatypes in function of allowed datatypes for DSPL
################################################################################
replace <- matrix(c('logical', 'integer', 'double', 'complex', 'character',
'boolean', 'integer', 'float', 'float', 'string'), ncol =2)
for (i in 1:5) x <- as.character(gsub(replace[i,1], replace[i,2],x, fixed=T))
return(x)
}
.cleantext <- function(x) {
################################################################################
# Adapts labels to IDs
################################################################################
sym <- matrix(c("$", "money", "??", "grad", "#", "n", "%", "pcent",
" ", "_", ".", "", ",", "_", ";", "_", ":", "_",
"(","_",")","_"),
ncol = 2, byrow = TRUE)
x <- tolower(x)
x <- iconv(x, to="ASCII//TRANSLIT")
for (i in 1:NROW(sym)) {
x <- gsub(sym[i,1], sym[i,2], x, fixed = TRUE)
}
# Extracts
x <-
return(x)
}
.addInfo <- function(nodename,values,parent,lang) {
################################################################################
# Function to add information and provider nodes y multiple languages
################################################################################
XML::newXMLNode(nodename, parent=parent, sapply(values, function(x) {
XML::newXMLNode('value',attrs=c('xml:lang'=lang[which(values==x)]), x,
suppressNamespaceWarning=TRUE)}))
}
.addTopics <- function(nodename, values, parent, lang) {
################################################################################
# Function to create and populate the topics
################################################################################
values <- unique(values)
values <- subset(values, !is.na(values[,1]))
colnames(values) <- c('label','id')
fun <- function(x, ...) {apply(x, MARGIN = 1,...)}
fun(values, FUN=
function(x) {
tempnode0 <- XML::newXMLNode(nodename, parent=parent, attrs=c(x['id']))
tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
x['label'], suppressNamespaceWarning=T)
}
)
}
.addConcepts <- function(val,parent,lang) {
################################################################################
# Function to create and populate the concepts
################################################################################
colnames(val)[3] <- 'ref'
colnames(val)[7] <- 'geo'
# if (NCOL(val) > 1) {
# fun <- function(x, ...) {apply(x, MARGIN = 1,...)}
# } else {
# fun <- function(x, FUN) {FUN(x)}
# }
#
# fun(val, .addConcepts_sub)
if (ncol(val)>1) apply(val,.addConcepts_sub,MARGIN = 1,parent=parent,lang=lang)
else .addConcepts(val,parent=parent,lang=lang)
}
.addConcepts_sub <- function(x,parent,lang) {
# function(x) {
x['is.dim.tab'] <- gsub(' ','',x['is.dim.tab'])
x['geo'] <- gsub(' ','',x['geo'])
if (x["ref"] == 'string') {ATT <- c(x['id'], extends='entity:entity')}
if (x['ref'] == 'string' && x['geo']=='TRUE') {ATT <- c(x['id'], extends='geo:location')}
if (x['ref'] != 'string') {ATT <- c(x['id'])}
# in the case of not being a dimensional concept
if (x["concept.type"]!="dimension") {
tempnode0 <- XML::newXMLNode('concept', attrs=ATT, parent=parent)
# Starts Adding info
tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
# Adds a description
if (!is.na(x['description'])) {
description <- as.character(x['description'])
tempnode3 <- XML::newXMLNode('description', parent=tempnode1)
XML::newXMLNode('value', parent=tempnode3, attrs=c('xml:lang'=lang[1]), description)
}
# URL node
if (!is.na(x['url'])) {
url <- as.character(x['url'])
tempnode4 <- XML::newXMLNode('url', parent=tempnode1)
XML::newXMLNode('value', parent=tempnode4, attrs=c('xml:lang'=lang[1]), url)
}
# Here should start the multilanguage loop
XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
suppressNamespaceWarning=T, x['label'])
# Adds a topic category
if (!is.na(x['topicid'])) {
topicref <- as.character(x['topicid'])
XML::newXMLNode('topic', parent=tempnode0, attrs=c(ref=topicref))
}
# Adds the data type specification
XML::newXMLNode('type', attrs=c(x['ref']), parent=tempnode0)
# in the case of being a dimensional concept
} else {
tempnode0 <- XML::newXMLNode('concept', attrs=ATT, parent=parent)
# Starts adding info
tempnode1 <- XML::newXMLNode('info', parent=tempnode0)
# Name Node
tempnode2 <- XML::newXMLNode('name', parent=tempnode1)
XML::newXMLNode('value', parent=tempnode2, attrs=c('xml:lang'=lang[1]),
suppressNamespaceWarning=T, x['label'])
# Description node
if (!is.na(x['description'])) {
description <- as.character(x['description'])
tempnode3 <- XML::newXMLNode('description', parent=tempnode1)
XML::newXMLNode('value', parent=tempnode3, attrs=c('xml:lang'=lang[1]), description)
}
# URL node
if (!is.na(x['url'])) {
url <- as.character(x['url'])
tempnode4 <- XML::newXMLNode('url', parent=tempnode1)
XML::newXMLNode('value', parent=tempnode4, attrs=c('xml:lang'=lang[1]), url)
}
# Plural name node
if (!is.na(x['totalName'])) {
pluralName <- as.character(x['pluralName'])
tempnode6 <- XML::newXMLNode('pluralName', parent=tempnode1)
XML::newXMLNode('value', parent=tempnode6, attrs=c('xml:lang'=lang[1]), pluralName)
}
# Total name node
if (!is.na(x['totalName'])) {
totalName <- as.character(x['totalName'])
tempnode5 <- XML::newXMLNode('totalName', parent=tempnode1)
XML::newXMLNode('value', parent=tempnode5, attrs=c('xml:lang'=lang[1]),totalName)
}
XML::newXMLNode('type', parent=tempnode0, attrs=c(x['ref']))
XML::newXMLNode('table', parent=tempnode0, attrs=
c(ref=paste(x['dim.tab.ref'],'_table',sep='')))
}
# }
}
.addSlices <- function(tableid, sliceatt, parent) {
################################################################################
# Function to create and populate the slices
################################################################################
colnames(sliceatt)[1] <- 'concept'
by(data=sliceatt, INDICES=tableid,FUN=
function(x) {
XML::newXMLNode(name='slice', attrs=c(id=paste(x$slice[1],'_slice',sep='')),
parent=parent, apply(x, MARGIN = 1,FUN=
function(z){
#z <- as.character(z)
# In the case of dates-time
if (z['type'] == 'date') {
XML::newXMLNode(name=z['concept.type'],
attrs=c(concept=paste('time:',z['concept'],sep='')))
# Otherwise
} else {
XML::newXMLNode(name=z['concept.type'], attrs=c(z['concept']))
}}), XML::newXMLNode('table', attrs=c(ref=paste(x$slice[1],'_table',sep=''))))
}
)
}
.addTables <- function(tableid, tableatt, parent, format) {
################################################################################
# Function to create and populate the tables
################################################################################
by(data=tableatt, INDICES=tableid,FUN=
function(x) {
XML::newXMLNode(name='table', attrs=c(id=paste(x$slice[1],'_table',sep='')),parent=
parent, apply(X=x,
MARGIN = 1, FUN=
function(z){
if (z['type'] == 'date') {
XML::newXMLNode(name='column', attrs=c(z['id'], z['type'], format=format))
} else {
XML::newXMLNode(name='column', attrs=c(z['id'], z['type']))
}}), XML::newXMLNode(name='data', XML::newXMLNode('file', attrs=c(format=
'csv', encoding='utf8'),paste(x$slice[1],'.csv',sep='')))
)
}
)
}
#' Builds Dataset Publication Language (DSPL) metadata file
#'
#' Parsing \emph{csv}, \emph{tab} or \emph{xls(x)} files at a specific
#' directory path, dspl generates a complete DSPL file. If an output string is
#' specified, the function generates the complete ZIP (DSPL file plus csv
#' files) ready to be uploaded to Google Public Data Explorer.
#'
#' If there isn't any output defined the function returns a list of class
#' \code{dspl} that among its contents has a xml object (DSPL file); otherwise,
#' if an output is defined, the results consists on two things, an already ZIP
#' file containing a all the necessary to be uploaded at
#' \url{publicdata.google.com} (a collection of csv files and the XML DSPL
#' written file) and a message (character object).
#'
#' Internally, the parsing process consists on the following steps:
#' \enumerate{
#' \item Loading the data,
#' \item Generating each column corresponding id,
#' \item Identifying the data types,
#' \item Building concepts,
#' \item Identifying dimensional concepts and distinguishing between categorical,
#' geographical and time dimensions, and
#' \item Executing internal checks.
#' }
#'
#' In order to properly load the zip file (DSPL file plus CSV data files), the
#' function executes a series of internal checks upon the data structure. The
#' detailed list:
#' \itemize{
#' \item \strong{Slices with the same dimensions}: DSPL
#' requires that each slice represents one dimensional cut, this is, there
#' should not be more than one data table with the same dimensions.
#' \item \strong{Duplicated concepts}: As a result of multiple data types, e.g a single
#' concept (statistic) as integer in one table and float in other, \code{dspl}
#' may get confused, so during the parsing process, if there is a chance, it
#' collapses duplicated concepts into only one concept and assigns it the
#' common data type (float).
#' \item \strong{Correct time format definition}: Using \code{\link{checkTimeFormat}}
#' ensures that the time format specified is
#' compatible with DSPL.
#' }
#'
#' @param path String. Path to the folder where the tables (csv|tab|xls) are
#' at.
#' @param output String, optional. Path to the output ZIP file.
#' @param replace Logical. If \code{output} ZIP file is defined exists,
#' \code{dspl} replaces it.
#' @param targetNamespace String. As DSPL documentation states ``Provides a URI
#' that identifies your dataset. This URI is not required to point to an actual
#' resource, but it's a good idea to have the URI resolve to a document
#' describing your content or dataset''.
#' @param timeFormat String. The corresponding time format of the collection.
#' Should be specified accordingly to joda-time format. See the Details section
#' for more information.
#' @param lang A list of strings of the languages supported by the dataset.
#' Could be only one.
#' @param name List of strings. The name of the dataset as defined accordingly
#' to the \code{lang} list.
#' @param description List of strings. Description of the dataset. It also
#' supports multiple description as the \code{name}
#' @param url The corresponding URL for the dataset.
#' @param providerName List of strings. The data provider name.
#' @param providerURL List of strings. The data provider website url.
#' @param sep The separation character of the tables in the 'path' folder.
#' Currently supports introducing the following arguments: ``,'' or ``;'' (for
#' .csv files), ``\\t'' (for .tab files) and ``xls'' or ``xlsx'' (for Microsoft's
#' excel files).
#' @param dec String. Decimal point.
#' @param encoding The char encoding of the input tables. Currently ignored for
#' Microsoft excel files.
#' @param moreinfo A special tab file generated by the function
#' \code{\link{genMoreInfo}} that contains a dataframe of the dataset concepts
#' with more specifications such as description, topic, url, etc.
#' @return If there isn't any \code{output} defined, \code{dspl} returns list
#' of \code{\link{class}} "\code{dspl}".
#'
#' An object of class "\code{dspl}" is a list containing:
#' \item{dspl}{A character string containing the DSPL XML document as defined
#' by the \code{\link[XML:saveXML]{saveXML}} function.}
#' \item{concepts.by.table}{A data frame object of concepts stored by table.}
#' \item{dimtabs}{A data frame containing dimensional tables.}
#' \item{slices}{A data frame of slices.}
#' \item{concepts}{A data frame of concepts (all of them).}
#' \item{dimensions}{A data frame of dimensional concepts.}
#' \item{statistics}{A matrix of statistics.}
#'
#' otherwise the function will build a ZIP file as specified in the output
#' containing the CSV and DSPL (XML) files.
#' @author George G. Vega Yon
#' @references \itemize{ \item Google Public Data Explorer Tutorial:
#' \url{https://developers.google.com/public-data/docs/tutorial} }
#' @keywords IO
#' @examples
#'
#' demo(dspl)
#'
#'
#' @export
#' @name dspl
#' @aliases GooglePublicData
NULL
#' @export
#' @rdname dspl
dspl <- function(
################################################################################
# DESCRIPTION:
# Based on an specific folder directory, the function seeks for files that match
# the specified extension (csv, tab, xls, xlsx), reads the column names, guesses
# the datatype, analyces data structure and outputs a dspl metadata file includi
# ng the corresponding csv files.
#
# VARIABLES:
# - path: Full path to the folder where the tables are saved.
# - Output: FUll path to the output folder (pde will create a subfolder call r_dspl).
# - replace: In the case of defining output, replaces the files.
# - targetNamespace:
# - timeFormat: The corresponding time format of the collection.
# - lang: A list of the languages supported by the dataset.
# - name: The name of the dataset.
# - description: The dataset description.
# - url: The corresponding URL for the dataset.
# - providerNAme
# - providerURL
# - sep: The extension of the tables in the 'path' folder.
# - encoding: The char encoding of the input tables.
################################################################################
path,
output = NA,
replace = F,
targetNamespace = "",
timeFormat = "yyyy",
lang = c("es", "en"),
name = NA,
description = NA,
url = NA,
providerName = NA,
providerURL = NA,
sep = ";",
dec=".",
encoding = getOption("encoding"),
moreinfo = NULL
) {
oldopt <- options()$stringsAsFactors
options(stringsAsFactors=FALSE)
on.exit(options(stringsAsFactors = oldopt))
# Initial checks
description <- ifelse(!is.na(description),description,'No description')
name <- ifelse(!is.na(name),name,'No name')
providerName <- ifelse(!is.na(providerName),providerName,'No provider')
# Checking if output path is Ok
if (!is.na(output)) {
temp.path <- tempdir()
.checkPath(path, "input")
}
else temp.path <- NA
if (length(moreinfo)) {
if (!inherits(moreinfo,"data.frame"))
.checkPath(moreinfo, "input")
}
# Checking timeFormat
timeOk <- checkTimeFormat(timeFormat)
if (!timeOk) stop("Undefined \'joda-time\' definition ", timeFormat,
"\nFor more information checkout\n",
"http://joda-time.sourceforge.net/api-release/org/joda/time/format/DateTimeFormat.html")
# Gets the filenames
files <- .getFilesNames(path, sep)
# Variables Lists and datatypes
vars <- seekTables(files=files, encoding=encoding, sep=sep,
output=temp.path, replace=replace, dec=dec)
dims <- vars[
with(vars, concept.type=='dimension'),
colnames(vars) %in% c('id', 'slice', 'concept.type'), FALSE
]
#subset(vars, concept.type=='dimension', select=c(id, slice, concept.type))
# Identifying if there is any duplicated slice
with(dims, .checkSlices(dims=id, by=slice))
vars <- .checkDuplConcepts(concepts=vars)
# Creates a unique concept list
varConcepts <- unique(
# subset(vars,subset=type != 'date' & is.dim.tab==F, select=-slice)
vars[
with(vars, type!='date' & is.dim.tab==FALSE),
colnames(vars)!='slice',FALSE
]
)
# Checks if there is a moreinfo file
varConcepts <- .getMoreInfo(source=moreinfo, target=varConcepts, "")
# Armado de xml
archXML <- XML::newXMLDoc()
dspl <- XML::newXMLNode(name='dspl', doc=archXML, attrs=c(
targetNamespace=targetNamespace),
namespaceDefinitions = 'http://schemas.google.com/dspl/2010')
# Definiciones dspl
imports <- c('quantity', 'entity', 'geo', 'time', 'unit')
sapply(imports,
function(x) {
XML::newXMLNamespace(node=dspl, prefix=x,namespace=paste(
'http://www.google.com/publicdata/dataset/google/',x,sep=''))
})
# Concepts import lines
XML::newXMLCommentNode('Concepts imports', parent=dspl)
imports <- paste(
"http://www.google.com/publicdata/dataset/google/",
imports, sep = '')
sapply(X = imports,
FUN = function(x) XML::newXMLNode(attrs=c(namespace=x), name = 'import',
parent = dspl))
# INFO
XML::newXMLCommentNode('Info lines', parent=dspl)
info <- XML::newXMLNode('info', parent = dspl)
.addInfo('name', name, info, lang)
.addInfo('description', description, info, lang)
if (!is.na(url)) XML::newXMLNode('url', XML::newXMLNode('value', url),
parent = info)
# PROVIDER
XML::newXMLCommentNode('Data Provider', parent=dspl)
provider <- XML::newXMLNode('provider', parent = dspl)
.addInfo('name', providerName, provider, lang)
if (!is.na(providerURL)) XML::newXMLNode('url', XML::newXMLNode('value', providerURL),
parent = provider)
# TOPICS
if ("topicid" %in% colnames(varConcepts)) {
test <- !all(is.na(varConcepts$topicid))
}
else {
test <- F
}
if (test) {
XML::newXMLCommentNode('Topics definition', parent=dspl)
topics <- XML::newXMLNode('topics', parent=dspl)
.addTopics('topic', varConcepts[c('topic', 'topicid')], topics, lang)
}
# CONCEPTS
XML::newXMLCommentNode('Concepts Definitions', parent=dspl)
concepts <- XML::newXMLNode('concepts', parent = dspl)
.addConcepts(varConcepts,concepts, lang)
# SLICES
XML::newXMLCommentNode('Slices Definitions', parent=dspl)
slices <- XML::newXMLNode('slices', parent = dspl)
.addSlices(
tableid = vars[with(vars, !is.dim.tab),, FALSE][['slice']],
sliceatt = vars[with(vars, !is.dim.tab),, FALSE], # subset(vars, is.dim.tab != T),
parent = slices
)
# TABLES
XML::newXMLCommentNode('Tables Definitios', parent=dspl)
tables <- XML::newXMLNode('tables', parent = dspl)
.addTables(tableid=vars$slice,tableatt=vars,parent=tables, format=timeFormat)
# Building ouput
.dimtabs <- unique(vars[with(vars, is.dim.tab),,FALSE][['slice']]) #subset(vars, subset=is.dim.tab, select=slice))
.slices <- unique(vars[['slice']]) #subset(vars, select=slice))
.concepts <- unique(vars[['label']]) #subset(vars, select=label))
.dims <- unique(
# subset(
# vars,
# subset=is.dim.tab & !(label %in% c('name', 'latitude','longitude','colour')),
# select=label
# )
vars[
with(vars, is.dim.tab & !(label %in% c('name', 'latitude','longitude','colour'))),
colnames(vars)=='label',
FALSE
])
lapply(c(.dimtabs, .slices, .concepts, .dims), function(x) names(x) <- 'Name')
pde.statistics <- matrix(
c(
NROW(.slices),
NROW(.concepts),
NROW(.dims)
), ncol=3)
colnames(pde.statistics) <- c('slices','concepts','dimensions')
result <- structure(
.Data=
list(
XML::saveXML(archXML, encoding="UTF-8"), vars, .dimtabs, .slices, .concepts,
.dims, pde.statistics
),
.Names = c(
'dspl', 'concepts.by.table', 'dimtabs', 'slices', 'concepts', 'dimensions',
'statistics'),
class = "dspl"
)
# If an output file is specified, it writes it on it
if (is.na(output)) {
return(result)
} else {
path <- paste(temp.path,'/metadata.xml',sep='')
print.dspl(x=result, path=path, replace=replace, quiet=T)
# Zipping the files
tozip <- list.files(temp.path, full.names=T, pattern="csv$|xml$")
utils::zip(output ,tozip,flags='-r9jm')
message("Metadata created successfully at:\n", normalizePath(output))
}
}
#' @export
#' @rdname dspl
new_dspl <- dspl
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.