R/xml2relational.r

Defines functions savetofiles getInsertSQL getCreateSQL infer.datatype is.nullable convertible.enum convertible.double convertible.num convertible.datetime check.datetimeformats toRelational parseXMLNode find.object serial.xml serial.df create.id get.df

Documented in getCreateSQL getInsertSQL savetofiles toRelational

#' @title Package 'xml2relational'
#'
#' @description Transforming a hierarchical XML document into a relational data
#'   model.
#'
#' @section What is \code{xml2relational}:
#' The \code{xml2relational} package is
#'   designed to 'flatten' XML documents with nested objects into relational
#'   dataframes. \code{xml2relational} takes an XML file as input and converts
#'   it into a set of dataframes (tables). The tables are linked among each
#'   other with foreign keys and can be exported as CSV or ready-to-use SQL code
#'   (\code{CREATE TABLE} for the data model, \code{INSERT INTO} for the data).
#'
#'
#' @section How to use \code{xml2relational}:
#' \itemize{ \item First, use
#'   \code{\link{toRelational}()} to read in an XML file and to convert into a
#'   relational data model. \item This will give you a list of dataframes, one
#'   for each table in the relational data model. Tables are linked by foreign
#'   keys. You can specify the naming convention for the tables' primary and
#'   foreign keys as arguments in \code{\link{toRelational}()}. \item You can
#'   now export the data structures of the tables (or a selection of tables)
#'   using  \code{\link{getCreateSQL}()}. It support multiple SQL dialects, and
#'   you also provide syntax and data type information for additional SQL
#'   dialects. \item You can also export the data as SQL \code{INSERT}
#'   statements with the \code{\link{getInsertSQL}()}. If you only want to
#'   export the data as CSV use \code{\link{savetofiles}()} to save the
#'   dataframes produced by \code{\link{toRelational}()} as comma-separated
#'   files.
#'   }
#'@name xml2relational
NULL



get.df <- function(l, table.name) {
  if(table.name %in% names(l)) {
    return(which(names(l)==table.name))
  }
  else return(NULL)
}


# check.all: Unique ID accross all tables or only in relation to current table?
# table.name: Table for which ID is generated
create.id <- function(l, table.name, check.all = TRUE, prefix.primary = "ID_", keys.dim = 6) {
  id <- NULL
  df.index <- get.df(l, table.name)
  if(df.index) {
    repeat {
      id <- round(stats::runif(1, 1, 10^keys.dim-1),0)
      if(check.all == TRUE) to.check <- names(l)
      else to.check <- table.name
      found <- FALSE
      for(i in 1:NROW(to.check)){
        if(id %in% l[[which(names(l)==to.check[i])]][, paste0(prefix.primary, to.check[i])]) found <- TRUE
      }
      if(found == FALSE) break
    }
  }
  return(id)
}


serial.df <- function(l, elem.df, df.name, record, prefix.primary, prefix.foreign) {
  serial <- c()
  elem.df <- data.frame(lapply(elem.df, as.character), stringsAsFactors = FALSE)
  for(i in 1:NCOL(elem.df)) {
    if(stringr::str_sub(names(elem.df)[i], 1, nchar(prefix.foreign)) == prefix.foreign) {
      table.name <- stringr::str_replace(names(elem.df)[i], prefix.foreign, "")
      df.sub <- l[[get.df(l, table.name)]]
      if(is.null(df.sub)) {
        return(NA)
      }
      else {
        if(!is.na(elem.df[record, i])) {
          serial <- append(serial, tidyr::replace_na(
            serial.df(l, df.sub, table.name, which(df.sub[, paste0(prefix.primary, table.name)] == elem.df[record, i]), prefix.primary, prefix.foreign),"0"))
        }
      }
    }
    else {
      if(stringr::str_sub(names(elem.df)[i], 1, nchar(prefix.primary)) != prefix.primary) {
        serial <- append(serial, tidyr::replace_na(elem.df[record, i], "0"))
        names(serial)[NROW(serial)] <- paste0(df.name, "@", names(elem.df)[i])
      }
    }
  }
  if(NROW(serial) > 0) {
    return(tidyr::replace_na(serial[order(names(serial))],"0"))
  }
  else return(NA)
}


serial.xml <- function(obj) {
  serial <- c()
  chdr <- xml2::xml_children(obj)
  if(length(chdr) > 0) {
    for(i in 1:length(chdr)) {
      if(length(xml2::xml_children(chdr[i])) > 0) serial <- append(serial, serial.xml(chdr[i]))
      else {
        ctn <- as.character(xml2::xml_contents(chdr[i]))
        if(identical(ctn, character(0))) ctn <- NA
        serial <- append(serial, tidyr::replace_na(ctn, "0"))
        names(serial)[NROW(serial)] <- paste0(xml2::xml_name(obj), "@", xml2::xml_name(chdr[i]))
      }
    }
  }
  return(serial[order(names(serial))])
}


find.object <- function(l, obj, prefix.primary, prefix.foreign) {
  ex <- NULL
  elem <- get.df(l, xml2::xml_name(obj))
  if(is.null(elem)) ex <- NULL
  else {
    elem.df <- l[[elem]]
    for(i in 1:NROW(elem.df)) {
      res.df <- serial.df(l, elem.df, xml2::xml_name(obj), i, prefix.primary, prefix.foreign)
      res.xml <- serial.xml(obj)
      if(NROW(res.df) == NROW(res.xml)) {
        if(sum(tidyr::replace_na(res.df, "0") == tidyr::replace_na(res.xml, "0")) - NROW(res.df) == 0) {
          ex <- elem.df[i,paste0(prefix.primary,xml2::xml_name(obj))]
          break
        }
      }
    }
  }
  return(ex)
}


# Mögliche Parameter: Prefix für IDs, IDs unique über alle Tabellen (check.all), Länge der Primärschlüssel
parseXMLNode <- function(parent, envir, first = FALSE, prefix.primary, prefix.foreign, keys.unique, keys.dim) {
  if(first == TRUE) {
    xml2relational <- new.env(parent = baseenv())
    rlang::env_bind(xml2relational, ldf=list())
    parseXMLNode(parent, xml2relational, FALSE, prefix.primary, prefix.foreign, keys.unique, keys.dim)
  }
  else {
    obj.name <- xml2::xml_name(parent)
    chdr <- xml2::xml_children(parent)
    # Does parent have children, i.e. is parent an object?
    if(length(chdr) > 0) {
      elem <- get.df(envir$ldf, obj.name)
      # Is there no dataframe for the parent?
      if(is.null(elem)) {
        # Create new dataframe
        envir$ldf[[length(envir$ldf)+1]] <- data.frame()
        names(envir$ldf)[length(envir$ldf)] <- obj.name
        # Create record in dataframe
        id.name <- paste0(prefix.primary, obj.name)
        envir$ldf[[length(envir$ldf)]][1,id.name] <- 0
        id.value <- create.id(envir$ldf, obj.name, keys.unique, prefix.primary)
        envir$ldf[[length(envir$ldf)]][1,id.name] <- id.value
        for(i in 1:length(chdr)) {
          if(length(xml2::xml_children(chdr[i])) > 0) envir$ldf[[get.df(envir$ldf, obj.name)]][1, paste0(prefix.foreign, xml2::xml_name(chdr[i]))] <- parseXMLNode(chdr[i], envir, FALSE, prefix.primary, prefix.foreign, keys.unique, keys.dim)$value
          else envir$ldf[[get.df(envir$ldf, obj.name)]][1, xml2::xml_name(chdr[i])] <- parseXMLNode(chdr[i], envir, FALSE, prefix.primary, prefix.foreign, keys.unique, keys.dim)$value
        }
        return(list(ldf=envir$ldf, value=id.value))
      }
      # dataframe for the object exists already
      else {
        res <- find.object(envir$ldf, parent, prefix.primary, prefix.foreign)
        elem <- get.df(envir$ldf, obj.name)
        # Is parent not yet captured in dataframe?
        if(is.null(res)) {
          id.name <- paste0(prefix.primary, obj.name)
          id.value <- create.id(envir$ldf, obj.name, TRUE, prefix.primary, keys.dim)
          new.index <- NROW(envir$ldf[[elem]]) + 1
          envir$ldf[[elem]][new.index,id.name] <- id.value
          for(i in 1:length(chdr)) {
            if(length(xml2::xml_children(chdr[i])) > 0) envir$ldf[[get.df(envir$ldf, obj.name)]][new.index, paste0(prefix.foreign, xml2::xml_name(chdr[i]))] <- parseXMLNode(chdr[i], envir, FALSE, prefix.primary, prefix.foreign, keys.unique, keys.dim)$value
            else envir$ldf[[get.df(envir$ldf, obj.name)]][new.index, xml2::xml_name(chdr[i])] <- parseXMLNode(chdr[i], envir, FALSE, prefix.primary, prefix.foreign, keys.unique, keys.dim)$value
          }
          return(list(ldf=envir$ldf, value=id.value))
        }
        # Return ID of existing parent entry in dataframe
        else return(list(ldf=envir$ldf, value=res))
      }
    }
    # parent is not an object
    else {
      res <- as.character(xml2::xml_contents(parent))
      if(length(res) > 0) return(list(ldf=envir$ldf, value=res))
      else return(list(ldf=envir$ldf, value=NA))
    }
  }
}


#' @title Converting an XML document into a relational data model
#'
#' @description Imports  an XML document and converts it into a set of
#'   dataframes each of which represents one table in the data model.
#'
#' @param file The XML document to be processed.
#' @param prefix.primary A prefix for the tables' primary keys (unique numeric
#'   identifier for a data record/row in the table) . Default is \code{"ID_"}.
#'   The primary key field name will consist of the prefix and the table name.
#' @param prefix.foreign A prefix for the tables' foreign keys (). Default is
#'   \code{"FKID_"}. The rest of the foreign key field name will consist of the
#'   prefix and the table name.
#' @param keys.unique Defines if the primary keys must be unique across all
#'   tables of the data model or only within the table of which it is the
#'   primary key. Default is \code{TRUE} (unique across all tables).
#' @param keys.dim Size of the 'key space' reserved for primary keys. Argument
#'   is a power of ten. Default is \code{6} which means the namespace for
#'   primary keys extends from \code{1} to \code{1 million}.
#'
#'
#' @details \code{toRelational()} converts the hierarchical XML structure into a
#'   flat tabular structure with one dataframe for each table in the data model.
#'   \code{toRelational()} determines automatically which XML elements need to
#'   be stored in a separate table. The relationship between the nested objects
#'   in the XML data is recreated in the dataframes with combinations of foreign
#'   and primary keys. The foreign keys refer to the primary keys that
#'   \code{toRelational()} creates automatically when adding XML elements to a
#'   table.
#'   \tabular{llll}{ Column \tab Type \tab Description \tab Example \cr
#'   \code{Style} \tab \code{character} \tab Name of the SQL flavor. \tab
#'   \code{"MySQL"}  \cr \code{NormalField} \tab \code{character} \tab Template
#'   string for a normal, nullable field. \tab \code{"\%FIELDNAME\% \%DATATYPE\%"}
#'   \cr \code{NormalFieldNotNull} \tab \code{character} \tab Template string
#'   for non-nullable field. \tab \code{"\%FIELDNAME\% \%DATATYPE\% NOT NULL"} \cr
#'   \code{PrimaryKey} \tab \code{character} \tab Template string for the
#'   definition of a primary key. \tab \code{"PRIMARY KEY (\%FIELDNAME\%)"} \cr
#'   \code{ForeignKey} \tab \code{character} \tab Template string for the
#'   definition of a foreign key. \tab \code{"FOREIGN KEY (\%FIELDNAME\%) REFERENCES
#'   \%REFTABLE\%(\%REFPRIMARYKEY\%)"}  \cr \code{PrimaryKeyDefSeparate} \tab
#'   \code{logical} \tab Indicates if primary key needs additional definition
#'   like a any other field.  \tab \code{TRUE}  \cr \code{ForeignKeyDefSeparate}
#'   \tab \code{logical} \tab Indicates if foreign key needs additional
#'   definition like a any other field. \tab \code{TRUE} \cr \code{Int} \tab
#'   \tab \code{character} \tab Name of integer data type. \code{"INT"}  \cr
#'   \code{Int.MaxSize} \tab \code{numeric} \tab Size limit of integer data
#'   type.  \tab \code{4294967295}  \cr \code{BigInt} \tab \code{character} \tab
#'   Name of data type for integers larger than the size limit of the normal
#'   integer data type. \tab \code{"BIGINT"} \cr \code{Decimal} \tab
#'   \code{character} \tab Name of data type for floating point numbers. \tab
#'   \code{"DECIMAL"}  \cr \code{VarChar} \tab \code{character} \tab Name of
#'   data type for variable-size character fields. \tab \code{"VARCHAR"}  \cr
#'   \code{VarChar.MaxSize} \tab \code{numeric} \tab Size limit of variable-size
#'   character data type.\tab \code{65535} \cr \code{Text} \tab \code{character}
#'   \tab Name of data type for string data larger than the size limit of the
#'   variable-size character data type. \tab \code{"TEXT"} \cr \code{Date}
#'   \tab \code{character} \tab Name of data type date data. \tab \code{"DATE"}
#'   \cr \code{Time} \tab \code{character} \tab Name of data type time data \tab
#'   \code{"TIME"} \cr \code{Date} \tab \code{character} \tab Name of data
#'   type for combined date and time data. \tab \code{"TIMESTAMP"}  \cr  }
#'
#'   In the template strings you can use the following placeholders, as you also
#'   see from the MySQL example in the table: \enumerate{ \item
#'   \code{\%FIELDNAME\%}: Name of the field to be defined. \item
#'   \code{\%DATATYPE\%}: Datatype of the field to be defined. \item
#'   \code{\%REFTABLE\%}: Table referenced by a foreign key. \item
#'   \code{\%REFPRIMARYKEY\%}: Name of the primary key field of the table
#'   referenced by a foreign key. } When you use your own defintion of an SQL
#'   flavor, then \code{sql.style} must be a one-row dataframe providing the
#'   fields described in the table above.
#'
#'   You can use the \code{datatype.func} argument to provide your own function
#'   to determine how the data type of a field is derived from the values in
#'   that field. In this case, the values of the columns \code{Int},
#'   \code{Int.MaxSize}, \code{VarChar}, \code{VarChar.MaxSize}, \code{Decimal}
#'   and \code{Text} in the \code{sql.style} dataframe are ignored. They are
#'   used by the built-in mechanism to determine data types. Providing your own
#'   function allows you to determine data types in a more differentiated way,
#'   if you like. The function that is provided needs to take a vectors of
#'   values as its argument and needs to provide the SQL data type of these
#'   values as a one-element character vector.
#'
#'
#' @return A list of standard R dataframes, one for each table of the data model. The
#'   tables are named for the elements in the XML document.
#'
#'
#' @examples
#'
#' # Find path to custmers.xml example file in package directory
#' path <- system.file("", "customers.xml", package = "xml2relational")
#' db <- toRelational(path)
#'
#' @family xml2relational
#'
#'
#' @export
toRelational <- function(file, prefix.primary = "ID_", prefix.foreign = "FKID_", keys.unique = TRUE, keys.dim = 6) {
  x <- xml2::read_xml(file)
  p <- xml2::xml_root(x)
  return(parseXMLNode(p, NULL, TRUE, prefix.primary, prefix.foreign, keys.unique, keys.dim)$ldf)
}


check.datetimeformats <- function(vec, funcs, return.convertfunc = FALSE, tz = "UTC") {
  conv <- list()
  for(i in 1:length(funcs)) {
    conv[[length(conv)+1]] <- suppressWarnings(funcs[[i]](vec, tz=tz))
  }
  if(return.convertfunc) {
    if(max(unlist(lapply(conv, function(x){sum(!is.na(x))}))) == NROW(vec[!is.na(vec)])) {
      return(funcs[unlist(lapply(conv, function(x){sum(!is.na(x))})) == NROW(vec[!is.na(vec)])][[1]])
    }
    else {
      return(NULL)
    }
  }
  else {
    return(max(unlist(lapply(conv, function(x){sum(!is.na(x))}))))
  }
}


convertible.datetime <- function(vec, return.convertfunc = FALSE, tz = "UTC") {
  res <- ""
  vec <- as.character(vec)
  has.time <- sum(stringr::str_detect(vec, ":"), na.rm=TRUE) == NROW(vec[!is.na(vec)])
  funcs <- list(lubridate::ymd_hms, lubridate::ymd_hm, lubridate::ymd_h, lubridate::dmy_hms, lubridate::dmy_hm, lubridate::dmy_h, lubridate::mdy_hms, lubridate::mdy_hm, lubridate::mdy_h)
  if(has.time & check.datetimeformats(vec, funcs, FALSE, tz) == NROW(vec[!is.na(vec)])) {
    if(return.convertfunc) res <- check.datetimeformats(vec, funcs, TRUE, tz)
    else res <- "DateTime"
  }
  else {
    funcs <- list(lubridate::ymd, lubridate::dmy, lubridate::mdy)
    if(check.datetimeformats(vec, funcs, FALSE, tz) == NROW(vec[!is.na(vec)])) {
      if(return.convertfunc) res <- check.datetimeformats(vec, funcs, TRUE, tz)
      else res <- "Date"
    }
    else {
      funcs <- list(lubridate::hms, lubridate::hm, lubridate::ms)
      if(has.time & check.datetimeformats(vec, funcs, FALSE, tz) == NROW(vec[!is.na(vec)])) {
        if(return.convertfunc) res <- check.datetimeformats(vec, funcs, TRUE, tz)
        else res <- "Time"
      }
    }
  }
  return(res)
}


convertible.num <- function(vec) {
  vec[vec == ""] <- NA
  return(sum(is.na(vec)) == sum(is.na(suppressWarnings(as.numeric(vec)))))
}


convertible.double <- function(vec) {
  vec <- vec[!is.na(vec)]
  return(!(sum((as.numeric(vec) %% 1 == 0)) == NROW(vec)))
}


convertible.enum <- function(vec, max.ratio = 0.25) {
  vec <- vec[!is.na(vec)]
  if(NROW(vec) > 0) return(NROW(unique(vec))/NROW(vec) <= max.ratio)
  else return(FALSE)
}


is.nullable <- function(vec) {
  return(sum(is.na(vec)) > 0)
}


infer.datatype <- function(vec, bib, sql.style, tz = "UTC") {
  convert.dt <- convertible.datetime(vec, FALSE, tz)
  if(convert.dt != "") {
    return(as.character(bib[bib$Style == sql.style, convert.dt]))
  }
  else {
    if(convertible.num(vec)) {
      # numeric
      if(!convertible.double(vec)) {
        # integer
        vec <- vec[!is.na(vec)]
        max.limit <- bib[bib$Style == sql.style, "Int.MaxSize"]
        if(max(vec) <= max.limit-1 & min(vec) >= (-1)*max.limit) return(as.character(bib[bib$Style == sql.style, "Int"]))
        else return(as.character(bib[bib$Style == sql.style, "BigInt"]))
      }
      else {
        # floating point number
        vec.char <- as.character(vec)
        dec.pos <- stringr::str_locate(vec.char, "\\.")[,2]
        dec.pos.before <- dec.pos - 1
        dec.pos.before[is.na(dec.pos.before)] <- stringr::str_length(vec.char[is.na(dec.pos.before)])
        vec.char.before <- stringr::str_sub(vec.char, 1, dec.pos.before)
        s <- max(stringr::str_length(stringr::str_sub(vec.char, dec.pos + 1, stringr::str_length(vec.char))), na.rm=TRUE)
        p <- max(nchar(vec.char.before)) + s
        return(paste0(bib[bib$Style == sql.style, "Decimal"], "(", p, ",", s, ")"))
      }
    }
    else {
      max.size <- max(nchar(as.character(vec)), na.rm = TRUE)
      if(max.size > bib[bib$Style == sql.style, "VarChar.MaxSize"]) return (as.character(bib[bib$Style == sql.style, "Text"]))
      else return(paste0(bib[bib$Style == sql.style, "VarChar"], "(",max.size, ")"))
    }
  }
}



#' @title Exporting the relational data model and data to a database
#'
#' @description Produces ready-to-run SQL \code{INSERT} statements to import the
#'   data transformed with \code{\link{toRelational}()} into a SQL database.
#'
#' @param ldf A \strong{l}ist of \strong{d}ata\strong{f}rames created by
#'   \code{\link{toRelational}()} (the data tables transformed from XML to a
#'   relational schema).
#' @param sql.style The SQL flavor that the produced \code{CREATE} statements
#'   will follow. The supported SQL styles are \code{"MySQL"},
#'   \code{"TransactSQL"} and \code{"Oracle"}. You can add your own SQL flavor
#'   by providing a dataframe with the required information instead of the name
#'   of one of the predefined SQL flavors as value for \code{sql.style}. See the
#'   Details section for more information on working with different SQL flavors.
#' @param tables A character vector with the names of the tables for whichs SQL
#'   \code{CREATE} statements will be produced. If null (default) \code{CREATE}
#'   statements will be produced for all tables in in the relational data model
#'   of \code{ldf}.
#' @param prefix.primary The prefix that is used in the relational data model of
#'   \code{ldf} to identify primary keys. \code{"ID_"} by default.
#' @param prefix.foreign The prefix that is used in the relational data model of
#'   \code{ldf} to identify foreign keys. \code{"FKID_"} by default.
#' @param line.break Line break character that is added to the end of each
#'   \code{CREATE} statement (apart from the semicolon that is added
#'   automatically). Default is \code{"\n"}.
#' @param datatype.func A function that is used to determine the data type of
#'   the table fields. The function must take the field/column from the data
#'   table (basically the result of \code{SELCT field FROM table})
#'   as its sole argument and return a character vector providing the data type.
#'   If null (default), the built-in mechanism will be used to determine the
#'   data type.
#' @param one.statement Determines whether all \code{CREATE} statements will be
#'   returned as one piece of SQL code (\code{one.statement = TRUE}) or if each
#'   \code{CREATE} statement will be stored in a separate element of the return
#'   vector.
#'
#' @details
#'   If you want to produce SQL \code{CREATE} statements that follow a
#'   different SQL dialect than one of the built-in SQL flavors (i.e. MySQL,
#'   TransactSQL and Oracle) you can provide the necessary information to
#'   \code{getCreateSQL()} via the \code{sql.style} argument. In this case the
#'   \code{sql.style} argument needs to be a dataframe with the folling fields:
#'   \tabular{llll}{ Column \tab Type \tab Description \tab Example \cr
#'   \code{Style} \tab \code{character} \tab Name of the SQL flavor. \tab
#'   \code{"MySQL"}  \cr \code{NormalField} \tab \code{character} \tab Template
#'   string for a normal, nullable field. \tab \code{"\%FIELDNAME\% \%DATATYPE\%"}
#'   \cr \code{NormalFieldNotNull} \tab \code{character} \tab Template string
#'   for non-nullable field. \tab \code{"\%FIELDNAME\% \%DATATYPE\% NOT NULL"} \cr
#'   \code{PrimaryKey} \tab \code{character} \tab Template string for the
#'   definition of a primary key. \tab \code{"PRIMARY KEY (\%FIELDNAME\%)"} \cr
#'   \code{ForeignKey} \tab \code{character} \tab Template string for the
#'   definition of a foreign key. \code{"FOREIGN KEY (\%FIELDNAME\%) REFERENCES
#'   \%REFTABLE\%(\%REFPRIMARYKEY\%)"}  \cr \code{PrimaryKeyDefSeparate} \tab
#'   \code{logical} \tab Indicates if primary key needs additional definition
#'   like a any other field.  \tab \code{TRUE}  \cr \code{ForeignKeyDefSeparate}
#'   \tab \code{logical} \tab Indicates if foreign key needs additional
#'   definition like a any other field. \tab \code{TRUE} \cr \code{Int} \tab
#'   \tab \code{character} \tab Name of integer data type. \code{"INT"}  \cr
#'   \code{Int.MaxSize} \tab \code{numeric} \tab Size limit of integer data
#'   type.  \tab \code{4294967295}  \cr \code{BigInt} \tab \code{character} \tab
#'   Name of data type for integers larger than the size limit of the normal
#'   integer data type. \tab \code{"BIGINT"} \cr \code{Decimal} \tab
#'   \code{character} \tab Name of data type for floating point numbers. \tab
#'   \code{"DECIMAL"}  \cr \code{VarChar} \tab \code{character} \tab Name of
#'   data type for variable-size character fields. \tab \code{"VARCHAR"}  \cr
#'   \code{VarChar.MaxSize} \tab \code{numeric} \tab Size limit of variable-size
#'   character data type.\tab \code{65535} \cr \code{Text} \tab \code{character}
#'   \tab Name of data type for string data larger than the size limit of the
#'   variable-size character data type. \tab \code{"TEXT"} \cr \cr \code{Date}
#'   \tab \code{character} \tab Name of data type date data. \tab \code{"DATE"}
#'   \cr \code{Time} \tab \code{character} \tab Name of data type time data \tab
#'   \code{"TIME"} \cr \code{Date} \tab \code{character} \tab Name of data
#'   type for combined date and time data. \tab \code{"TIMESTAMP"}  \cr  }
#'
#'   In the template strings you can use the following placeholders, as you also
#'   see from the MySQL example in the table: \enumerate{ \item
#'   \code{\%FIELDNAME\%}: Name of the field to be defined. \item
#'   \code{\%DATATYPE\%}: Datatype of the field to be defined. \item
#'   \code{\%REFTABLE\%}: Table referenced by a foreign key. \item
#'   \code{\%REFPRIMARYKEY\%}: Name of the primary key field of the table
#'   referenced by a foreign key. } When you use your own defintion of an SQL
#'   flavor, then \code{sql.style} must be a one-row dataframe providing the
#'   fields described in the table above.
#'
#'   You can use the \code{datatype.func} argument to provide your own function
#'   to determine how the data type of a field is derived from the values in
#'   that field. In this case, the values of the columns \code{Int},
#'   \code{Int.MaxSize}, \code{VarChar}, \code{VarChar.MaxSize}, \code{Decimal}
#'   and \code{Text} in the \code{sql.style} dataframe are ignored. They are
#'   used by the built-in mechanism to determine data types. Providing your own
#'   function allows you to determine data types in a more differentiated way,
#'   if you like. The function that is provided needs to take a vectors of
#'   values as its argument and needs to provide the SQL data type of these
#'   values as a one-element character vector.
#'
#' @return A character vector with exactly one element (if argument
#'   \code{one.statement = TRUE}) or with one element per \code{CREATE}
#'   statement.
#'
#' @examples
#' # Find path to custmers.xml example file in package directory
#' path <- system.file("", "customers.xml", package = "xml2relational")
#' db <- toRelational(path)
#'
#' sql.code <- getCreateSQL(db, "TransactSQL", "address")
#'
#' @family xml2relational
#'
#' @export
getCreateSQL <- function(ldf, sql.style = "MySQL", tables = NULL, prefix.primary = "ID_", prefix.foreign = "FKID_", line.break ="\n", datatype.func = NULL, one.statement = FALSE) {
  sql.stylebib <- data.frame(list(
    Style = c("MySQL", "TransactSQL", "Oracle"),
    NormalField = c("%FIELDNAME% %DATATYPE%","%FIELDNAME% %DATATYPE%","%FIELDNAME% %DATATYPE%"),
    NormalFieldNotNull = c("%FIELDNAME% %DATATYPE% NOT NULL", "%FIELDNAME% %DATATYPE% NOT NULL", "%FIELDNAME% %DATATYPE% NOT NULL"),
    PrimaryKey = c("PRIMARY KEY (%FIELDNAME%)", "%FIELDNAME% %DATATYPE% PRIMARY KEY", "%FIELDNAME% %DATATYPE% PRIMARY KEY"),
    ForeignKey = c("FOREIGN KEY (%FIELDNAME%) REFERENCES %REFTABLE%(%REFPRIMARYKEY%)", "%FIELDNAME% %DATATYPE% REFERENCES %REFTABLE%(%REFPRIMARYKEY%)", "%FIELDNAME% %DATATYPE% REFERENCES %REFTABLE%(%REFPRIMARYKEY%)"),
    PrimaryKeyDefSeparate = c(TRUE, FALSE, FALSE),
    ForeignKeyDefSeparate = c(TRUE, FALSE, FALSE),
    Int = c("INT", "int", "NUMBER"),
    Int.MaxSize = c(2147483648, 2147483648, 1),
    BigInt = c("BIGINT", "bigint", "NUMBER"),
    Decimal = c("DECIMAL", "decimal", "NUMBER"),
    VarChar = c("VARCHAR", "varchar", "VARCHAR2"),
    VarChar.MaxSize = c(65535, 8000, 4000),
    Text = c("TEXT", "varchar(max)", "LONG"),
    Date = c("DATE", "date", "DATE"),
    DateTime = c("TIMESTAMP", "datetime2", "TIMESTAMP"),
    Time = c("TIME", "TIME", "VARCHAR2(20)")
  ), stringsAsFactors = FALSE)

  if(is.data.frame(sql.style)) {
    sql.stylebib <- rbind(sql.stylebib, sql.style)
    sql.style = sql.stylebib[1,1]
  }
  else {
    if(!sql.style %in% sql.stylebib[,1]) stop(paste0("'", sql.style, "' is not a valid SQL flavor. Valid flavors are ",
                                                     paste0("'", sql.stylebib[,1], "'", collapse = ",")), ".\n")
  }

  if(is.null(tables)) tabs <- 1:length(ldf)
  else {
    tabs <- c()
    for(i in 1:NROW(tables)) {
      if(tables[i] %in% names(ldf)) {
        tabs <- append(tabs, get.df(ldf, tables[i]))
      }
      else warning(paste0("Table '", tables[i], "' does not exist in your data model. Valid tables names are ",
                          paste0("'", names(ldf), "'", collapse = ",")), ".\n")
    }
  }

  sql.code <- c()
  for(i in 1:NROW(tabs)) {
    df <- ldf[[get.df(ldf, names(ldf)[tabs[i]])]]
    df <- data.frame(lapply(df, as.character), stringsAsFactors = FALSE)
    sql.code[i] <- paste0("CREATE TABLE ", names(ldf)[tabs[i]], " (", line.break)
    for(f in 1:NCOL(df)) {
      if(f != 1) sql.code[i] <- paste0(sql.code[i], ", ")
      if(is.null(datatype.func)) datatype <- infer.datatype(df[,f], sql.stylebib, sql.style)
      else datatype = datatype.func(df[,f])
      field <- names(df)[f]
      reftable <- ""
      if(field==paste0(prefix.primary, names(ldf)[tabs[i]])) {
        # primary
        sql.code[i] <- paste0(sql.code[i], sql.stylebib[sql.stylebib$Style==sql.style, "PrimaryKey"])
        if(sql.stylebib[sql.stylebib$Style==sql.style, "PrimaryKeyDefSeparate"])
          sql.code[i] <- paste0(sql.code[i], line.break, ", ", sql.stylebib[sql.stylebib$Style==sql.style, "NormalField"])
      }
      else {
        if(stringr::str_sub(field, 1, nchar(prefix.foreign)) == prefix.foreign) {
          # foreign
          reftable <- stringr::str_replace_all(field, prefix.foreign, "")
          sql.code[i] <- paste0(sql.code[i], sql.stylebib[sql.stylebib$Style==sql.style, "ForeignKey"])
          if(sql.stylebib[sql.stylebib$Style==sql.style, "ForeignKeyDefSeparate"])
            sql.code[i] <- paste0(sql.code[i], line.break, ", ", sql.stylebib[sql.stylebib$Style==sql.style, "NormalField"])
        }
        else {
          # normal
          if(is.nullable(df[,f])) {
            # nullable
            sql.code[i] <- paste0(sql.code[i], sql.stylebib[sql.stylebib$Style==sql.style, "NormalField"])
          }
          else {
            # not nullable
            sql.code[i] <- paste0(sql.code[i], sql.stylebib[sql.stylebib$Style==sql.style, "NormalFieldNotNull"])
          }
        }
      }
      sql.code[i] <- stringr::str_replace_all(sql.code[i], "%FIELDNAME%", field)
      sql.code[i] <- stringr::str_replace_all(sql.code[i], "%DATATYPE%", datatype)
      sql.code[i] <- stringr::str_replace_all(sql.code[i], "%REFTABLE%", reftable)
      sql.code[i] <- stringr::str_replace_all(sql.code[i], "%REFPRIMARYKEY%", paste0(prefix.primary, reftable))
      sql.code[i] <- paste0(sql.code[i], line.break)
    }
    sql.code[i] <- paste0(sql.code[i], ");")
  }

  if(one.statement) sql.code <- paste0(sql.code, collapse = line.break)
  return(sql.code)
}



#' @title Exporting the relational data model and data to a database
#'
#' @description Produces ready-to-run SQL \code{INSERT} statements to import the
#'   data transformed with \code{\link{toRelational}()} into a SQL database.
#'
#' @param ldf A \strong{l}ist of \strong{d}ata\strong{f}rames created by
#'   \code{\link{toRelational}()} (the data tables transformed from XML to a
#'   relational schema).
#' @param table.name Name of the table from the data table list \code{ldf} for
#'   which \code{INSERT} statements are to be created.
#' @param line.break Line break character that is added to the end of each
#'   \code{INSERT} statement (apart from the semicolon that is added
#'   automatically). Default is \code{"\n"}.
#' @param one.statement Determines whether all \code{INSERT} statements will be
#'   returned as one piece of SQL code (\code{one.statement = TRUE}) or if each
#'   \code{INSERT} statement will be stored in a separate element of the return
#'   vector. In the former case the return vector will have just one element, in
#'   the latter case as many elements as there are data records to insert.
#'   Default is \code{FALSE} (return vector has one element per \code{INSERT}
#'   statement.
#' @param tz The code of the timezone used for exporting timestamp data. Default it
#'   \code{"UTC"} (Coordinated Universal Time).
#'
#' @return A character vector with exactly one element (if argument
#'   \code{one.statement = TRUE}) or with one element per \code{INSERT}
#'   statement.
#'
#' @examples
#' # Find path to custmers.xml example file in package directory
#' path <- system.file("", "customers.xml", package = "xml2relational")
#' db <- toRelational(path)
#'
#' sql.code <- getInsertSQL(db, "address")
#'
#' @family xml2relational
#'
#' @export
getInsertSQL <- function(ldf, table.name, line.break = "\n", one.statement = FALSE, tz = "UTC") {
  if(!table.name %in% names(ldf)) stop(paste0("Table '", table.name, "' does not exist in your data model. Valid tables names are ",
                                              paste0("'", names(ldf), "'", collapse = ",")), ".\n")
  tab <- ldf[[get.df(ldf, table.name)]]
  col.delimiter <- c()
  cols <- c()
  res <- c()
  for(f in 1:NCOL(tab)) {
    if(convertible.datetime(tab[,f], return.convertfunc = FALSE, tz=tz) != "") tab[,f] <- as.character(convertible.datetime(tab[,f], return.convertfunc = TRUE, tz=tz)(tab[,f]))

    if(!convertible.num(tab[,f])) col.delimiter[f] <- "'"
    else col.delimiter[f] <- ""
    cols <- append(cols, names(tab)[f])
  }
  cols <- paste0(names(tab), collapse = ", ")

  for(i in 1:NROW(tab)) {
    vals <- c()
    for(f in 1:NCOL(tab)) {
      if(!is.na(tab[i,f])) vals <- paste0(vals, ", ", col.delimiter[f], tab[i,f], col.delimiter[f])
      else vals <- paste0(vals, ", NULL")
    }
    vals <- stringr::str_sub(vals, 3, stringr::str_length(vals))
    res <- append(res, paste0("INSERT INTO ", table.name, "(", cols, ") VALUES (", vals, ");"))
  }
  if(one.statement) res <- paste0(res, collapse = line.break)
  return(res)
}



#' @title Saving the relational data
#'
#' @description Saves a list of dataframes created from an XML source with
#'   \code{\link{toRelational}()} to CSV files, one file per dataframe (i.e.
#'   table in the relational data model). File names are identical to the
#'   dataframe/table names.
#'
#' @param ldf A \strong{l}ist of \strong{d}ata\strong{f}rames created by
#'   \code{\link{toRelational}()} (the data tables transformed from XML to a
#'   relational schema).#' @param dir Directory where the files will be stored.
#'   Default is the current working directory.
#' @param dir The directory to save the CSV files in. Per default the working directory.
#' @param sep Character symbol to separate fields in the CSV fil, comma by
#'   default.
#' @param dec Decimal separator used for numeric fields in the CSV file, point
#'   by default.
#'
#' @return No return vaue.
#'
#' @examples
#' # Find path to custmers.xml example file in package directory
#' path <- system.file("", "customers.xml", package = "xml2relational")
#' db <- toRelational(path)
#'
#' savetofiles(db, dir = tempdir())
#'
#'
#' @family xml2relational
#'
#' @export
savetofiles <- function(ldf, dir, sep = ",", dec = ".") {
  if(dir != "") {
    if(!dir.exists(dir)) stop(paste0("Directory '", dir, "' does not exist."))
  }
  for(i in 1:length(ldf)) {
    tab <- ldf[[i]]
    for(f in 1:NCOL(tab)) {
      if(convertible.num(tab[,f])) tab[,f] <- as.numeric(tab[,f])
      else tab[,f] <- as.character(tab[,f])
    }
    utils::write.table(tab, file=fs::path(dir, paste0(names(ldf)[i], ".csv")), dec = dec, sep = sep)
  }
}

Try the xml2relational package in your browser

Any scripts or data that you put into this service are public.

xml2relational documentation built on Feb. 11, 2022, 1:08 a.m.