R/03_DB_creation.R

#### Creating DB from list of data frames
#############################################################################
#' Creates GADS data base.
#'
#' Function to create a relational data base.
#'
#' Primary keys need to be unique in each data frame. Foreign keys need to have the same names as primary keys they are referencing too. All three lists need to be named in the exact same way and order.
#'
#'@param dfList Named list of data frames per level
#'@param primeKeys Named character vector of one or more primary Keys (identification variables).
#'@param filePath Path of db file (including name) to be created, has to end on db.
#'
#'@return Creates a data base in the given path, returns a list of logicals corresponding to data input list.
#'
#'@examples
#'# Create in memory data base.
#' createDB(dfList = datList, pkList = pkList, fkList = fkList, filePath = ":memory:")
#'
createDB <- function(allList, filePath) {
  stopifnot(all(c("dfList", "labelList", "pkList", "fkList") %in% names(allList)))
  dfList <- allList$dfList
  labelList <- allList$labelList
  pkList <- allList$pkList
  fkList <- allList$fkList

  # 1) Create queries for actual data tables
  dtQueries <- Map(writeQ_create, df = dfList, primeKey = pkList, foreignKey = fkList, df_name = names(dfList))
  # Create query for data table with merging information (meta data table)
  metaQuery <- writeQ_mergeOrder(dfMergeOrder = names(dfList))
  # Create query for data table with variable, value and missing labels (relation to other data tables?)
  labelDT_name <- "Labels"
  labelQuery <- writeQ_create(df = labelList, df_name = labelDT_name,
                primeKey = c("varName", "value", "data_table"), foreignKey = NULL)
  # all queries into one object
  createQueries <- c(metaQuery, labelQuery, dtQueries)

  # 2) Create empty Data base
  init_DB(filePath)

  # Establish Connection, disconnect when function exits
  con <- dbConnect(RSQLite::SQLite(), dbname = filePath)
  on.exit(dbDisconnect(con))

  # 3) Execute "create Queries"
  lapply(createQueries, dbExecute_safe, conn = con)

  # 4) fill data base tables with data
  # normale data tables
  lapply(seq_along(dfList), function(i)
    dbWriteTable(conn = con, name = names(dfList)[i], value = dfList[[i]], append = TRUE))

  # label data table
  dbWriteTable(conn = con, name = labelDT_name, value = labelList, append = TRUE)
  return()
}



# 01a) Create Queries actual data tables ---------------------------------------------------------
## write query for data tables (without foreign keys)
writeQ_create <- function(df, primeKey, foreignKey, df_name) {
  # write string for variable definitions
  varDefinitions <- write_varDef(df)
  # write partial query for primary definition
  pkDefinition <- write_primeKey(primeKey)
  # write partial query for foreign Key definition if fk is defined
  if(is.null(foreignKey$References)) {
    fkDefinition <- ""
  } else {
    fkDefinition <- write_foreignKey(foreignKey)
  }
  # write query including create, variable definitions and primary key
  paste("CREATE TABLE", paste(df_name), "(",
        varDefinitions,
        pkDefinition,
        fkDefinition,
        ");")
}

# variable definitions
write_varDef <- function(df) {
  varList <- vector(mode = "character", length = ncol(df))
  # determine type of all variables
  for(i in seq(ncol(df))) {
    if(is.double(df[, i]) || is.integer(df[, i])) varType = "REAL"
    else if(is.character(df[, i])) varType = "TEXT"
    else if(is.factor(df[, i])) varType = "TEXT"
    else stop("invalid variable type")
  # write syntax per variable
  varList[i]  <- paste(names(df)[i], varType, ",")
  }
  # paste all together
  paste(varList, collapse = " ")
}

# primary key definition
write_primeKey <- function(primeKey) {
  pk <- paste(primeKey, collapse = ", ")
  paste("PRIMARY KEY (", pk, ")")
}

# foreign key definition
write_foreignKey <- function(foreignKey) {
  ref <- foreignKey$References
  fk <- paste(foreignKey$Keys, collapse = ", ")
  paste("FOREIGN KEY (", fk,")", "REFERENCES", ref, "(", fk,")")
}

# 01b) Create Queries meta information data tables ---------------------------------------------------------
## query which saves the order of data frame merging
writeQ_mergeOrder <- function(dfMergeOrder) {
  createQ <- "CREATE TABLE metaInformation ( mergeOrder TEXT );"
  insertQ <- paste("INSERT INTO metainformation (mergeOrder)",
                   "VALUES ( '", paste(dfMergeOrder, collapse = " "),"' );")
  c(createQ, insertQ)
}


# 02) Create Empty Database ---------------------------------------------------------
## create DB
init_DB <- function(filePath) {
  # check path / file
  check_filePath(filePath)
  # create DB, throws an error if sqlite3 not in Path!
  shell(cmd = paste("sqlite3", filePath, ".databases", sep = " "), mustWork = TRUE)
}

## check path for db
check_filePath <- function(filePath){
  if(identical(filePath, ":memory:")) {
    message("filePath points to work memory")
    return()
  }
  lastSL <- rev(unlist(gregexpr("/", filePath)))[[1]]
  lastDot <- rev(unlist(gregexpr("\\.", filePath)))[[1]]
  # divide string
  directory <- substr(filePath, 1, lastSL)
  fileFormat <- substr(filePath, lastDot, nchar(filePath))
  # check directory
  if(!dir.exists(directory)) stop(paste(directory, "is not an existing directory"))
  # check file name
  if(file.exists(filePath)) stop(paste(filePath, "is an existing data base"))
  if(!identical(fileFormat, ".db")) stop("Filename does not end on .db")

  return()
}


# 03) Execute Create Queries ---------------------------------------------------------
# safe version of dbExecute with verbose error
dbExecute_safe <- function(conn, statement) {
  check <- try(dbExecute(conn = conn, statement = statement))
  if(class(check) == "try-error") {
    stop(paste("Error while trying to execute the following query", statement))
  }
}
b-becker/eatGADS documentation built on May 24, 2019, 8:47 p.m.