R/dm.R

Defines functions print.data_model dm_set_display dm_set_segment dm_re_query dm_set_col_attr dm_set_key dm_add_references dm_add_reference_ dm_from_data_frames dm_create_references dm_get_table_attrs dm_list2coltable dm_read_yaml as.data_model.data.frame is.data_model as.data_model.list as.data_model

Documented in as.data_model as.data_model.data.frame dm_add_reference_ dm_add_references dm_create_references dm_from_data_frames dm_list2coltable dm_read_yaml dm_re_query dm_set_col_attr dm_set_display dm_set_key dm_set_segment is.data_model print.data_model

#' datamodelr: data model diagrams
#'
#' Provides a simple structure to describe data models,
#' functions to read data model from YAML file,
#' and a function to create DiagrammeR graph objects:
#'
#'
#' \itemize{
#'   \item \pkg{datamodelr}'s data model object is a simple list of data frames which
#'     represent the model entities and include elements and their relations.
#'     See \code{\link{as.data_model}}.
#'   \item Function \code{\link{as.data_model}} creates a
#'     data model object from a data frame.
#'   \item Function \code{\link{dm_read_yaml}} reads YAML format and creates a
#'     data model object.
#'   \item Function \code{\link{dm_create_graph}} creates a graph object from
#'     data model object.
#'   \item Function \code{\link{dm_render_graph}} renders a graph object (using
#'   DiagrammeR package).
#' }
#'
#' @docType package
#' @name datamodelr-package
#' @aliases datamodelr
NULL

#' Coerce to a data model
#'
#' Functions to coerce an object to a data model if possible.
#'
#' @details Function accepts a data frame with columns info.
#'   Data frame must have 'table' and 'column' elements.
#'   Optional element 'key' (boolean) marks a column as a primary key.
#'   Optional element 'ref' (character string) defines referenced table name.
#'   Optional element 'ref_col' (character string) defines a column in a
#'     referenced table name primary key (only necessery when referenced
#'     table has a compound primary key).
#' @param x A data frame to be coerced to data model object
#' @aliases as.data_model
#' @return If possible it returns a data model object.
#'   It is a list of data frames with at least the following columns:
#'   \item{ table }{A table name}
#'   \item{ column }{A name of the column in a table}
#'   \item{ key }{A boolean value indicating this column is in a primary key.
#'   Use integer values with order for compound keys }
#'   \item{ ref }{A character string with a referenced table name.
#'     If exists (not NA) then the column is a foreign key.}
#'   \item{ ref_col }{A character string with a referenced table name column.
#'     This is only necessary when referenced table has multi-column key}
#' @export
as.data_model <- function(x) {
  UseMethod("as.data_model")
}

#' @keywords internal
#' @export
as.data_model.list <- function(x) {

  if(mode(x) != "list"){
    stop("Not a list")
  }
  if(!all(c("columns", "references") %in% (names(x)))) {
    stop("Input must have columns and references")
  }

  class(x) <- c("data_model", class(x))
  x
}

#' Check if object is a data model
#'
#' @param x Object to check if it is a data model
#' @keywords internal
#' @export
is.data_model <- function(x) {
  inherits(x, "data_model")
}


#' Coerce a data frame to a data model
#'
#' @keywords internal
#' @export
as.data_model.data.frame <- function(x) {

  if(!inherits(x, "data.frame")) stop("Not a data.frame")

  if(!all(c("column", "table") %in% names(x)))
  {
    stop("Data frame must have elements named 'table' and 'column'.")
  }

  # set key to 0 if NA or add key if NULL:
  if(!is.null(x[["key"]])) {
    x[is.na(x[,"key"]), "key"] <- FALSE
  } else {
    x[,"key"] <- FALSE
  }

  # convert logical key markers to numeric (column order in a key)
  # x$table <- factor(x$table, ordered = TRUE)
  # if(max(x$key, na.rm = TRUE) <= 1) {
  #   keys <-
  #     lapply(split(x, x$table), function(t) {
  #       cumsum(t$key) * t$key
  #     })
  #   x$key <- unlist(keys)
  # }

  if(is.null(x[["ref"]])) x[["ref"]] <- NA


  # create references from ref and keys
  ref_table <- dm_create_references(x)

  table_attrs <- attr(x, "tables")
  if(is.null(table_attrs)) {
    table_attrs <-
      data.frame(
        table = unique(x[["table"]]),
        segment = NA,
        display = NA,
        row.names = NULL,
        stringsAsFactors = FALSE
      )
  }
  attr(x, "tables") <- NULL
  ret <- list(
    tables = table_attrs,
    columns = x,
    references = ref_table
  )
  as.data_model(ret)
}

#' Read YAML
#'
#' Reads a file in YAML format and returns a data model object.
#'
#' @details YAML description should include table names (first level),
#' columns (second level) and column attributes (third level).
#' Expected (but not required) column attributes are
#'   \code{key} (Yes|No),
#'   \code{ref} (Name of referenced table),
#'   \code{comment} (column description).
#'
#' @param file A file in YAML format
#' @param text A YAML formated character string
#' @examples
#' dm <-
#'   dm_read_yaml(text = "
#'
#'     Person:
#'       Person ID: {key: yes}
#'       Name:
#'       E-mail:
#'       Street:
#'       Street number:
#'       City:
#'       ZIP:
#'
#'     Order:
#'       Order ID: {key: yes}
#'       Customer: {ref: Person}
#'       Sales person: {ref: Person}
#'       Order date:
#'       Requested ship date:
#'       Status:
#'
#'     Order Line:
#'       Order ID: {key: yes, ref: Order}
#'       Line number: {key: yes}
#'       Order item: {ref: Item}
#'       Quantity:
#'       Price:
#'
#'     Item:
#'       Item ID: {key: yes}
#'       Item Name:
#'       Description:
#'   ")
#' @export
dm_read_yaml <- function(file = NULL, text = NULL) {

  if( !requireNamespace("yaml", quietly = TRUE)) {
    stop("yaml package needed for this function to work. Please install it.",
         call. = FALSE)
  }


  if(missing(text)) {
    if(!missing(file)) {
      if(!file.exists(file)) stop("File does not exist.")
      dm <- yaml::yaml.load_file(file)
    } else {
      stop("A file or text needed.")
    }
  } else {
    dm <- yaml::yaml.load(text)
  }
  if(is.null(dm)) {
    return(NULL)
  }

  col_table <- dm_list2coltable(dm)
  return(as.data_model(col_table))
}


#' List to column table
#'
#' Convert a 3 level named list to a data frame with column info
#'
#' @details The funcion is used when creating data model object
#'   from list provided by yaml parser.
#' @param x a named list
#' @export
#' @keywords internal
dm_list2coltable <- function(x) {

  if(!is.list(x)) {
    stop("Input must be a list.")
  }

  if(is.null(names(x))) {
    # parsed yaml with sequences
    x_tables <- x[sapply(x, function(x) !is.null(x[["table"]]))]

    table_names <- sapply(x_tables, function(tab) tab[["table"]])
    columns <- lapply(x_tables, function(tab) {
      tab_name <- tab[["table"]]
      if(!is.null(tab_name)) {
        cols <- tab[["columns"]]
      }
    })
    names(columns) <- table_names

    column_names <- lapply(columns, names)
    column_attributes <- unique( unlist( lapply(columns, sapply, names)))

  } else {
    # Named list (parsed yaml with maps)
    columns <- x
    table_names <- names(columns)
    column_names <- lapply(columns, names)
    column_attributes <- unique( unlist( lapply(columns, sapply, names)))
  }


  table_list <-
    lapply(table_names, function(tab_name) {
      if(is.null(column_names[[tab_name]])) {
        column_names[[tab_name]] <- NA
      }
      tab <- data.frame(
        table = tab_name,
        column = column_names[tab_name],
        stringsAsFactors = FALSE
      )
      names(tab) <- c("table", "column")

      for(a in column_attributes) {
        attr_value <-
          unlist(
            sapply(column_names[[tab_name]], function(cname) {
              if(is.list(columns[[tab_name]][[cname]]))
                value <- columns[[tab_name]][[cname]][[a]]
              else
                value <- NA
              ifelse(is.null(value), NA, value)
            })
          )
        tab[[a]] <- attr_value
      }
      tab
    })

  ret <- do.call(rbind, table_list)

  table_attrs <- dm_get_table_attrs(x)
  if(!is.null(table_attrs) && is.null(table_attrs$segment))
    table_attrs$segment <- NA
  attr(ret, "tables") <- table_attrs

  ret
}

dm_get_table_attrs <- function(x) {

  x_tables <- x[sapply(x, function(x) !is.null(x[["table"]]))]
  table_names <- sapply(x_tables, function(tab) tab[["table"]])
  table_attrs <- unique(unlist(lapply(x_tables, names)))
  table_attrs <- table_attrs[!table_attrs %in% c("columns", "table")]
  names(x_tables) <- table_names

  table_attrs <-
    lapply(table_names, function(tab) {
      ret <-
        data.frame(
          table = tab,
          stringsAsFactors = FALSE
        )
      for(aname in table_attrs) {
        tab_attr <- x_tables[[tab]][[aname]]
        if(is.null(tab_attr)) {
          tab_attr <- NA
        }
        ret[[aname]] <- tab_attr
      }
      ret
    })

  do.call(rbind, table_attrs)
}


#' Create reference info
#'
#' Creates references (foreign keys) based on reference table names in
#' column info table.
#'
#' @param col_table A data frame with table columns
#' @details The function is used when creating data model object.
#'   \code{col_table} must have at least
#'     \code{table},
#'     \code{column} and
#'     \code{ref} elements.
#'   When referencing to tables with compound primary keys
#'   additional \code{ref_col} with primary key columns must be provided.
#' @export
#' @keywords internal
dm_create_references <- function(col_table) {

  if(!inherits(col_table, "data.frame")) stop("Input must be a data frame.")

  if(!all(c("table", "column") %in% names(col_table))) {
    stop("Column info table must have table, column and ref variables.")
  }
  if(!"ref" %in% names(col_table)) {
    return(NULL)
  }
  if(all(is.na(col_table[,"ref"]))) {
    return(NULL)
  }


  if(is.null(col_table[["ref_col"]])) {
    col_table[["ref_col"]] <- NA
  }
  ref_table <- col_table[
    !is.na(col_table[["ref"]]),  # take only rows with reference
    c("table", "column", "ref", "ref_col")]
  col_table[is.na(col_table$key), "key"] <- FALSE

  ref_col <-
    with(ref_table,
         ifelse(is.na(ref_col),
                sapply(ref_table$ref, function(x)
                  col_table[col_table$table == x & col_table$key, "column"][1]
                ),
                ref_col
         )
    )
  ref_table[["ref_col"]] <- ref_col

  # number of columns in primary key
  num_col = sapply(ref_table$ref, function(x)
    length(col_table[col_table$table == x & col_table$key, ][["column"]])
  )
  num_col[num_col == 0L] <- 1L

  key_col_num = {

    # create column index number
    rle1 <- rle(num_col)
    if(lengths(rle1)[1] > 0) {
      col_list <- sapply(1:lengths(rle1)[1], function(i) {
        rep(1 : rle1$values[i], rle1$lengths[i] / rle1$values[i])
      })
      col_list[lengths(col_list) == 0] <- 1
      unlist(col_list)
    } else {
      NA
    }
  }

  dim(key_col_num) <- NULL
  if(nrow(ref_table) == length(key_col_num)) {
    ref_table$ref_id <- cumsum(key_col_num == 1)
    ref_table$ref_col_num <- key_col_num
  } else {
    ref_table$ref_col_num <- 1
    ref_table$ref_id <- cumsum(ref_table$ref_col_num)
  }
  ref_table
}


#' Create data model object from R data frames
#'
#' Uses data frame column names to create a data model diagram
#'
#' @param ... Data frames or one list of data frames
#' @export
dm_from_data_frames <- function(...) {

  df_list <- list(...)
  if(length(df_list) == 1 && inherits(df_list[[1]], "list")) {
    df_list <- df_list[[1]]
  } else {
    if(length(names(df_list)) < length(df_list)) {
      names(df_list) <- as.list(match.call( expand.dots = TRUE)[-1])
    }
  }
  tables <- df_list
  names(tables) <- make.names(names(tables))
  dfdm <-
    do.call(rbind,
            lapply(names(tables), function(table_name) {
              t1 <- tables[[table_name]]
              columns <- data.frame(
                column = names(t1),
                type = sapply(t1[0, , drop = FALSE], function(x) paste(class(x), collapse = ", ")),
                stringsAsFactors = FALSE)
              columns$table <- table_name
              columns
            })
    )
  as.data_model(dfdm)

}

#' Add reference
#'
#' Adds reference to existing data model object
#'
#' @param dm A data model object
#' @param table Table name
#' @param column Column(s) name
#' @param ref Referenced table name
#' @param ref_col Referenced column(s) name
#' @return New data model object
#' @export
dm_add_reference_ <- function(dm, table, column, ref = NULL, ref_col = NULL) {
  ref_df <-
    data.frame(
      table = table,
      column = column,
      ref = ref,
      ref_col = ref_col,
      ref_id = ifelse(is.null(dm$references), 1, max(dm$references$ref_id) + 1),
      ref_col_num = 1:(length(ref_col)),

      stringsAsFactors = FALSE
    )
  dm$references <- rbind(dm$references, ref_df)
  dm$columns$ref[dm$columns$table == table & dm$columns$column %in% column] <- ref
  dm
}

#' Add references
#'
#' Adds references defined with logical expressions from data frames
#'   in format table1$column1 == table2$column2
#'
#' @param dm Data model object
#' @param ... Logical expressions in format table1$column1 == table2$column2
#' @export
dm_add_references <- function(dm, ...)
{
  ref_list <- substitute(list(...))

  if(is.null(dm$columns$ref)) dm$columns$ref <- NA
  if(is.null(dm$columns$ref_col)) dm$columns$ref_col <- NA
  if(is.null(dm$columns$key)) dm$columns$key <- FALSE

  for(ref in as.list(ref_list[-1])) {
    ref <- as.list(ref)
    if(
      as.character(ref[1]) != "`==`" ||
      length(ref) != 3 || length(ref[[2]]) != 3 || length(ref[[3]]) != 3) {
      stop("Define references with logical expressions:
           dataframe1$column1 == dataframe2$column2, ...",
           call. = FALSE)
    }
    toChar <- function(ref, i, j) as.character(ref[[i]][[j]])

    table_name  = as.character(ref[[2]][[2]])
    column_name = as.character(ref[[2]][[3]])
    ref_table   = as.character(ref[[3]][[2]])
    ref_col     = as.character(ref[[3]][[3]])

    dm_row <- with(dm$columns, table == table_name & column == column_name)
    dm$columns[dm_row, "ref"] <- ref_table
    dm$columns[dm_row, "ref_col"] <- ref_col

    dm_key_row <- dm$columns$table == ref_table & dm$columns$column == ref_col
    dm$columns[dm_key_row, "key"] <- TRUE
  }

  ref_table <- dm_create_references(dm$columns)
  dm$references <- ref_table
  dm
}


#' Set key
#'
#' Set column as a primary key
#'
#' @param dm A data model object
#' @param table Table name
#' @param column Column(s) name
#' @export
dm_set_key <- function(dm, table, column) {
  update_cols <- dm$columns$table == table & dm$columns$column %in% column
  if(!any(update_cols)) {
    stop("Column not found.")
  }
  dm$columns$key[update_cols] <- seq_along(column)
  dm
}

#' Set column attribute
#'
#' Set column attribute value
#'
#' @param dm A data model object
#' @param table Table name
#' @param column Column(s) name
#' @param attr Column attribute name
#' @param value New value
#' @export
#' @keywords internal
dm_set_col_attr <- function(dm, table, column, attr, value) {
  update_cols <- dm$columns$table == table & dm$columns$column == column
  if(!any(update_cols)) {
    stop("Column not found.")
  }
  dm$columns[update_cols, attr] <- value
  dm
}


#' Reverse engineer query
#'
#' Returns a string with SQL query to reverse engineer a database
#'
#' @param rdbms Which database ("postgres" or "sqlserver")
#' @return A character string with sql query
#' @export
#' @examples
#' \dontrun{
#' library(RPostgreSQL)
#' # dvdrental sample database: http://www.postgresqltutorial.com/postgresql-sample-database
#' con <- dbConnect(dbDriver("PostgreSQL"), dbname="dvdrental", user ="postgres")
#' sQuery <- dm_re_query("postgres")
#' dm_dvdrental <- dbGetQuery(con, sQuery)
#' dbDisconnect(con)
#' }
dm_re_query <- function(rdbms) {
  sql_script <- sprintf("sql/%s.sql", rdbms)
  file_name <- system.file(sql_script, package ="datamodelr")
  if( !file.exists(file_name) ) {
    stop("This rdbs not supported")
  }
  sQuery <- paste(readLines(file_name), collapse = "\n")
  sQuery
}


#' Set table segment
#'
#' Change tables' segment name in a data model
#'
#' @param dm A data model object
#' @param table_segments A named list of vectors with segments as element names
#'   and tables as values in vectors
#' @export
dm_set_segment <- function(dm, table_segments) {

  if(!is.data_model(dm))
    stop("Not a data model object.")
  for(s in names(table_segments)) {
    table_names <- table_segments[[s]]
    dm$tables$segment[dm$tables$table %in% table_names ] <- s
  }
  dm
}

#' Set table display
#'
#' Change tables' display in a data model
#'
#' @param dm A data model object
#' @param display A named list of vectors with display as element names
#'   and tables as values in vectors
#' @export
dm_set_display <- function(dm, display) {

  if(!is.data_model(dm))
    stop("Not a data model object.")
  for(s in names(display)) {
    table_names <- display[[s]]
    dm$tables$display[dm$tables$table %in% table_names ] <- s
  }
  dm
}

#' Print data model graph
#'
#' @param x data model object.
#' @param ... further arguments passed to or from other methods.
#' @export
print.data_model <- function(x, ...) {
  cat("Data model object:\n")
  tables <- paste(utils::head(x$tables$table, 4), collapse = ", ")
  if(length(x$tables$table) > 4) {
    tables <- paste(tables, "...")
  }
  cat(" ", nrow(x$tables), "tables: ", tables,"\n")
  cat(" ", nrow(x$columns), "columns\n")
  cat(" ", length(unique(x$columns[x$columns[["key"]] != 0,"table"])), "primary keys\n")
  cat(" ", ifelse(is.null(x$references), "no", nrow(unique(x$references))),
      "references\n")
}
bergant/datamodelr documentation built on March 6, 2021, 5:47 a.m.