R/database-utilities-deleting.R

Defines functions de_duplicate_table nuke_btclearn_database drop_table drop_schema

Documented in de_duplicate_table drop_schema drop_table nuke_btclearn_database

# Database utilities for deleting things from the database

#' @title Drop a schema
#'
#' @param schema_name A character representing a schema name
#' @param conn Connection to run the query
#' 
#' @description  Calls \code{run_query}
#' 
#' @return Boolean depending on if schema was dropped
#' 
drop_schema <- function (schema_name, conn = default_conn_to_mysql_server()) {
  if (substitute(conn) == "default_conn_to_mysql_server()") {
    on.exit(close_conn_to_mysql_server(conn))
  }
  q <- paste0("DROP SCHEMA ", schema_name, ";")
  out <- try(run_query(q, conn), silent = TRUE)
  query_errored <- class(out) == "try-error"
  if (query_errored) {
    warning(out[1])
  }
  return (!query_errored && !check_if_schema_exists(schema_name, conn))
}

#' @title Drop a table
#'
#' @param table_name A character representing a table name
#' @param conn Connection to run the query
#' 
#' @description  Calls \code{run_query}
#' 
#' @return Boolean depending on if table was dropped
#' 
drop_table <- function (table_name, conn = default_conn_to_mysql_server()) {
  if (substitute(conn) == "default_conn_to_mysql_server()") {
    on.exit(close_conn_to_mysql_server(conn))
  }
  q <- paste0("DROP TABLE ", table_name, ";")
  out <- try(run_query(q, conn), silent = TRUE)
  query_errored <- class(out) == "try-error"
  if (query_errored) {
    warning(out[1])
  }
  return (!query_errored && !check_if_table_exists(table_name, conn))
}

#' @title Nuke the whole database
#' 
#' @param confirm_with_user Should the user be asked whether or not 
#' he/she wants to drop the table before it is actually dropped?
#' @param conn Connection to the database
#' 
#' @description  Drops all \code{btclearn} tables in the database. Only run this
#' if you know what you're doing.
#' 
#' @return Boolean saying whether or not all tables were successfully dropped
#' 
nuke_btclearn_database <- function (confirm_with_user = TRUE,
                                    conn = default_conn_to_mysql_server()) {
  if (substitute(conn) == "default_conn_to_mysql_server()") {
    on.exit(close_conn_to_mysql_server(conn))
  }
  
  # Gather a list of tables to drop
  tables_to_drop <- sapply(database_tables, get_full_table_name)
  nuke_success   <- TRUE

  # Go through each table
  for (tab in tables_to_drop) {
    
    # Verify that this table should be dropped
    drop_this_table <- TRUE
    if (confirm_with_user) {
      table_size <- dim_table(tab)
      input <- readline(paste0("Dropping table '", tab, "' (", table_size[1], " rows, ",
                               table_size[2], " cols), input 'y' to drop: "))
      drop_this_table <- input == "y"
    }
    if (drop_this_table) {
      nuke_success <- nuke_success && drop_table(tab)
    } else {
      message("Input provided was not 'y' so keeping '", tab, "'")
      nuke_success <- FALSE      
    }
  }
  
  return (nuke_success)
}

#' @title De-duplicate a table
#' 
#' @param table_name A table to de-duplicate
#' @param primary_key The primary key column name in \code{table_name}
#' @param columns_to_ignore Columns to ignore when determining duplicates - note
#' that at least the primary key should always be given
#' @param conn Connection to the database
#' 
#' @description Be careful if you run this function because it will delete data
#' 
#' @return \code{TRUE} if the table was successfully de-duped and
#' \code{FALSE} otherwise
#' 
de_duplicate_table <- function (table_name, primary_key, columns_to_ignore,
                                conn = default_conn_to_mysql_server()) {
  if (substitute(conn) == "default_conn_to_mysql_server()") {
    on.exit(close_conn_to_mysql_server(conn))
  }
  
  # Read in the table
  d <- get_table(table_name, conn = conn)
  
  # Remove the columns that should be ignored for determining duplicates
  d_cols_rm <- d[, -which(colnames(d) %in% columns_to_ignore)]
  d_cols_rm$tmp_unique_id <- apply(X = d_cols_rm, MARGIN = 1,
                                   FUN = function (x) paste0(x, collapse = "_"))
  d_cols_rm$should_rm <- duplicated(d_cols_rm$tmp_unique_id)
  
  # Find the values of the primary key to remove
  pk_rm_values <- d[d_cols_rm$should_rm, primary_key] 
  
  # Build and run the query that removes those rows
  de_dupe_success <- TRUE
  if (length(pk_rm_values) > 0) {
    
    # Run the query and verify that it worked
    q <- paste0("DELETE FROM ", table_name, " WHERE ", primary_key, " IN (",
                paste0(pk_rm_values, collapse = ", "), ");")
    run_query(q, conn = conn)
    resulting_nrow  <- dim_table(table_name)[1]
    de_dupe_success <- resulting_nrow == nrow(d) - length(pk_rm_values)
    
    # Report how many rows were removed
    if (de_dupe_success) {
      message("Successfully removed ", length(pk_rm_values), " duplicates")
    } else {
      message("Identified duplicated but failed to remove them")
    }
    
  } else {
    message("Table contains no duplicates - not performing any deletions")
  }
  
  return (de_dupe_success)
}
kyleengel/btclearn documentation built on June 7, 2018, 12:26 a.m.