R/deidentify_methods.R

Defines functions deidentify_text.data.frame deidentify_text

Documented in deidentify_text

#' @title methods to deidentify different forms of data
#' @description
#' The deidentify_text method offers a flexible approach to encryption offering the user the option to encrypt text
#' columns using either openssl or sodium. The encryption path chosen depends on the encryption key generated by
#' the user. For example, if a user generates an aes_key, and passes it to deidentify_text, aes encryption will be
#' automatically applied. The function will return a dataframe with the an encrypt_id suffix added to the end of the
#' column name. Thus, a user will have to actively remove the unencrypted columns manually.
#' @param .data
#' a data frame
#' @param key
#' A cyphr key generated from the cyphr package
#' @param cols_to_encrypt
#' This is specific to data.frames. This should be a character vector specifying
#' the names of the desired columns to encrypt.
#' @return
#' depending on the input, an object of the same class. If you input
#' a data.frame, you will get a data.frame with columns with a enc suffix
#' @examples
#' \dontrun{
#' data("initiations")
#' aes_key <- gen_aes_key(24)
#' deidentify_text(initiations, "case_participant_id", aes_key)
#' }
#' @export
deidentify_text <- function(.data,key,cols_to_encrypt = NULL) {
  UseMethod("deidentify_text")
}

#' @export
deidentify_text.data.frame <- function(.data, key = NULL, cols_to_encrypt) {
  if (!is.character(cols_to_encrypt)) {
    stop("cols_to_encrypt must be a character vector.")
  }
  if (class(key) != "cyphr_key") {
    stop("key must be a cyphr key")
  }
  # check that the column name actually exists in the data.
  if (!all(cols_to_encrypt %in% names(.data))) {
    stop("You have selected columns which are not in the data.")
  }
  enc_data <- .data[, cols_to_encrypt, drop = F] %>%
    dplyr::mutate(dplyr::across(dplyr::everything(),
                                list("encrypt_id" = ~ .apply_enc(as.character(.), key = key)))) %>%
    # rejoin initial data based on unencrypted columns
    dplyr::left_join(.data, by = cols_to_encrypt)
  return(enc_data)
}

#' @export
deidentify_text.character <- function(.data, key = NULL,cols_to_encrypt = NULL) {
  if(is.null(key)){
    stop("Key must be provided in order to encrypt")
  }
  if (class(key) != "cyphr_key") {
    stop("key must be a cyphr key")
  }
  enc_data <- .data %>%
    .apply_enc(key = key) #encrypt the string
  return(enc_data)
}



.enc_strings <- function(col_to_encrypt, key) {
  # encrypts the vector depending on the key
  enc_string <- paste0(cyphr::encrypt_string(col_to_encrypt, key = key), collapse = "")
  return(enc_string)
}


# this encrypts the unique vector of strings and passes it on.

.apply_enc <- function(string_to_encrypt, key) {
  unique_ent <- unique(string_to_encrypt)
  enc_string <-
    unlist(lapply(unique_ent, function(x) {
      .enc_strings(x, key)
    }))
  # change the names based on the original encrypted string
  names(enc_string) <- unique_ent
  # use a lookup table to speedily replace values
  # this also removes the names of the vector
  final_vec <- unname(enc_string[string_to_encrypt])
  return(final_vec)
}

is_any_date_type <- function(x) {
  return(lubridate::is.Date(x) | lubridate::is.POSIXct(x) | lubridate::is.POSIXlt(x))
}
phillydao/deidentify documentation built on Feb. 4, 2021, 2:31 p.m.