R/clean.R

Defines functions log_cleaning

Documented in log_cleaning

#' Log a cleaning change
#'
#' @param data The dataset that is being cleaned. Has to be a data.frame (no tibble).
#' @param uuid A vector with uuid character objects. Can be one but also multiple.
#' If there are multiple the cleaning will iterate over all of them and use the same question.name, issue and new.value for them.
#' @param action This is the action that needs to be done on the defined row. It is either flag (f), change (c) or deletion (d).
#' @param extra_columns Must be a list with the names of the objects being the names of the columns in the cleaning log
#' and the content of the list being the names of the dataset columns associated with it (see example). This is not required.
#' But it should be the same as the initiated logbook.
#' @param question.name The name of the column in which the value is that needs to be flagged or changed.
#' @param new.value The value that the old value needs to be changed to (only for change action). Must be either the same length as
#' the uuid vector or length 1 (if all need to be changed to the same value).
#' @param issue A description of the issue why it is being cleaned.
#' @param dir The directory name of the location where it needs to be saved.
#' @param filename Name of the logbook. Default is cleaning_logbook.csv. Needs to have a csv extension.
#' @return It returns the logbook as data.frame.
#' @examples
#' dir <- getwd()
#' extra_columns <- list(population_group = "population_group", governorate = "governorate_mcna")
#' \dontrun{
#' init_cleaning_log(dir, extra_columns = extra_columns)
#'
#' data(mcna2019)
#' library(dplyr)
#' idp_first_place <- mcna2019 %>% dplyr::filter(idp_first_place == "yes")
#' flag <- difftime(as.POSIXct(idp_first_place$arrival_date_idp, format="%Y-%m-%d"),
#'                  as.POSIXct(idp_first_place$displace_date_idp, format="%Y-%m-%d"),
#'                  units = "weeks") > 4
#' idp_first_place[which(flag), c("displace_date_idp", "arrival_date_idp")]
#' uuid <- idp_first_place$X_uuid[which(flag)]
#' log <- log_cleaning(mcna2019, uuid, action = "f",  extra_columns = extra_columns,
#'                                     question.name="arrival_date_idp",
#'                                     issue="The difference between displace date
#'                                     and arrival date while it being first place of
#'                                     displacement is more than 4 weeks",
#'                                     dir = dir)
#' }
log_cleaning <- function(data, uuid, action, extra_columns = list(),
                         question.name = NULL, new.value = NULL, issue = NULL,
                         dir, filename = "cleaning_logbook.csv") {
  # Appends one logbook row per uuid to the csv logbook at dir/filename,
  # rewrites that file, and returns the updated logbook as a data.frame.
  # All argument validation happens before the file is read or written, so
  # a failing call never leaves a half-updated logbook on disk.

  # Factors must be written by label: sub-assigning a factor into a plain
  # vector stores its integer codes instead of the visible value.
  as_plain <- function(x) if (is.factor(x)) as.character(x) else x

  # ---- Validate arguments ----
  if (length(uuid) == 0) {
    stop("uuid parameter is empty; no records to add to logbook.")
  }
  # Only a plain data.frame is accepted (no tibble). identical() keeps the
  # test scalar; comparing class() with `!=` gives a length > 1 condition
  # for multi-class objects, which `if` rejects in recent R versions.
  if (!identical(class(data), "data.frame")) {
    stop("Dataset should be of class data.frame.")
  }
  if (length(action) != 1 || is.na(action)) {
    stop("Action given is not recognized.")
  }
  # Accept both the one-letter shortcut and the full action name.
  action <- switch(as.character(action),
    c = ,
    change = "change",
    d = ,
    deletion = "deletion",
    f = ,
    flag = "flag",
    stop("Action given is not recognized.")
  )
  if (action == "change" && (is.null(question.name) || is.null(new.value))) {
    stop("For a change the parameters question.name and new.value should be given.")
  }
  if (action == "flag" && is.null(question.name)) {
    stop("For a flag the parameter question.name should be given.")
  }
  # Any other length would silently log NA as the new value.
  if (action == "change" && !length(new.value) %in% c(1L, length(uuid))) {
    stop("new.value should have length 1 or the same length as uuid.")
  }
  if (!"X_uuid" %in% names(data)) {
    stop("Dataset should contain an X_uuid column.")
  }
  unknown_uuid <- unique(uuid[!uuid %in% data[["X_uuid"]]])
  if (length(unknown_uuid) > 0) {
    stop("uuid not found in the dataset: ", paste(unknown_uuid, collapse = ", "))
  }

  # ---- Load the existing logbook ----
  log_file <- file.path(dir, filename)
  if (!file.exists(log_file)) {
    stop("Cleaning log not found, check the filename or initiate it first with dclogger::init_cleaning_log()")
  }
  logbook <- read.csv(log_file, stringsAsFactors = FALSE, encoding = "UTF-8")

  # The extra columns are the ones between the first two columns and
  # question.name. Compare them as whole vectors so that a difference in
  # length cannot be hidden by recycling.
  question_col <- match("question.name", names(logbook))
  if (is.na(question_col)) {
    stop("Cleaning log has no question.name column; initiate it first with dclogger::init_cleaning_log()")
  }
  log_extras <- if (question_col > 3) {
    names(logbook)[3:(question_col - 1)]
  } else {
    character(0)
  }
  given_extras <- as.character(names(extra_columns))
  if (!identical(log_extras, given_extras)) {
    check <- readline(prompt="Warning!\nextra_column information is different from the existing cleaning log.\n
                      This may lead to unexpected results and lost information.\nDo you wish to continue? (y/n)")
    if (check != "y") stop("Aborted.")
  }

  # ---- Append one row per uuid ----
  today <- format(Sys.Date(), "%d-%m-%Y")
  for (i in seq_along(uuid)) {
    n <- nrow(logbook) + 1
    # Assigning to row n creates the new row; untouched columns stay NA.
    logbook[n, "uuid"] <- as_plain(uuid[i])
    row_nr <- which(data[["X_uuid"]] == uuid[i])
    if (length(row_nr) > 1) {
      warning("UUID is not unique, use it only for flagging because executing changes or deletions in this way will give unexpected results.")
      row_nr <- row_nr[1]
    }
    logbook$log_date[n] <- today
    # seq_along() makes this a no-op for the default empty list.
    for (j in seq_along(extra_columns)) {
      logbook[n, names(extra_columns)[j]] <- as_plain(data[row_nr, extra_columns[[j]]])
    }
    logbook$action[n] <- action
    # issue is optional; without it the cell keeps the NA of the new row.
    if (!is.null(issue)) {
      logbook$issue[n] <- issue
    }
    if (action != "deletion") {
      logbook$question.name[n] <- question.name
      logbook$old.value[n] <- as_plain(data[row_nr, question.name])
    }
    if (action == "change") {
      new_val <- if (length(new.value) == 1) new.value else new.value[i]
      logbook$new.value[n] <- as_plain(new_val)
    }
  }

  write.csv(logbook, log_file, row.names = FALSE, fileEncoding = "UTF-8")
  logbook
}
boukepieter/dclogger documentation built on Feb. 7, 2020, 8:34 p.m.