R/utilities.R

Defines functions select_other_columns empty_issues_table aggregate_count data_validation_outliers_normal data_validation_outliers_log_normal outliers.numerical log.outliers.numerical setcolnames hasdata

Documented in hasdata

select_other_columns <- function(data) {
  othernames <- grep("other$|Other$|autre$|Autre$", names(data),
    value = T
  )
  data[othernames]
}

empty_issues_table <- function() {
  data.frame(
    index = numeric(), value = numeric(), variable = character(),
    has_issue = logical(), issue_type = character()
  )

}

aggregate_count <- function(df, split.by = NULL, ignore.missing.data = T, write.to.file = NULL) {
  if (!is.null(split.by)) {
    insure.string.is.column.header(df, split.by)
  }
  if (!is.null(split.by)) {
    insure.is.single.value(split.by)
  }
  counts <- lapply(df, function(d) {
    wtd.table(x = d, y = (if (is.null(split.by)) {
      NULL
    }
    else {
      df[[split.by]]
    }), na.rm = ignore.missing.data)
  })
  if (!is.null(write.to.file)) {
    write.csv.untidy(counts, write.to.file)
  }
  return(counts)
}


data_validation_outliers_normal <- function(data, maximum_standard_deviations = 3) {
  outliers_normal <- data %>% lapply(outliers.numerical, maximum_standard_deviations = maximum_standard_deviations)
  return(outliers_normal)
}

data_validation_outliers_log_normal <- function(data, maximum_standard_deviations = 3) {
  outliers_log_normal <- data %>% lapply(log.outliers.numerical, maximum_standard_deviations = maximum_standard_deviations)
  return(outliers_log_normal)
}

# detecting outliers:
outliers.numerical <- function(x, maximum_standard_deviations = 3) {
  # IN:
  # x: numerical vector
  # maximum_standard_deviations: integer
  # out: vector of indicies, of values in x that deviate more than maximum_standard_deviations from the mean.

  x <- gsub(" ","",x)
  x<- suppressWarnings(as.numeric(as.character(x))) # as.character to prevent factors from being treated is factor level ID integer
  x_data_only<-hasdata(x)
  x_data_only_indices<-hasdata(x,return.index = T)
  outliers_indices_in_data_only<-which(abs(x_data_only-mean(x_data_only))>maximum_standard_deviations*sd(x_data_only)& length(unique(x_data_only))>10)
  outliers_indices_in_original_vector<-x_data_only_indices[outliers_indices_in_data_only]
  return(
    cbind(
      index=outliers_indices_in_original_vector,
      value=x[outliers_indices_in_original_vector]) %>% as.data.frame(stringsAsFactors=F))

}

log.outliers.numerical <- function(x, maximum_standard_deviations = 3) {
  # IN:
  # x: numerical vector
  # maximum_standard_deviations: integer
  # out: vector of indicies, of values in x that deviate more than maximum_standard_deviations from the mean.
  x<- gsub(" ","",x)
  x<- suppressWarnings(as.numeric(as.character(x))) # as.character to prevent factors from being treated is factor level ID integer
  x_not_logged<-x
  x <- suppressWarnings(log(x))
  x_data_only <- hasdata(x)
  x_data_only_indices <- hasdata(x, return.index = T)
  outliers_indices_in_data_only <- which(abs(x_data_only - mean(x_data_only)) > maximum_standard_deviations * sd(x_data_only) & length(unique(x_data_only)) > 10)
  outliers_indices_in_original_vector <- x_data_only_indices[outliers_indices_in_data_only]
  return(
    cbind(

      index=outliers_indices_in_original_vector,
      value=x_not_logged[outliers_indices_in_original_vector]) %>% as.data.frame(stringsAsFactors=F))
}


setcolnames <- function(x, colnames) {
  colnames(x) <- colnames
  return(x)
}

#' has data
#' removes NA, empty strings and non-finite values from a vector
#' @param x vector
#' @param return.index if true, returns indices of the vector that have valid data. Defaults to FALSE.
#' @return returns the values of the input vector that contain valid data
hasdata <- function(x, return.index = F) {
  # in: vector of any class
  # out: the in vector without NULL,NA, or ""
  index <- which(!is.null(x) & !is.na(x) & x != "" & !is.infinite(x))
  value <- x[which(!is.null(x) & !is.na(x) & x != "" & !is.infinite(x))]
  if (return.index) {
    return(index)
  }
  return(value)
}
ellieallien/cleaninginspectoR documentation built on July 18, 2019, 12:30 p.m.