R/check_quality.R
In clickR: Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

Documented in bivariate_outliers check_quality extreme_values may.numeric numeros ttrue workspace workspace_sapply xscores

#' Checks if each value might be numeric
#'
#' @description Checks if each value from a vector might be numeric
#' @param x A vector
#' @return A logical vector
may.numeric <- function(x) suppressWarnings(!is.na(numeros(x)))

#' Extreme values from a numeric vector
#'
#' @description Returns the nth lowest and highest values from a vector
#' @param x A vector
#' @param n Number of extreme values to return
#' @param id ID column to reference the found extreme values
#' @return A matrix with the lowest and highest values from a vector
extreme_values <- function(x, n=5, id=NULL){
  h<-matrix(rep("", n*2), ncol=n)
  m<-matrix(sort(na.omit(x))[c(1:n, (length(na.omit(x))-(n-1)):length(na.omit(x)))], nrow=2, byrow=TRUE)
  if(!is.null(id)){
    h<- matrix(id[order(x)][c(1:n, (length(na.omit(x))-(n-1)):length(na.omit(x)))], nrow=2, byrow=TRUE)
  }
  return(list(Values=setNames(data.frame(t(m)), c("Low", "High")), IDs=setNames(data.frame(t(h)), c("Low", "High"))))
}

#' Brute numeric coercion
#'
#' @description If possible, coerces values from a vector to numeric
#' @param x A vector
#' @return A numeric vector
numeros <- function(x){
  suppressWarnings(
    as.numeric(
      gsub(
        paste(c(",", "\\.\\.", ",,", "\\.,", ",\\.", "\\."),
              collapse = "|"),
        ".",
        gsub(
          "[A-Za-z]",
          "",
          iconv(
            gsub(
              "^ *|(?<= ) | *$",
              "",
              gsub("\\$|\u20ac|\u00A3",
                   "",
                   tolower(as.character(x))),
              perl = TRUE),
            to = "ASCII//TRANSLIT")))))
}

#' True TRUE
#'
#' @description Makes possible vectorized logical comparisons against NULL and NA values
#' @param x A logical vector
#' @return A logical vector
#' @export
ttrue <- function(x){
  x[is.na(x)] <- FALSE
  if(length(x) == 0L) x <- FALSE
  x
}

#' greater & NA
#'
#' @description '>' operator where NA values return FALSE
#' @param x Vector for the left side of the operator
#' @param y A Scalar or vector of the same length as x for the right side of the operator
#' @return A logical vector of the same length as x
#' @export
`%>NA%` <- function(x, y){
  ttrue(x > y)
}

#' less & NA
#'
#' @description '<' operator where NA values return FALSE
#' @param x Vector for the left side of the operator
#' @param y A Scalar or vector of the same length as x for the right side of the operator
#' @return A logical vector of the same length as x
#' @export
`%<NA%` <- function(x, y){
  ttrue(x < y)
}

#' geq & not NA
#'
#' @description '>=' operator where NA values return FALSE
#' @param x Vector for the left side of the operator
#' @param y A Scalar or vector of the same length as x for the right side of the operator
#' @return A logical vector of the same length as x
#' @export
`%>=NA%` <- function(x, y){
  ttrue(x >= y)
}

#' leq & not NA
#'
#' @description '<=' operator where NA values return FALSE
#' @param x Vector for the left side of the operator
#' @param y A Scalar or vector of the same length as x for the right side of the operator
#' @return A logical vector of the same length as x
#' @export
`%<=NA%` <- function(x, y){
  ttrue(x <= y)
}

#' between operator
#'
#' @description Operator equivalent to x >= lower.value & x <= upper.value
#' @param x Vector for the left side of the operator
#' @param y A vector of length two with the lower and upper values of the interval
#' @return A logical vector of the same length as x
#' @export
`%between%` <- function(x, y){
  x >= y[1] & x <= y[2]
}

#' between operator & not NA
#'
#' @description Operator equivalent to x >= lower.value & x <= upper.value & !is.na(x)
#' @param x Vector for the left side of the operator
#' @param y A vector of length two with the lower and upper values of the interval
#' @return A logical vector of the same length as x
#' @export
`%betweenNA%` <- function(x, y){
  x %>=NA% y[1] & x %<=NA% y[2]
}


#' Checks data quality of a variable
#'
#' @description Returns different data quality details of a numeric or categorical variable
#' @param x A variable from a data.frame
#' @param id ID column to reference the found extreme values
#' @param plot If the variable is numeric, should a boxplot be drawn?
#' @param numeric If set to TRUE, forces the variable to be considered numeric
#' @param k Number of different numeric values in a variable to be considered as numeric
#' @param n Number of extreme values to extract
#' @param output Format of the output. If TRUE, optimize for exporting as csv
#' @param ... further arguments passed to boxplot()
#' @return A list of a data.frame with information about data quality of the variable
#' @importFrom graphics boxplot dotchart
#' @importFrom stats setNames
#' @export
#' @examples
#' check_quality(airquality$Ozone)  #For one variable
#' lapply(airquality, check_quality)  #For a data.frame
#' lapply(airquality, check_quality, output=TRUE)  #For a data.frame, one row per variable
check_quality <- function(x, id=1:length(x), plot=TRUE, numeric=NULL, k=5, n=ifelse(is.numeric(x) | ttrue(numeric) | class(x) %in% "Date", 5, 2), output=FALSE, ...){
  call_n <- !is.null(as.list(match.call())$n)
  num <- numeric
  date <- class(x) %in% "Date"
  numbers <- sum(may.numeric(x))
  offending_values<-NA
  if(is.null(numeric)){
    if(numbers>(length(x)/10) & length(unique(numeros(x))) > k) {
      num<-TRUE
      n<-max(c(n, 5*!call_n))} else num <- FALSE
  }
  if(num & !is.numeric(x)){
    warning("Numeric variable encoded as a factor. Use fix_numerics() to amend", call.=FALSE)
    off<-table(x)[is.na(numeros(names(table(x))))]
    offending_values<-paste(paste(names(off), " (", off, ")", sep=""), collapse="; ", sep="")
  }
  if(plot & num) boxplot(numeros(x), col="gray", outcol="darkred", pch=16, las=1, ylab=as.character(as.list(match.call())$x)[3], ...)
  if(plot & !num & !date) dotchart(sort(setNames(as.numeric(table(x)), names(table(x)))), pch=16)
  if(plot & date) plot(sort(x, na.last = TRUE), 1:length(x), pch=16, col=rgb(50, 50, 50, 100, maxColorValue = 255), las=1, xlab="Date", ylab="")
  Extremes_low<-NA
  Extremes_high<-NA
  Tabla2<-NA
  if(num){
    Extremes <- extreme_values(x, n, id)
  }
  if(!num & !date){
    Table <- data.frame(sort(table(x))[1:n])
    Tabla2<-paste(apply(data.frame(sort(table(x))[1:n]), 1, function(x) paste(x[1], " (", x[2], ")", sep="")), collapse="; ")
  }
  if(date){
    h<-matrix(rep("", n*2), ncol=2)
    m<-data.frame(Low=sort(na.omit(x))[1:n], high=sort(na.omit(x))[(length(na.omit(x))-(n-1)):length(na.omit(x))])
    if(!is.null(id)){
      h<- t(matrix(id[order(x)][c(1:n, (length(na.omit(x))-(n-1)):length(na.omit(x)))], nrow=2, byrow=TRUE))
    }
    Extremes <- list(Values=m, IDs=h)
  }
  if(num | date){
    Extremes_low <- gsub(";", "];", paste(paste(Extremes$Values[, 1], Extremes$IDs[, 1], sep=" ["), "; ", collapse="", sep=""))
    Extremes_high <- gsub(";", "];", paste(paste(Extremes$Values[, 2], Extremes$IDs[, 2], sep=" ["), "; ", collapse="", sep=""))
  }
  N.Categories <- length(table(x))
  Strings <- sum(grepl("[[:alpha:]]", x))
  NAs<-sum(is.na(x))
  whitespace<-sum(x %in% "" | x %in% " ", na.rm=TRUE)
  if(output){
    res<-data.frame(n=length(x), NAs=NAs, whitespace=whitespace, numbers=numbers, Strings=Strings,
                    N.Categories=N.Categories, Extremes_low=Extremes_low, Extremes_high=Extremes_high, Table=strtrim(Tabla2, 150),
                    Offenders=strtrim(offending_values, 150))
  } else{
    res<-list(Summary=data.frame(n=length(x), NAs=NAs, whitespace=whitespace, numbers=numbers, strings=Strings, class=class(x)), Extremes=if(num | date) Extremes else cbind(Table, N.Categories=c(rep("", n-1), length(table(x)))), Offending=offending_values)
  }
  return(res)
}

#' Explores global environment workspace
#'
#' @description Returns information regarding the different objects in global environment
#' @param table If TRUE a table with the frequencies of each type of object is given
#' @return A list of object names by class or a table with frequencies if table = TRUE
#' @export
#' @examples
#' df1 <- data.frame(x=rnorm(10), y=rnorm(10, 1, 2))
#' df2 <- data.frame(x=rnorm(20), y=rnorm(20, 1, 2))
#' workspace(table=TRUE)  #Frequency table of the different object classes
#' workspace()  #All objects in the global object separated by class
workspace <- function(table=FALSE) {
  list_obj <- split(objects(envir=.GlobalEnv), sapply(objects(envir=.GlobalEnv), function(x) class(get(x, envir=.GlobalEnv))[length(class(get(x, envir=.GlobalEnv)))]))
  if(table) sapply(list_obj, function(x) length(x)) else list_obj
}


#' Applies a function over objects of a specific class
#'
#' @description Applies a function over all objects of a specific class in the global environment
#' @param object_class Class of the objects where the function is to be applied
#' @param action Name of the function to apply
#' @return Results of the function
#' @export
#' @examples
#' df1 <- data.frame(x=rnorm(10), y=rnorm(10, 1, 2))
#' df2 <- data.frame(x=rnorm(20), y=rnorm(20, 1, 2))
#' workspace_sapply("data.frame", "summary")  #Gives a summary of each data.frame
workspace_sapply <- function(object_class, action="summary"){
  sapply(workspace()[[object_class]], function(x) get(action)(get(x)), simplify=FALSE)
}

#' Check for bivariate outliers
#'
#' @description Checks for bivariate outliers in a data.frame
#' @param x A data.frame object
#' @param threshold_r Threshold for the case of two continuous variables
#' @param threshold_b Threshold for the case of one continuous and one categorical variable
#' @return A data frame with all the observations considered as bivariate outliers
#' @importFrom stats cooks.distance
#' @importFrom utils combn
#' @export
#' @examples
#' bivariate_outliers(iris)
bivariate_outliers <- function(x, threshold_r=10, threshold_b=1.5){
  pairwise_comb <- combn(1:ncol(x), 2)
  outliers <- apply(pairwise_comb, 2, function(y){
    if(all(sapply(x[,y], is.numeric))){
      data_l <- data.frame(x=x[ , y[1]], y=x[ , y[2]])
      mod_a <- stats::rstudent(lm(x ~ y + I(y^2) + I(y^3), data=data_l, na.action = "na.exclude"))^2
      mod_b <- stats::rstudent(lm(y ~ x + I(x^2) + I(x^3), data=data_l, na.action = "na.exclude"))^2
      rs <- (mod_a+mod_b)/mean(mod_a+mod_b, na.rm=TRUE)
      if(any(rs %>NA% threshold_r)){
        data.frame(row=rownames(x)[which(rs %>NA% threshold_r)], variable1=names(x)[y[1]], value1=x[,y[1]][which(rs %>NA% threshold_r)],
                   variable2=names(x)[y[2]], value2=x[,y[2]][which(rs %>NA% threshold_r)])
      }
    } else{
      if(sum(sapply(x[,y], is.numeric) * rev(sapply(x[,y], is.factor))) == 1){
        factor <- sapply(x[,y], is.factor)
        case <- unsplit(lapply(split(x[,y][,!factor], x[,y][,factor]), function(x) outliers(x, threshold_b)), x[,y][,factor])
        if(any(case)){
          data.frame(row=rownames(x)[ttrue(case)], variable1=names(x)[y[1]], value1=as.character(x[,y[1]][ttrue(case)]),
                     variable2=names(x)[y[2]], value2=as.character(x[,y[2]][ttrue(case)]))
        }
      }
    }
  })
  output <- do.call(rbind, as.list(outliers))
  rownames(output) <- NULL
  output
}

#' Estimate sample scores
#'
#' @description Calculates different scores to measure how much extreme are the different data points
#' @param x A vector
#' @param type 'z' calculates standard normal scores, 'z-out' calculates standard normal scores excluding
#' each data point when computing the mean and the standard deviation, 't' calculates t scores,
#' 'chisq' calculates chisquared scores, 'tukey' calculates scores based on the boxplot method,
#' 'mad' calculates scores using median and mad instead of mean and sd.
#' @importFrom stats mad
#' @export
#' @examples
#' xscores(iris$Sepal.Length, type="z-out")
xscores <- function(x, type="z"){
  if(! type %in% c("z", "z-out", "t", "chisq", "tukey", "mad")) stop("Invalid method. Available methods are: 'z', 't', 'chisq', 'tukey' and 'mad'")
  if(type=="z"){
    score <- (x - mean(x))/sd(x)
  }
  if(type=="z-out"){
    score <- sapply(1:length(x), function(y) (x[y] - mean(x[-y]))/sd(x[-y]))
  }
  if(type=="t"){
    score <- (((x - mean(x))/sd(x)) * sqrt(length(x) - 2))/sqrt(length(x) - 1 - ((x - mean(x))/sd(x))^2)
  }
  if(type=="chisq"){
    score <- (x - mean(x))^2/var(x)
  }
  if(type=="tukey"){
    k <- quantile(x, probs=c(0.25, 0.75))
    score <- ifelse(x < k[1], (x-k[1])/diff(k),
                    ifelse(x > k[2], (x-k[2])/diff(k),
                           ifelse(is.na(x), NA, 0)))
  }
  if(type=="mad"){
    score <- (x - median(x))/mad(x)
  }
  score
}

Any scripts or data that you put into this service are public.

clickR documentation built on April 3, 2025, 9:22 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

clickR
Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

R/check_quality.R
In clickR: Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

Defines functions xscores bivariate_outliers workspace_sapply workspace check_quality `%betweenNA%` `%between%` `%<=NA%` `%>=NA%` `%<NA%` `%>NA%` ttrue numeros extreme_values may.numeric

Documented in bivariate_outliers check_quality extreme_values may.numeric numeros ttrue workspace workspace_sapply xscores

Try the clickR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

clickR Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

R/check_quality.R In clickR: Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

Defines functions xscores bivariate_outliers workspace_sapply workspace check_quality `%betweenNA%` `%between%` `%<=NA%` `%>=NA%` `%<NA%` `%>NA%` ttrue numeros extreme_values may.numeric

Documented in bivariate_outliers check_quality extreme_values may.numeric numeros ttrue workspace workspace_sapply xscores

Try the clickR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

clickR
Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning

R/check_quality.R
In clickR: Semi-Automatic Preprocessing of Messy Data with Change Tracking for Dataset Cleaning