#' To check a dataset
#'
#' @param data imported dataset to be checked
#' @param info information about variables which will be used to check data
#' @param id name of variable that contains subejct identification
#' @param check_missing vector of variables to check missingness
#' @param plot whether create a plot of distribution of all variables or not
#' @param prefix prefix in name of output
#' @param outdir directory of output
#'
#' @return .csv file that contains list of potential errors
#' @export
inspect.data <- function(data, info, id, check_missing, plot = FALSE, prefix = "", outdir){
if (missing(outdir)) outdir <- "."
# select variables to check
data <- data[,names(data)[names(data) %in% info$varname]]
info <- info[match(names(data), info$varname),]
# check missing
if (missing(check_missing)) {
info$check_missing <- TRUE
} else {
info$check_missing <- check_missing
}
## check data
output <- do.call("rbind", mapply(inspect.each,
x = as.list(data),
varname = info$varname,
value = info$value,
type = info$type,
check_missing = info$check_missing))
## add id: use "=""...""" to avoid Excel to interpret string as Date
if (!missing(id) & !is.null(output)) {
output[, id] <- paste('"=""', as.character(data[output$index, id]), '"""', sep = "")
output <- output[, c(id, "index", "error")]
}
## plot distribution of data
if (plot) {
pdf(file = file.path(outdir, paste(prefix, "distribution.pdf", sep = "_")), width = 12, height = 10, family = "Helvetica", fonts = NULL, paper = "a4r")
layout(matrix(c(1:12), ncol = 4, nrow = 3, byrow = TRUE), respect = TRUE)
for (i in (1:length(info$varname))) {
if (length(na.omit(data[, info$varname[i]])) == 0) {
plot(x = 1:10, y = 1:10, main = info$varname[i], type = "n", xlab = "", ylab = "")
text(x = 5, y = 5, labels = "No non-missing value")
} else {
if (mode(data[, info$varname[i]]) != "numeric" | is.factor(data[, info$varname[i]])) {
barplot(table(data[, info$varname[i]]), main = info$varname[i])
} else {
if (is.POSIXct(data[, info$varname[i]])) {
boxplot(as.Date(data[, info$varname[i]]), main = info$varname[i])
} else {
boxplot(data[, info$varname[i]], main = info$varname[i])
}
}
}
}
dev.off()
}
## write output
if (!is.null(output)) {
write.csv(output, file = file.path(outdir, paste(prefix, "error.csv", sep = "_")), quote = FALSE, row.names = FALSE)
return(output)
} else {
cat("No error was found !")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.