clean_data | R Documentation |
Cleans up messy data frames by performing several operations. These include, among others: cleaning column names; detecting and removing duplicates, empty records and columns, and constant columns; replacing missing values with NA; converting character columns into dates when they contain a certain number of date values; and detecting subject IDs with wrong formats.
clean_data(data, ...)
data |
The input data frame to be cleaned. |
... |
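A set of named arguments, each one a list of parameters for a cleaning operation (or NULL when that operation should not be performed), e.g. standardize_column_names, remove_constants, replace_missing_values, remove_duplicates, standardize_dates, standardize_subject_ids, to_numeric, dictionary and check_date_sequence.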
|
The input data, cleaned according to the user-specified parameters.
A data cleaning report is attached to the result and can be accessed using
attr(cleaned_data, "report")
# Example: clean the test data set shipped with the package, configuring each
# cleaning operation through its own named list of parameters.

# Parameters for column names standardization
standardize_column_names <- list(keep = NULL, rename = NULL)

# Parameters to remove constant columns, empty rows and columns
remove_constants <- list(cutoff = 1)

# Parameters for substituting missing values with NA
replace_missing_values <- list(target_columns = NULL, na_strings = "-99")

# Parameters for duplicates removal across all columns
remove_duplicates <- list(target_columns = NULL)

# Parameters for dates standardization
standardize_dates <- list(
  target_columns = NULL,
  error_tolerance = 0.4,
  format = NULL,
  timeframe = as.Date(c("1973-05-29", "2023-05-29")),
  orders = list(
    world_named_months = c("Ybd", "dby"),
    world_digit_months = c("dmy", "Ymd"),
    US_formats = c("Omdy", "YOmd")
  )
)

# Parameters for subject IDs standardization
standardize_subject_ids <- list(
  target_columns = "study_id",
  prefix = "PS",
  suffix = "P2",
  range = c(1, 100),
  nchar = 7
)

# Convert the 'sex' column into numeric
to_numeric <- list(target_columns = "sex", lang = "en")

# The dictionary-based cleaning will not be performed here
dictionary <- NULL

# No need to check for the sequence of date events
check_date_sequence <- NULL

# Run the cleaning. Every operation receives the parameter object defined
# above, including the two that are switched off with NULL.
cleaned_data <- clean_data(
  data = readRDS(
    system.file("extdata", "test_df.RDS", package = "cleanepi")
  ),
  standardize_column_names = standardize_column_names,
  remove_constants = remove_constants,
  replace_missing_values = replace_missing_values,
  remove_duplicates = remove_duplicates,
  standardize_dates = standardize_dates,
  standardize_subject_ids = standardize_subject_ids,
  to_numeric = to_numeric,
  dictionary = dictionary,
  check_date_sequence = check_date_sequence
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.