run_preprocessing <- function(
cl,
feature_info_list = NULL,
project_info,
settings,
file_paths,
message_indent = 0L,
verbose) {
# Suppress NOTES due to non-standard evaluation in data.table
data_id <- run_id <- list_name <- complete <- NULL
# Determine how parallel processing takes place.
if (settings$prep$do_parallel %in% c("TRUE", "inner")) {
# Parallel processing in inner function, i.e. within each data subset.
cl_inner <- cl
cl_outer <- NULL
} else if (settings$prep$do_parallel %in% c("outer")) {
# Parallel processing in outer loop, i.e. over all data subsets.
cl_inner <- NULL
cl_outer <- cl
if (!is.null(cl_outer)) {
logger_message(paste0(
"\nPre-processing: Load-balanced parallel processing is done in the outer loop. ",
"No progress can be displayed."),
indent = message_indent,
verbose = verbose)
}
} else {
# No parallel processing.
cl_inner <- cl_outer <- NULL
}
  # Check whether a feature info list was already created. If not, a generic
  # feature info list is created; this is typically the case when called from
  # summon_familiar.
if (is.null(feature_info_list)) {
feature_info_list <- .get_feature_info_data(
data = get_data_from_backend(),
file_paths = file_paths,
project_id = project_info$project_id,
outcome_type = settings$data$outcome_type)
}
# TODO: Check if the generic contains all the required data -- particularly
# for externally provided feature information.
  # Create a list of runs for which pre-processing information should be
  # obtained. First find the data ids over which to iterate.
data_id <- c(
.get_process_step_data_identifier(
project_info = project_info,
process_step = "fs"),
.get_process_step_data_identifier(
project_info = project_info,
process_step = "mb"))
# Create a list of runs, with data_id and run_id.
run_list <- data.table::rbindlist(lapply(
unique(data_id),
function(data_id, project_info) {
# Find the current data identifier for pre-processing. This may or may not
# be data_id.
      pre_process_data_id <- .get_preprocessing_iteration_identifiers(
        run = .get_run_list(
          iteration_list = project_info$iter_list,
          data_id = data_id,
          run_id = 1L))$data
# Find data and run ids.
iteration_list <- .get_run_list(
iteration_list = project_info$iter_list,
data_id = pre_process_data_id)
# Iterate over the iteration list, extract the run-table and return it.
return(data.table::rbindlist(lapply(
iteration_list,
        function(x) tail(x$run_table, n = 1L))))
},
project_info = project_info))
# Remove duplicates.
run_list <- unique(run_list)
# Add list names and check for completeness.
run_list[, ":="(
"list_name" = .get_feature_info_list_name(data_id = data_id, run_id = run_id),
"complete" = FALSE)]
# Iterate over the runs and check which feature information lists are already
# fully complete.
run_list[, "complete" := feature_info_complete(feature_info_list[[list_name]]), by = "list_name"]
  # Select all runs that are not (fully) complete.
run_list <- run_list[complete == FALSE, ]
if (!is_empty(run_list)) {
# Set preprocessing run identifier and total number of datasets.
run_list[, ":="(
"preprocessing_run_id" = .I,
"n_preprocessing_runs" = nrow(run_list))]
# Iterate over data subsets for which parameters have not yet been set.
new_feature_info_list <- fam_mapply_lb(
cl = cl_outer,
assign = "data",
FUN = .run_preprocessing,
progress_bar = !is.null(cl_outer),
run = split(run_list, by = c("preprocessing_run_id")),
MoreArgs = list(
"cl" = cl_inner,
"feature_info_list" = feature_info_list,
"project_info" = project_info,
"settings" = settings,
"message_indent" = message_indent,
"verbose" = verbose & is.null(cl_outer)))
# Set names of the new feature list.
names(new_feature_info_list) <- run_list$list_name
# Update lists with feature information.
feature_info_list[run_list$list_name] <- new_feature_info_list
}
# Save to file, if necessary.
if (!is.null(file_paths)) {
# Determine file name
feature_info_file <- .get_feature_info_file_name(
file_paths = file_paths,
project_id = project_info$project_id)
# Write to file
saveRDS(feature_info_list, file = feature_info_file)
}
# Attach the feature info file to the backend.
.assign_feature_info_to_backend(feature_info_list = feature_info_list)
return(invisible(TRUE))
}
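
# A minimal usage sketch (guarded so it never runs): `project_info`, `settings`
# and `file_paths` are hypothetical objects assembled upstream, e.g. by
# summon_familiar.
if (FALSE) {
  cl <- parallel::makeCluster(2L)
  run_preprocessing(
    cl = cl,
    project_info = project_info,
    settings = settings,
    file_paths = file_paths,
    verbose = TRUE)
  parallel::stopCluster(cl)
}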
.run_preprocessing <- function(
cl = NULL,
run,
feature_info_list,
project_info,
settings,
message_indent,
verbose) {
logger_message(
paste0(
"\nPre-processing: Starting preprocessing for run ",
run$preprocessing_run_id, " of ",
run$n_preprocessing_runs, "."),
indent = message_indent,
verbose = verbose)
  # Create the template feature information list by combining the run-specific
  # (preferred), custom and generic lists.
template_feature_info <- combine_feature_info_list(
preferred = feature_info_list[[run$list_name]],
custom = feature_info_list[["custom"]],
generic = feature_info_list[["generic"]])
# Find pre-processing parameters
feature_info_list <- determine_preprocessing_parameters(
cl = cl,
feature_info_list = template_feature_info,
data_id = run$data_id,
run_id = run$run_id,
project_info = project_info,
settings = settings,
message_indent = message_indent + 1L,
verbose = verbose)
return(feature_info_list)
}
determine_preprocessing_parameters <- function(
cl = NULL,
feature_info_list,
data_id,
run_id,
project_info,
settings,
message_indent,
verbose) {
# Add workflow control info.
feature_info_list <- add_control_info(
feature_info_list = feature_info_list,
data_id = data_id,
run_id = run_id)
# Add signature feature info.
feature_info_list <- add_signature_info(
feature_info_list = feature_info_list,
signature = settings$data$signature)
# Add novelty feature info.
feature_info_list <- add_novelty_info(
feature_info_list = feature_info_list,
novelty_features = settings$data$novelty_features)
# Find the run list.
run_list <- .get_run_list(
iteration_list = project_info$iter_list,
data_id = data_id,
run_id = run_id)
# Select unique samples.
sample_identifiers <- .get_sample_identifiers(
run = run_list,
train_or_validate = "train")
sample_identifiers <- unique(sample_identifiers)
# Find currently available features.
available_features <- get_available_features(feature_info_list = feature_info_list)
# Create a dataObject.
data <- methods::new(
"dataObject",
data = get_data_from_backend(sample_identifiers = sample_identifiers),
preprocessing_level = "none",
outcome_type = settings$data$outcome_type)
# Remove unavailable features from the data object.
data <- filter_features(
data = data,
available_features = available_features)
return(.determine_preprocessing_parameters(
cl = cl,
data = data,
feature_info_list = feature_info_list,
settings = settings,
message_indent = message_indent,
verbose = verbose))
}
.get_feature_info_file_name <- function(file_paths, project_id) {
# Generate file name of pre-processing file
file_name <- paste0(project_id, "_feature_info.RDS")
# Add file path and normalise according to the OS
file_name <- normalizePath(file.path(
file_paths$process_data_dir, file_name), mustWork = FALSE)
return(file_name)
}
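
# For example (hypothetical paths), project_id = 1 with "process_data_dir" set
# to "/tmp/familiar" yields "/tmp/familiar/1_feature_info.RDS" (guarded so it
# never runs):
if (FALSE) {
  .get_feature_info_file_name(
    file_paths = list("process_data_dir" = "/tmp/familiar"),
    project_id = 1)
}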
.get_feature_info_list_name <- function(data_id, run_id) {
return(paste0(data_id, ".", run_id))
}
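# For example, data_id = 3 and run_id = 2 yield the list name "3.2", the naming
# scheme used for run_list and feature_info_list in run_preprocessing above.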
.get_feature_info_list <- function(run) {
# Find pre-processing control element for the current run
pre_proc_id_list <- .get_preprocessing_iteration_identifiers(run = run)
# Load feature info list from backend
feature_info_list <- get_feature_info_from_backend(
data_id = pre_proc_id_list$data,
run_id = pre_proc_id_list$run)
return(feature_info_list)
}
.determine_preprocessing_parameters <- function(
cl = NULL,
data,
feature_info_list,
settings,
message_indent = 0L,
verbose = FALSE) {
if (!is(data, "dataObject")) {
..error_reached_unreachable_code(
".determine_preprocessing_parameters: data is not a dataObject.")
}
if (is_empty(data)) stop("The provided dataset does not contain any samples.")
if (!has_feature_data(data)) stop("The provided dataset does not contain any features.")
# Remove samples with missing outcome data -----------------------------------
n_samples_current <- data.table::uniqueN(
x = data@data,
by = get_id_columns(id_depth = "sample"))
logger_message(
paste0("Pre-processing: ", n_samples_current, " samples were initially available."),
indent = message_indent,
verbose = verbose)
# Remove all samples with missing outcome data
data <- filter_missing_outcome(data = data, is_validation = FALSE)
if (is_empty(data)) stop("The provided training dataset lacks outcome data.")
n_samples_remain <- data.table::uniqueN(
x = data@data,
by = get_id_columns(id_depth = "sample"))
n_samples_removed <- n_samples_current - n_samples_remain
logger_message(
paste0("Pre-processing: ", n_samples_removed,
" samples were removed because of missing outcome data. ",
n_samples_remain, " samples remain."),
indent = message_indent,
verbose = verbose)
  # Remove features with a large fraction of missing values --------------------
n_features_current <- get_n_features(data)
logger_message(
paste0("Pre-processing: ", n_features_current, " features were initially available."),
indent = message_indent,
verbose = verbose)
# Determine the fraction of missing values
feature_info_list <- add_missing_value_fractions(
cl = cl,
feature_info_list = feature_info_list,
data = data,
threshold = settings$prep$feature_max_fraction_missing)
# Find features that are not missing too many values.
available_features <- get_available_features(feature_info_list = feature_info_list)
# Remove features with a high fraction of missing values
data <- filter_features(
data = data,
available_features = available_features)
if (!has_feature_data(data)) {
stop(paste0(
"The provided dataset lacks features with sufficient available values. ",
"Please investigate missing values in the dataset or increase the missingness ",
"threshold by increasing the feature_max_fraction_missing configuration parameter."))
}
# Message how many features were removed
logger_message(
paste0(
"Pre-processing: ", n_features_current - length(available_features),
" features were removed because of a high fraction of missing values. ",
length(available_features), " features remain."),
indent = message_indent,
verbose = verbose)
n_samples_current <- n_samples_remain
# Remove samples with a large fraction of missing values ---------------------
data <- filter_bad_samples(
data = data,
threshold = settings$prep$sample_max_fraction_missing)
if (is_empty(data)) {
stop(paste0(
"The provided dataset lacks samples with sufficient available feature values. ",
"Please investigate missing values in the dataset or increase the missingness ",
"threshold by increasing the sample_max_fraction_missing configuration parameter."))
}
  # Message how many samples were removed.
n_samples_remain <- data.table::uniqueN(
x = data@data,
by = get_id_columns(id_depth = "sample"))
logger_message(
paste0(
"Pre-processing: ", n_samples_current - n_samples_remain,
" samples were removed because of missing feature data. ",
n_samples_remain, " samples remain."),
indent = message_indent,
verbose = verbose)
n_features_current <- length(available_features)
# Remove invariant features --------------------------------------------------
# Filter features that are invariant.
feature_info_list <- find_invariant_features(
cl = cl,
feature_info_list = feature_info_list,
data = data)
# Find available features.
available_features <- get_available_features(feature_info_list = feature_info_list)
# Remove invariant features from the data
data <- filter_features(
data = data,
available_features = available_features)
if (!has_feature_data(data)) {
stop(paste0(
"Remaining features in the dataset only have a single value for all samples ",
"and cannot be used for training."))
}
# Message number of features removed by the no-variance filter.
logger_message(
paste0(
"Pre-processing: ", n_features_current - length(available_features),
" features were removed due to invariance. ",
length(available_features), " features remain."),
indent = message_indent,
verbose = verbose)
# Add feature distribution data ----------------------------------------------
  logger_message(
    "Pre-processing: Adding value distribution statistics to features.",
    indent = message_indent,
    verbose = verbose)
# Add feature distribution data
feature_info_list <- compute_feature_distribution_data(
cl = cl,
feature_info_list = feature_info_list,
data = data)
# Transform features ---------------------------------------------------------
logger_message(
"Pre-processing: Performing transformations to normalise feature value distributions.",
indent = message_indent,
verbose = verbose && settings$prep$transform_method != "none")
# Add skeletons to the feature information list.
  feature_info_list <- create_transformation_parameter_skeleton(
    feature_info_list = feature_info_list,
    transformation_method = settings$prep$transform_method,
    transformation_optimisation_criterion = settings$prep$transformation_optimisation_criterion,
    transformation_gof_p_value = settings$prep$transformation_gof_test_p_value)
# Add transformation parameters to the feature information list
feature_info_list <- add_transformation_parameters(
cl = cl,
feature_info_list = feature_info_list,
data = data,
verbose = verbose)
# Apply transformation.
data <- transform_features(
data = data,
feature_info_list = feature_info_list)
logger_message(
"Pre-processing: Feature distributions have been transformed for normalisation.",
indent = message_indent,
verbose = verbose & settings$prep$transform_method != "none")
# Remove low-variance features -----------------------------------------------
if ("low_variance" %in% settings$prep$filter_method) {
n_features_current <- length(available_features)
    # Filter features that have low variance.
feature_info_list <- find_low_variance_features(
cl = cl,
feature_info_list = feature_info_list,
data = data,
settings = settings)
# Check available features.
available_features <- get_available_features(feature_info_list = feature_info_list)
    # Remove low-variance features from the data
data <- filter_features(
data = data,
available_features = available_features)
if (!has_feature_data(data)) {
stop(paste0(
"Remaining features in the dataset have a variance that is lower than the threshold ",
"and were therefore all removed. Please investigate your data, or increase the threshold ",
"through the low_var_minimum_variance_threshold configuration parameter."))
}
# Message number of features removed by the low-variance filter.
logger_message(
paste0(
"Pre-processing: ", n_features_current - length(available_features),
" features were removed due to low variance. ",
length(available_features), " features remain."),
indent = message_indent,
verbose = verbose)
}
# Normalise features ---------------------------------------------------------
logger_message(
"Pre-processing: Extracting normalisation parameters from feature data.",
indent = message_indent,
verbose = verbose && settings$prep$normalisation_method != "none")
# Add skeletons to the feature information list.
feature_info_list <- create_normalisation_parameter_skeleton(
feature_info_list = feature_info_list,
normalisation_method = settings$prep$normalisation_method)
# Add normalisation parameters to the feature information list.
feature_info_list <- add_normalisation_parameters(
cl = cl,
feature_info_list = feature_info_list,
data = data,
verbose = verbose)
# Apply normalisation to data before clustering
data <- normalise_features(
data = data,
feature_info_list = feature_info_list)
logger_message(
"Pre-processing: Feature data were normalised.",
indent = message_indent,
verbose = verbose && settings$prep$normalisation_method != "none")
# Batch normalise features ---------------------------------------------------
logger_message(
"Pre-processing: Extracting batch normalisation parameters from feature data.",
indent = message_indent,
verbose = verbose && settings$prep$batch_normalisation_method != "none")
# Add batch normalisation skeletons.
feature_info_list <- create_batch_normalisation_parameter_skeleton(
feature_info_list = feature_info_list,
normalisation_method = settings$prep$batch_normalisation_method)
# Add batch normalisation parameters to the feature information list.
feature_info_list <- add_batch_normalisation_parameters(
cl = cl,
feature_info_list = feature_info_list,
data = data,
verbose = verbose)
# Batch-normalise feature values
data <- batch_normalise_features(
data = data,
feature_info_list = feature_info_list)
logger_message(
"Pre-processing: Feature data were batch-normalised.",
indent = message_indent,
verbose = verbose && settings$prep$batch_normalisation_method != "none")
# Remove non-robust features -------------------------------------------------
if ("robustness" %in% settings$prep$filter_method) {
n_features_current <- length(available_features)
# Filter features that are not robust
feature_info_list <- find_non_robust_features(
cl = cl,
feature_info_list = feature_info_list,
data = data,
settings = settings)
available_features <- get_available_features(feature_info_list = feature_info_list)
# Remove non-robust features from the data
data <- filter_features(
data = data,
available_features = available_features)
if (!has_feature_data(data)) {
stop(paste0(
"Remaining features in the dataset have a robustness that is lower than the threshold ",
"and were therefore all removed. Please investigate your data, or decrease the threshold ",
"through the robustness_threshold_value configuration parameter."))
}
# Message number of features removed by the robustness filter.
logger_message(
paste0(
"Pre-processing: ", n_features_current - length(available_features),
" features were removed due to low robustness. ",
length(available_features), " features remain."),
indent = message_indent,
verbose = verbose)
}
# Remove unimportant features ------------------------------------------------
if ("univariate_test" %in% settings$prep$filter_method) {
n_features_current <- length(available_features)
# Filter features that are not relevant
feature_info_list <- find_unimportant_features(
cl = cl,
feature_info_list = feature_info_list,
data = data,
settings = settings)
available_features <- get_available_features(feature_info_list = feature_info_list)
# Remove unimportant features from the data
data <- filter_features(
data = data,
available_features = available_features)
if (!has_feature_data(data)) {
stop(paste0(
"Remaining features in the dataset have a p-value that is higher than the threshold ",
"and were therefore all removed. Please investigate your data, or increase the threshold ",
"through the univariate_test_threshold configuration parameter."))
}
# Message number of features removed by the importance filter.
logger_message(
paste0(
"Pre-processing: ", n_features_current - length(available_features),
" features were removed due to low importance. ",
length(available_features), " features remain."),
indent = message_indent,
verbose = verbose)
}
# Impute missing values ------------------------------------------------------
logger_message(
"Pre-processing: Adding imputation information to features.",
indent = message_indent,
verbose = verbose)
# Add imputation skeletons.
feature_info_list <- create_imputation_parameter_skeleton(
feature_info_list = feature_info_list,
imputation_method = settings$prep$imputation_method)
# Add imputation info
feature_info_list <- add_imputation_info(
cl = cl,
feature_info_list = feature_info_list,
data = data,
verbose = verbose)
  # Impute missing feature values prior to clustering.
data <- impute_features(
data = data,
feature_info_list = feature_info_list)
# Cluster features -----------------------------------------------------------
logger_message(
"Pre-processing: Starting clustering of redundant features",
indent = message_indent,
verbose = verbose && settings$prep$cluster_method != "none")
# Add clustering skeletons.
feature_info_list <- create_cluster_parameter_skeleton(
feature_info_list = feature_info_list,
cluster_method = settings$prep$cluster_method,
cluster_linkage = settings$prep$cluster_linkage,
cluster_cut_method = settings$prep$cluster_cut_method,
cluster_similarity_threshold = settings$prep$cluster_similarity_threshold,
cluster_similarity_metric = settings$prep$cluster_similarity_metric,
cluster_representation_method = settings$prep$cluster_representation_method)
# Extract clustering information
feature_info_list <- add_cluster_info(
cl = cl,
feature_info_list = feature_info_list,
data = data,
message_indent = message_indent + 1L,
verbose = verbose)
# Build cluster table.
cluster_table <- .create_clustering_table(feature_info_list = feature_info_list)
# Determine the number of features prior to clustering.
n_features_current <- nrow(cluster_table)
  # Summarise the clusters by computing the size of each cluster.
cluster_table <- cluster_table[, list("cluster_size" = .N), by = "cluster_name"]
logger_message(
paste0(
nrow(cluster_table),
      ifelse(nrow(cluster_table) == 1, " feature cluster was", " feature clusters were"),
" created from ", n_features_current,
ifelse(n_features_current == 1, " feature. ", " features. "),
sum(cluster_table$cluster_size > 1L),
ifelse(sum(cluster_table$cluster_size > 1L) == 1, " cluster contains", " clusters contain"),
" more than one feature. The remaining ",
sum(cluster_table$cluster_size == 1L),
ifelse(sum(cluster_table$cluster_size == 1L) == 1, " cluster is", " clusters are"),
" singular."),
indent = message_indent + 1L,
verbose = verbose && settings$prep$cluster_method != "none")
# Add required features
feature_info_list <- add_required_features(feature_info_list = feature_info_list)
# Filter features that are not required from the list
feature_info_list <- trim_unused_features_from_list(feature_info_list = feature_info_list)
# Return list of featureInfo objects
return(feature_info_list)
}
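
# A minimal invocation sketch (guarded so it never runs): `data` is a
# hypothetical dataObject holding training samples and `fi_list` a matching
# list of featureInfo objects.
if (FALSE) {
  fi_list <- .determine_preprocessing_parameters(
    data = data,
    feature_info_list = fi_list,
    settings = settings,
    verbose = TRUE)
}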
combine_feature_info_list <- function(
preferred = NULL,
custom = NULL,
generic = NULL) {
# Suppress NOTES due to non-standard evaluation in data.table
name <- present <- list_name <- complete <- NULL
# Get all features.
feature_names <- unique(c(
names(preferred),
names(custom),
names(generic)))
# Identify which features appear where.
data <- mapply(
FUN = function(x, x_name, feature_names) {
# Default dataset.
data <- data.table::data.table(
"list_name" = x_name,
"name" = feature_names,
"present" = FALSE,
"complete" = FALSE)
# Mark feature names that are present in the current dataset.
data[name %in% names(x), "present" := TRUE]
      # For features that are present, check whether their data are complete.
data[
present == TRUE,
"complete" := feature_info_complete(object = x[[name]]),
by = "name"]
return(data)
},
x = list(
"preferred" = preferred,
"custom" = custom,
"generic" = generic),
x_name = c("preferred", "custom", "generic"),
MoreArgs = list("feature_names" = feature_names),
SIMPLIFY = FALSE)
# Combine list.
data <- data.table::rbindlist(data)
# Start new feature list.
new_feature_list <- NULL
  # Preference order: first, take features with complete information from the
  # preferred set.
selected_feature_names <- data[list_name == "preferred" & complete == TRUE]$name
# Add to feature list and remove from set of features.
if (length(selected_feature_names) > 0) {
new_feature_list <- c(new_feature_list, preferred[selected_feature_names])
feature_names <- setdiff(feature_names, selected_feature_names)
}
  # Next, take features with complete information from the custom set.
selected_feature_names <- data[name %in% feature_names & list_name == "custom" & complete == TRUE, ]$name
# Add to feature list and remove from set of features.
if (length(selected_feature_names) > 0) {
new_feature_list <- c(new_feature_list, custom[selected_feature_names])
feature_names <- setdiff(feature_names, selected_feature_names)
}
  # Then, take features with (possibly incomplete) information from the preferred set.
selected_feature_names <- data[name %in% feature_names & list_name == "preferred" & present == TRUE, ]$name
# Add to feature list and remove from set of features.
if (length(selected_feature_names) > 0) {
new_feature_list <- c(new_feature_list, preferred[selected_feature_names])
feature_names <- setdiff(feature_names, selected_feature_names)
}
  # Then, take features with (possibly incomplete) information from the custom set.
selected_feature_names <- data[name %in% feature_names & list_name == "custom" & present == TRUE, ]$name
# Add to feature list and remove from set of features.
if (length(selected_feature_names) > 0) {
new_feature_list <- c(new_feature_list, custom[selected_feature_names])
feature_names <- setdiff(feature_names, selected_feature_names)
}
  # Finally, take features with (possibly incomplete) information from the generic set.
selected_feature_names <- data[name %in% feature_names & list_name == "generic" & present == TRUE, ]$name
# Add to feature list and remove from set of features.
if (length(selected_feature_names) > 0) {
new_feature_list <- c(new_feature_list, generic[selected_feature_names])
feature_names <- setdiff(feature_names, selected_feature_names)
}
return(new_feature_list)
}
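
# A sketch of the precedence rules (guarded so it never runs): entries are
# selected in the order preferred-complete, custom-complete, preferred-present,
# custom-present, generic-present. The featureInfo objects below are
# hypothetical placeholders.
if (FALSE) {
  combined <- combine_feature_info_list(
    preferred = list("x" = x_preferred),
    custom = list("x" = x_custom, "y" = y_custom),
    generic = list("y" = y_generic, "z" = z_generic))
  # "x" is taken from preferred if complete there (falling back to custom),
  # "y" from custom, and "z" can only come from generic.
}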