Nothing
#*******************************************************************************
# Filename : miic.utils.R
#
# Description: various utilities functions and constants for miic
#*******************************************************************************
#===============================================================================
# CONSTANTS
#===============================================================================
MIIC_VALID_MODES = c ("S", "TS")
MIIC_TEMPORAL_MODES = c ("TS")
MIIC_VALID_LATENT <- c ("orientation", "yes", "no")
MIIC_VALID_CONSISTENT <- c ("no", "orientation", "skeleton")
MIIC_CONTINUOUS_TRESHOLD <- 5
STATE_ORDER_STANDARD_VALID_COLUMS <- c ("var_names", "var_type",
"levels_increasing_order", "is_contextual", "is_consequence",
"group", "group_color")
STATE_ORDER_TEMPORAL_VALID_COLUMNS = c (STATE_ORDER_STANDARD_VALID_COLUMS,
"n_layers", "delta_t", "mov_avg")
#===============================================================================
# FUNCTIONS
#===============================================================================
# list_to_str
#-------------------------------------------------------------------------------
# Utility function to transform the first n_max items of a list into a
# displayable (comma + space separated) string: "item1, item2, item3, ..."
# - list: a list
# - n_max: int, optional, NULL by default, maximum of items used. If NULL,
# all items are used. If the list has more than n_max items, ", ..." is added
# return: string, the displayable string
#-------------------------------------------------------------------------------
list_to_str <- function (list, n_max=NULL) {
if (is.null (list) )
return ("NULL")
if (length (list) == 0)
return ("")
if ( (! is.null (n_max)) && (length (list) > n_max) )
list <- c (list[1:n_max], "...")
ret <- paste (unlist(list), collapse=", ")
return (ret)
}
#-------------------------------------------------------------------------------
# miic_error
#-------------------------------------------------------------------------------
# Utility function to raise an error and stop
#-------------------------------------------------------------------------------
miic_error <- function (context, ...)
{
stop (paste0 ("Error in ", context, ": ", ...), call.=FALSE)
}
#-------------------------------------------------------------------------------
# miic_warning
#-------------------------------------------------------------------------------
# Utility function to raise a warning
#-------------------------------------------------------------------------------
miic_warning <- function (context, ...) {
warning (paste0 ("Warning in ", context, ": ", ...), call.=FALSE)
}
#-------------------------------------------------------------------------------
# miic_msg
#-------------------------------------------------------------------------------
# Utility function to display a message
#-------------------------------------------------------------------------------
miic_msg <- function (...)
{
cat (paste0 (..., "\n") )
}
#-------------------------------------------------------------------------------
# Check input data
#-------------------------------------------------------------------------------
# params:
# - input_data: a dataframe with variables as columns and rows as samples
# - mode : MIIC mode
# return:
# - input_data: the input data dataframe, eventually without full of NAs rows
#-------------------------------------------------------------------------------
check_input_data <- function (input_data, mode)
{
if ( is.null(input_data) )
miic_error ("input data", "The input data is required.")
if ( ! is.data.frame (input_data) )
miic_error ("input data", "The input data must be a dataframe.")
if (nrow (input_data) == 0)
miic_error ("input data", "The input data is empty.")
if ( (ncol (input_data) == 0)
|| ( (mode %in% MIIC_TEMPORAL_MODES) && (ncol (input_data) == 1) ) )
miic_error ("input data", "The input data has no variable.")
#
# Check variables full of NAs
#
cols_only_na <- colSums (is.na (input_data)) == nrow (input_data)
input_data <- input_data [, !cols_only_na]
if ( (ncol (input_data) == 0)
|| ( (mode %in% MIIC_TEMPORAL_MODES) && (ncol (input_data) == 1) ) )
miic_error ("input data", "The input data contains only NAs")
if ( any (cols_only_na) )
miic_warning ("input data", "the input data contains ", sum(cols_only_na),
" variables(s) with only NAs. These variables(s) will be removed.")
#
# Check about rows with only NAs (only if mode is nottemporal, as NAs check
# in temporal mode will be done after data lagging)
#
if ( ! (mode %in% MIIC_TEMPORAL_MODES) )
{
rows_only_na <- rowSums (is.na (input_data)) == ncol (input_data)
input_data <- input_data [!rows_only_na, ]
if ( any (rows_only_na) )
miic_warning ("input data", "the input data contains ", sum(rows_only_na),
" row(s) with only NAs. These row(s) will be removed.")
}
#
# Check constant variables
#
if (mode %in% MIIC_TEMPORAL_MODES)
{
col_names = colnames (input_data[,2:ncol(input_data)])
n_unique_vals <- unlist (lapply (input_data[,2:ncol(input_data)],
function (x) { length (unique (x[!is.na(x)] ) ) } ) )
}
else
{
col_names = colnames (input_data)
n_unique_vals <- unlist (lapply (input_data,
function (x) { length (unique (x[!is.na(x)] ) ) } ) )
}
vars_constant = (n_unique_vals <= 1)
if ( any (vars_constant) )
{
msg_str <- list_to_str (col_names[vars_constant], n_max=10)
if (sum (vars_constant) == 1)
miic_warning ("input data", "the variable ", msg_str, " is constant.",
" Such variable can not be connected and should be removed.")
else
miic_warning ("input data", sum(vars_constant), " variables (", msg_str,
") are constant. Such variables can not be connected",
" and should be removed.")
}
return (input_data)
}
#-------------------------------------------------------------------------------
# check_state_order
#-------------------------------------------------------------------------------
# Check the state_order: verify that the var_names column is supplied, that all
# variables are present in the var_names. For each extra columns, check the
# values supplied and correct them if necessary. Wrong or not supplied values
# are assigned to default values.
# Stop if all variables are contextual or consequences, other issues will
# raise a warning and be "fixed" by using default values.
# Parameters:
# - input_data: a dataframe with variables as columns and rows as samples
# - state_order: a dataframe, can be NULL.
# possible/expected columns are:
# * var_names: the list of all variables in the input data
# * var_type: 0=discrete, 1=continuous (default deduced from the input data,
# note that numerical variables with less than x unique values will be
# discrete)
# * levels_increasing_order: NA for continuous variables. For discrete,
# can be NA or the full ordered list of the unique values. (default NA)
# * is_contextual: 0=not contextual, 1=contextual (default 0)
# * is_consequence: 0=not consequence, 1=consequence (default 0)
# additional possible columns in temporal mode are:
# * n_layers: the number of layers in the time unfolded graph
# * delta_t: the number of time steps between layers
# * mov_avg: if a moving average must applied on some variables
# NB: is_consequence is not allowed in temporal mode
# - mode: the MIIC mode
# Return: the checked and eventually generated or completed state order dataframe
#-------------------------------------------------------------------------------
check_state_order <- function (input_data, state_order, mode)
{
if (mode %in% MIIC_TEMPORAL_MODES)
input_data <- input_data[,2:ncol(input_data)]
data_var_names <- colnames (input_data)
n_vars <- length (data_var_names)
#
# Basic checks
#
if ( is.null (state_order) )
state_order <- data.frame ("var_names"=data_var_names, stringsAsFactors=F)
if ( ! is.data.frame(state_order) )
{
miic_warning ("state order",
"the supplied state_order is not a dataframe and will be ignored.")
state_order <- data.frame ("var_names"=data_var_names, stringsAsFactors=F)
}
#
# Factors lead to wrong test results
#
factor_cols <- which (unlist (lapply (state_order, is.factor) ) )
for (i in factor_cols)
state_order[,i] <- as.character (state_order[,i])
#
# Check content
#
if ( ! ("var_names" %in% colnames (state_order)) )
{
miic_warning ("state order", "the column var_names is missing,",
" the supplied state_order will be ignored.")
state_order <- data.frame ("var_names"=data_var_names, stringsAsFactors=F)
}
#
# Check if the state_order columns are valid
#
if (mode %in% MIIC_TEMPORAL_MODES)
valid_cols <- STATE_ORDER_TEMPORAL_VALID_COLUMNS
else
valid_cols <- STATE_ORDER_STANDARD_VALID_COLUMS
mismatch <- is.na (match (colnames (state_order), valid_cols))
if ( any (mismatch) )
{
msg_str <- list_to_str (colnames (state_order)[mismatch], n_max=10)
if (sum (mismatch) == 1)
miic_warning ("state order", "the column ", msg_str,
" is not valid and will be ignored.")
else
miic_warning ("state order", sum (mismatch), " columns (", msg_str,
") are not valid and will be ignored.")
state_order <- state_order[, !mismatch, drop=FALSE]
}
#
# We ensure that the var_names column is the first
#
idx_var_names <- which (colnames(state_order) == "var_names")
idx_others <- 1:ncol (state_order)
idx_others <- idx_others[idx_others != idx_var_names]
state_order <- state_order[, c(idx_var_names, idx_others), drop=FALSE]
#
# Check variables in state_order not in data
#
mismatch <- is.na (match (state_order$var_names, data_var_names))
if ( any (mismatch) )
{
msg_str <- list_to_str (state_order$var_names[mismatch], n_max=10)
if (sum (mismatch) == 1)
miic_warning ("state order", "the variable ", msg_str,
" does not match any name in input data and will be ignored.")
else
miic_warning ("state order", sum (mismatch), " variables (", msg_str,
") do not match any name in input data and will be ignored.")
state_order <- state_order[!mismatch, ]
}
#
# Before checking variables in data not in the state_order
# if var_type, is_contextual or is_consequence are present, we flag NA
# in these columns as "NA" to be able to display correct warnings later.
# The same applies for the specific columns of the temporal modes.
# ( !! this change the column type to character, even if no NA is detected !! )
#
if ("var_type" %in% colnames (state_order) )
state_order$var_type[ is.na (state_order$var_type) ] <- "NA"
if ("is_contextual" %in% colnames (state_order) )
state_order$is_contextual[ is.na (state_order$is_contextual) ] <- "NA"
if ("is_consequence" %in% colnames (state_order) )
state_order$is_consequence[ is.na (state_order$is_consequence) ] <- "NA"
if (mode %in% MIIC_TEMPORAL_MODES)
{
if ("n_layers" %in% colnames (state_order) )
state_order$n_layers[ is.na (state_order$n_layers) ] <- "NA"
if ("delta_t" %in% colnames (state_order) )
state_order$delta_t[ is.na (state_order$delta_t) ] <- "NA"
if ("mov_avg" %in% colnames (state_order) )
state_order$mov_avg[ is.na (state_order$mov_avg) ] <- "NA"
}
#
# Check variable in data not in the state_order
#
not_found <- is.na (match (data_var_names, state_order$var_names))
if ( any (not_found) )
{
msg_str <- list_to_str (data_var_names[not_found], n_max=10)
if ( sum (not_found) == 1)
miic_warning ("state order", "the variables ", msg_str,
" in input data can not be found in the state order. Default values",
" will be used for this variable.")
else
miic_warning ("state order", sum (not_found), " variables (", msg_str,
") in input data can not be found in the state order. Default values",
" will be used for these variables.")
#
# Add missing variable names with NA in the other columns
#
na_vals <- rep (NA, ncol(state_order) - 1)
for (i in which (not_found))
state_order[nrow(state_order)+1,] <- c(data_var_names[i], na_vals)
}
#
# The state_order rows are ordered as the variables in the data
#
state_order <- state_order [order (match(state_order$var_names, data_var_names)),,
drop=FALSE]
rownames (state_order) <- NULL
#
# var_type (0=discrete / 1=continuous)
#
data_is_num <- unlist (lapply (input_data, is.numeric) )
n_unique_vals <- unlist (lapply (input_data, function (x) {
length (unique (x[!is.na(x)] ) ) } ) )
var_type_specified <- rep (F, n_vars)
if ( ! ("var_type" %in% colnames (state_order) ) )
{
state_order$var_type <- as.integer (data_is_num)
#
# Continuous Variables with less than MIIC_CONTINUOUS_TRESHOLD are
# considered as discrete
#
state_order$var_type [ n_unique_vals < MIIC_CONTINUOUS_TRESHOLD ] = 0
}
else
{
var_type_specified <- rep (T, n_vars)
#
# Exclude NAs from the warning (NA = row added because var name missing)
#
non_valid <- ( ( ! (is.na (state_order$var_type) ) )
& ( ! (state_order$var_type %in% c(0,1)) ) )
if ( any (non_valid) )
{
msg_str <- list_to_str (state_order$var_names[non_valid], n_max=10)
if ( sum (non_valid) == 1)
miic_warning ("state order", "the variable ", msg_str,
" does not have a valid value in the var_type column,",
" the invalid value be ignored and type determined from data.")
else
miic_warning ("state order", sum(non_valid), " variables (", msg_str,
") do not have a valid value in the var_type column,",
" the invalid values will be ignored and types determined from data.")
}
#
# All non 0 or 1 need to be fixed
#
non_valid <- ! (state_order$var_type %in% c(0,1))
if ( any (non_valid) )
{
state_order$var_type[non_valid] <- as.integer(data_is_num)[non_valid]
var_type_specified[non_valid] <- F
}
#
# Ensure the type of var_type is numerical
# (because when looking for NAs present before, the column type has been
# shifted to character. Now, we are sure that we have only O and 1 => as.int
#
state_order$var_type = as.integer(state_order$var_type)
#
# Check var_type against data
#
pb_continuous <- (state_order$var_type == 1) & (!data_is_num)
if ( any (pb_continuous) )
{
msg_str <- list_to_str (state_order$var_names[pb_continuous], n_max=10)
if ( sum (pb_continuous) == 1)
miic_warning ("state order", "the variable ", msg_str,
" is declared continuous in the var_type column but is not numeric.",
" This variable will be considered as discrete.")
else
miic_warning ("state order", sum (pb_continuous), " variables (", msg_str,
") are declared continuous in the var_type column but these variables",
" are not numeric. These variables will be considered as discrete.")
state_order$var_type[pb_continuous] <- 0
}
#
# In temporal mode, we store if var_type was specified by the user for a
# future use
#
if (mode %in% MIIC_TEMPORAL_MODES)
state_order$var_type_specified <- var_type_specified
}
#
# Check the number of unique values versus var_type
#
for (i in 1:n_vars)
{
if (state_order[i, "var_type"] == 1) # Continuous
{
# Less than 3 unique values does not make sense for a continuous variable
#
if (n_unique_vals[[i]] <= 2)
{
if (var_type_specified[[i]])
miic_warning ("state order", "variable ", data_var_names[[i]],
" specified as continuous has only ", n_unique_vals[[i]],
" non-NA unique values. It will be processed as discrete.")
state_order$var_type[[i]] <- 0
}
#
# Less than MIIC_CONTINUOUS_TRESHOLD unique variables can be discretized
# but may not be truly continuous
#
else if (n_unique_vals[[i]] < MIIC_CONTINUOUS_TRESHOLD)
miic_warning ("state order", "numerical variable ", data_var_names[[i]],
" is treated as continuous but has only ", n_unique_vals[[i]],
" non-NA unique values.")
}
else # discrete var
{
if ( data_is_num[[i]] && (n_unique_vals[[i]] >= MIIC_CONTINUOUS_TRESHOLD * 2) )
miic_warning ("state order", "numerical variable ", data_var_names[[i]],
" is treated as discrete but has ", n_unique_vals[[i]], " levels.")
}
}
#
# is_contextual
#
if ( ! ("is_contextual" %in% colnames (state_order) ) )
state_order$is_contextual <- rep (0, n_vars)
else
{
# Exclude NAs from the warning (NA = row added because var name missing)
#
non_valid <- ( ( ! (is.na (state_order$is_contextual) ) )
& ( ! (state_order$is_contextual %in% c(0,1)) ) )
if (any (non_valid))
{
msg_str <- list_to_str (state_order$var_names[non_valid], n_max=10)
if (sum (non_valid) == 1)
miic_warning ("state order", "the variable ", msg_str,
" does not have a valid value in the is_contextual column,",
" this variable will be considered as not contextual.")
else
miic_warning ("state order", sum (non_valid), " variables (", msg_str,
") do not have a valid value in the is_contextual column,",
" these variables will be considered as not contextual.")
}
#
# All non 0 or 1 are not valid => set to not contextual
#
non_valid <- ! (state_order$is_contextual %in% c(0,1))
if (any (non_valid))
state_order$is_contextual[non_valid] <- 0
#
# Ensure the type of is_contextual is numerical
# (because when looking for NAs present before, the column type has been
# shifted to character. Now, we are sure that we have only O and 1 => as.int
#
state_order$is_contextual = as.integer(state_order$is_contextual)
#
# Stop if all variables are contextual
#
if (all (state_order$is_contextual == 1))
miic_error ("state order", "All variables have been defined as",
" contextual. No network can be infered with these settings.")
}
#
# is_consequence
#
if ( ! ("is_consequence" %in% colnames (state_order) ) )
state_order$is_consequence <- rep (0, n_vars)
else
{
if (mode %in% MIIC_TEMPORAL_MODES)
{
# Exclude NAs from warnings (NA = row added because var name missing)
# => Look of anything not NA and != 0
#
conseq_def <- ( ( ! (is.na (state_order$is_consequence) ) )
& ( is.null (state_order$is_consequence)
| (state_order$is_consequence != 0) ) )
if (any (conseq_def))
{
msg_str <- list_to_str (state_order$var_names[conseq_def], n_max=10)
if (sum (conseq_def) == 1)
miic_warning ("state order", "the variable ", msg_str,
" is defined as consequence but consequence prior is not compatible",
" with temporal mode. This variable will be considered as not",
" consequence")
else
miic_warning ("state order", sum (conseq_def), " variables (", msg_str,
") are defined as consequence but consequence prior is not compatible",
" with temporal mode. These variables will be considered as not",
" consequence")
state_order$is_consequence <- rep (0, n_vars)
}
}
else # Not temporal mode
{
# Exclude NAs from warnings (NA = row added because var name missing)
#
non_valid <- ( ( ! (is.na (state_order$is_consequence) ) )
& ( ! (state_order$is_consequence %in% c(0,1)) ) )
if (any (non_valid))
{
msg_str <- list_to_str (state_order$var_names[non_valid], n_max=10)
if (sum (non_valid) == 1)
miic_warning ("state order", "the variable ", msg_str,
" does not have a valid value in the is_consequence column,",
" this variable will be considered as not consequence")
else
miic_warning ("state order", sum (non_valid), " variables (", msg_str,
") do not have a valid value in the is_consequence column,",
" these variables will be considered as not consequence")
}
#
# All non 0 or 1 are not valid => set to not consequence
#
non_valid <- ! (state_order$is_consequence %in% c(0,1))
if (any (non_valid))
state_order$is_consequence[non_valid] <- 0
#
# Ensure the type of is_consequence is numerical
# (because when looking for NAs present before, the column type has been
# shifted to character. Now, we are sure that we have only O and 1 => as.int
#
state_order$is_consequence = as.integer(state_order$is_consequence)
#
# Stop if all variables are consequences
#
if (all (state_order$is_consequence == 1))
miic_error ("state order", "All variables have been defined as",
" consequences. No network can be infered with these settings.")
}
}
#
# levels_increasing_order
#
if ( ! ("levels_increasing_order" %in% colnames (state_order) ) )
state_order$levels_increasing_order <- NA
else
{
for (i in 1:n_vars)
{
order_str <- state_order[i, "levels_increasing_order"]
if ( is.na (order_str) )
next
if (order_str == "")
{
state_order[i, "levels_increasing_order"] <- NA
next
}
if (state_order[i, "var_type"] == 1)
{
miic_warning ("state order", "variable ", state_order[i, "var_names"],
" is considered as a continuous variable,",
" the provided levels order will be ignored.")
state_order[i, "levels_increasing_order"] <- NA
next
}
#
# Discrete var, check the match of unique values in data and values
# in levels_increasing_order
#
orders <- trimws (unlist (strsplit (as.character (order_str), ",") ) )
values <- unique (input_data[!is.na(input_data[,i]),i])
#
# Convert values in state order using the same type as data.
# It will avoid issues when comparing TRUE/FALSE with T/F or 1.0 with 1
# If the values comming from the state_order can not be converted,
# leave the value unchanged to display a meaningful warning laterly
#
if (is.logical (values))
{
suppressWarnings ( { orders_log <- as.logical(orders) } )
orders[!is.na (orders_log)] <- orders_log[!is.na (orders_log)]
}
else if (is.integer (values))
{
suppressWarnings ( { orders_int <- as.integer(orders) } )
orders[!is.na (orders_int)] <- orders_int[!is.na (orders_int)]
}
else if (is.numeric (values))
{
suppressWarnings ( { orders_num <- as.numeric(orders) } )
orders[!is.na (orders_num)] <- orders_num[!is.na (orders_num)]
}
orders <- as.character(orders)
values <- as.character (values)
#
# If the column in input_data does not contain a "NA" string
# and if the levels_increasing_order contains "NA" string,
# issue a specific warning about NA
# NB : we test here only "NA" string as only "NA" is converted as NA in R
# by default when using read.table or read.csv, other #NA, N/A, ...
# will however be discarded later with a less specific warning
#
if ( (! ("NA" %in% values)) && ("NA" %in% orders) )
{
miic_warning ("state order", "variable ", state_order[i, "var_names"],
" has a NA value in the provided levels order. NA can not be used to",
" order levels and should not be included in the provided levels order.")
orders <- orders[ orders != "NA"]
if ( length (orders) == 0 )
{
state_order[i, "levels_increasing_order"] <- NA
next
}
}
#
# Check if some provided levels are not in the data
#
not_in_data <- is.na (match (orders, values) )
if ( any (not_in_data) )
{
msg_str <- list_to_str (orders[not_in_data], n_max=10)
if (sum (not_in_data) == 1)
miic_warning ("state order", "variable ", state_order[i, "var_names"],
" has value ", msg_str, " in the provided levels order not present",
" in the data. This value will be ignored.")
else
miic_warning ("state order", "variable ", state_order[i, "var_names"],
" has values ", msg_str, " in the provided levels order not present",
" in the data. These values will be ignored.")
orders <- orders[!not_in_data]
if ( length (orders) == 0 )
{
state_order[i, "levels_increasing_order"] <- NA
next
}
}
#
# Check if missing levels compared to data
#
absent <- is.na (match (values, orders) )
if ( any (absent) )
{
msg_str <- list_to_str (values[absent], n_max=10)
if (sum (absent) == 1)
miic_warning ("state order", "variable ", state_order[i, "var_names"],
" has value ", msg_str, " in the data that can not be found",
" in the provided levels order.",
" The provided levels order for this variable will be ignored.")
else
miic_warning ("state order", "variable ", state_order[i, "var_names"],
" has values ", msg_str, " in the data that can not be found",
" in the provided levels order.",
" The provided levels order for this variable will be ignored.")
state_order[i, "levels_increasing_order"] <- NA
next
}
#
# If the levels_increasing_order was not turned into NA,
# update the levels_increasing_order to have a clean string without
# leading or trailing blanks and same type format between data and state
# order (i.e. if data column is logical (TRUE/FALSE), values (T/F) in the
# state_order will be converted as TRUE/FALSE )
#
state_order[i, "levels_increasing_order"] <- paste0 (orders, collapse=",")
}
}
#
# Cross checks : check that no var is both contextual and consequence
#
ctx_and_csq = state_order$is_contextual + state_order$is_consequence
ctx_and_csq = (ctx_and_csq >= 2)
if (any (ctx_and_csq))
{
msg_str <- list_to_str (state_order$var_names[ctx_and_csq], n_max=10)
if (sum (ctx_and_csq) == 1)
miic_warning ("state order", "the variable ", msg_str,
" can not be defined as both contextual and consequence. This variable",
" will be considered as neither contextual nor consequence.")
else
miic_warning ("state order", sum (ctx_and_csq), " variables (", msg_str,
") can not be defined as both contextual and consequence. These",
" variables will be considered as neither contextual nor consequence.")
state_order$is_contextual[ctx_and_csq] = 0
state_order$is_consequence[ctx_and_csq] = 0
}
return (state_order)
}
#-------------------------------------------------------------------------------
# check_other_df
#-------------------------------------------------------------------------------
# input_data: a data frame with variables as columns and rows as samples
# - df: the data fame to check, expected to be a 2 columns data frame in
# standard mode and 3 columns data frame in temporal mode.
# All values in 2 first columns of the data frame are expected to be variables
# names, and in temporal mode, the 3rd column is expected to contain lags.
# An invalid data frame will be ignored, Invalid rows will be discarded
# - state_order: the data frame returned by check_state_order
# - df_name: the data fame name (i.e. :"black box", "true edges")
# - mode: the MIIC mode
# return: the data frame checked
#-------------------------------------------------------------------------------
check_other_df <- function (input_data, state_order, df, df_name, mode)
{
if ( is.null(df) )
return (NULL)
#
# Basic checks
#
if ( ! is.data.frame(df) )
{
miic_warning (df_name, "The ", df_name, " parameter, if provided,",
" must be a dataframe. The ", df_name, " will be ignored.")
return (NULL)
}
#
# Factors lead to wrong test results
#
factor_cols <- which (unlist (lapply (df, is.factor) ) )
for (i in factor_cols)
df[,i] <- as.character (df[,i])
#
# Check number of cols
#
if (mode %in% MIIC_TEMPORAL_MODES)
{
input_data = input_data[, 2:ncol(input_data), drop=F]
n_cols <- 3
}
else
n_cols <- 2
if (ncol(df) != n_cols)
{
miic_warning (df_name, "The expected dataframe must have ", n_cols,
" columns but the provided one has ", ncol(df), " and will be ignored.")
return (NULL)
}
if (nrow(df) == 0)
{
miic_warning (df_name, "The provided dataframe is empty.")
return (NULL)
}
data_var_names <- colnames (input_data)
rows_with_warning <- c()
for ( row_idx in 1:nrow(df) )
{
for (col_idx in 1:2)
{
one_var_name <- df[row_idx, col_idx]
if (! (one_var_name %in% data_var_names) )
{
miic_warning (df_name, "The variable ", one_var_name,
" is not present in the input data. The row ", row_idx, " will be ignored.")
rows_with_warning[[length(rows_with_warning)+1]] <- row_idx
}
}
if ( (length(rows_with_warning) > 0)
&& (rows_with_warning[[length(rows_with_warning)]] == row_idx) )
next
if ( ( ! (mode %in% MIIC_TEMPORAL_MODES) )
&& (df[row_idx, 1] == df[row_idx, 2]) )
{
miic_warning (df_name, "the variables must be different for each row (found ",
df[row_idx, 1], " two times at row ", row_idx, "). This row will be ignored.")
rows_with_warning[[length(rows_with_warning)+1]] <- row_idx
}
}
rows_ok <- unlist (lapply (1:nrow(df), FUN=function (x) { ! (x %in% rows_with_warning) } ) )
df <- df [rows_ok, , drop=F]
if (nrow(df) == 0)
{
miic_warning (df_name, "The provided dataframe is empty.")
return (NULL)
}
#
# In temporal mode, check that the 3rd columns is integer >= 0 (lags)
#
if (mode %in% MIIC_TEMPORAL_MODES)
{
wrong_lags = unlist (lapply (df[,3], FUN=function(x) {
if ( is.null (x) ) # NULL: KO
return (TRUE)
if ( is.na (x) ) # NA: OK for now
return (FALSE)
else if ( is.na ( suppressWarnings (as.numeric(x)) ) ) # Not num: KO
return (TRUE)
else if ( round(as.numeric(x),0) != as.numeric(x) ) # Not int: KO
return (TRUE)
else if ( (as.numeric(x) < 0) ) # <0: KO
return (TRUE)
else
return (FALSE) # OK
} ) )
if ( any (wrong_lags) )
{
msg_str <- list_to_str (which(wrong_lags), n_max=10)
if (sum (wrong_lags) == 1)
miic_warning (df_name, "lag is incorrect at row ", msg_str,
", this line will be ignored.")
else
miic_warning (df_name, "lag is incorrect for multiple rows (", msg_str,
"), these rows will be ignored.")
df <- df [!wrong_lags, , drop=F]
}
if (nrow(df) == 0)
{
miic_warning (df_name, "The provided dataframe is empty.")
return (NULL)
}
#
# Check that contextual lag are NA
#
contextuals = unlist ( apply ( df, MARGIN=1, FUN=function (x) {
orig_idx = which (state_order$var_names == x[[1]])
dest_idx = which (state_order$var_names == x[[2]])
return ( (state_order[orig_idx, "is_contextual"] == 1)
|| (state_order[dest_idx, "is_contextual"] == 1) ) } ) )
wrongs_ctx = ( contextuals & ( ! is.na (df[,3]) ) )
if ( any (wrongs_ctx) )
{
if (sum (wrongs_ctx) == 1)
miic_warning (df_name, "lags for contextual variables must be NA.",
" The line ", df[wrongs_ctx, 1], " - ", df[wrongs_ctx, 2], " lag ",
df[wrongs_ctx, 3], " will be ignored.")
else
miic_warning (df_name, "lags for contextual variables must be NAs. ",
sum (wrongs_ctx), " wrong line will be ignored.")
}
#
# Check that lag >= 0 if not contextual
#
wrongs_lagged = ( (!contextuals) & is.na (df[,3]) )
if ( any (wrongs_lagged) )
{
if (sum (wrongs_lagged) == 1)
miic_warning (df_name, "lag for non contextual variables must be >= 0.",
" The line ", df[wrongs_lagged, 1], " - ", df[wrongs_lagged, 2], " lag ",
df[wrongs_lagged, 3], " will be ignored.")
else
miic_warning (df_name, "lags for non contextual variables must be >= 0. ",
sum (wrongs_lagged), " wrong lines will be ignored.")
}
#
# The self loops need a lag > 0
#
wrongs_selfs = ( (!contextuals) & (df[,1] == df[,2]) & (df[,3] == 0) )
if ( any (wrongs_selfs) )
{
if (sum (wrongs_selfs) == 1)
miic_warning (df_name, "lag for self loops must be > 0.",
" The line ", df[wrongs_selfs, 1], " - ", df[wrongs_selfs, 2], " lag ",
df[wrongs_selfs, 3], " will be ignored.")
else
miic_warning (df_name, "lags for self loops must be > 0. ",
sum (wrongs_selfs), " wrong lines will be ignored.")
}
df <- df [ (!wrongs_ctx) & (!wrongs_lagged) & (!wrongs_selfs), , drop=F]
if (nrow(df) == 0)
{
miic_warning (df_name, "The provided dataframe is empty.")
return (NULL)
}
}
#
# Remove duplicate row
#
n_rows_sav = nrow(df)
#
# Equal rows
#
df = unique (df)
rownames(df) = NULL
#
# We remove equal rows but with variable names swapped
# as edges in black box are not oriented and, for true edges,
# the post-processing will not be able to process opposite edges
#
rows_kept = rep (T, nrow(df))
for (i in 1:nrow(df))
{
if ( ! rows_kept[[i]] )
next
if (mode %in% MIIC_TEMPORAL_MODES)
{
# In temporal mode, lag != 0 with variable swapped are not duplicate
#
if ( (!is.na(df[i,3])) && (df[i,3] != 0) )
next
dup_inverse = ( (df[,1] == df[i,2])
& (df[,2] == df[i,1])
& (rownames(df) != i)
& (is.na(df[,3]) | (df[,3] == 0)) )
}
else
dup_inverse = ( (df[,1] == df[i,2])
& (df[,2] == df[i,1])
& (rownames(df) != i) )
rows_kept = rows_kept & (!dup_inverse)
}
df = df[rows_kept, , drop=F]
if ( n_rows_sav != nrow(df) )
{
if (df_name == "true edges")
{
miic_warning (df_name, "the implementation of truth edges",
" is not designed to handle opposite edges.",
" Only one direction will be considered for the opposite edge(s).")
}
else
{
if (n_rows_sav - nrow(df) == 1)
miic_warning (df_name, "1 row is duplicated. Only one instance",
" of the row will be used.")
else
miic_warning (df_name, n_rows_sav - nrow(df), " rows are duplicated.",
" Only one instance of these rows will be used.")
}
}
if (nrow(df) == 0)
{
miic_warning (df_name, "The provided dataframe is empty.")
return (NULL)
}
return (df)
}
#-------------------------------------------------------------------------------
# check_param_string
#-------------------------------------------------------------------------------
# Params :
# - value: the parameter to check
# - name: the name of the parameter
# - list: the possible values
# Returns: the checked parameter, eventually reset to its default value
#-------------------------------------------------------------------------------
check_param_string <- function (value, name, possibles)
{
if ( is.null(value)
|| (length (value) != 1)
|| is.na(value)
|| (!is.character(value))
|| (!(value %in% possibles)) )
{
msg_str = paste0 (paste0 ("'", possibles, "'"), collapse=", ")
if ( is.null (value) )
val_str = "NULL"
else
val_str = list_to_str (value)
miic_warning ("parameters", "supplied value '", val_str,
"' for the ", name, " parameter is invalid. Possible values are: ",
msg_str, ". The default value ('", possibles[[1]], "') will be used.")
value = possibles[[1]]
}
return (value)
}
#-------------------------------------------------------------------------------
# check_param_logical
#-------------------------------------------------------------------------------
# Params :
# - value: the parameter to check
# - name: the name of the parameter
# - default: the default value
# Returns: the checked parameter, eventually reset to its default value
#-------------------------------------------------------------------------------
check_param_logical <- function (value, name, default)
{
if ( is.null (value)
|| (length (value) != 1)
|| is.na (value)
|| (!is.logical(value)) )
{
if ( is.null (value) )
val_str = "NULL"
else
val_str = list_to_str (value)
miic_warning ("parameters", "supplied value ", val_str,
" for the ", name, " parameter is invalid. It must be TRUE/FALSE.",
" The default value (", default, ") will be used.")
value = default
}
return (value)
}
#-------------------------------------------------------------------------------
# test_param_wrong_int
#-------------------------------------------------------------------------------
# Params :
# - value: the parameter to check
# - min: the min value, NA if none
# - max: the max values, NA if none
# Returns: TRUE if the value is not an int or not in the range, FALSE otherwise
#-------------------------------------------------------------------------------
test_param_wrong_int <- function (value, min=NA, max=NA)
{
return ( is.null (value)
|| (length (value) != 1)
|| is.na (value)
|| (!is.numeric(value))
|| (round(value,0) != value)
|| ((!is.na (min)) && (value < min))
|| ((!is.na (max)) && (value > max)) )
}
#-------------------------------------------------------------------------------
# check_param_int
#-------------------------------------------------------------------------------
# Params :
# - value: the parameter to check
# - name: the name of the parameter
# - min_max: a tuple with min and max values. NA if no min and/or no max
# - default: the default value
# Returns: the checked parameter, eventually reset to its default value
#-------------------------------------------------------------------------------
check_param_int <- function (value, name, default, min=NA, max=NA)
{
if ( test_param_wrong_int (value, min, max) )
{
msg_str = " It must be an integer."
if ( (!is.na(min)) && (!is.na(max)) )
msg_str = paste0 (" It must be an integer in the range [",
min, ", ", max, "].")
else if ( ! is.na (min) )
msg_str = paste0 (" It must be an integer >= ", min, ".")
else if ( ! is.na (max) )
msg_str = paste0 (" It must be an integer <= ", max, ".")
if ( is.null (value) )
val_str = "NULL"
else
val_str = list_to_str (value)
miic_warning ("parameters", "supplied value ", val_str,
" for the ", name, " parameter is invalid." , msg_str,
" The default value (", default, ") will be used.")
value = default
}
return (value)
}
#-------------------------------------------------------------------------------
# test_param_wrong_float
#-------------------------------------------------------------------------------
# Params :
# - value: the parameter to check
# - min: the min value, NA if none
# - max: the max values, NA if none
# Returns: TRUE if the value is not a float or not in the range, FALSE otherwise
#-------------------------------------------------------------------------------
test_param_wrong_float <- function (value, min=NA, max=NA)
{
return ( is.null (value)
|| (length (value) != 1)
|| is.na (value)
|| (!is.numeric(value))
|| ((!is.na (min)) && (value < min))
|| ((!is.na (max)) && (value > max)) )
}
#-------------------------------------------------------------------------------
# check_parameters
#-------------------------------------------------------------------------------
# Check all input parameters that are not df and not temporal
# Params :
# - input_data: a dataframe with input data
# - all possible parameters of miic method
# Returns: a list with all the parameters, eventually modified or initialized
#-------------------------------------------------------------------------------
check_parameters <- function (input_data, n_threads, cplx,
orientation, ort_proba_ratio, ort_consensus_ratio, propagation, latent,
n_eff, n_shuffles, conf_threshold, sample_weights, test_mar,
consistent, max_iteration, consensus_threshold,
mode, negative_info, verbose) {
list_ret = list ("mode" = mode)
list_ret$n_threads = check_param_int (n_threads, "n_threads", 1, min=1, max=NA)
list_ret$cplx = check_param_string (cplx, "complexity", c("nml", "bic"))
list_ret$orientation = check_param_logical (orientation, "orientation", TRUE)
if ( test_param_wrong_float (ort_proba_ratio, min=0, max=1) )
{
miic_warning ("parameters", "supplied value ", ort_proba_ratio,
" for the orientation probability ratio parameter is invalid.",
" It must be a floating point between 0 and 1.",
" The default value (1) will be used.")
ort_proba_ratio = 1
}
list_ret$ort_proba_ratio = ort_proba_ratio
if ( is.null (ort_consensus_ratio) )
ort_consensus_ratio = list_ret$ort_proba_ratio
else if ( test_param_wrong_float (ort_consensus_ratio, min=0, max=1) )
{
miic_warning ("parameters", "supplied value ", ort_consensus_ratio,
" for the orientation concensus ratio parameter is invalid.",
" It must be a floating point between 0 and 1.",
" The default value (same as orientation probabilty ratio: ",
ort_proba_ratio, ") will be used.")
ort_consensus_ratio = list_ret$ort_proba_ratio
}
list_ret$ort_consensus_ratio = ort_consensus_ratio
list_ret$propagation = check_param_logical (propagation, "propagation", FALSE)
list_ret$latent = check_param_string (latent, "latent", MIIC_VALID_LATENT)
if ( test_param_wrong_int (n_eff, min=-1, max=nrow(input_data) )
|| (n_eff == 0) )
{
miic_warning ("parameters", "supplied value ", n_eff,
" for the number of effective samples is invalid.",
" The number of effective samples must be an integer that can be -1",
" for an automatic assignment or a positive number less or",
" equal to the number of samples. The default value (-1) will be used.")
n_eff = -1
}
list_ret$n_eff = n_eff
n_shuffles = check_param_int (n_shuffles, "number of shufflings", 0, min=0, max=NA)
if (n_shuffles == 0)
{
if ( (length(conf_threshold) > 1)
|| ( (!is.null (conf_threshold))
&& (!is.na (conf_threshold))
&& (conf_threshold != 0) ) )
miic_warning ("parameters", "supplied value ", list_to_str (conf_threshold),
" for the confidence threshold parameter will be ignored",
" as the number of shufflings is set to 0.",
" To activate the confidencence cut, both the number of shufflings",
" and the confidence threshold must be > 0 (i.e.: n_shuffles = 100",
" and conf_threshold = 0.01).")
conf_threshold = 0
}
else
{
if ( test_param_wrong_float (conf_threshold, min=0, max=NA) )
{
miic_warning ("parameters", "supplied value ", conf_threshold,
" for the confidence threshold parameter is invalid.",
" When confidence cut is activated (when n_shuffles > 0),",
" the confidence threshold must be a floating point > 0. The",
" confidence cut will be desactivated and default values will be used",
" for the number of shufflings (0) and the confidence threshold (0).")
n_shuffles = 0
conf_threshold = 0
}
else if (conf_threshold == 0)
{
miic_warning ("parameters", "the confidence threshold parameter is 0",
" but it must be > 0 when confidence cut is activated",
" (when n_shuffles > 0). The confidence cut will be desactivated.",
" To activate the confidencence cut, both the number of shufflings",
" and the confidence threshold must be > 0 (i.e.: n_shuffles = 100",
" and conf_threshold = 0.01).")
n_shuffles = 0
}
}
list_ret$n_shuffles = n_shuffles
list_ret$conf_threshold = conf_threshold
if ( ( ! is.null (sample_weights) )
&& ( (length(sample_weights) != nrow(input_data))
|| (any(is.na (sample_weights)))
|| (any(!is.numeric(sample_weights)))
|| (any(sample_weights < 0))
|| (any(sample_weights > 1)) ) )
{
miic_warning ("parameters", "supplied value for the sample_weights parameter",
" is invalid. It must be a vector of the same size as the number of",
" samples in the input data and all weights must be floating points",
" in the [0,1] range. The parameter will be ignored.")
sample_weights = NULL
}
list_ret$sample_weights = sample_weights
list_ret$test_mar = check_param_logical (test_mar, "missing at random test", TRUE)
list_ret$consistent = check_param_string (consistent, "consistent", MIIC_VALID_CONSISTENT)
if (list_ret$consistent == "no")
{
if ( (length (max_iteration) > 1)
|| ( (!is.null (max_iteration))
&& (!is.na (max_iteration))
&& (max_iteration != 100) ) )
miic_warning ("parameters", "supplied value ", list_to_str(max_iteration),
" for the maximum iteration parameter will not be used",
" as consistency is off.")
max_iteration = 100
if ( (length (consensus_threshold) > 1)
|| ( (!is.null (consensus_threshold))
&& (!is.na (consensus_threshold))
&& (consensus_threshold != 0.8) ) )
miic_warning ("parameters", "Supplied value ", list_to_str(consensus_threshold),
" for the consensus threshold parameter will not be used",
" as consistency is off.")
consensus_threshold = 0.8
}
else # Consistency on
{
if ( test_param_wrong_int (max_iteration, min=1, max=NA) )
{
miic_warning ("parameters", "supplied value ", max_iteration,
" for the maximum iteration parameter is invalid.",
" It must be a stricly positive integer when consistency is activated.",
" The default value (100) will be used.")
max_iteration = 100
}
if ( test_param_wrong_float (consensus_threshold, min=0.5, max=1) )
{
miic_warning ("parameters", "supplied value ", consensus_threshold,
" for the consensus threshold parameter is invalid.",
" It must be a floating point between 0.5 and 1 when consistency is",
" activated. The default value (0.8) will be used.")
consensus_threshold = 0.8
}
}
list_ret$max_iteration = max_iteration
list_ret$consensus_threshold = consensus_threshold
list_ret$negative_info = check_param_logical (negative_info,
"allowing/disallowing negative shifted mutual information", FALSE)
list_ret$verbose = check_param_logical (verbose, "verbose", FALSE)
return (list_ret)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.