#' @export
#' @title Subgroup discovery algorithms for use with structural equation models
#' @description This function is the main function of the package and can be
#' flexibly used to interface the python module pysubgroup for efficiently
#' finding subgroups in structural equation models estimated by the R package
#' lavaan.
#' @param f_fit Function to be fitted. Must take at least two arguments.
#' \code{f_fit} has the signature \code{function(group, dat, ...)}. \code{group}
#' is a numeric vector. The length of this vector equals the rows in the data
#' frame \code{dat} and is to be interpreted as an additional column indicating
#' the group assignment. \code{f_fit} returns the interestingness measure.
#' Returned values should be greater than \code{min_quality} in case of sucess
#' and smaller in case of failure (e.g., non-convergence, error).
#' @param dat A data frame.
#' @param columns Column names of the provided data frame which are to be
#' analysed. Columns must have ordinal or nominal scale.
#' @param ignore Optional argument. If \code{columns = NULL}, \code{ignore} will
#' be used to select every column that is not in ignore.
#' @param algorithm A character specifying the subgroup discovery algorithm to
#' use. An exhaustive depth-first search is provided with 'SimpleDFS' (default)
#' . A heuristic (non-exhaustive) Beam search is provided with 'Beam', but not
#' yet implemented.
#' @param max_n_subgroups Maximum number of subgroups. Default is 10.
#' @param search_depth Maximum number of attribute combinations. Default is 3.
#' @param min_quality Minimum value of interestingness measure. Values below
#' will not be considered. Default is 0.
#' @param min_subgroup_size Minimum size of a subgroup. Subgroups with sizes
#' below will not be considered. The absolute minimum is set to 50 units, which
#' can not be lowered. If NULL (default) the absolute minimum is applied.
#' @param na_rm Boolean. Default is FALSE. If set to TRUE, cases with NA values
#' on any column will be set to FALSE in the \code{sg} vector. If set to FALSE,
#' the regarding in the \code{sg} vector will be also \code{NA}.
#' @param weighting_attr This option is \emph{deprecated}.
#' @param generalization_aware This option is \emph{deprecated}.
#' @param bw Integer for beam width. Only used if algorithm is Beam search.
#' Defaults to \code{max_n_subgroups}.
#' @param verbose Logical. Get some information, what is going on. Defaults
#' to \code{FALSE}.
#' @param ... Additional arguments to be passed to \code{f_fit}. Currently,
#' not well implemented.
#' @return List containing the time consumed and the groups.
#' @examples
#' if (FALSE) {
#' model <- "
#' eta1 =~ NA*x1 + x2 + x3
#' eta2 =~ NA*x4 + x5 + x6
#' eta3 =~ NA*x7 + x8 + x9
#'
#' eta1 ~~ 1*eta1
#' eta2 ~~ 1*eta2
#' eta3 ~~ 1*eta3
#'
#' eta1 + eta2 + eta3 ~ 0*1
#' "
#'
#' f_fit <- function(sg, dat) {
#' # Add subgroup to dataset (from logical to numeric)
#' sg <- as.numeric(sg)
#' dat$subgroup <- sg
#'
#' # if all participants in subgroup return -1
#' if (all(sg == 1)) {
#' rval <- 0
#' return(rval)
#' }
#' rval <- tryCatch(
#' {
#' # Fit Model
#' fit <- sem(model, data = dat, group = "subgroup")
#' stopifnot(lavInspect(fit, "post.check"))
#'
#' # Compute interestingness measure
#' tmp <- partable(fit)
#' lam1 <- tmp$est[
#' tmp$lhs == "eta1" &
#' tmp$op == "=~" & tmp$group == 1
#' ]
#' lam2 <- tmp$est[
#' tmp$lhs == "eta1" &
#' tmp$op == "=~" &
#' tmp$group == 2
#' ]
#' difflam <- abs(lam2 - lam1)
#' rval <- sum(sg, na.rm = T)^0.5 * sum(difflam)
#' },
#' error = function(e) -1
#' )
#'
#' if (!is.numeric(rval) | length(rval) > 1) {
#' rval <- -1
#' }
#'
#' return(rval)
#' }
#'
#' m1 <- subgroupsem(
#' f_fit = f_fit,
#' dat = HolzingerSwineford1939,
#' columns = c("sex", "school", "grade")
#' )
#' summary(m1)
#' }
#' @importFrom reticulate import_main import py_run_string
#' @import lavaan
subgroupsem <- function(f_fit,
dat,
columns = names(dat),
ignore = NULL,
algorithm = "SimpleDFS",
max_n_subgroups = 10L,
search_depth = 3L,
min_quality = 0,
min_subgroup_size = NULL,
weighting_attr = NULL,
generalization_aware = FALSE,
na_rm = FALSE,
bw = NULL,
verbose = FALSE,
...) {
# Some lines to check, whether reticulate / Python is set up correctly...
if (!((status <- subgroupsem_ready(ask = T)) == TRUE)) {
stop(status)
}
obj <- new("subgroupsem")
obj@call <- match.call()
# Some checks due to the refactoring arguments may be deprecated
# Weighting attribute was never implemented and is currently not planned
if (!is.null(weighting_attr)) {
warning(
paste(
"subgroupsem warning:",
"Option 'weighting_attr' is deprecated and will not be used."
)
)
}
# Generalization awareness was implemented before,
# but not sure whether it is useful
if (generalization_aware) {
warning(
paste(
"subgroupsem warning:",
"Option 'generalization_aware' is deprecated and will not be used."
)
)
}
# Import Python environment, pysubgroup module, and semtarget class
py_main <- import_main()
py_main$ps <- import("pysubgroup")
py_run_string(get_py_classes())
## push data and f_fit function to python
py_main$data <- dat
## TODO: maybe it would be easier to extend the class Conjunction
## for the complement...
## get matrix of NAs
if (!is.null(ignore)) {
columns <- columns[!(columns %in% ignore)]
}
has_na <- sapply(columns, function(column) is.na(dat[, column]))
subsem_county <- 1L
f_fit_internal <- function(sg, selectors = NULL) {
# Empty selectors are transfered as empty list...
if (is.list(selectors)) selectors <- unlist(selectors)
## if selectors for subgroup is not NULL
if (!na_rm && !is.null(selectors)) {
## if case has NA in one of the selectors, insert NA
has_na_selectors <- apply(has_na[, selectors, drop = F], 1, any)
sg <- ifelse(has_na_selectors, NA, sg)
}
# Check if subgroup is big enough, else return -1
if (!is.null(min_subgroup_size)) {
if (sum(sg, na.rm = TRUE) < min_subgroup_size) {
return(-1)
}
}
if (verbose) {
if (subsem_county > 1L) {
cat("\r")
} else {
cat("\n")
}
msg <- paste(
"Inspecting group",
subsem_county,
"combination of",
paste(selectors, collapse = " AND ")
)
cat(msg)
subsem_county <<- subsem_county + 1L
}
## pass sg and dat to user specified function
return(f_fit(sg, dat))
}
py_main$f_fit <- f_fit_internal
# RUN PYSUBGROUP ROUTINE
# 1. Define target class
py_main$target <- py_main$SEMTarget()
# 2. Define selector variables, i.e. variables named in columns
# and not excluded through ignore
ignore_names <- names(dat)[!(names(dat) %in% columns)]
py_main$searchspace <- py_main$ps$create_selectors(
dat,
ignore = ignore_names
)
# 3. Specify the subgroup discovery task by passing all required
# arguments to the respective Python function
py_main$task <- py_main$ps$SubgroupDiscoveryTask(
py_main$data,
py_main$target,
py_main$searchspace,
qf = py_main$SEM_QF(),
result_set_size = max_n_subgroups,
depth = search_depth,
min_quality = min_quality,
constraints = NULL
)
# 4. Specify and run the search algorithm
# only DFS implemented at the moment
# Beam search should follow shortly
# Double beam search, perhaps?
start <- Sys.time()
if (algorithm == "SimpleDFS" | algorithm == "DFS") {
py_run_string("result = ps.SimpleDFS().execute(task)")
} else if (algorithm == "Beam") {
if (is.null(bw)) {
py_main$bw <- as.integer(max_n_subgroups)
} else {
py_main$bw <- as.integer(bw)
}
py_run_string("result = ps.BeamSearch(beam_width=bw).execute(task)")
} else {
warning(
paste(
"subgroupsem warning:",
"Currently only depth-first-search (DFS) and beam search are available as algorithms."
)
)
}
end <- Sys.time()
# Import results
obj@time_elapsed <- end - start
obj@summary_statistics <- py_main$result$to_dataframe()
# At last try to remove everything on the Python site from memory
clean_up_python()
return(obj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.