#' @import Matrix methods arules
#'
library(arules)
library(R.utils)
#' @title Classifier Builder
#' @description An implementation of the CBA-CB M1 algorithm (Liu et al, 1998) adapted for R and arules package apriori implementation in place of CBA-RG.
#' @references Ma, Bing Liu Wynne Hsu Yiming. Integrating classification and association rule mining. Proceedings of the fourth international conference on knowledge discovery and data mining. 1998.
#' @param rules object of class rules from arules package
#' @param txns input object with transactions.
#' @param classitems a list of items to appear in the consequent (rhs) of the rules.
#' @param default_rule_pruning boolean indicating whether default pruning should be performed. If set to TRUE, default pruning is performed as in the CBA algorithm.
#' If set to FALSE, default pruning is not performed i.e. all rules surviving data coverage pruning are kept. In either case, a default rule is added to the end of the classifier.
#' @param rule_window the number of rules to precompute for CBA data coverage pruning. The default value can be adjusted to decrease runtime.
#' @param input_list_sorted_by_length indicates by default that the input rule list is sorted by antecedent length (as output by arules), if this param is set to false, the list will be resorted
#' @param debug output debug messages.
#' @param greedy_pruning setting to TRUE activates early stopping condition: pruning will be stopped on first rule on which total error increases.
#'
#' @return Returns an object of class \link{rules}. Note that `rules@quality` slot has been extended
#' with additional measures, specifically `orderedConf`, `orderedSupp`, and `cumulativeConf`. The rules are output in the order
#' in which they are assumed to be applied in classification. Only the first applicable rule is used to
#' classify the instance. As a result, in addition to rule confidence -- which is computed over the
#' whole training dataset -- it makes sense to define order-sensitive confidence, which is computed
#' only from instances reaching the given rule as \eqn{a/(a+b)}, where \eqn{a} is the number of instances
#' matching both the antecedent and consequent (available in slot `orderedSupp`) and \eqn{b} is the number of instances matching the antecedent, but
#' not matching the consequent of the given rule. The cumulative confidence is an experimental measure,
#' which is computed as the accuracy of the rule list comprising the given rule and all higher priority
#' rules (rules with lower index) with uncovered instances excluded from the computation.
#'
#' @export
#' @seealso \code{\link{topRules}}
#' @examples
#' #Example 1
#' txns <- as(discrNumeric(datasets::iris, "Species")$Disc.data,"transactions")
#' appearance <- getAppearance(datasets::iris,"Species")
#' rules <- apriori(txns, parameter = list(confidence = 0.5,
#' support= 0.01, minlen= 2, maxlen= 4),appearance = appearance)
#' prune(rules,txns, appearance$rhs)
#' inspect(rules)
#'
#' #Example 2
#' utils::data(Adult) # this dataset comes with the arules package
#' classitems <- c("income=small","income=large")
#' rules <- apriori(Adult, parameter = list(supp = 0.3, conf = 0.5,
#' target = "rules"), appearance=list(rhs=classitems, default="lhs"))
#' # produces 25 rules
#' rulesP <- prune(rules,Adult,classitems)
#' rulesP@quality # inspect rule quality measured including the new additions
#' # Rules after data coverage pruning: 8
#' # Performing default rule pruning.
#' # Final rule list size: 6
prune <- function(rules, txns, classitems,default_rule_pruning=TRUE, rule_window=50000,greedy_pruning=FALSE, input_list_sorted_by_length = TRUE,debug=FALSE){
if (!default_rule_pruning & greedy_pruning)
{
stop("When greedy_pruning is enabled, default_rule_pruning must be enabled too")
}
if (length(rules)==0)
{
stop("Cannot prune empty rule list")
}
# compute rule length
tryCatch(
{
rules@quality$lhs_length <- colSums(rules@lhs@data)
rulesWithEmptyAntecedent <- which(rules@quality$lhs_length==0)
if (length(rulesWithEmptyAntecedent) > 0)
{
rules <- rules[-rulesWithEmptyAntecedent]
if (debug) message("Rule(s) with empty antecedent removed")
}
if (length(rules) == 0)
{
warning("There are no rules with non-empty antecedent.")
warning("Cannot prune empty rule list")
return(rules)
}
}, error= function(err)
{
warning(err)
warning("Is there at least one rule with non empty antecedent?")
return(rules)
})
# sort rules according to CBA criteria
# the first of the following two lines is not necessary as long as the rule set was mined by apriori
if (!input_list_sorted_by_length)
{
rules <- sort(rules, by = "lhs_length", decreasing="false")
}
rules <- sort(rules, by = c("confidence", "support"))
# obtain item ids in dataframe for class items
classitemspositions <- vector(length = length(classitems))
for (i in 1:length(classitems))
{
classitemspositions[i] <- which(txns@itemInfo$labels==classitems[i])
}
# compute class frequencies
# this is neeeded to determine support of default rule during default rule pruning
alldata_classfrequencies <- rowSums(txns@data[classitemspositions,])
# set default class based on all transactions
# this default will be used only in the rare situation when the first rule matches all transactions,
# these will be removed, and thus no transactions will be left to compute default class
default_class <- which.max(alldata_classfrequencies)
orig_transaction_count <- length(txns)
distinct_items <- ncol(txns)
rules_to_remove <- c()
rule_count <- length(rules)
RULEWINDOW <- rule_window
if (RULEWINDOW > rule_count)
{
RULEWINDOW <- rule_count
}
total_errors <- rep(NA, rule_count)
default_classes <-rep(NA, rule_count)
last_total_error_without_default <- 0
last_total_error_with_default <- Inf
#not necessary for CBA algorithm, used for list-based rule confidence computation
def_rule_errors <- rep(NA, rule_count)
instances_uncovered <- rep(NA, rule_count)
#this variation on confidence is computed only from instances that reach that rule
ordered_conf <- rep(NA, rule_count)
ordered_supp <- rep(NA, rule_count)
for(r in seq_len(rule_count))
{
# the purpose of using rule windows is following:
# bulk operations on multiple rules at a time may be cheaper than many iterative multiplications of single rule vectors with all transactions
# however, when the number of input rules is large, this may result in many unneccesary operations
# the RULEWINDOW parameter controls how many rules are processed at a time
# there are two possible savings:
## in some cases, all transactions can be covered by first N rules, remaining rules thus may not need to be processed at all
## as the pruning proceeds through the rule list, the number of remaining transactions decreases, which makes subsequent multiplications cheaper
# RT holds data precomputed for current window size
if ((r-1)%%RULEWINDOW==0)
{
if (RULEWINDOW==1)
{
ws <- r
we <- r
}
else
{
ws <- ((r %/% RULEWINDOW)*RULEWINDOW)+1
we <- ws + RULEWINDOW-1
if (we > rule_count)
{
we <- rule_count
}
}
# the result of matrix mutliplication is a matrix for which element M[r,t] corresponds to how many items in rule r are matched by an item in transaction t
# the conversion from nMatrix to dMatrix since the definition of matrix multiplication operations on nMatrix would mean different result than outlined above
RT.lhs <- t(as(rules@lhs@data[,ws:we,drop = FALSE],"dMatrix")) %*% as(txns@data,"dMatrix")
# this operation would also work without the subsetting by classitemspositions
# now do the same thing for rhs
# Performance tip: nMatrix can be cast to dMatrix just once, outside the loop for the entire matrix
RT.rhs <- t(as(rules@rhs@data[classitemspositions,ws:we,drop = FALSE],
"dMatrix")) %*% as(txns@data[classitemspositions,,drop = FALSE],"dMatrix")
# thus the rule r's lhs matches the transaction t iff M[r,t] == number of items in t is the same as number of items in lhs of rule r
RT.matches_lhs <- (rules[ws:we,drop = FALSE]@quality$lhs_length==RT.lhs)
# the rule r correctly covers the transaction t if the number of items in t matched by the LHS and RHS equals the length of the LHS plus 1 (RHS must always have length 1)
RT.matches_lhs_and_rhs <- (rules[ws:we,drop = FALSE]@quality$lhs_length+1==RT.lhs+RT.rhs)
# performance tip: now the RT.lhs and RT.rhs can be freed
}
# the index of currently processed rule r is recomputed to relate to current window
sr <- r - ws +1
if (debug) message(paste("processing rule ",r, " sr = ", sr))
# cur_rule holds data for current rule r
cur_rule.matches_lhs <- RT.matches_lhs[sr,]
if (TRUE %in% cur_rule.matches_lhs)
{
if (debug) message(paste("Antecedent of rule ",r , " matches at least 1 transaction"))
# check if rule completely matches at least one transaction
R.matches_lhs_and_rhs <- RT.matches_lhs_and_rhs[sr,]
if (TRUE %in% R.matches_lhs_and_rhs)
{
if (debug) message(paste("Rule ",r, " correctly covers at least 1 transaction"))
# remove transactions matching lhs
RT.matches_lhs <- RT.matches_lhs[,!cur_rule.matches_lhs,drop=FALSE]
RT.matches_lhs_and_rhs <- RT.matches_lhs_and_rhs[,!cur_rule.matches_lhs,drop=FALSE]
# remove covered instances from data to allow default rule computation
# drop=FALSE ensures that when txns is subset so that it contains only one transaction the result is not converted to a vector, which would result into an error during assignment
txns@data <- txns@data[,!cur_rule.matches_lhs,drop = FALSE]
# total errors are computed and set only for rules which will not be pruned
total_errors[r] <- last_total_error_without_default+sum(cur_rule.matches_lhs-R.matches_lhs_and_rhs)
#experimental
ordered_supp[r] <- sum(R.matches_lhs_and_rhs)
ordered_conf[r] <- ordered_supp[r]/sum(cur_rule.matches_lhs)
# last_total_error should exclude default rule error, which is added to total_errors[r] later
last_total_error_without_default <- total_errors[r]
# check if there are any more transactions to process
if (ncol(txns@data)==0)
{
if (debug) message(paste("rule ",r, " No transactions to process"))
rules_to_remove <- c(rules_to_remove,(r+1):rule_count)
# there are no transactions left from which to compute the default class for this rule
# use default class from the last iteration
default_classes[r] <- default_class
instances_uncovered[r] <- 0
break
}
else{
# compute default rule error only if there are any transactions left
classfrequencies <- rowSums(txns@data[classitemspositions,,drop=FALSE])
default_class <- which.max(classfrequencies)
majority_class_tran_count <- classfrequencies[default_class]
default_err_count <- length(txns) - majority_class_tran_count
total_errors[r] <- total_errors[r] + default_err_count
default_classes[r] <- default_class
# for ordered confidence computation (not in original CBA)
def_rule_errors[r] <- default_err_count
instances_uncovered[r] <- sum(classfrequencies)
# experimental option (not in original CBA)
if (greedy_pruning)
{
if (last_total_error_with_default<total_errors[r])
{
if (debug) message(paste("Total error including default increase on rule ",r))
rules_to_remove <- c(rules_to_remove,(r+1):rule_count)
break;
}
last_total_error_with_default <- total_errors[r]
}
}
}
else
{
if (debug) message(paste("Rule ",r, " does not match at least 1 transaction correctly"))
rules_to_remove <- c(rules_to_remove,r)
}
}
else{
if (debug) message(paste("Antecedent of rule ",r, " does not match at least 1 transaction"))
rules_to_remove <- c(rules_to_remove,r)
}
}
message(paste("Original rules: ",rule_count))
if (length(rules_to_remove)>0)
{
rules <- rules[-rules_to_remove]
total_errors <- total_errors[-rules_to_remove]
def_rule_errors <- def_rule_errors[-rules_to_remove]
default_classes <- default_classes[-rules_to_remove]
instances_uncovered <- instances_uncovered[-rules_to_remove]
ordered_conf <- ordered_conf[-rules_to_remove]
ordered_supp <- ordered_supp[-rules_to_remove]
}
message(paste("Rules after data coverage pruning:",length(rules)))
# perform default rule pruning
if (default_rule_pruning)
{
message("Performing default rule pruning.")
# find "last rule" the first rule with the smallest number of errors
if (debug)
{
message("total errors by rule")
print (total_errors)
}
last_rule_pos <- which.min(total_errors)
# keep only the top rules until "last rule"(inclusive)
rules <- rules[1:last_rule_pos]
total_errors <- total_errors[1:last_rule_pos]
def_rule_errors <- def_rule_errors[1:last_rule_pos]
default_classes <- default_classes[1:last_rule_pos]
instances_uncovered <- instances_uncovered[1:last_rule_pos]
ordered_conf <- ordered_conf[1:last_rule_pos]
ordered_supp <- ordered_supp[1:last_rule_pos]
}
else {
last_rule_pos <- length(rules)
}
# compute ordered confidence estimate for the default rule from the incorrect classifications
# by the last regular rule in the list caused by applying the the default rule on instances
# uncovered by this rule
defaultRuleOrderedConfidence <- 1- def_rule_errors[last_rule_pos]/instances_uncovered[last_rule_pos]
# append ordered confidence estimates to regular rules, defaultRuleOrderedConfidence will be
# appended later
rules@quality <- cbind(rules@quality,orderedConf=ordered_conf)
rules@quality <- cbind(rules@quality,orderedSupp=ordered_supp)
# compute cumulative_confidence - the accuracy of the rule list up to the given rule excluding uncovered
# instances
cumulative_confidence<- 1 - (total_errors- def_rule_errors)/(orig_transaction_count - instances_uncovered)
cumulative_confidence_default_rule <- 1- total_errors[last_rule_pos]/orig_transaction_count
# append ordered confidence estimates to regular rules, defaultRuleOrderedConfidence will be
# appended later
rules@quality <- cbind(rules@quality,cumulativeConf=cumulative_confidence)
# add default rule to the end
## the lhs of the default rule has no items (all item positions to 0 in the item matrix)
rules@lhs@data <- cbind(rules@lhs@data, as(matrix(0, distinct_items,1),"nMatrix"))
# now prepare the rhs of the default rule
# first prepare empty vector
default_rhs <- as(matrix(0, distinct_items,1),"nMatrix")
# the class associated with the default rule is the class that has been precomputed as part of evaluation of the "last rule"
default_rhs[classitemspositions[default_classes[last_rule_pos]]] <- TRUE
# finally append the default rule rhs to the rules rhs matrix
rules@rhs@data <- cbind(rules@rhs@data, default_rhs)
# compute the support and confidence (will be the same) of the default rule
default_rule_absolute_support <- alldata_classfrequencies[default_classes[last_rule_pos]]
default_rule_support_confidence <- default_rule_absolute_support/orig_transaction_count
#Coverage is the support of the left-hand-side of the rule, in case of default rule all instances are covered
coverage_default <- 1.0
# lift of default rule is also 1.0
lift_default <- 1.0
#count is absolute support
def_rule_lhs_length <- 0
def_rule_ordered_supp <- instances_uncovered[last_rule_pos] * defaultRuleOrderedConfidence
#changing the order may have undesired consequences
#the order is used, e.g., in method prediction
rules@quality <- rbind(rules@quality,c(default_rule_support_confidence,default_rule_support_confidence,coverage_default,lift_default,default_rule_absolute_support,def_rule_lhs_length,defaultRuleOrderedConfidence,def_rule_ordered_supp,cumulative_confidence_default_rule))
# set row name on the newly added default rule to "0"
# so that it does not clash with any of the row names of the original rule list
row.names(rules@quality)[nrow(rules@quality)] <- 0
message(paste("Final rule list size: ",length(rules)))
return(rules)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.