#' Runs the logical itemset mining algorithm to generate pairs of items
#' @param data A dataframe containing
#' @param item_col The name of the column in the dataframe containing the item
#' idenfifier
#' @param min_cooccur_cnt The minimum times a cooccuring pair can cooccur to be
#' consider
#' @param link_col The name of the column in the dataframe linking the paris of
#' items
#' @param min_consistency The minimum information criteria to retain a pair
#' in the output data set
#' @examples
#' data <- data.frame(item_id = c(1,3,4,6,3,4,3,5,6,1,2,1,4,2),
#' trans_id = c(1,2,3,4,1,2,3,4,1,2,3,4,1,2))
#' lism(data, item_col = "item_id", link_col = "trans_id",
#' min_consistency = 0.2)
#' @export
#' @import assertthat
#' @import data.table
lism <- function (data, item_col, link_col, min_cooccur_cnt = 0,
min_consistency = 0.1) {
assert_that(is.string(item_col),
is.string(link_col),
is.number(min_cooccur_cnt),
is.number(min_consistency),
is.data.frame(data))
# Build the minimum necessary data.table
data_dt <- data.table(item_id = data[, item_col],
link_id = data[, link_col],
key = "link_id")
# Get a list item combinations
data_dt <- merge(x = data_dt, y = data_dt, by = "link_id",
allow.cartesian = TRUE)
#data_dt <- data_dt[data_dt$item_id.x < data_dt$item_id.y, c("item_id.x", "item_id.y")]
#data_dt[, cooccurrences := .N, by = c("item_id.x", "item_id.y")]
# Filter our dupes and calculate coocurrences
data_dt <- data_dt[data_dt$item_id.x < data_dt$item_id.y,
list(cooccurrences = .N),
by = c("item_id.x", "item_id.y")]
if (min_cooccur_cnt > 0) {
# Filter out low coocurrence pairs
data_dt <- data_dt[data_dt$cooccurrences >= min_cooccur_cnt,]
}
repeat {
prev_pairs = NROW(data_dt)
# Total occurences by item
total_dt <- data.table(item_id = c(data_dt$item_id.x, data_dt$item_id.y),
cooccurrences = c(data_dt$cooccurrences,
data_dt$cooccurrences),
key = "item_id")
total_dt <- total_dt[, list(total_occurences = sum(cooccurrences)), by = "item_id"]
# Total coocurrences overall
total_cc <- sum(data_dt$cooccurrences)
data_dt$p_a_b = data_dt$cooccurrences / total_cc
total_dt$mc <- total_dt$total_occurences / total_cc
data_dt$p_a <- total_dt$mc[match(data_dt$item_id.x, total_dt$item_id)]
data_dt$p_b <- total_dt$mc[match(data_dt$item_id.y, total_dt$item_id)]
data_dt$nmpi <- (pmax(0, log(data_dt$p_a_b / (data_dt$p_a * data_dt$p_b)))
/ (-1 * log(data_dt$p_a_b)))
# Remove rows that don't meet the minimum information cutoff
data_dt <- data_dt[data_dt$nmpi >= min_consistency,]
# Loop until no rows are removed
if(NROW(data_dt) == prev_pairs) {
# No pairs removed so exit
break
}
}
ret_df <- data.frame(item_id_x = data_dt$item_id.x,
item_id_y = data_dt$item_id.y,
cooccurrences = data_dt$cooccurrences,
nmpi = data_dt$nmpi)
class(ret_df) <- c("data.frame","lism")
return(ret_df)
}
#' Get the latent itemsets from the data
#'
#' @param data lism data frame generated from the lism function
#' @param min Sets the minimum clique size to return
#' @param max Sets the maxmium clique size to return
#' @examples
#' data <- data.frame(item_id = c(1,3,4,6,3,4,3,5,6,1,2,1,4,2),
#' trans_id = c(1,2,3,4,1,2,3,4,1,2,3,4,1,2))
#' zz <- lism(data, item_col = "item_id", link_col = "trans_id",
#' min_consistency = 0.2)
#' getCliques(zz, min = 3, max = 10)
#' @export
#' @import igraph
#' @import assertthat
getCliques <- function(data, min = 3, max = NULL) {
assert_that(is.number(min),
is.number(max))
data_graph <- matrix(c(data$item_id_x, data$item_id_y), ncol = 2)
gr <- graph.edgelist(data_graph, directed = FALSE)
return(max_cliques(gr, min = min, max = max))
}
# cl <- lapply(1:NROW(mc), function(x) {
#
# as.character(data$item_txt[match(mc[[114]], data$item_id)])
# })
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.