R/LogicalItemsetMiner.R

#' Runs the logical itemset mining algorithm to generate pairs of items
#' @param data A dataframe containing
#' @param item_col The name of the column in the dataframe containing the item
#' idenfifier
#' @param min_cooccur_cnt The minimum times a cooccuring pair can cooccur to be
#' consider
#' @param link_col The name of the column in the dataframe linking the paris of
#' items
#' @param min_consistency The minimum information criteria to retain a pair
#' in the output data set
#' @examples
#' data <- data.frame(item_id = c(1,3,4,6,3,4,3,5,6,1,2,1,4,2),
#'                    trans_id = c(1,2,3,4,1,2,3,4,1,2,3,4,1,2))
#' lism(data, item_col = "item_id", link_col = "trans_id",
#'      min_consistency = 0.2)
#' @export
#' @import assertthat
#' @import data.table
lism <- function (data, item_col, link_col, min_cooccur_cnt = 0,
                  min_consistency = 0.1) {

  assert_that(is.string(item_col),
              is.string(link_col),
              is.number(min_cooccur_cnt),
              is.number(min_consistency),
              is.data.frame(data))

  # Build the minimum necessary data.table
  data_dt <- data.table(item_id = data[, item_col],
                        link_id = data[, link_col],
                        key = "link_id")

  # Get a list item combinations
  data_dt <- merge(x = data_dt, y = data_dt, by = "link_id",
                   allow.cartesian = TRUE)

  #data_dt <- data_dt[data_dt$item_id.x < data_dt$item_id.y, c("item_id.x", "item_id.y")]
  #data_dt[, cooccurrences := .N, by = c("item_id.x", "item_id.y")]

  # Filter our dupes and calculate coocurrences
  data_dt <- data_dt[data_dt$item_id.x < data_dt$item_id.y,
                     list(cooccurrences = .N),
                     by = c("item_id.x", "item_id.y")]

  if (min_cooccur_cnt > 0) {
    # Filter out low coocurrence pairs
    data_dt <- data_dt[data_dt$cooccurrences >= min_cooccur_cnt,]
  }

  repeat {

    prev_pairs = NROW(data_dt)

    # Total occurences by item
    total_dt <- data.table(item_id = c(data_dt$item_id.x, data_dt$item_id.y),
                           cooccurrences = c(data_dt$cooccurrences,
                                             data_dt$cooccurrences),
                           key = "item_id")
    total_dt <- total_dt[, list(total_occurences = sum(cooccurrences)), by = "item_id"]

    # Total coocurrences overall
    total_cc <- sum(data_dt$cooccurrences)

    data_dt$p_a_b = data_dt$cooccurrences / total_cc

    total_dt$mc <- total_dt$total_occurences / total_cc
    data_dt$p_a <- total_dt$mc[match(data_dt$item_id.x, total_dt$item_id)]
    data_dt$p_b <- total_dt$mc[match(data_dt$item_id.y, total_dt$item_id)]

    data_dt$nmpi <- (pmax(0, log(data_dt$p_a_b / (data_dt$p_a * data_dt$p_b)))
                              / (-1 * log(data_dt$p_a_b)))

    # Remove rows that don't meet the minimum information cutoff
    data_dt <- data_dt[data_dt$nmpi >= min_consistency,]

    # Loop until no rows are removed
    if(NROW(data_dt) == prev_pairs) {
      # No pairs removed so exit
      break
    }

  }

  ret_df <- data.frame(item_id_x = data_dt$item_id.x,
                       item_id_y = data_dt$item_id.y,
                       cooccurrences = data_dt$cooccurrences,
                       nmpi = data_dt$nmpi)
  class(ret_df) <- c("data.frame","lism")

  return(ret_df)

}

#' Get the latent itemsets from the data
#'
#' @param data lism data frame generated from the lism function
#' @param min Sets the minimum clique size to return
#' @param max Sets the maxmium clique size to return
#' @examples
#' data <- data.frame(item_id = c(1,3,4,6,3,4,3,5,6,1,2,1,4,2),
#'                    trans_id = c(1,2,3,4,1,2,3,4,1,2,3,4,1,2))
#' zz <- lism(data, item_col = "item_id", link_col = "trans_id",
#'            min_consistency = 0.2)
#' getCliques(zz, min = 3, max = 10)
#' @export
#' @import igraph
#' @import assertthat
getCliques <- function(data, min = 3, max = NULL) {

  assert_that(is.number(min),
              is.number(max))

  data_graph <- matrix(c(data$item_id_x, data$item_id_y), ncol = 2)
  gr <- graph.edgelist(data_graph, directed = FALSE)
  return(max_cliques(gr, min = min, max = max))

}


# cl <- lapply(1:NROW(mc), function(x) {
#
#     as.character(data$item_txt[match(mc[[114]], data$item_id)])
#   })
mvanwyk/logical-itemset-miner documentation built on May 15, 2019, 5:50 p.m.