R/subgroup.R

Defines functions is.pattern.matching .FreeMemory ToDataFrame .FormatDoubleSignificantDigits DiscoverSubgroups DiscoverSubgroupsByTask .ExtractSelectors .ConvertDescription .GetParameters as.target CreateSDTask .CreateSimpleSDTask .CreateOntologyForData

Documented in as.target CreateSDTask DiscoverSubgroups DiscoverSubgroupsByTask is.pattern.matching ToDataFrame

###############################################################################
#    rsubgroup package R classes
# 
#    This file is part of the rsubgroup package.
#    Copyright (C) 2011-2019 by Martin Atzmueller
#    
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
    
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#    
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#    Contact: Martin Atzmueller (martin@atzmueller.net)
###############################################################################

library(foreign)
library(rJava)

setGeneric(".CreateARFFProvider",
    function(source, name, ...) {
      standardGeneric(".CreateARFFProvider")
    }
)

setMethod(".CreateARFFProvider", signature(source = "data.frame", name = "character"),
    function(source, name, ...) {
      # Creates a dataset provider (converting the dataframe)
      arff.vector <- NULL # This prevents the R CHECK NOTE:
      # 'No visible bindingfor global variable Note in R CMD check'
      
      con <- textConnection("arff.vector", open = "w", local = TRUE)
      write.arff(source, con)
      flush(con)
      close(con)
      rm(con)
      arff <- paste(arff.vector, "", collapse="\n")
      provider <- .jnew("org/vikamine/kernel/xpdl/ARFFAsStringDatasetProvider", arff, name)
      rm(arff.vector)
      return(provider)
    }
)

setMethod(".CreateARFFProvider", signature(source = "character", name = "character"),
    function(source, name, ...) {
      # Creates a dataset provider given a file name
      provider <- .jnew("org/vikamine/kernel/xpdl/FileDatasetProvider", source, name)
      return(provider)
    }
)

.CreateOntologyForData <- function(provider, dataset) {
  # Creates the ontology object for the respective dataset
  ontology <- .jcall(provider, "Lorg/vikamine/kernel/data/Ontology;","getDataset", dataset)
  return(ontology)
}

.CreateSimpleSDTask <- function(ontology, target) {
  # Creates a simple subgroup discovery task
  .FreeMemory()
  simpleTask <- new(J("org/vikamine/kernel/subgroup/search/SDSimpleTask"), ontology)
  if (!is.null(target$value)) {
    selector <- new(J("org/vikamine/kernel/subgroup/selectors/DefaultSGSelector"), ontology, target$attribute, target$value)
    target <- new(J("org/vikamine/kernel/subgroup/target/SelectorTarget"), selector)
  } else {
    attribute <- J(ontology, "getAttribute", target$attribute)
    target <- new(J("org/vikamine/kernel/subgroup/target/NumericTarget"), attribute)
  }
  J(simpleTask, "setTarget", target)
  return(simpleTask)
}

CreateSDTask <- function(source, target, config = SDTaskConfig()) {
  # Creates a subgroup discovery task
  #
  # Args:
  #   source: A data source, i.e., dataframe or file (name)
  #   target: The target variable
  #   config: A SDTaskConfig task configuration
  #
  # Returns:
  #   A subgroup discovery task
  .FreeMemory()
  provider <- .CreateARFFProvider(source, "data")
  ontology <- .CreateOntologyForData(provider, "data")
  task <- .CreateSimpleSDTask(ontology, target)
  J(task, "setQualityFunction", config@qf)
  J(task, "setSDMethod", config@method)
  J(task, "setMaxSGCount", as.integer(config@k))
  J(task, "setMinQualityLimit", as.double(config@minqual))
  J(task, "setMinSubgroupSize", as.double(config@minsize))
  J(task, "setMinTPSupportAbsolute", as.double(config@mintp))
  J(task, "setMaxSGDSize", as.integer(config@maxlen))
  J(task, "setSuppressStrictlyIrrelevantSubgroups", config@relfilter)
  J(task, "setIgnoreDefaultValues", config@nodefaults)
  if ((length(config@postfilter) > 1) || (nchar(config@postfilter) > 0)) {
    for (filter in config@postfilter) {
      J(task, "setPostFilter", filter)
    }
  }
  if (!is.na(config@parfilter)) {
    J(task, "setPostFilterParameter", config@parfilter)
  }
  
  doDiscretize = config@discretize
  if (is.na(config@nbins)) {
	  doDiscretize = FALSE
  }
  intBins = as.integer(config@nbins)
  
  
  if (is.null(config@attributes)) {
    attributesArrayObject <- .GetAllAttributesAsJArray(ontology = ontology)
    J(task, "setAttributes", attributesArrayObject, doDiscretize, intBins)
  } else if ((!is.null(config@attributes)) && (length(config@attributes) > 0)) {
    J(task, "setAttributes", .jarray(config@attributes), doDiscretize, intBins)  
  } else {
    J(task, "setAttributes", .jarray(character(0)), doDiscretize, intBins) 
  }
    
  return(task)
}

as.target <- function(attribute=NULL, value=NULL) {
  # Creates a target variable object given attribute and value (for nominals)
  #
  # Args:
  #   attribute: The respective attribute
  #   value: For nominals, the respective value; for numeric NULL
  #
  # Returns:
  #   A target object representation
  if (!is.null(attribute) && !is.null(value))
    return(list(attribute=attribute, value=value))
  else if (!is.null(attribute))
    return(list(attribute=attribute))
  else
    return(NULL)
}

.GetParameters <- function(task, sg) {
  target <- J(task, "getTarget")
  if (J(target, "isBoolean")) {
    size <- J(J(sg, "getStatistics"), "getSubgroupSize")
    p <- J(J(sg, "getStatistics"), "getP")
    p0 <- J(J(sg, "getStatistics"), "getP0")
    chi2 <- J("org.vikamine.kernel.subgroup.SGUtils")$calculateChi2OfSubgroup(J(sg, "getStatistics"))
    return(list(p = p, p0 = p0, chi2=chi2, size = size))
  } else if (J(target, "isNumeric")) {
    size <- J(J(sg, "getStatistics"), "getSubgroupSize")
    mean <- J(J(sg, "getStatistics"), "getSGMean")
    popMean <- J(J(sg, "getStatistics"), "getPopulationMean")
    return(list(mean = mean, populationMean = popMean, size = size))
  } else {
    stop("Unknown target")
  }
}

.ConvertDescription <- function(sgDescription) {
  # Internal function for converting a (Java) SGDescription consisting
  # of a set of selection expressions into a character vector of strings
  # representing these
  # Args:
  #   sgDescription: A (Java) SGDescription object
  #
  # Returns:
  #   A character vector
  sgSelectorArray <-
      J("org.vikamine.kernel.subgroup.search.SDSimpleTask")$getSimpleDescription(sgDescription)
  return(as.character(sgSelectorArray))
}

.ExtractSelectors <- function(sgDescription) {
  # Internal function for converting a (Java) SGDescription consisting
  # of a set of selection expressions into a list of those expressions
  # where the 'key' is the attribute and the 'value' is the selector value
  # Args:
  #   sgDescription: A (Java) SGDescription object
  #
  # Returns:
  #   A list of characters
  result <- list()
  sgSelectorList <- J(J(sgDescription, "getSelectors"), "toArray")
  sgSelectorArray <- .jevalArray(
      sgSelectorList,
      simplify=TRUE)
  for (selector in sgSelectorArray) {
    attribute <- .jcall("org/vikamine/kernel/subgroup/search/SDSimpleTask", "Ljava/lang/String;", method = "getAttributeIDOfSelector", .jcast(selector, "org/vikamine/kernel/subgroup/selectors/SGSelector"))
    value <- .jcall("org/vikamine/kernel/subgroup/search/SDSimpleTask", "Ljava/lang/String;", method="getSingleValueIDOfSelector", .jcast(selector, "org/vikamine/kernel/subgroup/selectors/SGSelector"))
    tmp <- list()
    tmp[[attribute]] <- value
    result <- append(result, tmp)
  }
  return(result)
}

DiscoverSubgroupsByTask <- function(task, as.df = FALSE) {
  # Internal function for setting up and performing subgroup discovery
  # Args:
  #   task: A subgroup discovery task
  #
  # Returns:
  #   A list of subgroup patterns
  sgSet <- J(task, "performSubgroupDiscovery")
  sgList <- J(sgSet, "toSortedList", FALSE)
  sgArray <- .jevalArray(J(sgList, "toArray"))
  
  patterns <- list()
  for (sg in sgArray) {
    #description <- as.character(J(J(sg, "getSGDescription"), "getDescription"))
    sgDescription <- J(sg, "getSGDescription")
    description <- .ConvertDescription(sgDescription)
    selectors <- .ExtractSelectors(sgDescription)
    quality <- J(sg, "getQuality")
    size <- J(J(sg, "getStatistics"), "getSubgroupSize")
    parameters <- .GetParameters(task, sg)    
    pattern <- new("Pattern", description=description, quality=quality, size=size, parameters=parameters, selectors=selectors)
    patterns <- append(patterns, pattern)
  }
  
  if (as.df) {
    dataFrameRules <- ToDataFrame(patterns)
    return(dataFrameRules)
  } else {
    return(patterns)
  }
}

DiscoverSubgroups <- function(source, target, config=SDTaskConfig(), as.df=FALSE) {
  # Performs subgroup discovery according to target and config on data
  #
  # Args:
  #   data: A dataframe
  #   target: A target variable (constructed by as.target)
  #   config: a SDTaskConfig configuration for the algorithm
  #
  # Returns:
  #   A list of subgroup patterns
  task <- CreateSDTask(source, target, config)
  result <- DiscoverSubgroupsByTask(task, as.df)
  return(result)
}


.FormatDoubleSignificantDigits <- function(double, ndigits=2) {
  # Internal function: Prints double according to
  if (is.numeric(ndigits)) {
    sprintf(paste("%.", ndigits, "f", sep=""), double)
  } else {
    double
  }
}



ToDataFrame <- function(patterns, ndigits=2) {
  # Transforms a list/vector of patterns into a dataframe
  #
  # Args:
  #   patterns: List of patterns
  #   ndigits: Number of significant digits for floats
  #
  # Returns:
  #   The dataframe containing the pattern information
  isNumeric <- FALSE  
  descriptions <- list()
  length(descriptions) <- length(patterns)
  qualities <-list()
  length(qualities) <- length(patterns)
  sizes <- list()
  length(sizes) <- length(patterns)
  ps <- list()
  length(ps) <- length(patterns)
  chi2 <- list()
  length(chi2) <- length(patterns)
  
  i <- 1
  for (pattern in patterns) {
    descriptions[i] <- paste(pattern@description, collapse=", ")
    qualities[i] <- .FormatDoubleSignificantDigits(pattern@quality, ndigits)
    sizes[i] <- pattern@size
    if (!is.null(pattern@parameters$mean)) {
      ps[i] <- .FormatDoubleSignificantDigits(pattern@parameters$mean, ndigits)
      isNumeric <- TRUE
    } else {
      ps[i] <- .FormatDoubleSignificantDigits(pattern@parameters$p, ndigits)
      chi2[i] <- .FormatDoubleSignificantDigits(pattern@parameters$chi2, ndigits)
      isNumeric <- FALSE
    }
    i <- i + 1
  }
  if (isNumeric) {
    dataframe <- data.frame(
        quality=as.vector(qualities, "numeric"),
        mean=as.vector(ps, "numeric"), 
        size=as.vector(sizes, "numeric"),
        description=as.vector(descriptions, "character"))
  } else {
    dataframe <- data.frame(
        quality=as.vector(qualities, "numeric"),
        p=as.vector(ps, "numeric"), 
        size=as.vector(sizes, "numeric"),
        chi2=as.vector(chi2, "numeric"),
        description=as.vector(descriptions, "character"))
  }
  return(dataframe)
}

.FreeMemory <- function(...) {
  # Call the R garbage collection
  # Then call Java garbage collection
  gc(...)
  .jcall("java/lang/System", method = "gc")
  invisible()
}

is.pattern.matching <- function(pattern, data.list) {
  selectors <- pattern@selectors
  matching <- TRUE
  for (sel in names(selectors)) {
    data.list.selector <- as.character(data.list[[sel]])
    pattern.selector <- as.character(selectors[[sel]])
    if (isTRUE(data.list.selector != pattern.selector)) {
      matching <- FALSE
      break
    }
  }
  return(matching)
}

Try the rsubgroup package in your browser

Any scripts or data that you put into this service are public.

rsubgroup documentation built on Feb. 23, 2021, 5:05 p.m.