R/dimensionality.R

Defines functions c.D3 c.D2 c.D1 pca ls.dimensionality dimensionality.formula dimensionality.default dimensionality

Documented in dimensionality dimensionality.default dimensionality.formula

#' Measures of dimensionality
#'
#' These measures give an indicative of data sparsity. They capture how sparse 
#' a datasets tend to have regions of low density. These regions are know to be 
#' more difficult to extract good classification and regression models.
#'
#' @family complexity-measures
#' @param x A data.frame contained only the input attributes.
#' @param y A response vector with one value for each row/component of x.
#' @param measures A list of measures names or \code{"all"} to include all them.
#' @param formula A formula to define the output column.
#' @param data A data.frame dataset contained the input and output attributes.
#' @param ... Not used.
#' @details
#'  The following measures are allowed for this method:
#'  \describe{
#'    \item{"D1"}{Average number of points per dimension (D1) is given by the 
#'      ratio between the number of examples and dimensionality of the dataset.}
#'    \item{"D2"}{Average number of points per PCA (D2) is similar to T2, but 
#'      uses the number of PCA components needed to represent 95% of data 
#'      variability as the base of data sparsity assessment.}
#'    \item{"D3"}{Ratio of the PCA Dimension to the Original (D3) estimates the
#'      proportion of relevant and the original dimensions for a dataset.}
#'  }
#' @return A list named by the requested dimensionality measure.
#'
#' @references
#'  Ana C Lorena, Ivan G Costa, Newton Spolaor and Marcilio C P Souto. (2012). 
#'    Analysis of complexity indices for classification problems: Cancer gene 
#'    expression data. Neurocomputing 75, 1, 33--42.
#'
#' @examples
#' ## Extract all dimensionality measures for classification task
#' data(iris)
#' dimensionality(Species ~ ., iris)
#'
#' ## Extract all dimensionality measures for regression task
#' data(cars)
#' dimensionality(speed ~ ., cars)
#' @export
dimensionality <- function(...) {
  UseMethod("dimensionality")
}

#' @rdname dimensionality
#' @export
dimensionality.default <- function(x, y, measures="all", ...) {

  if(!is.data.frame(x)) {
    stop("data argument must be a data.frame")
  }

  if(is.data.frame(y)) {
    y <- y[, 1]
  }

  if(nrow(x) != length(y)) {
    stop("x and y must have same number of rows")
  }

  if(measures[1] == "all") {
    measures <- ls.dimensionality()
  }

  measures <- match.arg(measures, ls.dimensionality(), TRUE)
  colnames(x) <- make.names(colnames(x), unique=TRUE)

  x <- binarize(x)

  sapply(measures, function(f) {
    eval(call(paste("c", f, sep="."), x=x))
  })
}

#' @rdname dimensionality
#' @export
dimensionality.formula <- function(formula, data, measures="all", ...) {

  if(!inherits(formula, "formula")) {
    stop("method is only for formula datas")
  }

  if(!is.data.frame(data)) {
    stop("data argument must be a data.frame")
  }

  modFrame <- stats::model.frame(formula, data)
  attr(modFrame, "terms") <- NULL

  dimensionality.default(modFrame[, -1, drop=FALSE], modFrame[, 1, drop=FALSE],
    measures, ...)
}

ls.dimensionality <- function() {
  c("D1", "D2", "D3")
}

pca <- function(x) {
  aux <- stats::prcomp(x)
  tmp <- length(which(summary(aux)$importance[3,] < 0.95)) + 1
  return(tmp)
}

c.D1 <- function(x) {
  ncol(x)/nrow(x)
}

c.D2 <- function(x) {
  pca(x)/nrow(x)
}

c.D3 <- function(x) {
  pca(x)/ncol(x)
}
lpfgarcia/ECoL documentation built on Dec. 22, 2020, 1:41 a.m.