R/statistical.R

Defines functions m.wLambda m.var m.tMean m.sparsity m.skewness m.sdRatio m.sd m.range m.nrOutliers m.nrNorm m.nrCorAttr m.min m.median m.mean m.max m.mad m.kurtosis m.iqRange m.hMean m.gMean m.eigenvalues m.nrDisc m.cov m.cor m.gravity m.canCor ls.statistical.exclude.byclass ls.statistical.multiples ls.statistical statistical.formula statistical.default statistical

Documented in ls.statistical statistical statistical.default statistical.formula

#' Statistical meta-features
#'
#' Statistical meta-features are the standard statistical measures to describe
#' the numerical properties of a distribution of data. As it requires only
#' numerical attributes, the categorical data are transformed to numerical.
#'
#' @family meta-features
#' @param x A data.frame contained only the input attributes.
#' @param y A factor response vector with one label for each row/component of x.
#' @param features A list of features names or \code{"all"} to include all them.
#'  The details section describes the valid values for this group.
#' @param summary A list of summarization functions or empty for all values. See
#'  \link{post.processing} method to more information. (Default: 
#'  \code{c("mean", "sd")})
#' @param by.class A logical value indicating if the meta-features must be
#'  computed for each group of samples belonging to different output classes.
#'  (Default: FALSE)
#' @param transform A logical value indicating if the categorical attributes
#'  should be transformed. If \code{FALSE} they will be ignored. (Default: 
#'  \code{TRUE})
#' @param formula A formula to define the class column.
#' @param data A data.frame dataset contained the input attributes and class
#'  The details section describes the valid values for this group.
#' @param ... Further arguments passed to the summarization functions.
#' @details
#'  The following features are allowed for this method:
#'  \describe{
#'    \item{"canCor"}{Canonical correlations between the predictive attributes 
#'    and the class (multi-valued).}
#'    \item{"gravity"}{Center of gravity, which is the distance between the 
#'    instance in the center of the majority class and the instance-center of 
#'    the minority class.}
#'    \item{"cor"}{Absolute attributes correlation, which measure the 
#'    correlation between each pair of the numeric attributes in the dataset 
#'    (multi-valued). This measure accepts an extra argument called 
#'    \code{method = c("pearson", "kendall", "spearman")}. See 
#'    \code{\link[stats]{cor}} for more details.}
#'    \item{"cov"}{Absolute attributes covariance, which measure the covariance 
#'    between each pair of the numeric attributes in the dataset 
#'    (multi-valued).}
#'    \item{"nrDisc"}{Number of the discriminant functions.}
#'    \item{"eigenvalues"}{Eigenvalues of the covariance matrix (multi-valued).}
#'    \item{"gMean"}{Geometric mean of attributes (multi-valued).}
#'    \item{"hMean"}{Harmonic mean of attributes (multi-valued).}
#'    \item{"iqRange"}{Interquartile range of attributes (multi-valued).}
#'    \item{"kurtosis"}{Kurtosis of attributes (multi-valued).}
#'    \item{"mad"}{Median absolute deviation of attributes (multi-valued).}
#'    \item{"max"}{Maximum value of attributes (multi-valued).}
#'    \item{"mean"}{Mean value of attributes (multi-valued).}
#'    \item{"median"}{Median value of attributes (multi-valued).}
#'    \item{"min"}{Minimum value of attributes (multi-valued).}
#'    \item{"nrCorAttr"}{Number of attributes pairs with high correlation 
#'    (multi-valued when \code{by.class=TRUE}).}
#'    \item{"nrNorm"}{Number of attributes with normal distribution. The 
#'    Shapiro-Wilk Normality Test is used to assess if an attribute is or not is
#'    normally distributed (multi-valued only when \code{by.class=TRUE}).}
#'    \item{"nrOutliers"}{Number of attributes with outliers values. The 
#'    Turkey's boxplot algorithm is used to compute if an attributes has or does 
#'    not have outliers (multi-valued only when \code{by.class=TRUE}).}
#'    \item{"range"}{Range of Attributes (multi-valued).}
#'    \item{"sd"}{Standard deviation of the attributes (multi-valued).}
#'    \item{"sdRatio"}{Statistic test for homogeneity of covariances.}
#'    \item{"skewness"}{Skewness of attributes (multi-valued).}
#'    \item{"sparsity"}{Attributes sparsity, which represents the degree of 
#'    discreetness of each attribute in the dataset (multi-valued).}
#'    \item{"tMean"}{Trimmed mean of attributes (multi-valued). It is the 
#'    arithmetic mean excluding the 20\% of the lowest and highest instances.}
#'    \item{"var"}{Attributes variance (multi-valued).}
#'    \item{"wLambda"}{Wilks Lambda.}
#'  }
#'  This method uses simple binarization to transform the categorical attributes
#'  when \code{transform=TRUE}.
#' @return A list named by the requested meta-features.
#'
#' @references
#'  Ciro Castiello, Giovanna Castellano, and Anna M. Fanelli. Meta-data: 
#'  Characterization of input features for meta-learning. In 2nd International 
#'  Conference on Modeling Decisions for Artificial Intelligence (MDAI), 
#'  pages 457 - 468, 2005.
#'
#'  Shawkat Ali, and Kate A. Smith. On learning algorithm selection for 
#'  classification. Applied Soft Computing, volume 6, pages 119 - 138, 2006.
#'
#' @examples
#' ## Extract all meta-features
#' statistical(Species ~ ., iris)
#'
#' ## Extract some meta-features
#' statistical(iris[1:4], iris[5], c("cor", "nrNorm"))
#'
#' ## Extract all meta-features without summarize the results
#' statistical(Species ~ ., iris, summary=c())
#' 
#' ## Use another summarization function
#' statistical(Species ~ ., iris, summary=c("min", "median", "max"))
#'
#' ## Extract statistical measures using by.class approach
#' statistical(Species ~ ., iris, by.class=TRUE)
#' 
#' ## Do not transform the data (using only categorical attributes)
#' statistical(Species ~ ., iris, transform=FALSE)
#' @export
statistical <- function(...) {
  UseMethod("statistical")
}

#' @rdname statistical
#' @export
statistical.default <- function(x, y, features="all",
                                   summary=c("mean", "sd"), by.class=FALSE,
                                   transform=TRUE, ...) {
  if(!is.data.frame(x)) {
    stop("data argument must be a data.frame")
  }

  if(is.data.frame(y)) {
    y <- y[, 1]
  }
  y <- as.factor(y)

  if(nrow(x) != length(y)) {
    stop("x and y must have same number of rows")
  }

  if(features[1] == "all") {
    features <- ls.statistical()
  }
  features <- match.arg(features, ls.statistical(), TRUE)
  colnames(x) <- make.names(colnames(x), unique=TRUE)
  
  if (length(summary) == 0) {
    summary <- "non.aggregated"
  }

  if (transform) {
    numdata <- binarize(x)
  } else {
    numdata <- x[, sapply(x, is.numeric), drop=FALSE]

    if (length(numdata) == 0) {
      if (by.class) {
        multiples <- c(ls.statistical.multiples(), 
                       setdiff(setdiff(ls.statistical(), 
                                       ls.statistical.multiples()), 
                               ls.statistical.exclude.byclass()))
      } else {
        multiples <- ls.statistical.multiples()
      }
      return(
        sapply(features, function(f) {
          post.processing(NA, summary, f %in% multiples, ...)
        }, simplify=FALSE)
      )
    }
  }
  
  y.num <- binarize(as.data.frame(y))
  x.cov <- stats::cov(numdata)
  
  extra <- list(
    y.num = y.num,
    cancor = tryCatch(
      stats::cancor(numdata, y.num),
      error=function(e) {
        warning(e)
        list(cor=c())
      }),
    x.cov = x.cov,
    eigenvalues = base::eigen(x.cov)
  )

  if(by.class) {
    exclude <- ls.statistical.exclude.byclass()
    measures <- sapply(levels(y), function(class) {
      new.data <- numdata[y==class, , drop=FALSE]
      new.x <- x[y==class, , drop=FALSE]
      nex.xcov <- stats::cov(new.data)
      new.extra <- list(
        x.cov = nex.xcov,
        eigenvalues = base::eigen(nex.xcov)
      )
      
      #new.data <- new.data[, apply(new.data, 2, stats::sd) != 0, drop=FALSE]
      aux <- sapply(features, function(f) {
        fn <- paste("m", f, sep=".")
        if (f %in% exclude) {
          do.call(fn, c(list(x=numdata, y=y, xorig=x, extra=extra), list(...)))
        } else {
          do.call(fn, c(list(x=new.data, xorig=new.x, extra=new.extra), list(...)))
        }
      }, simplify=FALSE)
      
      aux
    }, simplify=FALSE)
    
    sapply(features, function(f) {
      if (f %in% exclude) {
        values <- measures[[1]][[f]]
      } else {
        values <- lapply(measures, function(values) values[[f]])
      }
      
      post.processing(unlist(values), summary, 
                      !f %in% setdiff(exclude, ls.statistical.multiples()), ...)
    }, simplify=FALSE)
  } else {
    sapply(features, function(f) {
      fn <- paste("m", f, sep=".")
      measure <- do.call(fn, c(list(x=numdata, y=y, xorig=x, extra=extra), list(...)))
      post.processing(measure, summary, f %in% ls.statistical.multiples(), ...)
    }, simplify=FALSE)
  }
}

#' @rdname statistical
#' @export
statistical.formula <- function(formula, data, features="all",
                                   summary=c("mean", "sd"), by.class=FALSE,
                                   transform=TRUE, ...) {
  if(!inherits(formula, "formula")) {
    stop("method is only for formula datas")
  }

  if(!is.data.frame(data)) {
    stop("data argument must be a data.frame")
  }

  modFrame <- stats::model.frame(formula, data)
  attr(modFrame, "terms") <- NULL

  statistical.default(modFrame[-1], modFrame[1], features, summary, 
    by.class, transform, ...)
}

#' List the statistical meta-features
#'
#' @return A list of statistical meta-features names
#' @export
#'
#' @examples
#' ls.statistical()
ls.statistical <- function() {
  c("canCor", "gravity", "cor", "cov", "nrDisc", "eigenvalues", "gMean", 
    "hMean", "iqRange", "kurtosis", "mad", "max", "mean", "median", "min", 
    "nrCorAttr", "nrNorm", "nrOutliers", "range", "sd", "sdRatio", "skewness", 
    "sparsity", "tMean", "var", "wLambda")
}

ls.statistical.multiples <- function() {
  c("canCor", "cor", "cov", "eigenvalues", "gMean", "hMean", "iqRange", 
    "kurtosis", "mad", "max", "mean", "median", "min", "range", "sd", 
    "skewness", "sparsity", "tMean", "var")
}

ls.statistical.exclude.byclass <- function() {
  c("canCor", "gravity", "nrDisc", "sdRatio", "wLambda")
}

m.canCor <- function(x, y, extra, ...) {
  if (length(extra$cancor$cor) == 0) return(NA)
  else return(extra$cancor$cor)
}

m.gravity <- function(x, y, ...) {
  classes <- table(y)
  minc <- which.min(classes)
  maxc <- which.max(classes[-minc])
  
  centers <- t(sapply(names(c(minc, maxc)), function(class){
    apply(x[y == class, , drop=FALSE], 2, base::mean)
  }))

  c(stats::dist(centers))
}

m.cor <- function(x, ...) {
  args <- list(...)
  method <- ifelse(is.null(args$method), "pearson", args$method)
  aux <- stats::cor(x, method=method)
  abs(aux[upper.tri(aux)])
}

m.cov <- function(x, y, extra, ...) {
  abs(extra$x.cov[upper.tri(extra$x.cov)])
}

m.nrDisc <- function(x, y, extra, ...) {
  length(extra$cancor$cor)
}

m.eigenvalues <- function(x, y, extra, ...) {
  extra$eigenvalues$values
}

m.gMean <- function(x, ...) {
  res1 <- apply(x, 2, prod)^(1/nrow(x))

  x[x < 1] <- NA
  res2 <- apply(x, 2, function(col) {
    exp(base::mean(log(col), ...))
  })

  coalesce(res1, res2)
}

m.hMean <- function(x, ...) {
  apply(x, 2, function(col) length(col) / sum(1/col))
}

m.iqRange <- function(x, ...) {
  apply(x, 2, stats::IQR)
}

m.kurtosis <- function(x, ...) {
  apply(x, 2, e1071::kurtosis)
}

m.mad <- function(x, ...) {
  apply(x, 2, stats::mad)
}

m.max <- function(x, ...) {
  apply(x, 2, base::max)
}

m.mean <- function(x, ...) {
  apply(x, 2, base::mean)
}

m.median <- function(x, ...) {
  apply(x, 2, stats::median)
}

m.min  <- function(x, ...) {
  apply(x, 2, base::min)
}

m.nrCorAttr <- function(x, ...) {
  sum(abs(m.cor(x, ...)) >= 0.5) / (ncol(x) * (ncol(x) - 1) / 2)
}

m.nrNorm <- function(x, ...) {
  sum(unlist(apply(x, 2, function(col) {
    p.value <- NA
    tryCatch(
      p.value <- stats::shapiro.test(col[seq(min(length(col), 5000))])$p.value, 
      error = function(e) e
    )
    p.value
  })) > 0.1, na.rm = TRUE)
}

m.nrOutliers <- function(x, ...) {
  args <- list(...)
  na.rm <- ifelse(is.null(args$na.rm), FALSE, args$na.rm)
  sum(apply(x, 2, function(x) {
    qs <- stats::quantile(x, na.rm=na.rm)
    iqr <- (qs[4] - qs[2]) * 1.5
    (qs[2] - iqr) > qs[1] | (qs[4] + iqr) < qs[5] 
  }))
}

m.range <- function(x, ...) {
  res <- apply(x, 2, base::range)
  res[2,] - res[1,]
}

m.sd <- function(x, ...) {
  args <- list(...)
  na.rm <- ifelse(is.null(args$na.rm), FALSE, args$na.rm)
  apply(x, 2, stats::sd, na.rm=na.rm)
}

m.sdRatio <- function(x, y, extra, ...) {
  p <- ncol(x)
  q <- nlevels(y)
  n <- length(y)
  ni <- table(y) - 1
  
  Si <- lapply(levels(y), function(class) stats::cov(x[y == class,, drop=FALSE]))
  S <- Reduce('+', mapply(function(Si, ni) ni*Si, S=Si, n=ni, SIMPLIFY=FALSE)) /
    (n - q)
  
  tryCatch({
    M <- (1 - ((2*p^2+3*p-1)/(6*(p+1)*(q-1))) * (sum(1/ni)-1/(n-q))) *
      ((n - q) * log(det(S)) - sum(ni * log(sapply(Si, det))))
    
    ifelse(is.na(M) | is.infinite(M), NA, exp(M / (p * sum(ni - 1))))
  }, warning = function(e) {
    NA
  })
}

m.skewness <- function(x, ...) {
  apply(x, 2, e1071::skewness)
}

m.sparsity <- function(xorig, ...) {
  (apply(xorig, 2, function(col) base::mean(table(col))) - 1) / (nrow(xorig) - 1)
}

m.tMean <- function(x, ...) {
  apply(x, 2, base::mean, trim=0.2)
}

m.var <- function(x, ...) {
  apply(x, 2, stats::var)
}

m.wLambda <- function(x, y, ...) {
  if (ncol(x) > 1) {
    if (any(table(y) < 2)) {
        warning("WLambda: Removing classes with less than 2 instances.")
        idx <- as.character(y) %in% names(which(table(y) < 2))
        x <- x[!idx,]
        y <- droplevels(y[!idx])
    }
    as.numeric(rrcov::Wilks.test(x, grouping=y)$statistic)
  } else {
    return(NA)
  }
}
rivolli/mfe documentation built on March 29, 2022, 11:08 p.m.