R/smoothness.R

Defines functions r.S4 r.S3 r.S2 r.S1 ls.smoothness.multiples ls.smoothness smoothness.formula smoothness.default smoothness

Documented in smoothness smoothness.default smoothness.formula

#' Measures of smoothness
#'
#' Regression task. In regression problems, the smoother the function to be 
#' fitted to the data, the simpler it shall be. Larger variations in the inputs
#' and/or outputs, on the other hand, usually indicate the existence of more 
#' intricate relationships between them.
#'
#' @family complexity-measures
#' @param x A data.frame contained only the input attributes.
#' @param y A response vector with one value for each row/component of x.
#' @param measures A list of measures names or \code{"all"} to include all them.
#' @param formula A formula to define the output column.
#' @param data A data.frame dataset contained the input and output attributes.
#' @param summary A list of summarization functions or empty for all values. See
#'  \link{summarization} method to more information. (Default: 
#'  \code{c("mean", "sd")})
#' @param ... Not used.
#' @details
#'  The following measures are allowed for this method:
#'  \describe{
#'    \item{"S1"}{Output distribution (S1) monitors whether the examples 
#'      joined in the MST have similar output values. Lower values indicate 
#'      simpler problems, where the outputs of similar examples in the input 
#'      space are also next to each other.}
#'    \item{"S2"}{Input distribution (S2) measure how similar in the input space
#'      are data items with similar outputs based on distance.}
#'    \item{"S3"}{Error of a nearest neighbor regressor (S3) calculates the mean
#'      squared error of a 1-nearest neighbor regressor  using leave-one-out.}
#'    \item{"S4"}{Non-linearity of nearest neighbor regressor (S4) calculates 
#'      the mean squared error of a 1-nearest neighbor regressor to the new 
#'      randomly interpolated points.}
#'  }
#' @return A list named by the requested smoothness measure.
#'
#' @references
#'  Ana C Lorena and Aron I Maciel and Pericles B C Miranda and Ivan G Costa and
#'    Ricardo B C Prudencio. (2018). Data complexity meta-features for 
#'    regression problems. Machine Learning, 107, 1, 209--246.
#'
#' @examples
#' ## Extract all smoothness measures for regression task
#' data(cars)
#' smoothness(speed ~ ., cars)
#' @export
smoothness <- function(...) {
  UseMethod("smoothness")
}

#' @rdname smoothness
#' @export
smoothness.default <- function(x, y, measures="all", summary=c("mean", "sd"), 
                               ...) {

  if(!is.data.frame(x)) {
    stop("data argument must be a data.frame")
  }

  if(is.data.frame(y)) {
    y <- y[, 1]
  }

  if(is.factor(y)) {
    stop("label attribute needs to be numeric")
  }

  if(nrow(x) != length(y)) {
    stop("x and y must have same number of rows")
  }

  if(measures[1] == "all") {
    measures <- ls.smoothness()
  }

  measures <- match.arg(measures, ls.smoothness(), TRUE)

  if (length(summary) == 0) {
    summary <- "return"
  }

  colnames(x) <- make.names(colnames(x), unique=TRUE)
  x <- normalize(x)
  y <- normalize(y)[,1]

  x <- x[order(y), , drop=FALSE]
  y <- y[order(y)]
  d <- dist(x)

  sapply(measures, function(f) {
    measure = eval(call(paste("r", f, sep="."), d=d, x=x, y=y))
    summarization(measure, summary, f %in% ls.smoothness.multiples(), ...)
  }, simplify=FALSE)
}

#' @rdname smoothness
#' @export
smoothness.formula <- function(formula, data, measures="all", 
                               summary=c("mean", "sd"), ...) {

  if(!inherits(formula, "formula")) {
    stop("method is only for formula datas")
  }

  if(!is.data.frame(data)) {
    stop("data argument must be a data.frame")
  }

  modFrame <- stats::model.frame(formula, data)
  attr(modFrame, "terms") <- NULL

  smoothness.default(modFrame[, -1, drop=FALSE], modFrame[, 1, drop=FALSE],
    measures, summary, ...)
}

ls.smoothness <- function() {
  c("S1", "S2", "S3", "S4")
}

ls.smoothness.multiples <- function() {
  ls.smoothness()
}

r.S1 <- function(d, x, y) {

  g <- igraph::graph.adjacency(d, mode="undirected", weighted=TRUE)
  tree <- as.matrix(igraph::as_adj(igraph::mst(g)))
  tmp <- which(tree != 0, arr.ind=TRUE)
  aux <- abs(y[tmp[,1]] - y[tmp[,2]])
  aux/(aux + 1)
}

r.S2 <- function(d, x, y) {

  pred <- sapply(2:nrow(d), function(i) {
    d[i-1, i]
  })

  pred/(pred + 1)
}

r.S3 <- function(d, x, y) {

  diag(d) <- Inf
  pred <- apply(d, 1, function(i) {
    y[which.min(i)]
  })

  aux <- (pred - y)^2
  aux/(aux + 1)
}

r.S4 <- function(d, x, y) {
  test <- r.generate(x, y, nrow(x))
  pred <- FNN::knn.reg(x, test[, -ncol(test), drop=FALSE], y, k=1)$pred
  aux <- (pred - test[, ncol(test)])^2
  aux/(aux + 1)
}
SmartDataAnalytics/ECoL documentation built on Dec. 21, 2020, 9:36 p.m.