# R/entropy.R In FCBF: Fast Correlation Based Filter for Feature Selection

#### Documented in IGSU

```# Based on implementation  from Rajarshi Guha <rajarshi@presidency.com>
# 13/05/2005
#
# Modified by Tiago Lubiana (22/08/2018)
# Functions to calculate forms of entropy for categorical variables ("factors")

# H(X) - entropy

entropy <- function(x, base = exp(1)) {
if (!is.factor(x)) {
stop("For calculating the entropy, the vector must be a factor")
}
t <- table(x)
probabily_of_t <- t / sum(t)
if (any(t == 0)) {
probabily_of_t <- probabily_of_t[-which(t == 0)]
}
ent <- -1 * sum(probabily_of_t * log(probabily_of_t) / log(base))
if (is.na(ent)) {
ent <- 0
}
ent
}

# H(X,Y) - joint entropy
entropy.joint <- function(x, y, base = exp(1)) {
if (!is.factor(x) || !is.factor(y)) {
stop("For calculating the joint entropy, the vector x & y must be factors")
}
t <- table(x, y)
probabily_of_t <- as.numeric(t / sum(t))
if (any(probabily_of_t == 0)) {
probabily_of_t <- probabily_of_t[-which(probabily_of_t == 0)]
}
ent <- -1 * sum(probabily_of_t * log(probabily_of_t) / log(base))
if (is.na(ent)) {
ent <- 0
}
ent
}

# H(X|Y) = H(X,Y) - H(Y) - conditional entropy
entropy.cond <- function(x, y, base = exp(1)) {
if (!is.factor(x) || !is.factor(y)) {
stop("For calculating the conditional entropy, the vectors x & y must be factors")
}
ent <- entropy.joint(x, y, base) - entropy(y, base)
if (is.na(ent)) {
ent <- 0
}
ent
}

#'  Symmetrical Uncertainty diagnostic

# Formula for symetrical uncertainty as described in  Yu, L. and Liu, H. , 2003.
#' This functions runs symmetrical uncertainty for two features,
#' returning the score
#'
#' @param x A vector containing a categorical feature
#' @param y A vector containing other categorical feature
#' @param base The base used for the logaritmic function. The default is exp(1) (~2.718)
#' @return A numerical value for the Symetrical Uncertainty score
#' @export
#' @examples
#'  data(scDengue)
#'  exprs <- SummarizedExperiment::assay(scDengue, 'logcounts')
#'  discrete_expression <- as.data.frame(discretize_exprs(exprs))
#'  discrete_expression_gene_1 <- discrete_expression\$V1
#'  discrete_expression_gene_2 <- discrete_expression\$V2
#'  SU(discrete_expression_gene_1,discrete_expression_gene_2)

SU <- function(x, y, base = exp(1)) {
if (is.character(x)) {
x <- as.factor(x)
}
y <- as.factor(y)
if (!is.factor(x) || !is.factor(y)) {
stop(
"For calculating the symmetrical uncertainty, the vectors x & y must be factors.
Using a continuous(numeric) feature set leads to this error."
)
}
Ht <- entropy.joint(x, y, base)
Hx <- entropy(x, base)
Hy <- entropy(y, base)
#cat(Ht,' ',Hx,' ',Hy,'\n')

# Returns the symmetrical uncertainty value for the vector pair
2 * (Hy + Hx - Ht) / (Hx + Hy)

}

#'  Information Gain
#' This functions runs Information Gain for two features,
#' returning the score
#'
#' @param x A vector containing a categorical feature
#' @param y A vector containing other categorical feature
#' @param base The base used for the logaritmic function. The default is exp(1) (~2.718)
#' @return A numerical value for the Information Gain score
#' @export
#' @examples
#'   data(scDengue)
#'   exprs <- SummarizedExperiment::assay(scDengue, 'logcounts')
#'   discrete_expression <- as.data.frame(discretize_exprs(exprs))
#'   discrete_expression_gene_1 <- discrete_expression\$V1
#'   discrete_expression_gene_2 <- discrete_expression\$V2
#'   IG(discrete_expression_gene_1,discrete_expression_gene_2)

# Formula for Information Gain
IG <- function(x, y, base = exp(1)) {
if (is.character(x)) {
x <- as.factor(x)
}
if (!is.factor(x) || !is.factor(y)) {
stop(
"For calculating the information gain, the vectors x & y must be factors.
Using a continuous(numeric) feature set leads to this error."
)
}
Ht <- entropy.joint(x, y, base)
Hx <- entropy(x, base)
Hy <- entropy(y, base)
# Returns the information gain for the pair
IG <- (Hy + Hx - Ht)
IG
}
```

## Try the FCBF package in your browser

Any scripts or data that you put into this service are public.

FCBF documentation built on Nov. 8, 2020, 8:30 p.m.