R/metrics.R

Defines functions filter.entropy filter.variance filter.auc filter visualize.roc.curve roc.curve auc confusion.matrix.FPR confusion.matrix.TPR confusion.matrix visualize.correlation.matrix correlation df.correlation range.spearman two.var.correlation visualize.infotheo.matrix mutual.information joint.entropy conditional.entropy df.conditional.entropy df.joint.entropy df.mutual.information two.var.mutual.information two.var.conditional.entropy two.var.joint.entropy entropy col.entropy variance col.variance

Documented in auc conditional.entropy correlation entropy filter joint.entropy mutual.information roc.curve variance visualize.correlation.matrix visualize.infotheo.matrix visualize.roc.curve

col.variance <- function(x){
  x.mean <- mean(x)
  sum((x-x.mean)**2) / (length(x)-1)
}



#' Variance
#' 
#' @param x a numeric vector, matrix, or data.frame.
#' @param margin In case of matrix or dataframe apply variance by columns margin = 2L or by rows margin = 1L. Default 2.
#' 
#' 
#' 
#' @return
#' the sample variance of the input.
#' 
#' @examples
#' example1 <- data.frame("V1" = rep(1, 100), "V2" = 100:1,  "V4" = sample.int(3000,100,replace=FALSE))
#' example2 <- rep(1, 100)
#' 
#' variance(example1)
#' variance(example1, margin = 1L)
#' variance(example2)


variance <- function(x, margin = 2L){
  if (is.data.frame(x) | is.matrix(x)){
      apply(x, margin,col.variance)
  }else if (is.vector(x) && is.atomic(x)) {
    col.variance(x)
  }else{
    stop("Not supported data type")
  }
}





col.entropy <- function(x) {
  x.p <- table(x) / length(x)
  x.entropy <- -sum(x.p * log2(x.p))
  return(x.entropy)
}
#' Entropy
#' 
#' @param x a numeric vector, matrix, or data.frame.
#' @param margin In case of matrix or dataframe apply entropy by columns margin = 2L or by rows margin = 1L. Default 2.
#' 
#' 
#' 
#' @return
#' the sample entropy of the input.
#' 
#' @examples
#' col1 <-  c('False', 'False', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False')
#' col2 <- c('True', 'False', 'False', 'False', 'True', 'True', 'False', 'False', 'False', 'False')
#' col3 <- c('a', 'b', 'c', 'a', 'a', 'a', 'l', 'a', 'l', 'a')
#' df <- data.frame("V1" = col1, "V2" = col2, "V3" = col3)
#' entropy(df)


entropy <- function(x, margin = 2L){
  if (is.data.frame(x) | is.matrix(x)){
      apply(x, margin,col.entropy)
  }else if (is.vector(x) && is.atomic(x)) {
    col.entropy(x)
  }else{
    stop("Not supported data type")
  }

}


two.var.joint.entropy <- function(x, y){
  x.vals <- names(table(x))
  y.vals <- names(table(y))
  t <- cbind(x, y)
  res <- 0
  for(x.i in x.vals){
    for(y.i in y.vals){
      p_xi_yi <- length(t[t[,1] == x.i & t[,2] == y.i,] ) / 2 / nrow(t)
      if(!is.null(p_xi_yi) && length(p_xi_yi) > 0 && p_xi_yi > 0){
        res <- res - p_xi_yi * log2(p_xi_yi)
      }
      
    }
  }
  return(res) 
  
}


two.var.conditional.entropy <- function(x, y) {
  two.var.joint.entropy(x, y) - col.entropy(y)
}

two.var.mutual.information <- function(x, y) {
  col.entropy(x) - two.var.conditional.entropy(x, y)
}


df.mutual.information <- function(df, file) {
  ncols <- ncol(df)
  coln <- colnames(df)
  mi.df <- data.frame(matrix(ncol=ncols,nrow=ncols, dimnames=list(coln, coln)))
  for (i in 1:ncols){
    for (j in 1:i){
        x <- df[, i]
        y <- df[, j]
        c <- two.var.mutual.information(as.vector(x), as.vector(y))
        mi.df[i, j] <- c
        mi.df[j, i] <- c
    }
  }
  if (file != "None"){
    visualize.infotheo.matrix(mi.df, type = "Mutual Information", file = file)
  }
  mi.df
}


df.joint.entropy <- function(df, file) {
  ncols <- ncol(df)
  coln <- colnames(df)
  mi.df <- data.frame(matrix(ncol=ncols,nrow=ncols, dimnames=list(coln, coln)))
  for (i in 1:ncols){
    for (j in 1:ncols){
        x <- df[, i]
        y <- df[, j]
        c <- two.var.joint.entropy(as.vector(x), as.vector(y))
        mi.df[i, j] <- c
    }
  }
  if (file != "None"){
    visualize.infotheo.matrix(mi.df, type = "Joint Entropy", file = file)
  }
  mi.df
}

df.conditional.entropy <- function(df, file) {
  ncols <- ncol(df)
  coln <- colnames(df)
  mi.df <- data.frame(matrix(ncol=ncols,nrow=ncols, dimnames=list(coln, coln)))
  for (i in 1:ncols){
    for (j in 1:ncols){
        x <- df[, i]
        y <- df[, j]
        c <- two.var.conditional.entropy(as.vector(x), as.vector(y))
        mi.df[i, j] <- c
    }
  }
  if (file != "None"){
    visualize.infotheo.matrix(mi.df, type = "Conditional Entropy", file = file)
  }
  mi.df
}

#' Conditional Entropy
#' 
#' @param x a vector, matrix, or data.frame.
#' @param y a vector. y must be NULL if x is a matrix or a data.frame
#' @param file In case of x been a data.frame or matrix, represent the name of output file for saving conditional entropy plot
#' 
#' 
#' @return
#' In case of x been a data.frame or matrix, conditional entropy of all variables/columns in x. Otherwise, conditional entropy of x and y
#' 
#' @examples
#' df <- data.frame("V1" = sample.int(2,5,replace=TRUE), "V2" =sample.int(2,5,replace=TRUE) , "V3" = sample.int(2,5,replace=TRUE), "V4" = sample.int(2,5,replace=TRUE),  "V5" = sample.int(2,5,replace=TRUE), "V6" = sample.int(2,5,replace=TRUE), "V7" = sample.int(2,5,replace=TRUE), "V8" = sample.int(2,5,replace=TRUE))
#' x <- sample.int(2, 5,replace=TRUE)
#' y <- sample.int(2,5,replace=TRUE)
#' conditional.entropy(df, file ="conditionalEntropy")
#' conditional.entropy(x = x, y = y)
#' 


conditional.entropy <- function(x, y = NULL, file = "None"){
  if (is.data.frame(x) | is.matrix(x)){
    df.conditional.entropy(x, file)
  }else if (is.vector(x) && is.atomic(x) && !is.null(y)) {
    two.var.conditional.entropy(x, y)
  }else{
    stop("Not supported data type")
  }

}

#' Joint Entropy
#' 
#' @param x a vector, matrix, or data.frame.
#' @param y a vector. y must be NULL if x is a matrix or a data.frame.
#' @param file In case of x been a data.frame or matrix, represent the name of output file for saving joint entropy plot.
#' 
#' 
#' @return
#' In case of x been a data.frame or matrix, joint entropy of all variables/columns in x. Otherwise, joint entropy of x and y.
#' 
#' @examples
#' df <- data.frame("V1" = sample.int(2,5,replace=TRUE), "V2" =sample.int(2,5,replace=TRUE) , "V3" = sample.int(2,5,replace=TRUE), "V4" = sample.int(2,5,replace=TRUE),  "V5" = sample.int(2,5,replace=TRUE), "V6" = sample.int(2,5,replace=TRUE), "V7" = sample.int(2,5,replace=TRUE), "V8" = sample.int(2,5,replace=TRUE))
#' x <- sample.int(2, 5,replace=TRUE)
#' y <- sample.int(2, 5,replace=TRUE)
#' joint.entropy(df, file ="jointEntropy")
#' joint.entropy(x = x, y = y)
#' 
joint.entropy <- function(x, y = NULL, file = "None"){
  if (is.data.frame(x) | is.matrix(x)){
    df.joint.entropy(x, file)
  }else if (is.vector(x) && is.atomic(x) && !is.null(y)) {
    two.var.joint.entropy(x, y)
  }else{
    stop("Not supported data type")
  }

}

#' Mutual information
#' 
#' @param x a vector, matrix, or data.frame.
#' @param y a vector. y must be NULL if x is a matrix or a data.frame.
#' @param file In case of x been a data.frame or matrix, represent the name of output file for saving mutual information plot.
#' 
#' 
#' @return
#' In case of x been a data.frame or matrix, mutual information of all variables/columns in x. Otherwise, mutual information of x and y.
#' 
#' @examples
#' df <- data.frame("V1" = sample.int(2,5,replace=TRUE), "V2" =sample.int(2,5,replace=TRUE) , "V3" = sample.int(2,5,replace=TRUE), "V4" = sample.int(2,5,replace=TRUE),  "V5" = sample.int(2,5,replace=TRUE), "V6" = sample.int(2,5,replace=TRUE), "V7" = sample.int(2,5,replace=TRUE), "V8" = sample.int(2,5,replace=TRUE))
#' x <- sample.int(2, 5,replace=TRUE)
#' y <- sample.int(2, 5,replace=TRUE)
#' mutual.information(df, file ="mutualInformation")
#' mutual.information(x = x, y = y)

mutual.information <- function(x, y = NULL, file = "None"){
  if (is.data.frame(x) | is.matrix(x)){
    df.mutual.information(x, file)
  }else if (is.vector(x) && is.atomic(x) && !is.null(y)) {
    two.var.mutual.information(x, y)
  }else{
    stop("Not supported data type")
  }

}


#' Heatmap plot of theory information metrics
#' 
#' @param df Conditional Entropy, Joint Entropy or Mutual Information data.frame or matrix.
#' @param type Type of theory information metric. Character
#' @param file  In case you want to save the plot, file represents output file name.
#' @param colors Three colors for the gradient. Default c("#DEB841", "white", "#267278").
#' 
#' 
#' @return
#' Plot theory information metrics heatmap
#' 
#' @examples
#' df <- data.frame("V1" = sample.int(2,5,replace=TRUE), "V2" =sample.int(2,5,replace=TRUE) , "V3" = sample.int(2,5,replace=TRUE), "V4" = sample.int(2,5,replace=TRUE),  "V5" = sample.int(2,5,replace=TRUE), "V6" = sample.int(2,5,replace=TRUE), "V7" = sample.int(2,5,replace=TRUE), "V8" = sample.int(2,5,replace=TRUE))
#' ce <- conditional.entropy(df)
#' je <- joint.entropy(df)
#' mi <- mutual.information(df)
#' 
#' visualize.infotheo.matrix(ce, type = "Conditional Entropy")
#' visualize.infotheo.matrix(ce, type = "Conditional Entropy", file = "conditionalEntropy")
#' visualize.infotheo.matrix(ce, type = "Conditional Entropy", file = "conditionalEntropy", colors = c("Blue", "White", "Green"))
#' 
#' 
#' visualize.infotheo.matrix(je, type = "Joint Entropy")
#' visualize.infotheo.matrix(je, type = "Joint Entropy", file = "jointEntropy")
#' visualize.infotheo.matrix(je, type = "Joint Entropy", file = "jointEntropy", colors = c("Blue", "White", "Green"))
#' 
#' 
#' visualize.infotheo.matrix(je, type = "Mutual Information")
#' visualize.infotheo.matrix(je, type = "Mutual Information", file = "mutualInformation")
#' visualize.infotheo.matrix(je, type = "Mutual Information", file = "mutualInformation", colors = c("Blue", "White", "Green"))


visualize.infotheo.matrix <- function(df, type, file = "None", colors = c("#DEB841", "white", "#267278")) {
  if (!requireNamespace("ggplot2") | !requireNamespace("reshape2")){
    stop("visualize.infotheo.matrix requires the installation of ggplot2 and reshape2 packages")
  }
  df.melt <- reshape2::melt(as.matrix(df))
  gg <- ggplot2::ggplot(df.melt, ggplot2::aes(x = Var2, y = Var1)) + 
  ggplot2::geom_tile(ggplot2::aes(fill = value))+
  ggplot2::geom_text(ggplot2::aes(label = round(value, 2))) +
  ggplot2::scale_fill_gradientn(colors = colors, limits =  c(min(df),max(df))) + 
  ggplot2::labs(title=type) +
  ggplot2::theme_bw() + ggplot2::theme(axis.text.x=ggplot2::element_text(size=9, angle=0, vjust=0.3),
                     axis.text.y=ggplot2::element_text(size=9),
                     axis.title.x=ggplot2::element_blank(),
                     axis.title.y=ggplot2::element_blank(),
                     plot.title=ggplot2::element_text(size=11))
  if(file != "None"){
    path = paste(file, "jpg", sep = ".")
    ggplot2::ggsave(filename=path, plot= gg, height=15, width=15, units=c("cm"), dpi=200)
  }
  gg
}




two.var.correlation <- function(x, y, method) {
  
  if(method == "Pearson"){
    if(all(x == y)){
      return(1)
    }
    x.y.cov <- cov(x, y)
    if (x.y.cov != 0){
      mul.sd.x.y <- sd(x) * sd(y)
      if (mul.sd.x.y != 0){
        return(x.y.cov/mul.sd.x.y)
      }
    } 
    return(0)
  }else if(method == "Spearman"){
    n <- length(x)
    rx <- range.spearman(x)
    ry <- range.spearman(y)
    d2 <-(rx-ry) ^ 2 
    Sd2 <- sum(d2)
    corr <- 1 - (6 * Sd2) / (n * (n^2 - 1)) 
  }else{
    stop("Not supported discretization method")
  }
  corr
}

range.spearman <- function(x) {
  x.sort <- x[order(x)]
  x.values <- 1:length(x)
  x.ranges <- x.values
  x.sort.values <- data.frame("x" = x.sort, "values" = x.values)
  for (i in 1:length(x)){
    x.sort.i <- x.sort[i]
    x_i <- data.frame(x.sort.values[x.sort.values$x == x.sort.i , ])
    x.values[i] <- mean(x_i$values)
    
    
  }
  df.values <- data.frame("x" = x.sort, "v" = x.values)
  res <- c()
  for (elem in x){
    val <- head(df.values[df.values$x == elem, 2], 1)
    res <- c(res, val)
  }
  res
  
  
}



df.correlation <- function(df, method, file) {
  ncols <- ncol(df)
  coln <- colnames(df)
  cor.df <- data.frame(matrix(ncol=ncols,nrow=ncols, dimnames=list(coln, coln)))
  for (i in 1:ncols){
    for (j in 1:i){
      if (i != j){
        x <- df[, i]
        y <- df[, j]
        c <- two.var.correlation(x, y, method)
        cor.df[j, i] <- c
      }
    }
  }
  cor.df <- cor.df[-nrow(cor.df), -1]
  if (file != "None"){
    visualize.correlation.matrix(cor.df, file)
  }
  cor.df
  
}

#' Correlation
#' 
#' @param x a numeric vector, matrix, or data.frame.
#' @param y a numeric vector. y must be NULL if x is a matrix or a data.frame.
#' @param method Pearson or Spearman method for calculate correlation. Default Pearson.
#' @param file In case of x been a data.frame or matrix, represent the name of output file for saving mutual information plot.
#' 
#' 
#' @return
#' In case of x been a data.frame or matrix, correlation of all variables/columns in x. Otherwise, correlation of x and y.
#' 
#' @examples
#' df <- data.frame("V1" = sample.int(3000,100,replace=FALSE), "V2" =sample.int(3000,100,replace=FALSE) , "V3" = sample.int(3000,100,replace=FALSE), "V4" = sample.int(3000,100,replace=FALSE),  "V5" = sample.int(3000,100,replace=FALSE), "V6" = sample.int(3000,100,replace=FALSE), "V7" = sample.int(3000,100,replace=FALSE), "V8" = sample.int(3000,100,replace=FALSE))
#' x <- c(35,23,47, 17, 10, 43, 9, 6, 28)
#' y <- c(30, 33, 45, 23, 8, 49, 12, 4, 31)
#' 
#' correlation(x = x, y = y, method = "Pearson")
#' correlation(x = x, y = y, method = "Spearman")
#' correlation(x = df, method = "Pearson")
#' correlation(x = df, method = "Spearman")
#' correlation(x = df, method = "Pearson", file = "pearsonCor")
#' correlation(x = df, method = "Spearman", file = "spearmanCor")

correlation <- function(x, y = NULL, method = "Pearson", file = "None"){
 if (is.data.frame(x) | is.matrix(x)){
    df.correlation(x, method, file)
  }else if (is.vector(x) && is.atomic(x) && !is.null(y)) {
    two.var.correlation(x, y, method)
  }else{
    stop("Not supported data type")
  }
}

#' Plot correlation matrix
#' 
#' @param cor.df a data.frame representing correlation matrix
#' @param file In case of x been a data.frame or matrix, represent the name of output file for saving mutual information plot.
#' @param colors gradiente colors
#' 
#' @return
#' Plot correlation matrix
#' 
#' @examples
#' df <- data.frame("V1" = sample.int(3000,100,replace=FALSE), "V2" =sample.int(3000,100,replace=FALSE) , "V3" = sample.int(3000,100,replace=FALSE), "V4" = sample.int(3000,100,replace=FALSE),  "V5" = sample.int(3000,100,replace=FALSE), "V6" = sample.int(3000,100,replace=FALSE), "V7" = sample.int(3000,100,replace=FALSE), "V8" = sample.int(3000,100,replace=FALSE))
#' pearson.cor <- correlation(x = df, method = "Pearson")
#' visualize.correlation.matrix(cor.df = pearson.cor)
#' visualize.correlation.matrix(cor.df = pearson.cor, file = "pearsonCor")
#' visualize.correlation.matrix(cor.df = pearson.cor, file = "pearsonCor", colors = c("Blue", "White", "Green"))

visualize.correlation.matrix <- function(cor.df, file = "None", colors = c("#DEB841", "white", "#267278")) {
  if (!requireNamespace("ggplot2") | !requireNamespace("reshape2")){
    stop("visualize.correlation.matrix requires the installation of ggplot2 and reshape2 packages")
  }
  df <- reshape2::melt(as.matrix(cor.df))
  gg <- ggplot2::ggplot(df, ggplot2::aes(x = Var2, y = Var1)) + 
  ggplot2::geom_tile(ggplot2::aes(fill = value))+
  ggplot2::geom_text(ggplot2::aes(label = ifelse(is.na(value), "", round(value, 2)))) +
  ggplot2::scale_fill_gradientn(colors = colors, limits =  c(-1,1), na.value = "white", breaks = c(-1,0,1),  labels = format(c(-1,0,1))) + 
  ggplot2::labs(title="Correlation Matrix")+ 
  ggplot2::theme_bw() + ggplot2::theme(axis.text.x=ggplot2::element_text(size=9, angle=0, vjust=0.3),
                     axis.text.y=ggplot2::element_text(size=9),
                     axis.title.x=ggplot2::element_blank(),
                     axis.title.y=ggplot2::element_blank(),
                     plot.title=ggplot2::element_text(size=11))
  if(file != "None"){
    path = paste(file, "jpg", sep = ".")
    ggplot2::ggsave(filename=path, plot= gg, height=15, width=15, units=c("cm"), dpi=200)
  }
  gg
}




confusion.matrix <- function(actual, predited){
  TP <- 0
  FP <- 0
  TN <- 0
  FN <- 0
  n <- length(actual)
  for (i in 1:n){
    if (actual[i]  && predited[i]){
      TP <- TP + 1
    }else if (actual[i]  && !predited[i]){
      FN <- FN + 1
    }else if (!actual[i] && predited[i] ){
      FP <- FP + 1
    }else{
      TN <- TN + 1
    }
  }
  c.matrix <- matrix(c(TP, FN, FP, TN), nrow = 2, byrow = TRUE)

  rownames(c.matrix) <- c("Real Positive", "Real Negative")
  colnames(c.matrix) <- c("Predicted Positive", "Predicted Negative")
  
  return(c.matrix)
}

confusion.matrix.TPR <- function(m){
  TP <- m[1,1]
  FN <- m[1,2]
  if (TP > 0){
    TPR <- TP / (TP + FN)
  }else{
    TPR <- 0
  }
}

confusion.matrix.FPR <- function(m){
  FP <- m[2,1]
  TN <- m[2,2]
  if (FP > 0){
    FPR <- FP / (FP + TN)
  }else{
    FPR <- 0
  }
}

#' auc
#' 
#' @param x if x is a dataframe represent FPR and TPR values but if it is a vector represent FPR value.
#' @param y if x is a vector, y represent TPR values
#' 
#' @return
#' auc value
#' 
#' @examples
#' random.x <- sample.int(3000,1000,replace=FALSE)
#' class <- c(rep(TRUE, 500), rep(FALSE, 500))
#' pos <- sample.int(1000, 1000)
#' random.y <- class[order(pos)]
#' roc.curve <- roc.curve(random.x, random.y)
#' auc <- auc(roc.curve$FPR , roc.curve$TPR )
auc <- function(x, y = NULL) {
  if (is.data.frame(x) | is.matrix(x)){
      y <- x$TPR
      x <- x$FPR
      sum(-diff(x) * rowMeans(cbind(y[-1], y[-length(y)])))
  }else if (is.vector(x) && is.atomic(x) && !is.null(y)) {
      sum(-diff(x) * rowMeans(cbind(y[-1], y[-length(y)])))
  }else{
    stop("Not supported data type")
  }
}



#' Roc curve
#' 
#' @param x a numerical vector
#' @param y represent logical class TRUE/FALSE.
#' @param file name of output file for saving roc curve plot.
#' @param AUC return auc value AUC = TRUE
#' 
#' 
#' @return
#' Return roc curve values: TPR, FPR or AUC
#' 
#' @examples
#' random.x <- sample.int(3000,1000,replace=FALSE)
#' class <- c(rep(TRUE, 500), rep(FALSE, 500))
#'  pos <- sample.int(1000, 1000)
#'  random.y <- class[order(pos)]
#' 
#' perfect.x <- 1:3000
#' perfect.y <- c(rep(TRUE, 1500), rep(FALSE, 1500))
#' 
#' worst.x <- 1:3000
#' worst.y <- c(rep(FALSE, 1500), rep(TRUE, 1500))
#' 
#' roc.curve(random.x, random.y, file= "randomRocCurve")
#' roc.curve(perfect.x, perfect.y, file= "perfectRocCurve")
#' roc.curve(worst.x, worst.y, file= "worstRocCurve") 
roc.curve <- function(x, y, file = "None", AUC = FALSE){
  
  x.order <- x[order(x)]
  x.length <- length(x)
  
  actual.labels <- y[order(x)]
  roc.TPR <- c()
  roc.FPR <- c()
  
  for (i in 0:x.length){
    n.true <- x.length - i
    n.false <- x.length - n.true
    predicted.labels <- c(rep(TRUE, n.true), rep(FALSE, n.false))
    cMatrix <-  confusion.matrix(actual.labels, predicted.labels)
    
    TPR <- confusion.matrix.TPR(cMatrix)
    
    FPR <- confusion.matrix.FPR(cMatrix)
    
    roc.FPR <- append(roc.FPR, FPR)
    roc.TPR <- append(roc.TPR, TPR)
    
    
  }

  auc.val <- auc(roc.FPR, roc.TPR)

  if (file != "None"){
    visualize.roc.curve(roc.FPR, roc.TPR, auc.val, file)
  }
  if (AUC){
    auc.val
  }else{
    data.frame("FPR" = roc.FPR, "TPR" = roc.TPR)
  }
    
  
}


#' Plot Roc curve
#' 
#' @param roc.FPR a numerical vector representing false positive rates
#' @param roc.TPR a numerical vector representing true positive rates
#' @param auc auc value. Use auc() function.
#' @param file  name of output file for saving roc curve plot.
#' 
#' @return
#' Plot roc curve
#' 
#' @examples
#' random.x <- sample.int(3000,1000,replace=FALSE)
#' class <- c(rep(TRUE, 500), rep(FALSE, 500))
#' pos <- sample.int(1000, 1000)
#' random.y <- class[order(pos)]
#' roc.curve <- roc.curve(random.x, random.y)
#' auc <- auc(roc.curve$FPR , roc.curve$TPR )
#' visualize.roc.curve(roc.curve$FPR , roc.curve$TPR, auc)
#' visualize.roc.curve(roc.curve$FPR , roc.curve$TPR, auc, file = "randomRocCurve")
visualize.roc.curve <- function(roc.FPR, roc.TPR, auc, file = "None"){
  if (!requireNamespace("ggplot2")){
    stop("visualize.roc.curve requires the installation of ggplot2 and reshape2 packages")
  }
  data <- data.frame(FPR = rev(roc.FPR), TPR = rev(roc.TPR))
  auc.val <- as.character(round(auc, 3))
  
  gg <- ggplot2::ggplot(data, ggplot2::aes(x = FPR, y = TPR), limit = c(1,1)) + ggplot2::xlab("FPR (1-specificity)") + ggplot2::ylab("TPR (sensitivity)")
  gg <- gg + ggplot2::geom_area(position = "identity", fill="#69b3a2", alpha=0.4) 
  gg <- gg + ggplot2::geom_line(color="#9D9D9D", size=2)
  gg <- gg + ggplot2::geom_point(size=2, color="#267278")
  gg <- gg + ggplot2::geom_abline(size=1, color="#ec7063", linetype = "dashed")
  gg <- gg + ggplot2::annotate("text",  x = 0.8, y = 0.1, label = paste("AUC: ", auc.val))
  gg <- gg + ggplot2::annotate("rect", xmin = 0.65, xmax = 0.95, ymin = 0.05, ymax = 0.15, alpha = .3, fill="#424949" )
  gg <- gg + ggplot2::annotate(geom = "text", x = 0.5, y = 0.45, label = "RANDOM CLASSIFIER", color = "#ec7063", angle = 45)
  gg <- gg + ggplot2::ggtitle("ROC Curve") + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))
  
  if(file != "None"){
    path = paste(file, "jpg", sep = ".")
    ggplot2::ggsave(filename=path, plot= gg, height=15, width=15, units=c("cm"), dpi=200)
  }

  gg

}

#' Metric filter
#' 
#' @param df data.frame. Each colum represent a variable
#' @param class name of the class variable
#' @param by metric to apply the filter. AUC, Variance and Entropy are available.
#' @param uplimit metric uplimit value
#' @param lowlimit  metric uplimit value. Default
#' 
#'  
#' 
#' @return
#' Filtered data.frame
#' 
#' @examples
#' class <- c(rep(TRUE, 50), rep(FALSE, 50))
#' pos <- sample.int(100, 100)
#' random.y <- class[order(pos)]
#' df.numeric <- data.frame("V1" = sample.int(10,100,replace=TRUE), "V2" = sample.int(10,100,replace=TRUE), "V3" = sample.int(100,100,replace=TRUE), "Class" =  random.y)
#' df.cat <- data.frame("V1" = as.factor(sample.int(3,100,replace=TRUE)), "V2" =  as.factor(sample.int(3,100,replace=TRUE)),  "V4" =  as.factor(sample.int(3,100,replace=TRUE)), "Class" =  random.y)
#' filter(df.numeric, class = "Class", by = "AUC", uplimit = 1, lowlimit = 0.5)
#' filter(df.numeric, class = "Class", by = "Variance", uplimit = 100, lowlimit = 0)
#' filter(df.cat, class = "Class", by = "Entropy", uplimit = 2, lowlimit = 0)
#' filter(df.cat, class = "Class", by = "Entropy")
filter <- function(df, class, by = "AUC",  uplimit = .Machine$integer.max, lowlimit = -.Machine$integer.max) {
  y <- df[, c(class)]
  if(!is.logical(y)){
    stop("Class must be logical type")
  }
  vars <- df[, !names(df) %in% c(class)]
  if(by == "AUC"){
    if(FALSE %in% apply(vars, 2, is.numeric)){
      stop("For AUC filter, variables must be numerical")
    }
    df.filter <- filter.auc(df, vars, y, class, uplimit, lowlimit)
  }else if(by == "Variance"){
    if(FALSE %in% apply(vars, 2, is.numeric)){
      stop("For Variance filter, variables must be numerical")
    }
    df.filter <- filter.variance(df, vars, y, class, uplimit, lowlimit)
  }else if(by == "Entropy"){
    if(FALSE %in% apply(vars, 2, is.integer) && FALSE %in% apply(vars, 2, is.character)){
      stop("For Entropy filter, variables must be numerical")
    }
    df.filter <- filter.entropy(df, vars, y, class, uplimit, lowlimit)
  }else{
    stop("Not supported filter method")
  }
  df.filter
  
}

filter.auc <- function(df, vars, y, class, uplimit, lowlimit) {
  
  vars.auc <- apply(vars, 2, roc.curve, y = y, AUC = TRUE)
  print(vars.auc)
  df.filtered <- df[ , c(names(subset(vars.auc, vars.auc >= lowlimit & vars.auc <= uplimit)), class)]
  if (is.logical(df.filtered)){
    df.filtered <- data.frame(class = df.filtered)
  }

  df.filtered
}

filter.variance <- function(df, vars, y, class, uplimit, lowlimit) {
  vars.variance <- variance(vars)
  df.filtered <- df[ , c(names(subset(vars.variance,vars.variance >= lowlimit & vars.variance <= uplimit)), class)]
  if (is.logical(df.filtered)){
    df.filtered <- data.frame(class = df.filtered)
  }

  df.filtered
  
}


filter.entropy <- function(df, vars, y, class, uplimit, lowlimit) {
  vars.entropy <- entropy(vars)
  df.filtered <- df[ , c(names(subset(vars.entropy,vars.entropy >= lowlimit & vars.entropy <= uplimit)), class)]
  if (is.logical(df.filtered)){
    df.filtered <- data.frame(class = df.filtered)
  }

  df.filtered
  
}
AnderEhu/sme documentation built on Jan. 31, 2022, 12:01 a.m.