R/plotting.R
In scmamp: Statistical Comparison of Multiple Algorithms in Multiple Problems

Documented in plotPvalues

# NORMALITY CHECK PLOTS ---------------------------------------------------

#' @title Gaussian distribution quantile-quantile plot
#'
#' @description This function creates a quantile-quantile plot to assess the goodness of fit of a Gaussian distribution to a given sample.
#' @param data List of data points to check
#' @param ... The plot is created using \code{ggplot2}. This special parameter can be used to pass additional parameters to the \code{\link{geom_point}} function used to plot the sample points.
#' @return A \code{\link{ggplot}} object.
#' @seealso \code{\link{plotDensities}}
#' @examples
#' ## Skewed distribution
#' sample <- rbeta(100 , 2 , 50)
#' qqplotGaussian(sample)
#' ## Symmetric distribution
#' sample <- rbeta(100 , 5 , 5)
#' qqplotGaussian(sample)

qqplotGaussian <- function (data, ...) {
  
  processVector <- function (sample) {
    # Auxiliar function to get the points for a single qqplot
    sample <- sort(sample)
    m      <- mean(sample)
    sd     <- sd(sample)
    n      <- length(sample)
    
    q.gaussian <- qnorm(seq(from=(1 / n), to=((n - 1) / n), by=(1/n)), 
                        mean=m, sd=sd)
    q.sample <- sample[-n]
    
    df <- data.frame(Empirical=q.sample, Gaussian=q.gaussian)  
    return (df)
  }
  
  if (is.vector(data)) {
    df <- processVector(data)
    facet <- NULL
  } else {
    aux <- lapply(1:ncol(data), 
                  FUN=function(i) {
                    alg.pts  <- processVector(data[, i])
                    alg.name <- names(data)[i]
                    return(cbind(alg.pts, Algorithm=alg.name))
                  })
    df <- do.call(rbind, aux)
    facet <- facet_wrap(~Algorithm, scales="free")
  }
  
  gplot <- ggplot(df, aes_string(x="Empirical", y="Gaussian")) +
           geom_abline(slope=1, intercept=0, col="darkgray", size=1.1) +  
           geom_point(...) + facet
  
  return(gplot)
}


#' @title Kernel based density estimation of the samples
#'
#' @description This function estimates and plots the densities of the results of each algorithm
#' @param data A matrix where columns represent the algorithms
#' @param ... The plot is created using \code{ggplot2}. This special parameter can be used to pass additional parameters to the \code{\link{geom_line}} function used to plot the sample points. It can also be used to pass additional arguments to the \code{density} function, which is used to eastimate the densities.
#' @return A \code{\link{ggplot}} object.
#' @seealso \code{\link{qqplotGaussian}}
#' @examples
#' data(data_gh_2010)
#' plotDensities(data.gh.2010)
#' 

plotDensities <- function (data, ...) {
  
  if (is.vector(data)){
    d  <- density(data)
    df <- data.frame(Value=d$x, Density=d$y)
    mapping <- aes_string(x="Value", y="Density")
  } else {
    k <- dim(data)[2]
    aux <- lapply(1:k, 
                  FUN=function (x) {
                    d  <- density(data[, x], ...)
                    df <- data.frame(Algorithm=colnames(data)[x], 
                                     Value=d$x, Density=d$y)
                    return(df)
    })
    df <- do.call(rbind, aux)
    mapping <- aes_string(x="Value", y="Density", col="Algorithm")
  }
  gplot <- ggplot(df, mapping) + geom_line(...)
  return(gplot)
}

# PLOTTING THE RESULTS ----------------------------------------------------

#' @title Plotting the p-value matrix
#'
#' @description This function plots the p-value matrix as a tile plot.
#' @param pvalue.matrix Matrix with the p-values to plot
#' @param alg.order A permutation indicating the reordering of the algorithms
#' @param show.pvalue Logical value indicating whether the numerical values have to be printed
#' @param font.size Size of the p-values, if printed
#' @return A \code{\link{ggplot}} object.
#' @seealso \code{\link{drawAlgorithmGraph}}, \code{\link{plotCD}}
#' @examples
#' data(data_gh_2008)
#' pvalues <- friedmanPost(data.gh.2008)
#' ordering <- order(summarizeData(data.gh.2008))
#' plotPvalues(pvalues, alg.order=ordering)
#' 
 
plotPvalues <- function(pvalue.matrix, alg.order=NULL, show.pvalue=TRUE, font.size=5) {

  # Convert the matrix into a data frame and order the algorithms according to 
  # the desired order.
  df <- melt(pvalue.matrix)
  colnames(df) <- c("X", "Y", "p.value")
  if (!is.null(alg.order)) {
    l <- colnames(pvalue.matrix)[alg.order]
    df$X <- factor(df$X, levels=l)
    df$Y <- factor(df$Y, levels=l)
  }
  
  gplot <- ggplot(df, aes_string(x="X", y="Y", fill="p.value")) + geom_tile(col="white") +
          scale_fill_continuous("p-value") + labs(x="Algorithm" , y="Algorithm")
  
  if (show.pvalue) {
    p.value <- df$p.value
    gplot <- gplot + geom_text(aes_string(label="round(p.value, 2)"), 
                               size=font.size, col="white")
  }
  return(gplot)
} 


#' @title Critical difference plot
#'
#' @description This function plots the critical difference plots shown in Demsar (2006)
#' @param results.matrix Matrix or data frame with the results for each algorithm
#' @param alpha Significance level to get the critical difference. By default this value is 0.05
#' @param cex Numeric value to control the size of the font. By default it is set at 0.75.
#' @param ... Additional arguments for \code{\link{rankMatrix}}
#' @seealso \code{\link{drawAlgorithmGraph}},   \code{\link{plotRanking}}, \code{\link{plotPvalues}}
#' @references Demsar, J. (2006) Statistical Comparisons of Classifiers over Multiple Data Sets. \emph{Journal of Machine Learning Research}, 7, 1-30.
#' @examples
#' data(data_gh_2008)
#' plotCD(data.gh.2008, alpha=0.01)
#' 
plotCD <- function (results.matrix, alpha=0.05, cex=0.75, ...) {
  k <- dim(results.matrix)[2]
  N <- dim(results.matrix)[1]
  cd <- getNemenyiCD(alpha=alpha, num.alg=k, num.problems=N)
  
  mean.rank <- sort(colMeans(rankMatrix(results.matrix, ...)))
  
  # Separate the algorithms in left and right parts
  lp <- round(k/2)
  left.algs <- mean.rank[1:lp]
  right.algs <- mean.rank[(lp+1):k]  
  max.rows <- ceiling(k/2)
  
  # Basic dimensions and definitions
  char.size    <- 0.001  # Character size
  line.spacing <- 0.25   # Line spacing for the algorithm name
  m            <- floor(min(mean.rank))
  M            <- ceiling(max(mean.rank))
  max.char     <- max(sapply(colnames(results.matrix), FUN = nchar))  # Longest length of a label
  text.width   <- (max.char + 4) * char.size
  w            <- (M-m) + 2 * text.width
  h.up         <- 2.5 * line.spacing  # The upper part is fixed. Extra space is for the CD
  h.down       <- (max.rows + 2.25) * line.spacing # The lower part depends on the no. of algorithms. 
  # The 2 extra spaces are for the lines that join algorithms
  tick.h       <- 0.25 * line.spacing
  
  label.displacement <- 0.1    # Displacement of the label with respect to the axis
  line.displacement  <- 0.025  # Displacement for the lines that join algorithms
  
  # Background of the plot
  plot(0, 0, type="n", xlim=c(m - w / (M - m), M + w / (M - m)), 
       ylim=c(-h.down, h.up), xaxt="n", yaxt="n", xlab= "", ylab="", bty="n")
  
  # Draw the axis
  lines (c(m,M), c(0,0))
  dk <- sapply(m:M, 
               FUN=function(x) {
                 lines(c(x,x), c(0, tick.h))
                 text(x, 3*tick.h, labels=x, cex=cex)
               })
  
  # Draw the critical difference
  lines(c(m, m + cd), c(1.75 * line.spacing, 1.75 * line.spacing))
  text(m + cd / 2, 2.25 * line.spacing, "CD", cex=cex)
  lines(c(m, m), c(1.75 * line.spacing - tick.h / 4, 
                   1.75 * line.spacing + tick.h / 4))
  lines(c(m + cd, m + cd), c(1.75 * line.spacing - tick.h / 4, 
                             1.75 * line.spacing + tick.h / 4))
  
  # Left part, labels
  dk <- sapply (1:length(left.algs), 
                FUN=function(x) {
                  line.h <- -line.spacing * (x + 2)
                  text(x=m - label.displacement, y=line.h, 
                       labels=names(left.algs)[x], cex=cex, adj=1)
                  lines(c(m, left.algs[x]), c(line.h, line.h))
                  lines(c(left.algs[x], left.algs[x]), c(line.h, 0))
                })
  
  # Right part, labels
  dk <- sapply (1:length(right.algs), 
                FUN=function(x) {
                  line.h <- -line.spacing * (x + 2)
                  text(x=M + label.displacement, y=line.h, 
                       labels=names(right.algs)[x], cex=cex, adj=0)
                  lines(c(M, right.algs[x]), c(line.h, line.h))
                  lines(c(right.algs[x], right.algs[x]), c(line.h, 0))
                })
  
  # Draw the lines to join algorithms
  getInterval <- function (x) {
    from <- mean.rank[x]
    diff <- mean.rank - from
    ls <- which(diff > 0 & diff < cd)
    if (length(ls) > 0) {
      c(from, mean.rank[max(ls)])
    }
  }
  
  intervals <- mapply (1:k, FUN=getInterval)
  aux <- do.call(rbind, intervals)
  
  # With this strategy, there can be intervals included into bigger ones
  # We remove them in a sequential way
  to.join <- aux[1,]
  if(nrow(aux) > 1) {  
    for (r in 2:nrow(aux)) {
      if (aux[r - 1, 2] < aux[r, 2]) {
        to.join <- rbind(to.join, aux[r, ])
      }
    }
  }
  
  row <- c(1)
  # Determine each line in which row will be displayed
  if (!is.matrix(to.join)) {  # To avoid treating vector separately
    to.join <- t(as.matrix(to.join))
  }
  nlines <- dim(to.join)[1]
  
  for(r in 1:nlines) {
    id <- which(to.join[r, 1] > to.join[, 2])
    if(length(id) == 0) {
      row <- c(row, tail(row, 1) + 1)
    } else {
      row <- c(row, min(row[id]))
    }
  }
  
  step <- max(row) / 2
  
  # Draw the line
  dk <- sapply (1:nlines, 
                FUN = function(x) {
                  y <- -line.spacing * (0.5 + row[x] / step)
                  lines(c(to.join[x, 1] - line.displacement, 
                          to.join[x, 2] + line.displacement), 
                        c(y, y), lwd=3)
                })
}



#' @title Ranking Plots
#'
#' @description This function creates a plot similar to the critical difference plot, but applicable to any corrected pvalue.
#' @param pvalues Matrix or data frame with the p-values used to determine the differences
#' @param summary Summary values used to place the algorithms. Typically it will be the average ranking, but it can be any other value 
#' @param alpha Significance level to determine the significativity of the differences. By default this value is 0.05
#' @param cex Numeric value to control the size of the font. By default it is set at 0.75.
#' @param decreasing A logical value to determine whether the values have to be plotter from smaller to larger or the other way round.
#' @seealso \code{\link{drawAlgorithmGraph}}, \code{\link{plotCD}}, \code{\link{plotPvalues}}
#' @examples
#' data(data_gh_2008)
#' test <- postHocTest(data.gh.2008, test="friedman", correct="bergmann", use.rank=TRUE)
#' plotRanking(pvalues=test$corrected.pval, summary=test$summary, alpha=0.05)
#' 
plotRanking <- function (pvalues, summary, alpha=0.05, cex=0.75, decreasing=FALSE) {
  k <- length(summary)

  if (is.matrix(summary)) {
    if (ncol(summary) == 1) {
      summary <- summary[, 1]    
    } else {
      summary <- summary[1, ]
    }
  }
  
  # Check the names
  if (!all(sort(colnames(pvalues)) %in% sort(names(summary))) |
      !all(sort(names(summary)) %in% sort(colnames(pvalues)))) {
    stop("The column names of 'pvalues' and the names of 'summary' have to contain the same names")
  }
  
  # Reorder the pvalues and mean ranks
  o <- order(summary, decreasing=decreasing)
  summary <- summary[o]
  pvalues <- pvalues[names(summary),names(summary)]
  
  
  # Separate the algorithms in left and right parts
  lp <- round(k/2)
  left.algs <- summary[1:lp]
  right.algs <- summary[(lp+1):k]  
  max.rows <- ceiling(k/2)
  
  # Basic dimensions and definitions
  char.size    <- 0.001  # Character size
  line.spacing <- 0.25   # Line spacing for the algorithm name
  m            <- floor(min(summary))
  M            <- ceiling(max(summary))
  max.char     <- max(sapply(colnames(pvalues), FUN = nchar))  # Longest length of a label
  text.width   <- (max.char + 4) * char.size
  w            <- (M-m) + 2 * text.width
  h.up         <- 2.5 * line.spacing  # The upper part is fixed. Extra space is for the CD
  h.down       <- (max.rows + 2.25) * line.spacing # The lower part depends on the no. of algorithms. 
  # The 2 extra spaces are for the lines that join algorithms
  tick.h       <- 0.25 * line.spacing
  
  label.displacement <- 0.1    # Displacement of the label with respect to the axis
  line.displacement  <- 0.025  # Displacement for the lines that join algorithms
  
  # Background of the plot
  plot(0, 0, type="n", xlim=c(m - w / (M - m), M + w / (M - m)), 
       ylim=c(-h.down, h.up), xaxt="n", yaxt="n", xlab= "", ylab="", bty="n")
  
  # Draw the axis
  lines (c(m,M), c(0,0))
  dk <- sapply(m:M, 
               FUN=function(x) {
                 lines(c(x,x), c(0, tick.h))
                 text(x, 3*tick.h, labels=x, cex=cex)
               })
  
  
  # Left part, labels
  dk <- sapply (1:length(left.algs), 
                FUN=function(x) {
                  line.h <- -line.spacing * (x + 2)
                  text(x=m - label.displacement, y=line.h, 
                       labels=names(left.algs)[x], cex=cex, adj=1)
                  lines(c(m, left.algs[x]), c(line.h, line.h))
                  lines(c(left.algs[x], left.algs[x]), c(line.h, 0))
                })
  
  # Right part, labels
  dk <- sapply (1:length(right.algs), 
                FUN=function(x) {
                  line.h <- -line.spacing * (x + 2)
                  text(x=M + label.displacement, y=line.h, 
                       labels=names(right.algs)[x], cex=cex, adj=0)
                  lines(c(M, right.algs[x]), c(line.h, line.h))
                  lines(c(right.algs[x], right.algs[x]), c(line.h, 0))
                })
  
  # Draw the lines to join algorithms
  getInterval <- function (x) {
    ls <- which(pvalues[x, ] > alpha)
    if (length(ls) > 0) {
      c(summary[x], summary[max(ls)])
    }
  }
  
  intervals <- mapply (1:k, FUN=getInterval)
  
  # Under some circumstances the function does not return a matrix ...
  if (is.matrix(intervals)) {
    aux <- t(intervals)
  } else {
    aux <- do.call(rbind, intervals)
  }
  
  # With this strategy, there can be intervals included into bigger ones
  # We remove them in a sequential way
  to.join <- aux[1,]
  if(nrow(aux) > 1) {  
    for (r in 2:nrow(aux)) {
      if (aux[r - 1, 2] < aux[r, 2]) {
        to.join <- rbind(to.join, aux[r, ])
      }
    }
  }
  
  row <- c(1)
  # Determine each line in which row will be displayed
  if (!is.matrix(to.join)) {  # To avoid treating vector separately
    to.join <- t(as.matrix(to.join))
  }
  nlines <- dim(to.join)[1]
  
  for(r in 1:nlines) {
    id <- which(to.join[r, 1] > to.join[, 2])
    if(length(id) == 0) {
      row <- c(row, tail(row, 1) + 1)
    } else {
      row <- c(row, min(row[id]))
    }
  }
  
  step <- max(row) / 2
  
  # Draw the line
  dk <- sapply (1:nlines, 
                FUN = function(x) {
                  y <- -line.spacing * (0.5 + row[x] / step)
                  lines(c(to.join[x, 1] - line.displacement, 
                          to.join[x, 2] + line.displacement), 
                        c(y, y), lwd=3)
                })
}



#' @title Hypotheses represented as a graph
#'
#' @description This function can be used to plot a graph where algorithms are nodes and  algorithms that cannot be regarded as different are joined by an edge.
#' @param pvalue.matrix Matrix with the p-values
#' @param mean.value Vector of values to be written together with the name of the algorithm
#' @param ... Additional parameters to the Rgraphviz function. This is mainly to change the layout of the graph
#' @param alpha Significance level to determine which hypotheses are rejected.
#' @param font.size Size of the font for the node labels.
#' @param highlight A character indicating which node has to be highlighted. It can be the one with the maximum value (\code{'max'}), the minimum value (\code{'min'}) or none (\code{'none'}).
#' @param highlight.color Any R valid color for the highlighted node.
#' @param node.color Any R valid color for the non-highlighted nodes.
#' @param font.color Any R valid color for the node labels.
#' @param digits Number of digits to display the value associated to each node
#' @param node.width Numeric value for the width of the node
#' @param node.height Numeric value for the height of the node
#' @seealso \code{\link{plotPvalues}}, \code{\link{plotRanking}}, \code{\link{plotCD}}
#' @examples
#' data(data_blum_2015)
#' data <- filterData(data.blum.2015, condition="Size == 1000", remove.cols=1:2)
#' res <- postHocTest(data, test = "friedman", use.rank=TRUE, correct="bergmann")
#' ## This function requieres the package Rgraphviz
#' # drawAlgorithmGraph(res$corrected.pval, res$summary)
#' 

drawAlgorithmGraph <- function (pvalue.matrix, mean.value, ..., 
                                alpha=0.05, font.size=15, highlight="min", 
                                highlight.color="chartreuse3", node.color="gray30", 
                                font.color="white", digits=2, 
                                node.width=5, node.height=2) {

  # Just in case we have a matrix ...
  if (is.matrix(mean.value)) {
    mean.value <- mean.value[1, ]
  }
  
  if (!all(colnames(pvalue.matrix) %in% names(mean.value))) {
    stop ("The names of the algorithms in the matrix and the mean.value vector ",
          "do not match")
  }
  
  hypothesis.matrix <- pvalue.matrix > alpha
  
  nc <- rep(node.color, length(mean.value))
  if(highlight == "min") {
    nc[which.min(mean.value)] <- highlight.color
  } else if(highlight == "max") {
    nc[which.max(mean.value)] <- highlight.color
  }
    
  hypothesis.matrix[is.na(hypothesis.matrix)]<-FALSE
  adj.matrix <- hypothesis.matrix
  colnames(adj.matrix) <- names(mean.value)
  rownames(adj.matrix) <- names(mean.value)
  
  nl <- paste(names(mean.value), "\\\n", round(mean.value, digits), sep="")
  
  am.graph <- new("graphAM", adjMat=adj.matrix, edgemode="undirected")
  names(nc) <- graph::nodes(am.graph)
  names(nl) <- graph::nodes(am.graph)
  
  nAttrs <- list()
  nAttrs$label      <- nl
  nAttrs$fillcolor  <- nc
  
  attrs <- list(node=list(shape="rectangle", width=node.width, height=node.height,
                            fontcolor=font.color, fontsize=font.size))
  plot(am.graph, ... , nodeAttrs=nAttrs, attrs=attrs)
}