R/stabilityIndex.R
In evaluomeR: Evaluation of Bioinformatics Metrics

Documented in stability stabilityRange stabilitySet

#' @title Stability index.
#' @name stability
#' @aliases stability
#' @description
#' This analysis permits to estimate whether the clustering is meaningfully
#' affected by small variations in the sample. First, a clustering using the
#' k-means algorithm is carried out. The value of \code{k} can be provided by the user.
#' Then, the stability index is the mean of the Jaccard coefficient
#' values of a number of \code{bs} bootstrap replicates. The values are in the range [0,1],
#' having the following meaning:
#' \itemize{
#' \item Unstable: [0, 0.60[.
#' \item Doubtful: [0.60, 0.75].
#' \item Stable: ]0.75, 0.85].
#' \item Highly Stable: ]0.85, 1].
#' }
#'
#' @param data A \code{\link{SummarizedExperiment}}.
#' The SummarizedExperiment must contain an assay with the following structure:
#' A valid header with names. The first  column of the header is the ID or name
#' of the instance of the dataset (e.g., ontology, pathway, etc.) on which the
#' metrics are measured.
#' The other columns of the header contains the names of the metrics.
#' The rows contains the measurements of the metrics for each instance in the dataset.
#' @param k Positive integer. Number of clusters between [2,15] range.
#' @param bs Positive integer. Bootstrap value to perform the resampling.
#' @param cbi Clusterboot interface name (default: "kmeans"):
#' "kmeans", "clara", "clara_pam", "hclust", "pamk", "pamk_pam", "pamk".
#' Any CBI appended with '_pam' makes use of \code{\link{pam}}.
#' The method used in 'hclust' CBI is "ward.D2".
#' @param getImages Boolean. If true, a plot is displayed.
#' @param seed Positive integer. A seed for internal bootstrap.
#'
#' @return A \code{\link{ExperimentList}} containing the stability and cluster measurements
#'  for k clusters.
#'
#' @examples
#' # Using example data from our package
#' data("ontMetrics")
#' result <- stability(ontMetrics, k=6, getImages=TRUE)
#'
#' @references
#' \insertRef{milligan1996measuring}{evaluomeR}
#'
#' \insertRef{jaccard1901distribution}{evaluomeR}
#'
#'
stability <- function(data, k=5, bs=100, cbi="kmeans",
                      getImages=TRUE, seed=NULL) {

  data <- as.data.frame(assay(data))

  checkKValue(k)
  runStabilityIndex(data, k.min=k, k.max=k, bs, cbi, seed=seed)
  stabilityDataFrame <- suppressWarnings(
    runStabilityIndexTableRange(data, k.min=k, k.max=k))
  if (getImages == TRUE) {
    suppressWarnings(
      runStabilityIndexK_IMG(bs, k.min = k, k.max = k))
  }
  se <- createSEList(stabilityDataFrame)
  return(se)
}


#' @title Stability index for a range of k clusters.
#' @name stabilityRange
#' @aliases stabilityRange
#' @description
#' This analysis permits to estimate whether the clustering is meaningfully
#' affected by small variations in the sample. For a range of k values (\code{k.range}),
#' a clustering using the k-means algorithm is carried out.
#' Then, the stability index is the mean of the Jaccard coefficient
#' values of a number of \code{bs} bootstrap replicates. The values are in the range [0,1],
#' having the following meaning:
#' \itemize{
#' \item Unstable: [0, 0.60[.
#' \item Doubtful: [0.60, 0.75].
#' \item Stable: ]0.75, 0.85].
#' \item Highly Stable: ]0.85, 1].
#' }
#'
#' @inheritParams stability
#' @param k.range Concatenation of two positive integers.
#' The first value \code{k.range[1]} is considered as the lower bound of the range,
#' whilst the second one, \code{k.range[2]}, as the higher. Both values must be
#' contained in [2,15] range.
#'
#' @return A \code{\link{ExperimentList}} containing the stability and cluster measurements
#'  for 2 to \code{k} clusters.
#'
#' @examples
#' # Using example data from our package
#' data("ontMetrics")
#' result <- stabilityRange(ontMetrics, k.range=c(2,3))
#'
#' @references
#' \insertRef{milligan1996measuring}{evaluomeR}
#'
#' \insertRef{jaccard1901distribution}{evaluomeR}
#'
#'
stabilityRange <- function(data, k.range=c(2,15), bs=100, cbi="kmeans",
                           getImages=TRUE, seed=NULL) {
  k.range.length = length(k.range)
  if (k.range.length != 2) {
    stop("k.range length must be 2")
  }
  k.min = k.range[1]
  k.max = k.range[2]
  checkKValue(k.min)
  checkKValue(k.max)
  if (k.max < k.min) {
    stop("The first value of k.range cannot be greater than its second value")
  }

  data <- as.data.frame(SummarizedExperiment::assay(data))

  runStabilityIndex(data, k.min=k.min, k.max=k.max, bs, cbi, seed=seed)
  stabilityDataFrame <- suppressWarnings(
    runStabilityIndexTableRange(data, k.min=k.min, k.max=k.max))

  if (getImages == TRUE) {
    suppressWarnings(
      runStabilityIndexK_IMG(bs, k.min=k.min, k.max=k.max))
    suppressWarnings(
      runStabilityIndexMetric_IMG(bs, k.min=k.min, k.max=k.max))
  }
  se <- createSEList(stabilityDataFrame)
  return(se)
}

#' @title Stability index for a set of k clusters.
#' @name stabilitySet
#' @aliases stabilitySet
#' @description
#' This analysis permits to estimate whether the clustering is meaningfully
#' affected by small variations in the sample. For a set of k values (\code{k.set}),
#' a clustering using the k-means algorithm is carried out.
#' Then, the stability index is the mean of the Jaccard coefficient
#' values of a number of \code{bs} bootstrap replicates. The values are in the range [0,1],
#' having the following meaning:
#' \itemize{
#' \item Unstable: [0, 0.60[.
#' \item Doubtful: [0.60, 0.75].
#' \item Stable: ]0.75, 0.85].
#' \item Highly Stable: ]0.85, 1].
#' }
#'
#' @inheritParams stability
#' @param k.set A list of integer values of \code{k}, as in c(2,4,8).
#' The values must be contained in [2,15] range.
#'
#' @return A \code{\link{ExperimentList}} containing the stability and cluster measurements
#'  of the list of \code{k} clusters.
#'
#' @examples
#' # Using example data from our package
#' data("rnaMetrics")
#' result <- stabilitySet(rnaMetrics, k.set=c(2,3))
#'
#' @references
#' \insertRef{milligan1996measuring}{evaluomeR}
#'
#' \insertRef{jaccard1901distribution}{evaluomeR}
#'
#'
stabilitySet <- function(data, k.set=c(2,3), bs=100, cbi="kmeans",
                           getImages=TRUE, seed=NULL) {
  k.set.length = length(k.set)
  if (k.set.length == 0) {
    stop("k.set list is empty")
  } else if (k.set.length == 1) {
    stop("k.set list contains only one element. For one K analysis use 'stability' method")
  }
  k.set = sort(k.set)
  for (k in k.set) {
    checkKValue(k)
  }

  data <- as.data.frame(SummarizedExperiment::assay(data))

  runStabilityIndex(data, k.set = k.set, bs=bs, cbi=cbi, seed=seed)
  stabilityDataFrame <- suppressWarnings(
    runStabilityIndexTableRange(data, k.set = k.set))

  if (getImages == TRUE) {
    suppressWarnings(
      runStabilityIndexK_IMG(bs, k.set = k.set))
    suppressWarnings(
      runStabilityIndexMetric_IMG(bs, k.set = k.set))
  }
  se <- createSEList(stabilityDataFrame)
  return(se)
}

runStabilityIndex <- function(data, k.min=NULL, k.max=NULL, bs,
                              cbi, seed, k.set=NULL) {
  if (is.null(seed)) {
    seed = pkg.env$seed
  }
  if (is.null(k.min) && is.null(k.max) && is.null(k.set)) {
    stop("runStabilityIndex: All k parameters are null!")
  }
  inversa=NULL
  m.stab.global = NULL
  m.stab.global.csv = NULL # To store new CSV output measures without altering legacy code
  todo.estable = NULL
  datos.bruto=data
  names.metr=names(datos.bruto)[-c(1)]

  pkg.env$names.metr = names.metr

  bs.values=c(bs)
  contador=0
  i.min=k.min
  i.max=k.max

  k.range = NULL
  k.range.length = NULL
  if (!is.null(k.set)) {
    k.range = k.set
    k.range.length = length(k.set)
  } else {
    k.range = i.min:i.max
    k.range.length = length(i.min:i.max)+1
  }

  num.metrics = length(names.metr)
  for (i.metr in 1:num.metrics) {
    message("Processing metric: ", names.metr[i.metr],"(", i.metr,")")
    m.stab.global[[i.metr]]=matrix(data=NA, nrow=1,
                                   ncol=k.range.length)
    m.stab.global.csv[[i.metr]]=matrix(data=NA, nrow=1,
                                       ncol=k.range.length)

    for (j.k in k.range) {
      message("\tCalculation of k = ", j.k,"")
      estable=NULL
      contador=contador+1
      i=i.metr+1
      estable$n.metric=i.metr
      estable$name.metric=names.metr[i.metr]
      estable$n.k=j.k
      estable$name.ontology=names(datos.bruto[1])

      km5=NULL
      v.size=length(levels(as.factor(datos.bruto[,i])))
      if (v.size>=j.k) {
        #km5$cluster=boot.cluster(data=datos.bruto[,i],
        #                         nk=j.k, B=bs, seed=seed)
        #km5$jac=km5$cluster$means

        clusterbootData = tryCatch({
          clusterbootWrapper(data=datos.bruto[,i], B=bs,
                             bootmethod="boot",
                             cbi=cbi,
                             krange=j.k, seed=seed)
        }, error = function(error_condition) {
          error_condition
        })

        if(inherits(clusterbootData, "error")) {
          message(paste0("\t", clusterbootData))
          message("\tWarning: Could not process data for k = ", j.k)
          km5$bspart=rep(NA,length(datos.bruto[,i]))
          km5$jac=rep(NA,j.k)
          km5$centr=rep(NA,j.k)
          km5$means=km5$bspart
          km5$bspart.or=km5$bspart
          km5$bspart.inv=km5$means
          km5$jac.or=km5$jac
          km5$jac.inv=km5$jac
          km5$partition=km5$bspart.inv
          km5$jac.stab=km5$jac.inv

          km5$csv = NULL
          km5$csv$cluster_partition = NULL
          km5$csv$cluster_mean = NULL
          km5$csv$cluster_centers = NULL
          km5$csv$cluster_size = NULL
          km5$csv$cluster_betweenss = NULL
          km5$csv$cluster_totss = NULL
          km5$csv$cluster_tot.withinss = NULL
          km5$csv$cluster_anova = NULL

          m.stab.global[[i.metr]][j.k] = mean(km5$jac.stab)
          m.stab.global.csv[[i.metr]][j.k] = list(km5)
          estable[[which(bs.values==bs)]] = km5
          next
        }

        km5$cluster = clusterbootData

        km5$jac=km5$cluster$bootmean
        km5$bspart=km5$cluster$partition

        km5$csv = NULL
        km5$csv$cluster_partition = km5$cluster$partition
        km5$csv$cluster_mean = km5$cluster$bootmean
        km5$csv$cluster_centers = km5$cluster$result$result$centers
        km5$csv$cluster_size = km5$cluster$result$result$size
        km5$csv$cluster_betweenss = km5$cluster$result$result$betweenss
        km5$csv$cluster_totss = km5$cluster$result$result$totss
        km5$csv$cluster_tot.withinss = km5$cluster$result$result$tot.withinss
        km5$csv$cluster_anova = fAnova(km5$cluster$result$result, j.k, num.metrics)

        km5$centr=by(datos.bruto[,i],km5$bspart,mean)
        for (km5.i in 1:length(km5$centr)) {
          km5$means[which(km5$bspart==km5.i)]=km5$centr[km5.i]
        }

        km5$bspart.or=ordered(km5$means,labels=seq(1,length(km5$centr)))

        km5$bspart.inv=ordered(km5$means,labels=seq(length(km5$centr),1))

        km5$jac.or=km5$jac[order(km5$centr)]
        km5$jac.inv=km5$jac[order(km5$centr,decreasing=TRUE)]

        if (any(inversa==names.metr[i.metr])) {
          km5$partition=km5$bspart.inv
          km5$jac.stab=km5$jac.inv
        } else {
          km5$partition=km5$bspart.or
          km5$jac.stab=km5$jac.or
        }
        m.stab.global[[i.metr]][j.k] = mean(km5$jac.stab)
        m.stab.global.csv[[i.metr]][j.k] = list(km5)
        estable[[which(bs.values==bs)]] = km5
      } else {
        message("\tWarning: Could not process data for k = ", j.k)
        km5$bspart=rep(NA,length(datos.bruto[,i]))
        km5$jac=rep(NA,j.k)
        km5$centr=rep(NA,j.k)
        km5$means=km5$bspart
        km5$bspart.or=km5$bspart
        km5$bspart.inv=km5$means
        km5$jac.or=km5$jac
        km5$jac.inv=km5$jac
        km5$partition=km5$bspart.inv
        km5$jac.stab=km5$jac.inv

        km5$csv = NULL
        km5$csv$cluster_partition = NULL
        km5$csv$cluster_mean = NULL
        km5$csv$cluster_centers = NULL
        km5$csv$cluster_size = NULL
        km5$csv$cluster_betweenss = NULL
        km5$csv$cluster_totss = NULL
        km5$csv$cluster_tot.withinss = NULL
        km5$csv$cluster_anova = NULL

        m.stab.global[[i.metr]][j.k] = mean(km5$jac.stab)
        m.stab.global.csv[[i.metr]][j.k] = list(km5)
        estable[[which(bs.values==bs)]] = km5
      }
      estable$km5.dynamic = km5$partition
      todo.estable[[contador]]=estable
    }
  }

  e.stab.global=NULL
  for (j.k in k.range) {
    #e.stab.global[[j.k]]=matrix(data=NA, nrow=length(names.metr), ncol=length(i.min:i.max))
    e.stab.global[[j.k]]=matrix(data=NA, nrow=k.range.length, ncol=1)
    for (i.metr in 1:length(names.metr)) {
      e.stab.global[[j.k]][i.metr]=m.stab.global[[i.metr]][j.k]
    }
  }


  pkg.env$m.stab.global.csv = m.stab.global.csv
  pkg.env$m.stab.global = m.stab.global
  pkg.env$e.stab.global = e.stab.global
  pkg.env$names.metr = names.metr
  #return(stabilityDataFrame)
  #return(NULL)
}

runStabilityIndexTableRange <- function(data, k.min=NULL, k.max=NULL, k.set=NULL) {
  if (is.null(k.min) && is.null(k.max) && is.null(k.set)) {
    stop("runStabilityIndexTableRange: All k parameters are null!")
  }
  stabilityDataFrame = NULL

  m.stab.global = pkg.env$m.stab.global
  m.stab.global.csv = pkg.env$m.stab.global.csv
  names.metr = pkg.env$names.metr

  measures = NULL
  # Key = Dataframe name - Value = Header name for each k
  measures["stability_mean"]= c("Mean_stability_k_")
  measures["cluster_partition"]= c("Cluster_partition_k_")
  measures["cluster_mean"]= c("Cluster_mean_k_")
  measures["cluster_centers"]= c("Cluster_centers_k_")
  measures["cluster_size"]= c("Cluster_size_k_")
  measures["cluster_betweenss"]= c("Cluster_betweenss_k_")
  measures["cluster_totss"]= c("Cluster_totss_k_")
  measures["cluster_tot.withinss"]= c("Cluster_tot.withinss_k_")
  measures["cluster_anova"]= c("Cluster_anova_k_")

  k.range = NULL
  k.range.length = NULL
  if (!is.null(k.set)) {
    k.range = k.set
    k.range.length = length(k.set)
  } else {
    k.range = k.min:k.max
    k.range.length = length(k.min:k.max)+1
  }

  for (measure in names(measures)) {
    stabilityDataList = list()
    # Build header
    header <- list("Metric")
    for (k in k.range) {
      nextHeader <- paste(measures[measure], k, sep="")
      header <- c(header, nextHeader)
    }
    header = unlist(header, use.names=FALSE)

    # Build rows
    for (i.metr in 1:length(names.metr)) {
      measure.data.list = list()
      wrapper=NULL # Wrapper object to extract the measure data
      wrapper$metric=names.metr[i.metr]
      if (measure == "stability_mean") {
        jacMean = m.stab.global[[i.metr]]
        cur.value = jacMean[c(k.range)]
        measure.data.list = c(measure.data.list, cur.value)
      } else {
        km5.list = m.stab.global.csv[[i.metr]]
        for (k in k.range) {
          km5.cur = km5.list[[k]]
          cur.value = getMeasureValue(km5.cur, measure)
          measure.data.list = c(measure.data.list, cur.value)
        }
      }
      wrapper[[measure]] = unlist(measure.data.list, use.names = FALSE)
      stabilityDataList[[i.metr]] = unlist(wrapper, use.names=FALSE)
    }  # end for i.metr

    # Transform into dataframe
    # Add df if has data
    if (ncol(t(data.frame(stabilityDataList))) != 1) {
      stabilityDataFrame[[measure]] = t(data.frame(stabilityDataList))
      colnames(stabilityDataFrame[[measure]]) = header
      rownames(stabilityDataFrame[[measure]]) <- NULL
    }
  }

  return(stabilityDataFrame)
}

# Stability index per K value (x values = matrics)
runStabilityIndexK_IMG <- function(bs, k.min = NULL, k.max = NULL, k.set = NULL) {
  if (is.null(k.min) && is.null(k.max) && is.null(k.set)) {
    stop("runStabilityIndexK_IMG: All k parameters are null!")
  }
  ancho=6
  alto=4
  escala=0.6
  escalax=escala
  ajuste=0.5
  escalat=0.5
  escalap=0.4
  contadorFiguras=1
  #Pattern: GlobalStability_K_2, ..., GlobalStability_K_N
  figurename="GlobalStability_K_"

  colores <- c("black", "blue", "red", "darkgreen", "orange", "magenta", "gray")
  ltype <- c(1, 2, 3, 4, 3, 4, 6)
  i.min=k.min
  i.max=k.max

  k.range = NULL
  k.range.length = NULL
  g.main = NULL

  if (!is.null(k.set)) {
    k.range = k.set
    k.range.length = length(k.set)
    setAsStrList = paste(as.character(k.set),collapse=", ",sep="")
    g.main=paste(" St. Indices of the metrics for k in {", setAsStrList, "}",sep="")
  } else {
    k.range = i.min:i.max
    k.range.length = length(k.min:k.max)+1
    g.main=paste(" St. Indices of the metrics for k in [", i.min, ",", i.max, "]",sep="")
  }

  stype <- c(1:k.range.length)

  e.stab.global = pkg.env$e.stab.global
  names.metr = pkg.env$names.metr
  margins <- par(mar=c(5,5,3,3))
  on.exit(par(margins))
  xnames=as.character(names.metr)
  ynames="Global Stability Indices"

  metrics_length = length(names.metr)

  num_metrics_plot = 19
  num_iterations = round(metrics_length/num_metrics_plot)
  if (num_iterations > 0) {
    num_iterations = num_iterations - 1
  }
  for (iteration in 0:num_iterations) {
    i = 1
    labels = list()
    rangeStart = (iteration*num_metrics_plot)+1
    rangeEnd = rangeStart+num_metrics_plot
    if (rangeEnd > metrics_length) {
      rangeEnd = metrics_length
    }
    new_xnames = xnames[rangeStart:rangeEnd]
    for (j.k in k.range) {
      cur.data = e.stab.global[[j.k]]
      cur.data = cur.data[rangeStart:rangeEnd]
      #cur.data = cur.data[!is.na(cur.data)]
      #cur.data = cur.data[c(i.min:i.max)
      plot(cur.data, main=g.main, axes=TRUE, col.axis="white",
           xlim=c(0.75,length(new_xnames)+0.25), xlab="", ylim=c(0,1),
           ylab=ynames, col="black", type="o", lwd=1, lty=stype[i], pch=stype[i])
      labels = c(labels,(paste0("k=", j.k)))
      i = i + 1
      par(new=TRUE)
    }
    axis(1,at=1:length(new_xnames),labels=new_xnames,las=2,cex.axis=0.75)
    axis(2,las=3,cex.axis=0.85)
    legend("bottomright", legend=labels, inset=.01, lwd=1, lty=stype,
           col="black", cex=0.7, pch=stype)
    mtext(side=1, text="Metrics",line=4)
    text(0.75, 0.9, "H.stab", cex=0.6, col = "black")
    abline(h = 0.85, col="black", lwd=1, lty=1) # Highly Stable: (0.85, 1]
    text(0.74, 0.8, "Stab", cex=0.6, col = "black")
    abline(h = 0.75, col="black", lwd=1, lty=1) # Stable: (0.75, 0.85]
    text(0.76, 0.65, "Doubt.", cex=0.6, col = "black")
    abline(h = 0.60, col="black", lwd=1, lty=1) # Doubtful: [0.60, 0.75]
    text(0.76, 0.05, "Unstab.", cex=0.6, col = "black")
    #abline(h = 0, col="black", lwd=1, lty=4) # Unstable: [0, 0.60)
    par(new=FALSE)
  }
}

# Stability index per metric (x values = k range)
runStabilityIndexMetric_IMG <- function(bs, k.min=NULL, k.max=NULL, k.set=NULL) {
  if (is.null(k.min) && is.null(k.max) && is.null(k.set)) {
    stop("runStabilityIndexMetric_IMG: All k parameters are null!")
  }
  ancho=6
  alto=4
  escala=0.9
  escalax=escala
  ajuste=0.5
  escalat=0.5
  escalap=0.4
  contadorFiguras=1
  #Pattern: GlobalStability_MetricX, ..., GlobalStability_MetricN
  figurename="GlobalStability_"

  colores <- c("black", "blue", "red", "darkgreen", "orange", "magenta", "gray")
  ltype <- c(1, 2, 3, 4, 3, 4, 6)

  stype <- c("o", "l")
  pchtype <- c(1, 2, 3, 4, 5, 5)


  k.range = NULL
  k.range.length = NULL
  if (!is.null(k.set)) {
    k.range = k.set
    k.range.length = length(k.set)
  } else {
    k.range = k.min:k.max
    k.range.length = length(k.min:k.max)
  }

  m.stab.global = pkg.env$m.stab.global
  names.metr = pkg.env$names.metr
  margins <- par(mar=c(5,5,3,3))
  on.exit(par(margins))
  for (i.metr in 1:length(names.metr)) {
    cur.data = m.stab.global[[i.metr]]
    cur.data = cur.data[!is.na(cur.data)]
    ymin = min(cur.data)
    if (is.na(ymin) || ymin == Inf) {
      ymin = 0
    }
    if (!is.null(k.set)) {
      setAsStrList = paste(as.character(k.set),collapse=", ",sep="")
      g.main=paste(" St. Indices of '", names.metr[i.metr], "' for k in {",
                   setAsStrList,"}",sep="")
    } else {
      g.main=paste(" St. Indices of '", names.metr[i.metr], "' for k in [",
                   k.min, ",", k.max,"]",sep="")
    }
    xnames=c(k.range)
    ynames="Global Stability Indices"

    plot(cur.data, main=g.main, axes=TRUE, col.axis="white",
         xlim=c(0.75,k.range.length+0.25), xlab="", ylim=c(ymin,1),
         ylab=ynames, col=colores[1],type="o", lwd=1, lty=ltype[1])
    axis(1,at=1:k.range.length,labels=xnames,las=1,cex.axis=escalax)
    axis(2,las=3,cex.axis=0.85)
    labels <- paste("b=", bs, sep = "")
    legend("topright", legend=labels, inset=.01, lwd=1, lty=ltype[1:4], col=colores[1:4], cex=0.7, pch=pchtype[1:4])
    mtext(side=1, text="K values",line=3)
  }

}