# R/metricsAnalysis.R

#' @title Minimum and maximum metric values plot.
#' @name plotMetricsMinMax
#' @aliases plotMetricsMinMax
#' @description
#' It plots the minimum, maximum and standard deviation
#' values of the metrics in a \code{\link{SummarizedExperiment}} object.
#'
#' @inheritParams stability
#'
#' @return Nothing.
#'
#' @examples
#' # Using example data from our package
#' data("ontMetrics")
#' plotMetricsMinMax(ontMetrics)
#'
plotMetricsMinMax <- function(data) {
  data <- as.data.frame(assay(data))
  # Prepare data for plotting
  # Data matrix without the descriptive column
  matrix = data.matrix(data[,-1])
  maxs = matrixStats::colMaxs(matrix)
  mins = matrixStats::colMins(matrix)
  means = colMeans(matrix)
  sd = matrixStats::colSds(matrix)

  dataStats = matrix(NA, nrow=5, ncol = length(data[,-1]), byrow=TRUE,
                     dimnames = list(c("Metric", "Min","Max","Mean","Sd"),
                                     c(colnames(data[,-1]))))
  dataStats["Metric",] = colnames(data[,-1])
  dataStats["Min",] = mins
  dataStats["Max",] = maxs
  dataStats["Mean",] = means
  dataStats["Sd",] = sd
  dataStats.df = as.data.frame(dataStats)
  dataStats.df.t = as.data.frame(t(dataStats.df))
  # Factor to numeric conversion
  dataStats.df.t$Min = as.numeric(as.character(dataStats.df.t$Min))
  dataStats.df.t$Max = as.numeric(as.character(dataStats.df.t$Max))
  dataStats.df.t$Mean = as.numeric(as.character(dataStats.df.t$Mean))
  dataStats.df.t$Sd = as.numeric(as.character(dataStats.df.t$Sd))

  ## Plotting
  p <- ggplot(dataStats.df.t, aes(x=dataStats.df.t$Metric)) +
    geom_linerange(aes(ymin=dataStats.df.t$Min,ymax=dataStats.df.t$Max),
                   linetype=2,color="#4E84C4") +
    geom_point(aes(y=dataStats.df.t$Min),size=3,color="#00AFBB") +
    geom_point(aes(y=dataStats.df.t$Max),size=3,color="#FC4E07") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90),
          #axis.text.y = element_blank(),
          text = element_text(size=15),
          axis.line = element_line(colour = "black",
                                   size = 1, linetype = "solid")
    ) +
    geom_errorbar(aes(ymin=(dataStats.df.t$Max-dataStats.df.t$Sd),
                      ymax=(dataStats.df.t$Max+dataStats.df.t$Sd)), width=.2,
                  position=position_dodge(.9)) +
    geom_errorbar(aes(ymin=(dataStats.df.t$Min-dataStats.df.t$Sd),
                      ymax=(dataStats.df.t$Min+dataStats.df.t$Sd)), width=.2,
                  position=position_dodge(.9)) +
    scale_y_continuous(breaks=seq(round(min(dataStats.df.t$Min-dataStats.df.t$Sd)), # 10 ticks across min - max range
                                  round(max(dataStats.df.t$Max+dataStats.df.t$Sd)),
                                  round((max(dataStats.df.t$Max)-min(dataStats.df.t$Min)))/10),
                       labels=function(x) sprintf("%.2f", x)) + # Two decimals
    labs(x = "Metrics", y = "Metric value", title = "Min/max/sd values across metrics") +
    guides(fill=TRUE)

  print(p)
}


#' @title Metric values as a boxplot.
#' @name plotMetricsBoxplot
#' @aliases plotMetricsBoxplot
#' @description
#' It plots the value of the metrics in a \code{\link{SummarizedExperiment}}
#' object as a boxplot.
#'
#' @inheritParams stability
#'
#' @return Nothing.
#'
#' @examples
#' # Using example data from our package
#' data("ontMetrics")
#' plotMetricsBoxplot(ontMetrics)
#'
plotMetricsBoxplot <- function(data) {
  data <- as.data.frame(assay(data))
  num_metrics_plot=20
  data.metrics = data[,-1] # Removing Description column

  metrics_length = length(colnames(data.metrics))
  # Plot the metrics in chunks of 'num_metrics_plot' columns per figure;
  # ceiling() ensures that a final, shorter chunk still gets plotted.
  num_iterations = ceiling(metrics_length/num_metrics_plot) - 1
  for (iteration in 0:num_iterations) {
    i = 1
    rangeStart = (iteration*num_metrics_plot)+1
    rangeEnd = rangeStart+num_metrics_plot-1
    if (rangeEnd > metrics_length) {
      rangeEnd = metrics_length
    }
    suppressMessages({
      data.melt = melt(data.metrics[,rangeStart:rangeEnd])
    })
    # Melting 1 variable (e.g: data.metrics[,11:11])
    # won't create $variable column in data.melt.
    if (rangeStart == rangeEnd) {
      metricName = colnames(data.metrics)[rangeStart]
      data.melt$variable = rep(metricName, length(data.melt$value))
    }
    p <- ggplot(data.melt, aes(x=data.melt$variable, y=data.melt$value)) +
      geom_boxplot(
        #aes(fill=data.melt$variable), # Colors
        outlier.colour = "black",
        outlier.alpha = 0.7,
        outlier.shape = 21,
        show.legend = FALSE
      ) +
      #scale_y_continuous(limits = quantile(data.melt$value, c(0.1, 0.9))) +
      scale_color_grey() +
      theme_bw() +
      theme(
        text = element_text(size=20),
        axis.text.x = element_text(angle = 90)
      ) +
      labs(x = "Metrics", y="Metric value", fill="Metrics")
    # compute lower and upper whiskers
    #ylim1 = boxplot.stats(data.melt$value)$stats[c(1, 5)]
    # scale y limits based on ylim1
    #p1 = p + coord_cartesian(ylim = ylim1*1.05)
    print(p)
  }
}

#' @title Metric values clustering.
#' @name plotMetricsCluster
#' @aliases plotMetricsCluster
#' @description
#' It clusters the metric values in a \code{\link{SummarizedExperiment}}
#' object as an hclust dendrogram from \code{\link{stats}}. By default, the distance measure is 'euclidean'
#' and the hclust method is 'ward.D2'.
#'
#' @inheritParams stability
#' @param scale Boolean. If true input data is scaled. Default: FALSE.
#' @param k Integer. If not NULL, the dendrogram is cut into \code{k} groups ('cutree' style) and boxes are drawn around them. Default: NULL
#'
#' @return An hclust object.
#'
#' @examples
#' # Using example data from our package
#' data("ontMetrics")
#' plotMetricsCluster(ontMetrics, scale=TRUE)
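#' # Illustrative: additionally draw boxes around k=3 groups of metrics
#' plotMetricsCluster(ontMetrics, scale=TRUE, k=3)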
#'
plotMetricsCluster <- function(data, scale=FALSE, k=NULL) {
  data <- as.data.frame(assay(data))
  data.metrics = data[,-1] # Removing Description column
  if (isTRUE(scale)) {
    data.metrics = base::scale(data.metrics)
  }
  d <- dist(t(data.metrics), method = "euclidean") # distance matrix
  fit <- hclust(d, method="ward.D2")

  dend <- as.dendrogram(fit)

  nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), cex = 0.7, col = "blue")
  plot(dend, xlab = "", sub="", ylab = "Euclidean distance",
       main = paste0("Metrics dendrogram 'ward.D2'"), nodePar = nodePar)

  if (!is.null(k)) {
    dendextend::rect.dendrogram(dend , k=k, border="purple")
  }
  return(fit)
}

#' @title Metric values as violin plot.
#' @name plotMetricsViolin
#' @aliases plotMetricsViolin
#' @description
#' It plots the value of the metrics in a \code{\link{SummarizedExperiment}}
#' object as a violin plot.
#'
#' @inheritParams stability
#' @param nplots Positive integer. Number of metrics per violin plot. Default: 20.
#'
#' @return Nothing.
#'
#' @examples
#' # Using example data from our package
#' data("ontMetrics")
#' plotMetricsViolin(ontMetrics)
#'
plotMetricsViolin <- function(data, nplots=20) {
  data <- as.data.frame(assay(data))
  data.metrics = data[,-1] # Removing Description column

  metrics_length = length(colnames(data.metrics))
  # Plot the metrics in chunks of 'nplots' columns per figure;
  # ceiling() ensures that a final, shorter chunk still gets plotted.
  num_iterations = ceiling(metrics_length/nplots) - 1
  for (iteration in 0:num_iterations) {
      i = 1
      rangeStart = (iteration*nplots)+1
      rangeEnd = rangeStart+nplots-1
      if (rangeEnd > metrics_length) {
        rangeEnd = metrics_length
      }
    suppressMessages({
      data.melt = melt(data.metrics[,rangeStart:rangeEnd])
    })
    # Melting 1 variable (11:11), won't create $variable column in data.melt.
    if (rangeStart == rangeEnd) {
      metricName = colnames(data.metrics)[rangeStart]
      data.melt$variable = rep(metricName, length(data.melt$value))
    }
    p <- ggplot(data.melt, aes(x=data.melt$variable, y=data.melt$value)) +
      geom_violin(trim=FALSE) +
      geom_boxplot(width=0.1) +
      #scale_y_continuous(limits = quantile(data.melt$value, c(0.1, 0.9))) +
      scale_color_grey() +
      theme_bw() +
      theme(
        text = element_text(size=20),
        axis.text.x = element_text(angle = 90)
      ) +
      labs(x = "Metrics", y="Metric value", fill="Metrics")
    # compute lower and upper whiskers
    #ylim1 = boxplot.stats(data.melt$value)$stats[c(1, 5)]
    # scale y limits based on ylim1
    #p1 = p + coord_cartesian(ylim = ylim1*1.05)
    print(p)
  }
}


#
# It returns true if value is in range (0.5, 0.7]
#
isReasonable <- function(value) {
  return(value > 0.5 && value <= 0.7)
}
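
#
# It returns whichever of the two given k columns ('k_X' format) has the larger
# average silhouette width for the given metric, formatted as 'X'.
#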

getLargestSilWidth <- function(qualityDf, metric, k1, k2) {
  k1KSil = qualityDf[metric, k1]
  k2KSil = qualityDf[metric, k2]
  k = NULL
  if (k1KSil >= k2KSil) {
    k = k1
  } else {
    k = k2
  }
  return(getFormattedK(k))
}

#
# It transforms a string 'k_X' into 'X'.
# For instance, input 'k_4' yields output '4'.
#
getFormattedK <- function(k) {
  return(gsub("^.*_","", k))
}

#' @title Calculating the optimal value of k.
#' @name getOptimalKValue
#' @aliases getOptimalKValue
#' @description
#' This method finds the optimal value of K for each metric by comparing the K
#' with the highest stability against the K with the highest quality (average silhouette width).
#'
#' @param stabData An output \code{\link{ExperimentList}} from
#' a \code{\link{stabilityRange}} execution.
#'
#' @param qualData An output \code{\link{SummarizedExperiment}} from
#' a \code{\link{qualityRange}} execution.
#'
#' @param k.range A range of K values to limit the scope of the
#' analysis.
#'
#' @return It returns a dataframe with one row per metric, containing the K of
#' maximum stability, the K of maximum quality, their stability and silhouette
#' values, and the global optimal K (\code{Global_optimal_k}).
#'
#' @examples
#' # Using example data from our package
#' data("rnaMetrics")
#' stabilityData <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, getImages = FALSE)
#' qualityData <- qualityRange(data=rnaMetrics, k.range=c(2,4), getImages = FALSE)
#' kOptTable = getOptimalKValue(stabilityData, qualityData)
#'
#'
getOptimalKValue <- function(stabData, qualData, k.range=NULL) {
  checkStabilityQualityData(stabData, qualData)

  if (!is.null(k.range)) {
    k.range.length = length(k.range)
    if (k.range.length != 2) {
      stop("k.range length must be 2")
    }
    k.min = k.range[1]
    k.max = k.range[2]
    checkKValue(k.min)
    checkKValue(k.max)
    if (k.max < k.min) {
      stop("The first value of k.range cannot be greater than its second value")
    } else if (k.min == k.max) {
      stop("Range start point and end point are equals")
    }
  }

  stabDf = standardizeStabilityData(stabData, k.range)
  qualDf = standardizeQualityData(qualData, k.range)

  metrics = as.character(as.data.frame(assay(stabData))$Metric)
  STABLE_CLASS = 0.75

  outputTable = as.data.frame(metrics)
  #rownames(outputTable) = metrics
  outputTable = outputTable[, -1]
  optimalKs = list()
  stabMaxKs = list() # List of maximum K for the stability of metric X
  stabMaxKsStability = list() # Stability of the current K in stabMaxKs
  stabMaxKsQuality = list() # Quality of the current K in stabMaxKs
  qualMaxKs = list() # List of maximum K for the quality of metric X
  qualMaxKsStability = list() # Stability of the current K in qualMaxKs
  qualMaxKsQuality = list() # Quality of the current K in qualMaxKs
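
  # Decision rules (S = mean stability, Sil = average silhouette width,
  # ks = k with maximum stability, kg = k with maximum quality;
  # a classification is considered stable when S > 0.75):
  #   1) ks == kg                                    -> optimal k is ks
  #   2) S_ks and S_kg both > 0.75, or both <= 0.75  -> optimal k is kg (higher Sil)
  #   3) S_ks > 0.75, Sil_ks > 0.5, S_kg <= 0.75     -> optimal k is ks
  #   4) S_ks > 0.75, Sil_ks <= 0.5, S_kg <= 0.75    -> kg if Sil_kg > 0.5, else ks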

  for (metric in metrics) {
    message("Processing metric: ", metric, "\n")
    stabMaxK = colnames(stabDf[metric, ])[apply(stabDf[metric, ],1,which.max)] # ks
    stabMaxKFormatted = getFormattedK(stabMaxK)
    stabMaxVal = stabDf[metric, stabMaxK]
    qualMaxK = colnames(qualDf[metric, ])[apply(qualDf[metric, ],1,which.max)] # kg
    qualMaxKFormatted = getFormattedK(qualMaxK)
    qualMaxVal = qualDf[metric, qualMaxK]
    ## Info for output table
    stabMaxKs = append(stabMaxKs, stabMaxKFormatted)
    stabMaxKsStability = append(stabMaxKsStability, stabDf[metric, stabMaxK]);
    stabMaxKsQuality = append(stabMaxKsQuality, qualDf[metric, stabMaxK]);

    qualMaxKs = append(qualMaxKs, qualMaxKFormatted)
    qualMaxKsStability = append(qualMaxKsStability, stabDf[metric, qualMaxK]);
    qualMaxKsQuality = append(qualMaxKsQuality, qualDf[metric, qualMaxK]);

    # CASE 1: ks == kg
    if (identical(stabMaxK, qualMaxK)) {
      k = stabMaxKFormatted
      message("\tMaximum stability and quality values matches the same K value: '", k ,"'\n")
      optimalKs = append(optimalKs, k)
    } else {
      # CASE 2: ks != kg
      if (stabMaxVal > STABLE_CLASS && stabDf[metric, qualMaxK] > STABLE_CLASS) {
        # Both stables
        message("\tBoth Ks have a stable classification: '",
            stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
        k = qualMaxKFormatted
        optimalKs = append(optimalKs, k)
        message("\tUsing '", k, "' since it provides higher silhouette width\n")
      } else {
        if (stabMaxVal <= STABLE_CLASS && stabDf[metric, qualMaxK] <= STABLE_CLASS) {
          # Both not stables: S_ks <= 0.75 && S_kg <= 0.75
          message("\tBoth Ks do not have a stable classification: '",
              stabMaxKFormatted, "', '", qualMaxKFormatted ,"'\n")
          k = qualMaxKFormatted
          optimalKs = append(optimalKs, k)
          message("\tUsing '", k, "' since it provides higher silhouette width\n")
        } else {
          # S_ks > 0.75 && Sil_ks > 0.5 && S_kg <= 0.75
          if ((stabMaxVal > STABLE_CLASS) && (qualDf[metric, stabMaxK] > 0.5)
              && (stabDf[metric, qualMaxK] <= STABLE_CLASS)) {
            message("\tStability k '", stabMaxKFormatted, "' is stable but quality k '",
                qualMaxKFormatted,"' is not\n")
            k = stabMaxKFormatted
            optimalKs = append(optimalKs, k)
            message("\tUsing '", k, "' since it provides higher stability\n")
          } else {
            # CASE 3
            if (stabMaxVal > STABLE_CLASS && qualDf[metric, stabMaxK] <= 0.5
                && stabDf[metric, qualMaxK] <= STABLE_CLASS)  {
              message("\tStability k '", stabMaxKFormatted, "' is stable but its silhouette value is not reasonable\n")
              if (qualMaxVal > 0.5) { # S_kg > 0.5
                k = qualMaxKFormatted
                optimalKs = append(optimalKs, k)
                message("\tUsing quality '", k, "' since its at least reasonable\n")
              } else {# S_kg <= 0.5
                k = stabMaxKFormatted
                optimalKs = append(optimalKs, k)
                message("\tUsing stability '", k, "' since quality k is not reasonable\n")
              }
            } else { # This should not happen but it might come in handy to check errors
              message("\tUnknown case\n")
              optimalKs = append(optimalKs, -1)
            }
          }
        }
      }
    }
  }

  outputTable["Metric"] = metrics
  outputTable["Stability_max_k"] = unlist(stabMaxKs)
  outputTable["Stability_max_k_stab"] = unlist(stabMaxKsStability)
  outputTable["Stability_max_k_qual"] = unlist(stabMaxKsQuality)

  outputTable["Quality_max_k"] = unlist(qualMaxKs)
  outputTable["Quality_max_k_stab"] = unlist(qualMaxKsStability)
  outputTable["Quality_max_k_qual"] = unlist(qualMaxKsQuality)

  outputTable["Global_optimal_k"] = unlist(optimalKs)

  return(outputTable)

}


#' @title Comparison between two clusterings as plot.
#' @name plotMetricsClusterComparison
#' @aliases plotMetricsClusterComparison
#' @description
#' It plots a clustering comparison between two different
#' k-cluster vectors for a set of metrics.
#'
#' @inheritParams stability
#' @param k.vector1 Vector of positive integers representing \code{k} clusters.
#' The \code{k} values must be contained in [2,15] range.
#' @param k.vector2 Optional. Vector of positive integers representing \code{k} clusters.
#' The \code{k} values must be contained in [2,15] range.
#' @param seed Optional. Random seed to be used. It defaults to the package seed.
#'
#' @return Nothing.
#'
#' @examples
#' # Using example data from our package
#' data("rnaMetrics")
#' stabilityData <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, getImages = FALSE)
#' qualityData <- qualityRange(data=rnaMetrics, k.range=c(2,4), getImages = FALSE)
#' kOptTable = getOptimalKValue(stabilityData, qualityData)
#'
#'
plotMetricsClusterComparison <- function(data, k.vector1, k.vector2=NULL, seed=NULL) {
  if (is.null(seed)) {
    seed = pkg.env$seed
  }
  if (identical(k.vector1, k.vector2)) {
    stop("k.vector1 and k.vector2 are identical")
  }

  data <- as.data.frame(SummarizedExperiment::assay(data))

  numMetrics = length(colnames(data))-1

  if (length(k.vector1) == 1) {
    k.vector1=rep(k.vector1, numMetrics)
  }

  if (is.null(k.vector2)) {
    k.vector2 = k.vector1 # This will colour ellipses around the same clusters of k.vector1
  }

  if (length(k.vector2) == 1) {
    k.vector2=rep(k.vector2, numMetrics)
  }

  if (numMetrics != length(k.vector1) || numMetrics != length(k.vector2)
      || length(k.vector1) != length(k.vector2)) {
    stop("Input parameters have different lengths")
  }
  for (i in 1:length(k.vector1)) {
    checkKValue(k.vector1[i])
    checkKValue(k.vector2[i])
  }

  data.metrics=NULL; names.metr=NULL; names.index=NULL;
  k.cl=NULL; k.min=NULL; k.max=NULL;
  data.metrics=NULL; datos.csv=NULL; datos.raw=NULL;
  ranges=NULL; mins=NULL; data.l=NULL; data.ms=NULL; k.sig=NULL; k.op.sig=NULL;

  datos.csv = data
  data.metrics <- datos.csv[,-1]
  names.metr <- colnames(datos.csv[,-1])  #nombres de metricas
  names.ont <- datos.csv[,1]

  ranges <- apply(data.metrics, 2, sample.range)
  mins <- apply(data.metrics, 2, sample.min)
  data.l <- sweep(data.metrics, 2, mins, FUN="-")
  data.ms <- sweep(data.l, 2, ranges, FUN="/")

  kcolors=c("black","red","blue","green","magenta","pink","yellow","orange","brown","cyan","gray","darkgreen")

  par(mar=c(4,6,3,3))
  plot(0,0, xlim=range(data.ms), ylim=c(0,length(names.metr)+1),
       lwd=NULL, xlab="", ylab="", xaxt="n", yaxt="n", type="n")
  axis(side=2, at = seq(1,length(names.metr)), labels=names.metr, las=2, cex.axis=.7)
  title(xlab=paste("Scaled raw scores", sep=""), line=1)
  title(ylab="Metrics", line=5)

  for (i.metr in 1:length(names.metr)) { # i.metr = metric index
    #  e.g. i.metr=1

    i=NULL; clusterk5=NULL; clusterkopt=NULL;
    k.cl=NULL; k.op=NULL; data.plot=NULL;

    i=i.metr

    #kmeans with k.cl classes
    k.cl=k.vector2[i]
    set.seed(seed)
    clusterk5=kmeans(data.ms[,i], centers=k.cl, iter.max = 100)
    ##
    clusterk5$means=by(data.ms[,i],clusterk5$cluster,mean) # compute the k means (centroids)
    for (i.5 in 1:length(clusterk5$means)) {
      clusterk5$partition[which(clusterk5$cluster==i.5)]=clusterk5$centers[i.5]
      # assign the centroid value to every member of the cluster
    }
    # Order the partition according to the direction of the metric (direct/inverse)
    clusterk5$ordered=ordered(clusterk5$partition,labels=seq(1,length(clusterk5$centers)))
    clusterk5$ordered.inv=ordered(clusterk5$partition,labels=seq(length(clusterk5$centers),1))
    clusterk5$partition=clusterk5$ordered
    clusterk5$means=sort(clusterk5$means,decreasing=FALSE)

    #kmeans with k.op classes
    k.op=k.vector1[i]
    set.seed(seed)
    clusterkopt=kmeans(data.ms[,i], centers=k.op, iter.max = 100)

    clusterkopt$means=by(data.ms[,i],clusterkopt$cluster,mean) # compute the k means (centroids)
    for (i.opt in 1:length(clusterkopt$means)) {
      clusterkopt$partition[which(clusterkopt$cluster==i.opt)]=clusterkopt$centers[i.opt]
      # assign the centroid value to the whole cluster
    }

    clusterkopt$ordered=ordered(clusterkopt$partition,labels=seq(1,length(clusterkopt$centers)))
    clusterkopt$ordered.inv=ordered(clusterkopt$partition,labels=seq(length(clusterkopt$centers),1))
    clusterkopt$partition=clusterkopt$ordered
    clusterkopt$means=sort(clusterkopt$means,decreasing=FALSE)

    data.plot=data.frame(data.ms[,i],clusterk5$partition,clusterkopt$partition)
    colnames(data.plot)=c(names.metr[i],"k=5","k_op")
    rownames(data.plot)=names.ont

    xi=data.plot[[1]]
    yi=rep(i.metr,length(xi))
    ci=data.plot[[2]]
    ci=levels(ci)[ci]
    points(xi,yi,type="p", col=kcolors[as.numeric(ci)],lty=1, lwd=1)

    cj=data.plot[[3]]
    for (ellip.j in unique(cj)) {
      xj=mean(range(xi[which(cj==ellip.j)])) #clusterk5$means[ellip.j]
      yj=rep(i.metr,length(xj))
      aj=diff(range(xi[which(cj==ellip.j)]))/2
      draw.ellipse(x=xj, y=yj, a=aj, b=0.3, nv=100,
                   border=kcolors[as.numeric(ellip.j)], lty=1, lwd=2)
    }

  } #end for i.metr
}
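
#
# It checks that the stability data and the quality data cover the same k range
# and the same set of metrics; otherwise it stops with an error.
#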

checkStabilityQualityData <- function(stabData, qualData) {
  stabDf = assay(stabData) # Getting first assay, which is 'stabData$stability_mean'
  lengthStabDf = length(colnames(stabDf)[-1])
  stabRangeStart = gsub("^.*_.*_.*_","", colnames(stabDf)[-1][1]) # Mean_stability_k_2 -> 2
  stabRangeEnd = gsub("^.*_.*_.*_","", colnames(stabDf)[-1][lengthStabDf])
  lengthQual = length(qualData)
  namesQual = names(qualData)
  qualRangeStart = getFormattedK(namesQual[1]) # k_2 -> 2
  qualRangeEnd = getFormattedK(namesQual[lengthQual])
  if (stabRangeStart != qualRangeStart || stabRangeEnd != qualRangeEnd) {
    stop("Stability data and quality data have different k ranges")
  }
  stabMetricsList = as.character(stabDf[,"Metric"])
  qualMetricsList = as.character(
    assay(getDataQualityRange(qualData, as.numeric(qualRangeStart)))[,"Metric"]
  )
  if (!identical(stabMetricsList, qualMetricsList)) {
    stop("Stability data and quality data have different metrics")
  }
}

#
# It transforms the output of qualityRange method
# into a dataframe like this:
# (rownames)       k_2       k_3       k_4
# DegFact          0.6171262 0.6278294 0.4882649
# ...
# So that getOptimalKValue always receives a
# standardized dataframe to process.
#

standardizeQualityData <- function(qualData, k.range=NULL) {
  lengthQuality = length(qualData)
  qualRangeStart = getFormattedK(names(qualData)[1])
  qualRangeEnd = getFormattedK(names(qualData)[lengthQuality])
  Metric = NULL
  kValues = list()
  for (i in seq(qualRangeStart, qualRangeEnd, 1)) {
    curQual = as.data.frame(assay(getDataQualityRange(qualData, i)))
    if (i == qualRangeStart) {
      Metric = as.character(curQual$Metric)
    }
    kValues[[i]] = as.numeric(as.character(curQual$Avg_Silhouette_Width))
  }
  qualDf = as.data.frame(Metric)
  for (i in seq(qualRangeStart, qualRangeEnd, 1)) {
    values = kValues[[i]]
    newColname = paste0("k_", i)
    k = as.numeric(getFormattedK(newColname))
    if (!is.null(k.range) && (k < k.range[1] || k > k.range[2])) {
      next
    }
    if (length(values) < length(Metric)) {
      for (i in seq(length(values), length(Metric)-1,1)) {
        values = append(values, NaN)
      }
    }

    qualDf[[newColname]] = values
  }

  if (!is.null(k.range) && (k.range[1] < qualRangeStart || k.range[2] > qualRangeEnd)) {
    # Input k.range is not a subset of the qualData k range
    stop("Input k.range [", k.range[1], ", ", k.range[2], "] is not a subset of range [",
         qualRangeStart, ", ", qualRangeEnd, "]")
  }

  rownames(qualDf) = qualDf$Metric
  if (ncol(qualDf) == 2) { # Only one metric
    qualDf["Metric"] = NULL
    return(qualDf)
  }
  qualDf = qualDf[, -1] # Remove "Metric" column, metrics are rownames now
  qualDf <- qualDf[ order(row.names(qualDf)), ]
  return(qualDf)
}

#
# It transforms the output of stabilityRange method
# into a dataframe like this:
# (rownames)       k_2       k_3       k_4
# RIN              0.6171262 0.6278294 0.4882649
# ...
# So that getOptimalKValue always receives a
# standardized dataframe to process.
#
standardizeStabilityData <- function(stabData, k.range=NULL) {
  stabDf = as.data.frame(assay(stabData)) # Getting first assay, which is 'stabData$stability_mean'
  lengthColnames = length(colnames(stabDf))
  toRemove = list()
  for (i in seq(1, lengthColnames, 1)) {
    colname = colnames(stabDf)[i]
    newColname = gsub("^.*_.*_.*_","k_", colname)
    colnames(stabDf)[i] = newColname
    if (i != 1) { # Skip Metric column
      k = as.numeric(getFormattedK(newColname))
      if (!is.null(k.range) && (k < k.range[1] || k > k.range[2])) {
        toRemove = append(toRemove, newColname)
        next
      }
      stabDf[newColname] = as.numeric(as.character(stabDf[[newColname]]))
    }
  }

  for (columnName in toRemove) {
    stabDf[, columnName] = list(NULL)
    lengthColnames = lengthColnames-1
  }

  inputStartRange = as.numeric(getFormattedK(colnames(stabDf)[2]))
  inputEndRange = as.numeric(getFormattedK(colnames(stabDf)[lengthColnames]))
  if (!is.null(k.range) && (k.range[1] < inputStartRange || k.range[2] > inputEndRange)) {
    # Input k.range is not a subset of the stabData k ranges
    stop("Input k.range [", k.range[1], ", ", k.range[2], "] is not a subset of data range [",
         inputStartRange, ", ", inputEndRange, "]")
  }

  rownames(stabDf) = stabDf$Metric
  if (ncol(stabDf) == 2) { # Only one metric
    stabDf["Metric"] = NULL
    return(stabDf)
  }
  stabDf = stabDf[, -1] # Remove "Metric" column, metrics are rownames now
  stabDf <- stabDf[ order(row.names(stabDf)), ]
  return(stabDf)
}

#' @title Calculate the cluster ID from the optimal cluster per metric for each individual.
#' @name annotateClustersByMetric
#' @aliases annotateClustersByMetric
#' @description
#' Returns a named list, where each metric name is linked to a data frame
#' containing the evaluated individuals, their score for the specified metric,
#' and the cluster id in which each individual is classified. This cluster
#' assignment is based on the optimal k value computed by evaluomeR.
#'
#' @param df Input data frame. The first column denotes the identifier of the
#' evaluated individuals. The remaining columns contain the metrics used to
#' evaluate the individuals. Rows with NA values will be ignored.
#' @param k.range Range of k values in which the optimal k will be searched
#' @param bs Bootstrap re-sample param.
#' @param seed Random seed to be used.
#'
#' @return A named list resulting from computing the optimal cluster for each
#' metric. Each metric is a name in the named list, and its content is a
#' data frame that includes the individuals, the value for the corresponding
#' metric, and the cluster id to which the individual has been assigned according
#' to the optimal cluster.
#' @export
#'
#' @examples
#' data("ontMetrics")
#' annotated_clusters=annotateClustersByMetric(ontMetrics, k.range=c(2,3), bs=20, seed=100)
#' annotated_clusters[['ANOnto']]
annotateClustersByMetric <- function(df, k.range, bs, seed){
  if (is.null(seed)) {
    seed = pkg.env$seed
  }
  df <- as.data.frame(assay(df))
  # Create a dataframe by removing NAs from the original data.
  df_clean = na.omit(df)

  # Compute stability, quality and the optimal k value from evaluome
  stabilityData <- stabilityRange(data=df_clean, k.range=k.range,
                                  bs=bs, getImages = FALSE, seed=seed)

  qualityData <- qualityRange(data=df_clean, k.range=k.range,
                              getImages = FALSE, seed=seed)

  kOptTable <- getOptimalKValue(stabilityData, qualityData)

  # Get the clusters obtained by evaluome for each k, together with
  # the optimal k
  clusters = as.data.frame(assay(stabilityData$cluster_partition))
  clusters$optimal_k = kOptTable$Global_optimal_k

  # Compose the results
  # For each metric, get the optimal k, get the clusters formed by using
  # that optimal k, include this information in a dataframe
  result_list = list()

  for (i in 1:nrow(clusters)){
    metric = as.character(clusters$Metric[i])
    # Get the optimal k
    optimal_k = clusters$optimal_k[i]

    # Get the clusters formed by using the optimal k as a vector of integers
    optimal_cluster = dplyr::select(clusters, dplyr::contains(as.character(optimal_k)))
    optimal_cluster = optimal_cluster[i,1]
    optimal_cluster = as.character(optimal_cluster)
    optimal_cluster = as.numeric(strsplit(optimal_cluster, ", ")[[1]])

    # Create a dataframe including the individual id, the concerning metric
    # and the cluster id in which the individual is classified.
    annotated_df_clean = dplyr::select(df_clean, 1)
    annotated_df_clean$cluster = optimal_cluster

    # Merge this dataframe with the original one, so that individuals that were
    # removed due to NAs will be present with NA as their cluster.
    # Include this dataframe in the named list, using the name of the metric as
    # a key.
    result_list[[metric]] = merge(dplyr::select(df, 1, dplyr::contains(metric)), annotated_df_clean, all.x = TRUE)
  }
  return(result_list)
}


#' @title Get the range of each metric per cluster from the optimal cluster.
#' @name getMetricRangeByCluster
#' @aliases getMetricRangeByCluster
#' @description
#' Obtains the range (min and max) of each metric within each cluster of its optimal clustering.
#'
#' @param df Input data frame. The first column denotes the identifier of the
#' evaluated individuals. The remaining columns contain the metrics used to
#' evaluate the individuals. Rows with NA values will be ignored.
#' @param k.range Range of k values in which the optimal k will be searched
#' @param bs Bootstrap re-sample param.
#' @param seed Random seed to be used.
#'
#' @return A dataframe including the min and the max value for each
#' pair (metric, cluster).
#' @export
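#'
#' @examples
#' \dontrun{
#' # Minimal usage sketch (hypothetical parameter values, mirroring the
#' # annotateClustersByMetric example above):
#' data("ontMetrics")
#' ranges = getMetricRangeByCluster(ontMetrics, k.range=c(2,3), bs=20, seed=100)
#' }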
#'


getMetricRangeByCluster <- function(df, k.range, bs, seed) {
  if (is.null(seed)) {
    seed = pkg.env$seed
  }
  df <- as.data.frame(assay(df))
  annotated_clusters_by_metric = annotateClustersByMetric(df, k.range, bs, seed)

  metrics = c()
  cluster_ids = c()
  min_values = c()
  max_values = c()
  # For each metric
  for (metric in names(annotated_clusters_by_metric)){
    # Get the optimal clusters
    annotated_clusters = annotated_clusters_by_metric[[metric]]

    # For each cluster, get the minimal and the maximal value
    for (cluster_id in 1:max(annotated_clusters$cluster, na.rm=T)) {
      concrete_cluster_values = dplyr::filter(annotated_clusters, cluster==cluster_id) %>% dplyr::pull(metric)
      metrics = c(metrics, metric)
      cluster_ids = c(cluster_ids, cluster_id)
      min_values = c(min_values, min(concrete_cluster_values))
      max_values = c(max_values, max(concrete_cluster_values))
    }
  }
  return(data.frame(metric=metrics, cluster=cluster_ids, min_value=min_values, max_value=max_values))
}


#' @title Get the metrics relevancy via robust sparse K-means (RSKC).
#' @name getMetricsRelevancy
#' @aliases getMetricsRelevancy
#' @description
#' Computes the relevancy (feature weights) of each metric by running robust
#' sparse K-means clustering (\code{\link{RSKC}}) on the input data.
#'
#' @param df Input data frame. The first column denotes the identifier of the
#' evaluated individuals. The remaining columns contain the metrics used to
#' evaluate the individuals. Rows with NA values will be ignored.
#' @param k K value (number of clusters)
#' @param alpha 0 <= alpha <= 1, the proportion of the cases to be trimmed in robust sparse K-means, see \code{\link{RSKC}}.
#' @param L1 A single L1 bound on weights (the feature weights), see \code{\link{RSKC}}.
#' @param seed Random seed to be used.
#'
#' @return A list with the \code{\link{RSKC}} output (\code{rskc}), the trimmed
#' cases from the input (\code{trimmed_cases}) and the metrics relevancy
#' table (\code{relevancy}).
#' @export
#'
#' @examples
#' data("ontMetrics")
#' metricsRelevancy = getMetricsRelevancy(ontMetrics, k=3, alpha=0.1, seed=100)
#' metricsRelevancy$rskc # RSKC output object
#' metricsRelevancy$trimmed_cases # Trimmed cases from input (row indexes)
#' metricsRelevancy$relevancy # Metrics relevancy table
#'
getMetricsRelevancy <- function(df, k, alpha=0, L1=NULL, seed=NULL) {
    if (is.null(seed)) {
    seed = pkg.env$seed
  }
#  if (is.null(alpha)) {
#    alpha = 0.1
#  }

  df <- as.data.frame(assay(df))

  if (is.null(L1)) {
    print(paste0("No L1 provided. Computing best L1 boundry with 'sparcl::KMeansSparseCluster.permute'"))
    dataMatrix = as.matrix(df)
    wbounds = seq(2,sqrt(ncol(dataMatrix)), len=30)
    km.perm <- sparcl::KMeansSparseCluster.permute(dataMatrix,K=k,wbounds=wbounds,nperms=5,silent=TRUE)
    L1 = km.perm$bestw

  }


  # Compute RSKC
  print(paste0("Alpha set as: ", alpha))
  print(paste0("L1 set as: ", L1))
  rskc_out = RSKC(df, k, alpha, L1 = L1, nstart = 200,
                  silent=TRUE, scaling = FALSE, correlation = FALSE)
  # Get trimmed cases from input
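  # (oE/oW: case indexes trimmed in the squared Euclidean distance step and in
  # the weighted squared Euclidean distance step of RSKC, respectively)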
  union_vector = c(rskc_out$oE,rskc_out$oW)
  union_vector_unique = unique(union_vector)
  union_vector_unique = sort(union_vector_unique)

  # Metrics relevancy
  columns = c('metric', 'weight')
  rskc_df = data.frame(matrix(ncol = length(columns), nrow = length(rskc_out$weights)))
  colnames(rskc_df) = columns
  rskc_df['metric'] = names(rskc_out$weights)
  rskc_df['weight'] = rskc_out$weights
  rskc_df_sorted = rskc_df[order(rskc_df$weight, decreasing = TRUE), ] # Sorting from greater values to lower



  output = NULL
  output$rskc = rskc_out
  output$trimmed_cases = union_vector_unique
  output$relevancy = rskc_df_sorted
  return (output)
}


#' @title Computes the L1 boundary
#' @name getRSKCL1Boundry
#' @aliases getRSKCL1Boundry
#' @description
#' Computes the L1 boundary for an RSKC cbi execution.
#'
#' @param df Input data frame. The first column denotes the identifier of the
#' evaluated individuals. The remaining columns contain the metrics used to
#' evaluate the individuals. Rows with NA values will be ignored.
#' @param k K value (number of clusters)
#' @param seed Random seed to be used.
#'
#' @return A single L1 bound on weights (the feature weights), see \code{\link{RSKC}}.
#' @export
#'
#' @examples
#' data("ontMetrics")
#' l1_boundry = getRSKCL1Boundry(ontMetrics, k=3, seed=100)
#'
getRSKCL1Boundry <- function(df, k, seed=NULL) {
  if (is.null(seed)) {
    seed = pkg.env$seed
  }

  df <- as.data.frame(assay(df))
  df_data = df[-1] # Removing 'Description' column as it is not numeric

  message(paste0("Computing best L1 boundry with 'sparcl::KMeansSparseCluster.permute'"))
  dataMatrix = as.matrix(df_data)
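  # Candidate L1 bounds: 30 values between 2 and sqrt(number of metrics)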
  wbounds = seq(2,sqrt(ncol(dataMatrix)), len=30)
  km.perm <- sparcl::KMeansSparseCluster.permute(dataMatrix,K=k,wbounds=wbounds,nperms=5,silent=TRUE)
  L1 = floor(km.perm$bestw)
  message(paste0("Best L1 found is: ", km.perm$bestw, ", using floor: ", L1))

  return (L1)
}

#' @title Computes RSKC alpha
#' @name getRSKCAlpha
#' @aliases getRSKCAlpha
#' @description
#' Computes the proportion of the cases to be trimmed in robust sparse K-means, 0 <= alpha <= 1, see \code{\link{RSKC}}.
#'
#' @param df Input data frame. The first column denotes the identifier of the
#' evaluated individuals. The remaining columns contain the metrics used to
#' evaluate the individuals. Rows with NA values will be ignored.
#' @param k K value (number of clusters)
#' @param L1 A single L1 bound on weights (the feature weights), see \code{\link{RSKC}}.
#' @param seed Random seed to be used.
#'
#' @return Best suitable alpha.
#' @export
#'
#' @examples
#' data("ontMetrics")
#' alpha = getRSKCAlpha(ontMetrics, k=3, L1=2, seed=100)
#'
getRSKCAlpha <- function(df, k, L1, seed=NULL) {
  if (is.null(seed)) {
    seed = pkg.env$seed
  }

  df <- as.data.frame(assay(df))

  # Helper structures
  ## Structure to keep track of the stability and quality of each RSKC run.
  rskc_run <- function(stab, qual, alpha) {
    structure(list(stab = stab, mean_stab = mean(as.double(stab)),
                   qual = qual, mean_qual = mean(as.double(qual)),
                   alpha = alpha), class = "rskc_run")
  }
  run_list = list()
  ## Getting best RSKC run according to stabilities and qualities
  best_stab = -Inf
  best_qual = -Inf
  best_alpha_stab = 0
  best_alpha_qual = 0
  getBestRun <- function(run_list) {
    for (run in run_list)
      if (run$mean_stab > best_stab) {
        best_stab = run$mean_stab
        best_alpha_stab = run$alpha
      }
    if (run$mean_qual > best_qual) {
      best_qual = run$best_qual
      best_alpha_qual = run$alpha
    }

    return(
      list("best_alpha_stab" = best_alpha_stab,
           "best_alpha_qual"=best_alpha_qual)
    )
  }
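
  # Candidate trimming proportions (alpha) to evaluate: 0, 0.05, 0.10, ..., 0.25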

  alpha_values = seq(0, 0.25, 0.05)
  index = 1
  for (alpha in alpha_values) {
    message(paste0("Running stability and quality indexes with alpha=", alpha))
    stab = stability(data=df, k=k,
                     bs=100, seed=seed,
                     all_metrics=TRUE,
                     cbi="rskc", L1=L1, alpha=alpha)
    stab_table = standardizeStabilityData(stab)

    qual = quality(data=df, k=k,
                   seed=seed,
                   all_metrics=TRUE,
                   cbi="rskc", L1=L1, alpha=alpha)
    qual_table = standardizeQualityData(qual)
    qual_table = standardizeQualityData(qual)

    run_list[[index]] <- rskc_run(stab = stab_table, qual = qual_table, alpha=alpha)
    index = index + 1
  }

  best_run = getBestRun(run_list)

  message(paste0("Highest stability found when alpha=", best_run$best_alpha_stab))
  message(paste0("Highest quality found when alpha=", best_run$best_alpha_qual))

  best_alpha = best_run$best_alpha_qual
  if (best_run$best_alpha_stab <= best_run$best_alpha_qual) {
    best_alpha = best_run$best_alpha_stab
  }
  message(paste0("Using alpha=", best_alpha, " as it trims less data."))

  return (best_alpha)
}