R/internalFunctions.R
In evaluomeR: Evaluation of Bioinformatics Metrics

######################################################
#function
#   jaccard(x, y)
#     for two binary variables (values 0,1)

jaccard = function (x, y) {
  c11 = sum(x == 1 & y == 1)
  c10 = sum(x == 1 & y == 0)
  c01 = sum(x == 0 & y == 1)
  return (c11 / (c11 + c10 + c01))
}

######################################################
#function
#   jaccard.cluster(clust1, clust2)
#     for two clusterings from a data set

jaccard.cluster <- function(clust1,clust2) {
  cat <- sort(unique(c(clust1,clust2)))
  aux <- rep(0,length(cat))
  if (length(clust1)==length(clust2) & all(sort(unique(clust1))==sort(unique(clust2)))) {
    for (i.aux in cat) {
      clust.x <- clust1==i.aux
      for (j.aux in cat) {
        clust.y <- clust2==j.aux
        aux[i.aux] <- max(aux[i.aux],jaccard(clust.x,clust.y))
      }
    }
  }
  return(aux)
}

######################################################
#function
#   boot.cluster(data, nk=5, B=100, seed=NULL)
#     stability cluster based on Jaccard by clustering bootstrap

boot.cluster <- function(data, nk=5, B=10, seed=NULL, prnt=FALSE) {
  old.seed <- .Random.seed
  on.exit( { .Random.seed <- old.seed } )
  if (!is.null(seed)) {
    #http://stackoverflow.com/questions/14324096/setting-seed-locally-not-globally-in-r?rq=1
    set.seed(as.integer(seed)) #seed
  } else {
    # Default seed
    set.seed(pkg.env$seed)
  }
  data <- as.matrix(data)
  n.data <- nrow(data)
  cluster1 <- kmeans(x=data, centers=nk, iter.max=100)
  cluster1$partition <- cluster1$cluster
  cluster1$nk <- nk
  bs.jaccard <- matrix(0,nrow=nk, ncol=B)
  for (i in 1:B){  # for B botstrap replications
    if (prnt) cat("bootstrap replicate ", i,"\n")  # Show the bootstrap replication index
    bscheck <- TRUE
    while (bscheck) {
      bs.samp <- sample(n.data,n.data,replace=TRUE)
      bs.samp <- unique(bs.samp)
      bs.data <- data[bs.samp,]
      if (nk <= length(levels(as.factor(bs.data))))
        bscheck <- FALSE
    }
    if (length(bs.data) <= nk) { # Not enough data for kmeans
      message("\tWarning: Could not process data for k = ", nk)
      return (NULL)
    }

    bs.cluster <- kmeans(bs.data, centers=nk, iter.max=100)
    bs.cluster$partition <- bs.cluster$cluster
    bs.cluster$nk <- nk
    bs.jaccard[,i] <- jaccard.cluster(cluster1$partition[bs.samp], bs.cluster$partition)
  } # end for i in 1:B
  bs.jaccard.mean <- apply(bs.jaccard, 1, mean, na.rm=TRUE) # 1=by rows
  return(list(result=cluster1, partition=cluster1$partition, nk=cluster1$nk,
              B=B, bsjaccard=bs.jaccard, means=bs.jaccard.mean))
}
######################################################

######################################################
#function
#   fAnova(clusterResult, k, num.elements)
#     Custom anova computation: F function.

fAnova <- function(clusterResult, k, n) {
  #cat("k: ", k, "\n")
  #cat("clusterResult$betweenss: ", clusterResult$betweenss, "\n")
  #cat("clusterResult$tot.withinss: ", clusterResult$tot.withinss, "\n")
  #cat("(k-1): ", (k-1), "\n")
  #cat("(n-k): ", (n-k), "\n")
  #cat("d1: ", clusterResult$betweenss/(k-1), "\n")
  #cat("d2: ", (clusterResult$tot.withinss)/(n-k), "\n")
  #cat("r: ", (clusterResult$betweenss/(k-1))/(clusterResult$tot.withinss)/(n-k), "\n")
  #("r: ", (clusterResult$betweenss/(k-1))/((clusterResult$tot.withinss)/(n-k)), "\n")

  return ((clusterResult$betweenss/(k-1))/((clusterResult$tot.withinss)/(n-k)))
}
######################################################

######################################################
#function
#   getMeasureValue(km5, measureName)
#     Returns a measure as in km5$measureName

getMeasureValue <- function(km5, measureName) {
  if (startsWith(measureName, "cluster_")) {
    return (toString(unlist(km5$csv[measureName], use.names = FALSE)))
  } else {
    stop("Unknown measure '", measureName, "'. Stopping.")
  }
}
######################################################

######################################################
#function
#   clusterbootWrapper(data, B, bootmethod="boot",
#                     clustermethod=kmeansCBI, krange, seed)
#     Wrapper method for clusterboot functionality.

clusterbootWrapper <- function(data, B, bootmethod="boot",
                               cbi, krange, seed) {
  cbiHelperResult = helperGetCBI(cbi, krange)

  #cat ("Using: ", cbi, "\n")
  #cat ("Type: ", typeof(cbiHelperResult[["method"]]), "\n")

  mandatoryArgs = list(
    "data"=data,
    "B"=B,
    "bootmethod"=bootmethod,
    "seed"=seed,
    "clustermethod"=cbiHelperResult[["method"]]
  )

  methodArgs = append(mandatoryArgs, cbiHelperResult[["args"]])

  return (
            #quiet(
              do.call(
                clusterboot,
                methodArgs
                #  )
            )
          )
}
######################################################

######################################################
#function
#   clusteringWrapper(data, cbi, krange, seed)
#     Wrapper method for clustering functionality.

clusteringWrapper <- function(data, cbi, krange, seed) {
  cbiHelperResult = helperGetCBI(cbi, krange)

  old.seed <- .Random.seed
  on.exit( { .Random.seed <<- old.seed } )

  if (!is.null(seed)) set.seed(seed)


  #cat ("Using: ", cbi, "\n")

  mandatoryArgs = list(
    "data"=data
  )

  methodArgs = append(mandatoryArgs, cbiHelperResult[["args"]])

  # print(cbiHelperResult[["method"]])
  # print(methodArgs)

  return (
    quiet(
      do.call(
        cbiHelperResult[["method"]],
        methodArgs
      )
    )
  )
}
######################################################

######################################################
#function
#   checkIfCanCluster(data, k)
#       It prevents clusterboot method from getting stuck by checking if
#       there is enough data to perform a clustering.

checkIfCanCluster <- function(data, ...) {
  mc <- as.list(match.call(expand.dots = TRUE))
  k = NULL
  if ("krange" %in% names(mc)) {
    k = mc[["krange"]]
  } else if ("k" %in% names(mc) ) {
    k = mc[["k"]]
  } else {
    stop(paste0("Unexpected error. Could not retrieve 'k' value from dot-dot-dot argument. ",
                "Arguments were: ", names(mc)))
  }

  numUnique = length(unique(data))
  #print(paste0("Division is:", numUnique/k))
  #print(paste0("Return is: ", (numUnique/k) > 1))

  if ((numUnique/k) <= 1) {
    stop(paste0("Not enough data to cluster '", numUnique, "' unique values in '", k, "' clusters"))
  }

}
######################################################