R/orphaned-Code.R
In optCluster: Determine Optimal Clustering Algorithm and Number of Clusters

####################################################################################
## Code taken from original clValid (Brock et al., 2008) package
## Contributing authors of clValid code: Guy Brock, Vasyl Pihur, Susmita Datta, Somnath Datta
## clValid is considered ORPHANED on CRAN as of 3/31/2020
## Includes code to make optCluster run
## Made code changes in BHI and BSI to fix CRAN check NOTES 3/31/2020
####################################################################################

####################################################################
####################################################################
####################################################################
## clValid Class Definitions
####################################################################
####################################################################
####################################################################
####################################################################

setClassUnion("numeric or NULL", c("numeric", "NULL"))
setClassUnion("array or NULL", c("array", "NULL"))
setClassUnion("character or array or list or NULL or logical",
              c("character","array","list","NULL","logical"))
setClass("clValid",representation(clusterObjs="list",measures="array",
                                  measNames="character",clMethods="character",
                                  labels="character",
                                  nClust="numeric",validation="character",
                                  metric="character",method="character",neighbSize="numeric",
                                  annotation="character or array or list or NULL or logical",
                                  GOcategory="character",
                                  goTermFreq="numeric",
                                  call="call"))





####################################################################
####################################################################
####################################################################
## clValid Methods
####################################################################
####################################################################
####################################################################

####################################################################
## Accessor Functions
####################################################################

## cluster methods accessor
setGeneric("clusterMethods", function(object, ...) standardGeneric("clusterMethods"))
setMethod("clusterMethods",signature(object="clValid"),
          function(object) return(object@clMethods))

## number of clusters accessor
setGeneric("nClusters", function(object, ...) standardGeneric("nClusters"))
setMethod("nClusters",signature(object="clValid"),
          function(object) return(object@nClust))

## measure names accessor
setGeneric("measNames", function(object, ...) standardGeneric("measNames"))
setMethod("measNames",signature(object="clValid"),
          function(object) return(object@measNames))

## clusters accessor
setGeneric("clusters", function(object, ...) standardGeneric("clusters"))
setMethod("clusters",signature(object="clValid"),
          function(object,method=clusterMethods(object)) {
            method <- match.arg(method,clusterMethods(object)) ##, several.ok=TRUE)
            return(object@clusterObjs[[method]])})

## measures accessor
setGeneric("measures", function(object, ...) standardGeneric("measures"))
setMethod("measures",signature(object="clValid"),
          function(object,measures=measNames(object)) {
            measures <- match.arg(measures,measNames(object),several.ok=TRUE)
            return(object@measures[measures,,,drop=FALSE])})


####################################################################
## Print and Show Methods
####################################################################

setMethod("print","clValid",
          function(x) {
            cat("\nCall:\n")
            print(x@call); cat("\n")
            cat("Clustering Methods:\n",clusterMethods(x),"\n\n")
            cat("Cluster sizes:\n",nClusters(x),"\n\n")
            cat("Validation measures:\n",measNames(x),"\n\n")
          })

setMethod("show","clValid",
          function(object) {
            cat("\nCall:\n")
            print(object@call); cat("\n")
            cat("Clustering Methods:\n",clusterMethods(object),"\n\n")
            cat("Cluster sizes:\n",nClusters(object),"\n\n")
            cat("Validation measures:\n",measNames(object),"\n\n")
          })

####################################################################
## Summary Method
####################################################################

setMethod("summary","clValid",
          function(object, digits = max(3,getOption("digits")-3)) {
            cat("\nClustering Methods:\n",clusterMethods(object),"\n\n")
            cat("Cluster sizes:\n",nClusters(object),"\n\n")
            cat("Validation Measures:\n")
            print(ftable(round(measures(object),digits),row.vars=c(3,1)))
            cat("\n")
            ## Find best scores
            ## APN, AD, ADM, Connectivity, FOM minimized
            ## BHI, BSI, Dunn, Silhouette maximized
            measNames <- measNames(object)
            best <- numeric(length(measNames))
            bestMeth <- character(length(measNames))
            bestNc <- character(length(measNames))
            names(best) <- names(bestMeth) <- names(bestNc) <- measNames
            minmeas <- c("APN", "AD", "ADM", "FOM", "Connectivity")
            maxmeas <- c("BHI","BSI","Dunn","Silhouette")
            ## Measures to minimize
            if (any(a <- minmeas%in%measNames)) {
              best[minmeas[a]] <- apply(measures(object)[minmeas[a],,,drop=FALSE],1,min,na.rm=TRUE)
              bestInd <- apply(measures(object)[minmeas[a],,,drop=FALSE],1,function(x) which(x==min(x,na.rm=TRUE),arr.ind=TRUE)[1,])
              bestNc[minmeas[a]] <- nClusters(object)[bestInd[1,]]
              bestMeth[minmeas[a]] <- clusterMethods(object)[bestInd[2,]]
            }
            ## Measures to maximize
            if (any(a <- maxmeas%in%measNames)) {
              best[maxmeas[a]] <- apply(measures(object)[maxmeas[a],,,drop=FALSE],1,max,na.rm=TRUE)
              bestInd <- apply(measures(object)[maxmeas[a],,,drop=FALSE],1,function(x) which(x==max(x,na.rm=TRUE),arr.ind=TRUE)[1,])
              bestNc[maxmeas[a]] <- nClusters(object)[bestInd[1,]]
              bestMeth[maxmeas[a]] <- clusterMethods(object)[bestInd[2,]]
            }
            
            cat("Optimal Scores:\n\n") 
            print(data.frame("Score"=round(best,digits),"Method"=bestMeth,"Clusters"=bestNc), right=FALSE)
            cat("\n")
          })



####################################################################
## Plot Method
####################################################################

setMethod("plot",c("clValid","missing"),
          function(x,y,measures=measNames(x), legend=TRUE, legendLoc="topright", main=NULL,
                   pch=NULL, type="b", ask=prod(par("mfcol")) < length(measures) && dev.interactive(), ...) {
            measures <- match.arg(measures,measNames(x),several.ok=TRUE)
            methods <- clusterMethods(x)
            nclust <- nClusters(x)
            k <- length(methods)
            if (ask) {
              op <- par(ask = TRUE)
              on.exit(par(op))
            }
            ##        if(is.null(main))
            ##          main <- paste("Validation Measures for ", deparse(substitute(x, sys.frame(-1))))
            if (is.null(pch)) 
              pch <- c(paste(c(1:9, 0)), letters)[1:k]            
            for(i in 1:length(measures)) {
              if (is.null(main)) {
                main <- switch(measures[i],
                               APN="Stability validation",
                               AD="Stability validation",
                               ADM="Stability validation",
                               FOM="Stability validation",
                               Connectivity="Internal validation",
                               Dunn="Internal validation",
                               Silhouette="Internal validation",
                               BHI="Biological validation",
                               BSI="Biological validation")
              }
              matplot(measures(x)[measures[i],,],type=type,ylab=measures[i],
                      xlab="Number of Clusters",col=1:k,
                      lty=1:k,main=main,xaxt="n", pch=pch, ...)
              axis(1,at=1:length(nclust),labels=nclust)
              if(legend) legend(x=legendLoc,methods,lty=1:k,col=1:k, pch=pch,...)
            }
          })
#####################################################################











#####################################################################################
#####################################################################################
#####################################################################################
## clValid Functions
#####################################################################################
#####################################################################################
#####################################################################################

#####################################################################
## Functions for Validation Measures
## Stability, Internal, and Biological
#####################################################################

#####################################################################
## Stability Measures
####################################################################
## APN, AD, ADM, and FOM
## All measures in [0,infty] and should be minimized
#####################################################################


stability <- function(mat, Dist=NULL, del, cluster, clusterDel, method="euclidean") {
  
  any.na <- any(is.na(mat))
  obsNum <- 1:nrow(mat)
  nc1 <- length(table(cluster))
  nc2 <- length(table(clusterDel))
  stabmeas <- numeric(4)
  names(stabmeas) <- c("APN","AD","ADM","FOM")
  
  ## measure APN
  ## calculate a ncxnc matrix of proportion of non-overlaps in the two collection of nc clusters
  overlap <- xtabs(~cluster + clusterDel)
  ## measure AD
  ## calculate a ncxnc matrix of average-distance in the two collection of nc clusters
  dij <- matrix(rep(NA,nc1*nc2),nc1,nc2)
  
  if (is.null(Dist)) matDist <- as.matrix(dist(mat, method=method))
  if (is(Dist,"dist")) matDist <- as.matrix(Dist)
  if (is(Dist,"matrix")) matDist <- Dist
  
  ## measure ADM
  ## calculate a ncxnc matrix of distance-average in the two collection of nc clusters
  dij2 <- matrix(rep(NA,nc1*nc2),nc1,nc2)
  ii <- 1
  for (i in sort(unique(cluster))) {
    jj <- 1
    xbari <- apply(mat[cluster==i,,drop=FALSE],2, function(x) mean(x, na.rm=TRUE))
    for (j in sort(unique(clusterDel))) {
      ## measure AD
      clusi <- obsNum[cluster==i]
      clusdelj <- obsNum[clusterDel==j]
      cl <- length(clusi)*length(clusdelj)
      if (cl>0) dij[ii,jj] <- mean(matDist[clusi,clusdelj], na.rm=TRUE)
      ##      if (cl>0) dij[ii,jj] <- mean(as.matrix(Dist)[clusi,clusdelj])
      ## measure ADM
      xbarj <- apply(mat[clusterDel==j,,drop=FALSE],2, function(x) mean(x, na.rm=TRUE))
      diff <- xbari-xbarj
      if(length(diff)>0) {
        if(any.na) {
          diff <- diff[!is.na(diff)]
          dij2[ii,jj] <- sqrt(mean(diff^2))
        } else {
          dij2[ii,jj] <- sqrt(sum(diff^2))
        }
      } else {
        dij2[ii,jj] <- 0
      }
      jj <- jj+1
    }
    ii <- ii+1
  }
  rs <- matrix(rowSums(overlap),nrow=nrow(overlap),ncol=ncol(overlap),byrow=FALSE)
  cs <- matrix(colSums(overlap),nrow=nrow(overlap),ncol=ncol(overlap),byrow=TRUE)
  stabmeas["APN"] <- 1-sum(overlap^2/rs)/sum(overlap)
  stabmeas["AD"] <- sum(overlap*dij)/nrow(mat)
  stabmeas["ADM"] <- sum(overlap*dij2)/nrow(mat)
  xbar <- tapply(mat[,del],clusterDel, function(x) mean(x, na.rm=TRUE))
  stabmeas["FOM"] <- sqrt(mean((mat[,del]-xbar[as.character(clusterDel)])^2, na.rm=TRUE))/sqrt((nrow(mat)-nc1)/nrow(mat))
  return(stabmeas)
}



####################################################################
## Internal Validation Functions
####################################################################
## Silhouette
## Connectivity
## Dunn index
####################################################################



########################################################################
## Silhouette
## Value in [-1,1] to be maximized
## NOTE! cluster library also has 'silhouette' function!
## Probably use that instead
#####################################################################


mysilhouette <- function(distance=NULL, clusters, Data=NULL, method="euclidean"){
  
  if (is.null(distance)) distance <- as.matrix(dist(Data, method=method))
  if (is(distance,"dist")) distance <- as.matrix(distance)
  dista <- apply(distance,2,function(x) tapply(x, clusters, function(x) na.rm=TRUE))
  nc <- ncol(dista); nr <- nrow(dista);
  a <- dista[matrix(c(clusters,1:nc),ncol=2,nrow=nc)]
  distb <- matrix(dista[-(clusters+(0:(nc-1))*nr)],ncol=nc,nrow=(nr-1))
  b <- apply(distb,2, function(x) min(x, na.rm=TRUE))
  s <- (b-a)/pmax(a,b)
  return(mean(s))
}



##########################################################################
## Connectivity
## Value in [0,infty] to be *minimized*
#####################################################################

connectivity <- function(distance=NULL, clusters, Data=NULL, neighbSize=10, method="euclidean"){
  
  if (is.null(distance) & is.null(Data)) stop("One of 'distance' or 'Data' is required")
  if (is.null(distance)) distance <- as.matrix(dist(Data, method=method))
  if (is(distance,"dist")) distance <- as.matrix(distance)
  nearest <- apply(distance,2,function(x) sort(x,ind=TRUE)$ix[2:(neighbSize+1)])
  nr <- nrow(nearest);nc <- ncol(nearest)
  same <- matrix(clusters,nrow=nr,ncol=nc,byrow=TRUE)!=matrix(clusters[nearest],nrow=nr,ncol=nc)
  conn <- sum(same*matrix(1/1:neighbSize,nrow=nr,ncol=nc))
  return(conn)
}



##########################################################################
## Dunn index
## Value in [0,infty], to be maximized
#####################################################################

dunn <- function(distance=NULL, clusters, Data=NULL, method="euclidean"){
  
  if (is.null(distance) & is.null(Data)) stop("One of 'distance' or 'Data' is required")
  if (is.null(distance)) distance <- as.matrix(dist(Data, method=method))
  if (is(distance,"dist")) distance <- as.matrix(distance)
  nc <- max(clusters)
  interClust <- matrix(NA, nc, nc)
  intraClust <- rep(NA, nc)
  
  for (i in 1:nc) {
    c1 <- which(clusters==i)
    for (j in i:nc) {
      if (j==i) intraClust[i] <- max(distance[c1,c1])
      if (j>i) {
        c2 <- which(clusters==j)
        interClust[i,j] <- min(distance[c1,c2])
      }
    }
  }
  dunn <- min(interClust,na.rm=TRUE)/max(intraClust)
  return(dunn)
}

#####################################################################
## Biological validation functions
####################################################################
## BHI (homogeneity index)
## BSI (stability index)
## Requires annotate and GO packages
#####################################################################

## read in csv file for biological annotation
readAnnotationFile <- function(filename) {
  if(!file.exists(filename))
    stop(paste(filename, "doesn't exist"))
  infile <- scan(filename, what="character")
  res <- lapply(infile, function(x) strsplit(x, ",")[[1]][-1])
  ## remove blank entries
  res <- lapply(res, function(x) x[!x%in%""])
  names(res) <- sapply(infile, function(x) strsplit(x, ",")[[1]][1])
  return(res)
}
## NOTE: returned value will be a LIST ...
## Is ok, since converted automatically to matrix in clValid function ...

## Convert annotation list to annotation TF matrix
annotationListToMatrix <- function(annotation, genenames) {
  annotation.matrix <- matrix(FALSE, ncol=length(annotation), nrow=length(genenames))
  colnames(annotation.matrix) <- names(annotation)
  rownames(annotation.matrix) <- genenames
  for ( i in 1:length(annotation) )
  {
    annot <- names(annotation)[i]
    genes <- as.character(annotation[[i]][!is.na(annotation[[i]])])
    genes.common <- intersect(genenames, genes)
    annotation.matrix[genes.common, annot ] <- TRUE
  }
  return(annotation.matrix)
}


## NOTE: could return BHI VECTOR (length=#stat clusters), take mean in clValid fn.
BHI <- function(statClust,annotation,names=NULL,category="all",dropEvidence=NULL) {
  
  ## Case 1
  ## Biological clusters provided by user
  if(is.matrix(annotation)) {
    ## initialize BHI vector to 0s
    bhi <- numeric(length(unique(statClust)))
    names(bhi) <- unique(statClust)
    
    ## for each statClust
    for ( k in unique(statClust) )
    {
      Ck.bhi <- 0
      Ck.idx <- which(statClust==k) # row indices of this statClust Ck
      
      if ( length(Ck.idx)<2 ) next # only one gene, skip
      
      ## for each gene in this statClust
      for ( i in Ck.idx )
      {
        ## ... count how many other genes j in Ck share any (1 or more)
        ## of gene i's annotations:
        
        B <- which(annotation[i,]==TRUE) # get indices of i's annotations
        if ( length(B)==0 ) next # gene i has no annotation
        
        ## gene's annotations of all other genes j in statClust Ck
        annot <- annotation[Ck.idx[ Ck.idx!= i ],B]
        ## ... add number of genes with at least one shared annotation
        if ( length(B)==1 )      Ck.bhi <- Ck.bhi + sum(annot)
        else if ( length(B) >1 ) Ck.bhi <- Ck.bhi + sum(rowSums(annot)>0)
      }
      
      nk <- sum(rowSums(annotation[Ck.idx,])>0) # nr. of annot. feat. in Ck
      if ( nk>1 ) bhi[k] <- Ck.bhi / (nk*(nk-1))
    }
    return(mean(bhi, na.rm=TRUE))
  }
  
  
  ## Case 2
  ## Name of annotation package provided by user
  ## Gene names assumed to correspond with rownames
  ## Requires Biobase, annotation
  ## Gene names and type of id provided by user
  ##  category <- match.arg(category,c("all","BP","CC","MF"))
  ##  expr <- parse(text="require(x, character.only=TRUE)")[[1]]
  ##  expr <- do.call("substitute", list(expr, list(x=annotation)))
  
  ##### NOTE: COMMENTED OUT THIS SECTION 3/31/2020
  ##### biocLite replaced with BiocManager
  # if(!require(annotation, character.only=TRUE)) {
  #   cat(paste("package",annotation,"not found, attempting download from Bioconductor\n",
  #             sep=" "))
  #   source("http://bioconductor.org/biocLite.R")
  #   res <- try(biocLite(annotation))
  #   if(class(res)=="try-error") {
  #     stop(paste("attempted download of package", annotation, "failed, exiting"))
  #   } else {
  #     library(annotation, character.only=TRUE)
  #   }
  # }
  
  ##### Removed this check from comments
  ##### Replaced "require" with "requireNamespace" 3/31/2020
  if(!requireNamespace(annotation,character.only=TRUE)) {
    stop(paste("package",annotation,"not found",sep=" "))
  }else{
    requireNamespace(annotation,character.only=TRUE)
  }
  
  ##### Added this check for suggested "annotate" package 3/31/2020
  if(!requireNamespace("annotate")) {
    stop(paste("package annotate not found",sep=" "))
  }else{
    requireNamespace("annotate")
  }
  #require(annotate)
  ## Get GO terms associated with each probe ID
  goTerms <- annotate::getGO(names,annotation)
  if (!is.null(dropEvidence))
    goTerms <- lapply(goTerms, annotate::dropECode, dropEvidence)
  ## Find biological homogeneity for each stat cluster, then take average
  bhi <- tapply(goTerms,statClust,function(x)  matchGO(x,category))
  return(mean(bhi, na.rm=TRUE))
  ## prev: return(mean(bhi[bhi!=-9]))
} ## End BHI function


matchGO <- function(gg,category) {
  ## x[1] is x$GOID, x[3] is x$Ontology
  ## extracts the GO_ID and Ont for each GO term associated with each probe ID
  goIDs <- lapply(gg, function(a) sapply(a, function(x) x[1]))
  ont   <- lapply(gg, function(a) sapply(a, function(x) x[3]))
  switch(category,
         ## 1. Find which probes have corresponding annotation
         ## 2. Extract only subset w/appropriate annotation
         BP = {
           goBP <- sapply(ont, function(x) any(x%in%"BP"))
           goIDs <- goIDs[goBP]
         },
         CC = {
           goCC <- sapply(ont, function(x) any(x%in%"CC"))
           goIDs <- goIDs[goCC]
         },
         MF = {
           goMF <- sapply(ont, function(x) any(x%in%"MF"))
           goIDs <- goIDs[goMF]
         },
         all = {
           ## goAll <- sapply(goIDs, function(a) all(sapply(a,function(x) is.null(x))))
           ## changed 11/17/09
           goAll <- sapply(ont, function(x) any(x%in%c("BP","CC","MF")))
           goIDs <- goIDs[goAll]
         })
  ## Number of annotated probes
  ## If only one annotated probe, then skip this cluster
  n <- length(goIDs)
  if (n<2) return(NA) ## previously: return(-9)
  sum <- 0
  ## Now count overlap of annotation categories for different probes/genes
  for (i in 1:(length(goIDs)-1)) {
    for (j in (i+1):length(goIDs)) {
      switch(category,
             all =  sum <- sum + any(goIDs[[i]]%in%goIDs[[j]]),
             BP  =  sum <- sum + any(goIDs[[i]][ont[[i]]=="BP"]%in%goIDs[[j]][ont[[j]]=="BP"]),
             CC  =  sum <- sum + any(goIDs[[i]][ont[[i]]=="CC"]%in%goIDs[[j]][ont[[j]]=="CC"]),
             MF  =  sum <- sum + any(goIDs[[i]][ont[[i]]=="MF"]%in%goIDs[[j]][ont[[j]]=="MF"]))
    }
  }
  return(sum/(n*(n-1)))
} ## End matchGO function


BSI <- function(statClust,statClustDel,annotation,names=NULL,category="all",
                goTermFreq=0.05, dropEvidence=NULL) {
  
  ## Case 1
  ## Biological clusters provided by user
  if(is.matrix(annotation)) {
    nFC <- ncol(annotation)
    nAnnot <- apply(annotation, 2, sum)
    if(is.null(names(statClust))) {
      names(statClust) <- names
      names(statClustDel) <- names
    }
    bsi <- numeric(nFC)
    overlap <- xtabs(~statClust + statClustDel)
    rsums <- rowSums(overlap)
    for (FCk in 1:nFC)
    {
      osum <- 0
      g.idx <- which(annotation[,FCk]) ## find which genes are annotated
      if (length(g.idx)<2) next ## only one gene, skip
      for (gx in g.idx) {
        for (gy in g.idx) {
          if (gx != gy) {
            i <- statClust[gx]
            j <- statClustDel[gy]
            osum <- osum + overlap[i,j]/rsums[i]
          }
        }
      }
      bsi[FCk] <- osum/(nAnnot[FCk]*(max(nAnnot[FCk]-1,1)))
    }
    return(mean(bsi, na.rm=TRUE))
  }
  
  ## Case 2
  ## Gene names and type of id provided by user
  ## For option 2 of BSI need to determine how many GO terms to use
  ## Cutoff determined by goTermFreq (cuts all GO terms w/ < 5% freq in Data set)
  ##  category <- match.arg(category,c("all","BP","CC","MF"))
  
  tab <- xtabs(~statClust + statClustDel)
  rs <- rowSums(tab)
  n <- length(statClust)
  
  ##### NOTE: COMMENTED OUT THIS SECTION 3/31/2020
  ##### biocLite replaced with BiocManager
  # if(!require(annotation,character.only=TRUE)) {
  #   cat(paste("package",annotation,"not found, attempting download from Bioconductor\n",
  #             sep=" "))
  #   source("http://bioconductor.org/biocLite.R")
  #   try(biocLite(annotation))
  # }
  
  ##### Removed this check from comments
  ##### Replaced "require" with "requireNamespace" 3/31/2020
  if(!requireNamespace(annotation,character.only=TRUE)) {
    stop(paste("package",annotation,"not found",sep=" "))
  }else{
    requireNamespace(annotation,character.only=TRUE)
  }
  
  ##### Added this check for suggested "annotate" package 3/31/2020
  if(!requireNamespace("annotate")) {
    stop(paste("package annotate not found",sep=" "))
  }else{
    requireNamespace("annotate")
  }
  
  goTerms <- annotate::getGO(names,annotation)
  if (!is.null(dropEvidence))
    goTerms <- lapply(goTerms, annotate::dropECode, dropEvidence)
  
  ## Things to do
  ## 1. extract all relevant GO terms for each gene (bp,cc, etc)
  ## 2. get freq table of terms - keep most frequent
  ## 3. create indicator matrix for each term indicating which genes have that term
  
  switch(category,
         ## x[1] is x$GOID, x[3] is x$Ontology
         BP = goIDs <- sapply(goTerms, function(a) sapply(a, function(x) x[1][x[3]=="BP"])),
         CC = goIDs <- sapply(goTerms, function(a) sapply(a, function(x) x[1][x[3]=="CC"])),
         MF = goIDs <- sapply(goTerms, function(a) sapply(a, function(x) x[1][x[3]=="MF"])),
         all = goIDs <- sapply(goTerms, function(a) sapply(a, function(x) x[1])))
  
  goTab <- table(unlist(goIDs))
  keepTerms <- names(goTab)[goTab>floor(n*goTermFreq)]
  termMat <- matrix(0,ncol=length(keepTerms),nrow=n)
  ## 09/27/09 - fixed issue by using unlist(x) in sapply
  for (i in 1:length(keepTerms)) {
    termMat[,i] <- sapply(goIDs, function(x) keepTerms[i]%in%unlist(x))
  }
  
  bsi <- apply(termMat,2, function(a) {
    out <- outer(statClust[as.logical(a)],statClustDel[as.logical(a)], function(x,y) tab[cbind(x,y)]/rs[x])
    (sum(out) - sum(diag(out)))/(sum(a)*(ifelse(sum(a)>1,sum(a)-1,1)))
  })
  
  return(mean(bsi, na.rm=TRUE))
  
}


#####################################################################
## Functions for clustering - SOTA
#####################################################################


sota.init <- function(data){
  nodes <- matrix(0, nrow(data)*2, 3+ncol(data))
  if(is.null(colnames(data)))
    colnames(data) <- paste("V", 1:ncol(data))
  colnames(nodes) <- c("ID", "anc", "cell", colnames(data))
  nodes[,"ID"]=1:(nrow(data)*2)
  
  nodes[1,] <- c(1, 0, 0, apply(data,2, function(x) mean(x, na.rm=TRUE)))
  nodes[2,] <- c(2, 1, 1, nodes[1,][-c(1,2,3)])
  nodes[3,] <- c(3, 1, 1, nodes[1,][-c(1,2,3)])
  return(nodes)
}

dist.fn <- function(input, profile, distance){
  if(distance=="correlation")
    return(1-cor(input,profile, use="pairwise.complete.obs"))
  else
    return(sqrt(sum((input-profile)^2)))
}

cl.ID <- function(clust, old.cl, new.cl){
  for(i in 1:length(clust))
    clust[i] <- new.cl[which(old.cl==clust[i])]
  clust
}

getResource <- function(data, tree, clust, distance, pr){
  dist <- rep(0, length(clust))
  resource <- rep(0, max(clust))
  
  for(i in unique(clust)){
    temp <- data[clust==i,]
    if(is.vector(temp))
      temp <- matrix(temp, nrow=1, ncol=ncol(data))
    if(distance=="correlation")
      resource[i] <- mean(apply(temp, 1, dist.fn, profile=tree[i,pr],
                                distance=distance))
    else
      resource[i] <- mean(apply(temp, 1, dist.fn, profile=tree[i,pr],
                                distance=distance))}
  resource
}

getCells <- function(tree, neighb.level, n){
  or.n <- n
  cells <- c(n-1,n)
  for(i in 1:(neighb.level+1)){
    n  <- tree[n, "anc"]
    if(n==1)
      break
  }
  for(j in 2:(or.n-2)){
    z <- j
    if(tree[j,"cell"]!=1)
      next
    while(z > 0){
      z <- tree[z, "anc"]
      if(z==n){
        cells <- c(cells, j)
        break}
    }
  }
  return(tree[cells,])
}


sota <- function(data, maxCycles, maxEpochs=1000, distance="euclidean",
                 wcell=.01, pcell=.005, scell=.001, delta=.0001, neighb.level=0,
                 maxDiversity = .9, unrest.growth=TRUE, ...){
  tree <- sota.init(data)
  pr <- 4:ncol(tree)
  n <- 3
  genes<- 1:nrow(data)
  clust <- rep(1, length(genes))
  Node.Split <- 1
  
  Res.V <- getResource(data, tree, clust, distance, pr)
  if(distance=="correlation")
    Res.V <- 1-Res.V
  diversity <- Res.V
  
  for(k in 1:maxCycles){                                #loop for the Cycles
    trainNode <- Node.Split
    trainSamp <- genes[clust==trainNode]
    curr.err <- 1e10
    ep <- 1
    
    while(ep <= maxEpochs){	      			#loop for the Epochs
      last.err <- 0
      left.ctr <- right.ctr <- 0
      left.d <- right.d <- 0
      for(i in trainSamp){
        cells <- tree[c(n-1,n),]
        dist <- rep(0, nrow(cells))
        for(j in 1:2)
          dist[j] <- dist.fn(data[i,], cells[j,pr], distance=distance)
        
        or <- which.min(dist)
        if(or==1)
          left.ctr <- left.ctr + 1
        else
          right.ctr<- right.ctr + 1
        
        closest <- cells[or,1]
        sis <- ifelse(closest%%2==0,closest+1,closest-1)
        sis.is.cell <- ifelse(tree[sis,"cell"]== 1, 1, 0)
        
        ##   updating the cell and its neighbourhood
        if(sis.is.cell==1){
          parent <- tree[closest, "anc"]
          tree[closest, pr] <- tree[closest, pr]+wcell*(data[i,]-tree[closest, pr])
          tree[sis, pr] <- tree[sis, pr]+scell*(data[i,]-tree[sis,pr])
          tree[parent, pr] <- tree[parent, pr]+pcell*(data[i,]-tree[parent, pr])
        }
        else
        {
          tree[closest, pr] <- tree[closest, pr]+wcell*(data[i,]-tree[closest, pr])
        }
      }
      cells <- tree[c(n-1,n),]
      for(i in trainSamp){
        for(j in 1:2)
          dist[j] <- dist.fn(data[i,], cells[j,pr], distance=distance)
        last.err <- last.err+min(dist)}
      last.err <- last.err/length(trainSamp)
      
      if(ifelse(last.err==0, 0, abs((curr.err-last.err)/last.err)) < delta
         && left.ctr !=0 && right.ctr !=0)
        break
      ep <- ep + 1
      curr.err <- last.err
    }
    clust <- assignGenes(data, trainSamp, clust, tree, n, distance, pr, neighb.level)
    Res.V <- getResource(data, tree, clust, distance, pr)
    if(distance=="correlation")
      Res.V <- 1-Res.V
    
    tempRes <- Res.V
    tempRes[tempRes == 0] <- diversity[tempRes==0]
    diversity <- tempRes
    
    if(k==maxCycles || (max(Res.V) < maxDiversity & unrest.growth==FALSE))
      break  ## do not split the cell
    newCells <- splitNode(Res.V, tree, n)
    tree <- newCells$tree
    n <- newCells$n
    Node.Split <- newCells$toSplit
  }
  
  tree <- trainLeaves(data, tree, clust, pr, wcell, distance, n, delta)
  Res.V <- getResource(data, tree, clust, distance, pr)
  Res.V <- Res.V[Res.V!=0]
  if(distance=="correlation")
    Res.V <- 1-Res.V
  
  diversity[(length(diversity)-length(Res.V)+1):length(diversity)] <- Res.V
  
  treel <- tree[tree[,"cell"]==1,]
  old.cl <- treel[,1]
  treel[,1] <- 1:nrow(treel)
  old.clust <- clust
  
  clust <- cl.ID(old.clust, old.cl, 1:nrow(treel))
  totals <- table(clust)
  
  out <- list(data=data, c.tree=cbind(tree[1:n,],Diversity=diversity), tree=treel, clust=clust,
              totals=totals, dist=distance, diversity=Res.V)
  
  class(out) <- "sota"
  return(out)
}



trainLeaves <- function(data, tree, clust, pr, wcell, distance, n, delta){
  nc <- ncol(data)
  for(i in 1:n){
    if(!is.element(i, clust))
      next
    temp <- matrix(data[clust==i,], ncol=nc)
    converged <- FALSE
    init.err <- getCellResource(temp, tree[i,pr], distance)
    while(!converged){
      for(j in 1:nrow(temp))
        tree[i, pr] <- tree[i, pr]+wcell*(temp[j,]-tree[i, pr])
      
      last.err <- getCellResource(temp, tree[i,pr], distance)
      converged <- ifelse(abs((last.err-init.err)/last.err) < delta, TRUE, FALSE)
      init.err <- last.err
    }
  }
  return(tree)
}


assignGenes <- function(data, Sample, clust, tree, n, distance, pr, neighb.level){
  if(neighb.level==0)
    cells <- tree[c(n-1,n),]
  else
    cells <- getCells(tree, neighb.level, n)
  
  for(i in Sample){
    dist <- rep(0, nrow(cells))
    for(j in 1:nrow(cells))
      dist[j] <- dist.fn(data[i,], cells[j,pr], distance)
    or <- which.min(dist)
    closest <- cells[or,1]
    clust[i] <- closest
  }
  clust
}

splitNode <- function(Res.V, tree, n){
  maxheter <- which.max(Res.V)
  cl.to.split <- tree[maxheter,1]
  tree[n<-n+1,-1] <- tree[cl.to.split,-1]
  tree[n, "anc"] <- cl.to.split
  tree[n<-n+1,-1] <- tree[cl.to.split,-1]
  tree[n, "anc"] <- cl.to.split
  tree[cl.to.split, "cell"] <- 0
  return(list(tree=tree, n=n, toSplit=cl.to.split))
}



getCellResource <- function(temp, profile, distance){
  if(distance=="correlation")
    resource <- mean(apply(temp, 1, dist.fn, profile,
                           distance=distance))
  else(distance=="euclidean")
  resource <- mean(apply(temp, 1, dist.fn, profile,
                         distance=distance))
  resource
}

print.sota <- function(x, ...){
  results <- as.matrix(cbind(as.numeric(names(x$totals)), as.numeric(x$totals),
                             x$diversity))  ## changed
  colnames(results) <- c("ID","Size", "Diversity")
  rownames(results) <- rep("", nrow(results))
  cat("\nClusters:\n")
  print(results, ...)
  cat("\nCentroids:\n")
  
  print(format(data.frame(x$tree[,-c(1:3)])), ...)
  cat("\n")
  cat(c("Distance: ", x$dist, "\n"))
  invisible(x)
}


plot.sota <- function(x, cl=0, ...){
  
  op <- par(no.readonly=TRUE)
  on.exit(par(op))
  if(cl!=0)
    par(mfrow=c(1,1)) else
    {
      pdim <- c(0,0)
      for(i in 1:100){
        j <- i
        if(length(x$totals) > i*j)
          j <- j+1
        else{
          pdim <- c(i,j)
          break}
        if(length(x$totals) > i*j)
          i <- i+1
        else{
          pdim <- c(i,j)
          break}
      }
      par(mfrow=pdim)
    }
  
  ylim = c(min(x$data), max(x$data))
  pr <- 4:ncol(x$tree)
  if(cl==0)
    cl.to.print <- 1:length(table(x$clust)) else  ## changed
      cl.to.print <- cl
  cl.id <- sort(unique(x$clust))  ## changed
  
  for(i in cl.to.print){
    plot(1:ncol(x$data), x$tree[i, pr], col="red", type="l",
         ylim=ylim, xlab=paste("Cluster ",i), ylab="Expr. Level", ...)
    legend("topleft", legend=paste(x$totals[i], " Genes"), cex=.7,
           text.col="navy", bty="n")
    cl <- x$data[x$clust==cl.id[i],]  ## changed
    if(is.vector(cl))
      cl <- matrix(cl, nrow=1)
    for(j in 1:x$totals[i])
      lines(1:ncol(x$data), cl[j,], col="grey")
    lines(1:ncol(x$data), x$tree[i, pr], col="red", ...)
    
  }
}


##################################################################################
## Functions for Rank Aggregation
## 03/08/2009
## NOTE: Will encorporate this into RankAggreg instead
##################################################################################

getRanksWeights <- function(clVObj, measures=measNames(clVObj), nClust=nClusters(clVObj),
                            clAlgs=clusterMethods(clVObj)) {
  measures <- match.arg(measures,measNames(clVObj),several.ok=TRUE)
  nClust <- as.character(nClust)
  nClust <- match.arg(nClust,nClusters(clVObj),several.ok=TRUE)
  nClust <- as.character(nClust)
  clAlgs <- match.arg(clAlgs,clusterMethods(clVObj),several.ok=TRUE)
  meas <- clVObj@measures[measures, nClust, clAlgs, drop=FALSE]
  ranks <- matrix(0, dim(meas)[1], dim(meas)[2]*dim(meas)[3])
  colnames(ranks) <- 1:ncol(ranks)
  rownames(ranks) <- rownames(meas)
  weights <- ranks
  
  tmp <- expand.grid(dimnames(meas)[[2]], dimnames(meas)[[3]])
  algs <- paste(tmp[,2], tmp[,1], sep="-")
  ## measures <- rownames(meas)
  
  for(i in 1:nrow(meas)){
    if(measures[i] %in% c("Dunn", "Silhouette"))
      incr <- TRUE
    else
      incr <- FALSE
    sorted <- sort(meas[i,,], ind=TRUE, decreasing=incr)
    ranks[i,] <- algs[sorted$ix]
    weights[i,] <- sorted$x
  }
  list(ranks=ranks, weights=weights)
}
Any scripts or data that you put into this service are public.
optCluster documentation built on April 16, 2022, 5:05 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
optCluster
Determine Optimal Clustering Algorithm and Number of Clusters

R/orphaned-Code.R
In optCluster: Determine Optimal Clustering Algorithm and Number of Clusters

Defines functions getRanksWeights plot.sota print.sota getCellResource splitNode assignGenes trainLeaves sota getCells getResource cl.ID dist.fn sota.init BSI matchGO BHI annotationListToMatrix readAnnotationFile dunn connectivity mysilhouette stability

Try the optCluster package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

optCluster Determine Optimal Clustering Algorithm and Number of Clusters

R/orphaned-Code.R In optCluster: Determine Optimal Clustering Algorithm and Number of Clusters

Defines functions getRanksWeights plot.sota print.sota getCellResource splitNode assignGenes trainLeaves sota getCells getResource cl.ID dist.fn sota.init BSI matchGO BHI annotationListToMatrix readAnnotationFile dunn connectivity mysilhouette stability

Try the optCluster package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

optCluster
Determine Optimal Clustering Algorithm and Number of Clusters

R/orphaned-Code.R
In optCluster: Determine Optimal Clustering Algorithm and Number of Clusters