R/Anthropometry-internalHipamAnthropom.R

Defines functions full2dist update.tree.local hipam.local collapse.step collapse.cluster partition.cluster further.clustering asw.calc cluster.search initial.clustering pamsam ext.dist initialize.tree

Documented in asw.calc cluster.search collapse.cluster collapse.step ext.dist full2dist further.clustering hipam.local initial.clustering initialize.tree pamsam partition.cluster update.tree.local

#All these functions are common to the HIPAM_{MO} and HIPAM_{IMO} algorithms.
initialize.tree <- function(x, maxsplit, orness, type, ah=c(23,28,20,25,25), verbose, ...){
 if(type == "MO"){
  x.ps <- getBestPamsamMO(x, maxsplit, orness, type, ah, verbose, ...)
 }else{
   x.ps <- getBestPamsamIMO(x, maxsplit, orness, type, ah, verbose, ...)
  }
  n.clust <- length(unique(x.ps$clustering))
  list(clustering = x.ps$clustering, asw = x.ps$asw, n.levels = 1, medoids = x.ps$medoids, active = 
         rep(TRUE,n.clust), development = matrix(1:n.clust), n.clust = n.clust, metric = "McCulloch")
}


ext.dist <- function(x, maxsplit, orness, ah, verbose){

 num.variables <- dim(x)[2]
 w <- weightsMixtureUB(orness, num.variables)

 num.personas <- dim(x)[1]
 datosm <- as.matrix(x)
 datost <- aperm(datosm, c(2,1))                     
 dim(datost) <- c(1, num.personas * num.variables)   
 rm(datosm)

 bh <- (apply(as.matrix(log(x)), 2, range)[2,] - apply(as.matrix(log(x)), 2, range)[1,]) / ((maxsplit - 1) * 8)
 if(any(bh==0)){ 
   bh[which(bh == 0)]=0.0000000001
 }
 bl <- -3 * bh
 ah <- ah
 al <- 3 * ah

 DIST <- getDistMatrix(datost, num.personas, num.variables, w, bl, bh, al, ah, verbose) 

  if( is.null(dimnames(x)[1]) ) dimnames(DIST) <- NULL
 
 return(DIST)
}


pamsam <- function(x, k = k, type = type, DIST, maxclust = 50, further.maxclust = 20, exhaustive = FALSE, 
                   max.check = 20, total.check = 100, maxsplit = maxsplit, orness = orness, 
                   ah, verbose){
 
 if(type == "MO"){                 
  #Calculate DIST:
  DIST <- ext.dist(x, maxsplit = k, orness = orness, ah, verbose)
 }else{
   DIST <- DIST
  }

 #Initial clustering:
 init.clust <- initial.clustering(x, k, DIST = DIST, maxclust, exhaustive)
 if(!is.na(k)){
  clustering <- init.clust$clustering
  medoids <- init.clust$medoids
  asw.profile <- init.clust$asw.profile
  sesw.profile <- init.clust$sesw.profile
  asw <- asw.profile
  sesw <- sesw.profile
  furth.info <- NULL
 }else{
   k <- init.clust$num.of.clusters
   asw.profile <- init.clust$asw.profile
   sesw.profile <- init.clust$sesw.profile
   asw <- asw.profile[k]
   sesw <- sesw.profile[k]
   medoids <- init.clust$medoids
   clustering <- init.clust$clustering


   #Further partitioning, ordering and collapsing:
   furth.clust <- further.clustering(x, k, DIST = DIST, medoids, clustering, asw, sesw, max.check, total.check, 
                                     maxclust = further.maxclust, exhaustive, maxsplit = maxsplit, 
                                     orness = orness, ah = ah, verbose)
   medoids <- furth.clust$medoids
   clustering <- furth.clust$clustering
   k <- furth.clust$num.of.clusters
   asw <- furth.clust$asw
   furth.info <- list(k.development = furth.clust$k.info, asw.development = furth.clust$asw.info, 
                      sesw.development = furth.clust$sesw.info, total.checks = furth.clust$total.checks)
  }
  if( !is.null(dimnames(x)[[1]]) ) names(clustering) <- dimnames(x)[[1]]


  #OUTPUT:
  list(medoids = medoids, clustering = clustering, asw = asw, num.of.clusters = k, info = furth.info, 
       profiles = list(asw.profile = asw.profile, sesw.profile = sesw.profile), metric = "McCulloch" )
}




#INITIAL CLUSTERING#
#Search function for finding the best clustering of x using PAM or CLARA.  Uses a grid or exhaustive search up 
#to maxclust unless k is given.
initial.clustering <- function(x, k, DIST, maxclust, exhaustive){

 n.obj <- nrow(x)
 
 DIST.init <- full2dist(DIST)

 x.clust <- cluster.search(x, k, DIST = DIST.init, maxclust, exhaustive)

 list(medoids = x.clust$medoids, clustering = x.clust$clustering,asw.profile = x.clust$asw.profile, 
      sesw.profile = x.clust$sesw.profile, num.of.clusters = x.clust$num.of.clusters)
}

cluster.search <- function(x, k, DIST = NULL, maxclust = 50, exhaustive){
   
 n.obj <- nrow(x)
 #Start:
 if(n.obj <= 2 | !is.na(k)) {  #When k is given or < 3 objects.
  if(n.obj <= 2) {
   x.medoids <- x
   k <- n.obj
   x.clustering <- 1:k
   asw.profile <- NA
   sesw.profile <- NA
   names(asw.profile) <- list( k )
   names(sesw.profile) <- list( k )
  }else{
    x.clust <- pam(DIST, k, diss=TRUE)
    asw.profile <- x.clust$silinfo$avg.width
    sesw.profile <- NA
          
    names(asw.profile) <- list( k )
    names(sesw.profile) <- list( k )
    maxclust <- k
   }
 }else{  #Estimate k using average silhouette width:
   maxclust <- min( maxclust, n.obj - 1 )

   if(maxclust > 12 & exhaustive == FALSE){
    gp <- sqrt(maxclust)
    gpr <- round(gp)
    gl <- floor(gp)
    gst <- floor( ((gpr - 2) + (maxclust - gpr * gl)) / 2 + 2 )
    grid <- seq(gst, by = gpr, length = gl)
    gridout <- c(1, grid, maxclust + 1)
   }else {
     grid <- 2:maxclust
     gpr <- 1
    }
        
   aswmax <- -1
   k <- 1
   asw.profile <- rep(-1,maxclust)
      
   sesw.profile <- rep(NA,maxclust) 
        
    for(i in grid){
     clusti <- pam(DIST, i, diss=TRUE)
     asw.profile[i]  <- clusti$silinfo$avg.width
        
      if(asw.profile[i] > aswmax) {
       aswmax <- asw.profile[i]
       k <- i
       x.clust <- clusti
      }
    }
     
   if(gpr > 1){
    kpos <- match(k,gridout)
    grid2 <- c((gridout[kpos - 1] + 1) : (k - 1), (k + 1) : (gridout[kpos + 1] - 1))

     for(i in grid2) {
      clusti <- pam(DIST, i, diss=TRUE)
      asw.profile[i] <- clusti$silinfo$avg.width
               
       if(asw.profile[i] > aswmax){
        aswmax <- asw.profile[i]
        k <- i
        x.clust <- clusti
       }
      }
    }  
  
  }
 #Check that k is a possible value:
    if(n.obj > 2) {
     if( !is.element(k,2:maxclust) ) {
      stop("Number of clusters found is wrong")
     }
      x.medoids <- x[x.clust$medoids,]
      x.clustering <- x.clust$clustering
    
    }
    list(medoids = x.medoids, clustering = x.clustering,asw.profile = asw.profile, sesw.profile = NA, 
         num.of.clusters = k)
}


##########
#ASW CALC#
#Calculates the average silhouette width of the data given a specific clustering. Note that asw.calc needs 
#to be fed the appropriate medoids.
asw.calc <- function(x, k, clustering, DIST = NULL){ 
  n.obj <- nrow(x)
   if(is.matrix(DIST) && (ncol(DIST)==n.obj & nrow(DIST)==n.obj)){
    col.size <- n.obj
    sil.mem <- clustering
    ind <- rep( clustering, col.size ) + rep( k * (0:(col.size-1)), each=n.obj )
    sumdists <- matrix(tapply(DIST,ind,sum),ncol=col.size)
   }
    
    identif <- matrix(0,k,col.size)
    identif[cbind(sil.mem,1:col.size)] <- 1
    clus.sizes <- tabulate(clustering)
    denominators <- matrix(rep(clus.sizes, col.size), ncol=col.size) - identif
    sil.widths <- sumdists / denominators
    ai <- sil.widths[identif == 1]
    bi <-  apply( matrix(sil.widths[identif == 0] , nrow = k - 1), 2, min)
    si <- (bi - ai) / apply(cbind(bi,ai),1,max)
    si[!is.finite(si)] <- 0
    #[gives 0 as value of a(i) when only 1 in cluster].
    asw <- mean(si)
    sesw <- sqrt( var(si) / col.size )
    list(asw = asw, sesw = sesw)
}


#####################
#FURTHER CLUSTERING#
#Further partitioning and collapsing; uses initial.clustering.
further.clustering <- function(x, k, DIST, medoids, clustering, asw, sesw, max.check, total.check, maxclust, 
                               exhaustive, maxsplit, orness, ah, verbose){
  
 n.obj <- nrow(x)
 itc <- 0
 isc <- 0
 clust.perm <- sample(k)
 k.info <- k
 asw.info <- asw
 sesw.info <- sesw
 prev.groups <- NULL
 prev.groups.index <- NULL
  
  while(isc < k & itc <= total.check){
   itc <- itc + 1
   isc <- isc + 1
   clust.i <- clust.perm[isc]
    if(runif(1) < 0.5) {
     poss.clust <- partition.cluster(x, k, DIST = DIST, clust.i, medoids, clustering, maxclust, exhaustive)
      if(poss.clust$asw <= asw) {
       poss.clust <- collapse.cluster(x, k, DIST = DIST, clust.i, medoids, clustering, asw, sesw, maxclust, 
                                      exhaustive, maxsplit, orness, ah, verbose) 
      }
    }else{
      poss.clust <- collapse.cluster(x, k, DIST = DIST, clust.i, medoids, clustering, asw, sesw, maxclust, 
                                     exhaustive, maxsplit, orness, ah, verbose)
       if(poss.clust$asw <= asw) {
        poss.clust <- partition.cluster(x, k, DIST = DIST, clust.i, medoids, clustering, maxclust, exhaustive)
       }
     }

   if(poss.clust$asw > asw) {
    k <- poss.clust$num.of.clusters
    asw <- poss.clust$asw
    sesw <- poss.clust$sesw
    medoids <- poss.clust$medoids
    clustering <- poss.clust$clustering
    k.info <- c(k.info, k)
    asw.info <- c(asw.info, asw)
    sesw.info <- c(sesw.info, sesw)
    isc <- 0 
    clust.perm <- sample(k)
   }
  }

    list(medoids = medoids, clustering = clustering, num.of.clusters = k, asw = asw, sesw = sesw, k.info = k.info, 
         asw.info = asw.info, sesw.info = sesw.info, total.checks = itc)
}


#PARTITION CLUSTER#
partition.cluster <- function(x, k, DIST = NULL, j, medoids, clustering, maxclust, exhaustive){

 if(length(clustering[clustering==j]) >= 1){
 #Partitioning:
  if(is.null(DIST)) {
   DIST.part <- NULL
  }else{
    DIST.part <- full2dist( DIST[clustering==j,clustering==j] )
   }
    
     clust.obj <- cluster.search(x[clustering==j,], k = NA, DIST = DIST.part, maxclust = maxclust, exhaustive)
     kj <- clust.obj$num.of.clusters
     medoids.j <- clust.obj$medoids
    
  #Ordering:
  clustering[clustering > j] <- clustering[clustering>j] + kj - 1
  clustering[clustering == j] <- j + clust.obj$clustering - 1
  if(j == 1)        medoids <- rbind(medoids.j, medoids[(j+1):k,])
  if(j > 1 & j < k) medoids <- rbind(medoids[1:(j-1),], medoids.j, medoids[(j+1):k,])
  if(j == k)        medoids <- rbind(medoids[1:(j-1),], medoids.j)

  k <- k + kj - 1 
  asw.obj <- asw.calc(x, k, clustering, DIST)
  asw <- asw.obj$asw
  sesw <- asw.obj$sesw
 }
 
 list(medoids = medoids, clustering = clustering,asw = asw, sesw = sesw, num.of.clusters = k)
}


##################
#COLLAPSE CLUSTER#
collapse.cluster <- function(x, k, DIST, j, medoids, clustering, asw, sesw, maxclust, exhaustive, maxsplit, 
                             orness, ah,verbose){

 if(k > 2){ 
  DIST.medoids <- ext.dist(medoids, maxsplit, orness = orness, ah, verbose) 
  cluster.weights <- 1/(DIST.medoids[j,-j] + .001)
  cluster.weights <- cluster.weights/sum(cluster.weights)
  chosen.order <- sample((1:k)[-j], prob = cluster.weights)
 } 
  n.check <- min(k-1,3)
  ip <- 0
   while(k > 2 & ip < n.check){ 
    ip <- ip + 1
    change1 <- j
    change2 <- chosen.order[ip]
    coll.step <- collapse.step(x, k, DIST, change1, change2, medoids, clustering, asw, sesw)
     if(coll.step$asw > asw) {
      medoids <- coll.step$medoids
      clustering <- coll.step$clustering
      asw <- coll.step$asw
      sesw <- coll.step$sesw
      k <- coll.step$num.of.clusters
      ip <- n.check + 1
     }
   }
  
  list(medoids = medoids, clustering = clustering,num.of.clusters = k, asw = asw, sesw = sesw)
}


#COLLAPSE STEP#
collapse.step <- function(x, k, DIST, change1, change2, medoids, clustering, asw, sesw){
 ch.low <- min(change1,change2)
 ch.high <- max(change1,change2)
  if(ch.high != ch.low){
   poss.clustering <- clustering
   poss.clustering[poss.clustering == ch.high] <- ch.low
   poss.clustering[poss.clustering > ch.high] <- poss.clustering[poss.clustering > ch.high] - 1
   poss.medoids <- medoids
   poss.medoids <- medoids[-ch.high,]
   x.mm <- x[poss.clustering==ch.low,]
      DIST.mm <- DIST[poss.clustering == ch.low, poss.clustering == ch.low]
      pos.mm <- pam(full2dist(DIST.mm), k=1)$medoids
      poss.medoids[ch.low,] <- x.mm[pos.mm,]
       if(!is.null(dimnames(medoids))) dimnames(medoids)[[1]][ch.low] <- pos.mm
       k.poss <- k - 1 
       asw.poss.obj <- asw.calc(x, k=k.poss, poss.clustering, DIST) 
       asw.poss <- asw.poss.obj$asw
       sesw.poss <- asw.poss.obj$sesw
       if(asw.poss >= asw){
        asw <- asw.poss
        sesw <- sesw.poss
        clustering <- poss.clustering
        k <- k.poss
        medoids <- poss.medoids
       }
 }else{
  }
    
  list(medoids = medoids, clustering = clustering,num.of.clusters = k, asw = asw, sesw = sesw)
}

hipam.local <- function(tree, x, asw.tol, maxsplit, local.const, orness, type, ah, verbose,...){
 #Iterative procedure to keep splitting branches as long as there are active branches.
 active <- tree$active
 while(sum(active) > 0){
  choose.among <- (1:length(active))[active]
   if(length(choose.among) == 1){
    whch.brnch <- choose.among
   }else {
     whch.brnch <- sample(choose.among, 1)
    }

   if(type == "MO"){
    proposal <- checkBranchLocalMO(tree, x, whch.brnch, maxsplit, asw.tol, local.const, 
                                   orness, type, ah, verbose, ...)
   }else{
     proposal <- checkBranchLocalIMO(tree, x, whch.brnch, maxsplit, asw.tol, local.const, 
                                     orness, type, ah, verbose, ...)
    }
 
    proposed.tree <- proposal$tree
    if(proposal$reject){
     tree$active[whch.brnch] <- FALSE
    }else{
       tree <- proposed.tree
      }
      active <- tree$active
 }

    tree
}


update.tree.local <- function(object, xi.ps, which.x, i){
 tree <- object
 #Number of clusters before the update:
 maxclust <- max(tree$clustering)
 
 #Update clustering:
 tree$clustering[which.x] <- maxclust + xi.ps$clustering
 
 #Update number of clusters:
 new.clust <- xi.ps$num.of.clusters
 tree$n.clust <- tree$n.clust + new.clust - 1
 clust <- tree$clustering
 
 #Update medoids:
 tree$medoids <- rbind(tree$medoids, xi.ps$medoids)
 
 #Update active status:
 tree$active[i] <- FALSE
 new.activities <- rep(TRUE, new.clust)
 
  for (j in 1:new.clust){
   if (sum(xi.ps$clustering==j) <= 2){
    new.activities[j] <- FALSE
   }
  }
  
  tree$active <- c(tree$active,new.activities)
  
  #Update the development matrix:
  dmat<- tree$development
  nr <- nrow(dmat)
  nc <- ncol(dmat)
  whch.row <- sum((1:nr) * apply((dmat==i), 1, sum, na.rm = TRUE))
  whch.col <- sum((1:nc) * apply((dmat==i), 2, sum, na.rm = TRUE))
  dmat<-dmat[c(1:whch.row,rep(whch.row,new.clust-2),whch.row:nr),]
  
  if (nc==whch.col){
   dmat <- cbind(dmat,NA)
   tree$n.levels <- tree$n.levels + 1
  }
  
  for (j in 1:new.clust){
   dmat[whch.row + j - 1, whch.col + 1] <- maxclust + j
  }

 tree$development <- dmat
 tree
}


full2dist <- function(fullm, method = "unspecified"){
    distm <- fullm[lower.tri(fullm)] #?lower.tri: Returns a matrix of logicals the same size of a given 
    #matrix with entries. 
    #TRUE in the lower or upper triangle. 
    attr(distm , "Size") <- nrow(fullm)
    attr(distm , "Labels") <- dimnames(fullm)[[1]]
    attr(distm , "Diag") <- FALSE
    attr(distm , "UPPER") <- FALSE
    attr(distm, "method") <- method
    attr(distm , "class") <- "dist"
    attr(distm , "call") <- match.call()
    class(distm) <- "dist"
    return(distm)
}

Try the Anthropometry package in your browser

Any scripts or data that you put into this service are public.

Anthropometry documentation built on March 7, 2023, 6:58 p.m.