R/auxiliary_cluster.R

Defines functions rclust_mat2cube rclust_stack3d rclust_stack3d2 rclust_equivariant rclust_equivariant_cube rclust_pdist rclust_pdist_cube rclust_pdist2 rclust_pdist2_cube rclust_cubesubset rclust_classmean rclust_index_Bq rclust_index_BGSS rclust_index_Wq rclust_index_WGSS rclust_index_Sw rclust_index_Sb rclust_index_Ns rclust_membership rclust_concordant rclust_Z rclust_index_density

############# Auxiliary R Functions #########################
# (00) rclust_mat2cube      : transform a matrix into 3d cube
# (01) rclust_stack3d       : stack data into 3d cube
#      rclust_stack3d2      : stack two data into one 3d cube
# (02) rclust_equivariant   : make an equivariant embedding stacked as rows
#      rclust_equivariant_cube
# (03) rclust_pdist         : pairwise intrinsic/extrinsic distances
#      rclust_pdist_cube  
# (04) rclust_pdist2        : pairwise intrinsic/extrinsic distances for two-set data
#      rclust_pdist2_cube
# (05) rclust_cubesubset    : remove anomaly in subsetting cube by third dimension
# (06) rclust_classmean     : compute class-wise mean cube
# (07) rclust_index_Bq      : between-group                    : CH is not okay. Change to Distance Measures
#      rclust_index_Wq      : within -group dispersion matrix
#      rclust_index_WGSS    : within -group sum of squared distances
#      rclust_index_BGSS    : between-group 
# (08) rclust_index_Sw      : within-cluster
#      rculst_index_Sb      : between-cluster sum of distances given a distance matrix
# (09) rclust_index_Ns      : number of counts information
# (10) rclust_membership    : membership of clusterings
# (11) rclust_concordant    : s(+) and s(-)
# (12) rclust_Z             : membership matrix of size (n x q)
# (13) rclust_index_density : use for SDbw method

# (00) rclust_mat2cube    : transform a matrix into 3d cube
#' @keywords internal
#' @noRd
rclust_mat2cube <- function(inputmat){
  output = array(0,c(nrow(inputmat),ncol(inputmat),1))
  output[,,1] = inputmat
  return(output)
}

# (01) rclust_stack3d     : stack data into 3d cube
#' @keywords internal
#' @noRd
rclust_stack3d <- function(riemdata){
  msize = riemdata$size
  ndata = length(riemdata$data)
  
  matdata = array(0,c(msize[1], msize[2], ndata))
  for (i in 1:ndata){
    matdata[,,i] = (riemdata$data[[i]])
  }
  return(matdata)
}
#' @keywords internal
#' @noRd
rclust_stack3d2 <- function(riem1, riem2){
  msize = riem1$size
  ndat1 = length(riem1$data)
  ndat2 = length(riem2$data)
  
  matdata = array(0,c(msize[1], msize[2], (ndat1+ndat2)))
  for (i in 1:ndat1){
    matdata[,,i] = (riem1$data[[i]])
  }
  for (i in (ndat1+1):(ndat1+ndat2)){
    matdata[,,i] = (riem2$data[[i-ndat1]])
  }
  return(matdata)
}

# (02) rclust_equivariant : make an equivariant embedding stacked as rows
#' @keywords internal
#' @noRd
rclust_equivariant <- function(input){
  mfddata = rclust_stack3d(input)
  mfdname = input$name
  output  = cpp_equivariant(mfddata, mfdname)
  return(output)
}
#' @keywords internal
#' @noRd
rclust_equivariant_cube <- function(dcube, mfdname){
  return(cpp_equivariant(dcube, mfdname))
}

# (03) rclust_pdist       : pairwise intrinsic/extrinsic distances
#' @keywords internal
#' @noRd
rclust_pdist <- function(input, type=c("extrinsic","intrinsic")){
  mfddata = rclust_stack3d(input)
  mfdname = input$name
  distype = match.arg(type)
  
  if (all(distype=="intrinsic")){
    output = cpp_distint(mfddata, mfdname)
  } else {
    output = cpp_distext(mfddata, mfdname)
  }
  return(output)
}
#' @keywords internal
#' @noRd
rclust_pdist_cube <- function(datacube, mfdname, type=c("extrinsic","intrinsic")){
  mfddata = datacube
  distype = match.arg(type)
  
  if (all(distype=="intrinsic")){
    output = cpp_distint(mfddata, mfdname)
  } else {
    output = cpp_distext(mfddata, mfdname)
  }
  return(output)
}
# (04) rclust_pdist2      : pairwise intrinsic/extrinsic distances for two-set data
#' @keywords internal
#' @noRd
rclust_pdist2 <- function(input1, input2, type=c("extrinsic","intrinsic")){
  mfddata1 = rclust_stack3d(input1)
  mfddata2 = rclust_stack3d(input2)
  mfdname = input1$name
  distype = match.arg(type)
  
  if (all(distype=="intrinsic")){
    output = cpp_dist2int(mfddata1, mfddata2, mfdname)
  } else {
    output = cpp_dist2ext(mfddata1, mfddata2, mfdname)
  }
  return(output)
}
#' @keywords internal
#' @noRd
rclust_pdist2_cube <- function(cube1, cube2, mfdname, type=c("extrinsic","intrinsic")){
  distype = match.arg(type)
  if (all(distype=="intrinsic")){
    output = cpp_dist2int(cube1, cube2, mfdname)
  } else {
    output = cpp_dist2ext(cube1, cube2, mfdname)
  }
  return(output)
}

# (05) rclust_cubesubset  : remove anomaly in subsetting cube by third dimension
#' @keywords internal
#' @noRd
rclust_cubesubset <- function(datacube, subsetid){
  nrow = dim(datacube)[1]
  ncol = dim(datacube)[2]
  nslice = length(subsetid)
  
  output = array(0,c(nrow,ncol,nslice))
  for (i in 1:nslice){
    tmp = datacube[,,subsetid[i]]
    if (is.vector(tmp)){
      tmp = as.matrix(tmp)
    }
    output[,,i]=tmp
  }
  return(output)
}

# (06) rclust_classmean   : compute class-wise mean cube
#' @keywords internal
#' @noRd
rclust_classmean <- function(datacube, label, mfdname, memberlist){
  # get RiemBase::rbase.mean.cube
  meanfunc = utils::getFromNamespace("rbase.mean.cube","RiemBase")
  ulabel = unique(label)
  classmean = array(0,c(nrow(datacube),ncol(datacube),length(ulabel)))
  for (i in 1:length(ulabel)){
    idx.i = memberlist[[which(ulabel==ulabel[i])]]
    if (length(idx.i)==1){
      classmean[,,i] = rclust_cubesubset(datacube, idx.i)
    } else {
      classmean[,,i] = meanfunc(rclust_cubesubset(datacube, idx.i), mfdname)$x
    }
  }
  return(classmean)
}

# (07) rclust_index_Bq    : between-group
#      rclust_index_Wq    : within -group dispersion matrix
#' @keywords internal
#' @noRd
rclust_index_Bq <- function(datacube, label, mfdname, memberlist){
  # unique label
  ulabel = unique(label)
  # get RiemBase::rbase.mean.cube
  meanfunc = utils::getFromNamespace("rbase.mean.cube","RiemBase")
  # compute intrinsic mean for global and local/per-class ones
  mean.global = meanfunc(datacube, mfdname)$x
  mean.local  = rclust_classmean(datacube, label, mfdname, memberlist)
  # count the size of clusters
  vec.count   = rep(0,length(ulabel))
  for (i in 1:length(ulabel)){
    vec.count[i] = length(memberlist[[which(ulabel==ulabel[i])]])
  }
  # compute log-pulled vectors of class means : stacked as columns
  log.pulled = cpp_dispersion(mean.local, mean.global, mfdname)
  dimsize    = nrow(log.pulled)
  # compute Bq
  Bq = array(0,c(dimsize,dimsize))
  for (i in 1:ncol(log.pulled)){
    Bq = Bq+((base::outer(log.pulled[,i],log.pulled[,i]))*vec.count[i])
  }
  # return output
  return(Bq)
}
#' @keywords internal
#' @noRd
rclust_index_BGSS <- function(datacube, label, mfdname, memberlist){
  ulabel = unique(label)
  meanfunc = utils::getFromNamespace("rbase.mean.cube","RiemBase")
  # means of clusters and global one
  mean.global = rclust_mat2cube(meanfunc(datacube, mfdname)$x)
  mean.local  = rclust_classmean(datacube, label, mfdname, memberlist)
  # compute distances from cluster means to global mean
  vec.dist    = as.vector(cpp_dist2int(mean.global, mean.local, mfdname))
  vec.count   = rep(0,length(memberlist))
  for (i in 1:length(memberlist)){
    vec.count[i] = length(memberlist[[which(ulabel==ulabel[i])]])
  }
  # compute output
  output = sum((vec.dist^2)*vec.count)
  return(output)
}
#' @keywords internal
#' @noRd
rclust_index_Wq <- function(datacube, label, mfdname, memberlist){
  ulabel = unique(label)
  # get RiemBase::rbase.mean.cube
  meanfunc = utils::getFromNamespace("rbase.mean.cube","RiemBase")
  # STEP 1 : initial one should be done by myself
  #   label selection
  idlabel    = memberlist[[which(ulabel==ulabel[1])]]
  class.mean = meanfunc(rclust_cubesubset(datacube,idlabel), mfdname)$x
  class.mems = rclust_cubesubset(datacube,idlabel)
  log.pulled = cpp_dispersion(class.mems, class.mean, mfdname)
  Wq         = (log.pulled%*%t(log.pulled))
  
  # STEP 2 : rest classes  
  if (length(ulabel)!=1){
    for (i in 2:length(ulabel)){
      idlabel    = memberlist[[which(ulabel==ulabel[i])]]
      class.mean = meanfunc(rclust_cubesubset(datacube,idlabel), mfdname)$x
      class.mems = rclust_cubesubset(datacube,idlabel)
      log.pulled = cpp_dispersion(class.mems, class.mean, mfdname)
      Wq         = Wq + (log.pulled%*%t(log.pulled))
    }
  } 
  # return output
  return(Wq)
}
#' @keywords internal
#' @noRd
rclust_index_WGSS <- function(datacube, label, mfdname, memberlist){
  ulabel = unique(label)
  q      = length(memberlist)
  # get RiemBase::rbase.mean.cube
  meanfunc = utils::getFromNamespace("rbase.mean.cube","RiemBase")
  #   label selection
  distvec    = rep(0,q)
  for (i in 1:q){
    idlabel = memberlist[[which(ulabel==ulabel[i])]]
    class.mean = rclust_mat2cube(meanfunc(rclust_cubesubset(datacube,idlabel), mfdname)$x)
    class.mems = rclust_cubesubset(datacube,idlabel)
    distvec[i] = sum(as.vector(cpp_dist2int(class.mean, class.mems, mfdname))^2)
  }
  return(sum(distvec))
}


# (08) sum of distances ---------------------------------------------------
#   rclust_index_Sw : within-cluster
#   rclust_index_Sb : between-cluster sum of distances given a distance matrix
#' @keywords internal
#' @noRd
rclust_index_Sw <- function(distmat, label, mfdname, memberlist){
  ulabel = unique(label)
  q      = length(ulabel)
  output = 0
  for (i in 1:q){
    idlabel = memberlist[[which(ulabel==ulabel[i])]]
    if (length(idlabel)!=1){
      partmat = distmat[idlabel,idlabel]
      output  = output + sum(as.vector(partmat[lower.tri(partmat)]))
    }
  }
  return(output)
}
#' @keywords internal
#' @noRd
rclust_index_Sb <- function(distmat, label, mfdname, memberlist){
  ulabel = unique(label)
  q      = length(ulabel)
  score  = 0.0
  for (k in 1:(q-1)){
    label.k = memberlist[[which(ulabel==ulabel[k])]]
    for (j in (k+1):q){
      label.j = memberlist[[which(ulabel==ulabel[j])]]
      score   = score + sum(as.vector(distmat[label.k, label.j]))
    }
  }
  return(score)
}

# (09) rclust_index_Ns    : number of counts information
#' @keywords internal
#' @noRd
rclust_index_Ns <- function(label, memberlist){
  # basic parameters
  ulabel = unique(label)
  n = length(label)
  q = length(ulabel)
  # Nt
  Nt = as.integer(n*(n-1)/2)
  # Nw
  Nw = 0
  for (i in 1:q){
    Nk = length(memberlist[[which(ulabel==ulabel[i])]])
    Nw = as.integer(Nk*(Nk-1)/2) + Nw
  }
  # Nb
  Nb = Nt - Nw
  # return output
  output = list()
  output$Nt = Nt
  output$Nb = Nb
  output$Nw = Nw
  return(output)
}

# (10) rclust_membership  : membership of clusterings
#' @keywords internal
#' @noRd
rclust_membership <- function(label){
  ulabel = unique(label)
  memvec = list()
  for (i in 1:length(ulabel)){
    memvec[[i]] = which(label==ulabel[i])
  }
  return(memvec)
}

# (11) rclust_concordant  : s(+) and s(-)
#' @keywords internal
#' @noRd
rclust_concordant <- function(distmat, label, membership){
  # get some parameters
  q      = length(membership)
  ulabel = unique(label)
  # 1. compute all within-distance vector
  vec.within = c()
  for (i in 1:q){
    idx = membership[[which(ulabel==ulabel[i])]]
    partmat = distmat[idx,idx]
    vec.within = c(vec.within, as.vector(partmat[upper.tri(partmat)]))
  }
  # 2. compute all between-distance vector
  vec.between = c()
  for (i in 1:(q-1)){
    id1 = membership[[which(ulabel==ulabel[i])]]
    for (j in (i+1):q){
      id2 = membership[[which(ulabel==ulabel[j])]]
      partmat = distmat[id1,id2]
      vec.between = c(vec.between, as.vector(partmat))
    }
  }
  # 3. count all numbers
  niter = length(vec.within)
  count.con = 0
  count.dis = 0
  for (i in 1:niter){
    count.con = count.con + sum((vec.between>vec.within[i]))
    count.dis = count.dis + sum((vec.between<vec.within[i]))
  }
  # return
  output = list()
  output$con = count.con
  output$dis = count.dis
  return(output)
}

# (12) rclust_Z           : membership matrix of size (n x q)
#' @keywords internal
#' @noRd
rclust_Z <- function(label, membership){
  n = length(label)
  q = length(membership)
  ulabel = unique(label)
  
  output = array(0,c(n,q))
  for (i in 1:q){
    id = membership[[which(ulabel==ulabel[i])]]
    output[id,i] = 1
  }
  return(output)
}

# (13) rclust_index_density : use for SDbw method in extrinsic manner
#' @keywords internal
#' @noRd
rclust_index_density <- function(meanvec, datamat, thr){
  centered = datamat - matrix(rep(meanvec,nrow(datamat)),nrow=nrow(datamat),byrow=TRUE)
  distvec  = apply(centered, 1, function(x){sqrt(sum(x^2))})
  return(sum(distvec<thr))
}
kyoustat/RiemBaseExt documentation built on March 28, 2020, 2:08 a.m.