R/treecut.pvs.R

Defines functions treecut.pvs

Documented in treecut.pvs

#' A function to identify the principal variables in your variable data set, given a defined tree cut height. 
#'
#' This function to identify the principal variables in your variable data set, given a defined tree cut height. 
#' @param tree a hclust object of you variables, built with the cormat and the hclust method "complete"
#' @param variabledata your data.frame of variables used to generate your tree
#' @param cormat your correlation matrix
#' @param cutheight the height at which you wish to cut your tree 
#' @keywords tree cut, principal variables
#' @export
#' @examples
#' treecut.pvs()
treecut.pvs = function( tree, variabledata, cutheight, cormat ){
  ##############  check for hclust object
  if(class(tree) != "hclust"){
    stop("You must pass a hclust object as the tree variable")
  }
  ##############  check for presence of all tree labels in variabledata
  if( sum(!tree$labels %in%  colnames(variabledata) ) ){
    stop( paste0("Make sure that all of the variables in your hclust object are also in your variabledata data.frame.") )
  }
  ##############  check for a single numeric float
  if( length(cutheight) != 1  ){
    stop(  "You must pass a single cutheight float >= 0  & <= 1." )
  }
  if( class(cutheight) != "numeric" |  cutheight < 0 | cutheight > 1 ){
    stop("You must pass a single, cutheigth float no smaller than 0 and no larger than 1.")
  }
  ############## Produce a warning if the hclust method used was not "complete", "average", or "mcquitty".
  if( !tree$method %in% c( "complete" ,"average","mcquitty") ){
    warning( paste0("It is advised to use the hclust method 'complete', 'average', or 'mcquitty'. You used the method ", tree$method, ".") )
  }
  ###############
  ## cut tree
  k = stats::cutree(tree, h = cutheight)
  ## vector of unique Ks
  singleKs = as.numeric( names( which( table(k) == 1 ) ) )
  ## vector of unique Ks with n > 1
  groupKs = as.numeric( names( which( table(k) > 1 ) ) )
  ## iterate of goupKs to identify a Principal Variable
  if(length(groupKs) > 0){
    pvs = t( sapply(groupKs, function(i){
      n = names( which(k == i) )
      if(length(n)>1){
        # tempd = variabledata[, n]
        tempd = cormat[n,n]
        #topPV =  PVA( names(tempd), tempd)
        topPV = VarRep(tempd)
        out = data.frame(k = i, clusN = table(k)[i],  PV = as.character(topPV$variable)[1], VarExp = topPV$added_vexp[1])
        } else {
          out = data.frame(k = i, clusN = table(k)[i], PV = n, VarExp = 1)
        }
      return(out)
      }) )

    ## redefine pvs as a proper data.frame without built in lists :-( 
    pvs = data.frame(k = as.numeric(unlist(pvs[,1])), clusN = as.numeric(unlist(pvs[,2])), 
      PV = as.character(unlist(pvs[,3])), VarExp =  as.numeric(unlist(pvs[,4])))
    } else {
    pvs = data.frame( )
  }
  ##  build data frame for all single variable K groups
  w = which(k %in% singleKs)
  n = names(k)[w]
  singleK_pvs = data.frame(k = singleKs, clusN = 1, PV = n, VarExp = 1)
  ## combine the pvs
  pvs = rbind(pvs, singleK_pvs)
  ## order the pvs by k
  o = order(pvs[,1])
  pvs = pvs[o, ]
  rownames(pvs) = paste0("K", pvs[,1])
  ##
  return( list( pvs = pvs, k = k ) )
}
hughesevoanth/iPVs documentation built on July 26, 2023, 3:21 p.m.