R/tree.builder.R

Defines functions tree.builder

Documented in tree.builder

#' A superfunction to estimate correlations and relationships among variables in a data set. 
#'
#' This function estimates a Spearman's correlation matrix, an abs(distance matrix), a hclust tree, a PCA from the correlation matrix, and numerous estimates of the effective number of variables in a data set (Me)
#' @param wdata a data.frame of the variable data used to build your cormat and hclust tree, the variables should be in columns, samples in rows.
#' @param cor_method the correlation method used to construct the correlation matrix
#' @param dist_method the distance method to generate the distance matrix
#' @param hclust_meth the clustering method used by hclust
#' @param cmat a correlation matrix you can pass to the function, which will skip the generation of a new matrix.
#' @keywords correaltion matrix, distance matrix, hclust, Me
#' @export
#' @examples
#' tree.builder()
tree.builder = function( wdata, cor_method = "spearman", dist_method = "R", hclust_meth = "complete", cmat = NULL ){
    
    ## number of variables
    M = ncol(wdata)

    ## estimate correlation matrix
    if( is.null(cmat) ){
      ## number of variables
      M = ncol(wdata)
      
      cat("1. estimating pairwsie correlations\n")
        cmat = cor(wdata, use = "pairwise.complete.obs", method = cor_method )
        ## turn NAs into 0 values
        cmat[is.na(cmat)] = 0
    } else {
      cat("1. pairwsie correlation matrix provided by user\n")
      M = ncol(cmat)
    }

    ## estimate a distance matrix
    cat("2. constructing distance matrix\n")
    if(dist_method == "R2"){
        dmat = as.dist(1 - (cmat * cmat) )    
    } else {
        if(dist_method == "R"){
            dmat = as.dist(1 - abs( cmat ) )    
        } else { 
            stop("The dist_method must be either 'R' (1-abs(R)) or 'R2' (R*R).")
        }
    }
    

    ## generate a hclust dendrogram
    cat("3. generating dendrogram\n")
    tree = hclust(dmat,  method = hclust_meth)

    # if(hclust_meth == "complete"){
    #     tree = hclust(dmat,  method = "complete")
    # } else {
    #     if(hclust_meth == "average"){
    #         tree = hclust(dmat,  method = "average")
    #     } else { 
    #         if(hclust_meth == "mcquitty"){
    #             tree = hclust(dmat,  method = "mcquitty")
    #         } else { 
    #             stop("The hclust_meth must be either 'complete' (max dist), 'average' (UPGMA), or 'mcquitty' (WPGMA).")
    #         }
    #     }
    # }

    ## eigenvalues
    cat("4. estimating eigenvalues from correlation matrix\n")
    eigenvalues = eigen(cmat, symmetric = TRUE)$values

    ## PCA
    cat("5. estimating principal components from correlation matrix\n")
    pca = prcomp(cmat, center = TRUE, scale = TRUE)
        #eigenvalues = summary(pca)[[6]][2, ]
    varexp = t(summary(pca)[[6]][2:3, ])
    
    ## effective number of markers
        ## Goa 2008
    m95 = length(which(varexp[,2] < 0.95))+1
    m995 = length(which(varexp[,2] < 0.995))+1
        ## Cheverud 2001
    eigen_var = var(eigenvalues)
    Meff = round( 1 + (M-1) * (1 - (eigen_var / M)) )
        ## Li and Ji 2005
    ve = varexp[,1]
    w = which(ve > 0.01)
    ve[w] = 1
    LJ_Me = round( sum(ve) )
    ##
    simpleM = data.frame( Cheverud_Me = Meff, LiJi_Me = LJ_Me,   Goa_M_95 = m95 , Goa_M_995 = m995   )
    
    ## data out
    out = list( variabledata = wdata, 
        cormat = cmat, 
        distmat = dmat, 
        tree = tree, 
        pca = pca, 
        eigenvalues = eigenvalues,  
        varexp = varexp, 
        simpleM = simpleM
       )
    ## return data
    return(out)
}
hughesevoanth/iPVs documentation built on July 26, 2023, 3:21 p.m.