R/shc-diagnostic.R

#' plot diagnostics for shc object
#'
#' Provides visualizations to check the null Gaussian assumption at a specified
#' range of nodes along the dendrogram
#' 
#' @param obj a \code{shc} object
#' @param K an integer value or vector specifying the range of nodes for which to 
#'        create diagnostic plots, with 1 corresponding to the root (default = 1)
#' @param fname a character string specifying the name of the output file,
#'        see details for more information on default behavior depending on
#'        \code{length(K)} (default = NULL)
#' @param ci_idx a numeric value between 1 and \code{length(obj$in_args$ci)}
#'        specifying which cluster index to use for creating plots (default = 1)
#' @param pty a string specifying the plot type that should be created
#'        see details for more information on different plot types available
#'        (default = "all")
#' @param ... other parameters passed to plots
#' 
#' @return
#' Prints plots to \code{paste0(fname, ".pdf")} if \code{fname} is specified or \code{length(K)>1}.
#'
#' @details
#' If \code{K} is a single value, the default behavior is to output the diagnostic
#' plot to the current device. This behavior is overriden if \code{fname} is specified
#' by the user. If \code{length(K) > 1}, than the diagnostic plots are
#' printed to \code{paste0(fname, ".pdf")}. If \code{fname} is not specified,
#' "shc_diagnostic", is used by default.
#'
#' The \code{pty} parameter accepts the following options:
#' \itemize{
#' \item{\code{"background"}}: {empirical distribution of marginal data used for background variance estimation}
#' \item{\code{"qq"}}: {qqplot for checking null Gaussian assumption}
#' \item{\code{"covest"}}: {screeplot for checking covariance factor model}
#' \item{\code{"pvalue"}}: {empirical distribution of simulated CIs}
#' \item{\code{"all"}}: {all of the above plots (default).}
#' }
#' 
#' @import ggplot2 ggthemes
#' @importFrom grDevices dev.off pdf
#' @importFrom graphics plot text
#' @importFrom stats density dnorm mad median qnorm qqnorm quantile runif sd
#' @name diagnostic-shc
#' @export
#' @method diagnostic shc
#' @author Patrick Kimes
diagnostic.shc <- function(obj, K = 1, fname = NULL, ci_idx = 1,
                           pty = "all", ...) {

    ## available plot types
    avail_pty <- c("background", "qq", "covest", "pvalue", "all")
    
    ## check default values
    if (length(K) == 0 || min(K) < 1 || max(K) > nrow(obj$p_norm)) {
        stop("K must be a set of indices between 1 and n-1")
    }
    if (!is.null(fname) && !is.character(fname)) {
        stop("fname must be NULL or a string")
    }
    if (length(pty) > 1) {
        stop("can only specify one plot type")
    } else if (sum(grepl(pty, avail_pty)) > 1) {
        stop("specified plot type matches more than one plot type")
    } else if (sum(grepl(pty, avail_pty)) == 0) {
        stop(paste("pty is not a valid value, please specify one of (or part of):",
                   paste(avail_pty, collapse=", ")))
    }

    if (ci_idx > ncol(obj$p_emp)) {
        stop(paste0("invalid choice for ci_idx; ci_idx must be <= ", ncol(obj$p_emp)))
    }

    ## determine if generating multiple plots
    mlt_plot <- (length(K) > 1 || grepl("all", pty)) 
    ## parse default fname behavior
    if (mlt_plot) {
        if (is.null(fname)) {
            fname <- "shc_diagnostic.pdf"
        } else if (!grepl("\\.pdf$", fname)) {
            fname <- paste0(fname, ".pdf")
        }
    }    

    ## create plots 
    if (mlt_plot) {
        pdf(fname)
        for (k in K) {
            plot(0, 0, col="white")
            text(0, 0, paste("K =", k))
            .diagnostic_k(obj, k, ci_idx, pty, mlt_plot)
        }
        invisible(dev.off())
    } else {
        plt <- .diagnostic_k(obj, K, ci_idx, pty, mlt_plot)
        if (is.null(fname)) {
            return(plt)
        } else {
            ggsave(filename=fname, plt, width=7, height=7)
        }
    }
}



## code cleaned/modified from sigclust CRAN package, plot function
.diagnostic_k <- function(obj, k, ci_idx, pty, mlt_plot) {

    ## identify subtree of x
    idx_sub <- unlist(obj$idx_hc[k, ])
    k_mat <- obj$in_mat[idx_sub, ]
    n <- nrow(k_mat)
    d <- ncol(k_mat)

    ## parameters from obj object
    icovest <- obj$in_args$icovest
    n_sim <- obj$in_args$n_sim
    keigval_dat <- obj$eigval_dat[k, ]
    backvar_k <- obj$backvar[k]
    keigval_sim <- obj$eigval_sim[k, ]
    kci_sim <- obj$ci_sim[k, , ci_idx]
    kci_dat <- obj$ci_dat[k, ci_idx]
    kp_emp <- obj$p_emp[k, ci_idx]
    kp_norm <- obj$p_norm[k, ci_idx]

    ## vectorize data and compute statistics
    xvec <- as.vector(k_mat)
    mean_x <- mean(xvec)
    sd_x <- sd(xvec)
    med_x <- median(xvec)
    mad_x <- mad(xvec)
    ntot <- length(xvec)
    max_nol <- 5000


    ## Background Standard Deviation Diagnostic Plot
    if (any(grepl(pty, c("all", "background")))) {

        ## fit density to full data
        den_x <- density(xvec)
        den_df <- data.frame(x=den_x$x, y=den_x$y)

        ## subset on dataset for plotting
        nused <- min(max_nol, ntot)
        if (ntot > max_nol) {
            xvec <- xvec[sample(1:ntot, max_nol)]
        }

        ## determine range of density plot
        xmin <- min(den_df$x)
        xmax <- max(den_df$x)
        ymin <- min(den_df$y)
        ymax <- max(den_df$y)

        ## create df with plotting points
        xvec_df <- data.frame(x=xvec, y=runif(length(xvec), ymax*1/4, ymax*3/4))

        ## plot data kde and best fit gaussian
        gp <- ggplot(xvec_df) +
            geom_point(aes(x=x, y=y), alpha=1/5, size=1, color="#33a02c") +
            geom_path(aes(x=x, y=y), data=den_df, color="black", size=1) + 
            stat_function(fun=dnorm, args=list(mean=med_x, sd=mad_x),
                          color="#1f78b4", size=1, n=500) +
            theme_gdocs() +
            ylab("density") +
            ggtitle(paste0("Distribution of Vectorized Data for K=", k)) +
                scale_x_continuous(expand=c(0.01, 0)) +
                    scale_y_continuous(expand=c(0.01, 0))

        ## note # sample points
        if (ntot > max_nol) {
            gp <- gp +
                annotate(geom="text", x=xmax, y=ymax*.98, vjust=1, hjust=1,
                         label=paste("overlay of", max_nol, "/", ntot, "data points"),
                         color="#33a02c")
        } else {
            gp <- gp +
                annotate(geom="text", x=xmax, y=ymax*.98, vjust=1, hjust=1,
                         label=paste("overlay of", ntot, "data points"),
                         color="#33a02c")
        }

        ## label best fit gaussian density
        gp <- gp +
            annotate(geom="text", x=xmax, y=ymax*.90, vjust=1, hjust=1,
                     label=paste0("N(", round(med_x, 3), ", ", round(mad_x, 3), ") density"),
                     color="#1f78b4")

        ## return parameter values
        gp <- gp +
            annotate(geom="text", x=xmin, y=ymax*.98, vjust=1, hjust=0,
                     label=paste0("mean = ", round(mean_x, 3), "\n",
                         "median = ", round(med_x, 3), "\n",
                         "s.d. = ", round(sd_x, 3), "\n",
                         "MAD = ", round(mad_x, 3)))

        ## check for possible anti-conservative behavior
        if ((mad_x > sd_x) && (icovest == 1)) {
            gp <- gp +
                annotate("text", x=(xmax+xmin)/2, y=ymax/4, vjust=1,
                         label="Warning: MAD > s.d., SHC can be anti-conservative",
                         size=5, color="#e41a1c")
        }
        
        if (mlt_plot) {
            print(gp)
        } else {
            return(gp)
        }
    }
    
    
    ## QQ plot
    if (any(grepl(pty, c("all", "qq")))) {

        ## compute quantiles
        qq_x <- qqnorm(as.vector(k_mat), plot.it=FALSE)
        qq_df <- data.frame(x=qq_x$x, y=qq_x$y)

        ## subset for plotting
        if(ntot > max_nol) {
            qq_df <- qq_df[sort(sample(1:ntot, max_nol)), ]
        }

        ## build base of plot
        gp <- ggplot(qq_df) +
            geom_abline(aes(intercept=0, slope=1), color="#33a02c", size=1/2) +
        geom_point(aes(x=x, y=y), size=1, color="#e31a1c", alpha=1/2) +
        ggtitle(paste0("Robust Fit Gaussian Q-Q for K=", k)) +
            xlab("Gaussian Q") + ylab("Data Q") +
                theme_gdocs()

        ## include annotations in plot
        gp <- gp +
            annotate(geom="text", vjust=1, hjust=0,
                     x=min(qq_df$x), y=max(qq_df$y),
                     label=paste0("mean = ", round(mean_x, 3), "\n",
                         "s.d. = ", round(sd_x, 3)))
        gp <- gp + annotate(geom="point", x=qnorm(c(.25, .50, .75)),
                            y=quantile(qq_df$y, c(.25, .50, .75)),
                            color="#1f78b4", shape=43, size=8)
        gp <- gp + annotate(geom="text", vjust=1, hjust=0, x=qnorm(c(.25, .50, .75)),
                            y=quantile(qq_df$y, c(.25, .50, .75)),
                            label=paste(" ", c("0.25", "0.50", "0.75"), "quantile"))
        
        if (mlt_plot) {
            print(gp)
        } else {
            return(gp)
        }
    }
    

    
    ## covariance estimation diagnostic plot
    if (any(grepl(pty, c("all", "covest")))) {
        
        ymin <- min(keigval_dat) - 0.05*(max(keigval_dat)-min(keigval_dat))
        ymax <- max(keigval_dat) + 0.05*(max(keigval_dat)-min(keigval_dat))

        df <- data.frame(idx=rep(1:d, times=2),
                         grp=rep(c("sim", "dat"), each=d),
                         eigval=c(keigval_sim, keigval_dat))

        ## create base plot
        gp <- ggplot(df) +
            geom_point(aes(x=idx, y=eigval, color=grp)) +
                geom_path(aes(x=idx, y=eigval, group=grp), alpha=1/2) +
                    xlab("Component #") + ylab("Eigenvalue") +
        ggtitle(paste0("Eigenvalues, K=", k)) +
        theme_gdocs() +
            scale_color_manual(limits=c("sim", "dat"),
                               values=c("#e41a1c", "black"), guide=FALSE)

        if (icovest != 2) {
            ## include horizontal line at bkgd noise level
            gp <- gp +
                geom_hline(yintercept=backvar_k, col="#377eb8") +
                    annotate("text", hjust=1, vjust=0,
                             x=d+1, y=backvar_k+(ymax-ymin)*.02,
                             label=paste0("bkgd var = ",
                                 round(backvar_k, 3)),
                             col="#377eb8")
        }

        ## annotate eigvals
        gp <- gp +
            annotate("text", hjust=1, vjust=1,
                     x=d+1, y=ymin+0.9*(ymax-ymin),
                     label="eigenvalues for simulation", col="#e41a1c") +
        annotate("text", hjust=1, vjust=1,
                 x=d+1, y=ymin+0.84*(ymax-ymin),
                 label="sample eigenvalues")

        ## make note if anti-conservative behavior expected 
        if (mad_x > sd_x) {
            gp <- gp + 
                annotate("text", hjust=1, vjust=1,
                         x=d+1, y=ymin + 0.65*(ymax-ymin),
                         label="Warning: MAD > s.d.", col="#377eb8")
        }

        if (mlt_plot) {
            print(gp)
        } else {
            return(gp)
        }
    }
    

    
    ## p-value plot
    if (any(grepl(pty, c("all", "pvalue")))) {

        ## fit density to cluster indices
        den_pv <- density(kci_sim)
        den_df <- data.frame(x=den_pv$x, y=den_pv$y)
        
        ## determine range of density plot
        xmin <- min(den_df$x)
        ymax <- max(den_df$y)

        mindex <- mean(kci_sim)
        sindex <- sd(kci_sim)

        ## create df with plotting points
        kci_df <- data.frame(x=kci_sim, y=runif(length(kci_sim), ymax*1/4, ymax*3/4))

        ## plot data kde and best fit gaussian
        gp <- ggplot(kci_df) +
            geom_point(aes(x=x, y=y), alpha=1/2, size=1, color="#377eb8") +
            geom_path(aes(x=x, y=y), data=den_df, color="#e41a1c", size=1) + 
            stat_function(fun=dnorm, args=list(mean=mindex, sd=sindex),
                          color="black", size=1, n=500) +
            geom_vline(xintercept=kci_dat, color="#4daf4a", size=1) + 
            theme_gdocs() +
            ylab("density") + xlab("Cluster Index") + 
                ggtitle(paste0("SHC Results for ci_idx=", ci_idx, ", K=", k)) +
                    scale_x_continuous(expand=c(0.01, 0)) +
                        scale_y_continuous(expand=c(0.01, 0))

        ## return p-values
        gp <- gp +
            annotate(geom="text", x=min(xmin, kci_dat), y=ymax*.98, vjust=1, hjust=0,
                     label=paste0(" p-value (Q) = ", round(kp_emp, 3), "\n",
                         " p-value (Z) = ", round(kp_norm, 3)))

        if (mlt_plot) {
            print(gp)
        } else {
            return(gp)
        }
    }

}
pkimes/sigclust2 documentation built on May 25, 2019, 8:20 a.m.