R/SH-utils.R

Defines functions shclust plot.SH

Documented in plot.SH shclust

#' Plotting model for Slope-Hunter clustering
#'
#' @param x Output from \code{slopehunter}.
#' @param what A string specifying the type of graph requested. Available choices are:
#' "clusters": showing clusters. The plot can display membership probabilities of each variable (e.g. SNP) to the target cluster (G1) by hovering over the points.
#' "classification": A plot showing point assigned to each cluster (class).
#' "uncertainty": A plot of classification uncertainty.
#' "density": A plot of estimated density.
#' @param xlab Optional label for the x-axis in case of "classification", "uncertainty", or "density" plots.
#' @param ylab Optional label for the y-axis in case of "classification", "uncertainty", or "density" plots.
#' @param addEllipses A logical indicating whether or not to add ellipses with axes corresponding to the within-cluster covariances in case of "classification" or "uncertainty" plots.
#' @param main A logical or NULL indicating whether or not to add a title to the plot identifying the type of plot drawn in case of "classification", "uncertainty", or "density" plots.
#' @param ... Other graphics parameters.
#' @export
#' @importFrom utils menu
plot.SH <- function(x, what= c("clusters", "classification", "uncertainty", "density"),
                    xlab = NULL, ylab = NULL, addEllipses = TRUE, main = FALSE, ...)
{
  object <- x
  if(!inherits(object, "SH"))
    stop("x is not of class \"SH\"!")

  if(interactive() & length(what) > 1){
    title <- "Slope-Hunter fitting plots:"
    # present menu waiting user choice
    choice <- menu(what, graphics = FALSE, title = title)
    while(choice != 0){
      if(what[choice] == "clusters")       print(object$plot.clusters)
      if(what[choice] == "classification") mclust::plot.Mclust(object$Model, what = "classification", xlab=xlab, ylab=ylab, addEllipses=addEllipses, main=main)
      if(what[choice] == "uncertainty")    mclust::plot.Mclust(object$Model, what = "uncertainty", xlab=xlab, ylab=ylab, addEllipses=addEllipses, main=main)
      if(what[choice] == "density")        mclust::plot.Mclust(object$Model, what = "density", xlab=xlab, ylab=ylab, main=main)
      # re-present menu waiting user choice
      choice <- menu(what, graphics = FALSE, title = title)
    }
  } else {
    if(any(what == "clusters"))       print(object$plot.clusters)
    if(any(what == "classification")) mclust::plot.Mclust(object$Model, what = "classification", xlab=xlab, ylab=ylab, addEllipses=addEllipses, main=main)
    if(any(what == "uncertainty"))    mclust::plot.Mclust(object$Model, what = "uncertainty", xlab=xlab, ylab=ylab, addEllipses=addEllipses, main=main)
    if(any(what == "density"))        mclust::plot.Mclust(object$Model, what = "density", xlab=xlab, ylab=ylab, main=main)
  }
  invisible()
}

#' Implement the EM algorithm for the SlopeHunter model-based clustering
#'
#' @param gwas a data frame with columns: xbeta; xse; ybeta; yse.
#' @param pi0 initial value for the weight of the mixture component that represents the cluster of SNPs affecting x only.
#' @param sxy1 initial value for the covariance between x and y.
#' @return EM fit for SlopeHunter estimator
#' @importFrom mclust dmvnorm
#' @import dplyr
#' @importFrom stats sd cov na.omit pt
shclust <- function(gwas, pi0, sxy1){
  # binding variable locally to the function:
  ## To avoid Notes: e.g. "shclust: no visible binding for global variable ‘xbeta’"
  xbeta <- ybeta <- clusters <- NULL

  sx0 = sx1 = gwas %>% summarise(sd(xbeta)) %>% pull
  sy0 = sy1 = gwas %>% summarise(sd(ybeta)) %>% pull
  dir0 = gwas %>% summarise(cov(xbeta, ybeta)) %>% pull %>% sign()
  if (dir0==0) stop("All associations with at least either x or y are constant")

  # convergence criterion
  loglkl_ck = 0

  ### EM algorithm
  for(iter in 1:50000){
    #### The E step:
    # covariance matrix for the target component (f0)
    sxy0 = sx0*sy0*dir0*0.95       # the x & y perfectly correlated under 1st component  #===========
    sigma0 = matrix(c(sx0^2,sxy0,sxy0,sy0^2), 2, 2)

    # 1st component
    f0 = gwas %>%
      dplyr::select(xbeta, ybeta) %>%
      mclust::dmvnorm(mean=c(0,0), sigma=sigma0)
    f0[f0<1e-300] = 1e-300

    # covariance matrix for the component (f1)
    sigma1 = matrix(c(sx1^2,sxy1,sxy1,sy1^2), 2, 2)

    # 2nd component
    f1 = gwas %>%
      dplyr::select(xbeta, ybeta) %>%
      mclust::dmvnorm(mean=c(0,0), sigma=sigma1)
    f1[f1<1e-300] = 1e-300

    # loglik of the mixture model: pi0 * f0 + (1-p0) * f1
    loglkl = sum(log(pi0*f0+(1-pi0)*f1))

    ## proportional contribution of density of f0 (for every point) to the total mixture
    pt = pi0*f0/(pi0*f0+(1-pi0)*f1)
    pt[pt>0.9999999] = 0.9999999

    #### The M step:
    # update pi0
    pi0 = mean(pt)
    if (pi0<0.0001) pi0 = 0.0001
    if (pi0>0.9999) pi0 = 0.9999

    # update sx0 & sy0
    sx0 = gwas %>% summarise(sqrt(sum(pt*(xbeta^2))/sum(pt))) %>% pull
    sy0 = gwas %>% summarise(sqrt(sum(pt*(ybeta^2))/sum(pt))) %>% pull
    dir0 = gwas %>% summarise(sum(pt*xbeta*ybeta)/sum(pt)) %>% pull %>% sign()
    if (dir0==0) dir0=sample(c(1,-1), 1)   # avoid slope = 0 (horizontal line)

    # update sx1, sy1 & sxy1
    sx1 = gwas %>% summarise(sqrt(sum((1-pt)*(xbeta^2))/(length(xbeta)-sum(pt)))) %>% pull
    sy1 = gwas %>% summarise(sqrt(sum((1-pt)*(ybeta^2))/(length(ybeta)-sum(pt)))) %>% pull
    sxy1 = gwas %>% summarise(sum((1-pt)*xbeta*ybeta)/(length(xbeta)-sum(pt))) %>% pull
    if (abs(sxy1) > 0.75*sx1*sy1)  sxy1 = sign(sxy1)*0.75*sx1*sy1     #===========

    ## Check convergence
    if (iter%%10==0){
      if ((loglkl - loglkl_ck)/loglkl < 1e-10){
        break
      } else {
        loglkl_ck = loglkl
      }
    }
  }

  # Diagnosis
  if (iter == 50000) warning("The algorithm may not have converged.\n")

  ### Results
  Fit = gwas %>% mutate(pt = pt) %>%
    mutate(po = 1-pt, clusters = factor(ifelse(pt >= 0.5, "Hunted", "Pleiotropic")))

  # slope of the eigenvector
  b = dir0*sy0/sx0
  bse = 0
  b_CI = c(b - 1.96*bse, b + 1.96*bse)
  entropy = Fit %>% filter(clusters == "Hunted") %>% summarise(mean(pt)) %>% pull

  return(list(b=b, bse=bse, b_CI=b_CI, iter=iter, pi0=pi0, entropy=entropy, Fit=Fit))
}
Osmahmoud/SlopeHunter documentation built on Oct. 7, 2022, 4:38 p.m.