knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

Summix2

Summix2 is a suite of methods that detect and leverage substructure in genetic summary data. This package builds on Summix, a method that estimates and adjusts for substructure in genetic summary that was developed by the Hendricks Research Team at the University of Colorado Denver.

Find more details about Summix in our manuscript published in the American Journal of Human Genetics.

For individual function specifics in Summix2:

summix --- fast forward to example

adjAF --- fast forward to example

summix_local --- fast forward to example

Package Installation

To install this package, start R (version “4.3”) and run the following commands:

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("Summix")




summix {#summix}

The summix() function estimates mixture proportions of reference groups within genetic summary (allele frequency) data using sequential quadratic programming performed with the slsqp() function in the nloptr package.

summix() Input

Mandatory parameters are:

Optional parameters are:

summix() Output

A data frame with the following columns:

Additional Output if summix_network = TRUE:




adjAF {#adjaf}

The adjAF() function adjusts allele frequencies to match reference group substructure mixture proportions in a given target group or individual.

adjAF() Input

Mandatory parameters are:

adjAF() Output

A data frame with the following columns:




summix_local {#summix_local}

The summix_local() function estimates local substructure mixture proportions in genetic summary data using the same slspq() functionality as summix(). summix_local() also performs a selection scan (optional) that identifies regions of selection along the given chromosome.

summix_local() Input

Mandatory parameters are:

Optional parameters are:

Conditional parameters are:

If algorithm = "windows":

If algorithm = "fastcatch":

If type = "variants":

If type = "bp":

If algorithm = "fastcatch" and type = "variants":

If algorithm = "fastcatch" and type = "bp":

If selection_scan = TRUE:

summix_local() Output

A data frame with a row for each local substructure block and the following columns:

Additional Output if selection_scan = TRUE:

Examples using toy data in the Summix package

For quick runs of all demos, we suggest using the data saved within the Summix library called ancestryData.

A quick demo of summix() {#a-quick-demo-of-summix}

The commands:

library(Summix)

# load the data
data("ancestryData")

# Estimate 5 reference group proportion values for the gnomAD African/African American group
# using a starting guess of .2 for each estimated proportion.
summix(data = ancestryData,
    reference=c("reference_AF_afr",
        "reference_AF_eas",
        "reference_AF_eur",
        "reference_AF_iam",
        "reference_AF_sas"),
    observed="gnomad_AF_afr",
    pi.start = c(.2, .2, .2, .2, .2),
    goodness.of.fit=TRUE)



Below is an example of creating a Summix Network plot.

Summix_output<- summix(data = ancestryData,
    reference=c("reference_AF_afr",
        "reference_AF_eas",
        "reference_AF_eur",
        "reference_AF_iam",
        "reference_AF_sas"),
    observed="gnomad_AF_afr",
    pi.start = c(.2, .2, .2, .2, .2),
    goodness.of.fit=TRUE,
    network = TRUE, 
    N_reference = c(704, 787, 741, 47, 545), reference_colors = c("#FDE725FF", "#5DC863FF", "#21908CFF", "#3B528BFF", "#440154FF"))

Summix_Network <- Summix_output[[2]]

Summix_Network





A quick demo of adjAF() {#a-quick-demo-of-adjaf}

The commands:

library(Summix)

# load the data
data("ancestryData")


adjusted_data<-adjAF(data = ancestryData,
     reference = c("reference_AF_afr", "reference_AF_eur"),
     observed = "gnomad_AF_afr",
     pi.target = c(1, 0),
     pi.observed = c(.85, .15),
     adj_method = 'average',
     N_reference = c(704,741),
     N_observed = 20744,
     filter = TRUE)
print(adjusted_data$adjusted.AF[1:5,])





A quick demo of summix_local() {#a-quick-demo-of-summix_local}

The commands:

library(Summix)

# load the data
data("ancestryData")

results <- summix_local(data = ancestryData,
                        reference = c("reference_AF_afr", 
                                      "reference_AF_eas", 
                                      "reference_AF_eur", 
                                      "reference_AF_iam", 
                                      "reference_AF_sas"),
                        NSimRef = c(704,787,741,47,545),
                        observed="gnomad_AF_afr",
                        goodness.of.fit = T,
                        type = "variants",
                        algorithm = "fastcatch",
                        minVariants = 150,
                        maxVariants = 250,
                        maxStepSize = 1000,
                        diffThreshold = .02,
                        override_fit = F,
                        override_removeSmallAnc = TRUE,
                        selection_scan = T,
                        position_col = "POS")

print(results$results)



Below is an example of plotting the reference group proportions estimated in each block using summix_local(); where asterisks indicate local substructure blocks that are at least nominally significant (p-value\<=.05).

suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(viridis))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(ggplot2))

makeSubstructurePlot <- function(testResults, reference,
                             title = "Substructure Plot", addMarks = F,
                             include_table = F, include_plot = T,
                             totalBlocks = NULL, nominal = F) {
  colors <- viridis(length(reference))
  reference <- sort(reference)
  testResults$Block <- as.numeric(rownames(testResults))
  #make long format for plotting
  plotData <- testResults %>% pivot_longer(all_of(reference))
  if(addMarks == F) {
    ggplot(plotData, aes(fill = name, x = (End_Pos + Start_Pos)/2,
                         y = round(value, 4))) +
      geom_bar(position = "fill", stat = "identity",
               width = plotData$End_Pos - plotData$Start_Pos) +
      scale_fill_manual(values = colors) +
      scale_x_continuous(labels = comma) + xlab("Position") +
      ylab("Proportion") + ggtitle(title)
  } else {
    allMarks <- data.frame(matrix(nrow = nrow(testResults), ncol = 3))
    colnames(allMarks) <- c("x", "y", "mark")
    pvals <- testResults[grep("p.", colnames(testResults))]
    pvals <- pvals[,order(colnames(pvals))]

    # for any block that exceeds 95th percentile of loss/1000SNPs set
    # pvals = 1 so they won't be significant
    toSet <- which(testResults$objective > quantile(testResults$objective, .95))
    pvals[toSet,] <- 1
    pvals2 <- apply(pvals, 1, min)

    allMarks$x <- (testResults$End_Pos+testResults$Start_Pos)/2
    allMarks$y <- 1.03
    allMarks$i <- 1:nrow(allMarks)
    if(nominal == F) {
      allMarks <- allMarks %>% slice(which(pvals2 < .05/totalBlocks/(length(reference)-1)))
    } else {
      allMarks <- allMarks %>% slice(which(pvals2 < .05))
    }


    if(nrow(allMarks) > 0) {
      for(i in 1:nrow(allMarks)) {
        allMarks[i,]$mark <- "*"
      }
    }
    plotData <- testResults %>%
      pivot_longer(all_of(reference))
    if(nominal == F) {
      p <- ggplot(plotData, aes(fill = name, x = (End_Pos + Start_Pos)/2,
                                y = round(value, 4))) +
        geom_bar(position = "fill", stat = "identity",
                 width = plotData$End_Pos - plotData$Start_Pos) +
        scale_fill_manual(values = colors) +
        scale_x_continuous(labels = comma) + xlab("Position") +
        ylab("Proportion") + ggtitle(title) + theme_bw() +
        annotate("text", x = allMarks$x, y = allMarks$y, label = allMarks$mark,
                 color = "black", size = 6)
    } else {
      p <- ggplot(plotData, aes(fill = name, x = (End_Pos + Start_Pos)/2,
                                y = round(value, 4))) +
        geom_bar(position = "fill", stat = "identity",
                 width = plotData$End_Pos - plotData$Start_Pos) +
        scale_fill_manual(values = colors) +
        scale_x_continuous(labels = comma) + xlab("Position") +
        ylab("Proportion") + ggtitle(title) + theme_bw() +
        annotate("text", x = allMarks$x, y = allMarks$y, label = allMarks$mark,
                 color = "grey50", size = 4)
    }


    if(nrow(allMarks) == 0 | include_table == F) {
      p
    } else {
      allMarks$`Start Position` <- testResults[allMarks$i,]$Start_Pos
      allMarks$`End Position` <- testResults[allMarks$i,]$End_Pos
      allMarks <- cbind(allMarks, round(pvals[allMarks$i,],8))
      allMarks$x <- NULL
      allMarks$y <- NULL
      allMarks$mark <-NULL
      allMarks$i <- NULL
      t <- tableGrob(allMarks, theme = ttheme_minimal(base_size = 9),
                     rows = rep("", nrow(allMarks)))
      if(include_plot == F) {
        grid.arrange(t)
      } else {
        grid.arrange(p, t, nrow = 2, heights = c(2.5, 1))
      }
    }
  }
}

# makeSubstructurePlot(results$results,
#                  reference = c("reference_AF_afr", "reference_AF_eas", "reference_AF_eur", "reference_AF_iam", "reference_AF_sas"),
#                  title = "Example Local Substructure",
#                  totalBlocks = dim(results$results)[1],
#                  addMarks = T,
#                  nominal = T)



hendriau/Summix documentation built on Nov. 13, 2024, 6:53 a.m.