In knausb/vcfR_docs: Documentation For VcfR

# Global knitr chunk options: echo the code and render centred 8 x 6 figures.
knitr::opts_chunk$set(
  echo = TRUE,
  fig.align = "center",
  fig.width = 8,
  fig.height = 6
)

Creating histograms of allele balance can provide a quick perspective on the quality of a sequenced genome as well as insights into whether there may be ploidy variation within an individual. The creation of these plots is covered elsewhere. Here I demonstrate how to automate this process over all samples from a VCF file.

Data input

# Load libraries
library(vcfR)
library(pinfsc50)

# Determine file locations.
# mustWork = TRUE makes a missing package or file fail here with a clear
# error, instead of system.file() returning "" and read.vcfR() failing later.
vcf_file <- system.file("extdata", "pinf_sc50.vcf.gz",
                        package = "pinfsc50", mustWork = TRUE)

# Read data into memory and print a summary of the resulting object.
vcf <- read.vcfR(vcf_file, verbose = FALSE)
vcf

Process data

# Extract the AD (allele depth) element for every sample and split the
# comma-separated field into its first and second records as returned by
# masplit().
# NOTE(review): masplit() may sort the depths before picking a record, so
# "record 1" is not necessarily the first allele listed - confirm against the
# vcfR documentation.
ad <- extract.gt(vcf, element = "AD")
allele1 <- masplit(ad, record = 1)
allele2 <- masplit(ad, record = 2)

# Filter on depth quantiles.
# `sums` holds, per sample (column), the 10% (row 1) and 90% (row 2) quantiles
# of the record-1 depths. The same bounds are applied to both depth matrices.
sums <- apply(allele1, MARGIN = 2, quantile, probs = c(0.1, 0.9), na.rm = TRUE)

# Any genotype whose depth falls below the lower bound or above the upper
# bound is set to NA. `vcf@gt[, -1]` skips the first column of the gt slot
# (in vcfR objects this is the FORMAT column, not a sample).
# which() drops NA comparisons, so cells with missing depth are left untouched.
for (depth in list(allele1, allele2)) {
  too_low <- sweep(depth, MARGIN = 2, STATS = sums[1, ], FUN = "-") < 0
  too_high <- sweep(depth, MARGIN = 2, STATS = sums[2, ], FUN = "-") > 0
  vcf@gt[, -1][which(too_low | too_high)] <- NA
}

# Censor homozygotes: every genotype cell that is_het() does not flag as
# heterozygous is set to NA, so only heterozygous calls remain.
gt <- extract.gt(vcf, element = "GT")
hets <- is_het(gt)
vcf@gt[, -1][!hets] <- NA

# Extract allele depths again from the vcf object (cells set to NA above are
# now missing) and express each record's depth as a proportion of the two
# records combined.
ad <- extract.gt(vcf, element = "AD")
allele1 <- masplit(ad, record = 1)
allele2 <- masplit(ad, record = 2)
depth_total <- allele1 + allele2
ad1 <- allele1 / depth_total
ad2 <- allele2 / depth_total

# Parameters
# Window size handed to freq_peak() (alternative tried: 1e5).
winsize <- 2e5
# Bin width handed to freq_peak() and reused for the histograms below
# (alternatives tried: 0.1, 0.05, 0.025, 0.01).
bin_width <- 0.02

# Find peaks
# Allele-balance frequencies for the two records.
freq1 <- ad1 / (ad1 + ad2)
freq2 <- ad2 / (ad1 + ad2)

# Windowed peak calls for each frequency matrix; the second call passes
# lhs = FALSE.
myPeaks1 <- freq_peak(freq1, getPOS(vcf),
                      winsize = winsize, bin_width = bin_width)
myPeaks2 <- freq_peak(freq2, getPOS(vcf),
                      winsize = winsize, bin_width = bin_width, lhs = FALSE)

# Discard peaks from windows whose count is below 20.
myPeaks1$peaks[myPeaks1$counts < 20] <- NA
myPeaks2$peaks[myPeaks2$counts < 20] <- NA

Plot

library(RColorBrewer)

# One figure per sample: allele balance along the sequence (left panel) with
# a marginal histogram of the same values (right panel).

# Per-sample colour indices into the "Paired" palette: an odd index
# (1, 3, 5, 7, then cycling) for freq1 and the following even index for freq2.
# seq_len() keeps this well-defined even when there are no samples.
n_samples <- ncol(freq1)
col1 <- (2 * seq_len(n_samples) - 1) %% 8
col2 <- col1 + 1

# Two-digit hex alpha channel appended to the point colours.
alpha <- "11"

# Loop invariants, computed once instead of on every iteration.
paired_pal <- brewer.pal(n = 12, name = "Paired")
pos <- getPOS(vcf)
win_shades <- c("#C0C0C0", "#808080")

for (i in seq_len(n_samples)) {
  # Wide panel for the scatter plot, narrow panel for the histogram.
  layout(matrix(1:2, nrow = 1), widths = c(4, 1))
  par(mar = c(5, 4, 4, 0))

  mySample <- colnames(freq1)[i]

  # Empty frame with reference lines at common allele-balance ratios.
  plot(pos, freq1[, mySample], ylim = c(0, 1), type = "n", yaxt = "n",
       main = mySample, xlab = "POS", ylab = "Allele balance")
  axis(side = 2, at = c(0, 0.25, 0.333, 0.5, 0.666, 0.75, 1),
       labels = c(0, "1/4", "1/3", "1/2", "2/3", "3/4", 1), las = 1)
  abline(h = c(0.25, 0.333, 0.5, 0.666, 0.75), col = 8)

  # Bars rising from the bottom: each window's share of this sample's total
  # myPeaks1 count, in alternating greys.
  bph <- myPeaks1$counts[, mySample] / sum(myPeaks1$counts[, mySample])
  rect(xleft = myPeaks1$wins[, "START"], ybottom = 0,
       xright = myPeaks1$wins[, "END"], ytop = bph,
       col = win_shades, border = NA)

  # Bars hanging from the top: the same share computed from myPeaks2 counts.
  bph <- 1 - myPeaks2$counts[, mySample] / sum(myPeaks2$counts[, mySample])
  rect(xleft = myPeaks1$wins[, "START"], ybottom = bph,
       xright = myPeaks1$wins[, "END"], ytop = 1,
       col = win_shades, border = NA)

  # Mean count per window, written at the midpoint of the windowed region
  # (midpoint = mean of first START and last END, not half the span).
  myMid <- (myPeaks2$wins[1, "START"] +
              myPeaks2$wins[nrow(myPeaks2$wins), "END"]) / 2
  myMu <- format(mean(myPeaks2$counts[, mySample]), digits = 4)
  text(x = myMid, y = 0, labels = bquote(mu == .(myMu)),
       adj = c(1, 0), font = 2)

  myMid <- (myPeaks1$wins[1, "START"] +
              myPeaks1$wins[nrow(myPeaks1$wins), "END"]) / 2
  myMu <- format(mean(myPeaks1$counts[, mySample]), digits = 4)
  text(x = myMid, y = 1, labels = bquote(mu == .(myMu)),
       adj = c(1, 1), font = 2)

  # Per-variant frequencies as semi-transparent points.
  points(pos, freq1[, mySample], pch = 20,
         col = paste0(paired_pal[col1[i]], alpha))
  points(pos, freq2[, mySample], pch = 20,
         col = paste0(paired_pal[col2[i]], alpha))

  # Horizontal segment at each window's peak value (y1 defaults to y0).
  segments(x0 = myPeaks1$wins[, "START_pos"], y0 = myPeaks1$peaks[, mySample],
           x1 = myPeaks1$wins[, "END_pos"], lwd = 3)
  segments(x0 = myPeaks1$wins[, "START_pos"], y0 = myPeaks2$peaks[, mySample],
           x1 = myPeaks1$wins[, "END_pos"], lwd = 3)

  # Marginal histograms use the same bin width as the peak finding; the bar
  # width follows bin_width so the bars keep spanning 0..1 if it is changed.
  hist_breaks <- seq(0, 1, by = bin_width)
  bp1 <- hist(freq1[, mySample], breaks = hist_breaks, plot = FALSE)
  bp2 <- hist(freq2[, mySample], breaks = hist_breaks, plot = FALSE)

  par(mar = c(5, 1, 4, 2))
  barplot(height = bp1$counts, width = bin_width, space = 0, horiz = TRUE,
          add = FALSE, col = paired_pal[col1[i]])
  barplot(height = bp2$counts, width = bin_width, space = 0, horiz = TRUE,
          add = TRUE, col = paired_pal[col2[i]])
}