# Global chunk defaults for this document: echo the code, and render
# centred 8 x 8 figures.
knitr::opts_chunk$set(
  echo = TRUE,
  fig.align = "center",
  fig.width = 8,
  fig.height = 8
)

Creating histograms of allele balance can provide a quick perspective on the quality of a sequenced genome as well as insights into whether there may be ploidy variation within an individual. The creation of these plots is covered elsewhere. Here I demonstrate how to automate this process over all samples from a VCF file.

Data input

# Load libraries
library(vcfR)
library(pinfsc50)

# Locate the example VCF file inside the installed pinfsc50 package
vcf_file <- system.file(
  "extdata", "pinf_sc50.vcf.gz",
  package = "pinfsc50"
)

# Read the file into a vcfR object (verbose = FALSE keeps the reader quiet)
vcf <- read.vcfR(vcf_file, verbose = FALSE)

# Show a summary of what was loaded
vcf

Allele depth

# Pull the AD (allele depth) element of every genotype out of the VCF
ad <- extract.gt(vcf, element = "AD")

# Split each AD entry and keep records 1 and 2 as separate matrices.
# NOTE(review): which allele ends up as record 1 depends on masplit()'s
# default sort settings -- confirm against the vcfR documentation.
allele1 <- masplit(ad, record = 1)
allele2 <- masplit(ad, record = 2)

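Define an acceptable range of allele depth for each sample, using the 10th and 90th percentiles of its `allele1` depths, and set genotypes whose allele depths fall outside this range to missing.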

# Per-sample depth limits taken from allele1: row 1 of `sums` is the 10th
# percentile and row 2 the 90th percentile of each column (NAs ignored).
# An earlier variant used probs = c(0.15, 0.95).
sums <- apply(allele1, MARGIN = 2, quantile, probs = c(0.1, 0.9), na.rm = TRUE)
lower_cut <- sums[1, ]
upper_cut <- sums[2, ]

# Flag cells whose depth is below the lower limit or above the upper limit
# of their column. Both allele matrices are compared against the limits
# derived from allele1.
allele1_out <-
  sweep(allele1, MARGIN = 2, STATS = lower_cut, FUN = "-") < 0 |
  sweep(allele1, MARGIN = 2, STATS = upper_cut, FUN = "-") > 0
allele2_out <-
  sweep(allele2, MARGIN = 2, STATS = lower_cut, FUN = "-") < 0 |
  sweep(allele2, MARGIN = 2, STATS = upper_cut, FUN = "-") > 0

# Set every genotype flagged for either allele to missing. The first column
# of vcf@gt is excluded so the mask lines up with the sample columns, and
# which() drops comparisons that evaluated to NA so those cells are untouched.
vcf@gt[, -1][which(allele1_out | allele2_out)] <- NA

# Optional hard filter on absolute depth (disabled):
# vcf@gt[, -1][allele1 < 8] <- NA

Subset to heterozygotes and recalculate allele balance

# Re-extract genotypes from the filtered object and find heterozygous calls
gt <- extract.gt(vcf, element = "GT")
hets <- is_het(gt)

# Blank the allele depths of every call that is not flagged heterozygous
ad[!hets] <- NA

# Re-split the masked depths into the two allele records
allele1 <- masplit(ad, record = 1)
allele2 <- masplit(ad, record = 2)

# Allele balance: each record's share of the combined depth of both records
depth_total <- allele1 + allele2
ad1 <- allele1 / depth_total
ad2 <- allele2 / depth_total

Plot allele balance histograms

library(RColorBrewer)

# Draw one allele-balance figure per sample (column of ad2): the ad2
# histogram first, then the matching ad1 histogram overlaid on top.
n_samples <- ncol(ad2)

# The palette and bin edges are identical for every sample, so compute them
# once instead of inside the loop.
paired <- brewer.pal(n = 12, name = "Paired")
bins <- seq(0, 1, by = 0.02)

# Colour indices: ad2 cycles through 1, 3, 5, 7 and ad1 uses the next index
# (2, 4, 6, 8). Built from seq_len() so that zero sample columns produce
# empty vectors rather than an error.
col1 <- (2 * seq_len(n_samples) - 1) %% 8
col2 <- col1 + 1

# seq_len() avoids the 1:0 trap when there are no columns to plot.
for (i in seq_len(n_samples)) {
  hist(ad2[, i],
       breaks = bins,
       col = paired[col1[i]],
       xaxt = "n", main = colnames(ad2)[i], xlab = "")
  hist(ad1[, i],
       breaks = bins,
       col = paired[col2[i]], add = TRUE)
  # Custom x axis labelled with the expected allele-balance fractions; the
  # ticks sit at the exact fractions rather than rounded decimals.
  axis(side = 1,
       at = c(0, 1 / 4, 1 / 3, 1 / 2, 2 / 3, 3 / 4, 1),
       labels = c(0, "1/4", "1/3", "1/2", "2/3", "3/4", 1))
}


knausb/vcfR_docs documentation built on May 20, 2019, 12:52 p.m.