In knausb/vcfR_docs: Documentation For VcfR

# Default figure options for every chunk: centred, width 8, height 8.
knitr::opts_chunk$set(
  fig.align = 'center',
  fig.width = 8,
  fig.height = 8
)

Input data

Data import is performed similarly to the other examples.

# Attach vcfR and the pinfsc50 package that ships the example data.
library(vcfR)
library(pinfsc50)

# Resolve the path of the example VCF installed with pinfsc50.
vcf_file <- system.file(
  "extdata", "pinf_sc50.vcf.gz",
  package = "pinfsc50"
)

# Read the file into memory without progress output, then print the object.
vcf <- read.vcfR(vcf_file, verbose = FALSE)
vcf

Extract the alleles and create frequencies

# Pull the AD (allele depth) element out of the genotype section.
ad <- extract.gt(vcf, element = 'AD')

# Split the AD values into their first and second records.
allele1 <- masplit(ad, record = 1)
allele2 <- masplit(ad, record = 2)

# Express each depth as a fraction of the two depths combined.
allele_depth_sum <- allele1 + allele2
ad1 <- allele1 / allele_depth_sum
ad2 <- allele2 / allele_depth_sum

The most abundant allele

# Per-sample 10% and 90% depth quantiles of the first allele record.
# (A 15%/95% pair was tried previously.)
dp <- allele1
sums <- apply(dp, MARGIN = 2, quantile, probs = c(0.1, 0.9), na.rm = TRUE)

# 4 x 3 panel grid, tight inner margins, outer margins for the shared labels.
par(mfrow = c(4, 3))
par(mar = c(2, 2, 1, 1))
par(oma = c(1, 1, 0, 0))

# One depth histogram per sample with its two quantiles marked. The grid holds
# at most 12 panels; seq_len() also copes with data that has fewer columns.
for (i in seq_len(min(12, ncol(allele1)))) {
  hist(allele1[, i], breaks = seq(0, 1e3, by = 1), xlim = c(0, 100),
       col = 8, main = "", xlab = "", ylab = "")
  title(main = colnames(allele1)[i])
  abline(v = sums[, i], col = 2)
}
title(xlab = "Depth", line = 0, outer = TRUE, font = 2)
title(ylab = "Count", line = 0, outer = TRUE, font = 2)

# Restore the single-panel layout and the margins, as the other plotting
# sections of this document do (mfrow was previously left at 4 x 3 here).
par(mfrow = c(1, 1))
par(mar = c(5, 4, 4, 2))
par(oma = c(0, 0, 0, 0))

The second most abundant allele

# 4 x 3 panel grid, tight inner margins, outer margins for the shared labels.
par(mfrow = c(4, 3), mar = c(2, 2, 1, 1), oma = c(1, 1, 0, 0))

# Depth histograms of the second allele record for the first 12 samples,
# keeping only depths greater than zero. (A fixed ylim of 0-1000 was tried
# previously and is not applied.)
for (i in 1:12) {
  tmp <- allele2[, i]
  tmp <- tmp[tmp > 0]
  hist(
    tmp,
    breaks = seq(0, 1e3, by = 1),
    xlim = c(0, 100),
    col = 8,
    main = "",
    xlab = "",
    ylab = ""
  )
  title(main = colnames(allele1)[i])
}

# Shared axis labels drawn in the outer margin.
title(xlab = "Depth", line = 0, outer = TRUE, font = 2)
title(ylab = "Count", line = 0, outer = TRUE, font = 2)

# Back to a single panel with the larger margins.
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2), oma = c(0, 0, 0, 0))

Find peaks of density

# Filter on depth quantiles.
# `sums` holds, for each column of `allele1`, the 10% quantile (row 1) and the
# 90% quantile (row 2), ignoring NAs.
sums <- apply(allele1, MARGIN=2, quantile, probs=c(0.1, 0.9), na.rm=TRUE)
# Allele 1
# Subtract each column's lower quantile; negative values mark depths below it.
dp2 <- sweep(allele1, MARGIN=2, FUN = "-", sums[1,])
#allele1[dp2 < 0] <- NA
# Set those genotypes to NA in every column of vcf@gt except the first.
# The !is.na() term restricts the mask to entries that are not already missing.
vcf@gt[,-1][ dp2 < 0 & !is.na(vcf@gt[,-1]) ] <- NA
# Subtract each column's upper quantile; positive values mark depths above it.
dp2 <- sweep(allele1, MARGIN=2, FUN = "-", sums[2,])
#allele1[dp2 > 0] <- NA
vcf@gt[,-1][dp2 > 0] <- NA
# Allele 2
# NOTE(review): the same `sums` (quantiles computed from allele1) are reused to
# threshold allele2 - confirm this is intended rather than allele2's own quantiles.
dp2 <- sweep(allele2, MARGIN=2, FUN = "-", sums[1,])
vcf@gt[,-1][ dp2 < 0 & !is.na(vcf@gt[,-1]) ] <- NA
dp2 <- sweep(allele2, MARGIN=2, FUN = "-", sums[2,])
vcf@gt[,-1][dp2 > 0] <- NA

# Censor homozygotes.
# Extract the GT element, flag heterozygous calls with is_het(), and set every
# genotype that is not flagged to NA in the same columns of vcf@gt.
gt <- extract.gt(vcf, element = 'GT')
hets <- is_het(gt)
is.na( vcf@gt[,-1][ !hets ] ) <- TRUE




# Re-extract the allele depths, now from the filtered vcf object.
ad <- extract.gt(vcf, element = 'AD')
allele1 <- masplit(ad, record = 1)
allele2 <- masplit(ad, record = 2)
allele_depth_sum <- allele1 + allele2
ad1 <- allele1 / allele_depth_sum
ad2 <- allele2 / allele_depth_sum

# Parameters.
# Window size (1e5 was also tried).
winsize <- 2e5
# Histogram bin width (0.1, 0.05, 0.025 and 0.01 were also tried).
bin_width <- 0.02

# Allele balance of each record relative to the two combined.
balance_sum <- ad1 + ad2
freq1 <- ad1 / balance_sum
freq2 <- ad2 / balance_sum

# Find peaks per window and sample, then blank out every peak whose
# corresponding count is below 20.
variant_pos <- getPOS(vcf)

myPeaks1 <- freq_peak(
  freq1, variant_pos,
  winsize = winsize, bin_width = bin_width
)
is.na(myPeaks1$peaks[myPeaks1$counts < 20]) <- TRUE

myPeaks2 <- freq_peak(
  freq2, variant_pos,
  winsize = winsize, bin_width = bin_width, lhs = FALSE
)
is.na(myPeaks2$peaks[myPeaks2$counts < 20]) <- TRUE

Proof of concept. Major allele.

# 4 x 4 grid: one row per sample, one column per window.
par(mfrow = c(4, 4), mar = c(2, 2, 1, 1), oma = c(1, 1, 0, 0))

# Tick positions and labels shared by every panel.
balance_ticks <- c(0,0.25,0.333,0.5,0.666,0.75,1)
balance_labels <- c(0,'1/4','1/3','1/2','2/3','3/4',1)

# For each of the four example samples, draw the freq1 histogram of the first
# four windows and mark the peak reported for that window and sample.
for (mySample in c("BL2009P4_us23", "DDR7602", "IN2009T1_us22", "P17777us22")) {
  for (i in 1:4) {
    hist(
      freq1[ myPeaks1$wins[i,'START_row']:myPeaks1$wins[i,'END_row'], mySample ],
      breaks = seq(0, 1, by = bin_width),
      xlim = c(0, 1),
      col = 8,
      main = "",
      xaxt = 'n'
    )
    axis(side = 1, at = balance_ticks, labels = balance_labels, las = 1)
    abline(v = myPeaks1$peaks[i, mySample], col = 2)
    # The sample name is written once per row, above the second panel.
    if (i == 2) {
      title(main = mySample)
    }
  }
}

# Back to a single panel with the larger margins.
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2), oma = c(0, 0, 0, 0))

Second most abundant allele.

# 4 x 4 grid: one row per sample, one column per window.
par(mfrow = c(4, 4))
par(mar = c(2, 2, 1, 1))
par(oma = c(1, 1, 0, 0))

# For each of the four example samples, draw the freq2 histogram of the first
# four windows and mark the peak reported in myPeaks2 for that window and
# sample. Window row ranges are always taken from myPeaks1$wins; previously the
# last sample mixed a start row from myPeaks2$wins with an end row from
# myPeaks1$wins.
for (mySample in c("BL2009P4_us23", "DDR7602", "IN2009T1_us22", "P17777us22")) {
  for (i in 1:4) {
    hist(freq2[ myPeaks1$wins[i,'START_row']:myPeaks1$wins[i,'END_row'], mySample ],
         breaks = seq(0, 1, by = bin_width), xlim = c(0, 1), col = 8,
         main = "", xaxt = 'n')
    axis(side = 1, at = c(0, 0.25, 0.333, 0.5, 0.666, 0.75, 1),
         labels = c(0, '1/4', '1/3', '1/2', '2/3', '3/4', 1), las = 1)
    abline(v = myPeaks2$peaks[i, mySample], col = 2)
    # The sample name is written once per row, above the second panel.
    if (i == 2) {
      title(main = mySample)
    }
  }
}

# Back to a single panel with the larger margins.
par(mfrow = c(1, 1))
par(mar = c(5, 4, 4, 2))
par(oma = c(0, 0, 0, 0))

# 4 x 3 panel grid, tight inner margins, small outer margins.
par(mfrow = c(4, 3), mar = c(2, 2, 1, 1), oma = c(1, 1, 0, 0))

# Values reused by every panel.
variant_pos <- getPOS(vcf)
win_start <- myPeaks1$wins[, 'START_pos']
win_end <- myPeaks1$wins[, 'END_pos']

# One panel per sample (first 12): allele balance against position, with
# reference lines at 1/4, 1/3, 1/2, 2/3, 3/4 and one horizontal segment per
# window at the peak found for each of the two alleles.
for (i in 1:12) {
  mySample <- colnames(freq1)[i]

  # Empty frame first so that the reference lines sit beneath the points.
  plot(variant_pos, freq1[, mySample],
       ylim = c(0, 1), type = "n", yaxt = 'n',
       main = mySample, xlab = "POS", ylab = "Allele balance")
  axis(side = 2, at = c(0, 0.25, 0.333, 0.5, 0.666, 0.75, 1),
       labels = c(0, '1/4', '1/3', '1/2', '2/3', '3/4', 1), las = 1)
  abline(h = c(0.25, 0.333, 0.5, 0.666, 0.75), col = 8)

  # Semi-transparent points for both alleles.
  points(variant_pos, freq1[, mySample], pch = 20, col = "#A6CEE344")
  points(variant_pos, freq2[, mySample], pch = 20, col = "#1F78B444")

  # Window peaks for the first and second allele.
  segments(x0 = win_start, y0 = myPeaks1$peaks[, mySample],
           x1 = win_end, lwd = 3)
  segments(x0 = win_start, y0 = myPeaks2$peaks[, mySample],
           x1 = win_end, lwd = 3)
}

# Back to a single panel with the larger margins.
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2), oma = c(0, 0, 0, 0))

Visualization #2

# Index of the sample (column of freq1) to display.
i <- 2

# Two panels side by side: a wide scatter plot and a narrow marginal histogram.
layout(matrix(1:2, nrow = 1), widths = c(4, 1))
par(mar = c(5, 4, 4, 0))

# Left panel: allele balance against position with reference lines at
# 1/4, 1/3, 1/2, 2/3, 3/4 and one segment per window at each allele's peak.
mySample <- colnames(freq1)[i]
plot(getPOS(vcf), freq1[, mySample], ylim = c(0, 1), type = "n", yaxt = 'n',
     main = mySample, xlab = "POS", ylab = "Allele balance")
axis(side = 2, at = c(0, 0.25, 0.333, 0.5, 0.666, 0.75, 1),
     labels = c(0, '1/4', '1/3', '1/2', '2/3', '3/4', 1), las = 1)
abline(h = c(0.25, 0.333, 0.5, 0.666, 0.75), col = 8)
points(getPOS(vcf), freq1[, mySample], pch = 20, col = "#A6CEE344")
points(getPOS(vcf), freq2[, mySample], pch = 20, col = "#1F78B444")
segments(x0 = myPeaks1$wins[, 'START_pos'], y0 = myPeaks1$peaks[, mySample],
         x1 = myPeaks1$wins[, 'END_pos'], lwd = 3)
segments(x0 = myPeaks1$wins[, 'START_pos'], y0 = myPeaks2$peaks[, mySample],
         x1 = myPeaks1$wins[, 'END_pos'], lwd = 3)

# Bin counts over the whole sample for both alleles (nothing is drawn here).
bp1 <- hist(freq1[, mySample], breaks = seq(0, 1, by = bin_width), plot = FALSE)
bp2 <- hist(freq2[, mySample], breaks = seq(0, 1, by = bin_width), plot = FALSE)

# Right panel: horizontal bars, second allele overlaid on the first. The bar
# width follows bin_width (it was hard-coded to 0.02) so that the bars keep
# spanning 0-1 if the bin width parameter is changed.
par(mar = c(5, 1, 4, 2))
barplot(height = bp1$counts, width = bin_width, space = 0, horiz = TRUE,
        add = FALSE, col = "#A6CEE3")
barplot(height = bp2$counts, width = bin_width, space = 0, horiz = TRUE,
        add = TRUE, col = "#1F78B4")

# Restore margins and a single-panel layout.
par(mar = c(5, 4, 4, 2))
par(mfrow = c(1, 1))

Loop

# Draw the scatter plot plus marginal histogram once for every sample.
# seq_len() yields no iterations when freq1 has zero columns, whereas
# 1:ncol(freq1) would then iterate over c(1, 0).
for (i in seq_len(ncol(freq1))) {

  # Two panels side by side: a wide scatter plot and a narrow histogram.
  layout(matrix(1:2, nrow = 1), widths = c(4, 1))
  par(mar = c(5, 4, 4, 0))

  # Left panel: allele balance against position with reference lines at
  # 1/4, 1/3, 1/2, 2/3, 3/4 and one segment per window at each allele's peak.
  mySample <- colnames(freq1)[i]
  plot(getPOS(vcf), freq1[, mySample], ylim = c(0, 1), type = "n", yaxt = 'n',
       main = mySample, xlab = "POS", ylab = "Allele balance")
  axis(side = 2, at = c(0, 0.25, 0.333, 0.5, 0.666, 0.75, 1),
       labels = c(0, '1/4', '1/3', '1/2', '2/3', '3/4', 1), las = 1)
  abline(h = c(0.25, 0.333, 0.5, 0.666, 0.75), col = 8)
  points(getPOS(vcf), freq1[, mySample], pch = 20, col = "#A6CEE344")
  points(getPOS(vcf), freq2[, mySample], pch = 20, col = "#1F78B444")
  segments(x0 = myPeaks1$wins[, 'START_pos'], y0 = myPeaks1$peaks[, mySample],
           x1 = myPeaks1$wins[, 'END_pos'], lwd = 3)
  segments(x0 = myPeaks1$wins[, 'START_pos'], y0 = myPeaks2$peaks[, mySample],
           x1 = myPeaks1$wins[, 'END_pos'], lwd = 3)

  # Bin counts over the whole sample for both alleles (nothing is drawn here).
  bp1 <- hist(freq1[, mySample], breaks = seq(0, 1, by = bin_width), plot = FALSE)
  bp2 <- hist(freq2[, mySample], breaks = seq(0, 1, by = bin_width), plot = FALSE)

  # Right panel: horizontal bars, second allele overlaid on the first. The bar
  # width follows bin_width (it was hard-coded to 0.02) so that the bars keep
  # spanning 0-1 if the bin width parameter is changed.
  par(mar = c(5, 1, 4, 2))
  barplot(height = bp1$counts, width = bin_width, space = 0, horiz = TRUE,
          add = FALSE, col = "#A6CEE3")
  barplot(height = bp2$counts, width = bin_width, space = 0, horiz = TRUE,
          add = TRUE, col = "#1F78B4")

  # Restore margins and a single-panel layout before the next sample.
  par(mar = c(5, 4, 4, 2))
  par(mfrow = c(1, 1))
}

Most abundant allele

# Render the table of windows held in myPeaks1$wins.
knitr::kable(myPeaks1$wins)

# Two panels side by side with tight margins; the top outer margin leaves room
# for a title drawn with outer = TRUE.
par(
  mfrow = c(1, 2),
  mar = c(2, 2, 1, 1),
  oma = c(1, 1, 1, 0)
)

# Cut-off values placed half a gap either side of 1/4, 1/2 and 3/4:
# around 1/4 and 3/4 the gap is the distance between 1/4 and 1/3,
# around 1/2 it is the distance between 1/2 and 2/3.
critical <- c(
  1/4 - (1/3 - 1/4)/2,
  1/4 + (1/3 - 1/4)/2,
  1/2 - (2/3 - 1/2)/2,
  1/2 + (2/3 - 1/2)/2,
  3/4 - (1/3 - 1/4)/2,
  3/4 + (1/3 - 1/4)/2
)


# For every sample and every window draw a pair of histograms: freq2 first,
# then freq1. Each is marked with the peak found for that window and sample
# (col = 2) and with the `critical` cut-offs (col = "#228B22").
# seq_len() is used so that an empty set of samples or windows produces no
# iterations, whereas 1:0 would iterate over c(1, 0).
for (j in seq_len(ncol(freq1))) {
  mySample <- colnames(freq1)[j]

  for (i in seq_len(nrow(myPeaks1$wins))) {
    # Second allele.
    # NOTE(review): this axis uses las = 3 while the freq1 axis below uses
    # las = 1 - confirm the different label orientation is intended.
    hist(freq2[ myPeaks1$wins[i,'START_row']:myPeaks1$wins[i,'END_row'], mySample ],
         breaks = seq(0, 1, by = bin_width), xlim = c(0, 1), col = 8,
         main = "", xaxt = 'n')
    axis(side = 1, at = c(0, 0.25, 0.333, 0.5, 0.666, 0.75, 1),
         labels = c(0, '1/4', '1/3', '1/2', '2/3', '3/4', 1), las = 3)
    abline(v = myPeaks2$peaks[i, mySample], col = 2, lwd = 2)
    abline(v = critical, col = "#228B22")

    # First allele.
    hist(freq1[ myPeaks1$wins[i,'START_row']:myPeaks1$wins[i,'END_row'], mySample ],
         breaks = seq(0, 1, by = bin_width), xlim = c(0, 1), col = 8,
         main = "", xaxt = 'n')
    axis(side = 1, at = c(0, 0.25, 0.333, 0.5, 0.666, 0.75, 1),
         labels = c(0, '1/4', '1/3', '1/2', '2/3', '3/4', 1), las = 1)
    abline(v = myPeaks1$peaks[i, mySample], col = 2, lwd = 2)
    abline(v = critical, col = "#228B22")

    # Window number in the right margin, sample name in the outer margin.
    mtext(paste("Window", i), side = 4)
    title(main = mySample, outer = TRUE)
  }
}


# Back to a single panel with the larger margins.
par(mfrow = c(1, 1))
par(mar = c(5, 4, 4, 2))
par(oma = c(0, 0, 0, 0))