knitr::opts_chunk$set(echo = TRUE,cache = TRUE,out.width = "100%") purity = 0.8
The sample code in this vignette is designed to show R users how to navigate UNMASC [@little2021unmasc] and its expected file types.
library(smarter) library(UNMASC) library(data.table) library(sqldf)
The code below will generate
assuming that the tumor's purity is r purity
with a fixed seed.
outdir = sprintf("%s/sim",getwd()) sim = UNMASC::gen_simulated_data( outdir = outdir, purity = purity, seed = 123) print(sim)
Code below imports a single vcf.
vcf_fn = file.path(outdir,"UMN_1.vcf") vcf = data.table::fread(vcf_fn,sep = "\t",data.table = FALSE) dim(vcf) head(vcf,n = 3)
Remove some initial fields, order contigs with ChrNum
derived field.
# Ignore annotation vcf = smarter::smart_rmcols(vcf,"INFO") smart_table(vcf[["#CHROM"]]) # For ordering contigs vcf$ChrNum = ifelse( vcf[["#CHROM"]] == "X",23, ifelse(vcf[["#CHROM"]] == "Y",24, vcf[["#CHROM"]])) vcf$ChrNum = as.integer(vcf$ChrNum)
Extracts tumor and normal read counts.
vcf$nAD = NA vcf$nRD = NA vcf$tAD = NA vcf$tRD = NA vcf[,c("nAD","nRD")] = t(sapply(vcf$NORMAL,function(xx){ xx = strsplit(xx,":")[[1]][2] xx = strsplit(xx,"[|]")[[1]] xx },USE.NAMES = FALSE)) vcf[,c("tAD","tRD")] = t(sapply(vcf$TUMOR,function(xx){ xx = strsplit(xx,":")[[1]][2] xx = strsplit(xx,"[|]")[[1]] xx },USE.NAMES = FALSE)) # Rm NORMAL/TUMOR fields vcf = smarter::smart_rmcols(vcf,c("TUMOR","NORMAL")) # Set field types for(nm in c("nAD","nRD","tAD","tRD")){ vcf[,nm] = as.integer(vcf[,nm]) } # Derive VAF and depth vcf = smarter::smart_rmcols(vcf, c("nDP","tDP","nVAF","tVAF","ncex","tcex")) vcf = sqldf(" select vcf.*, nAD + nRD as nDP, tAD + tRD as tDP, 1.0 * nAD / (nAD + nRD) as nVAF, 1.0 * tAD / (tAD + tRD) as tVAF, 0.5 * log10(nAD + nRD) as ncex, 0.5 * log10(tAD + tRD) as tcex from vcf order by ChrNum, POS ") head(vcf,n = 3) # Contigs, Positions sqldf(" select [#CHROM], count([#CHROM]) as num_loci, min(POS) as Min_Pos, max(POS) as Max_Pos from vcf group by ChrNum ")
par(mfrow = c(1,2)) smart_hist(vcf$nDP,breaks = 40,xlab = "Total Depth",main = "Normal") smart_hist(vcf$tDP,breaks = 40,xlab = "Total Depth",main = "Tumor")
plot(vcf$nVAF, main = "Normal", xlab = "Genomic Index", ylab = "Variant Allele Frequency", cex = vcf$ncex) abline(h = c(0,0.5,1),lty = 2,lwd = 2,col = "blue")
plot(vcf$tVAF, main = "Tumor", xlab = "Genomic Index", ylab = "Variant Allele Frequency", cex = vcf$tcex) abline(h = c(0,0.5,1),lty = 2,lwd = 2,col = "red")
The function UNMASC::run_UNMASC()
requires a R data.frame with pre-set field names and data types, multiple file paths, and a few hyper parameters. With our simulated data set, we'll import everything, extract necessary fields and run the full workflow.
fvcf = c() all_fns = list.files(outdir,pattern = "UMN_") all_fns = all_fns[grepl(".vcf$",all_fns)] for(fn in all_fns){ message(".",appendLF = FALSE) vcf_fn = file.path(outdir,fn) nm = gsub(".vcf","",fn) vcf = data.table::fread(vcf_fn,sep = "\t",data.table = FALSE) # ID per UMN vcf$STUDYNUMBER = nm fvcf = rbind(fvcf,vcf) } dim(fvcf) smart_table(fvcf$STUDYNUMBER)
In a real data analysis, one might have multiple raw VCF files and a separate annotation file similar to the outputs from ../workflow/tumor_only.sh
through the provided bash function TO_workflow()
. The user can perform a merge or SQL join between the raw VCF data.frame and annotation data.frame and pass the resulting data.frame into UNMASC::run_UNMASC()
. Alternatively, the user can use UNMASC::prep_UNMASC_VCF()
to parse the annotations, read counts and generate the required UNMASC input vcf R data.frame.
The code provided here will demonstrate one way to manually parse for all required input fields.
fin_vcf = sqldf(" select [#CHROM] as Chr from fvcf ")
Let us isolate unique loci and parse annotations.
anno = sqldf(" select distinct [#CHROM], POS, REF, ALT, INFO from fvcf ") dim(anno)
UNMASC::run_UNMASC( outdir = outdir, vcf = fvcf, tumorID = "sim")
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.