#------------------------------------------------------------------------------------------------#
# Function to generate the DPinput file from Battenberg output and sample vcfs which is then used in DPClust
# Beagle, NF
#------------------------------------------------------------------------------------------------#
#' @name
#' GenerateDPinput_mutect_vcf
#'
#' @title
#' Generate DP input
#'
#' @description
#' Converts snv counts from vcf into input format for DPClust
#'
#' @param
#' Loci of SNP positions used in CNV call
#' Allele frequencies of same SNP positions
#' File containing purity and ploidy of tumour sample
#' Subclones file from Battenberg containing copy number segments across genome
#' Sex of patient
#'
#' @return
#' A file containing all snv loci,
#' their mutation copy number status,
#' and their CCF (cancer cell fraction)
GenerateDPinput_mutect_vcf <- function(tumourplatekey,
normalplatekey,
gender,
vcffilepath,
rhoandpsifilepath,
subcloneshg38filepath,
output_loci_file,
output_file,
output_DPinput_file) {
# source(config_file)
# definitions
nucleotides = c("A","C","G","T")
# print samplename to logfile
samplename=paste0("tumo",tumourplatekey,"_norm",normalplatekey)
# give some info
print(paste("Sample name:",samplename))
#print(paste("This is run",run))
print(paste("Using",subcloneshg38filepath))
# check vcf exists and turn it into format for DPClust, else print that it doesn't exist and the sample is exiting
if (file.exists(vcffilepath)) {
# some info
print(paste("VCF is ",vcffilepath))
# load vcf
snvs=read.table(vcffilepath,stringsAsFactors=F)
print(paste("vcfloaded",vcffilepath))
# select only those variants which have passed all filters and which are SNVs (not indels)
snvs=snvs[which(snvs[,7]=="PASS" & nchar(snvs[,4])==1 & nchar(snvs[,5])==1),]
# make new snvs table for dpinput
newsnvs=snvs[,c(1,2,2,4,5,8,10,11)]
newsnvs[,2]=newsnvs[,3]-1
# remove stuff not assigned to a chr
if ("chrX" %in% snvs[,1]) {
newsnvs=newsnvs[1:max(which(newsnvs[,1]=="chrX")),]
}
# replace chr1 with 1 etc
newsnvs[,1]=gsub("chr","",newsnvs[,1])
# turn counts from vcf into required format
#tumdepth=lapply(newsnvs[,6],function(x){strsplit(x,"t2BamAC=")[[1]][2]})
#tumdepth=lapply(newsnvs[,10],function(x){strsplit(x,"t2BamAC=")[[1]][2]})
tumdepth = lapply(newsnvs[, 8], function(x) {
strsplit(x, ":")[[1]][2]
})
tumref = sapply(tumdepth, function(x) {
strsplit(x, ",")[[1]][1]
})
tumalt = sapply(tumdepth, function(x) {
strsplit(x, ",")[[1]][2]
})
tumpos = cbind(newsnvs[, c(1, 3, 4, 5), ], 0, 0, 0, 0,
0)
colnames(tumpos) = c("Chromosome", "Position", "REF",
"ALT", "count_A", "count_C", "count_G", "count_T",
"total_depth")
refcol = tumpos$REF
refcol[which(refcol == "A")] = 1
refcol[which(refcol == "C")] = 2
refcol[which(refcol == "G")] = 3
refcol[which(refcol == "T")] = 4
refcol = as.integer(refcol) + 4
altcol = tumpos$ALT
altcol[which(altcol == "A")] = 1
altcol[which(altcol == "C")] = 2
altcol[which(altcol == "G")] = 3
altcol[which(altcol == "T")] = 4
altcol = as.integer(altcol) + 4
tumpos[cbind(1:nrow(tumpos),refcol)]=tumref
tumpos[cbind(1:nrow(tumpos),refcol)]=tumalt
tumdepth=sapply(tumdepth,function(x){strsplit(x,";")[[1]][1]})
tumA=sapply(tumdepth,function(x){strsplit(x,",")[[1]][1]})
tumC=sapply(tumdepth,function(x){strsplit(x,",")[[1]][2]})
tumG=sapply(tumdepth,function(x){strsplit(x,",")[[1]][3]})
tumT=sapply(tumdepth,function(x){strsplit(x,",")[[1]][4]})
# create file type for tumour
tumpos=cbind(newsnvs[,c(1,3,4,5),],0,0,0,0,0)
colnames(tumpos)=c("Chromosome","Position","REF","ALT","count_A","count_C","count_G","count_T","total_depth")
tumpos[,5]=tumA
tumpos[,6]=tumC
tumpos[,7]=tumG
tumpos[,8]=tumT
refcol = tumpos$REF
refcol[which(refcol=="A")]=1
refcol[which(refcol=="C")]=2
refcol[which(refcol=="G")]=3
refcol[which(refcol=="T")]=4
refcol=as.integer(refcol)+4
altcol = tumpos$ALT
altcol[which(altcol=="A")]=1
altcol[which(altcol=="C")]=2
altcol[which(altcol=="G")]=3
altcol[which(altcol=="T")]=4
altcol=as.integer(altcol)+4
refcounts=tumpos[cbind(seq_along(refcol),refcol)]
altcounts=tumpos[cbind(seq_along(altcol),altcol)]
tumpos[,9]=as.integer(refcounts)+as.integer(altcounts)
# file containing all SNV positions and counts of all alleles at these positions
# normalpositioncounts=normpos[,c(1,2,5:9)]
tumourpositioncounts=tumpos[,c(1,2,5:9)]
print(paste("Writing tumour position counts to ",
output_file))
write.table(tumourpositioncounts,
output_file,
row.names=F,
quote=F,
sep="\t")
# file containing just the loci for input into the function below
tumourloci=tumpos[,c(1:4)]
print(paste("Writing tumour loci to ",
output_loci_file))
write.table(tumourloci,
output_loci_file,
row.names=F,
col.names=F,
quote=F,
sep="\t")
# define filenames for function
# loci_file=paste0(sampledir,dpcprepdir,tumourplatekey,"_loci.txt")
loci_file=output_loci_file
# allele_frequencies_file=paste0(sampledir,dpcprepdir,tumourplatekey,"_snv_counts_tumour.txt")
allele_frequencies_file=output_file
# cellularity_file=paste0(sampledir,fitcopydir,tumourplatekey,"_rho_and_psi.txt")
cellularity_file=rhoandpsifilepath
# subclone_file=paste0(sampledir,subdir,tumourplatekey,"_subclones.txt")
subclone_file=subcloneshg38filepath
# output_file=paste0(sampledir,dpcprepdir,samplename,"_DPinput.txt")
output_DPinput_file=output_DPinput_file
print(paste("Using loci file",loci_file))
print(paste("Using allele frequencies file",allele_frequencies_file))
print(paste("Using cellularity file",cellularity_file))
print(paste("Using subclones file",subclone_file))
print(paste("Writing to output file",output_DPinput_file))
if (gender=="FEMALE") {
gender="female"
} else if (gender=="MALE") {
gender="male"
} else {
print("Gender unspecified, exiting")
break
}
runGetDirichletProcessInfo(loci_file,
allele_frequencies_file,
cellularity_file,
subclone_file,
gender,
SNP.phase.file="NA",
mut.phase.file="NA",
output_file=output_DPinput_file)
} else {
print(paste("No vcf available - exiting"))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.