##########################################################################
# Genotyping Uncertainty with Sequencing data - Base package (GUSbase)
# Copyright 2017-2018 Timothy P. Bilton <tbilton@maths.otago.ac.nz>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#########################################################################
#' Convert VCF file into reference/alternative (RA) file.
#'
#' Function for converting a VCF file into RA format.
#'
#' The VCF files must contain some information regarding allelic depth. Currently, the function can use one of the following
#' fields (or group of fields) in a VCF file:
#' \itemize{
#' \item AD field
#' \item AO and RO fields
#' \item DP4 field
#' }
#' Information regarding VCF files and their format can be found at the samtools \href{https://samtools.github.io/hts-specs/VCFv4.3.pdf}{GitHub} page.
#'
#' RA format is a tab-delimited with columns, CHROM, POS, SAMPLES
#' where SAMPLES consists of sampleIDs, which typically (but not necessarily need to) consist of a colon-delimited sampleID, flowcellID, lane, seqlibID.
#' e.g.,
#' \tabular{llll}{
#' CHROM \tab POS \tab 999220:C4TWKACXX:7:56 \tab 999204:C4TWKACXX:7:56 \cr
#' 1 \tab 415 \tab 5,0 \tab 0,3 \cr
#' 1 \tab 443 \tab 1,0 \tab 4,4 \cr
#' 1 \tab 448 \tab 0,0 \tab 0,2
#' }
#' Note: Indels are removed, multiple alternative alleles are removed and ./. is translated into 0,0.
#'
#' The format of the pedigree files is a csv file with the following columns.
#' \itemize{
#' \item SampleID: A unique character string of the sample ID. These must correspond to those found in the VCF file.
#' \item IndividualID: A character giving the ID number of the individual for which the sample corresponds to.
#' Note that some samples can be from the same individual.
#' \item Mother: The ID of the mother as given in the IndividualID. Note, if the mother is unknown then this should be left blank.
#' \item Father: The ID of the father as given in the IndividualID. Note, if the father is unknown then this should be left blank.
#' \item Family: The name of the Family for a group of progeny with the same parents. Note that this is not necessary but if
#' given must be the same for all the progeny.
#' }
#'
#' @param infilename String giving the filename of the VCF file to be converted to RA format.
#' @param direct String of the directory (or relative to the working direct) where the RA file is to be written. Defaults to current working directory.
#' @param makePed A logical value. If TRUE, a pedigree file is initialized.
#' @return A string of the complete file path and name of the RA file created from the function.
#' In addition to creating a RA file, a pedigree file is also initialized in the same folder as the RA file if
#' specified and the pedigree does not already exist.
#' @author Timothy P. Bilton. Adapted from a Python script written by Rudiger Brauning and Rachael Ashby.
#' @seealso \code{\link{readRA}}
#' @examples
#' file <- simDS()
#' RAfile <- VCFtoRA(file$vcf)
#' @export VCFtoRA
VCFtoRA <- function(infilename, direct="./", makePed=F){
## Do some checks
if(!is.character(infilename) || !is.vector(infilename) || length(infilename) !=1)
stop("The input file name is not a string of length 1.")
if(!file.exists(infilename))
stop("Input file does not exist. Check your wording or the file path.")
if(!is.character(direct) || !is.vector(direct) || length(direct) != 1)
stop("Invalid input for the path to the directory where the RA file is to be written.")
cat("Processing VCF file: Converting to RA format.\n\n")
outfilename <- paste0(utils::tail(strsplit(infilename,split=.Platform$file.sep)[[1]],1),".ra.tab")
outpath <- dts(normalizePath(direct, winslash=.Platform$file.sep, mustWork=T))
headerlist = c('CHROM', 'POS')
## Read in the lines of the file
Lines <- readLines(infilename)
## entries for empty genotypes
empty_genotypes <- c("./.",".,.",".",".|.")
start = which(unlist(lapply(Lines, function(x) substr(x,1,2) == "##"))) ## starting position
start = max(start,0) + 1
## create the file
outfile = file.path(outpath,outfilename)
file.create(outfile,showWarnings = F)
## write the heading line
line = strsplit(Lines[start], split="\t")[[1]]
headerlist = c(headerlist, line[10:length(line)])
## Write the header to the RA file
newLines <- list(paste(headerlist, collapse="\t"))
cat("Found",length(headerlist)-2,"samples\n")
## Now write the SNPs
for(i in (start+1):length(Lines)){
line = trimws(Lines[[i]])
line = strsplit(line, split="\t")[[1]]
chrom = line[1]
pos = line[2]
ref = line[4]
alt = line[5]
## Filter out indels and multiple alternative alleles
if (ref == "." || alt == "." || length(alt) > 1)
next
else{
ad_pos <- ro_pos <- ao_pos <- dp4_pos <- NULL
## Check that there is allelic depth in the VCF file
format = strsplit(line[9], split= ":")[[1]]
if("AD" %in% format)
ad_pos = which(format == "AD")
else if(all(c("RO","AO") %in% format)){
ro_pos = which(format == "RO")
ao_pos = which(format == "AO")
}
else if("DP4" %in% format)
dp4_pos = which(format == "DP4")
else
stop(paste0("Error at SNP ", i,". No allelic depth information found. Either:\n",
" AD (alleleic depth)\n",
" RO (Reference allele observation count) and AO (Alternate allele observation count) information is needed.\n",
" DP4 (forward and reverse allele counts for reference and alternate alleles)\n",
"is required."))
## Extract the alleles depths
newline = c()
for(j in line[10:length(line)]){
if (j %in% empty_genotypes)
newline = c(newline,"0,0")
else{
j = strsplit(j, split = ":")[[1]]
if(!is.null(ad_pos) && (length(ad_pos) > 0) ){ #If ad_pos is not null, i.e., there is a value for it, append it to outlist
if( j[ad_pos] %in% empty_genotypes ) #gatk vcf4.2 will fill out genotype fields, even for uncovered data
newline = c(newline,"0,0")
else
newline = c(newline, j[ad_pos])
}
else if(!is.null(ro_pos) && !is.null(ao_pos) && (length(ro_pos) > 0) && (length(ao_pos) > 0)){ #ELSE IF ro_pos and ao_pos are not null or are equal to 0
ad = paste0(j[ro_pos], ",", j[ao_pos])
newline = c(newline, ad)
}
else if(!is.null(dp4_pos) && (length(dp4_pos) > 0) ){
counts = strsplit(j[dp4_pos], split=",")[[1]]
allele1 = as.integer(counts[1]) + as.integer(counts[2])
allele2 = as.integer(counts[3]) + as.integer(counts[4])
ad = paste0(allele1, ",", allele2)
newline = c(newline, ad)
}
else ##Should never really get here, but if AD, AO and RO are all null, it will break the script
stop(paste0("Can't find either Allele Depth (AD) or RO (Reference allele observation count) and AO (Alternate allele observation count) position ", j,".\n"))
}
}
newLines[[i-(start)+1]] <- paste0(c(chrom,pos,newline), collapse = "\t")
}
}
## open the connection to the file
con <- file(outfile)
writeLines(unlist(newLines), con=con)
close(con)
## output the information
cat(length(Lines)-start,"SNPs written\n\n")
cat("Name of RA file: \t\t",outfilename,"\n")
cat("Location of RA file: \t\t",outpath,"/\n\n",sep="")
## Initialize the pedigree file
if(makePed){
pedfile <- paste0(strsplit(paste0(utils::tail(strsplit(infilename,split=.Platform$file.sep)[[1]],1)),split="\\.")[[1]][1],"_ped.csv")
pedpath <- file.path(outpath,pedfile)
if(!file.exists(pedpath)){
cat("A pedigree file has been initialized.\n")
cat("Name of pedigree file: ",pedfile,"\n")
cat("Location of pedigree file: ",outpath,"/\n\n",sep="")
nSamp <- length(headerlist) - 2
utils::write.csv(cbind("SampleID"=c(headerlist[-c(1,2)]),"IndividualID"=rep("",nSamp),"Mother"=rep("",nSamp),
"Father"=rep("",nSamp), "Family"=rep("",nSamp)), file = pedpath, row.names=F)
}
}
return(invisible(outfile))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.