polyBreedR: Genomics-assisted breeding for polyploids (and diploids)

Documented in write_vcf

#' Create VCFv4.3 file
#' 
#' Create VCFv4.3 file
#' 
#' Several standard INFO key are recognized:
#' ##INFO=<ID=REF,Number=A,Type=Character,Description=\"Array allele (A/B) in reference genome\">
#' ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
#' ##INFO=<ID=DP.AVG,Number=1,Type=Float,Description="Average Sample Depth">
#' ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
#' ##INFO=<ID=AB,Number=1,Type=Float,Description="Allelic Bias">
#' ##INFO=<ID=SE,Number=1,Type=Integer,Description="Sequencing Error (PHRED)">
#' ##INFO=<ID=OD,Number=1,Type=Integer,Description="OverDispersion (PHRED)">
#' ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
#' ##INFO=<ID=AF.GT,Number=A,Type=Float,Description="Allele Frequency based on GT">
#' ##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">"
#' ##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles">"

#' Every element of \code{geno} is m x n matrix (m variants, n samples), e.g., AD, GT. The FORMAT field is created from the order and names of \code{geno}. Sample names taken from colnames of \code{geno}. Metadata for \code{geno} is generated from the names of the list:
#' ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#' ##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allele Depth">
#' ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sample Depth">
#' ##FORMAT=<ID=DS,Number=1,Type=Float,Description="Posterior Mean Dosage">
#' ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
#'
#' Any additional metadata should be included without the ## prefix.
#'
#' @param filename VCF file name
#' @param fixed character matrix with 8 columns: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
#' @param geno named list of genotype matrices, see Details
#' @param other.meta optional, other metadata (without ##) besides INFO and FORMAT keys
#' 
#' @export

write_vcf <- function(filename, fixed, geno, other.meta=NULL) {

  stopifnot(colnames(fixed)==c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"))
  fixed[is.na(fixed)] <- "."
  
  nc <- nchar(filename)
  if (substr(filename,nc-2,nc)==".gz") {
    con <- gzfile(filename,open="w")
  } else {
    con <- file(filename,open="w")
  }
  
  writeLines(con=con, text="##fileformat=VCFv4.3")
  
  if (!is.null(other.meta))
    writeLines(con=con, text=paste0("##",other.meta))
  
  contigs <- apply(array(unique(fixed[,"CHROM"])),1,
              function(z){sub("Q",z,"##contig=<ID=Q>")}) 
  writeLines(con=con, text=contigs)
  
  x <- lapply(strsplit(fixed[,"INFO"],split=";"),strsplit,split="=")
  info.keys <- setdiff(unique(unlist(lapply(x,function(z){sapply(z,"[[",1)}),recursive=T)),".")
  
  info.meta <- c("##INFO=<ID=REF,Number=A,Type=Character,Description=\"Array allele (A/B) in reference genome\">",
    "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">",
    "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">",
    "##INFO=<ID=DP.AVG,Number=1,Type=Float,Description=\"Average Sample Depth\">",
    "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
    "##INFO=<ID=AB,Number=1,Type=Float,Description=\"Allelic Bias\">",
    "##INFO=<ID=SE,Number=1,Type=Integer,Description=\"Sequencing Error (PHRED)\">",
    "##INFO=<ID=OD,Number=1,Type=Integer,Description=\"OverDispersion (PHRED)\">",
    "##INFO=<ID=AF.GT,Number=A,Type=Float,Description=\"Allele Frequency based on GT\">",
    "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">",
    "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">",
    "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles\">")
    
  if (length(info.keys) > 0) {
    x2 <- lapply(info.keys,function(x){sub("Q",x,"<ID=Q,")})
    ix <- lapply(x2,grep,x=info.meta,fixed=T)
    iv <- which(sapply(ix,length)==0)
    if (length(iv) > 1)
      stop("Unrecognized keys in INFO")
    writeLines(con=con, text=info.meta[unlist(ix)])
  }
  
  m <- nrow(fixed)
  stopifnot(m==nrow(geno[[1]]))
  id <- colnames(geno[[1]])
  n <- length(id)
  if (any(duplicated(id)))
    stop("Duplicate sample names not allowed")
  
  format.keys <- names(geno)
  nf <- length(format.keys)
  if (nf > 1) {
    FORMAT <- paste(format.keys,collapse=":")
  } else {
    FORMAT <- format.keys
  }
  
  format.meta <- c("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
  "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allele Depth\">",
  "##FORMAT=<ID=AI,Number=R,Type=Integer,Description=\"Normalized Allele Intensity x 100%\">",
  "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Sample Depth\">",
  "##FORMAT=<ID=DS,Number=A,Type=Float,Description=\"Posterior Mean Dosage\">",
  "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">")

  ix <- lapply(as.list(paste0("ID=",format.keys)),grep,x=format.meta,fixed=T)
  iv <- which(sapply(ix,length)==0)
  if (length(iv) > 1)
    stop("Unrecognized FORMAT in geno")
  
  writeLines(con=con, text=format.meta[unlist(ix)])
  
  writeLines(con=con,
             text=paste(c("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",id),
                        collapse="\t"))
  
  for (i in 1:m) {
    tmp2 <- character(n)
    for (j in 1:n) {
      tmp <- geno[[1]][i,j]
      if (nf > 1) {
        for (k in 2:nf) 
          tmp <- c(tmp,geno[[k]][i,j])
        tmp2[j] <- paste(tmp,collapse=":")
      } else {
        tmp2[j] <- tmp
      }
    }
    writeLines(con=con,text=paste(c(as.character(fixed[i,]),FORMAT,tmp2),collapse="\t"))
  }
  
  close(con)
}