pseudoRef: Substitute Reference Genome with Sample Specific SNP variants

Documented in pseudoRef

#' \code{pseudoRef}
#' Make a pseudo reference genome.
#'
#' @param fa Path for the reference fasta file. [string or DNAStringSet/DNAString object]
#' @param snpdt A data.table object with heterozygote SNPs coded with IUPAC ambiguity codes.
#'              [data.table, 4 required columns: chr, pos, ref, alt, (sample1, ..., sampleN)]
#' @param sidx A vector to indicate the sample columns. [vector, default=5:ncol(snpdt)].
#' @param arules Additional nucleotide substitution rules defined by users.
#'               [data.frame, 2 required columns: from, to, default=NULL]
#'               For example, arules <- data.frame(from=c("M", "Y", "R", "K"), to=c("C", "C", "G", "T")).
#' @param outdir Output directory. Sample specific sub-folders will be created. [string]
#'
#' @return A list of summary statistics of subsituted nucleotides. [list].
#'
#' @examples
#' # First of all, use BCFtools to convert VCF into IUPAC coded data.table:
#'
#' # bcftools view JRI20_filtered_snps_annot.bcf.gz -m2 -M2 -v snps -Oz -o JRI20_bi_snps_annot.vcf.gz
#' # bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%IUPACGT]\n' JRI20_bi_snps_annot.vcf.gz > JRI20_bi_snps_annot.txt
#' # bcftools query -f 'chr\tpos\tref\talt[\t%SAMPLE]\n' JRI20_bi_snps_annot.vcf.gz > JRI20_bi_snps_annot.header
#'
#' arules <- data.frame(from=c("M", "Y", "R", "K"), to=c("C", "C", "G", "T"))
#' res <- pseudoRef(fa, snpdt, sidx=5:24, arules, outdir)
#'
#' @export
pseudoRef <- function(fa, snpdt, sidx=5:ncol(snpdt), arules=NULL, outdir){

  ### load reference genome fasta file into DNAStringSet
  library("data.table")
  library("Biostrings")
  if(class(fa) == "character"){fa <- readDNAStringSet(filepath = fa, format="fasta")}
  if(class(fa) != "DNAStringSet" & class(fa) != "DNAString"){stop("fa should be a DNAStringSet or DNAString")}
  if(!is.null(arules) & sum(names(arules) %in% c("from", "to")) !=2)
  {stop("arules should be a data.frame with at least two columns of from and to")}
  if(sum(names(snpdt) %in% c("chr", "pos", "ref", "alt")) !=4)
  {stop("snpdt should be a data.table with at least four columns of [chr, pos, ref, alt, (sample1,..., sampleN)]")}
  tab0 <- alphabetFrequency(fa)
  wd0 <- width(fa)
  nm0 <- names(fa)

  ### chr id, get only the first element of the blank seperated vector.
  chrid <-  gsub(" .*", "", names(fa))
  res <- list()
  ### substitute one sample at a time
  for(s in sidx){
    myfa <- fa
    #tab0 <- table(snpdt[, s, with=FALSE])
    sub <- snpdt[, c(1:4, s), with=FALSE]
    sub <- as.data.frame(sub)
    sid <- names(snpdt)[s]
    names(sub)[5] <- "SAMPLE"
    ## remove missing sites and sites that are the same as ref
    #sub <- as.data.table(sub)

    #sub <- sub[, SAMPLE != "./." & ref != SAMPLE]
    sub <- subset(sub, SAMPLE != "./." & ref != SAMPLE)

    ### replace for each chrom
    for(i in 1:length(chrid)){
      subchr <- subset(sub, chr == chrid[i])
      if(nrow(subchr) > 0){

        idx <- which(chrid[i] == chrid)
        ## replaced base-pairs according to arules
        if(!is.null(arules)){
          arules$from <- as.character(arules$from)
          arules$to <- as.character(arules$to)
          for(k in 1:nrow(arules)){
            # update the SAMPLE column, might have a problem
            subchr[subchr[,5] == arules$from[k], 5] <- arules$to[k]
          }
        }
        myfa[[idx]] <- replaceLetterAt(x=myfa[[idx]], at=subchr[, "pos"], letter=subchr[, "SAMPLE"],
                                       if.not.extending="replace", verbose=FALSE)
      }
    }

    ### write for each sample
    tab1 <- alphabetFrequency(myfa)
    wd1 <- width(myfa)
    nm1 <- names(myfa)
    if(sum(wd0 - wd1) != 0 & sum(nm0 != nm1) != 0)
      {stop(sprintf("!!! changed chr length or chr id for sample [ %s ]", sid))}

    #dir.create(paste0(outdir, "/", sid), showWarnings = FALSE)
    message(sprintf("###>>> printing fasta file for sample: [ %s ] ...", sid))
    outid <- paste0(outdir, "/", sid, ".fasta")
    writeXStringSet(myfa, filepath=outid, format="fasta", append=FALSE)

    ###output reprot
    res[[sid]] <- tab1 - tab0
  }
  return(res)
}