R/gl2bpp.r
In dartR.base: Analysing 'SNP' and 'Silicodart' Data - Basic Functions

Documented in gl2bpp

#' @name gl2bpp
#' @title Converts a genlight object into a format suitable for input to the BPP 
#' program
#' @family linker
#' 
#' @description
#' This function generates the sequence alignment file and the Imap file. The 
#' control file should produced by the user. 

#' If method = 1, heterozygous positions are replaced by standard ambiguity 
#' codes.

#' If method = 2, the heterozygous state is resolved by randomly assigning one 
#' or the other SNP variant to the individual.

#' Trimmed sequences for which the SNP has been trimmed out, rarely, by adapter
#'  mis-identity are deleted.

#' This function requires 'TrimmedSequence' to be among the locus metrics
#' (\code{@other$loc.metrics}) and information of the type of alleles (slot
#' loc.all e.g. 'G/A') and the position of the SNP in slot position of the
#'  ```genlight``` object (see testset.gl@position and testset.gl@loc.all for
#'  how to format these slots.)

#' @param x Name of the genlight object containing the SNP data [required].
#' @param method One of 1 | 2, see details [default = 1].
#' @param merge.secondaries Logical, if TRUE, secondary loci are merged into a 
#' single sequence [default = FALSE].
#' @param outfile Name of the saved sequence alignment file ["output_bpp.txt"].
#' @param imap Name of the saved Imap file ["Imap.txt"].
#' @param outpath Path where to save the output file [default global working 
#' directory or if not specified, tempdir()].
#' @param verbose Verbosity: 0, silent or fatal errors; 1, begin and end; 2,
#'  progress log; 3, progress and results summary; 5, full report 
#'  [default 2 or as specified using gl.set.verbosity].
#'  
#' @details
#' It's important to keep in mind that analyses based on coalescent theory, 
#' like those done by the programme BPP, are meant to be used with sequence
#'  data. In this type of data, large chunks of DNA are sequenced, so when we
#'   find polymorphic sites along the sequence, we know they are all on the same
#'    chromosome. This kind of data, in which we know which chromosome each 
#'    allele comes from, is called "phased data." Most data from reduced 
#'    representation genome-sequencing methods, like DArTseq, is unphased, 
#'    which means that we don't know which chromosome each allele comes from. 
#'    So, if we apply coalescence theory to data that is not phased, we will get
#'     biased results. As in Ellegren et al., one way to deal with this is to 
#'     "haplodize" each genotype by randomly choosing one allele from 
#'     heterozygous genotypes (2012) by using method = 2.
#'     
#'     Be mindful that there is little information in the literature on the
#'      validity of this method. 
#'      
#' @author Custodian: Luis Mijangos (Post to
#'  \url{https://groups.google.com/d/forum/dartr})
#'  
#' @examples
#' require(dartR.data)
#' test <- gl.filter.callrate(platypus.gl,threshold = 1)
#' test <- gl.filter.monomorphs(test)
#' test <- gl.subsample.loc(test,n=25)
#' gl2bpp(x = test, outpath=tempdir())
#' 
#' @references
#' \itemize{
#' \item Ellegren, Hans, et al. "The genomic landscape of species divergence in 
#'Ficedula flycatchers." Nature 491.7426 (2012): 756-760.
#' \item Flouri T., Jiao X., Rannala B., Yang Z. (2018) Species Tree Inference 
#'with BPP using Genomic Sequences and the Multispecies Coalescent. Molecular
#' Biology and Evolution, 35(10):2585-2593. doi:10.1093/molbev/msy147
#'}
#'
#' @export
#' @return  returns no value (i.e. NULL)

gl2bpp <- function(x,
                   method = 1,
                   merge.secondaries = FALSE,
                   outfile = "output_bpp.txt", 
                   imap = "Imap.txt",
                   outpath = NULL,
                   verbose = NULL) {
  
  
  # SET VERBOSITY
  verbose <- gl.check.verbosity(verbose)
  
  # SET WORKING DIRECTORY
  outpath <- gl.check.wd(outpath,verbose=0)
  outfilespec <- file.path(outpath, outfile)
  outfilespec_imap <- file.path(outpath, imap)

  # FLAG SCRIPT START
  funname <- match.call()[[1]]
  utils.flag.start(func = funname,
                   build = "v.2023.2",
                   verbose = verbose)
  
  # CHECK DATATYPE
  datatype <- utils.check.datatype(x, accept = "SNP", verbose = verbose)
  
  # FUNCTION SPECIFIC ERROR CHECKING
  
  # CHECK IF PACKAGES ARE INSTALLED
  pkg <- "seqinr"
  if (!(requireNamespace(pkg, quietly = TRUE))) {
    cat(error(
      "Package",
      pkg,
      " needed for this function to work. Please install it.\n"
    ))
    return(-1)
  }
  
  # Check monomorphs have been removed up to date
  if (x@other$loc.metrics.flags$monomorphs == FALSE) {
    if (verbose >= 2) {
      cat(
        warn(
          "  Warning: Dataset contains monomorphic loci which will be included in the output fasta file\n"
        )
      )
    }
  }
  
  if (length(x@other$loc.metrics$TrimmedSequence
             ) != nLoc(x)) {
    stop(
      error(
        "Fatal Error: Data must include Trimmed Sequences for each loci in a column called 'TrimmedSequence' in the @other$loc.metrics slot.\n"
      )
    )
  }
  if (length(x@position) != nLoc(x)) {
    stop(
      error(
        "Fatal Error: Data must include position information for each loci in the @position slot.\n"
      )
    )
  }
  if (length(x@loc.all) != nLoc(x)) {
    stop(error(
      "Fatal Error: Data must include type of alleles in the @loc.all slot.\n"))
  }
  
  if (verbose >= 2) {
    cat(
      report(
        "Assigning ambiguity codes to heterozygote SNPs\n"
      )
    )
  } 
  
  # DO THE JOB
  if (verbose >= 2) {
    cat(report(
      paste(
        "  Removing loci for which SNP position is outside the length of the trimmed sequences\n"
      )
    ))
  }
  x <- gl.filter.overshoot(x, verbose = 0)
  
  #sort loci by snp position in case there are secondaries to be merged
  
  x <- x[,order(x@position)]
  
  

  # METHOD = AMBIGUITY CODES
  
  if (method == 1) {
  
    allnames <- locNames(x)
    snp <- as.character(x@loc.all)
    trimmed <- as.character(x@other$loc.metrics$TrimmedSequence)
    snpmatrix <- as.matrix(x)
    
    # Create a lookup table for the ambiguity codes A T G C A A W R M) T W T K Y G R K G S C M Y S C
    
    conversion <-
      matrix(
        c(
          "A",
          "W",
          "R",
          "M",
          "W",
          "T",
          "K",
          "Y",
          "R",
          "K",
          "G",
          "S",
          "M",
          "Y",
          "S",
          "C"
        ),
        nrow = 4,
        ncol = 4
      )
    colnames(conversion) <- c("A", "T", "G", "C")
    rownames(conversion) <- colnames(conversion)
    
    # Extract alleles 1 and 2
    allelepos <- x@position
    allele1 <- gsub("(.)/(.)", "\\1", snp, perl = T)
    allele2 <- gsub("(.)/(.)", "\\2", snp, perl = T)
    
    # Prepare the output fastA file
    if (verbose >= 2) {
      cat(report("Generating haplotypes ... This may take some time\n"))
    }
    
    sink(outfilespec)
    
    for (j in 1:nLoc(x)) {
      seq <- NA
      for (i in 1:nInd(x)) {
        if (is.na(snpmatrix[i, j])) {
          code <- "N"
        } else {
          if (snpmatrix[i, j] == 0) {
            a1 <- allele1[j]
            a2 <- allele1[j]
          }
          if (snpmatrix[i, j] == 1) {
            a1 <- allele1[j]
            a2 <- allele2[j]
          }
          if (snpmatrix[i, j] == 2) {
            a1 <- allele2[j]
            a2 <- allele2[j]
          }
          code <- conversion[a1, a2]
        }
        snppos <- allelepos[j]
          if (code != "N") {
            seq[i] <-
              paste0(
                substr(
                  as.character(
                    x@other$loc.metrics$TrimmedSequence[j]
                  ),
                  1,
                  snppos
                ),
                code,
                substr(
                  x@other$loc.metrics$TrimmedSequence[j],
                  snppos + 2,
                  500
                )
              )
          } else {
            seq[i] <-
              paste(rep("N", nchar(
                as.character(
                  x@other$loc.metrics$TrimmedSequence[j]
                )
              )), collapse = "")
          }
      }

      cat(paste0(nInd(x)," ",nchar(seq[1])),"\n")
      cat(paste0(locNames(x)[j],"^",indNames(x)," ",seq,"\n"))

    }
    
    # Close the output file
    sink()
  }
    
    # METHOD = RANDOM ASSIGNMENT
    
    if (method == 2 ) {
      # Randomly allocate heterozygotes (1) to homozygote state (0 or 2)
      matrix <- as.matrix(x)
      # cat('Randomly allocating heterozygotes (1) to homozygote state (0 or 2)\n') pb <- txtProgressBar(min=0, max=1, style=3,
      # initial=0, label='Working ....') getTxtProgressBar(pb)
      r <- nrow(matrix)
      c <- ncol(matrix)
      for (i in 1:r) {
        for (j in 1:c) {
          if (matrix[i, j] == 1 && !is.na(matrix[i, j])) {
            # Score it 0 or 2
            matrix[i, j] <- (sample(1:2, 1) - 1) * 2
          }
        }
      }
      
      if (verbose >= 2) {
        cat(report("Generating haplotypes ... This may take some time\n"))
      }
      
      sink(outfilespec)
      
      # For each individual, and for each locus, generate the relevant haplotype

      for (j in 1:c) {
        seq <- NA
        # Reassign some variables
        trimmed <- as.character(x@other$loc.metrics$TrimmedSequence[j])
        snp <- x@loc.all[j]
        snpos <- x@position[j]
        # Shift the index for snppos to start from 1 not zero
        snpos <- snpos + 1
        for (i in 1:r) {
          # If the score is homozygous for the reference allele
            if (matrix[i, j] == 0 && !is.na(matrix[i, j])) {
              seq[i] <- trimmed
            }
   
          # If the score is homozygous for the alternate allele
          if (matrix[i, j] == 2 && !is.na(matrix[i, j])) {
            # Split the trimmed into a beginning sequence, the SNP and an end sequences
            start <- stringr::str_sub(trimmed, end = snpos - 1)
            snpbase <- stringr::str_sub(trimmed,
                               start = (snpos),
                               end = (snpos))
            end <- stringr::str_sub(trimmed, start = snpos + 1)
            # Extract the SNP transition bases (e.g. A and T)
            state1 <- gsub("(.)/(.)", "\\1", snp, perl = TRUE)
            state2 <- gsub("(.)/(.)", "\\2", snp, perl = TRUE)
            # Change the SNP state to the alternate
            if (snpbase == state1) {
              snpbase <- state2
            } else {
              snpbase <- state1
            }
            
              # Paste back to form the alternate fragment
              target <- paste0(start, snpbase, end)
              # Remove adaptors and save the trimmed alternate sequence
              seq[i] <-  stringr::str_sub(target,
                                 start = 1,
                                 end = nchar(trimmed))
          }
          
          # If the SNP state is missing, assign NNNNs
          if (is.na(matrix[i, j])) {
            seq[i] <- "N"
              seq[i] <-
                stringr::str_pad(
                  seq[i],
                  nchar(trimmed),
                  side = c("right"),
                  pad = "N"
                )
          }
        }

        cat(paste0(nInd(x)," ",nchar(seq[1])),"\n")
        cat(paste0(locNames(x)[j],"^",indNames(x)," ",seq,"\n"))
        
      }  
      
      # Close the output fastA file
      sink()
      
    }
  
  # merge secondaries
  if (merge.secondaries) {
    
    
    
    if (verbose >= 2) {
      cat(report(
        paste(
          "  Merging secondary alleles\n"
        )
      ))
    }
    #load bpp file
    con <- file(outfilespec, "r")
    
    bppf <- trimws(readLines(con))
    close(con)
    
    b2 <- bppf
    
    
    a <- strsplit(as.character(x@other$loc.metrics$AlleleID), "\\|")
    cloneid <- unlist(lapply(a, "[",1))
    tc <- table(cloneid)
    sn <- names(tc[tc > 1])

    if (length(sn)>0) {    
    for (xx in 1:length(sn))
    {
    dell <- NULL
    ssl <- NA
    cc <- 1
    
    a <- strsplit(b2, "-")
    bb2 <- unlist(lapply(a, "[",1))
      
    sl <- which(bb2 == sn[xx])
    
    ll <- b2[sl]
    
    #find
    allindslocs <- b2[sl]
    
    #go over all individuals
    indsbpp <- str_extract(ll, "(?<=\\^)[^ ]+")
    pos <- str_extract(ll, "(?<=-)[0-9]+(?=-)")
    sequence <- str_match(ll, "\\^\\S+\\s([A-Z]+)$")[, 2]
    

    for (i in 1:nInd(x)) {
    
      index <- which(indsbpp %in% indNames(x)[i])
      lines <- ll[index]
      snppos <- as.numeric(pos[index])+1
      seq <- substr(sequence[index][1],1,snppos[1])
      
      
      for (ii in 2:length(index))
      {
        seq <- paste0(seq, substr(sequence[index][ii],snppos[ii-1]+1,snppos[ii]))
      }
      #fill in the rest...
      seq <- paste0(seq,substr(sequence[index][ii],snppos[ii-1]+1,nchar(sequence[index][ii])))
      #now combine and erase the lines 

      ssl[cc] <- paste0("sec-", strsplit(bppf[sl[index][1]]," ")[[1]][1]," ", seq)
      cc <- cc+1
    }
    b2[sl[1:nInd(x)]] <- ssl
    
    #delete the others (and the first line)
    dell <- sl[-c(1:nInd(x))]
    dell <- c(min(dell)-1, dell)  #add the first line
    
    b2 <- b2[-dell] 
    
    
    
    }
    
    
    
    con <- file(paste0(outfilespec), "w")
    writeLines(b2, con = con)
    close(con)
   
    
    } #if length sn > 0 
  } # if merge.secondaries
  
  
    
    # Imap file
    sink(outfilespec_imap)
    cat(paste(indNames(x), pop(x),"\n"))
    sink()
  
  # FLAG SCRIPT END
  
  if (verbose >= 1) {
    cat(report("Completed:", funname, "\n"))
  }
  
  return(NULL)
  
}