R/data_import.R

Defines functions readDArTag readProcessIsoloci readProcessSamMulti readTASSELGBSv2 readStacks VCF2RADdata MakeTasselVcfFilter consolidateSNPs readTagDigger readHMC

Documented in MakeTasselVcfFilter readDArTag readHMC readProcessIsoloci readProcessSamMulti readStacks readTagDigger readTASSELGBSv2 VCF2RADdata

# functions for import of data into RADdata object

# Function to import read counts from UNEAK output (HapMap.hmc.txt).
# includeloci should be a character vector of loci names to keep in output.
# If shortIndNames is TRUE, just keep the part of the individual name before
# the first underscore.
readHMC <- function(file, includeLoci=NULL, shortIndNames=TRUE,
                    possiblePloidies = list(2), taxaPloidy = 2L,
                    contamRate = 0.001,
                    fastafile = sub("hmc.txt", "fas.txt", file, fixed = TRUE)){
  # read in file; not using read.table, to preserve indiv. names
  filelines <- strsplit(readLines(file), split="\t")
  # extract names of individuals
  indstrings <- filelines[[1]][2 :
                                 (match("HetCount_allele1", filelines[[1]]) - 1)]
  if(shortIndNames){
    indstrings <- sapply(strsplit(indstrings, split="_", fixed=TRUE),
                         function(x) x[1])
  }
  # number of individuals
  nInd <- length(indstrings)
  # number of loci
  nLoc <- ifelse(is.null(includeLoci), length(filelines) - 1,
                 length(includeLoci))
  # set up matrix to contain results
  alDepth <- matrix(as.integer(0), nrow = nInd, ncol = 2*nLoc,
                    dimnames = list(indstrings, NULL))
  # loop to fill in data
  locnames <- character(nLoc) # names of loci
  if(is.null(includeLoci)){  # if we are using all loci in the file
    for(i in 1:nLoc){
      locnames[i] <- filelines[[i+1]][1]
      thesecounts <- strsplit(filelines[[i+1]][2:(nInd+1)],
                              split="|", fixed=TRUE)
      alDepth[,2*i - 1] <- sapply(thesecounts, function(x) as.integer(x[1]))
      alDepth[,2*i] <- sapply(thesecounts, function(x) as.integer(x[2]))
    }
  } else { # if we are just using a subset of loci
    locnames <- includeLoci
    for(i in 2:length(filelines)){
      # get locus name for this line of the file and see if it is in
      # the list of loci that we are keeping. If so, what position?
      Lnum <- fastmatch::fmatch(filelines[[i]][1], locnames)
      # go to next line of file if we don't want this locus
      if(is.na(Lnum)) next
      thesecounts <- strsplit(filelines[[i]][2:(nInd+1)],
                              split="|", fixed=TRUE)
      alDepth[,Lnum*2 - 1] <- sapply(thesecounts, function(x) as.integer(x[1]))
      alDepth[,Lnum*2] <- sapply(thesecounts, function(x) as.integer(x[2]))
    }
  }
  dimnames(alDepth)[[2]] <- paste(rep(locnames, each = 2), c(0,1), sep = "_")
  
  # get nucleotides for the alleles
  fastalines <- readLines(fastafile)
  fastaseq <- fastalines[seq(2,length(fastalines), by = 2)] # just sequences
  fastaloc <- sapply(strsplit(fastalines[seq(1,length(fastalines)-1, by = 2)],
                              split = "_"), function(x) substring(x[1], 2, nchar(x[1])))
  alleleNucleotides <- character(nLoc * 2)
  for(i in 1:nLoc){
    rowid <- fastmatch::fmatch(locnames[i], fastaloc)
    theseseq <- strsplit(fastaseq[c(rowid, rowid + 1)], split = "")
    varpos <- theseseq[[1]] != theseseq[[2]]
    alleleNucleotides[2*i - 1] <- theseseq[[1]][varpos]
    alleleNucleotides[2*i] <- theseseq[[2]][varpos]
  }
  
  return(RADdata(alleleDepth = alDepth, alleles2loc = rep(1:nLoc, each = 2),
                 locTable = data.frame(row.names = locnames), 
                 possiblePloidies = possiblePloidies,
                 contamRate = contamRate,
                 alleleNucleotides = alleleNucleotides,
                 taxaPloidy = taxaPloidy))
}

# function to import read counts from TagDigger
readTagDigger <- function(countfile, includeLoci = NULL, 
                          possiblePloidies = list(2), taxaPloidy = 2L,
                          contamRate = 0.001, 
                          dbfile = NULL, dbColumnsToKeep = NULL,
                          dbChrCol = "Chr", dbPosCol = "Pos",
                          dbNameCol = "Marker name"){
  # read marker database, if applicable
  if(!is.null(dbfile)){
    mydb <- read.csv(dbfile, header = TRUE, stringsAsFactors = FALSE,
                     row.names = make.names(dbNameCol))
    if(!is.null(dbColumnsToKeep)){ # only retain subset of markers
      mydb <- mydb[, make.names(dbColumnsToKeep)]
    }
    if(!all(make.names(c(dbChrCol, dbPosCol)) %in% names(mydb))){
      stop(paste(dbChrCol, "and", dbPosCol, "not found in column names of", 
                 dbfile))
    }
    names(mydb)[match(make.names(dbChrCol), names(mydb))] <- "Chr"
    names(mydb)[match(make.names(dbPosCol), names(mydb))] <- "Pos"
    # subset by loci
    if(!is.null(includeLoci)){
      mydb <- mydb[row.names(mydb) %fin% includeLoci,]
    }
    if(dim(mydb)[1] == 0) stop("includeLoci and mydb don't match")
  }
  # read the counts
#  mycounts <- as.matrix(read.csv(countfile, row.names = 1, header = TRUE))
  # use scan; more code but a lot less processing time
  mycon <- file(countfile, open = 'r')
  myheader <- scan(mycon, what = "", nlines = 1, sep = ",")
  alleles <- myheader[-1]
  nalleles <- length(alleles)
  whatlist <- list(0L)[rep(1, nalleles)]
  names(whatlist) <- alleles
  whatlist <- c(list(Taxa = ""), whatlist)
  mydata <- scan(mycon, what = whatlist, sep = ",")
  close(mycon)
  taxa <- mydata[[1]]
  mydata <- mydata[-1]
  mycounts <- matrix(unlist(mydata), nrow = length(taxa), ncol = nalleles,
                  dimnames = list(taxa, alleles))
  
  # extract marker names 
  mrkrNamesByAl <- sapply(strsplit(alleles, split = "_"),
                          function(x) x[1])
  # subset by loci
  if(!is.null(includeLoci)){
    alToKeep <- mrkrNamesByAl %fin% includeLoci
    if(sum(alToKeep) == 0) stop("countsfile and includeLoci don't match")
    mrkrNamesByAl <- mrkrNamesByAl[alToKeep]
    mycounts <- mycounts[, alToKeep]
  }
  # make locTable if still necessary
  if(is.null(dbfile)){
    mydb <- data.frame(row.names = unique(mrkrNamesByAl))
  }
  # get locTable indices for counts matrix columns
  alleles2loc <- fastmatch::fmatch(mrkrNamesByAl, row.names(mydb))
  # error if there isn't a match in marker names between the two files
  if(any(is.na(alleles2loc))){
    cat(paste("Some markers in", file, "not found in", dbfile), sep = "\n")
    sadMarkers <- unique(mrkrNamesByAl[is.na(alleles2loc)])
    maxMrkrToPrint <- 10
    cat(sadMarkers[1:max(c(maxMrkrToPrint, length(sadMarkers)))], sep = "\n")
    if(length(sadMarkers) > maxMrkrToPrint){
      cat("...", sep = "\n")
    }
    stop(paste("Some markers in", file, "not found in", dbfile))
  }
  
  # get variable nucleotides for tags
  myNT <- sapply(strsplit(dimnames(mycounts)[[2]], split = "_"),
                 function(x) x[2])
  
  
  # make RADdata object
  return(RADdata(mycounts, alleles2loc, mydb, possiblePloidies, contamRate,
                 myNT, taxaPloidy))
}

# Function to consolidate loci imported from VCF back into tags.
# Essentially do phasing on the SNPs in the VCF to make haplotypes, using read depth.
# Assume locTable is already sorted by chromosome and position.
# A reference genome can be provided as FASTA or compressed, indexed FASTA
# from Bioconductor in order to get nucleotides in between variable sites.
# refgenome should either be an FaFile object or a path to a FASTA file for the genome
# tol indicates how dissimilar read depth can be and still have two alleles
# combined into a tag.
consolidateSNPs <- function(alleleDepth, alleles2loc, locTable, alleleNucleotides,
                            tagsize = 80, refgenome = NULL, tol = 0.01){
  if(!all(c("Chr", "Pos") %in% names(locTable))){
    stop("locTable needs Chr for chromosome and Pos for position")
  }
  
  # Set up reference genome if provided.  Make FaFile object.
  if(!is.null(refgenome) && !is(refgenome, "FaFile")){
    if(!is.character(refgenome)){
      stop("refgenome must be character string or FaFile object.")
    }
    if(!file.exists(refgenome)){
      stop(paste("File", refgenome, "not found."))
    }
    if(!file.exists(sprintf("%s.fai", refgenome))){
      # index the fasta file if necessary
      message("Creating FASTA file index...")
      Rsamtools::indexFa(refgenome)
    }
    refgenome <- Rsamtools::FaFile(refgenome)
  }
  # get chromosome names in FASTA
  if(!is.null(refgenome)){
    FAchrnames <- GenomeInfoDb::seqnames(GenomeInfoDb::seqinfo(refgenome))
  }
  
  nLoc <- dim(locTable)[1] # number of loci to start
  # rows in locTable for each chromosome
  rowsByChr <- tapply(1:nLoc, locTable$Chr, function(x) x) 
  nAl <- dim(alleleDepth)[2] # number of alleles to start
  nInd <- dim(alleleDepth)[1] # number of individuals
  
  # preallocate objects to output
  alleleDepthOut <- matrix(0L, nrow = nInd, ncol = nAl,
                           dimnames = list(dimnames(alleleDepth)[[1]],
                                           as.character(1:nAl)))
  alleles2locOut <- integer(nAl)
  alleleNucleotidesOut <- character(nAl)
  locTableOut <- data.frame(row.names = as.character(1:nLoc),
                            Chr = character(nLoc),
                            Pos = integer(nLoc),
                            Ref = character(nLoc),
                            stringsAsFactors = FALSE)
  
  # variables to keep track of which allele and locus we are on
  currAlOut <- 1L
  currLocOut <- 1L
  
  ### Internal functions for consolidateSNP ####
  
  # function for finding allele matches by looking for individuals with only one allele.
  # only searches for homozygotes in marker 1; run this function in both directions.
  findMatchesHomoz <- function(depth1, depth2){
    # which individuals just have one allele for marker 1, 
    # and equal depth for both markers.
    homoz1 <- which(rowSums(depth1 > 0) == 1 & rowSums(depth1) == rowSums(depth2))
    match1 <- integer(0) # to hold allele indices for marker 1
    match2 <- integer(0) # to hold allele indices for marker 2
    for(alCol in 1:dim(depth1)[2]){
      # homozygotes for this particular allele in marker 1
      homoz1Al <- homoz1[depth1[homoz1,alCol] > 0]
      # alleles in marker 2 that have reads for those homozygotes
      thisAlMatch <- which(colSums(depth2[homoz1Al,,drop=FALSE]) > 0)
      # add them to match output
      nNewMatches <- length(thisAlMatch)
      match1 <- c(match1, rep(alCol, nNewMatches))
      match2 <- c(match2, thisAlMatch)
    }
    return(matrix(c(match1, match2), nrow = length(match1), ncol = 2))
  }
  
  # function to identify remaining matches not found using homozygotes
  findRemainingMatches <- function(depth1, depth2, alMatch){
    # find individuals with more than one allele for both markers, 
    # and equal depth for both markers.
    notHomoz <- which(rowSums(depth1 > 0) > 1 & rowSums(depth2 > 0) > 1 &
                        rowSums(depth1) == rowSums(depth2))
    # try consolidating read depth with existing new markers
    cons <- consolidateDepth(depth1[notHomoz,, drop = FALSE], 
                             depth2[notHomoz,, drop = FALSE], alMatch)
    # find individuals that weren't fully matched
    nm <- rowSums(cons) < rowSums(depth1[notHomoz,, drop = FALSE])
    nmInd <- notHomoz[nm]
    cons <- cons[nm,, drop = FALSE]
    # add new alleles until we can match everything
    progress <- TRUE
    while(length(nmInd) > 0 && progress){
      progress <- FALSE
      # Temporary copies of depth matrices that we can subtract from
      depth1T <- depth1[nmInd,, drop = FALSE]
      depth2T <- depth2[nmInd,, drop = FALSE]
      # loop through non-matched individuals
      for(i in 1:length(nmInd)){
        # loop through known new alleles for this ind.
        for(newAl in which(cons[i,] > 0)){
          # corresponding alleles in old markers
          al1 <- alMatch[newAl, 1]
          al2 <- alMatch[newAl, 2]
          # remove reads for this allele
          depth1T[i, al1] <- depth1T[i, al1] - cons[i, newAl]
          depth2T[i, al2] <- depth2T[i, al2] - cons[i, newAl]
        }
      }
      
      # look for matches with what's left
      newAlMatch <- unique(rbind(findMatchesHomoz(depth1T, depth2T),
                                 findMatchesHomoz(depth2T, depth1T)[,2:1]))
      # add these new matches to the list of new alleles
      if(dim(unique(rbind(alMatch, newAlMatch)))[1] > dim(alMatch)[1]){
        progress <- TRUE
        alMatch <- unique(rbind(alMatch, newAlMatch))
        
      } else { # if that didn't work, try looking for alleles with equal depth
        
        # get rid of rows where multiple alleles have same depth
        keepInd <- apply(depth1T, 1, function(x) length(unique(x)) == length(x)) &
          apply(depth2T, 1, function(x) length(unique(x)) == length(x))
        depth1T <- depth1T[keepInd,, drop = FALSE]
        depth2T <- depth2T[keepInd,, drop = FALSE]
        # alleles to potentially match
        als1 <- which(colSums(depth1T) > 0)
        als2 <- which(colSums(depth2T) > 0)
        for(al1 in als1){
          for(al2 in als2){
            # individuals with equal read depth for these two alleles
            if(sum(depth1T[, al1] == depth2T[, al2] & depth1T[, al1] > 0) > 0){
              newAlMatch <- c(al1, al2)
              if(dim(unique(rbind(alMatch, newAlMatch)))[1] > dim(alMatch)[1]){
                alMatch <- rbind(alMatch, newAlMatch)
                progress <- TRUE
              }
            }
          }
        }
        
      }
      
      if(progress){
        # retry consolidation
        cons <- consolidateDepth(depth1[nmInd,, drop = FALSE], 
                                 depth2[nmInd,, drop = FALSE], alMatch)
        nm <- rowSums(cons) < rowSums(depth1[nmInd,, drop = FALSE])
        nmInd <- nmInd[nm]
        cons <- cons[nm,, drop = FALSE]
      }
    }

    return(alMatch)
  }
  
  # Function to consolidate read depth across two markers being merged into one.
  # depth1 and depth2 are read depth matrices for the two markers, respectively,
  # with individuals in rows and alleles in columns.
  # alMatch is a matrix with two columns, with one row for each new allele.  The
  # values indicate the corresponding alleles for the first and second markers.
  consolidateDepth <- function(depth1, depth2, alMatch){
    # depth to output for the new marker
    newdepth <- matrix(0L, nrow = dim(depth1)[1],
                       ncol = dim(alMatch)[1])
    # boolean to figure out if an allele has been processed yet
    allelesDone <- rep(FALSE, dim(alMatch)[1])
    # whether progress has been made
    progress <- TRUE
    
    while(!all(allelesDone) && progress){
      progress <- FALSE
      # loop through alleles for the first marker
      for(al1 in 1:dim(depth1)[2]){
        # which new alleles match this original allele
        newAl <- which(alMatch[,1] == al1 & !allelesDone)
        # skip for now if this allele doesn't correspond to exactly one new, unprocessed allele
        if(length(newAl) != 1) next
        # find individuals with this allele
        indWithAl <- which(depth1[,al1] > 0)
        # add depth to output matrix
        newdepth[indWithAl, newAl] <- depth1[indWithAl, al1]
        # remove these counts from original matrices
        al2 <- alMatch[newAl, 2]
        depth2[indWithAl, al2] <- depth2[indWithAl, al2] - depth1[indWithAl, al1]
        depth1[indWithAl, al1] <- 0L
        # adjust new depth if depth2 was lower than depth1
        d2neg <- depth2[indWithAl, al2] < 0
        newdepth[indWithAl[d2neg], newAl] <- newdepth[indWithAl[d2neg], newAl] +
          depth2[indWithAl[d2neg], al2]
        depth2[indWithAl[d2neg], al2] <- 0L
        
        # mark allele as done
        allelesDone[newAl] <- TRUE
        progress <- TRUE
      }
      # do the same for the second marker
      for(al2 in 1:dim(depth2)[2]){
        newAl <- which(alMatch[,2] == al2 & !allelesDone)
        if(length(newAl) != 1) next
        indWithAl <- which(depth2[,al2] > 0)
        newdepth[indWithAl, newAl] <- depth2[indWithAl, al2]
        al1 <- alMatch[newAl, 1]
        depth1[indWithAl, al1] <- depth1[indWithAl, al1] - depth2[indWithAl, al2]
        depth2[indWithAl, al2] <- 0L
        d1neg <- depth1[indWithAl, al1] < 0
        newdepth[indWithAl[d1neg], newAl] <- newdepth[indWithAl[d1neg], newAl] +
          depth1[indWithAl[d1neg], al1]
        depth1[indWithAl[d1neg], al1] <- 0L
        allelesDone[newAl] <- TRUE
        progress <- TRUE
      }
    }
    
    # Find any individuals with reads remaining to assign.
    # This should only come up if there is homoplasy or recombination within 
    # tags, e.g. alleles AC, AD, BC, and BD all exist.
    indRemaining <- which(rowSums(depth1) > 0 & rowSums(depth2) > 0)
    alRemaining <- which(!allelesDone) # alleles that still need to be assigned
    for(ind in indRemaining){
      als1 <- which(depth1[ind,] > 0) # alleles for marker 1
      als2 <- which(depth2[ind,] > 0) # alleles for marker 2
      # new alleles that are possible matches
      possAl <- alRemaining[alMatch[alRemaining, 1] %in% als1 & 
                              alMatch[alRemaining, 2] %in% als2]
      
      progress <- TRUE
      while(any(depth1[ind,] > 0) && any(depth2[ind,] > 0) && progress){
        progress <- FALSE
        for(al1 in als1){
          if(depth1[ind, al1] <= 0) next
          newAl <- possAl[alMatch[possAl, 1] == al1] # number for new allele(s)
          if(length(newAl) == 1){
            # Resolve cases where only one allele is possible based on 
            # presence/absence.
            al2 <- alMatch[newAl, 2]
          } else {
            # Try to find a match based on depth
            al2 <- which(depth2[ind,] == depth1[ind, al1])
            if(length(al2) != 1) next
            newAl <- newAl[alMatch[newAl, 2] == al2]
            if(length(newAl) != 1) next
          }
          newCount <- min(c(depth1[ind, al1], 
                            depth2[ind, al2]))
          newdepth[ind, newAl] <- newCount
          depth1[ind, al1] <- depth1[ind, al1] - newCount
          depth2[ind, al2] <- depth2[ind, al2] - newCount
          possAl <- possAl[possAl != newAl]
          progress <- TRUE
        }
        for(al2 in als2){ # same procedure with the second marker
          if(depth2[ind, al2] <= 0) next
          newAl <- possAl[alMatch[possAl, 2] == al2]
          if(length(newAl) == 1){
            al1 <- alMatch[newAl, 1]
          } else {
            al1 <- which(depth1[ind,] == depth2[ind, al2])
            if(length(al1) != 1) next
            newAl <- newAl[alMatch[newAl, 1] == al1]
            if(length(newAl) != 1) next
          }
          newCount <- min(c(depth1[ind, al1], 
                            depth2[ind, al2]))
          newdepth[ind, newAl] <- newCount
          depth1[ind, al1] <- depth1[ind, al1] - newCount
          depth2[ind, al2] <- depth2[ind, al2] - newCount
          possAl <- possAl[possAl != newAl]
          progress <- TRUE
        }
      } # end of while loop for unresolved reads
    } # end of loop through unresolved individuals
    return(newdepth)
  } # end of consolidateDepth internal function
  
  ### End internal functions for consolidateSNP ####
  
  # loop through chromosomes
  for(chrset in rowsByChr){
    thisChrom <- locTable$Chr[chrset[1]] # current chromosome name
    if(!is.null(refgenome)){ # matching chromosome name in ref. genome
      if(thisChrom %in% FAchrnames){
        thisFAchr <- thisChrom
      } else {
        thisFAchr <- grep(paste("(Chr|chr|CHR)(omosome|OMOSOME)?", thisChrom, sep = ""),
                          FAchrnames, value = TRUE)
      }
      if(length(thisFAchr) == 0){
        warning(paste("Couldn't match", thisChrom, "to reference genome."))
      }
      if(length(thisFAchr) > 1){
        stop(paste(thisChrom, "matches multiple sequence names in reference genome."))
      }
    }
    message(paste("Phasing", length(chrset), "SNPs on chromosome", thisChrom))
    if(!identical(locTable$Pos[chrset], sort(locTable$Pos[chrset]))){
      stop(paste(thisChrom, ": Loci must be sorted by position.", sep = ""))
    }
    currLocIn <- 1 # current locus number index WITHIN chrset (this chromosome)
    # data for this locus
    lastDepth <- alleleDepth[, alleles2loc == chrset[1], drop = FALSE]
    lastName <- row.names(locTable)[chrset[1]]
    lastPos <- locTable$Pos[chrset[1]]
    lastSeq <- alleleNucleotides[alleles2loc == chrset[1]]
    lastRef <- locTable$Ref[chrset[1]]
    
    # loop through loci on this chromosome
    for(currLocIn in 2:(length(chrset)+1)){
      if(currLocIn <= length(chrset)){
        # data for next locus
        thisDepth <- alleleDepth[, alleles2loc == chrset[currLocIn], drop = FALSE]
        thisName <- row.names(locTable)[chrset[currLocIn]]
        thisPos <- locTable$Pos[chrset[currLocIn]]
        thisSeq <- alleleNucleotides[alleles2loc == chrset[currLocIn]]
        thisRef <- locTable$Ref[chrset[currLocIn]]
        
        # get proportion difference in depth between these two loci
        diff <- sum(abs(rowSums(thisDepth) - rowSums(lastDepth))) / 
          ((sum(thisDepth) + sum(lastDepth))/2)
      }
      
      if(length(chrset) == 1 || diff > tol || thisPos - lastPos + 1 > tagsize 
         || currLocIn > length(chrset)){
        ## If these are different loci (either due to counts or distance)
        ## put the "last" data into the output, and make the new data the last.
        
        thisNAl <- dim(lastDepth)[2] # number of alleles for locus to output
        thisAlOut <- (1:thisNAl) + currAlOut - 1L # indices for all alleles to output
        
        # add data to output objects
        alleleDepthOut[,thisAlOut] <- lastDepth
        dimnames(alleleDepthOut)[[2]][thisAlOut] <- dimnames(lastDepth)[[2]]
        alleles2locOut[thisAlOut] <- currLocOut
        alleleNucleotidesOut[thisAlOut] <- lastSeq
        row.names(locTableOut)[currLocOut] <- lastName
        locTableOut$Chr[currLocOut] <- thisChrom
        locTableOut$Pos[currLocOut] <- lastPos
        locTableOut$Ref[currLocOut] <- lastRef
        
        if(length(chrset) > 1){
          # shift "this" locus to "last" locus
          lastDepth <- thisDepth
          lastName <- thisName
          lastPos <- thisPos
          lastSeq <- thisSeq
          lastRef <- thisRef
        }
        
        # increment current allele and locus
        currAlOut <- currAlOut + thisNAl
        currLocOut <- currLocOut + 1L
      } else {
        ## If these are the same locus, merge them.
        
        # matrix to indicate how alleles match up
        alMatch <- matrix(NA_integer_, nrow = 0, ncol = 2,
                          dimnames = list(NULL, c("last", "this")))
        # find individuals in "last" matrix that only have one allele
        alMatch <- rbind(alMatch, findMatchesHomoz(lastDepth, thisDepth))
        # find individuals in "this" matrix that only have one allele
        alMatch <- rbind(alMatch, findMatchesHomoz(thisDepth, lastDepth)[,2:1])
        # reduce to unique set of matches
        alMatch <- unique(alMatch, MARGIN = 1)
        # match any remaining alleles that don't have "homozygotes"
        alMatch <- findRemainingMatches(lastDepth, thisDepth, alMatch)
        
        # if matching didn't work, skip adding this SNP
        if(dim(alMatch)[1] < dim(lastDepth)[2]) next
        
        # make new set of haplotype sequences
        startPosFromReference <- lastPos + SummarizedExperiment::width(lastSeq[1])
        endPosFromReference <- thisPos - 1
        if(endPosFromReference >= startPosFromReference &&
           !is.null(refgenome) && length(thisFAchr) == 1){
          # retrieve reference sequence and past onto end of lastSeq
          nonvarSeq <- Biostrings::getSeq(refgenome,
                              GenomicRanges::GRanges(seqnames = thisFAchr,
                                IRanges::IRanges(startPosFromReference,
                                                 endPosFromReference),
                                        strand = "+"))[[1]]
          lastSeq <- paste(lastSeq, nonvarSeq, sep = "")
          lastRef <- paste(lastRef, nonvarSeq, sep = "")
        }
        lastSeq <- paste(lastSeq[alMatch[,1]], thisSeq[alMatch[,2]], sep = "")
        lastRef <- paste(lastRef, thisRef, sep = "")
        
        # make new depth matrix
#        print(c(lastName, thisName)) # debug
        lastDepth <- consolidateDepth(lastDepth, thisDepth, alMatch)
        dimnames(lastDepth)[[2]] <- paste(lastName, lastSeq, sep = "_")
      }
    } # end of loop through loci
  } # end of loop through chromosomes
  
  # trim output to remove columns not used.
  alleleDepthOut <- alleleDepthOut[, 1:(currAlOut - 1)]
  alleles2locOut <- alleles2locOut[1:(currAlOut - 1)]
  alleleNucleotidesOut <- alleleNucleotidesOut[1:(currAlOut - 1)]
  locTableOut <- locTableOut[1:(currLocOut - 1),]
  
  return(list(alleleDepth = alleleDepthOut, alleles2loc = alleles2locOut,
              alleleNucleotides = alleleNucleotidesOut,
              locTable = locTableOut))
}

# Function to make a function for filtering a VCF file.
# Assumes TASSEL GBSv2 format. (Diploid GT is listed first for each genotype.)
# The output function is used in the prefilters argument for 
# VariantAnnotation::filterVcf.
MakeTasselVcfFilter <- function(min.ind.with.reads = 200,
                                min.ind.with.minor.allele = 10){
  function(lines){
    # vector to indicate the number of individuals with reads for each line
    ind.with.reads <- sapply(gregexpr("[[:blank:]][[:digit:]]/[[:digit:]]:",
                                      lines), 
                             function(x){
                               if(x[1] == -1){
                                 0
                               } else {
                                 length(x)
                               }
                             })
    
    # set up vector to indicate whether each line should be kept
    result <- ind.with.reads >= min.ind.with.reads
    # check on ind with minor allele
    if(min.ind.with.minor.allele > 0){
      ind.with.ref <- sapply(gregexpr("[[:blank:]](0/[[:digit:]]|[[:digit:]]/0):",
                                      lines[result]), 
                             function(x){
                               if(x[1] == -1){
                                 0
                               } else {
                                 length(x)
                               }
                             })
      ind.with.alt <- sapply(gregexpr("[[:blank:]]([123]/[[:digit:]]|[[:digit:]]/[123]):",
                                      lines[result]), 
                             function(x){
                               if(x[1] == -1){
                                 0
                               } else {
                                 length(x)
                               }
                             })
      result[result] <- ind.with.ref >= min.ind.with.minor.allele &
        ind.with.alt >= min.ind.with.minor.allele
    }
    return(result)
  }
}

# Function to import data from a VCF file.
# Reads in VCF using BioConductor, then phases SNPs into haplotypes using 
# consolidateSNPs.
# file can be a character string or a TabixFile.
# samples is a character vector of the names of samples to retain.
# svparam can generally be left as-is unless you have additional columns to 
# import to locTable (from fixed or info) or if you have specific genomic
# ranges to import with "which".
# expectedAlleles and expectedLoci are for after SNP phasing and
# consolidation to haplotypes.
VCF2RADdata <- function(file, phaseSNPs = TRUE, tagsize = 80, refgenome = NULL, 
                        tol = 0.01, al.depth.field = "AD", 
                        min.ind.with.reads = 200, 
                        min.ind.with.minor.allele = 10,
                        possiblePloidies = list(2), taxaPloidy = 2L,
                        contamRate = 0.001,
                        samples = VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(file)),
                        svparam = VariantAnnotation::ScanVcfParam(fixed = "ALT",
                                                                  info = NA, 
                                                                  geno = al.depth.field,
                                                                  samples = samples),
                        yieldSize = 5000, expectedAlleles = 5e5, 
                        expectedLoci = 1e5, maxLoci = NA){
  
  # clean up parameters for import
  if(is.na(VariantAnnotation::vcfFixed(svparam))){
    VariantAnnotation::vcfFixed(svparam) <- "ALT"
  }
  if(length(VariantAnnotation::vcfFixed(svparam)) > 0 &&
     !"ALT" %in% VariantAnnotation::vcfFixed(svparam)){
    VariantAnnotation::vcfFixed(svparam) <- 
      c(VariantAnnotation::vcfFixed(svparam), "ALT")
  }
  if(length(VariantAnnotation::vcfGeno(svparam)) > 0 && 
     is.na(VariantAnnotation::vcfGeno(svparam)[1])){
    stop("geno field must be provided in svparam.")
  }
  if(!identical(VariantAnnotation::vcfGeno(svparam), "AD")){
    warning("Allele depth field not set to AD.")
  }
  if(!is.na(VariantAnnotation::vcfSamples(svparam)[1])){
    samples <- VariantAnnotation::vcfSamples(svparam)
  }
  if(!all(samples %in% VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(file)))){
    stop("Not all samples given in argument found in file.")
  }
  if(length(samples) < min.ind.with.reads){
    stop("Need to adjust min.ind.with.reads for number of samples in dataset.")
  }
  if(length(samples)/2 < min.ind.with.minor.allele){
    stop("Need to adjust min.ind.with.minor.allele for number of samples in dataset.")
  }
  
  # determine what extra columns to add to locTable
  extracols <- c(VariantAnnotation::vcfFixed(svparam),
                 VariantAnnotation::vcfInfo(svparam))
  hdr <- VariantAnnotation::scanVcfHeader(file)
  if(length(VariantAnnotation::vcfFixed(svparam)) == 0){
    extracols <- c("QUAL", "FILTER", extracols)
  }
  if(length(VariantAnnotation::vcfInfo(svparam)) == 0){
    extracols <- c(extracols, row.names(VariantAnnotation::info(hdr)))
  }
  extracols <- extracols[!extracols %in% c("CHROM", "POS", "ID", "REF", "ALT")]
  extracols <- extracols[!is.na(extracols)]
  # preallocate objects for constructing RADdata object
  alleleDepth <- matrix(0L, nrow = length(samples), ncol = expectedAlleles,
                        dimnames = list(samples, 1:expectedAlleles))
  locTable <- data.frame(row.names = as.character(1:expectedLoci),
                         Chr = character(expectedLoci), 
                         Pos = integer(expectedLoci),
                         Ref = character(expectedLoci),
                         matrix(nrow = expectedLoci, ncol = length(extracols),
                                dimnames = list(NULL, extracols)),
                         stringsAsFactors = FALSE)
  alleles2loc <- integer(expectedAlleles)
  alleleNucleotides <- character(expectedAlleles)
  # to track which allele we're on
  currAl <- 0L
  currLoc <- 0L
  
  # create Tabix file
  if(is.character(file) && !endsWith(file, ".bgz")){
    if(file.exists(paste(file, ".bgz", sep = ""))){
      file <- paste(file, ".bgz", sep = "")
    } else {
      message("Compressing file with bgzip.")
      tempbgz <- tempfile(fileext = ".bgz") # temporary file for example
      file <- Rsamtools::bgzip(file, dest = tempbgz)
      message(paste("Compressed VCF sent to", file))
    }
  }
  if(is.character(file) && !file.exists(paste(file, ".tbi", sep = ""))){
    message("Indexing VCF.")
    Rsamtools::indexTabix(file, format = "vcf")
  }
  tfile <- Rsamtools::TabixFile(file, yieldSize = yieldSize)
  
  # set up genome argument if needed
  genome <- VariantAnnotation::seqinfo(hdr)
  if(length(genome) == 0 && length(VariantAnnotation::vcfWhich(svparam)) > 0){
    genome <- unique(names(VariantAnnotation::vcfWhich(svparam)))
    genome <- GenomeInfoDb::Seqinfo(seqnames = genome)
  }

  # Read data one chunk at a time
  open(tfile)
  message("Reading file...")
  while(nrow(vcf <- VariantAnnotation::readVcf(tfile, genome = genome, 
                                               param = svparam))){
    thisNloc <- nrow(vcf) # number of loci in this chunk
    message("Unpacking data from VCF...")
    # reference alleles
    thisRef <- as.character(VariantAnnotation::ref(vcf))
    # alternative alleles; clean out ones w/o alt allele
    thisAltList <- lapply(VariantAnnotation::alt(vcf), function(x){
      char <- as.character(x)
      char <- char[char != ""]
      return(char)}) 
    nAlt <- sapply(thisAltList, length) # n alt alleles per locus
    thisNallele <- thisNloc + sum(nAlt) # n alleles in this chunk
    thisAlt <- unlist(thisAltList)
    
    # put reference and alternative alleles together into alleleNucleotides
    thisAlleleNucleotides <- character(thisNallele)
    alsums <- cumsum(nAlt + 1)
    refpos <- c(1, alsums[-thisNloc] + 1)
    thisAlleleNucleotides[refpos] <- thisRef
    thisAlleleNucleotides[-refpos] <- thisAlt
    # set up alleles2loc
    thisAlleles2loc <- rep(1:thisNloc, times = nAlt + 1)
    # set up locTable
    thisLocTable <- data.frame(row.names = make.unique(row.names(vcf)),
                               Chr = as.character(SummarizedExperiment::seqnames(vcf)),
                               Pos = Biostrings::start(vcf),
                               Ref = thisRef,
                               S4Vectors::mcols(vcf)[,extracols],
                               stringsAsFactors = FALSE)
    # set up depth matrix
    thisDepthVal <- unlist(VariantAnnotation::geno(vcf)[[al.depth.field]])
    thisAlNames <- paste(row.names(vcf)[thisAlleles2loc],
                         thisAlleleNucleotides, sep = "_")
    if(length(thisDepthVal) == length(samples) * thisNallele){
      thisAlDepth <- matrix(thisDepthVal, 
                            nrow = length(samples), ncol = thisNallele,
                            dimnames = list(samples, thisAlNames),
                            byrow = TRUE)
    } else {
      if(length(thisDepthVal) %% length(samples) != 0){
        stop("Unexpected number of allele depth fields.")
      }
      # For issue seen with GATK where there may be more AD values than alleles
      thisAlDepth <- matrix(thisDepthVal, nrow = length(samples),
                            ncol = length(thisDepthVal) %/% length(samples),
                            dimnames = list(samples, NULL),
                            byrow = TRUE)
      thisNADfield <- sapply(VariantAnnotation::geno(vcf)[[al.depth.field]][,1],
                             length)
      ADmismatchLoc <- which(thisNADfield != nAlt + 1)
      # Trim down to right number of alleles, and put in zero reads so locus
      # will be discarded.
      remal <- integer(0)
      for(L in ADmismatchLoc){
        firstal <- ifelse(L == 1, 1L, sum(thisNADfield[1:(L - 1)]) + 1)
        lastal <- sum(thisNADfield[1:L])
        thisAlDepth[, firstal:lastal] <- 0L
        remal <- c(remal, (firstal:lastal)[-(1:(nAlt[L] + 1))])
      }
      thisAlDepth <- thisAlDepth[,-remal]
      colnames(thisAlDepth) <- thisAlNames
    }
    
    # replace NA with zero
    thisAlDepth[is.na(thisAlDepth)] <- 0L
    # how many individuals have each allele
    indperal <- colSums(thisAlDepth > 0)

    # filter loci
    message("Filtering markers...")
    indperloc <- rowSums(rowsum(t(thisAlDepth), thisAlleles2loc) > 0)
    keepLoc <- indperloc >= min.ind.with.reads &
      tapply(indperal, thisAlleles2loc,
             function(x){
               sum(x >= min.ind.with.minor.allele) >= 2
             })
    # get rid of funny stuff from TASSEL-GBSv2
    locWithN <- unique(thisAlleles2loc[grep("N", thisAlleleNucleotides)])
    keepLoc[locWithN] <- FALSE
    # list alleles to remove
    remAl <- which(!thisAlleles2loc %in% which(keepLoc))
    
    # remove cut alleles from allele objects
    if(length(remAl) > 0){
      thisAlleleNucleotides <- thisAlleleNucleotides[-remAl]
      thisAlDepth <- thisAlDepth[, -remAl]
      thisAlleles2loc <- thisAlleles2loc[-remAl]
    }
    # update locTable to reflect cut loci
    thisLocTable <- thisLocTable[keepLoc,]
    # update locus numbers
    thisNloc <- sum(keepLoc)
    if(thisNloc > 0){
      thisAlleles2loc <- rep(1:thisNloc, times = table(thisAlleles2loc))
    } else {
      thisAlleles2loc <- integer(0)
    }
    thisNallele <- length(thisAlleles2loc)
    
    # pad out indels, using VCF convention of listing nucleotide before indel
    indelLoc <- which(tapply(nchar(thisAlleleNucleotides), thisAlleles2loc,
                             function(x) length(unique(x)) > 1))
    for(i in indelLoc){
      thiscol <- which(thisAlleles2loc == i)
      thisAN <- thisAlleleNucleotides[thiscol]
      maxWidth <- max(nchar(thisAN))
      for(a in 1:length(thisAN)){
        while(nchar(thisAN[a]) < maxWidth){
          thisAN[a] <- paste(thisAN[a], "-", sep = "")
        }
      }
      thisAlleleNucleotides[thiscol] <- thisAN
    }

    # group SNPs into tags
    if(phaseSNPs && thisNloc > 0){
      # add the last marker to the current set if appropriate
      # (i.e. if the break between file chunks may have been within a tag)
      if(currLoc > 0 && thisLocTable$Chr[1] == locTable$Chr[currLoc] &&
         thisLocTable$Pos[1] - locTable$Pos[currLoc] < tagsize){
        
        thisLocTable <- rbind(locTable[currLoc,], thisLocTable)
        oldAlCol <- which(alleles2loc == currLoc)
        thisAlleleNucleotides <- c(alleleNucleotides[oldAlCol], 
                                   thisAlleleNucleotides)
        thisAlleles2loc <- c(rep(1, length(oldAlCol)),
                             thisAlleles2loc + 1)
        thisAlDepth <- cbind(alleleDepth[,oldAlCol],
                             thisAlDepth)
        currLoc <- currLoc - 1L
        currAl <- currAl - length(oldAlCol)
      }
      
      # perform grouping + phasing
      consTags <- consolidateSNPs(thisAlDepth, thisAlleles2loc, thisLocTable,
                                  thisAlleleNucleotides, tagsize = tagsize,
                                  refgenome = refgenome, tol = tol)
      thisAlDepth <- consTags$alleleDepth
      thisAlleles2loc <- consTags$alleles2loc
      thisAlleleNucleotides <- consTags$alleleNucleotides
      thisLocTable <- consTags$locTable
      thisNloc <- dim(thisLocTable)[1]
      thisNallele <- length(thisAlleles2loc)
    }
    
    if(thisNloc > 0){
      # add data from this chunk to objects for whole dataset
      thisAlCol <- (1:thisNallele) + currAl
      if(currAl + thisNallele > ncol(alleleDepth)){
        message("Exceeded expected number of alleles; processing may slow.")
        if(currAl == 0){
          alleleDepth <- thisAlDepth
        } else {
          alleleDepth <- cbind(alleleDepth[,1:currAl], thisAlDepth)
        }
      } else {
        alleleDepth[,thisAlCol] <- thisAlDepth
      }
      dimnames(alleleDepth)[[2]][thisAlCol] <- dimnames(thisAlDepth)[[2]]
      alleles2loc[thisAlCol] <- thisAlleles2loc + currLoc
      alleleNucleotides[thisAlCol] <- thisAlleleNucleotides
      if(currLoc + thisNloc > nrow(locTable)){
        message("Exceeded expected number of loci; processing may slow.")
        if(currLoc == 0){
          locTable <- thisLocTable
        } else {
          locTable <- rbind(locTable[1:currLoc,], thisLocTable)
        }
      } else {
        locTable[(1:thisNloc) + currLoc, ] <- thisLocTable
      }
      row.names(locTable)[(1:thisNloc) + currLoc] <- row.names(thisLocTable)
      # update position in the output
      currAl <- currAl + thisNallele
      currLoc <- currLoc + thisNloc
    }
    
    # don't loop if we are using "which" in svparam
    if(is.na(yieldSize)) break
    # quit if we have a limit on how many loci to import.
    if(!is.na(maxLoci) && currLoc >= maxLoci) break
    message("Reading file...")
  }
  close(tfile)
  
  if(currAl == 0 || currLoc == 0){
    stop("No loci passed the missing data and allele frequency thresholds.")
  }
  message(paste(currLoc, "loci imported."))
  
  # trim output
  alleleDepth <- alleleDepth[, 1:currAl]
  alleles2loc <- alleles2loc[1:currAl]
  alleleNucleotides <- alleleNucleotides[1:currAl]
  locTable <- locTable[1:currLoc, ]

  # build RADdata object
  message("Building RADdata object...")
  radout <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
                    contamRate, alleleNucleotides, taxaPloidy)
  message("Merging rare haplotypes...")
  radout <- MergeRareHaplotypes(radout, 
                                min.ind.with.haplotype = min.ind.with.minor.allele)
  radout <- MergeIdenticalHaplotypes(radout)
  radout <- RemoveMonomorphicLoci(radout)
  
  # indicate whether non-variable sites included in alleleNucleotides
  attr(radout$alleleNucleotides, "Variable_sites_only") <- is.null(refgenome) && phaseSNPs
  
  return(radout)
}

# Function to import from Stacks 1.48
readStacks <- function(allelesFile, matchesFolder, version = 2, 
                       min.ind.with.reads = 200,
                       min.ind.with.minor.allele = 10,
                       readAlignmentData = FALSE,
                       sumstatsFile = "populations.sumstats.tsv",
                       possiblePloidies = list(2), taxaPloidy = 2L,
                       contamRate = 0.001){
  # get columns depending on version number
  if(!version %in% c(1,2)){
    stop("Version must be equal to 1 or 2.")
  }
  if(version == 2){
    # for catalog alleles file
    hapcol <- 3
    loccol <- 2
    afcols <- list(NULL, integer(0), character(0), NULL, NULL)
    # for matches file
    mloccol <- 1
    msamcol <- 2
    mhapcol <- 4
    mdepcol <- 5
    mfcols <- list(integer(0), integer(0), NULL, character(0), integer(0),
                   NULL)
    # for sumstats file
    sloccol <- 1
    schrcol <- 2
    sposcol <- 3
    sfcols <- list(integer(0), character(0), integer(0), NULL, NULL, NULL,
                   NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                   NULL, NULL, NULL, NULL, NULL, NULL)
  } else {
    # for catalog alleles file
    hapcol <- 4
    loccol <- 3
    afcols <- list(NULL, NULL, integer(0), character(0), NULL, NULL)
    # for matches file
    mloccol <- 3
    msamcol <- 4
    mhapcol <- 6
    mdepcol <- 7
    mfcols <- list(NULL, NULL, integer(0), integer(0), NULL, character(0),
                   integer(0), NULL)
    # for catalog tags file
    tloccol <- 3
    tchrcol <- 4
    tposcol <- 5
    tstrcol <- 6
    tfcols <- list(NULL, NULL, integer(0), character(0), integer(0),
                   character(0), NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                   NULL)
  }
  
  # read in catalog.alleles.tsv file
  af <- scan(allelesFile, what = afcols,
             sep = "\t", comment.char = "#", na.strings = character(0))
  # get locus names (numbers) and haplotypes for variable sites
  keep <- af[[hapcol]] != ""
  locNames <- af[[loccol]][keep]
  alleleNucleotides <- af[[hapcol]][keep]
  attr(alleleNucleotides, "Variable_sites_only") <- TRUE
  alleleNames <- paste(locNames, alleleNucleotides, sep = "_")
  
  # get all of the sstacks files to read
  sstacksFiles <- list.files(matchesFolder, "\\.matches\\.tsv")
  if(length(sstacksFiles) == 0){
    stop("No .matches.tsv files found.")
  }
  sampleNames <- sub("\\.matches\\.tsv(\\.gz)?$", "", sstacksFiles)
  sstacksFiles <- file.path(matchesFolder, sstacksFiles)
  if(length(sampleNames) < min.ind.with.reads){
    stop("Need to adjust min.ind.with.reads for number of samples in dataset.")
  }
  if(length(sampleNames)/2 < min.ind.with.minor.allele){
    stop("Need to adjust min.ind.with.minor.allele for number of samples in dataset.")
  }
  
  # set up allele depth matrix
  alleleDepth <- matrix(0L, nrow = length(sampleNames), 
                        ncol = length(alleleNames),
                        dimnames = list(sampleNames, alleleNames))
  # vector for reordering samples according to numbers in matches files
  reorder <- integer(length(sampleNames))
  
  # read sstacks files
  for(i in 1:length(sampleNames)){
    mf <- scan(sstacksFiles[i], 
               what = mfcols, sep = "\t", comment.char = "#", 
               na.strings = character(0))
    keep <- mf[[mhapcol]] != "consensus"
    theseLocNames <- mf[[mloccol]][keep]
    theseAlNuc <- mf[[mhapcol]][keep]
    theseDepth <- mf[[mdepcol]][keep]
    theseAlNames <- paste(theseLocNames, theseAlNuc, sep = "_")
    alleleDepth[i, theseAlNames] <- theseDepth
    reorder[mf[[msamcol]][1]] <- i
  }
  # eliminate any  sample numbers that were skipped, and reorder matrix
  reorder <- reorder[!is.na(reorder) & reorder > 0]
  alleleDepth <- alleleDepth[reorder,]
  
  # filter loci
  indperal <- colSums(alleleDepth > 0)
  indperloc <- rowSums(rowsum(t(alleleDepth), locNames) > 0)
  locRemove <- which(indperloc < min.ind.with.reads |
                       tapply(indperal, locNames,
                              function(x){
                                x <- x[-which.max(x)]
                                sum(x) < min.ind.with.minor.allele
                              }))
  alRemove <- which(locNames %in% names(indperloc)[locRemove])
  
  # subset objects
  alleleDepth <- alleleDepth[, -alRemove]
  locNames <- locNames[-alRemove]
  alleleNucleotides <- alleleNucleotides[-alRemove]
  alleleNames <- alleleNames[-alRemove]
  
  # Get chromosome and position
  uniqueLocNames <- unique(locNames)
  if(readAlignmentData){
    if(version == 1){
      tagFile <- sub("alleles", "tags", allelesFile)
      tf <- scan(tagFile, what = tfcols,
                 sep = "\t", comment.char = "#", na.strings = character(0))
      keeprows <- fastmatch::fmatch(uniqueLocNames, tf[[tloccol]])
      locTable <- data.frame(row.names = as.character(uniqueLocNames),
                             Chr = tf[[tchrcol]][keeprows],
                             Pos = tf[[tposcol]][keeprows],
                             Strand = tf[[tstrcol]][keeprows],
                             stringsAsFactors = FALSE)
    }
    if(version == 2){
      sf <- scan(sumstatsFile, what = sfcols,
                 sep = "\t", comment.char = "#", na.strings = character(0))
      keeprows <- fastmatch::fmatch(uniqueLocNames, sf[[sloccol]])
      locTable <- data.frame(row.names = as.character(uniqueLocNames),
                             Chr = sf[[schrcol]][keeprows],
                             Pos = sf[[sposcol]][keeprows],
                             stringsAsFactors = FALSE)
    }
  } else {
    # no alignment data
    locTable <- data.frame(row.names = as.character(uniqueLocNames))
  }
  
  # match depth matrix columns to locus table
  alleles2loc <- fastmatch::fmatch(locNames, uniqueLocNames)
  
  # build RADdata object
  radout <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
                    contamRate, alleleNucleotides, taxaPloidy)
  message("Merging rare haplotypes...")
  radout <- MergeRareHaplotypes(radout, 
                                min.ind.with.haplotype = min.ind.with.minor.allele)
  radout <- MergeIdenticalHaplotypes(radout)
  radout <- RemoveMonomorphicLoci(radout)
  return(radout)
}

# Read in from the TASSEL GBSv2 database with minimal processing.
# Use counts matrix output by GetTagTaxaDistFromDBPlugin, plus SAM file.
readTASSELGBSv2 <- function(tagtaxadistFile, samFile, min.ind.with.reads = 200,
                            min.ind.with.minor.allele = 10,
                            possiblePloidies = list(2), taxaPloidy = 2L,
                            contamRate = 0.001, chromosomes = NULL){
  # read the SAM file
  message("Reading SAM file...")
  samwhat <- list(NULL, 0L, "", 0L, NULL, NULL, NULL, NULL, NULL, "", NULL,
                  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
  samchunk <- scan(samFile, what = samwhat, comment.char = "@", sep = "\t",
                   fill = TRUE, quiet = TRUE)
  samflag <- samchunk[[2]]
  samchr <- samchunk[[3]]
  sampos <- samchunk[[4]]
  samseq <- samchunk[[10]]
  rm(samchunk)
  
  samind <- seq_along(samflag) # index for the tags
  
  # eliminate unaligned tags
  aligned <- bitwAnd(samflag, 4) == 0
  samflag <- samflag[aligned]
  samchr <- samchr[aligned]
  sampos <- sampos[aligned]
  samseq <- samseq[aligned]
  samind <- samind[aligned]
  
  # subset to desired chromosomes
  if(!is.null(chromosomes)){
    chrnotfound <- chromosomes[!chromosomes %fin% samchr]
    if(length(chrnotfound) > 0){
      warning(paste("Chromosomes not found:", paste(chrnotfound, collapse = " ")))
    }
    chrmatch <- samchr %fin% chromosomes
    if(sum(chrmatch) == 0){
      stop("No chromosome names from SAM file match those provided.")
    }
    samflag <- samflag[chrmatch]
    samchr <- samchr[chrmatch]
    sampos <- sampos[chrmatch]
    samseq <- samseq[chrmatch]
    samind <- samind[chrmatch]
  }
  
  # get strand
  samstrand <- ifelse(bitwAnd(samflag, 16) == 0, "top", "bot")
  # get tag sequence length
  samtlen <- nchar(samseq)
  
  # combine to make marker names
  sammrkr <- paste(samchr, sampos, samstrand, samtlen, sep = "-")
  
  # find all non-monomorphic markers
  mrkrTable <- table(sammrkr)
  polymrkr <- names(mrkrTable)[mrkrTable > 1]
  # subset to get only tags belonging to polymorphic markers
  polytags <- sammrkr %fin% polymrkr
  samflag <- samflag[polytags]
  samchr <- samchr[polytags]
  sampos <- sampos[polytags]
  samseq <- samseq[polytags]
  samind <- samind[polytags]
  samstrand <- samstrand[polytags]
  samtlen <- samtlen[polytags]
  sammrkr <- sammrkr[polytags]
  
  # build locTable
  lookup <- fastmatch::fmatch(polymrkr, sammrkr)
  locTable <- data.frame(row.names = polymrkr, Chr = samchr[lookup],
                         Pos = sampos[lookup], Strand = samstrand[lookup],
                         SequenceLength = samtlen[lookup],
                         stringsAsFactors = FALSE)
  locTable <- locTable[order(locTable$Chr, locTable$Pos),]
  
  # open tagtaxadist file
  message("Reading TagTaxaDist file...")
  ttdcon <- file(tagtaxadistFile, open = 'rt')
  # get taxa names
  taxa <- scan(ttdcon, what = character(), sep = "\t", nlines = 1, 
               quiet = TRUE)[-1]
  # set up matrix to contain read counts
  alleleDepth <- matrix(0L, nrow = length(taxa), ncol = length(sammrkr),
                        dimnames = list(taxa, paste(sammrkr, samseq, sep = "_")))
  # setup for reading file in a loop
  whatlist <- list(0L)[rep(1, length(taxa))]
  whatlist <- c(list(""), whatlist) ## might change "" to NULL if we don't care about seq
  chunksize <- 1e5
  # read first chunk
  dataIn <- scan(ttdcon, what = whatlist, sep = "\t", nlines = chunksize,
                 quiet = TRUE)
  nread <- length(dataIn[[1]]) # number of tags read
  lastTag <- 0
  # loop through file
  while(nread){
    # convert imported data to matrix
    datamat <- matrix(unlist(dataIn[-1]), nrow = length(taxa), ncol = nread,
                      byrow = TRUE)
    # find the tags we will keep
    thesetags <- samind > lastTag & samind <= lastTag + nread
    # add to matrix
    alleleDepth[, thesetags] <- datamat[, samind[thesetags] - lastTag]
    # read next chunk
    dataIn <- scan(ttdcon, what = whatlist, sep = "\t", nlines = chunksize,
                   quiet = TRUE)
    nread <- length(dataIn[[1]]) # number of tags read
    lastTag <- lastTag + nread
  }
  # close tagtaxadist file
  close(ttdcon)
  
  # filter loci
  message("Filtering loci...")
  lociToRemove <- integer(0)
  tagsToRemove <- integer(0)
  for(i in 1:nrow(locTable)){
    thesealleles <- which(sammrkr == row.names(locTable)[i])
    thesepres <- alleleDepth[,thesealleles] > 0 # presence or absence of alleles in individuals
    if(sum(rowSums(thesepres) > 0) < min.ind.with.reads ||
       sum(colSums(thesepres) >= min.ind.with.minor.allele) < 2){
      lociToRemove <- c(lociToRemove, i)
      tagsToRemove <- c(tagsToRemove, thesealleles)
    }
  }
  if(length(lociToRemove) > 0){
    locTable <- locTable[-lociToRemove,]
    alleleDepth <- alleleDepth[,-tagsToRemove]
    sammrkr <- sammrkr[-tagsToRemove]
    samseq <- samseq[-tagsToRemove]
  }
  if(nrow(locTable) == 0) stop("No loci passed filtering threshold.")
  
  # build vector to match alleles to loci
  alleles2loc <- fastmatch::fmatch(sammrkr, rownames(locTable))
  # sort alleles to keep things tidy
  tagorder <- order(alleles2loc)
  alleles2loc <- sort(alleles2loc)
  alleleDepth <- alleleDepth[,tagorder]
  samseq <- samseq[tagorder]
  
  # build RADdata object
  message("Building RADdata object...")
  radout <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
                    contamRate, samseq, taxaPloidy)
  radout <- MergeRareHaplotypes(radout, 
                                min.ind.with.haplotype = min.ind.with.minor.allele)
  radout <- MergeIdenticalHaplotypes(radout)
  radout <- RemoveMonomorphicLoci(radout)
  attr(radout$alleleNucleotides, "Variable_sites_only") <- FALSE
  return(radout)
}

# Function to read output of process_sam_multi.py, so the user can get a
# preliminary look at Hind/He distribution before going forward with
# the isolocus_splitter script.
readProcessSamMulti <- function(alignfile, depthfile = sub("align", "depth", alignfile),
                                expectedLoci = 1000, min.ind.with.reads = 200,
                                min.ind.with.minor.allele = 10,
                                possiblePloidies = list(2), taxaPloidy = 2L,
                                contamRate = 0.001, expectedAlleles = expectedLoci * 15,
                                maxLoci = expectedLoci){
  # read file headers
  aligncon <- file(alignfile, open = 'r')
  depthcon <- file(depthfile, open = 'r')
  alignheader <- scan(aligncon, sep = ",", nlines = 1, what = character(),
                      quiet = TRUE)
  depthheader <- scan(depthcon, sep = ",", nlines = 1, what = character(),
                      quiet = TRUE)
  nalign <- (length(alignheader) - 1) / 4 # number of alignment positions reported
  samples <- depthheader[-1]
  nsam <- length(samples)
  
  # set up objects to go into RADdata object
  locTable <- data.frame(Chr = rep(NA_character_, expectedLoci),
                         Pos = rep(NA_integer_, expectedLoci),
                         stringsAsFactors = FALSE)
  locnames <- rep(NA_character_, expectedLoci)
  alleles2loc <- rep(NA_integer_, expectedAlleles)
  alleleNucleotides <- rep(NA_character_, expectedAlleles)
  alleleDepth <- matrix(0L, nrow = nsam, ncol = expectedAlleles,
                        dimnames = list(samples, NULL))
  allelenames <- rep(NA_character_, expectedAlleles)
  
  # count loci and alleles read in (passing any filtering)
  loccount <- 0L
  alcount <- 0L
  
  # read the files
  nscan <- 1e4 # number of lines to read at once
  whatlistalign <- list(character(), integer(), NULL)
  whatlistalign <- whatlistalign[c(rep(1, nalign + 1), rep(2, nalign), rep(3, 2 * nalign))]
  whatlistdepth <- list(character(), integer())
  whatlistdepth <- whatlistdepth[c(1, rep(2, nsam))]
  # dummy objects; will hold the last partial marker read
  lastdepthmat <- matrix(integer(0), nrow = 0, ncol = nsam)
  lastaligndata <- whatlistalign
  lastchunk <- FALSE # indicates if we have read the last chunk of the file
  while(loccount < maxLoci && !lastchunk){
    aligndata <- scan(aligncon, sep = ",", what = whatlistalign, nlines = nscan,
                      quiet = TRUE)
    depthdata <- scan(depthcon, sep = ",", what = whatlistdepth, nlines = nscan,
                      quiet = TRUE)
    nread <- length(aligndata[[1]])
    lastchunk <- nread == 0 # is this the last chunk of the file?
    stopifnot(nread == length(depthdata[[1]]))
    if(lastchunk){ # end of file
      depthmat <- lastdepthmat
      aligndata <- lastaligndata
      if(nrow(depthmat) == 0) break
    } else {
      depthmat <- matrix(unlist(depthdata[-1]), nrow = nread, ncol = nsam)
      # merge in any potentially unfinished markers from last chunk
      depthmat <- rbind(lastdepthmat, depthmat)
      for(i in 1:(length(whatlistalign) - nalign)){
        aligndata[[i]] <- c(lastaligndata[[i]], aligndata[[i]])
      }
    }
    nread <- nrow(depthmat) # updated if any loci from last chunk added in
    if(nread < 2) break # quit if there's nothing left to process
    # convert number of mutations to matrix for easier processing
    NM <- matrix(unlist(aligndata[(1:nalign) + (1 + nalign)]),
                 nrow = nread, ncol = nalign)
    
    # find groups of alleles from the same set of alignment positions and process them
    theseal <- 1L
    for(i in 2:nread){
      samegroup <- all(sapply(1:nalign, function(x) aligndata[[x]][i] == aligndata[[x]][i-1]))
      if(samegroup){
        theseal <- c(theseal, i)
      }
      if(i == nread){
        lastaligndata <- lapply(aligndata, function(x) x[theseal])
        lastdepthmat <- depthmat[theseal,]
      }
      if(!samegroup || (lastchunk && i == nread)) { # we have one complete group of alleles
        # divide up into loci based on number of mutations from reference
        thisNM <- NM[theseal,, drop = FALSE]
        thisNM <- thisNM[,!is.na(thisNM[1,]), drop = FALSE]
        hapAssign <- InitHapAssign(thisNM)
        hapAssign <- lapply(1:nalign, function(x) which(hapAssign == x))
        for(L in 1:nalign){
          # skip monomorphic loci or those with no alleles
          if(length(hapAssign[[L]]) < 2) next
          # filter by missing data rate
          thesealSub <- theseal[hapAssign[[L]]]
          if(sum(colSums(depthmat[thesealSub,]) > 0) < min.ind.with.reads) next
          # filter by minor allele frequency
          if(sum(rowSums(depthmat[thesealSub,] > 0) >= min.ind.with.minor.allele) < 2) next
          # add the locus in if it passed everything
          thislocname <-aligndata[[L]][thesealSub[1]]
          firstal <- alcount + 1L
          alcount <- alcount + length(thesealSub)
          if(thislocname %in% locnames){ # same locus may show up in multiple groups
            alleles2loc[firstal:alcount] <- match(thislocname, locnames)
          } else {
            loccount <- loccount + 1L
            locnames[loccount] <- thislocname
            splitlocname <- strsplit(locnames[loccount], "-")[[1]]
            locTable$Chr[loccount] <- splitlocname[1]
            locTable$Pos[loccount] <- as.integer(splitlocname[2])
            alleles2loc[firstal:alcount] <- loccount
          }
          alleleNucleotides[firstal:alcount] <- aligndata[[nalign+1]][thesealSub]
          allelenames[firstal:alcount] <-
            paste(locnames[loccount], alleleNucleotides[firstal:alcount], sep = "_")
          if(alcount > ncol(alleleDepth)){
            alleleDepth <- alleleDepth[,1:(firstal-1)]
            alleleDepth <- cbind(alleleDepth, t(depthmat[thesealSub,]))
          } else {
            alleleDepth[,firstal:alcount] <- t(depthmat[thesealSub,])
          }
          if(loccount >= maxLoci) break
        }
      }
      if(!samegroup){
        theseal <- i
      }
      if(loccount >= maxLoci) break
    }
  }
  
  # wrap-up and build RADdata object
  close(aligncon)
  close(depthcon)
  
  locnames <- locnames[1:loccount]
  locTable <- locTable[1:loccount,]
  allelenames <- allelenames[1:alcount]
  alleleNucleotides <- alleleNucleotides[1:alcount]
  alleleDepth <- alleleDepth[,1:alcount]
  alleles2loc <- alleles2loc[1:alcount]
  
  alleleorder <- order(alleles2loc)
  allelenames <- allelenames[alleleorder]
  alleleNucleotides <- alleleNucleotides[alleleorder]
  alleleDepth <- alleleDepth[,alleleorder]
  alleles2loc <- alleles2loc[alleleorder]
  
  rownames(locTable) <- locnames
  colnames(alleleDepth) <- allelenames
  out <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
                 contamRate, alleleNucleotides, taxaPloidy)
#  out <- MergeRareHaplotypes(out, min.ind.with.haplotype = min.ind.with.minor.allele)
#  out <- RemoveMonomorphicLoci(out)
  return(out)
}

readProcessIsoloci <- function(sortedfile, min.ind.with.reads = 200,
                               min.ind.with.minor.allele = 10,
                               min.median.read.depth = 10,
                               possiblePloidies = list(2), taxaPloidy = 2L,
                               contamRate = 0.001, nameFromTagStart = TRUE,
                               mergeRareHap = TRUE){
  message("Reading file...")
  incon <- file(sortedfile, open = "r")
  # read header
  header <- scan(incon, sep = ",", nlines = 1, what = character(), quiet = TRUE)
  samples <- header[-(1:5)]
  nSam <- length(samples)
  scanwhat <- list(character(), integer(), character(), character(), NULL, integer())
  scanwhat <- scanwhat[c(1:5, rep(6, nSam))]
  
  # read file
  mydata <- scan(incon, sep = ",", what = scanwhat, quiet = TRUE)
  close(incon)
  nAl <- length(mydata[[1]])
  
  # get depth matrix
  alleleDepth <- matrix(unlist(mydata[(1:nSam) + 5]), nrow = nSam, ncol = nAl,
                         byrow = TRUE, dimnames = list(samples, NULL))
  if(any(is.na(alleleDepth))){
    stop("Missing data in depth matrix.")
  }
  mydata <- mydata[1:4] # free up space
  # factor by locus, sorting locus names
  alleles2loc_factor <- as.factor(mydata[[1]])
  loci <- levels(alleles2loc_factor)
  nLoc <- length(loci)
  alleles2loc <- as.integer(alleles2loc_factor)
  
  # perform filtering
  message("Filtering and sorting loci...")
  keeploc <- integer(0)
  for(L in 1:nLoc){
    submat <- alleleDepth[,alleles2loc == L, drop = FALSE]
    depthperind <- rowSums(submat)
    if(sum(depthperind > 0) >= min.ind.with.reads &&
       sum(colSums(submat > 0) >= min.ind.with.minor.allele) > 1 &&
       median(depthperind) >= min.median.read.depth){
      keeploc <- c(keeploc, L)
    }
  }
  keepal <- which(alleles2loc %fin% keeploc)
  
  alleles2loc_factor <- droplevels(alleles2loc_factor[keepal])
  loci <- levels(alleles2loc_factor)
  alleleDepth <- alleleDepth[,keepal, drop = FALSE]
  alleles2loc <- as.integer(alleles2loc_factor)
  alleleNucleotides <- mydata[[3]][keepal]
  refNucleotides <- mydata[[4]][keepal]
  colnames(alleleDepth) <- paste(alleles2loc_factor, alleleNucleotides,
                                 sep = "_")
  
  # sort by locus name (i.e. position and chromosome)
  alorder <- order(alleles2loc)
  alleleDepth <- alleleDepth[, alorder, drop = FALSE]
  alleleNucleotides <- alleleNucleotides[alorder]
  alleles2loc <- alleles2loc[alorder]
  
  # build locTable
  chrom <- sub("\\-.*$", "", loci)
  loc2al <- fastmatch::fmatch(loci, mydata[[1]])
  pos <- mydata[[2]][loc2al]
  ref <- mydata[[4]][loc2al]
  if(!nameFromTagStart){
    loci <- paste(chrom, pos, sep = "-")
  }
  locTable <- data.frame(row.names = loci,
                         Chr = chrom, Pos = pos,
                         Ref = ref,
                         stringsAsFactors = FALSE)
  
  # build RADdata object
  message("Building RADdata object...")
  attr(alleleNucleotides, "Variable_sites_only") <- FALSE
  radout <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
                    contamRate, alleleNucleotides, taxaPloidy)
  radout <- MergeIdenticalHaplotypes(radout)
  if(mergeRareHap){
    radout <- MergeRareHaplotypes(radout, 
                                  min.ind.with.haplotype = min.ind.with.minor.allele)
    radout <- MergeIdenticalHaplotypes(radout)
    radout <- RemoveMonomorphicLoci(radout)
  }
  return(radout)
}

readDArTag <- function(file, botloci = NULL, blastfile = NULL,
                       excludeHaps = NULL, includeHaps = NULL,
                       n.header.rows = 0, sample.name.row = 1, 
                       trim.sample.names = "_[^_]+_[ABCDEFGH][[:digit:]][012]?$",
                       sep.counts = ",", sep.blast = "\t",
                       possiblePloidies = list(2), taxaPloidy = 2L,
                       contamRate = 0.001){
  if(!is.null(excludeHaps) && !is.null(includeHaps)){
    stop("Only specify one of excludeHaps or includeHaps")
  }
  if(is.null(botloci) && is.null(blastfile)){
    stop("Need to specify botloci or blastfile.")
  }
  if(!is.null(botloci) && !is.null(blastfile)){
    stop("Only specify one of botloci or blastfile.")
  }
  message("Importing read counts...")
  mycon <- file(file, open = 'r')
  hdr <- readLines(mycon, n = n.header.rows)
  tab <- read.table(mycon, header = TRUE, sep = sep.counts,
                    stringsAsFactors = FALSE)
  close(mycon)
  
  # determine number of leading columns
  if(n.header.rows > 0){
    hdr.split <- strsplit(hdr, split = ",")
    n.lead.cols <-
      unique(sapply(hdr.split, function(x) min(grep(".+", x)) - 1))
    if(length(n.lead.cols) != 1){
      stop("Not all sample headers start on same column. Be sure not to count the row starting with AlleleID towards n.header.rows.")
    }
  } else {
    n.lead.cols <- sum(colnames(tab) %in% c("AlleleID", "CloneID", "AlleleSequence", "readCountSum"))
  }
  
  if(!all(c("AlleleID", "CloneID", "AlleleSequence") %in% colnames(tab))){
    stop("Need AlleleID, CloneID, and AlleleSequence columns.")
  }
  
  # Get sample names
  if(sample.name.row == n.header.rows + 1){
    samples <- colnames(tab)[-seq_len(n.lead.cols)]
  } else {
    if(sample.name.row < 1 || sample.name.row > n.header.rows){
      stop("sample.name.row should be within header rows or column headers.")
    }
    samples <- hdr.split[[sample.name.row]][-seq_len(n.lead.cols)]
  }
  if(anyDuplicated(samples)){
    stop("Not all sample names are unique.  Check sample.name.row.")
  }
  if(trim.sample.names != ""){
    samplesOLD <- samples
    samples <- sub(trim.sample.names, "", samples)
    dups <- duplicated(samples)
    if(any(dups)){
      warning("Some sample names not trimmed, to avoid duplicates.")
      samples[dups] <- samplesOLD[dups]
    }
    stopifnot(!anyDuplicated(samples))
  }
  
  # Filter haplotypes if needed
  if(!is.null(includeHaps)){
    if(!all(includeHaps %in% tab$AlleleID)){
      stop("Not all haplotypes in includeHaps found in AlleleID.")
    }
    tab <- tab[tab$AlleleID %in% includeHaps,]
  }
  if(!is.null(excludeHaps)){
    if(!all(excludeHaps %in% tab$AlleleID)){
      warning("Not all haplotypes in excludeHaps found in AlleleID.")
    }
    tab <- tab[!tab$AlleleID %in% excludeHaps,]
  }
  if(anyDuplicated(tab$AlleleID)){
    stop("Duplicate AlleleIDs found.")
  }
  
  # Build locTable
  loci <- unique(tab$CloneID)
  refals <- sapply(loci, function(L) grep(paste0("^", L, "\\|Ref(_[0]*1)?$"), tab$AlleleID, value = TRUE))
  if(is.list(refals) || is.matrix(refals)){
    stop("More than one reference allele found per locus.")
  }
  locTable <- data.frame(row.names = loci,
                         Chr = sub("_[[:digit:]]+$", "", loci),
                         Pos = as.integer(sub("^.+_", "", loci)))
  
  # Trim allele tags as needed. DArT provided newer alleles as longer sequence.
  taglength <- nchar(tab$AlleleSequence)
  minlength <- tapply(taglength, tab$CloneID, min)
  if(length(unique(minlength)) == 1){
    tab$AlleleSequence <- substring(tab$AlleleSequence, 1, minlength[1])
  } else {
    for(L in loci){
      theserows <- which(tab$CloneID == L)
      tab$AlleleSequence[theserows] <- substring(tab$AlleleSequence[theserows], 1, minlength[L])
    }
  }
  
  # Do reverse complement where appropriate
  if(!is.null(blastfile)){
    # Import BLAST results
    message("Importing BLAST results...")
    blastres <- read.table(blastfile, header = TRUE, sep = sep.blast,
                           stringsAsFactors = FALSE)
    botbool <- logical(length(loci))
    alignedbool <- logical(length(loci))
    qidcol <- which(colnames(blastres) %in% c("qseqid", "Query"))
    subcol <- which(colnames(blastres) %in% c("sseqid", "Subject"))
    sstartcol <- which(colnames(blastres) %in% c("sstart", "S_start"))
    sendcol <- which(colnames(blastres) %in% c("send", "S_end"))
    pidentcol <- which(colnames(blastres) %in% c("pident", "X.Identity"))
    perfectmatch <- blastres[[pidentcol]] == 100
    if(!all(lengths(list(qidcol, subcol, sstartcol, sendcol, pidentcol)) == 1)){
      stop("Problem with column headers in BLAST file.")
    }
    for(i in seq_along(loci)){
      # Find perfect matches for the reference allele
      theserows <- which(blastres[[qidcol]] == refals[i] & perfectmatch)
      # Check that chromosome is correct
      theserows <- theserows[grep(paste0("(.+_)?", locTable$Chr[i], "$"),
                                  blastres[[subcol]][theserows])]
      # Check that position is correct
      thispos <- locTable$Pos[i]
      theserows <- theserows[(blastres[[sstartcol]][theserows] <= thispos &
                                blastres[[sendcol]][theserows] >= thispos) |
                               (blastres[[sstartcol]][theserows] >= thispos &
                                  blastres[[sendcol]][theserows] <= thispos)]
      if(length(theserows) >= 1){
        alignedbool[i] <- TRUE
        # Determine strandedness
        botbool[i] <-
          blastres[[sstartcol]][theserows[1]] > blastres[[sendcol]][theserows[1]]
      }
    }
    botloci <- loci[botbool]
    # subset to loci with alignments
    loci <- loci[alignedbool]
    refals <- refals[alignedbool]
    locTable <- locTable[alignedbool,]
    tab <- tab[tab$CloneID %in% loci,]
    nnotaligned <- sum(!alignedbool)
    if(nnotaligned > 0){
      warning(paste("Discarded", nnotaligned, "loci without correct alignments."))
    }
  }
  botrows <- which(tab$CloneID %in% botloci)
  tab$AlleleSequence[botrows] <- reverseComplement(tab$AlleleSequence[botrows])
  locTable$Tag_strand <- ifelse(loci %in% botloci, "bot", "top")

  # Add reference sequence to locTable
  if(!all(refals %in% tab$AlleleID)){
    warning("Reference sequence not recorded due to not all loci having reference alleles.  Expecting Ref_001.")
    warning("Positions incorrect because target SNP could not be ascertained.")
  } else {
    locTable$Ref <- tab$AlleleSequence[match(refals, tab$AlleleID)]
    altals <- sapply(loci, function(L) grep(paste0("^", L, "\\|Alt(_[0]*2)?$"), tab$AlleleID, value = TRUE))
    if(is.list(altals) || is.matrix(altals)){
      warning("Positions incorrect because target SNP could not be ascertained.")
    } else {
      # Convert position from target SNP to tag start
      altseq <- tab$AlleleSequence[match(altals, tab$AlleleID)]
      snppos <-
        mapply(function(x, y) which(charToRaw(x) != charToRaw(y))[1],
               locTable$Ref, altseq, USE.NAMES = FALSE)
      locTable$Pos <- locTable$Pos - snppos + 1L
    }
  }
  
  # Allele info
  alleles2loc <- match(tab$CloneID, loci)
  alleleNucleotides <- tab$AlleleSequence
  attr(alleleNucleotides, "Variable_sites_only") <- FALSE
  
  # Allelic read depth
  alleleDepth <- t(as.matrix(tab[,-seq_len(n.lead.cols)]))
  colnames(alleleDepth) <- tab$AlleleID
  rownames(alleleDepth) <- samples
  
  message("Building RADdata object...")
  out <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
                 contamRate, alleleNucleotides, taxaPloidy)
  message("Merging identical haplotypes...")
  out <- MergeIdenticalHaplotypes(out)
  return(out)
}

Try the polyRAD package in your browser

Any scripts or data that you put into this service are public.

polyRAD documentation built on Nov. 10, 2022, 5:14 p.m.