# functions for import of data into RADdata object
# Function to import read counts from UNEAK output (HapMap.hmc.txt).
# includeLoci should be a character vector of locus names to keep in output.
# If shortIndNames is TRUE, just keep the part of the individual name before
# the first underscore.
# Import read counts from UNEAK output (HapMap.hmc.txt) into a RADdata object.
#
# file: path to the HapMap.hmc.txt file produced by the TASSEL UNEAK pipeline.
# includeLoci: optional character vector of locus names to keep; NULL keeps all.
# shortIndNames: if TRUE, truncate each individual name at the first underscore.
# possiblePloidies, taxaPloidy, contamRate: passed through to RADdata.
# fastafile: path to the matching fas.txt file of tag sequences; by default
#   derived from 'file' by substituting the file extension.
# Returns a RADdata object with two alleles per locus.
readHMC <- function(file, includeLoci=NULL, shortIndNames=TRUE,
                    possiblePloidies = list(2), taxaPloidy = 2L,
                    contamRate = 0.001,
                    fastafile = sub("hmc.txt", "fas.txt", file, fixed = TRUE)){
  # read in file; not using read.table, to preserve indiv. names
  filelines <- strsplit(readLines(file), split="\t")
  # extract names of individuals (header columns between the locus name
  # column and the HetCount_allele1 column)
  indstrings <- filelines[[1]][2 :
                                 (match("HetCount_allele1", filelines[[1]]) - 1)]
  if(shortIndNames){
    indstrings <- vapply(strsplit(indstrings, split="_", fixed=TRUE),
                         function(x) x[1], character(1))
  }
  # number of individuals
  nInd <- length(indstrings)
  # number of loci; plain if/else since this is a scalar decision
  if(is.null(includeLoci)){
    nLoc <- length(filelines) - 1
  } else {
    nLoc <- length(includeLoci)
  }
  # set up matrix to contain results; two columns (alleles) per locus
  alDepth <- matrix(0L, nrow = nInd, ncol = 2*nLoc,
                    dimnames = list(indstrings, NULL))
  # loop to fill in data
  locnames <- character(nLoc) # names of loci
  if(is.null(includeLoci)){ # if we are using all loci in the file
    for(i in seq_len(nLoc)){
      locnames[i] <- filelines[[i+1]][1]
      # counts are formatted as "depth1|depth2" for the two tag alleles
      thesecounts <- strsplit(filelines[[i+1]][2:(nInd+1)],
                              split="|", fixed=TRUE)
      alDepth[,2*i - 1] <- vapply(thesecounts,
                                  function(x) as.integer(x[1]), integer(1))
      alDepth[,2*i] <- vapply(thesecounts,
                              function(x) as.integer(x[2]), integer(1))
    }
  } else { # if we are just using a subset of loci
    locnames <- includeLoci
    for(i in 2:length(filelines)){
      # get locus name for this line of the file and see if it is in
      # the list of loci that we are keeping. If so, what position?
      Lnum <- fastmatch::fmatch(filelines[[i]][1], locnames)
      # go to next line of file if we don't want this locus
      if(is.na(Lnum)) next
      thesecounts <- strsplit(filelines[[i]][2:(nInd+1)],
                              split="|", fixed=TRUE)
      alDepth[,Lnum*2 - 1] <- vapply(thesecounts,
                                     function(x) as.integer(x[1]), integer(1))
      alDepth[,Lnum*2] <- vapply(thesecounts,
                                 function(x) as.integer(x[2]), integer(1))
    }
  }
  # allele names are the locus name plus _0 or _1
  dimnames(alDepth)[[2]] <- paste(rep(locnames, each = 2), c(0,1), sep = "_")
  # get nucleotides for the alleles from the fas.txt file (FASTA-like:
  # alternating header and sequence lines, two tags per locus)
  fastalines <- readLines(fastafile)
  fastaseq <- fastalines[seq(2,length(fastalines), by = 2)] # just sequences
  # locus names from the header lines, stripping the leading ">"
  fastaloc <- vapply(strsplit(fastalines[seq(1,length(fastalines)-1, by = 2)],
                              split = "_"),
                     function(x) substring(x[1], 2, nchar(x[1])),
                     character(1))
  alleleNucleotides <- character(nLoc * 2)
  for(i in seq_len(nLoc)){
    rowid <- fastmatch::fmatch(locnames[i], fastaloc)
    # the two tag sequences for this locus are on consecutive rows
    theseseq <- strsplit(fastaseq[c(rowid, rowid + 1)], split = "")
    # position where the two tags differ (UNEAK loci have a single SNP)
    varpos <- theseseq[[1]] != theseseq[[2]]
    alleleNucleotides[2*i - 1] <- theseseq[[1]][varpos]
    alleleNucleotides[2*i] <- theseseq[[2]][varpos]
  }
  return(RADdata(alleleDepth = alDepth,
                 alleles2loc = rep(seq_len(nLoc), each = 2),
                 locTable = data.frame(row.names = locnames),
                 possiblePloidies = possiblePloidies,
                 contamRate = contamRate,
                 alleleNucleotides = alleleNucleotides,
                 taxaPloidy = taxaPloidy))
}
# function to import read counts from TagDigger
# Import read counts from TagDigger output into a RADdata object.
#
# countfile: CSV of read counts, taxa in rows and alleles in columns; allele
#   names are formatted as markerName_variableNucleotides.
# includeLoci: optional character vector of marker names to retain.
# possiblePloidies, taxaPloidy, contamRate: passed through to RADdata.
# dbfile: optional CSV marker database with chromosome and position columns.
# dbColumnsToKeep: optional character vector of database columns to retain.
# dbChrCol, dbPosCol, dbNameCol: names of the database columns holding the
#   chromosome, position, and marker name, respectively.
# Returns a RADdata object.
readTagDigger <- function(countfile, includeLoci = NULL,
                          possiblePloidies = list(2), taxaPloidy = 2L,
                          contamRate = 0.001,
                          dbfile = NULL, dbColumnsToKeep = NULL,
                          dbChrCol = "Chr", dbPosCol = "Pos",
                          dbNameCol = "Marker name"){
  # read marker database, if applicable
  if(!is.null(dbfile)){
    mydb <- read.csv(dbfile, header = TRUE, stringsAsFactors = FALSE,
                     row.names = make.names(dbNameCol))
    if(!is.null(dbColumnsToKeep)){ # only retain subset of columns
      # drop = FALSE so a single retained column stays a data frame
      mydb <- mydb[, make.names(dbColumnsToKeep), drop = FALSE]
    }
    if(!all(make.names(c(dbChrCol, dbPosCol)) %in% names(mydb))){
      stop(paste(dbChrCol, "and", dbPosCol, "not found in column names of",
                 dbfile))
    }
    # standardize chromosome and position column names for RADdata
    names(mydb)[match(make.names(dbChrCol), names(mydb))] <- "Chr"
    names(mydb)[match(make.names(dbPosCol), names(mydb))] <- "Pos"
    # subset by loci
    if(!is.null(includeLoci)){
      mydb <- mydb[row.names(mydb) %fin% includeLoci,]
    }
    if(dim(mydb)[1] == 0) stop("includeLoci and mydb don't match")
  }
  # read the counts with scan; more code but a lot less processing time
  # than read.csv on a large counts matrix
  mycon <- file(countfile, open = 'r')
  on.exit(close(mycon), add = TRUE) # don't leak the connection on error
  myheader <- scan(mycon, what = "", nlines = 1, sep = ",")
  alleles <- myheader[-1]
  nalleles <- length(alleles)
  # one integer column per allele, plus a character column for taxa names
  whatlist <- rep(list(0L), nalleles)
  names(whatlist) <- alleles
  whatlist <- c(list(Taxa = ""), whatlist)
  mydata <- scan(mycon, what = whatlist, sep = ",")
  taxa <- mydata[[1]]
  mydata <- mydata[-1]
  mycounts <- matrix(unlist(mydata), nrow = length(taxa), ncol = nalleles,
                     dimnames = list(taxa, alleles))
  # extract marker names from allele names (text before the underscore)
  mrkrNamesByAl <- vapply(strsplit(alleles, split = "_"),
                          function(x) x[1], character(1))
  # subset by loci
  if(!is.null(includeLoci)){
    alToKeep <- mrkrNamesByAl %fin% includeLoci
    if(sum(alToKeep) == 0) stop("countsfile and includeLoci don't match")
    mrkrNamesByAl <- mrkrNamesByAl[alToKeep]
    # drop = FALSE so a single retained allele stays a matrix
    mycounts <- mycounts[, alToKeep, drop = FALSE]
  }
  # make locTable if no marker database was provided
  if(is.null(dbfile)){
    mydb <- data.frame(row.names = unique(mrkrNamesByAl))
  }
  # get locTable indices for counts matrix columns
  alleles2loc <- fastmatch::fmatch(mrkrNamesByAl, row.names(mydb))
  # error if there isn't a match in marker names between the two files
  if(any(is.na(alleles2loc))){
    # bug fix: this previously referenced `file` (the base R function)
    # instead of the countfile argument, which broke the error path
    cat(paste("Some markers in", countfile, "not found in", dbfile),
        sep = "\n")
    sadMarkers <- unique(mrkrNamesByAl[is.na(alleles2loc)])
    maxMrkrToPrint <- 10
    # print at most maxMrkrToPrint names (min, not max, so we never index
    # past the end of sadMarkers and print NAs)
    cat(sadMarkers[1:min(maxMrkrToPrint, length(sadMarkers))], sep = "\n")
    if(length(sadMarkers) > maxMrkrToPrint){
      cat("...", sep = "\n")
    }
    stop(paste("Some markers in", countfile, "not found in", dbfile))
  }
  # get variable nucleotides for tags (text after the underscore)
  myNT <- vapply(strsplit(dimnames(mycounts)[[2]], split = "_"),
                 function(x) x[2], character(1))
  # make RADdata object
  return(RADdata(mycounts, alleles2loc, mydb, possiblePloidies, contamRate,
                 myNT, taxaPloidy))
}
# Function to consolidate loci imported from VCF back into tags.
# Essentially do phasing on the SNPs in the VCF to make haplotypes, using read depth.
# Assume locTable is already sorted by chromosome and position.
# A reference genome can be provided as FASTA or compressed, indexed FASTA
# from Bioconductor in order to get nucleotides in between variable sites.
# refgenome should either be an FaFile object or a path to a FASTA file for the genome
# tol indicates how dissimilar read depth can be and still have two alleles
# combined into a tag.
consolidateSNPs <- function(alleleDepth, alleles2loc, locTable, alleleNucleotides,
                            tagsize = 80, refgenome = NULL, tol = 0.01){
  if(!all(c("Chr", "Pos") %in% names(locTable))){
    stop("locTable needs Chr for chromosome and Pos for position")
  }
  # Set up reference genome if provided. Make FaFile object.
  if(!is.null(refgenome) && !is(refgenome, "FaFile")){
    if(!is.character(refgenome)){
      stop("refgenome must be character string or FaFile object.")
    }
    if(!file.exists(refgenome)){
      stop(paste("File", refgenome, "not found."))
    }
    if(!file.exists(sprintf("%s.fai", refgenome))){
      # index the fasta file if necessary
      message("Creating FASTA file index...")
      Rsamtools::indexFa(refgenome)
    }
    refgenome <- Rsamtools::FaFile(refgenome)
  }
  # get chromosome names in FASTA
  if(!is.null(refgenome)){
    FAchrnames <- GenomeInfoDb::seqnames(GenomeInfoDb::seqinfo(refgenome))
  }
  nLoc <- dim(locTable)[1] # number of loci to start
  # rows in locTable for each chromosome
  rowsByChr <- tapply(1:nLoc, locTable$Chr, function(x) x)
  nAl <- dim(alleleDepth)[2] # number of alleles to start
  nInd <- dim(alleleDepth)[1] # number of individuals
  # preallocate objects to output; sized at the input dimensions since
  # consolidation can only reduce the number of loci/alleles, never grow it
  alleleDepthOut <- matrix(0L, nrow = nInd, ncol = nAl,
                           dimnames = list(dimnames(alleleDepth)[[1]],
                                           as.character(1:nAl)))
  alleles2locOut <- integer(nAl)
  alleleNucleotidesOut <- character(nAl)
  locTableOut <- data.frame(row.names = as.character(1:nLoc),
                            Chr = character(nLoc),
                            Pos = integer(nLoc),
                            Ref = character(nLoc),
                            stringsAsFactors = FALSE)
  # variables to keep track of which allele and locus we are on
  currAlOut <- 1L
  currLocOut <- 1L
  ### Internal functions for consolidateSNP ####
  # function for finding allele matches by looking for individuals with only one allele.
  # only searches for homozygotes in marker 1; run this function in both directions.
  findMatchesHomoz <- function(depth1, depth2){
    # which individuals just have one allele for marker 1,
    # and equal depth for both markers.
    homoz1 <- which(rowSums(depth1 > 0) == 1 & rowSums(depth1) == rowSums(depth2))
    match1 <- integer(0) # to hold allele indices for marker 1
    match2 <- integer(0) # to hold allele indices for marker 2
    for(alCol in 1:dim(depth1)[2]){
      # homozygotes for this particular allele in marker 1
      homoz1Al <- homoz1[depth1[homoz1,alCol] > 0]
      # alleles in marker 2 that have reads for those homozygotes
      thisAlMatch <- which(colSums(depth2[homoz1Al,,drop=FALSE]) > 0)
      # add them to match output
      nNewMatches <- length(thisAlMatch)
      match1 <- c(match1, rep(alCol, nNewMatches))
      match2 <- c(match2, thisAlMatch)
    }
    # one row per candidate pairing: column 1 is the marker 1 allele index,
    # column 2 is the matching marker 2 allele index
    return(matrix(c(match1, match2), nrow = length(match1), ncol = 2))
  }
  # function to identify remaining matches not found using homozygotes
  findRemainingMatches <- function(depth1, depth2, alMatch){
    # find individuals with more than one allele for both markers,
    # and equal depth for both markers.
    notHomoz <- which(rowSums(depth1 > 0) > 1 & rowSums(depth2 > 0) > 1 &
                        rowSums(depth1) == rowSums(depth2))
    # try consolidating read depth with existing new markers
    cons <- consolidateDepth(depth1[notHomoz,, drop = FALSE],
                             depth2[notHomoz,, drop = FALSE], alMatch)
    # find individuals that weren't fully matched
    nm <- rowSums(cons) < rowSums(depth1[notHomoz,, drop = FALSE])
    nmInd <- notHomoz[nm]
    cons <- cons[nm,, drop = FALSE]
    # add new alleles until we can match everything, or until a full pass
    # makes no progress (to guarantee termination)
    progress <- TRUE
    while(length(nmInd) > 0 && progress){
      progress <- FALSE
      # Temporary copies of depth matrices that we can subtract from
      depth1T <- depth1[nmInd,, drop = FALSE]
      depth2T <- depth2[nmInd,, drop = FALSE]
      # loop through non-matched individuals
      for(i in 1:length(nmInd)){
        # loop through known new alleles for this ind.
        for(newAl in which(cons[i,] > 0)){
          # corresponding alleles in old markers
          al1 <- alMatch[newAl, 1]
          al2 <- alMatch[newAl, 2]
          # remove reads for this allele
          depth1T[i, al1] <- depth1T[i, al1] - cons[i, newAl]
          depth2T[i, al2] <- depth2T[i, al2] - cons[i, newAl]
        }
      }
      # look for matches with what's left; run the homozygote search in both
      # directions, swapping the columns for the reversed call
      newAlMatch <- unique(rbind(findMatchesHomoz(depth1T, depth2T),
                                 findMatchesHomoz(depth2T, depth1T)[,2:1]))
      # add these new matches to the list of new alleles
      if(dim(unique(rbind(alMatch, newAlMatch)))[1] > dim(alMatch)[1]){
        progress <- TRUE
        alMatch <- unique(rbind(alMatch, newAlMatch))
      } else { # if that didn't work, try looking for alleles with equal depth
        # get rid of rows where multiple alleles have same depth
        # (equal depths would make the pairing ambiguous)
        keepInd <- apply(depth1T, 1, function(x) length(unique(x)) == length(x)) &
          apply(depth2T, 1, function(x) length(unique(x)) == length(x))
        depth1T <- depth1T[keepInd,, drop = FALSE]
        depth2T <- depth2T[keepInd,, drop = FALSE]
        # alleles to potentially match
        als1 <- which(colSums(depth1T) > 0)
        als2 <- which(colSums(depth2T) > 0)
        for(al1 in als1){
          for(al2 in als2){
            # individuals with equal read depth for these two alleles
            if(sum(depth1T[, al1] == depth2T[, al2] & depth1T[, al1] > 0) > 0){
              newAlMatch <- c(al1, al2)
              if(dim(unique(rbind(alMatch, newAlMatch)))[1] > dim(alMatch)[1]){
                alMatch <- rbind(alMatch, newAlMatch)
                progress <- TRUE
              }
            }
          }
        }
      }
      if(progress){
        # retry consolidation with the expanded allele match list
        cons <- consolidateDepth(depth1[nmInd,, drop = FALSE],
                                 depth2[nmInd,, drop = FALSE], alMatch)
        nm <- rowSums(cons) < rowSums(depth1[nmInd,, drop = FALSE])
        nmInd <- nmInd[nm]
        cons <- cons[nm,, drop = FALSE]
      }
    }
    return(alMatch)
  }
  # Function to consolidate read depth across two markers being merged into one.
  # depth1 and depth2 are read depth matrices for the two markers, respectively,
  # with individuals in rows and alleles in columns.
  # alMatch is a matrix with two columns, with one row for each new allele. The
  # values indicate the corresponding alleles for the first and second markers.
  consolidateDepth <- function(depth1, depth2, alMatch){
    # depth to output for the new marker
    newdepth <- matrix(0L, nrow = dim(depth1)[1],
                       ncol = dim(alMatch)[1])
    # boolean to figure out if an allele has been processed yet
    allelesDone <- rep(FALSE, dim(alMatch)[1])
    # whether progress has been made; guards against an infinite loop when
    # some alleles cannot be resolved unambiguously
    progress <- TRUE
    while(!all(allelesDone) && progress){
      progress <- FALSE
      # loop through alleles for the first marker
      for(al1 in 1:dim(depth1)[2]){
        # which new alleles match this original allele
        newAl <- which(alMatch[,1] == al1 & !allelesDone)
        # skip for now if this allele doesn't correspond to exactly one new, unprocessed allele
        if(length(newAl) != 1) next
        # find individuals with this allele
        indWithAl <- which(depth1[,al1] > 0)
        # add depth to output matrix
        newdepth[indWithAl, newAl] <- depth1[indWithAl, al1]
        # remove these counts from original matrices
        al2 <- alMatch[newAl, 2]
        depth2[indWithAl, al2] <- depth2[indWithAl, al2] - depth1[indWithAl, al1]
        depth1[indWithAl, al1] <- 0L
        # adjust new depth if depth2 was lower than depth1
        d2neg <- depth2[indWithAl, al2] < 0
        newdepth[indWithAl[d2neg], newAl] <- newdepth[indWithAl[d2neg], newAl] +
          depth2[indWithAl[d2neg], al2]
        depth2[indWithAl[d2neg], al2] <- 0L
        # mark allele as done
        allelesDone[newAl] <- TRUE
        progress <- TRUE
      }
      # do the same for the second marker
      for(al2 in 1:dim(depth2)[2]){
        newAl <- which(alMatch[,2] == al2 & !allelesDone)
        if(length(newAl) != 1) next
        indWithAl <- which(depth2[,al2] > 0)
        newdepth[indWithAl, newAl] <- depth2[indWithAl, al2]
        al1 <- alMatch[newAl, 1]
        depth1[indWithAl, al1] <- depth1[indWithAl, al1] - depth2[indWithAl, al2]
        depth2[indWithAl, al2] <- 0L
        d1neg <- depth1[indWithAl, al1] < 0
        newdepth[indWithAl[d1neg], newAl] <- newdepth[indWithAl[d1neg], newAl] +
          depth1[indWithAl[d1neg], al1]
        depth1[indWithAl[d1neg], al1] <- 0L
        allelesDone[newAl] <- TRUE
        progress <- TRUE
      }
    }
    # Find any individuals with reads remaining to assign.
    # This should only come up if there is homoplasy or recombination within
    # tags, e.g. alleles AC, AD, BC, and BD all exist.
    indRemaining <- which(rowSums(depth1) > 0 & rowSums(depth2) > 0)
    alRemaining <- which(!allelesDone) # alleles that still need to be assigned
    for(ind in indRemaining){
      als1 <- which(depth1[ind,] > 0) # alleles for marker 1
      als2 <- which(depth2[ind,] > 0) # alleles for marker 2
      # new alleles that are possible matches
      possAl <- alRemaining[alMatch[alRemaining, 1] %in% als1 &
                              alMatch[alRemaining, 2] %in% als2]
      progress <- TRUE
      while(any(depth1[ind,] > 0) && any(depth2[ind,] > 0) && progress){
        progress <- FALSE
        for(al1 in als1){
          if(depth1[ind, al1] <= 0) next
          newAl <- possAl[alMatch[possAl, 1] == al1] # number for new allele(s)
          if(length(newAl) == 1){
            # Resolve cases where only one allele is possible based on
            # presence/absence.
            al2 <- alMatch[newAl, 2]
          } else {
            # Try to find a match based on depth
            al2 <- which(depth2[ind,] == depth1[ind, al1])
            if(length(al2) != 1) next
            newAl <- newAl[alMatch[newAl, 2] == al2]
            if(length(newAl) != 1) next
          }
          newCount <- min(c(depth1[ind, al1],
                            depth2[ind, al2]))
          newdepth[ind, newAl] <- newCount
          depth1[ind, al1] <- depth1[ind, al1] - newCount
          depth2[ind, al2] <- depth2[ind, al2] - newCount
          possAl <- possAl[possAl != newAl]
          progress <- TRUE
        }
        for(al2 in als2){ # same procedure with the second marker
          if(depth2[ind, al2] <= 0) next
          newAl <- possAl[alMatch[possAl, 2] == al2]
          if(length(newAl) == 1){
            al1 <- alMatch[newAl, 1]
          } else {
            al1 <- which(depth1[ind,] == depth2[ind, al2])
            if(length(al1) != 1) next
            newAl <- newAl[alMatch[newAl, 1] == al1]
            if(length(newAl) != 1) next
          }
          newCount <- min(c(depth1[ind, al1],
                            depth2[ind, al2]))
          newdepth[ind, newAl] <- newCount
          depth1[ind, al1] <- depth1[ind, al1] - newCount
          depth2[ind, al2] <- depth2[ind, al2] - newCount
          possAl <- possAl[possAl != newAl]
          progress <- TRUE
        }
      } # end of while loop for unresolved reads
    } # end of loop through unresolved individuals
    return(newdepth)
  } # end of consolidateDepth internal function
  ### End internal functions for consolidateSNP ####
  # loop through chromosomes
  for(chrset in rowsByChr){
    thisChrom <- locTable$Chr[chrset[1]] # current chromosome name
    if(!is.null(refgenome)){ # matching chromosome name in ref. genome
      if(thisChrom %in% FAchrnames){
        thisFAchr <- thisChrom
      } else {
        # e.g. locTable "1" may correspond to "Chr1" or "chromosome1" in FASTA
        thisFAchr <- grep(paste("(Chr|chr|CHR)(omosome|OMOSOME)?", thisChrom, sep = ""),
                          FAchrnames, value = TRUE)
      }
      if(length(thisFAchr) == 0){
        warning(paste("Couldn't match", thisChrom, "to reference genome."))
      }
      if(length(thisFAchr) > 1){
        stop(paste(thisChrom, "matches multiple sequence names in reference genome."))
      }
    }
    message(paste("Phasing", length(chrset), "SNPs on chromosome", thisChrom))
    if(!identical(locTable$Pos[chrset], sort(locTable$Pos[chrset]))){
      stop(paste(thisChrom, ": Loci must be sorted by position.", sep = ""))
    }
    currLocIn <- 1 # current locus number index WITHIN chrset (this chromosome)
    # data for this locus
    lastDepth <- alleleDepth[, alleles2loc == chrset[1], drop = FALSE]
    lastName <- row.names(locTable)[chrset[1]]
    lastPos <- locTable$Pos[chrset[1]]
    lastSeq <- alleleNucleotides[alleles2loc == chrset[1]]
    lastRef <- locTable$Ref[chrset[1]]
    # loop through loci on this chromosome; the loop runs one index past the
    # end so the final "last" locus gets flushed to the output
    for(currLocIn in 2:(length(chrset)+1)){
      if(currLocIn <= length(chrset)){
        # data for next locus
        thisDepth <- alleleDepth[, alleles2loc == chrset[currLocIn], drop = FALSE]
        thisName <- row.names(locTable)[chrset[currLocIn]]
        thisPos <- locTable$Pos[chrset[currLocIn]]
        thisSeq <- alleleNucleotides[alleles2loc == chrset[currLocIn]]
        thisRef <- locTable$Ref[chrset[currLocIn]]
        # get proportion difference in depth between these two loci
        # (note: `diff` masks base::diff within this loop)
        diff <- sum(abs(rowSums(thisDepth) - rowSums(lastDepth))) /
          ((sum(thisDepth) + sum(lastDepth))/2)
      }
      if(length(chrset) == 1 || diff > tol || thisPos - lastPos + 1 > tagsize
         || currLocIn > length(chrset)){
        ## If these are different loci (either due to counts or distance)
        ## put the "last" data into the output, and make the new data the last.
        thisNAl <- dim(lastDepth)[2] # number of alleles for locus to output
        thisAlOut <- (1:thisNAl) + currAlOut - 1L # indices for all alleles to output
        # add data to output objects
        alleleDepthOut[,thisAlOut] <- lastDepth
        dimnames(alleleDepthOut)[[2]][thisAlOut] <- dimnames(lastDepth)[[2]]
        alleles2locOut[thisAlOut] <- currLocOut
        alleleNucleotidesOut[thisAlOut] <- lastSeq
        row.names(locTableOut)[currLocOut] <- lastName
        locTableOut$Chr[currLocOut] <- thisChrom
        locTableOut$Pos[currLocOut] <- lastPos
        locTableOut$Ref[currLocOut] <- lastRef
        if(length(chrset) > 1){
          # shift "this" locus to "last" locus
          lastDepth <- thisDepth
          lastName <- thisName
          lastPos <- thisPos
          lastSeq <- thisSeq
          lastRef <- thisRef
        }
        # increment current allele and locus
        currAlOut <- currAlOut + thisNAl
        currLocOut <- currLocOut + 1L
      } else {
        ## If these are the same locus, merge them.
        # matrix to indicate how alleles match up
        alMatch <- matrix(NA_integer_, nrow = 0, ncol = 2,
                          dimnames = list(NULL, c("last", "this")))
        # find individuals in "last" matrix that only have one allele
        alMatch <- rbind(alMatch, findMatchesHomoz(lastDepth, thisDepth))
        # find individuals in "this" matrix that only have one allele
        alMatch <- rbind(alMatch, findMatchesHomoz(thisDepth, lastDepth)[,2:1])
        # reduce to unique set of matches
        alMatch <- unique(alMatch, MARGIN = 1)
        # match any remaining alleles that don't have "homozygotes"
        alMatch <- findRemainingMatches(lastDepth, thisDepth, alMatch)
        # if matching didn't work, skip adding this SNP
        if(dim(alMatch)[1] < dim(lastDepth)[2]) next
        # make new set of haplotype sequences
        # NOTE(review): width() is applied to a plain character vector here;
        # presumably dispatching to the IRanges method for character
        # (equivalent to nchar) -- confirm.
        startPosFromReference <- lastPos + SummarizedExperiment::width(lastSeq[1])
        endPosFromReference <- thisPos - 1
        if(endPosFromReference >= startPosFromReference &&
           !is.null(refgenome) && length(thisFAchr) == 1){
          # retrieve reference sequence and paste onto end of lastSeq
          nonvarSeq <- Biostrings::getSeq(refgenome,
                                          GenomicRanges::GRanges(seqnames = thisFAchr,
                                                                 IRanges::IRanges(startPosFromReference,
                                                                                  endPosFromReference),
                                                                 strand = "+"))[[1]]
          lastSeq <- paste(lastSeq, nonvarSeq, sep = "")
          lastRef <- paste(lastRef, nonvarSeq, sep = "")
        }
        lastSeq <- paste(lastSeq[alMatch[,1]], thisSeq[alMatch[,2]], sep = "")
        lastRef <- paste(lastRef, thisRef, sep = "")
        # make new depth matrix
        # print(c(lastName, thisName)) # debug
        lastDepth <- consolidateDepth(lastDepth, thisDepth, alMatch)
        dimnames(lastDepth)[[2]] <- paste(lastName, lastSeq, sep = "_")
      }
    } # end of loop through loci
  } # end of loop through chromosomes
  # trim output to remove columns not used.
  alleleDepthOut <- alleleDepthOut[, 1:(currAlOut - 1)]
  alleles2locOut <- alleles2locOut[1:(currAlOut - 1)]
  alleleNucleotidesOut <- alleleNucleotidesOut[1:(currAlOut - 1)]
  locTableOut <- locTableOut[1:(currLocOut - 1),]
  # return consolidated data as a plain list (not a RADdata object)
  return(list(alleleDepth = alleleDepthOut, alleles2loc = alleles2locOut,
              alleleNucleotides = alleleNucleotidesOut,
              locTable = locTableOut))
}
# Function to make a function for filtering a VCF file.
# Assumes TASSEL GBSv2 format. (Diploid GT is listed first for each genotype.)
# The output function is used in the prefilters argument for
# VariantAnnotation::filterVcf.
# Build a line-based prefilter function for VariantAnnotation::filterVcf.
# Assumes TASSEL GBSv2 VCF format, where the diploid GT subfield is listed
# first within each genotype entry.
#
# min.ind.with.reads: minimum number of individuals with a called genotype.
# min.ind.with.minor.allele: minimum number of individuals carrying the
#   reference allele AND minimum number carrying an alternative allele.
# Returns a function taking a character vector of raw VCF lines and
# returning a logical vector of lines to keep.
MakeTasselVcfFilter <- function(min.ind.with.reads = 200,
                                min.ind.with.minor.allele = 10){
  # count occurrences of a regex in each element of a character vector;
  # gregexpr returns -1 as the single "match" position when there are none
  countHits <- function(pattern, txt){
    vapply(gregexpr(pattern, txt),
           function(m){
             if(m[1] == -1){
               0L
             } else {
               length(m)
             }
           },
           integer(1))
  }
  function(lines){
    # individuals with a diploid genotype call (i.e. with reads) per line
    nWithReads <- countHits("[[:blank:]][[:digit:]]/[[:digit:]]:", lines)
    keep <- nWithReads >= min.ind.with.reads
    if(min.ind.with.minor.allele > 0){
      # among lines passing the read-count filter, count individuals
      # carrying the reference allele (a 0 in the GT field)...
      nWithRef <- countHits("[[:blank:]](0/[[:digit:]]|[[:digit:]]/0):",
                            lines[keep])
      # ...and individuals carrying an alternative allele (1-3 in GT)
      nWithAlt <- countHits("[[:blank:]]([123]/[[:digit:]]|[[:digit:]]/[123]):",
                            lines[keep])
      keep[keep] <- nWithRef >= min.ind.with.minor.allele &
        nWithAlt >= min.ind.with.minor.allele
    }
    return(keep)
  }
}
# Function to import data from a VCF file.
# Reads in VCF using BioConductor, then phases SNPs into haplotypes using
# consolidateSNPs.
# file can be a character string or a TabixFile.
# samples is a character vector of the names of samples to retain.
# svparam can generally be left as-is unless you have additional columns to
# import to locTable (from fixed or info) or if you have specific genomic
# ranges to import with "which".
# expectedAlleles and expectedLoci are for after SNP phasing and
# consolidation to haplotypes.
VCF2RADdata <- function(file, phaseSNPs = TRUE, tagsize = 80, refgenome = NULL,
tol = 0.01, al.depth.field = "AD",
min.ind.with.reads = 200,
min.ind.with.minor.allele = 10,
possiblePloidies = list(2), taxaPloidy = 2L,
contamRate = 0.001,
samples = VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(file)),
svparam = VariantAnnotation::ScanVcfParam(fixed = "ALT",
info = NA,
geno = al.depth.field,
samples = samples),
yieldSize = 5000, expectedAlleles = 5e5,
expectedLoci = 1e5, maxLoci = NA){
# clean up parameters for import
if(is.na(VariantAnnotation::vcfFixed(svparam))){
VariantAnnotation::vcfFixed(svparam) <- "ALT"
}
if(length(VariantAnnotation::vcfFixed(svparam)) > 0 &&
!"ALT" %in% VariantAnnotation::vcfFixed(svparam)){
VariantAnnotation::vcfFixed(svparam) <-
c(VariantAnnotation::vcfFixed(svparam), "ALT")
}
if(length(VariantAnnotation::vcfGeno(svparam)) > 0 &&
is.na(VariantAnnotation::vcfGeno(svparam)[1])){
stop("geno field must be provided in svparam.")
}
if(!identical(VariantAnnotation::vcfGeno(svparam), "AD")){
warning("Allele depth field not set to AD.")
}
if(!is.na(VariantAnnotation::vcfSamples(svparam)[1])){
samples <- VariantAnnotation::vcfSamples(svparam)
}
if(!all(samples %in% VariantAnnotation::samples(VariantAnnotation::scanVcfHeader(file)))){
stop("Not all samples given in argument found in file.")
}
if(length(samples) < min.ind.with.reads){
stop("Need to adjust min.ind.with.reads for number of samples in dataset.")
}
if(length(samples)/2 < min.ind.with.minor.allele){
stop("Need to adjust min.ind.with.minor.allele for number of samples in dataset.")
}
# determine what extra columns to add to locTable
extracols <- c(VariantAnnotation::vcfFixed(svparam),
VariantAnnotation::vcfInfo(svparam))
hdr <- VariantAnnotation::scanVcfHeader(file)
if(length(VariantAnnotation::vcfFixed(svparam)) == 0){
extracols <- c("QUAL", "FILTER", extracols)
}
if(length(VariantAnnotation::vcfInfo(svparam)) == 0){
extracols <- c(extracols, row.names(VariantAnnotation::info(hdr)))
}
extracols <- extracols[!extracols %in% c("CHROM", "POS", "ID", "REF", "ALT")]
extracols <- extracols[!is.na(extracols)]
# preallocate objects for constructing RADdata object
alleleDepth <- matrix(0L, nrow = length(samples), ncol = expectedAlleles,
dimnames = list(samples, 1:expectedAlleles))
locTable <- data.frame(row.names = as.character(1:expectedLoci),
Chr = character(expectedLoci),
Pos = integer(expectedLoci),
Ref = character(expectedLoci),
matrix(nrow = expectedLoci, ncol = length(extracols),
dimnames = list(NULL, extracols)),
stringsAsFactors = FALSE)
alleles2loc <- integer(expectedAlleles)
alleleNucleotides <- character(expectedAlleles)
# to track which allele we're on
currAl <- 0L
currLoc <- 0L
# create Tabix file
if(is.character(file) && !endsWith(file, ".bgz")){
if(file.exists(paste(file, ".bgz", sep = ""))){
file <- paste(file, ".bgz", sep = "")
} else {
message("Compressing file with bgzip.")
tempbgz <- tempfile(fileext = ".bgz") # temporary file for example
file <- Rsamtools::bgzip(file, dest = tempbgz)
message(paste("Compressed VCF sent to", file))
}
}
if(is.character(file) && file.exists(paste0(file, ".csi"))){
idx <- paste0(file, ".csi")
} else {
idx <- paste0(file, ".tbi")
}
if(is.character(file) && !file.exists(idx)){
message("Indexing VCF.")
Rsamtools::indexTabix(file, format = "vcf")
}
tfile <- Rsamtools::TabixFile(file, index = idx, yieldSize = yieldSize)
# set up genome argument if needed
genome <- VariantAnnotation::seqinfo(hdr)
if(length(genome) == 0 && length(VariantAnnotation::vcfWhich(svparam)) > 0){
genome <- unique(names(VariantAnnotation::vcfWhich(svparam)))
genome <- GenomeInfoDb::Seqinfo(seqnames = genome)
}
# Read data one chunk at a time
open(tfile)
message("Reading file...")
while(nrow(vcf <- VariantAnnotation::readVcf(tfile, genome = genome,
param = svparam))){
thisNloc <- nrow(vcf) # number of loci in this chunk
message("Unpacking data from VCF...")
# reference alleles
thisRef <- as.character(VariantAnnotation::ref(vcf))
# alternative alleles; clean out ones w/o alt allele
thisAltList <- lapply(VariantAnnotation::alt(vcf), function(x){
char <- as.character(x)
char <- char[char != ""]
return(char)})
nAlt <- sapply(thisAltList, length) # n alt alleles per locus
thisNallele <- thisNloc + sum(nAlt) # n alleles in this chunk
thisAlt <- unlist(thisAltList)
# put reference and alternative alleles together into alleleNucleotides
thisAlleleNucleotides <- character(thisNallele)
alsums <- cumsum(nAlt + 1)
refpos <- c(1, alsums[-thisNloc] + 1)
thisAlleleNucleotides[refpos] <- thisRef
thisAlleleNucleotides[-refpos] <- thisAlt
# set up alleles2loc
thisAlleles2loc <- rep(1:thisNloc, times = nAlt + 1)
# set up locTable
thisLocTable <- data.frame(row.names = make.unique(row.names(vcf)),
Chr = as.character(SummarizedExperiment::seqnames(vcf)),
Pos = Biostrings::start(vcf),
Ref = thisRef,
S4Vectors::mcols(vcf)[,extracols],
stringsAsFactors = FALSE)
# set up depth matrix
thisDepthVal <- unlist(VariantAnnotation::geno(vcf)[[al.depth.field]])
thisAlNames <- paste(row.names(vcf)[thisAlleles2loc],
thisAlleleNucleotides, sep = "_")
if(length(thisDepthVal) == length(samples) * thisNallele){
thisAlDepth <- matrix(thisDepthVal,
nrow = length(samples), ncol = thisNallele,
dimnames = list(samples, thisAlNames),
byrow = TRUE)
} else {
if(length(thisDepthVal) %% length(samples) != 0){
stop("Unexpected number of allele depth fields.")
}
# For issue seen with GATK where there may be more AD values than alleles
thisAlDepth <- matrix(thisDepthVal, nrow = length(samples),
ncol = length(thisDepthVal) %/% length(samples),
dimnames = list(samples, NULL),
byrow = TRUE)
thisNADfield <- sapply(VariantAnnotation::geno(vcf)[[al.depth.field]][,1],
length)
ADmismatchLoc <- which(thisNADfield != nAlt + 1)
# Trim down to right number of alleles, and put in zero reads so locus
# will be discarded.
remal <- integer(0)
for(L in ADmismatchLoc){
firstal <- ifelse(L == 1, 1L, sum(thisNADfield[1:(L - 1)]) + 1)
lastal <- sum(thisNADfield[1:L])
thisAlDepth[, firstal:lastal] <- 0L
remal <- c(remal, (firstal:lastal)[-(1:(nAlt[L] + 1))])
}
thisAlDepth <- thisAlDepth[,-remal]
colnames(thisAlDepth) <- thisAlNames
}
# replace NA with zero
thisAlDepth[is.na(thisAlDepth)] <- 0L
# how many individuals have each allele
indperal <- colSums(thisAlDepth > 0)
# filter loci
message("Filtering markers...")
indperloc <- rowSums(rowsum(t(thisAlDepth), thisAlleles2loc) > 0)
keepLoc <- indperloc >= min.ind.with.reads &
tapply(indperal, thisAlleles2loc,
function(x){
sum(x >= min.ind.with.minor.allele) >= 2
})
# get rid of funny stuff from TASSEL-GBSv2
locWithN <- unique(thisAlleles2loc[grep("N", thisAlleleNucleotides)])
keepLoc[locWithN] <- FALSE
# list alleles to remove
remAl <- which(!thisAlleles2loc %in% which(keepLoc))
# remove cut alleles from allele objects
if(length(remAl) > 0){
thisAlleleNucleotides <- thisAlleleNucleotides[-remAl]
thisAlDepth <- thisAlDepth[, -remAl]
thisAlleles2loc <- thisAlleles2loc[-remAl]
}
# update locTable to reflect cut loci
thisLocTable <- thisLocTable[keepLoc,]
# update locus numbers
thisNloc <- sum(keepLoc)
if(thisNloc > 0){
thisAlleles2loc <- rep(1:thisNloc, times = table(thisAlleles2loc))
} else {
thisAlleles2loc <- integer(0)
}
thisNallele <- length(thisAlleles2loc)
# pad out indels, using VCF convention of listing nucleotide before indel
indelLoc <- which(tapply(nchar(thisAlleleNucleotides), thisAlleles2loc,
function(x) length(unique(x)) > 1))
for(i in indelLoc){
thiscol <- which(thisAlleles2loc == i)
thisAN <- thisAlleleNucleotides[thiscol]
maxWidth <- max(nchar(thisAN))
for(a in 1:length(thisAN)){
while(nchar(thisAN[a]) < maxWidth){
thisAN[a] <- paste(thisAN[a], "-", sep = "")
}
}
thisAlleleNucleotides[thiscol] <- thisAN
}
# group SNPs into tags
if(phaseSNPs && thisNloc > 0){
# add the last marker to the current set if appropriate
# (i.e. if the break between file chunks may have been within a tag)
if(currLoc > 0 && thisLocTable$Chr[1] == locTable$Chr[currLoc] &&
thisLocTable$Pos[1] - locTable$Pos[currLoc] < tagsize){
thisLocTable <- rbind(locTable[currLoc,], thisLocTable)
oldAlCol <- which(alleles2loc == currLoc)
thisAlleleNucleotides <- c(alleleNucleotides[oldAlCol],
thisAlleleNucleotides)
thisAlleles2loc <- c(rep(1, length(oldAlCol)),
thisAlleles2loc + 1)
thisAlDepth <- cbind(alleleDepth[,oldAlCol],
thisAlDepth)
currLoc <- currLoc - 1L
currAl <- currAl - length(oldAlCol)
}
# perform grouping + phasing
consTags <- consolidateSNPs(thisAlDepth, thisAlleles2loc, thisLocTable,
thisAlleleNucleotides, tagsize = tagsize,
refgenome = refgenome, tol = tol)
thisAlDepth <- consTags$alleleDepth
thisAlleles2loc <- consTags$alleles2loc
thisAlleleNucleotides <- consTags$alleleNucleotides
thisLocTable <- consTags$locTable
thisNloc <- dim(thisLocTable)[1]
thisNallele <- length(thisAlleles2loc)
}
if(thisNloc > 0){
# add data from this chunk to objects for whole dataset
thisAlCol <- (1:thisNallele) + currAl
if(currAl + thisNallele > ncol(alleleDepth)){
message("Exceeded expected number of alleles; processing may slow.")
if(currAl == 0){
alleleDepth <- thisAlDepth
} else {
alleleDepth <- cbind(alleleDepth[,1:currAl], thisAlDepth)
}
} else {
alleleDepth[,thisAlCol] <- thisAlDepth
}
dimnames(alleleDepth)[[2]][thisAlCol] <- dimnames(thisAlDepth)[[2]]
alleles2loc[thisAlCol] <- thisAlleles2loc + currLoc
alleleNucleotides[thisAlCol] <- thisAlleleNucleotides
if(currLoc + thisNloc > nrow(locTable)){
message("Exceeded expected number of loci; processing may slow.")
if(currLoc == 0){
locTable <- thisLocTable
} else {
locTable <- rbind(locTable[1:currLoc,], thisLocTable)
}
} else {
locTable[(1:thisNloc) + currLoc, ] <- thisLocTable
}
row.names(locTable)[(1:thisNloc) + currLoc] <- row.names(thisLocTable)
# update position in the output
currAl <- currAl + thisNallele
currLoc <- currLoc + thisNloc
}
# don't loop if we are using "which" in svparam
if(is.na(yieldSize)) break
# quit if we have a limit on how many loci to import.
if(!is.na(maxLoci) && currLoc >= maxLoci) break
message("Reading file...")
}
close(tfile)
if(currAl == 0 || currLoc == 0){
stop("No loci passed the missing data and allele frequency thresholds.")
}
message(paste(currLoc, "loci imported."))
# trim output
alleleDepth <- alleleDepth[, 1:currAl]
alleles2loc <- alleles2loc[1:currAl]
alleleNucleotides <- alleleNucleotides[1:currAl]
locTable <- locTable[1:currLoc, ]
# build RADdata object
message("Building RADdata object...")
radout <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
contamRate, alleleNucleotides, taxaPloidy)
message("Merging rare haplotypes...")
radout <- MergeRareHaplotypes(radout,
min.ind.with.haplotype = min.ind.with.minor.allele)
radout <- MergeIdenticalHaplotypes(radout)
radout <- RemoveMonomorphicLoci(radout)
# indicate whether non-variable sites included in alleleNucleotides
attr(radout$alleleNucleotides, "Variable_sites_only") <- is.null(refgenome) && phaseSNPs
return(radout)
}
# Import allelic read depth from Stacks output (version 1 or 2) and build a
# RADdata object.
#
# allelesFile: path to the catalog alleles file (catalog.alleles.tsv).
# matchesFolder: folder containing one .matches.tsv(.gz) file per sample.
# version: Stacks major version, 1 or 2; determines column layout of the
#   catalog, matches, tags, and sumstats files.
# readAlignmentData: if TRUE, also read chromosome/position/strand (from the
#   catalog tags file for v1, or from sumstatsFile for v2).
readStacks <- function(allelesFile, matchesFolder, version = 2,
                       min.ind.with.reads = 200,
                       min.ind.with.minor.allele = 10,
                       readAlignmentData = FALSE,
                       sumstatsFile = "populations.sumstats.tsv",
                       possiblePloidies = list(2), taxaPloidy = 2L,
                       contamRate = 0.001){
  # get columns depending on version number
  if(!version %in% c(1,2)){
    stop("Version must be equal to 1 or 2.")
  }
  if(version == 2){
    # for catalog alleles file
    hapcol <- 3
    loccol <- 2
    afcols <- list(NULL, integer(0), character(0), NULL, NULL)
    # for matches file
    mloccol <- 1
    msamcol <- 2
    mhapcol <- 4
    mdepcol <- 5
    mfcols <- list(integer(0), integer(0), NULL, character(0), integer(0),
                   NULL)
    # for sumstats file
    sloccol <- 1
    schrcol <- 2
    sposcol <- 3
    sfcols <- list(integer(0), character(0), integer(0), NULL, NULL, NULL,
                   NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                   NULL, NULL, NULL, NULL, NULL, NULL)
  } else {
    # for catalog alleles file
    hapcol <- 4
    loccol <- 3
    afcols <- list(NULL, NULL, integer(0), character(0), NULL, NULL)
    # for matches file
    mloccol <- 3
    msamcol <- 4
    mhapcol <- 6
    mdepcol <- 7
    mfcols <- list(NULL, NULL, integer(0), integer(0), NULL, character(0),
                   integer(0), NULL)
    # for catalog tags file
    tloccol <- 3
    tchrcol <- 4
    tposcol <- 5
    tstrcol <- 6
    tfcols <- list(NULL, NULL, integer(0), character(0), integer(0),
                   character(0), NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                   NULL)
  }
  # read in catalog.alleles.tsv file
  af <- scan(allelesFile, what = afcols,
             sep = "\t", comment.char = "#", na.strings = character(0))
  # get locus names (numbers) and haplotypes for variable sites
  keep <- af[[hapcol]] != ""
  locNames <- af[[loccol]][keep]
  alleleNucleotides <- af[[hapcol]][keep]
  attr(alleleNucleotides, "Variable_sites_only") <- TRUE
  alleleNames <- paste(locNames, alleleNucleotides, sep = "_")
  # get all of the sstacks files to read
  sstacksFiles <- list.files(matchesFolder, "\\.matches\\.tsv")
  if(length(sstacksFiles) == 0){
    stop("No .matches.tsv files found.")
  }
  # sample names are the file names minus the .matches.tsv(.gz) extension
  sampleNames <- sub("\\.matches\\.tsv(\\.gz)?$", "", sstacksFiles)
  sstacksFiles <- file.path(matchesFolder, sstacksFiles)
  if(length(sampleNames) < min.ind.with.reads){
    stop("Need to adjust min.ind.with.reads for number of samples in dataset.")
  }
  if(length(sampleNames)/2 < min.ind.with.minor.allele){
    stop("Need to adjust min.ind.with.minor.allele for number of samples in dataset.")
  }
  # set up allele depth matrix
  alleleDepth <- matrix(0L, nrow = length(sampleNames),
                        ncol = length(alleleNames),
                        dimnames = list(sampleNames, alleleNames))
  # vector for reordering samples according to numbers in matches files
  reorder <- integer(length(sampleNames))
  # read sstacks files, one per sample
  for(i in 1:length(sampleNames)){
    mf <- scan(sstacksFiles[i],
               what = mfcols, sep = "\t", comment.char = "#",
               na.strings = character(0))
    # "consensus" rows carry no haplotype information; skip them
    keep <- mf[[mhapcol]] != "consensus"
    theseLocNames <- mf[[mloccol]][keep]
    theseAlNuc <- mf[[mhapcol]][keep]
    theseDepth <- mf[[mdepcol]][keep]
    theseAlNames <- paste(theseLocNames, theseAlNuc, sep = "_")
    alleleDepth[i, theseAlNames] <- theseDepth
    # record the internal sample number so rows can be put in Stacks' order
    reorder[mf[[msamcol]][1]] <- i
  }
  # eliminate any sample numbers that were skipped, and reorder matrix
  reorder <- reorder[!is.na(reorder) & reorder > 0]
  alleleDepth <- alleleDepth[reorder,]
  # filter loci: require enough individuals with reads, and enough
  # individuals carrying a non-majority allele
  indperal <- colSums(alleleDepth > 0)
  indperloc <- rowSums(rowsum(t(alleleDepth), locNames) > 0)
  locRemove <- which(indperloc < min.ind.with.reads |
                       tapply(indperal, locNames,
                              function(x){
                                x <- x[-which.max(x)]
                                sum(x) < min.ind.with.minor.allele
                              }))
  alRemove <- which(locNames %in% names(indperloc)[locRemove])
  # Subset objects.  Guard against alRemove being empty: in R,
  # x[, -integer(0)] selects ZERO columns, so unconditional negative
  # subsetting would wipe out the entire dataset when nothing fails filtering.
  if(length(alRemove) > 0){
    alleleDepth <- alleleDepth[, -alRemove]
    locNames <- locNames[-alRemove]
    alleleNucleotides <- alleleNucleotides[-alRemove]
    # subsetting drops attributes; restore the flag set above
    attr(alleleNucleotides, "Variable_sites_only") <- TRUE
    alleleNames <- alleleNames[-alRemove]
  }
  # Get chromosome and position
  uniqueLocNames <- unique(locNames)
  if(readAlignmentData){
    if(version == 1){
      # v1: alignment info lives in the catalog tags file alongside alleles
      tagFile <- sub("alleles", "tags", allelesFile)
      tf <- scan(tagFile, what = tfcols,
                 sep = "\t", comment.char = "#", na.strings = character(0))
      keeprows <- fastmatch::fmatch(uniqueLocNames, tf[[tloccol]])
      locTable <- data.frame(row.names = as.character(uniqueLocNames),
                             Chr = tf[[tchrcol]][keeprows],
                             Pos = tf[[tposcol]][keeprows],
                             Strand = tf[[tstrcol]][keeprows],
                             stringsAsFactors = FALSE)
    }
    if(version == 2){
      # v2: alignment info comes from the populations sumstats file
      sf <- scan(sumstatsFile, what = sfcols,
                 sep = "\t", comment.char = "#", na.strings = character(0))
      keeprows <- fastmatch::fmatch(uniqueLocNames, sf[[sloccol]])
      locTable <- data.frame(row.names = as.character(uniqueLocNames),
                             Chr = sf[[schrcol]][keeprows],
                             Pos = sf[[sposcol]][keeprows],
                             stringsAsFactors = FALSE)
    }
  } else {
    # no alignment data
    locTable <- data.frame(row.names = as.character(uniqueLocNames))
  }
  # match depth matrix columns to locus table
  alleles2loc <- fastmatch::fmatch(locNames, uniqueLocNames)
  # build RADdata object
  radout <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
                    contamRate, alleleNucleotides, taxaPloidy)
  message("Merging rare haplotypes...")
  radout <- MergeRareHaplotypes(radout,
                                min.ind.with.haplotype = min.ind.with.minor.allele)
  radout <- MergeIdenticalHaplotypes(radout)
  radout <- RemoveMonomorphicLoci(radout)
  return(radout)
}
# Read in from the TASSEL GBSv2 database with minimal processing.
# Use counts matrix output by GetTagTaxaDistFromDBPlugin, plus SAM file
# giving the alignment of each tag.  Tags aligning to the same chromosome,
# position, strand, and with the same length are grouped into one marker.
readTASSELGBSv2 <- function(tagtaxadistFile, samFile, min.ind.with.reads = 200,
                            min.ind.with.minor.allele = 10,
                            possiblePloidies = list(2), taxaPloidy = 2L,
                            contamRate = 0.001, chromosomes = NULL){
  # read the SAM file, keeping only flag, chromosome, position, and sequence
  message("Reading SAM file...")
  samwhat <- list(NULL, 0L, "", 0L, NULL, NULL, NULL, NULL, NULL, "", NULL,
                  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
  samchunk <- scan(samFile, what = samwhat, comment.char = "@", sep = "\t",
                   fill = TRUE, quiet = TRUE)
  samflag <- samchunk[[2]]
  samchr <- samchunk[[3]]
  sampos <- samchunk[[4]]
  samseq <- samchunk[[10]]
  rm(samchunk)
  samind <- seq_along(samflag) # index for the tags; row number in TagTaxaDist
  # eliminate unaligned tags (SAM flag bit 0x4 = segment unmapped)
  aligned <- bitwAnd(samflag, 4) == 0
  samflag <- samflag[aligned]
  samchr <- samchr[aligned]
  sampos <- sampos[aligned]
  samseq <- samseq[aligned]
  samind <- samind[aligned]
  # subset to desired chromosomes
  if(!is.null(chromosomes)){
    chrnotfound <- chromosomes[!chromosomes %fin% samchr]
    if(length(chrnotfound) > 0){
      warning(paste("Chromosomes not found:", paste(chrnotfound, collapse = " ")))
    }
    chrmatch <- samchr %fin% chromosomes
    if(sum(chrmatch) == 0){
      stop("No chromosome names from SAM file match those provided.")
    }
    samflag <- samflag[chrmatch]
    samchr <- samchr[chrmatch]
    sampos <- sampos[chrmatch]
    samseq <- samseq[chrmatch]
    samind <- samind[chrmatch]
  }
  # get strand (SAM flag bit 0x10 = reverse complemented)
  samstrand <- ifelse(bitwAnd(samflag, 16) == 0, "top", "bot")
  # get tag sequence length
  samtlen <- nchar(samseq)
  # combine to make marker names
  sammrkr <- paste(samchr, sampos, samstrand, samtlen, sep = "-")
  # find all non-monomorphic markers (markers with more than one tag)
  mrkrTable <- table(sammrkr)
  polymrkr <- names(mrkrTable)[mrkrTable > 1]
  # subset to get only tags belonging to polymorphic markers
  polytags <- sammrkr %fin% polymrkr
  samflag <- samflag[polytags]
  samchr <- samchr[polytags]
  sampos <- sampos[polytags]
  samseq <- samseq[polytags]
  samind <- samind[polytags]
  samstrand <- samstrand[polytags]
  samtlen <- samtlen[polytags]
  sammrkr <- sammrkr[polytags]
  # build locTable, one row per marker, sorted by chromosome then position
  lookup <- fastmatch::fmatch(polymrkr, sammrkr)
  locTable <- data.frame(row.names = polymrkr, Chr = samchr[lookup],
                         Pos = sampos[lookup], Strand = samstrand[lookup],
                         SequenceLength = samtlen[lookup],
                         stringsAsFactors = FALSE)
  locTable <- locTable[order(locTable$Chr, locTable$Pos),]
  # open tagtaxadist file
  message("Reading TagTaxaDist file...")
  ttdcon <- file(tagtaxadistFile, open = 'rt')
  # get taxa names from the header row
  taxa <- scan(ttdcon, what = character(), sep = "\t", nlines = 1,
               quiet = TRUE)[-1]
  # set up matrix to contain read counts
  alleleDepth <- matrix(0L, nrow = length(taxa), ncol = length(sammrkr),
                        dimnames = list(taxa, paste(sammrkr, samseq, sep = "_")))
  # setup for reading file in a loop
  whatlist <- list(0L)[rep(1, length(taxa))]
  whatlist <- c(list(""), whatlist) ## might change "" to NULL if we don't care about seq
  chunksize <- 1e5
  # read first chunk
  dataIn <- scan(ttdcon, what = whatlist, sep = "\t", nlines = chunksize,
                 quiet = TRUE)
  nread <- length(dataIn[[1]]) # number of tags read
  lastTag <- 0                 # number of tags processed in earlier chunks
  # loop through file
  while(nread){
    # convert imported data to matrix; one row per taxon, one column per tag
    datamat <- matrix(unlist(dataIn[-1]), nrow = length(taxa), ncol = nread,
                      byrow = TRUE)
    # find the tags we will keep (those falling within this chunk)
    thesetags <- samind > lastTag & samind <= lastTag + nread
    # add to matrix
    alleleDepth[, thesetags] <- datamat[, samind[thesetags] - lastTag]
    # Advance the offset by the size of the chunk just processed, BEFORE
    # reading the next chunk.  (Previously lastTag was incremented by the
    # size of the NEXT chunk, which misaligned tag indices whenever the
    # final chunk was smaller than chunksize.)
    lastTag <- lastTag + nread
    # read next chunk
    dataIn <- scan(ttdcon, what = whatlist, sep = "\t", nlines = chunksize,
                   quiet = TRUE)
    nread <- length(dataIn[[1]]) # number of tags read
  }
  # close tagtaxadist file
  close(ttdcon)
  # filter loci by missing data rate and minor allele frequency
  message("Filtering loci...")
  lociToRemove <- integer(0)
  tagsToRemove <- integer(0)
  for(i in 1:nrow(locTable)){
    thesealleles <- which(sammrkr == row.names(locTable)[i])
    thesepres <- alleleDepth[,thesealleles] > 0 # presence or absence of alleles in individuals
    if(sum(rowSums(thesepres) > 0) < min.ind.with.reads ||
       sum(colSums(thesepres) >= min.ind.with.minor.allele) < 2){
      lociToRemove <- c(lociToRemove, i)
      tagsToRemove <- c(tagsToRemove, thesealleles)
    }
  }
  if(length(lociToRemove) > 0){
    locTable <- locTable[-lociToRemove,]
    alleleDepth <- alleleDepth[,-tagsToRemove]
    sammrkr <- sammrkr[-tagsToRemove]
    samseq <- samseq[-tagsToRemove]
  }
  if(nrow(locTable) == 0) stop("No loci passed filtering threshold.")
  # build vector to match alleles to loci
  alleles2loc <- fastmatch::fmatch(sammrkr, rownames(locTable))
  # sort alleles to keep things tidy
  tagorder <- order(alleles2loc)
  alleles2loc <- sort(alleles2loc)
  alleleDepth <- alleleDepth[,tagorder]
  samseq <- samseq[tagorder]
  # build RADdata object; full tag sequences serve as allele nucleotides
  message("Building RADdata object...")
  radout <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
                    contamRate, samseq, taxaPloidy)
  radout <- MergeRareHaplotypes(radout,
                                min.ind.with.haplotype = min.ind.with.minor.allele)
  radout <- MergeIdenticalHaplotypes(radout)
  radout <- RemoveMonomorphicLoci(radout)
  attr(radout$alleleNucleotides, "Variable_sites_only") <- FALSE
  return(radout)
}
# Function to read output of process_sam_multi.py, so the user can get a
# preliminary look at Hind/He distribution before going forward with
# the isolocus_splitter script.
#
# alignfile/depthfile: paired CSVs read in lockstep, one row per tag; the
#   align file lists candidate alignment positions (and NM values) for each
#   tag, the depth file lists per-sample read depths.
# expectedLoci/expectedAlleles: preallocation sizes; exceeding them is
#   handled but slower.  maxLoci caps how many loci are imported.
readProcessSamMulti <- function(alignfile, depthfile = sub("align", "depth", alignfile),
                                expectedLoci = 1000, min.ind.with.reads = 200,
                                min.ind.with.minor.allele = 10,
                                possiblePloidies = list(2), taxaPloidy = 2L,
                                contamRate = 0.001, expectedAlleles = expectedLoci * 15,
                                maxLoci = expectedLoci){
  # read file headers
  aligncon <- file(alignfile, open = 'r')
  depthcon <- file(depthfile, open = 'r')
  alignheader <- scan(aligncon, sep = ",", nlines = 1, what = character(),
                      quiet = TRUE)
  depthheader <- scan(depthcon, sep = ",", nlines = 1, what = character(),
                      quiet = TRUE)
  nalign <- (length(alignheader) - 1) / 4 # number of alignment positions reported
  samples <- depthheader[-1]
  nsam <- length(samples)
  # set up objects to go into RADdata object
  locTable <- data.frame(Chr = rep(NA_character_, expectedLoci),
                         Pos = rep(NA_integer_, expectedLoci),
                         stringsAsFactors = FALSE)
  locnames <- rep(NA_character_, expectedLoci)
  alleles2loc <- rep(NA_integer_, expectedAlleles)
  alleleNucleotides <- rep(NA_character_, expectedAlleles)
  alleleDepth <- matrix(0L, nrow = nsam, ncol = expectedAlleles,
                        dimnames = list(samples, NULL))
  allelenames <- rep(NA_character_, expectedAlleles)
  # count loci and alleles read in (passing any filtering)
  loccount <- 0L
  alcount <- 0L
  # read the files
  nscan <- 1e4 # number of lines to read at once
  whatlistalign <- list(character(), integer(), NULL)
  whatlistalign <- whatlistalign[c(rep(1, nalign + 1), rep(2, nalign), rep(3, 2 * nalign))]
  whatlistdepth <- list(character(), integer())
  whatlistdepth <- whatlistdepth[c(1, rep(2, nsam))]
  # dummy objects; will hold the last partial marker read
  lastdepthmat <- matrix(integer(0), nrow = 0, ncol = nsam)
  lastaligndata <- whatlistalign
  lastchunk <- FALSE # indicates if we have read the last chunk of the file
  while(loccount < maxLoci && !lastchunk){
    aligndata <- scan(aligncon, sep = ",", what = whatlistalign, nlines = nscan,
                      quiet = TRUE)
    depthdata <- scan(depthcon, sep = ",", what = whatlistdepth, nlines = nscan,
                      quiet = TRUE)
    nread <- length(aligndata[[1]])
    lastchunk <- nread == 0 # is this the last chunk of the file?
    stopifnot(nread == length(depthdata[[1]]))
    if(lastchunk){ # end of file; process whatever was carried over
      depthmat <- lastdepthmat
      aligndata <- lastaligndata
      if(nrow(depthmat) == 0) break
    } else {
      depthmat <- matrix(unlist(depthdata[-1]), nrow = nread, ncol = nsam)
      # merge in any potentially unfinished markers from last chunk
      depthmat <- rbind(lastdepthmat, depthmat)
      for(i in 1:(length(whatlistalign) - nalign)){
        aligndata[[i]] <- c(lastaligndata[[i]], aligndata[[i]])
      }
    }
    nread <- nrow(depthmat) # updated if any loci from last chunk added in
    if(nread < 2) break # quit if there's nothing left to process
    # convert number of mutations to matrix for easier processing
    NM <- matrix(unlist(aligndata[(1:nalign) + (1 + nalign)]),
                 nrow = nread, ncol = nalign)
    # find groups of alleles from the same set of alignment positions and process them
    theseal <- 1L
    for(i in 2:nread){
      samegroup <- all(sapply(1:nalign, function(x) aligndata[[x]][i] == aligndata[[x]][i-1]))
      if(samegroup){
        theseal <- c(theseal, i)
      }
      if(i == nread){
        lastaligndata <- lapply(aligndata, function(x) x[theseal])
        # drop = FALSE: a single-row carry-over must remain a matrix, or
        # nrow()/rbind() on the next iteration would fail
        lastdepthmat <- depthmat[theseal, , drop = FALSE]
      }
      if(!samegroup || (lastchunk && i == nread)) { # we have one complete group of alleles
        # divide up into loci based on number of mutations from reference
        thisNM <- NM[theseal,, drop = FALSE]
        thisNM <- thisNM[,!is.na(thisNM[1,]), drop = FALSE]
        hapAssign <- InitHapAssign(thisNM)
        hapAssign <- lapply(1:nalign, function(x) which(hapAssign == x))
        for(L in 1:nalign){
          # skip monomorphic loci or those with no alleles
          if(length(hapAssign[[L]]) < 2) next
          # filter by missing data rate
          thesealSub <- theseal[hapAssign[[L]]]
          if(sum(colSums(depthmat[thesealSub,]) > 0) < min.ind.with.reads) next
          # filter by minor allele frequency
          if(sum(rowSums(depthmat[thesealSub,] > 0) >= min.ind.with.minor.allele) < 2) next
          # add the locus in if it passed everything
          thislocname <- aligndata[[L]][thesealSub[1]]
          firstal <- alcount + 1L
          alcount <- alcount + length(thesealSub)
          if(thislocname %in% locnames){ # same locus may show up in multiple groups
            alleles2loc[firstal:alcount] <- match(thislocname, locnames)
          } else {
            loccount <- loccount + 1L
            locnames[loccount] <- thislocname
            # locus names look like "Chr-Pos"; split for locTable
            splitlocname <- strsplit(locnames[loccount], "-")[[1]]
            locTable$Chr[loccount] <- splitlocname[1]
            locTable$Pos[loccount] <- as.integer(splitlocname[2])
            alleles2loc[firstal:alcount] <- loccount
          }
          alleleNucleotides[firstal:alcount] <- aligndata[[nalign+1]][thesealSub]
          # Name alleles after THIS locus.  The original used
          # locnames[loccount], which named alleles after the most recently
          # registered locus whenever thislocname was already in locnames.
          allelenames[firstal:alcount] <-
            paste(thislocname, alleleNucleotides[firstal:alcount], sep = "_")
          if(alcount > ncol(alleleDepth)){
            # outgrew the preallocated matrix; rebuild by concatenation
            # (seq_len avoids 1:0 selecting column 1 when firstal == 1)
            alleleDepth <- alleleDepth[, seq_len(firstal - 1)]
            alleleDepth <- cbind(alleleDepth, t(depthmat[thesealSub,]))
          } else {
            alleleDepth[,firstal:alcount] <- t(depthmat[thesealSub,])
          }
          if(loccount >= maxLoci) break
        }
      }
      if(!samegroup){
        theseal <- i # start a new group at the current allele
      }
      if(loccount >= maxLoci) break
    }
  }
  # wrap-up and build RADdata object
  close(aligncon)
  close(depthcon)
  # trim preallocated objects down to what was actually filled
  locnames <- locnames[1:loccount]
  locTable <- locTable[1:loccount,]
  allelenames <- allelenames[1:alcount]
  alleleNucleotides <- alleleNucleotides[1:alcount]
  alleleDepth <- alleleDepth[,1:alcount]
  alleles2loc <- alleles2loc[1:alcount]
  # sort alleles by locus
  alleleorder <- order(alleles2loc)
  allelenames <- allelenames[alleleorder]
  alleleNucleotides <- alleleNucleotides[alleleorder]
  alleleDepth <- alleleDepth[,alleleorder]
  alleles2loc <- alleles2loc[alleleorder]
  rownames(locTable) <- locnames
  colnames(alleleDepth) <- allelenames
  out <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
                 contamRate, alleleNucleotides, taxaPloidy)
  # out <- MergeRareHaplotypes(out, min.ind.with.haplotype = min.ind.with.minor.allele)
  # out <- RemoveMonomorphicLoci(out)
  return(out)
}
# Import the sorted output of process_isoloci.py and build a RADdata object.
#
# sortedfile: CSV whose first five columns are locus metadata (locus name,
#   position, allele sequence, reference sequence, one unused column) and
#   whose remaining columns hold per-sample read depths.
# min.median.read.depth: loci whose median per-individual depth falls below
#   this are discarded.
# nameFromTagStart: if TRUE, keep locus names as found in the file
#   (presumably named from tag start positions -- TODO confirm); if FALSE,
#   rename loci as "Chr-Pos".
# mergeRareHap: if TRUE, merge rare haplotypes and drop monomorphic loci
#   after import.
readProcessIsoloci <- function(sortedfile, min.ind.with.reads = 200,
min.ind.with.minor.allele = 10,
min.median.read.depth = 10,
possiblePloidies = list(2), taxaPloidy = 2L,
contamRate = 0.001, nameFromTagStart = TRUE,
mergeRareHap = TRUE){
message("Reading file...")
incon <- file(sortedfile, open = "r")
# read header; first five fields are metadata, the rest are sample names
header <- scan(incon, sep = ",", nlines = 1, what = character(), quiet = TRUE)
samples <- header[-(1:5)]
nSam <- length(samples)
# column types: locus name, position, allele seq, reference seq, skipped
# column, then one integer depth column per sample
scanwhat <- list(character(), integer(), character(), character(), NULL, integer())
scanwhat <- scanwhat[c(1:5, rep(6, nSam))]
# read file
mydata <- scan(incon, sep = ",", what = scanwhat, quiet = TRUE)
close(incon)
nAl <- length(mydata[[1]]) # number of alleles (rows) in the file
# get depth matrix: one row per sample, one column per allele
alleleDepth <- matrix(unlist(mydata[(1:nSam) + 5]), nrow = nSam, ncol = nAl,
byrow = TRUE, dimnames = list(samples, NULL))
if(any(is.na(alleleDepth))){
stop("Missing data in depth matrix.")
}
mydata <- mydata[1:4] # free up space
# factor by locus, sorting locus names
alleles2loc_factor <- as.factor(mydata[[1]])
loci <- levels(alleles2loc_factor)
nLoc <- length(loci)
alleles2loc <- as.integer(alleles2loc_factor)
# perform filtering
message("Filtering and sorting loci...")
keeploc <- integer(0)
for(L in 1:nLoc){
# all depth columns belonging to locus L (L indexes factor levels)
submat <- alleleDepth[,alleles2loc == L, drop = FALSE]
depthperind <- rowSums(submat)
# keep the locus if enough individuals have reads, at least two alleles
# are present in enough individuals, and median depth is sufficient
if(sum(depthperind > 0) >= min.ind.with.reads &&
sum(colSums(submat > 0) >= min.ind.with.minor.allele) > 1 &&
median(depthperind) >= min.median.read.depth){
keeploc <- c(keeploc, L)
}
}
# drop alleles belonging to filtered-out loci and refactor assignments
keepal <- which(alleles2loc %fin% keeploc)
alleles2loc_factor <- droplevels(alleles2loc_factor[keepal])
loci <- levels(alleles2loc_factor)
alleleDepth <- alleleDepth[,keepal, drop = FALSE]
alleles2loc <- as.integer(alleles2loc_factor)
alleleNucleotides <- mydata[[3]][keepal]
# NOTE(review): refNucleotides is assigned but never used below
refNucleotides <- mydata[[4]][keepal]
colnames(alleleDepth) <- paste(alleles2loc_factor, alleleNucleotides,
sep = "_")
# sort by locus name (i.e. position and chromosome)
alorder <- order(alleles2loc)
alleleDepth <- alleleDepth[, alorder, drop = FALSE]
alleleNucleotides <- alleleNucleotides[alorder]
alleles2loc <- alleles2loc[alorder]
# build locTable
# chromosome = locus name up to the first "-"
chrom <- sub("\\-.*$", "", loci)
# first allele row for each locus supplies position and reference sequence
loc2al <- fastmatch::fmatch(loci, mydata[[1]])
pos <- mydata[[2]][loc2al]
ref <- mydata[[4]][loc2al]
if(!nameFromTagStart){
loci <- paste(chrom, pos, sep = "-")
}
locTable <- data.frame(row.names = loci,
Chr = chrom, Pos = pos,
Ref = ref,
stringsAsFactors = FALSE)
# build RADdata object
message("Building RADdata object...")
# allele sequences are full tags, not just variable sites
attr(alleleNucleotides, "Variable_sites_only") <- FALSE
radout <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
contamRate, alleleNucleotides, taxaPloidy)
radout <- MergeIdenticalHaplotypes(radout)
if(mergeRareHap){
radout <- MergeRareHaplotypes(radout,
min.ind.with.haplotype = min.ind.with.minor.allele)
radout <- MergeIdenticalHaplotypes(radout)
radout <- RemoveMonomorphicLoci(radout)
}
return(radout)
}
# Import read counts from a DArT Tag-seq ("DArTag") counts file and build a
# RADdata object.
#
# file: CSV of read counts; must contain AlleleID, CloneID, and
#   AlleleSequence columns, followed by one column of counts per sample.
# botloci: character vector of loci whose tags are on the bottom strand of
#   the reference genome (mutually exclusive with blastfile).
# blastfile: tabular BLAST results for the reference alleles, used to
#   determine strand and to discard loci lacking a valid alignment.
# excludeHaps / includeHaps: AlleleIDs to drop or keep (specify at most one).
# n.header.rows: number of extra header rows preceding the column headers.
# sample.name.row: which header row holds sample names; n.header.rows + 1
#   means the column-header row itself.
# trim.sample.names: regex trimmed from the end of sample names ("" = none).
readDArTag <- function(file, botloci = NULL, blastfile = NULL,
excludeHaps = NULL, includeHaps = NULL,
n.header.rows = 0, sample.name.row = 1,
trim.sample.names = "_[^_]+_[ABCDEFGH][[:digit:]][012]?$",
sep.counts = ",", sep.blast = "\t",
possiblePloidies = list(2), taxaPloidy = 2L,
contamRate = 0.001){
# argument sanity checks
if(!is.null(excludeHaps) && !is.null(includeHaps)){
stop("Only specify one of excludeHaps or includeHaps")
}
if(is.null(botloci) && is.null(blastfile)){
stop("Need to specify botloci or blastfile.")
}
if(!is.null(botloci) && !is.null(blastfile)){
stop("Only specify one of botloci or blastfile.")
}
message("Importing read counts...")
mycon <- file(file, open = 'r')
# consume extra header rows so read.table starts at the column headers
hdr <- readLines(mycon, n = n.header.rows)
tab <- read.table(mycon, header = TRUE, sep = sep.counts,
stringsAsFactors = FALSE)
close(mycon)
# determine number of leading (non-sample) columns
if(n.header.rows > 0){
hdr.split <- strsplit(hdr, split = ",")
# leading columns are those blank in every extra header row
n.lead.cols <-
unique(sapply(hdr.split, function(x) min(grep(".+", x)) - 1))
if(length(n.lead.cols) != 1){
stop("Not all sample headers start on same column. Be sure not to count the row starting with AlleleID towards n.header.rows.")
}
} else {
n.lead.cols <- sum(colnames(tab) %in% c("AlleleID", "CloneID", "AlleleSequence", "readCountSum"))
}
if(!all(c("AlleleID", "CloneID", "AlleleSequence") %in% colnames(tab))){
stop("Need AlleleID, CloneID, and AlleleSequence columns.")
}
# Get sample names
if(sample.name.row == n.header.rows + 1){
# sample names live in the column-header row
samples <- colnames(tab)[-seq_len(n.lead.cols)]
} else {
if(sample.name.row < 1 || sample.name.row > n.header.rows){
stop("sample.name.row should be within header rows or column headers.")
}
samples <- hdr.split[[sample.name.row]][-seq_len(n.lead.cols)]
}
if(anyDuplicated(samples)){
stop("Not all sample names are unique. Check sample.name.row.")
}
if(trim.sample.names != ""){
samplesOLD <- samples
samples <- sub(trim.sample.names, "", samples)
# if trimming creates duplicates, keep the untrimmed name for those
dups <- duplicated(samples)
if(any(dups)){
warning("Some sample names not trimmed, to avoid duplicates.")
samples[dups] <- samplesOLD[dups]
}
stopifnot(!anyDuplicated(samples))
}
# Filter haplotypes if needed
if(!is.null(includeHaps)){
if(!all(includeHaps %in% tab$AlleleID)){
stop("Not all haplotypes in includeHaps found in AlleleID.")
}
tab <- tab[tab$AlleleID %in% includeHaps,]
}
if(!is.null(excludeHaps)){
if(!all(excludeHaps %in% tab$AlleleID)){
warning("Not all haplotypes in excludeHaps found in AlleleID.")
}
tab <- tab[!tab$AlleleID %in% excludeHaps,]
}
if(anyDuplicated(tab$AlleleID)){
stop("Duplicate AlleleIDs found.")
}
# Build locTable
loci <- unique(tab$CloneID)
# reference allele IDs expected to look like "CloneID|Ref" or
# "CloneID|Ref_001".  NOTE(review): CloneID is interpolated into the regex
# unescaped; regex metacharacters in locus names would misbehave -- TODO
# confirm names are alphanumeric.
refals <- sapply(loci, function(L) grep(paste0("^", L, "\\|Ref(_[0]*1)?$"), tab$AlleleID, value = TRUE))
if(is.list(refals) || is.matrix(refals)){
stop("More than one reference allele found per locus.")
}
# locus names assumed to end in "_Pos" with chromosome before the final "_"
locTable <- data.frame(row.names = loci,
Chr = sub("_[[:digit:]]+$", "", loci),
Pos = as.integer(sub("^.+_", "", loci)))
# Trim allele tags as needed. DArT provided newer alleles as longer sequence.
taglength <- nchar(tab$AlleleSequence)
minlength <- tapply(taglength, tab$CloneID, min)
if(length(unique(minlength)) == 1){
# all loci share one tag length; trim everything in one call
tab$AlleleSequence <- substring(tab$AlleleSequence, 1, minlength[1])
} else {
# trim per locus (minlength is indexed by locus name)
for(L in loci){
theserows <- which(tab$CloneID == L)
tab$AlleleSequence[theserows] <- substring(tab$AlleleSequence[theserows], 1, minlength[L])
}
}
# Do reverse complement where appropriate
if(!is.null(blastfile)){
# Import BLAST results
message("Importing BLAST results...")
blastres <- read.table(blastfile, header = TRUE, sep = sep.blast,
stringsAsFactors = FALSE)
botbool <- logical(length(loci))     # TRUE = tag on bottom strand
alignedbool <- logical(length(loci)) # TRUE = locus has a valid alignment
# accept either BLAST outfmt 6 names or DArT-style column names
qidcol <- which(colnames(blastres) %in% c("qseqid", "Query"))
subcol <- which(colnames(blastres) %in% c("sseqid", "Subject"))
sstartcol <- which(colnames(blastres) %in% c("sstart", "S_start"))
sendcol <- which(colnames(blastres) %in% c("send", "S_end"))
pidentcol <- which(colnames(blastres) %in% c("pident", "X.Identity"))
perfectmatch <- blastres[[pidentcol]] == 100
if(!all(lengths(list(qidcol, subcol, sstartcol, sendcol, pidentcol)) == 1)){
stop("Problem with column headers in BLAST file.")
}
for(i in seq_along(loci)){
# Find perfect matches for the reference allele
theserows <- which(blastres[[qidcol]] == refals[i] & perfectmatch)
# Check that chromosome is correct
theserows <- theserows[grep(paste0("(.+_)?", locTable$Chr[i], "$"),
blastres[[subcol]][theserows])]
# Check that position is correct (either strand orientation)
thispos <- locTable$Pos[i]
theserows <- theserows[(blastres[[sstartcol]][theserows] <= thispos &
blastres[[sendcol]][theserows] >= thispos) |
(blastres[[sstartcol]][theserows] >= thispos &
blastres[[sendcol]][theserows] <= thispos)]
if(length(theserows) >= 1){
alignedbool[i] <- TRUE
# Determine strandedness: subject start > end means minus strand
botbool[i] <-
blastres[[sstartcol]][theserows[1]] > blastres[[sendcol]][theserows[1]]
}
}
botloci <- loci[botbool]
# subset to loci with alignments
loci <- loci[alignedbool]
refals <- refals[alignedbool]
locTable <- locTable[alignedbool,]
tab <- tab[tab$CloneID %in% loci,]
nnotaligned <- sum(!alignedbool)
if(nnotaligned > 0){
warning(paste("Discarded", nnotaligned, "loci without correct alignments."))
}
}
# reverse complement bottom-strand tags so all sequences read off the top
# strand.  reverseComplement is presumably a package-internal helper that
# accepts a character vector -- TODO confirm (Biostrings' version needs
# XString input).
botrows <- which(tab$CloneID %in% botloci)
tab$AlleleSequence[botrows] <- reverseComplement(tab$AlleleSequence[botrows])
locTable$Tag_strand <- ifelse(loci %in% botloci, "bot", "top")
# Add reference sequence to locTable
if(!all(refals %in% tab$AlleleID)){
warning("Reference sequence not recorded due to not all loci having reference alleles. Expecting Ref_001.")
warning("Positions incorrect because target SNP could not be ascertained.")
} else {
locTable$Ref <- tab$AlleleSequence[match(refals, tab$AlleleID)]
# alternative allele IDs expected as "CloneID|Alt" or "CloneID|Alt_002"
altals <- sapply(loci, function(L) grep(paste0("^", L, "\\|Alt(_[0]*2)?$"), tab$AlleleID, value = TRUE))
if(is.list(altals) || is.matrix(altals)){
warning("Positions incorrect because target SNP could not be ascertained.")
} else {
# Convert position from target SNP to tag start
# (first byte where Ref and Alt sequences differ marks the target SNP)
altseq <- tab$AlleleSequence[match(altals, tab$AlleleID)]
snppos <-
mapply(function(x, y) which(charToRaw(x) != charToRaw(y))[1],
locTable$Ref, altseq, USE.NAMES = FALSE)
locTable$Pos <- locTable$Pos - snppos + 1L
}
}
# Allele info
alleles2loc <- match(tab$CloneID, loci)
alleleNucleotides <- tab$AlleleSequence
# sequences are full tags, not just variable sites
attr(alleleNucleotides, "Variable_sites_only") <- FALSE
# Allelic read depth: transpose so samples are rows, alleles are columns
alleleDepth <- t(as.matrix(tab[,-seq_len(n.lead.cols)]))
colnames(alleleDepth) <- tab$AlleleID
rownames(alleleDepth) <- samples
message("Building RADdata object...")
out <- RADdata(alleleDepth, alleles2loc, locTable, possiblePloidies,
contamRate, alleleNucleotides, taxaPloidy)
message("Merging identical haplotypes...")
out <- MergeIdenticalHaplotypes(out)
return(out)
}
# (The two lines below were web-page boilerplate accidentally captured with
# the source, not R code; commented out so the file parses.)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.