Nothing
FindChimeras <- function(dbFile,
tblName="Seqs",
identifier="",
dbFileReference,
tblNameReference="Seqs",
batchSize=100,
minNumFragments=20000,
tb.width=5,
multiplier=20,
minLength=30,
minCoverage=0.6,
overlap=100,
minSuspectFragments=4,
showPercentCoverage=FALSE,
add2tbl=FALSE,
maxGroupSize=-1,
minGroupSize=25,
excludeIDs=NULL,
processors=1,
verbose=TRUE) {
# error checking
if (!is.character(tblName))
stop("tblName must be a character string.")
if (!is.character(tblNameReference))
stop("tblNameReference must be a character string.")
if (!is.character(identifier))
stop("identifier must be a character string.")
if (!is.null(excludeIDs) && !is.character(excludeIDs))
stop("excludeIDs must be a character string.")
if (!is.numeric(batchSize))
stop("batchSize must be a numeric.")
if (!is.numeric(minNumFragments))
stop("minNumFragments must be a numeric.")
if (!is.numeric(tb.width))
stop("tb.width must be a numeric.")
if (tb.width >= 15)
stop("tb.width must be less than 15.")
if (tb.width <= 0)
stop("tb.width must be greater than zero.")
if (!is.numeric(multiplier))
stop("multiplier must be a numeric.")
if (!is.logical(verbose))
stop("verbose must be a logical.")
if (!is.logical(add2tbl) && !is.character(add2tbl))
stop("add2tbl must be a logical or table name.")
if (!is.numeric(minLength))
stop("minLength must be a numeric.")
if (minLength < 0)
stop("minLength must be positive or zero.")
if (!is.numeric(overlap))
stop("overlap must be a numeric.")
if (overlap < 0)
stop("overlap must be positive or zero.")
if (!is.numeric(minCoverage))
stop("minCoverage must be a numeric.")
if (minCoverage < 0 || minCoverage > 1)
stop("minCoverage must be between 0 and 1 inclusive.")
if (!is.numeric(minSuspectFragments))
stop("minSuspectFragments must be a numeric.")
if (minSuspectFragments < 0)
stop("minSuspectFragments must be positive or zero.")
if (!is.numeric(maxGroupSize))
stop("maxGroupSize must be a numeric.")
if (maxGroupSize == 0)
stop("maxGroupSize must be non-zero.")
if (maxGroupSize != floor(maxGroupSize))
stop("maxGroupSize must be an integer.")
if (!is.numeric(minGroupSize))
stop("minGroupSize must be a numeric.")
if (minGroupSize < 0)
stop("minGroupSize must be at least zero.")
if (minGroupSize != floor(minGroupSize))
stop("minGroupSize must be an integer.")
if (!is.null(processors) && !is.numeric(processors))
stop("processors must be a numeric.")
if (!is.null(processors) && floor(processors)!=processors)
stop("processors must be a whole number.")
if (!is.null(processors) && processors < 1)
stop("processors must be at least 1.")
if (is.null(processors)) {
processors <- detectCores()
} else {
processors <- as.integer(processors)
}
if (verbose)
time.1 <- Sys.time()
# initialize databases
driver = dbDriver("SQLite")
if (is.character(dbFile)) {
dbConn1 = dbConnect(driver, dbFile)
on.exit(dbDisconnect(dbConn1))
} else {
dbConn1 = dbFile
if (!inherits(dbConn1,"SQLiteConnection"))
stop("'dbFile' must be a character string or SQLiteConnection.")
if (!dbIsValid(dbConn1))
stop("The connection has expired.")
}
if (is.character(dbFileReference)) {
dbConn2 = dbConnect(driver, dbFileReference)
on.exit(dbDisconnect(dbConn2), add=TRUE)
} else {
dbConn2 = dbFileReference
if (!inherits(dbConn2,"SQLiteConnection"))
stop("'dbFileReference' must be a character string or SQLiteConnection")
if (!dbIsValid(dbConn2))
stop("The connection has expired.")
}
searchExpression <- paste("select distinct identifier, origin from",
tblNameReference)
rs <- dbSendQuery(dbConn2, searchExpression)
searchResult <- dbFetch(rs, n=-1, row.names=FALSE)
groups <- searchResult$identifier
origins <- searchResult$origin
dbClearResult(rs)
searchExpression <- paste("select distinct identifier from",
tblName)
rs <- dbSendQuery(dbConn1, searchExpression)
searchResult <- dbFetch(rs, n=-1, row.names=FALSE)
myGroups <- searchResult$identifier
dbClearResult(rs)
if (identifier[1] != "") {
w <- which(!(identifier %in% groups))
if (length(w) > 0) {
warning("Identifier(s) not found in dbFile: ", paste(unique(identifier[w]), sep=", "))
identifier <- identifier[-w]
}
myGroups <- unique(identifier)
}
w <- which(!(myGroups %in% groups))
if (length(w) > 0) {
warning("Identifier(s) not found in dbFileReference: ", paste(unique(myGroups[w]), sep=", "))
myGroups <- myGroups[-w]
}
w <- which(myGroups %in% excludeIDs)
if (length(w) > 0)
myGroups <- myGroups[-w]
# determine if the reference db contains a chimera column
f <- dbListFields(dbConn2, tblNameReference)
firstRound2 <- is.na(match("chimera", f))
# determine if the dbFile contains a chimera column
f <- dbListFields(dbConn1, tblName)
firstRound1 <- is.na(match("chimera", f))
# remove groups with too few sequences
remove <- character()
for (group in myGroups) {
if (firstRound2) {
numG <- SearchDB(dbConn2,
tblName=tblNameReference,
identifier=group,
processors=processors,
verbose=FALSE,
clause="nonbases < 20",
countOnly=TRUE)
} else {
numG <- SearchDB(dbConn2,
identifier=group,
tblName=tblNameReference,
processors=processors,
verbose=FALSE,
clause="chimera is NULL and nonbases < 20",
countOnly=TRUE)
}
if (numG < minGroupSize) # too small
remove <- c(remove, group)
}
w <- which(myGroups %in% remove)
if (length(w) > 0)
myGroups <- myGroups[-w]
# initialize fragment variables
offset <- 5
min_hits_in <- 5
fragments <- DNAStringSet()
all_hits_in <- integer()
all_results <- NULL
percentCoverage <- ""
m <- integer()
rns <- integer()
if (verbose)
pBar <- txtProgressBar(style=ifelse(interactive(), 3, 1))
for (group in myGroups) {
if (firstRound2) {
group_dna <- SearchDB(dbConn2,
tblName=tblNameReference,
identifier=group,
processors=processors,
verbose=FALSE,
type="DNAStringSet",
limit=maxGroupSize,
removeGaps="all",
clause="nonbases < 20")
} else {
group_dna <- SearchDB(dbConn2,
tblName=tblNameReference,
identifier=group,
processors=processors,
verbose=FALSE,
type="DNAStringSet",
limit=maxGroupSize,
removeGaps="all",
clause="chimera is NULL and nonbases < 20")
}
numG <- length(group_dna)
if (firstRound1) {
dna <- SearchDB(dbConn1,
tblName=tblName,
identifier=group,
type="DNAStringSet",
replaceChar="",
removeGaps="all",
processors=processors,
verbose=FALSE)
} else {
dna <- SearchDB(dbConn1,
tblName=tblName,
identifier=group,
type="DNAStringSet",
replaceChar="",
removeGaps="all",
processors=processors,
verbose=FALSE,
clause="chimera is NULL")
}
# reduce to the set of unique dna sequences
u_dna <- unique(dna)
names(u_dna) <- names(dna)[match(u_dna, dna)]
m <- c(m, as.integer(names(u_dna)[match(dna, u_dna)]))
rns <- c(rns, as.integer(names(dna)))
dna <- u_dna
numF <- length(dna)
batches <- seq(1, numF, batchSize)
# for all sequences
for (j in batches) {
# subset of the whole group's sequence set
seqs <- j:(j + batchSize - 1)
# remove sequences beyond length of sequence set
# (only applies to the final iteration)
w <- which(seqs > numF)
if (length(w) > 0)
seqs <- seqs[-w]
# split each sequence into fragments of length 30
temp_fragments <- DNAStringSet()
count <- 0
widths <- width(dna[seqs])
numFragments <- floor((max(widths) - 30)/offset) + 1
for (i in seq_len(numFragments)) {
start <- ((i - 1)*offset + 1)
end <- ((i - 1)*offset + 30)
w <- which(widths >= end)
l <- length(w)
temp_fragments <- c(temp_fragments,
subseq(dna[seqs[w]], start, end))
names(temp_fragments)[seq((count + 1), (count + l))] <-
paste(group, ".", names(dna[seqs[w]]),
".", rep(i, l), ".", widths[w], sep="")
count <- count + l
}
# remove fragments with a trusted-band containing ambiguities
# first 5 nucleotide trusted-band
w <- which(rowSums(alphabetFrequency(subseq(temp_fragments,
1,
2*tb.width))[,5:15])==0)
if (length(w) > 0)
temp_fragments <- temp_fragments[w]
if (length(temp_fragments) > 0) {
# remove fragments with a middle containing Ns
w <- which(alphabetFrequency(subseq(temp_fragments,
2*tb.width + 1,
30))[,15]==0)
if (length(w) > 0)
temp_fragments <- temp_fragments[w]
}
if (length(temp_fragments) > 0) {
u_temp_fragments <- unique(temp_fragments)
m_fragments <- match(temp_fragments, u_temp_fragments)
pdict <- PDict(u_temp_fragments,
algorithm="ACtree2",
tb.end=tb.width)
if (numG <= 3000) { # search whole group
# count number of hits in group
counts <- vwhichPDict(pdict,
group_dna,
max.mismatch=1,
fixed=FALSE)
counts <- unlist(counts)
# tabulate counts
if (length(counts) > 0) { # hits
t <- tabulate(counts)
if (length(t) < length(u_temp_fragments))
t[(length(t) + 1):length(u_temp_fragments)] <- 0
hits_in <- t[m_fragments]
# remove fragments with too many hits in group
w <- which(hits_in < min_hits_in)
} else { # no hits
hits_in <- integer(length(temp_fragments))
w <- seq_along(temp_fragments)
}
} else { # search part of group first for speed
# count number of hits in part of group
counts <- vwhichPDict(pdict,
group_dna[1:2999],
max.mismatch=1,
fixed=FALSE)
counts <- unlist(counts)
# tabulate counts
if (length(counts) > 0) { # hits
t <- tabulate(counts)
if (length(t) < length(u_temp_fragments))
t[(length(t) + 1):length(u_temp_fragments)] <- 0
hits_in <- t[m_fragments]
# remove fragments with too many hits in group
w <- which(hits_in < min_hits_in)
} else { # no hits
hits_in <- integer(length(temp_fragments))
w <- seq_along(temp_fragments)
}
hits_in <- hits_in[w]
temp_fragments <- temp_fragments[w]
if (length(temp_fragments) > 0) {
u_temp_fragments <- unique(temp_fragments)
m_fragments <- match(temp_fragments, u_temp_fragments)
# count number of hits in group
pdict <- PDict(u_temp_fragments,
algorithm="ACtree2",
tb.end=tb.width)
counts <- vwhichPDict(pdict,
group_dna[3000:numG],
max.mismatch=1,
fixed=FALSE)
counts <- unlist(counts)
# tabulate counts
if (length(counts) > 0) { # hits
t <- tabulate(counts)
if (length(t) < length(u_temp_fragments))
t[(length(t) + 1):length(u_temp_fragments)] <- 0
hits_in2 <- t[m_fragments]
hits_in <- hits_in + hits_in2
# remove fragments with too many hits in group
w <- which(hits_in < min_hits_in)
} else { # no hits
w <- seq_along(temp_fragments)
}
}
}
} else { # no temp_fragments
w <- integer()
}
if (length(w) > 0) {
temp_fragments <- temp_fragments[w]
u_temp_fragments <- unique(temp_fragments)
m_fragments <- match(temp_fragments, u_temp_fragments)
hits_in <- hits_in[w]
# count number of lenient hits in group
pdict <- PDict(u_temp_fragments,
algorithm="ACtree2",
tb.start=tb.width + 1,
tb.end=2*tb.width)
counts <- vwhichPDict(pdict,
group_dna,
max.mismatch=2,
fixed=FALSE)
counts <- unlist(counts)
# tabulate counts
if (length(counts)==0) { # no hits
w <- integer() # should never occurr
} else { # hits
t <- tabulate(counts)
if (length(t) < length(u_temp_fragments))
t[(length(t) + 1):length(u_temp_fragments)] <- 0
lenient_hits_in <- t[m_fragments]
# remove fragments with too many lenient hits in group
w <- which(lenient_hits_in < 10)
}
}
if (length(w) > 0) {
# remove fragments that would not meet requirements for a chimera
name.position.width <- unlist(strsplit(names(temp_fragments[w]),
".",
fixed=TRUE))
l <- length(name.position.width)
# get the sequences of origin
name <- name.position.width[seq(2, l, 4)]
unique_name <- unique(name)
# get the fragment positions
position <- as.numeric(name.position.width[seq(3, l, 4)])
# get the original sequence widths
widths <- as.numeric(name.position.width[seq(4, l, 4)])
remove <- character()
for (i in unique_name) {
w1 <- which(name==i)
if (length(w1) < minSuspectFragments) {
remove <- c(remove, i)
next # chimeras need at least minSuspectFragments fragments
}
start <- ((position[w1[1]] - 1)*offset + 1)
end <- ((position[w1[length(w1)]] - 1)*offset + 30)
# chimeric section needs to be at least minLength long
if ((end - start + 1) < minLength) {
remove <- c(remove, i)
next
}
# chimeric section must overlap ends
if (!(end > (widths[w1[1]] - overlap)) &&
!(start < overlap))
remove <- c(remove, i)
}
if (length(remove) > 0)
w <- w[-which(name %in% remove)]
if (length(w) > 0) {
fragments <- c(fragments, temp_fragments[w])
all_hits_in <- c(all_hits_in, hits_in[w])
}
}
# keep adding fragments until above minimum
if (length(unique(fragments)) < minNumFragments &&
(!(j==batches[length(batches)] && group==myGroups[length(myGroups)])
|| length(fragments)==0)) # and not the last batch
next
# search through other groups for fragments
ns <- unlist(strsplit(names(fragments), ".", fixed=TRUE))
groupsOfOrigin <- ns[seq(1, length(ns), 4)]
n <- ns[seq(2, length(ns), 4)]
un <- unique(n)
result <- data.frame(chimera=I(character(length(un))),
row.names=as.character(un))
# for each group where the fragment did not originate
if (verbose) {
beg <- j - 1
fin <- j + batchSize
if (fin > numF)
fin <- numF
count <- 0
}
for (other in groups) {
if (verbose)
count <- count + 1
if (other %in% excludeIDs)
next # don't allow chimeras
# remove groups that are from the other group and
# remove groups that share unclassified's line of descent
uGroupsOfOrigin <- unique(groupsOfOrigin)
# find origin_other (line of descent of other)
origin_other <- origins[match(other, groups)]
g2 <- grepl("unclassified", other, fixed=TRUE)
for (g in uGroupsOfOrigin) {
if (other==g) {
uGroupsOfOrigin <- uGroupsOfOrigin[-match(g, uGroupsOfOrigin)]
} else if (g2 ||
grepl("unclassified", g, fixed=TRUE)) {
# find origin (line of descent of group)
origin <- origins[match(g, groups)]
if (grepl(origin, origin_other, fixed=TRUE) ||
grepl(origin_other, origin, fixed=TRUE))
uGroupsOfOrigin <- uGroupsOfOrigin[-match(g, uGroupsOfOrigin)]
}
}
# do not search the group of origin
h <- which(groupsOfOrigin %in% uGroupsOfOrigin)
if (length(h)==0)
next
u_fragments <- unique(fragments[h])
m_fragments <- match(fragments[h], u_fragments)
pdict <- PDict(u_fragments,
algorithm="ACtree2",
tb.end=tb.width)
# search for the fragments
if (firstRound2) {
other_dna <- SearchDB(dbConn2,
tblName=tblNameReference,
identifier=other,
processors=processors,
verbose=FALSE,
type="DNAStringSet",
limit=maxGroupSize,
removeGaps="all",
clause="nonbases < 20")
} else {
other_dna <- SearchDB(dbConn2,
tblName=tblNameReference,
identifier=other,
processors=processors,
verbose=FALSE,
type="DNAStringSet",
limit=maxGroupSize,
removeGaps="all",
clause="nonbases < 20 and chimera is NULL")
}
if (length(other_dna) <= multiplier)
next # not enough sequences
counts <- vwhichPDict(pdict,
other_dna,
max.mismatch=1,
fixed=FALSE)
counts <- unlist(counts)
# tabulate counts
if (length(counts)==0) # no hits
next
t <- tabulate(counts)
if (length(t) < length(u_fragments))
t[(length(t) + 1):length(u_fragments)] <- 0
hits_out <- t[m_fragments]
# mark fragments with hits out of group greater than
# multiplier times the number of hits in group
# as possible chimeras from the other group
x <- all_hits_in[h]*multiplier < hits_out
y <- hits_out > multiplier
w <- which(x & y)
if (length(w) > 0) {
# find all fragments corresponding to the unique fragment
name.position.width <- unlist(strsplit(names(fragments[h[w]]),
".",
fixed=TRUE))
l <- length(name.position.width)
# get the sequences of origin
name <- name.position.width[seq(2, l, 4)]
unique_name <- unique(name)
# get the fragment positions
position <- as.numeric(name.position.width[seq(3, l, 4)])
# get the original sequence widths
widths <- as.numeric(name.position.width[seq(4, l, 4)])
for (i in unique_name) {
w1 <- which(name==i)
if (length(w1) < minSuspectFragments)
next # chimeras need at least minSuspectFragments fragments
# needs to have a high density of fragments
start <- 0
end <- 0
coverage <- 0
for (p in position[w1]) {
newCoverage <- (p - 1)*offset + 30 - end
if (newCoverage > 30)
coverage <- coverage + 30
else
coverage <- coverage + newCoverage
end <- ((p - 1)*offset + 30)
}
start <- ((position[w1[1]] - 1)*offset + 1)
end <- ((position[w1[length(w1)]] - 1)*offset + 30)
# chimeric section needs > % coverage of fragments
if (coverage <= minCoverage*(end - start + 1))
next
# chimeric section needs to be at least minLength long
if ((end - start + 1) < minLength)
next
# chimeric section must overlap ends
if (!(end > (widths[w1[1]] - overlap)) &&
!(start < overlap))
next
index <- match(i,row.names(result))
# append this group to the result
w2 <- which(result$chimera[index]=="")
if (length(w2) > 0) {
result$chimera[index[w2]] <- other
if (length(index[-w2]) > 0)
result$chimera[index[-w2]] <- paste(result$chimera[index[-w2]],
other,
sep=", ")
} else {
result$chimera[index] <- paste(result$chimera[index],
other,
sep=", ")
}
if (showPercentCoverage)
percentCoverage <- paste(":",
round(100*coverage/(end - start + 1),
digits=2),
"%",
sep="")
# append chimeric positions to the result
result$chimera[index] <- paste(result$chimera[index],
" (",
paste(as.character(start),
"-",
as.character(end),
percentCoverage,
")",
sep=""),
sep="")
}
}
if (verbose)
setTxtProgressBar(pBar,
(beg + (fin - beg)*count/length(groups))/numF)
}
# initialize fragment variables
fragments <- DNAStringSet()
all_hits_in <- integer()
# update the result in the database
result <- subset(result, result$chimera!="")
if (dim(result)[1] > 0) {
if (is.null(all_results))
all_results <- result
else
all_results <- rbind(all_results, result)
}
}
}
# expand results to include duplicated sequences
rows <- match(m, as.integer(rownames(all_results)))
w <- which(!is.na(rows))
if (length(w) > 0) {
rows <- rows[w]
if (length(rows) > dim(all_results)[1])
all_results <- data.frame(chimera=I(all_results$chimera[rows]),
row.names=as.character(rns[w]))
}
if ((is.character(add2tbl) || add2tbl) && !is.null(all_results))
Add2DB(all_results,
dbConn1,
tblName=ifelse(is.character(add2tbl),add2tbl,tblName),
verbose=FALSE)
if (verbose) {
setTxtProgressBar(pBar, 1)
close(pBar)
if (is.null(all_results)) {
cat("\nNo chimeras found.")
} else {
cat("\nFound",
dim(all_results)[1],
ifelse(dim(all_results)[1] > 1,
"possible chimeras.",
"possible chimera."))
if ((is.character(add2tbl) || add2tbl) && !is.null(all_results))
cat("\nAdded to table ",
ifelse(is.character(add2tbl),add2tbl,tblName),
": \"chimera\".",
sep="")
}
cat("\n")
time.2 <- Sys.time()
print(round(difftime(time.2,
time.1,
units='secs'),
digits=2))
cat("\n")
}
return(all_results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.