###### -- Expand blocks of predicted pairs ------------------------------------
# author: nicholas cooley
# contact: npc19@pitt.edu / npcooley@gmail.com
BlockExpansion <- function(Pairs,
GapTolerance = 4L,
DropSingletons = FALSE,
Criteria = "PID",
Floor = 0.5,
NewPairsOnly = TRUE,
DBPATH,
Verbose = FALSE) {
if (!("PID" %in% colnames(Pairs))) {
stop ("PairSummaries Object must have PIDs calculated.")
}
if ("SCORE" %in% colnames(Pairs)) {
INCLUDESCORE <- TRUE
} else {
INCLUDESCORE <- FALSE
}
if (missing(DBPATH)) {
stop("DBPATH must be supplied.")
}
if (!(Criteria %in% c("PID",
"Score"))) {
stop ("Please select a valid comparison criteria.")
}
# given the gap tolerance, split diags where appropriate
if (is.null(GapTolerance)) {
GapTolerance <- 2L
}
if (GapTolerance <= 1L) {
stop ("GapTolerance defines the diff() of features within a block. It cannot be <= 1.")
}
if (Verbose) {
TimeStart <- Sys.time()
}
MAT1 <- get(data("HEC_MI1",
package = "DECIPHER",
envir = environment()))
MAT2 <- get(data("HEC_MI2",
package = "DECIPHER",
envir = environment()))
structureMatrix <- matrix(c(0.187, -0.8, -0.873,
-0.8, 0.561, -0.979,
-0.873, -0.979, 0.221),
3,
3,
dimnames=list(c("H", "E", "C"),
c("H", "E", "C")))
substitutionMatrix <- matrix(c(1.5, -2.134, -0.739, -1.298,
-2.134, 1.832, -2.462, 0.2,
-0.739, -2.462, 1.522, -2.062,
-1.298, 0.2, -2.062, 1.275),
nrow = 4,
dimnames = list(DNA_BASES, DNA_BASES))
# break PairSummaries object down into a workable format
# build overhead data in a way that makes sense
GeneCalls <- attr(x = Pairs,
which = "GeneCalls")
GCIDs <- as.integer(names(GeneCalls))
L <- length(GeneCalls)
L2 <- (L * (L - 1L)) / 2L
POIDs <- paste(Pairs$p1,
Pairs$p2,
sep = "_")
FeaturesMat <- do.call(rbind,
strsplit(x = POIDs,
split = "_",
fixed = TRUE))
FeaturesMat <- data.frame("g1" = as.integer(FeaturesMat[, 1L]),
"i1" = as.integer(FeaturesMat[, 2L]),
"f1" = as.integer(FeaturesMat[, 3L]),
"g2" = as.integer(FeaturesMat[, 4L]),
"i2" = as.integer(FeaturesMat[, 5L]),
"f2" = as.integer(FeaturesMat[, 6L]))
GMat <- unique(FeaturesMat[, c(1, 4)])
rownames(GMat) <- NULL
Res <- vector(mode = "list",
length = nrow(GMat))
for (m1 in seq(nrow(GMat))) {
# subset to current genomes
w1 <- GMat[m1, 1L]
w2 <- GMat[m1, 2L]
# wa1 <- match(x = as.character(w1),
# table = names(attr(x = Pairs,
# which = "GeneCalls")))
# wa2 <- match(x = as.character(w2),
# table = names(attr(x = Pairs,
# which = "GeneCalls")))
if (Verbose) {
cat(paste0("### Genome pair: ",
w1,
" - ",
w2,
" ###\n"))
}
w3 <- FeaturesMat[, 1L] == w1
w4 <- FeaturesMat[, 4L] == w2
CMat <- FeaturesMat[w3 & w4, ]
rownames(CMat) <- NULL
# separate index pairs
IMat <- split(x = CMat,
f = list(CMat$i1,
CMat$i2),
drop = TRUE)
i1 <- which(GCIDs == w1)
i2 <- which(GCIDs == w2)
# i3 <- unique(GeneCalls[[i1]]$Index)
# i4 <- unique(GeneCalls[[i2]]$Index)
i3 <- seq_len(max(GeneCalls[[i1]]$Index))
i4 <- seq_len(max(GeneCalls[[i2]]$Index))
i5 <- sapply(i3,
function(x) {
which(GeneCalls[[i1]]$Index == x)
},
simplify = FALSE)
i6 <- sapply(i4,
function(x) {
which(GeneCalls[[i2]]$Index == x)
},
simplify = FALSE)
i1lower <- sapply(i5,
function(x) {
if (length(x) > 0L) {
min(x)
} else {
NA # NA Placeholder - should never be accessed, but necessary to keep index matching
}
},
simplify = TRUE)
i1upper <- sapply(i5,
function(x) {
if (length(x) > 0L) {
max(x)
} else {
NA
}
},
simplify = TRUE)
i2lower <- sapply(i6,
function(x) {
if (length(x) > 0L) {
min(x)
} else {
NA # NA Placeholder - should never be accessed, but necessary to keep index matching
}
},
simplify = TRUE)
i2upper <- sapply(i6,
function(x) {
if (length(x) > 0L) {
max(x)
} else {
NA
}
},
simplify = TRUE)
Genome1 <- SearchDB(dbFile = DBPATH,
identifier = as.character(w1),
nameBy = "description",
type = "DNAStringSet",
verbose = FALSE)
Genome2 <- SearchDB(dbFile = DBPATH,
identifier = as.character(w2),
nameBy = "description",
type = "DNAStringSet",
verbose = FALSE)
# for each Index Pair Matrix
Res[[m1]] <- vector(mode = "list",
length = length(IMat))
for (m2 in seq_along(IMat)) {
if (Verbose) {
cat(paste0("Index pair ",
IMat[[m2]][1L, 2L],
" - ",
IMat[[m2]][1L, 5L],
"\n"))
}
# current offsets
ci1lower <- i1lower[IMat[[m2]][1L, 2L]]
ci1upper <- i1upper[IMat[[m2]][1L, 2L]]
ci2lower <- i2lower[IMat[[m2]][1L, 5L]]
ci2upper <- i2upper[IMat[[m2]][1L, 5L]]
if (Verbose) {
cat(paste0("Feature set ",
ci1lower,
" - ",
ci1upper,
" and ",
ci2lower,
" - ",
ci2upper,
":\n"))
}
# if (IMat[[m2]][1L, 2L] == 1 &
# IMat[[m2]][1L, 5L] == 67) {
# print("a")
# # return(list(GeneCalls,
# # IMat[[m2]][1L, 2L],
# # IMat[[m2]][1L, 5L],
# # i1))
# }
# extract seqs for features one on the current index for w1 !! actually i1 !!
w5 <- GeneCalls[[i1]]$Index == IMat[[m2]][1L, 2L]
w6 <- GeneCalls[[i1]]$Coding[w5] & GeneCalls[[i1]]$Type[w5] == "gene"
w7 <- unique(GeneCalls[[i1]]$Translation_Table[w5])
w7 <- w7[!is.na(w7)]
w7 <- w7[1L]
z1 <- unname(GeneCalls[[i1]]$Range[w5])
z2 <- lengths(z1)
z1 <- unlist(z1,
recursive = FALSE)
NTFeatures01 <- extractAt(x = Genome1[[IMat[[m2]][1L, 2L]]],
at = z1)
# if (IMat[[m2]][1L, 2L] == 1 &
# IMat[[m2]][1L, 5L] == 67) {
# print("b")
# }
CollapseCount <- 0L
w <- which(z2 > 1L)
# if no collapsing needs to occur, do not enter loop
if (length(w) > 0L) {
# if collapsing must take place build a placeholder of positions to remove
# once collapsing correct positions has occurred
remove <- vector(mode = "integer",
length = sum(z2[w]) - length(w))
for (m4 in w) {
NTFeatures01[[m4 + CollapseCount]] <- unlist(NTFeatures01[m4:(m4 + z2[m4] - 1L) + CollapseCount])
remove[(CollapseCount + 1L):(CollapseCount + z2[m4] - 1L)] <- (m4 + 1L):(m4 + z2[m4] - 1L) + CollapseCount
CollapseCount <- CollapseCount + z2[m4] - 1L
}
# return(list(w,
# z2[w],
# remove))
NTFeatures01[remove] <- NULL
}
FlipMe <- GeneCalls[[i1]]$Strand[w5] == 1L
if (any(FlipMe)) {
NTFeatures01[FlipMe] <- reverseComplement(NTFeatures01[FlipMe])
}
names(NTFeatures01) <- paste(rep(w1,
sum(w5)),
rep(IMat[[m2]][1L, 2L],
sum(w5)),
seq(from = ci1lower,
to = ci1upper,
by = 1L),
sep = "_")
if (!is.na(w7)) {
AAFeatures01 <- translate(x = NTFeatures01[w6],
genetic.code = getGeneticCode(id_or_name2 = w7,
as.data.frame = FALSE,
full.search = FALSE),
if.fuzzy.codon = "solve")
} else {
AAFeatures01 <- AAStringSet()
}
names(AAFeatures01) <- names(NTFeatures01)[w6]
Features01Match <- match(x = names(NTFeatures01),
table = names(AAFeatures01))
Features01Key <- names(NTFeatures01) %in% names(AAFeatures01)
# if (IMat[[m2]][1L, 2L] == 1 &
# IMat[[m2]][1L, 5L] == 67) {
# print("c")
# }
# extract seqs for features one on the current index for w2
w5 <- GeneCalls[[i2]]$Index == IMat[[m2]][1L, 5L]
w6 <- GeneCalls[[i2]]$Coding[w5] & GeneCalls[[i2]]$Type[w5] == "gene"
w7 <- unique(GeneCalls[[i2]]$Translation_Table[w5])
w7 <- w7[!is.na(w7)]
w7 <- w7[1L]
z1 <- unname(GeneCalls[[i2]]$Range[w5])
z2 <- lengths(z1)
z1 <- unlist(z1,
recursive = FALSE)
NTFeatures02 <- extractAt(x = Genome2[[IMat[[m2]][1L, 5L]]],
at = z1)
# if (IMat[[m2]][1L, 2L] == 1 &
# IMat[[m2]][1L, 5L] == 67) {
# print("d")
# }
CollapseCount <- 0L
w <- which(z2 > 1L)
# if no collapsing needs to occur, do not enter loop
if (length(w) > 0L) {
# if collapsing must take place build a placeholder of positions to remove
# once collapsing correct positions has occurred
remove <- vector(mode = "integer",
length = sum(z2[w]) - length(w))
for (m4 in w) {
NTFeatures02[[m4 + CollapseCount]] <- unlist(NTFeatures02[m4:(m4 + z2[m4] - 1L) + CollapseCount])
remove[(CollapseCount + 1L):(CollapseCount + z2[m4] - 1L)] <- (m4 + 1L):(m4 + z2[m4] - 1L) + CollapseCount
CollapseCount <- CollapseCount + z2[m4] - 1L
}
# return(list(w,
# z2[w],
# remove))
NTFeatures02[remove] <- NULL
}
FlipMe <- GeneCalls[[i2]]$Strand[w5] == 1L
if (any(FlipMe)) {
NTFeatures02[FlipMe] <- reverseComplement(NTFeatures02[FlipMe])
}
# if (IMat[[m2]][1L, 2L] == 1 &
# IMat[[m2]][1L, 5L] == 67) {
# print("g")
# return(list(NTFeatures02,
# w2,
# w5,
# IMat[[m2]][1L, 5L],
# ci2lower,
# ci2upper,
# GeneCalls,
# IMat,
# m2,
# i5,
# i6))
# }
names(NTFeatures02) <- paste(rep(w2,
sum(w5)),
rep(IMat[[m2]][1L, 5L],
sum(w5)),
seq(from = ci2lower,
to = ci2upper,
by = 1L),
sep = "_")
if (!is.na(w7)) {
AAFeatures02 <- translate(x = NTFeatures02[w6],
genetic.code = getGeneticCode(id_or_name2 = w7,
as.data.frame = FALSE,
full.search = FALSE),
if.fuzzy.codon = "solve")
} else {
AAFeatures02 <- AAStringSet()
}
# if (IMat[[m2]][1L, 2L] == 1 &
# IMat[[m2]][1L, 5L] == 67) {
# print("f")
# return(list(AAFeatures02,
# NTFeatures02,
# w5,
# w6,
# w7))
# }
names(AAFeatures02) <- names(NTFeatures02)[w6]
Features02Match <- match(x = names(NTFeatures02),
table = names(AAFeatures02))
Features02Key <- names(NTFeatures02) %in% names(AAFeatures02)
# if (IMat[[m2]][1L, 2L] == 1 &
# IMat[[m2]][1L, 5L] == 67) {
# print("e")
# }
# return(list(NTFeatures01,
# NTFeatures02,
# AAFeatures01,
# AAFeatures02,
# Features01Key,
# Features01Match,
# Features02Key,
# Features02Match))
dr1 <- IMat[[m2]][, 3L] - IMat[[m2]][, 6L]
dr2 <- IMat[[m2]][, 3L] + IMat[[m2]][, 6L]
IMat[[m2]] <- cbind(IMat[[m2]],
"ID" = seq(nrow(IMat[[m2]])),
"rank1" = dr1,
"rank2" = dr2)
# given the diagonal and anti-diagonal `ranks` that have been assigned
# build 3 ledgers:
# one for the diagonal, one for the ant-diagonal, and one for singletons
# you can be in both the diagonal and the anti-diagonal ledger at the same tiem
# but you cannot be in the singleton ledger and either diag or anti-diag
# at this point we have 4 points describing each pair
# 1: g1 feature ID number
# 2: g2 features ID number
# 3: diag rank
# 4: anti-diag rank
dr3 <- unname(split(x = IMat[[m2]],
f = dr1,
drop = TRUE))
dr4 <- unname(split(x = IMat[[m2]],
f = dr2,
drop = TRUE))
for (m3 in seq_along(dr3)) {
# if the current rank has more than one pair
if (nrow(dr3[[m3]]) > 1L) {
# build a dummy vector for the split
sp1 <- vector(mode = "integer",
length = nrow(dr3[[m3]]))
# extract the feature positions that are always increasing
sp2 <- dr3[[m3]][, 3L]
# construct iterators
it1 <- 1L
it2 <- sp2[1L]
# loop through dummy vector, and assign the split iterator
# updating it when a gap larger than the tolerance appears
for (m4 in seq_along(sp1)) {
it3 <- sp2[m4]
if (it3 - it2 > GapTolerance) {
it1 <- it1 + 1L
}
sp1[m4] <- it1
it2 <- it3
} # end loop through split map
if (it1 > 1L) {
# the split map was updated, split the matrix
dr3[[m3]] <- unname(split(x = dr3[[m3]],
f = sp1))
} else {
dr3[[m3]] <- dr3[m3]
}
} else {
dr3[[m3]] <- dr3[m3]
}
}
for (m3 in seq_along(dr4)) {
# if the current rank has more than one pair
if (nrow(dr4[[m3]]) > 1L) {
# build a dummy vector for the split
sp1 <- vector(mode = "integer",
length = nrow(dr4[[m3]]))
# extract the feature positions that are always increasing
sp2 <- dr4[[m3]][, 3L]
# construct iterators
it1 <- 1L
it2 <- sp2[1L]
# loop through dummy vector, and assign the split iterator
# updating it when a gap larger than the tolerance appears
for (m4 in seq_along(sp1)) {
it3 <- sp2[m4]
if (it3 - it2 > GapTolerance) {
it1 <- it1 + 1L
}
sp1[m4] <- it1
it2 <- it3
} # end loop through split map
if (it1 > 1L) {
# the split map was updated, split the matrix
dr4[[m3]] <- unname(split(x = dr4[[m3]],
f = sp1))
} else {
dr4[[m3]] <- dr4[m3]
}
} else {
dr4[[m3]] <- dr4[m3]
}
}
# return(list(IMat[[m2]],
# dr3,
# dr4))
dr3 <- unlist(dr3,
recursive = FALSE)
# dr3 IDs singletons
dr3a <- unlist(sapply(dr3,
function(x) {
if (nrow(x) == 1L) {
x$ID
} else {
NA
}
},
simplify = FALSE))
# dr3 IDs blocks
dr3b <- unlist(sapply(dr3,
function(x) {
if (nrow(x) > 1L) {
x$ID
}
},
simplify = FALSE))
dr4 <- unlist(dr4,
recursive = FALSE)
# dr4 IDs singletons
dr4a <- unlist(sapply(dr4,
function(x) {
if (nrow(x) == 1L) {
x$ID
} else {
NA
}
},
simplify = FALSE))
# dr4 IDs blocks
dr4b <- unlist(sapply(dr4,
function(x) {
if (nrow(x) > 1L) {
x$ID
}
},
simplify = FALSE))
# drop dr3 singleton positions that are present in dr4
dr3c <- dr3a %in% dr4b
# drop dr4 singleton positions that are present in dr3
dr4c <- dr4a %in% dr3b
# drop singletons from one rank set that are present in a block from the
# other set
dr3 <- dr3[!dr3c]
dr4 <- dr4[!dr4c]
dr5 <- c(dr3, dr4)
dr5 <- unique(dr5)
if (DropSingletons) {
checkrows <- sapply(dr5,
function(x) nrow(x),
simplify = TRUE)
dr5 <- dr5[checkrows > 1L]
if (length(dr5) == 0L) {
# break out of m2 position without assigning Res[[m1]][[m2]] anything
next
}
}
# loop through dr5
# if the position is a singleton pair, assign all possible expansions
# if the position is a blocked set of pairs assign the two expansions
dr6 <- vector(mode = "list",
length = length(dr5))
for (m3 in seq_along(dr6)) {
f1s <- dr5[[m3]][, 3L]
f2s <- dr5[[m3]][, 6L]
if (length(f1s) == 1L) {
# singleton pair
dr6[[m3]] <- data.frame("f1" = c(f1s - 1L, f1s - 1L, f1s + 1L, f1s + 1L),
"f2" = c(f2s - 1L, f2s + 1L, f2s + 1L, f2s - 1L),
"direction" = c(1L, 2L, 3L, 4L))
} else {
# a contiguous block of pairs
if (length(unique(dr5[[m3]][, 8L])) == 1L) {
# the regular diagonal
f1s <- dr5[[m3]][, 3L]
f2s <- dr5[[m3]][, 6L]
f1f <- seq(from = min(f1s) - 1L,
to = max(f1s) + 1L,
by = 1L)
f2f <- seq(from = min(f2s) - 1L,
to = max(f2s) + 1L,
by = 1L)
f1f <- f1f[!(f1f %in% f1s)]
f2f <- f2f[!(f2f %in% f2s)]
if (length(f1f) > 2L) {
# gaps in the block assign as no-expanding checks
dr6[[m3]] <- data.frame("f1" = f1f,
"f2" = f2f,
"direction" = c(1L,
rep(0L,
length(f1f) - 2L),
3L))
} else {
# no gaps in the block
dr6[[m3]] <- data.frame("f1" = f1f,
"f2" = f2f,
"direction" = c(1L,
3L))
}
} else {
# the anti diagonal
f1s <- dr5[[m3]][, 3L]
f2s <- dr5[[m3]][, 6L]
f1f <- seq(from = min(f1s) - 1L,
to = max(f1s) + 1L,
by = 1L)
f2f <- seq(from = max(f2s) + 1L,
to = min(f2s) - 1L,
by = -1L)
f1f <- f1f[!(f1f %in% f1s)]
f2f <- f2f[!(f2f %in% f2s)]
if (length(f1f) > 2L) {
# gaps in the block assign as no-expanding checks
dr6[[m3]] <- data.frame("f1" = f1f,
"f2" = f2f,
"direction" = c(2L,
rep(0L,
length(f1f) - 2L),
4L))
} else {
# no gaps in the block
dr6[[m3]] <- data.frame("f1" = f1f,
"f2" = f2f,
"direction" = c(2L,
4L))
}
}
} # end row check
} # end dr6 loop
dr6 <- do.call(rbind,
dr6)
dr6 <- dr6[dr6[, 1L] >= ci1lower &
dr6[, 1L] <= ci1upper &
dr6[, 2L] >= ci2lower &
dr6[, 2L] <= ci2upper, , drop = FALSE]
if (nrow(dr6) < 1L) {
next
}
dr6 <- dr6[order(dr6[, 1L]), ]
# return(dr6)
# for every line in dr6
# n + ? alignments will be attempted
# for every n alignments that pass some threshold a new `Pair` is recorded
# to add to pair summaries
L <- nrow(dr6)
VSize <- L * 2L
p1placeholder <- p2placeholder <- p1FeatureLength <- p2FeatureLength <- rep(NA_integer_,
times = VSize)
PIDVector <- SCOREVector <- rep(NA_real_,
times = VSize)
AType <- rep(NA_character_,
times = VSize)
# return(list(dr6,
# ci1lower,
# ci1upper,
# ci2lower,
# ci2upper,
# Features01Key,
# Features02Key,
# Features01Match,
# Features02Match,
# NTFeatures01,
# AAFeatures01,
# NTFeatures02,
# AAFeatures02))
Count <- 1L
Continue <- TRUE
if (Verbose) {
pBar <- txtProgressBar(style = 1L)
}
for (m3 in seq(nrow(dr6))) {
# each line contains feature coordinates for an alignment
# and the direction to expand, if the alignment passes a threshold
# with each expansion, check whether the expansion is within bounds
f1 <- dr6[m3, 1L]
f2 <- dr6[m3, 2L]
advanceID <- dr6[m3, 3L]
while (Continue) {
# first ask if f1 and f2 are within bounds
# if they are, do not attempt alignment
# exit the while loop and move to the next position in dr6
if (f1 > ci1upper |
f1 < ci1lower |
f2 > ci2upper |
f2 < ci2lower) {
# Continue <- FALSE
break
}
# third ask if these sequences have already been aligned
# CURRENTLY NOT IMPLEMENTED
# third ask if both f1 and f2 are both coding
if (Features01Key[f1 - ci1lower + 1L] &
Features02Key[f2 - ci2lower + 1L]) {
# both are coding
ali <- AlignProfiles(pattern = AAFeatures01[Features01Match[f1 - ci1lower + 1L]],
subject = AAFeatures02[Features02Match[f2 - ci2lower + 1L]])
PID <- 1 - DistanceMatrix(myXStringSet = ali,
type = "matrix",
includeTerminalGaps = TRUE,
verbose = FALSE)[1L, 2L]
# UW <- unique(width(ali))
if (INCLUDESCORE) {
SCORE <- ScoreAlignment(myXStringSet = ali,
structures = PredictHEC(ali,
type="probabilities",
HEC_MI1 = MAT1,
HEC_MI2 = MAT2),
structureMatrix = structureMatrix)
}
CType <- "AA"
# if (m2 >= 2L) {
# print(f1)
# print(f2)
# print(PID)
# print(CType)
# }
} else {
# at least one is not coding
ali <- AlignProfiles(pattern = NTFeatures01[f1 - ci1lower + 1L],
subject = NTFeatures02[f2 - ci2lower + 1L])
PID <- 1 - DistanceMatrix(myXStringSet = ali,
type = "matrix",
includeTerminalGaps = TRUE,
verbose = FALSE)[1L, 2L]
# UW <- unique(width(ali))
if (INCLUDESCORE) {
SCORE <- ScoreAlignment(myXStringSet = ali,
substitutionMatrix = substitutionMatrix)
}
CType <- "NT"
# if (m2 >= 2L) {
# print(f1)
# print(f2)
# print(PID)
# print(CType)
# }
} # end if else statement for coding / non
# assign the dist
# update the count
if (Criteria == "PID") {
CHECK <- PID
} else if (Criteria == "Score" &
INCLUDESCORE) {
CHECK <- SCORE
}
if (CHECK < Floor) {
# the current PID does not meet inclusion criteria
# exit the while loop and move to the next position in dr6
Continue <- FALSE
} else {
# PID meets inclusion criteria
# assign integers to result vectors
p1placeholder[Count] <- f1
p2placeholder[Count] <- f2
p1FeatureLength[Count] <- width(NTFeatures01[f1 - ci1lower + 1L])
p2FeatureLength[Count] <- width(NTFeatures02[f2 - ci2lower + 1L])
PIDVector[Count] <- PID
if (INCLUDESCORE) {
SCOREVector[Count] <- SCORE
}
AType[Count] <- CType
# update f1 and f2
if (advanceID == 0L) {
# interior alignment, no forward movement allowed
Continue <- FALSE
} else if (advanceID == 1L) {
# advancing down the diagonal
f1 <- f1 - 1L
f2 <- f2 - 1L
} else if (advanceID == 2L) {
# advancing up the anti-diagonal
f1 <- f1 - 1L
f2 <- f2 + 1L
} else if (advanceID == 3L) {
# advancing up the diagonal
f1 <- f1 + 1L
f2 <- f2 + 1L
} else if (advanceID == 4L) {
# advancing down the anti-diagonal
f1 <- f1 + 1L
f2 <- f2 - 1L
} # end advancement if elses
# if (f1 == 1572 &
# f2 == 5750) {
# return(list(f1,
# f2,
# Count,
# m3,
# p1placeholder[Count],
# p2placeholder[Count],
# dr6))
# }
Count <- Count + 1L
if (Count >= VSize) {
# if Count exceeds VSize, increase size
VSize <- VSize * 2L
p1placeholder <- c(p1placeholder,
rep(NA_integer_,
times = VSize))
p2placeholder <- c(p2placeholder,
rep(NA_integer_,
times = VSize))
p1FeatureLength <- c(p1FeatureLength,
rep(NA_integer_,
times = VSize))
p2FeatureLength <- c(p2FeatureLength,
rep(NA_integer_,
times = VSize))
PIDVector <- c(PIDVector,
rep(NA_real_,
times = VSize))
SCOREVector <- c(SCOREVector,
rep(NA_real_,
times = VSize))
AType <- c(AType,
rep(NA_character_,
times = VSize))
} # end count size if statement
} # end PID check
} # end while loop
Continue <- TRUE
if (Verbose) {
setTxtProgressBar(pb = pBar,
value = m3 / L)
}
} # end of m3 loop through dr6
if (Verbose) {
cat("\n")
close(pBar)
}
if (any(!is.na(PIDVector))) {
L2 <- max(which(!is.na(PIDVector)))
p1placeholder <- p1placeholder[seq_len(L2)]
p2placeholder <- p2placeholder[seq_len(L2)]
p1FeatureLength <- p1FeatureLength[seq_len(L2)]
p2FeatureLength <- p2FeatureLength[seq_len(L2)]
PIDVector <- PIDVector[seq_len(L2)]
SCOREVector <- SCOREVector[seq_len(L2)]
AType <- AType[seq_len(L2)]
# if (m2 == 2L) {
# return(list(IMat,
# w1,
# w2,
# L2,
# p1placeholder,
# p2placeholder,
# p1FeatureLength,
# p2FeatureLength,
# PIDVector,
# SCOREVector,
# AType))
# }
if (INCLUDESCORE) {
Res[[m1]][[m2]] <- data.frame("p1" = paste(rep(w1,
times = L2),
rep(IMat[[m2]][1L, 2L],
times = L2),
p1placeholder,
sep = "_"),
"p2" = paste(rep(w2,
times = L2),
rep(IMat[[m2]][1L, 5L],
times = L2),
p2placeholder,
sep = "_"),
"ExactMatch" = rep(0L,
times = L2),
"TotalKmers" = rep(0L,
times = L2),
"MaxKmer" = rep(0L,
times = L2),
"Consensus" = rep(0,
times = L2),
"p1FeatureLength" = p1FeatureLength,
"p2FeatureLength" = p2FeatureLength,
"Adjacent" = rep(0L,
times = L2),
"TetDist" = rep(0,
times = L2),
"PID" = PIDVector,
"SCORE" = SCOREVector,
"PIDType" = AType,
"PredictedPID" = rep(NA_real_,
times = L2),
stringsAsFactors = FALSE)
} else {
Res[[m1]][[m2]] <- data.frame("p1" = paste(rep(w1,
times = L2),
rep(IMat[[m2]][1L, 2L],
times = L2),
p1placeholder,
sep = "_"),
"p2" = paste(rep(w2,
times = L2),
rep(IMat[[m2]][1L, 5L],
times = L2),
p2placeholder,
sep = "_"),
"ExactMatch" = rep(0L,
times = L2),
"TotalKmers" = rep(0L,
times = L2),
"MaxKmer" = rep(0L,
times = L2),
"Consensus" = rep(0,
times = L2),
"p1FeatureLength" = p1FeatureLength,
"p2FeatureLength" = p2FeatureLength,
"Adjacent" = rep(0L,
times = L2),
"TetDist" = rep(0,
times = L2),
"PID" = PIDVector,
"PIDType" = AType,
"PredictedPID" = rep(NA_real_,
times = L2),
stringsAsFactors = FALSE)
}
} else {
# do nothing
}
} # end m2 loop
}
Res <- unlist(Res,
recursive = FALSE)
Res <- do.call(rbind,
Res)
if (!is.null(Res)) {
# if Res is not NULL
# check for duplicates
if (nrow(Res) > 1L) {
IDS <- paste(Res$p1,
Res$p2,
sep = "_")
check1 <- !duplicated(IDS)
Res <- Res[check1, , drop = FALSE]
if (nrow(Res) > 0L) {
IDS <- paste(Res$p1,
Res$p2,
sep = "_")
check2 <- !(IDS %in% POIDs)
Res <- Res[check2, , drop = FALSE]
}
}
} else {
# no new pairs were discovered
if ("SCORE" %in% colnames(Pairs)) {
Res <- data.frame("p1" = character(0L),
"p2" = character(0L),
"ExactMatch" = integer(0L),
"TotalKmers" = integer(0L),
"MaxKmer" = integer(0L),
"Consensus" = integer(0L),
"p1FeatureLength" = integer(0L),
"p2FeatureLength" = integer(0L),
"Adjacent" = integer(0L),
"TetDist" = integer(0L),
"PID" = numeric(0L),
"SCORE" = numeric(0L),
"PIDType" = character(0L),
"PredictedPID" = numeric(0L),
stringsAsFactors = FALSE)
} else {
Res <- data.frame("p1" = character(0L),
"p2" = character(0L),
"ExactMatch" = integer(0L),
"TotalKmers" = integer(0L),
"MaxKmer" = integer(0L),
"Consensus" = integer(0L),
"p1FeatureLength" = integer(0L),
"p2FeatureLength" = integer(0L),
"Adjacent" = integer(0L),
"TetDist" = integer(0L),
"PID" = numeric(0L),
"PIDType" = character(0L),
"PredictedPID" = numeric(0L),
stringsAsFactors = FALSE)
}
}
if (NewPairsOnly) {
if (Verbose) {
TimeEnd <- Sys.time()
print(TimeEnd - TimeStart)
}
return(Res)
} else {
if (nrow(Res) > 0L) {
FeaturesMat2 <- do.call(rbind,
strsplit(x = paste(Res$p1,
Res$p2,
sep = "_"),
split = "_",
fixed = TRUE))
FeaturesMat2 <- data.frame("g1" = as.integer(FeaturesMat2[, 1L]),
"i1" = as.integer(FeaturesMat2[, 2L]),
"f1" = as.integer(FeaturesMat2[, 3L]),
"g2" = as.integer(FeaturesMat2[, 4L]),
"i2" = as.integer(FeaturesMat2[, 5L]),
"f2" = as.integer(FeaturesMat2[, 6L]))
FeaturesMat3 <- rbind(FeaturesMat,
FeaturesMat2)
o1 <- order(FeaturesMat3[, 4L],
FeaturesMat3[, 1L],
FeaturesMat3[, 3L])
Res2 <- rbind(Pairs,
Res)
Res2 <- Res2[o1, ]
rownames(Res2) <- NULL
if (Verbose) {
TimeEnd <- Sys.time()
print(TimeEnd - TimeStart)
}
return(Res2)
} else {
if (Verbose) {
TimeEnd <- Sys.time()
print(TimeEnd - TimeStart)
}
return(Pairs)
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.