#' Find the vector-insert junctions at the beginning and end of a VDV read
#'
#' The function does the following:
#' \enumerate{
#' \item At both ends of the read, search for the restriction site
#' at the position where the vector-insert junction is expected to be found
#' \item If the restriction site is not found, then search for the vector
#' sequence just adjacent to the restriction site in a slighly wider window
#' \item (optional) at these junctions, replace the observed vector sequence
#' by the full (and true) vector sequence
#' }
#'
#' @param ReadName character string. Name of the read
#' @param ReadDNA A DNAString or DNAStringSet with the read sequence
#' @param ReadVecAlign Table with the Blast results from aligning the vector on the read
#' @param RestrictionSite Character string in the for "G^AATTC" indicating the sequence and the cut site
#' @param VectorSequence A DNAString or DNAStringSet of length 1 with the vector sequence starting and ending
#' with the full sequence of the restriction site used for cloning
#' @param UnalignedVectorLength Integer. If more than \code{UnalignedVectorLength}bp of expected vector sequence
#' is not correctly aligned at the vector-insert junction,
#' the function will return a message
#' @param SideSeqSearch Integer. If the expected restriction site is not found at a vector-insert junction
#' then the algorithm will try to search for the vector sequence of length
#' \code{SideSeqSearch} bp that is adjacent to the restriction site
#' (Default to 10 bp)
#' @param replaceVectorSequence Logical. If TRUE, the function will replace the vector sequence in the read
#' by the true vector sequence, using the junction has been identified.
#' If no junction is identified, then no sequence is replaced
#'
#' @importFrom magrittr %>%
#' @importFrom dplyr filter top_n mutate pull
#' @importFrom rlang .data
#' @importFrom Biostrings subseq matchPattern reverseComplement
#' @importFrom methods is as
#' @importFrom BiocGenerics start end width
#' @importClassesFrom Biostrings DNAString DNAStringSet
#'
#' @export
#'
#' @return a list with the following elements:
#' \itemize{
#' \item{"ReadName"}{Name of the read}
#' \item{"Strand"}{"Strand of the read (based on vector alignment)}
#' \item{"InsertStart"}{Location of the first base of insert sequence (if NA, no vector-insert junction has been detected)}
#' \item{"InsertEnd"}{Location of the last base of insert sequence (if NA, no vector-insert junction has been detected)}
#' \item{"correctedRead"}{(Optional) if \code{replaceVectorSequence} is TRUE, this sequence contains the true vector sequence in the read}
#' }
#'
#' @examples
#' # Some dummy sequences (use paste0 for clarity / comparison of sequences):
#' ## Vector sequence start and ends with the HindIII site used for cloning ("A^AGCTT")
#' vector <- Biostrings::DNAString(paste0(
#' "AAGCTTTATTAAGACACCCGGTATGCTTCAGGATCGTTCGGACTAA",
#' "ACCGTAACTGCGATATTTTAGGCGTGTTACAAGCTT"))
#' read <- Biostrings::DNAString(paste0(
#' "ACCGTAACTGCGATATTTTAGGCGTGTTACAAGCTT",
#' "GCTAGATCGCGCGATATGTG",
#' "AAGCTTTATTAAGACACCCGGTATGCTTCAGGATCGTTCGGACTAA"))
#' noisyread <- Biostrings::DNAString(paste0(
#' "ACCGTAACTGCGTTTTTTTAGGCGTGTTACAAGCTT",
#' "GCTAAATCGCGCGCTATGTG",
#' "GGGCTTTATTAAGACACCCGGTATGCTTTCAGGATCGTTCGGACTAA"))
#' # Import the blastn results for these sequences (alignment of vector on the reads):
#' readaln <- readBlast(system.file("extdata",
#' "juncEx_vec_read.res",
#' package = "NanoBAC"))
#' noisyreadaln <- readBlast(system.file("extdata",
#' "juncEx_vec_noisyread.res",
#' package = "NanoBAC"))
#' # Get the coordinates of the insert sequence:
#' findVDVjunctions("read", read, readaln, "A^AGCTT", vector,
#' replaceVectorSequence = FALSE)
#' # With the noisy read, the restriction site is not found at
#' # the end of the read but an adjacent sequence is:
#' findVDVjunctions("noisyread", noisyread, noisyreadaln, "A^AGCTT", vector,
#' replaceVectorSequence = FALSE)
#' # Get the read sequence after replacing the vector sequence
#' # by the full vector sequence on both sides
#' findVDVjunctions("noisyread", noisyread, noisyreadaln, "A^AGCTT", vector,
#' replaceVectorSequence = TRUE)$correctedRead
findVDVjunctions <- function(ReadName = NULL,
ReadDNA,
ReadVecAlign,
RestrictionSite = "G^AATTC",
VectorSequence = NULL,
UnalignedVectorLength = 1000L,
SideSeqSearch = 10L,
replaceVectorSequence = TRUE) {
#---------------------
#Test arguments and import data if necessary
#---------------------
## ReadName
if (is.null(ReadName)) {
stop("Please provide a read name in the ReadName argument (no default)")
} else {
rnn <- as.character(ReadName)
}
## ReadDNA
if (!is(ReadDNA, "DNAStringSet") && !is(ReadDNA, "DNAString")) {
stop("ReadDNA should be a DNAString or a DNAStringSet")
}
## Make ReadDNA a DNAStringSet of length 1 with names(ReadDNA)==ReadName
if (is(ReadDNA, "DNAString")) {
ReadDNA <- as(ReadDNA, "DNAStringSet")
names(ReadDNA) <- rnn
} else {
if (!(rnn %in% names(ReadDNA))) {
stop("Read name not found in ReadDNA")
} else {
ReadDNA <- ReadDNA[rnn]
}
}
## Filter read/vector alignments. Throw a message if there are >2 alignments
## If this message appears for many reads, then consider filtering Blast results with SelectSingularBlastALN function
if (!is.data.frame(ReadVecAlign)) {
stop("ReadVecAlign is not a data frame")
}
if (!(rnn %in% (ReadVecAlign %>%
dplyr::pull(.data$SubjectACC)))) {
stop("Read name not found in ReadVecAlign")
} else {
ReadVecAlign <- ReadVecAlign %>%
dplyr::filter(.data$SubjectACC == rnn)
if (nrow(ReadVecAlign) > 2) {
message(rnn, ": found ",
nrow(ReadVecAlign),
" alignments. Should ReadVecAlign be filtered?")
}
}
## Vector sequence and restriction site
### The vector sequence is expected to start by the restriction site
if (is.null(VectorSequence)) {
stop(strwrap(prefix = " ", initial = "",
"Please provide the vector sequence, starting with
the restriction site used for cloning"))
}
if (is.character(VectorSequence)) {
VectorSequence <- as(VectorSequence, "DNAString")
}
if (is(VectorSequence, "DNAStringSet")) {
VectorSequence <- VectorSequence[[1]]
}
### Get the vector size
VectorSize <- length(VectorSequence)
### Check that vector sequence starts and end with restriction site
CutSiteLocation <- which(strsplit(RestrictionSite, "")[[1]] == "^")
if (!length(CutSiteLocation) == 1) {
stop(strwrap(prefix = " ", initial = "",
"Please provide RestrictionSite in the following format
(ex for EcoRI): G^AATTC"))
}
RestrictionSiteLength <- nchar(RestrictionSite) - 1
BasesBeforeCut <- CutSiteLocation - 1
BasesAfterCut <- RestrictionSiteLength - BasesBeforeCut
RestrictionSiteSequence <- as(gsub("\\^", "", RestrictionSite), "DNAString")
if (!identical(toString(RestrictionSiteSequence),
toString(Biostrings::subseq(VectorSequence,
1,
length(RestrictionSiteSequence))))) {
stop("Vector sequence does not start with restriction site")
}
if (!identical(toString(RestrictionSiteSequence),
toString(Biostrings::subseq(VectorSequence,
VectorSize-RestrictionSiteLength+1,
VectorSize)))) {
stop("Vector sequence does not end with restriction site")
}
# Get the strand of the read
readStrand <- unique(ReadVecAlign$Strand)
if (length(readStrand)!=1) {
stop(rnn, " has alignment on both strands. Read cannot be analyzed")
}
#---------------------
#Initial search for potential vector-insert borders at the beginning and end of the reads
#---------------------
## Detect if the aligments are at the beginning or at the end of the read.
## Add this info to the alignment table ReadVecAlign
ReadVecAlign <- ReadVecAlign %>%
dplyr::mutate(ReadSide = ifelse(pmax(.data$SubjectStart,
.data$SubjectEnd) <
length(ReadDNA[[1]])/2,
"Begin", "End"))
## Identify the theoretical location of the beginning/end of the insert sequence (based on alignment positions only):
### For reads with alignment on the "+" strand:
if (readStrand == "+") {
### 1) find the largest plasmid coordinate that aligns at the beginning of the read
### then estimate where the vector-insert border should be if the alignment went all the way to the end of the vector (InsertBorder)
### and obtain the distance between the end of the vector and where the alignment stops (DistToVectorEnd)
beginAlign <- ReadVecAlign %>%
dplyr::filter(.data$ReadSide == "Begin") %>%
dplyr::top_n(1, .data$QueryEnd) %>%
dplyr::mutate(InsertBorder = .data$SubjectEnd + 1 +
VectorSize - .data$QueryEnd,
DistToVectorEnd = (VectorSize -
.data$QueryEnd))
### 2) find the smallest plasmid coordinate that aligns at the end of the read
### then estimate where the insert-vector border should be if the alignment started at the first base of the vector (InsertBorder)
### and obtain the distance between the start of the vector (1) and where the alignment starts (DistToVectorEnd)
endAlign <- ReadVecAlign %>%
dplyr::filter(.data$ReadSide == "End") %>%
dplyr::top_n(1, -.data$QueryStart) %>%
dplyr::mutate(InsertBorder = .data$SubjectStart - 1 -
.data$QueryStart + 1,
DistToVectorEnd = .data$QueryStart - 1)
} else {
## For reads with alignment on the "-" strand:
### 1) find the smallest vector coordinate that aligns at the beginning of the read
beginAlign <- ReadVecAlign %>%
dplyr::filter(.data$ReadSide == "Begin") %>%
dplyr::top_n(1,-.data$QueryStart) %>%
dplyr::mutate(InsertBorder = .data$SubjectStart + 1 +
.data$QueryStart - 1,
DistToVectorEnd = .data$QueryStart - 1)
### 2) find the largest vector coordinate that aligns at the end of the read
endAlign <- ReadVecAlign %>%
filter(.data$ReadSide == "End") %>%
top_n(1, .data$QueryEnd) %>%
mutate(InsertBorder = .data$SubjectEnd - 1 -
VectorSize + .data$QueryEnd,
DistToVectorEnd = (VectorSize -
.data$QueryEnd))
}
## If the alignment does not cover at all the first (resp. last) UnalignedVectorLength bp of the vector sequence, throw a message
if (dplyr::pull(beginAlign, .data$DistToVectorEnd) >= UnalignedVectorLength) {
message(rnn,
": at the beginning of the read, more than ",
UnalignedVectorLength,
"bp of expected vector sequence are not aligned")
}
if (dplyr::pull(endAlign, .data$DistToVectorEnd) >= UnalignedVectorLength) {
message(rnn,
": at the end of the read, more than ",
UnalignedVectorLength,
"bp of expected vector sequence are not aligned")
}
# Search for the restriction site at the predicted vector-insert junction (i.e. in a window around InsertBorder) allowing only 1 mismatch/indel
# If found, the location of the restriction site determines the location of the vector-insert junction
# If the restriction sit is not found or found multiple times, we search for the vector sequence (currently 10bp) adjacent to the restriction site, allowing 20% mismatch/indel and using a wider search window.
# If not found then JuncAtBegin wil be NA
#----------------------------------------
# Search the vector-insert junction at the beginning of the read:
#----------------------------------------
PointOfSearch <- beginAlign %>%
dplyr::pull(.data$InsertBorder) #This is the position of the first base following the restriction site
# We search for the restriction site in a window covering +/-15bp or +/- 5% of DistToVectorEnd if larger
# i.e. the farther we are from vector end, the wider the window:
WidthOfwindow <- max(c(15,
round(0.05*(beginAlign %>%
dplyr::pull(.data$DistToVectorEnd)),
0)))
# Sequence to search
SearchStart <- PointOfSearch - RestrictionSiteLength - WidthOfwindow
SearchEnd <- PointOfSearch + WidthOfwindow - 1
SeqToSearch <- Biostrings::subseq(ReadDNA[[rnn]],
SearchStart,
SearchEnd) # +/- WidthOfwindow bases on each side of the restriction motif
# Search for the restriction site
findRestrSite <- Biostrings::matchPattern(RestrictionSiteSequence,
SeqToSearch,
max.mismatch = 1,
with.indels= TRUE)
#Define JuncAtBegin which is the first base of the insert
if (length(findRestrSite) != 1) {
JuncAtBegin <- NA
if (length(findRestrSite) == 0) {
message(rnn,
strwrap(prefix = " ", initial = "",
": restriction site not found near the expected
position at the beginning of the read"))
} else {
message(rnn,
strwrap(prefix = " ", initial = "",
": multiple restriction sites found near the
expected position at the beginning of the read"))
}
} else {
JuncAtBegin <- PointOfSearch - RestrictionSiteLength -
WidthOfwindow + end(findRestrSite)
}
# If the restriction site is not found:
# then try to search for the vector sequence located just upstream of the restriction site
if (is.na(JuncAtBegin)) {
SideSeqLength = as.integer(SideSeqSearch)
message(rnn, ": Searching for a ", SideSeqLength,
strwrap(prefix = " ", initial = "",
"bp sequence adjacent to the restriction site near the
predicted vector-insert junction
at the beginning of the read"))
MisMatchTolerance <- ceiling(0.2 * SideSeqLength)
## Increase the WidthOfWindow by the max of 2*SideSeqLength or +50%
WidthOfwindow <- max(c(ceiling(1.5 * WidthOfwindow),
ceiling(WidthOfwindow + 2 * SideSeqLength)))
## Get the corresponding larger sequence on the read that is supposed to contain the junction between vector and insert
SeqToSearch <- subseq(ReadDNA[[rnn]],
PointOfSearch -
RestrictionSiteLength -
WidthOfwindow,
PointOfSearch + WidthOfwindow - 1)
## Get the sequence located upstream of the restriction site
if (readStrand == "+") {
MotifToSearch <- Biostrings::subseq(VectorSequence,
VectorSize -
RestrictionSiteLength -
SideSeqLength + 1,
VectorSize -
RestrictionSiteLength)
#~ DownSeqToSearch <- subseq(VectorSequence, RestrictionSiteLength+1, RestrictionSiteLength + SideSeqLength)
} else {
MotifToSearch <- Biostrings::reverseComplement(
Biostrings::subseq(VectorSequence,
RestrictionSiteLength + 1,
RestrictionSiteLength + SideSeqLength))
#~ DownSeqToSearch <- reverseComplement(subseq(VectorSequence, VectorSize - RestrictionSiteLength - SideSeqLength + 1, VectorSize - RestrictionSiteLength))
}
## Search for the motif around the predicted vector-insert junction
findAdjacentSequence <- Biostrings::matchPattern(
MotifToSearch,
SeqToSearch,
max.mismatch = MisMatchTolerance,
with.indels= TRUE)
## If the motif is found only once, then set the vector-insert junction based on this
if (length(findAdjacentSequence) != 1) {
JuncAtBegin <- NA
if (length(findAdjacentSequence) == 0) {
message(rnn,
strwrap(prefix = " ", initial = "",
": sequence adjacent to the restriction site
not found near the expected position
at the beginning of the read"))
} else {
message(rnn,
strwrap(prefix = " ", initial = "",
": sequence adjacent to the restriction sites
found multiple times near the expected position
at the beginning of the read"))
}
} else {
message(rnn,
strwrap(prefix = " ", initial = "",
": sequence found at the beginning of the read!
Predicted vector-insert junction available"))
JuncAtBegin <- PointOfSearch -
RestrictionSiteLength -
WidthOfwindow +
end(findAdjacentSequence) +
RestrictionSiteLength #Predicted first base of the insert
}
}
#----------------------------------------
# Search the vector-insert junction at the end of the read:
#----------------------------------------
##PointOfSearch is the position of the first base preceding the restriction site
PointOfSearch <- endAlign %>% dplyr::pull(.data$InsertBorder)
## Search for the restriction site in a window covering +/- 5% of DistToVectorEnd
## the farther we are from vector end, the wider the window
WidthOfwindow <- max(c(15,
round(0.05 * (endAlign %>%
dplyr::pull(.data$DistToVectorEnd)),
0)))
## Sequence to search
## defined as +/- WidthOfwindow bases on each side of the expected position of the restriction site
SeqToSearch <- Biostrings::subseq(ReadDNA[[rnn]],
PointOfSearch - WidthOfwindow + 1,
PointOfSearch + RestrictionSiteLength +
WidthOfwindow)
## Search for the restriction site
findRestrSite <- Biostrings::matchPattern(RestrictionSiteSequence,
SeqToSearch,
max.mismatch = 1,
with.indels = TRUE)
# Define JuncAtBegin which is the last base of the insert
if (length(findRestrSite) != 1) {
JuncAtEnd <- NA
if (length(findRestrSite) == 0) {
message(rnn,
strwrap(prefix = " ", initial = "",
": restriction site not found near the
expected position at the end of the read"))
} else {
message(rnn,
strwrap(prefix = " ", initial = "",
": multiple restriction sites found near the
expected position at the end of the read"))
}
} else {
JuncAtEnd <- PointOfSearch - WidthOfwindow + start(findRestrSite) - 1
}
# If search for the restriction site fails
# then try to search for the vector sequence located just downstream of the restriction site
if (is.na(JuncAtEnd)) {
SideSeqLength = as.integer(SideSeqSearch)
message(rnn,
": Searching for a ",
SideSeqLength,
strwrap(prefix = " ", initial = "",
"bp sequence adjacent to the restriction site
near the predicted vector-insert junction
at the end of the read"))
MisMatchTolerance <- ceiling(0.2 * SideSeqLength)
## Increase the WidthOfWindow by the max of 2*SideSeqLength or +50%
WidthOfwindow <- max(c(ceiling(1.5 * WidthOfwindow),
ceiling(WidthOfwindow + 2 * SideSeqLength)))
## Get the corresponding larger sequence on the read that is supposed to contain the vector-insert junction
SeqToSearch <- Biostrings::subseq(ReadDNA[[rnn]],
PointOfSearch -
WidthOfwindow + 1,
PointOfSearch +
RestrictionSiteLength +
WidthOfwindow)
## Get the sequence located upstream of the restriction site
if (readStrand == "+") {
MotifToSearch <- Biostrings::subseq(VectorSequence,
RestrictionSiteLength + 1,
RestrictionSiteLength +
SideSeqLength)
} else {
MotifToSearch <- Biostrings::reverseComplement(
Biostrings::subseq(VectorSequence,
VectorSize - RestrictionSiteLength -
SideSeqLength + 1,
VectorSize - RestrictionSiteLength))
}
## Search for the motif around the predicted vector-insert junction
findAdjacentSequence <- Biostrings::matchPattern(
MotifToSearch,
SeqToSearch,
max.mismatch = MisMatchTolerance,
with.indels= TRUE)
## If the motif is found only once, then set the vector-insert junction based on this
## JuncAtEnd is the position of the last base of the insert
if (length(findAdjacentSequence) != 1) {
JuncAtEnd <- NA
if (length(findAdjacentSequence) == 0) {
message(rnn,
strwrap(prefix = " ", initial = "",
": sequence adjacent to the restriction site
not found near the expected position
at the end of the read"))
} else {
message(rnn,
strwrap(prefix = " ", initial = "",
": sequence adjacent to the restriction sites
found multiple times near the expected position
at the end of the read"))
}
} else {
message(rnn,
strwrap(prefix = " ", initial = "",
": sequence found at the end of the read!
Predicted vector-insert junction available"))
JuncAtEnd <- PointOfSearch -
WidthOfwindow +
start(findAdjacentSequence) - 1 -
RestrictionSiteLength
}
}
#----------------------------------------
# Replace the vector sequence
#----------------------------------------
if (replaceVectorSequence) {
## If no junction is found at beginning and end of the read:
## then don't replace anything
if (is.na(JuncAtBegin) && is.na(JuncAtEnd)) {
newRead <- ReadDNA[[rnn]]
}
## If no junction is found at the beginning of the read:
## Then replace only one side
if (is.na(JuncAtBegin) && !is.na(JuncAtEnd)) {
if (readStrand == "+") {
newRead <- Biostrings::DNAString(paste0(
Biostrings::subseq(ReadDNA[[rnn]], 1, JuncAtEnd),
VectorSequence
))
} else {
newRead <- Biostrings::DNAString(paste0(
VectorSequence,
Biostrings::reverseComplement(Biostrings::subseq(ReadDNA[[rnn]],
1,
JuncAtEnd))
))
}
}
## If no junction is found at the end of the read:
## Then replace only one side
if (!is.na(JuncAtBegin) && is.na(JuncAtEnd)) {
if (readStrand == "+") {
newRead <- Biostrings::DNAString(paste0(
VectorSequence,
Biostrings::subseq(ReadDNA[[rnn]],
JuncAtBegin,
length(ReadDNA[[rnn]]))
))
} else {
newRead <- Biostrings::DNAString(paste0(
Biostrings::reverseComplement(Biostrings::subseq(
ReadDNA[[rnn]],
JuncAtBegin,
length(ReadDNA[[rnn]]))),
VectorSequence
))
}
}
## If a junction is found at both the beginning and the end of the read
## Then replace the vector sequence on both sides
if (!is.na(JuncAtBegin) && !is.na(JuncAtEnd)) {
if (readStrand == "+") {
newRead <- Biostrings::DNAString(paste0(
VectorSequence,
Biostrings::subseq(ReadDNA[[rnn]],
JuncAtBegin,
JuncAtEnd),
VectorSequence
))
} else {
newRead <- Biostrings::DNAString(paste0(
VectorSequence,
Biostrings::reverseComplement(Biostrings::subseq(
ReadDNA[[rnn]],
JuncAtBegin,
JuncAtEnd)),
VectorSequence
))
}
}
}
# For each read, report the base at which the junction occurs with the vector and where it ends
if (replaceVectorSequence) {
return(
list("ReadName" = rnn,
"Strand" = readStrand,
"InsertStart" = JuncAtBegin,
"InsertEnd" = JuncAtEnd,
"correctedRead" = newRead)
)
} else {
return(
list("ReadName" = rnn,
"Strand" = readStrand,
"InsertStart" = JuncAtBegin,
"InsertEnd" = JuncAtEnd)
)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.