#' Sequence scanner
#'
#' @description Scans over sequences finding changes around given patterns
#'
#' @details The scan_seq function simultaneously passes two sliding windows
#' along the ancestral and query sequences. The sliding window is of length 3,
#' corresponding to the potentially hypermutated position and the 2 downstream
#' positions. At each position, the size of the window is increased until it
#' covers 3 non-gap characters in the query sequence. If a G is located at the
#' first position of the window, the position is considered a position of
#' interest and the query sequence is inspected to classify it as either a
#' hypermutation or control position, incrementing either the num_potential_mut
#' variable or the num_potential_control variable. The query sequence is
#' checked next and if the G mutated to an A, then the tally of the number of
#' possible hypermutations (num_mut) or the number of control mutations
#' (num_control) is incremented.
#'
#' @param cons The ancestral sequences to compare against
#' @param the_seq The query sequence
#' @param fix_with Either false or a single letter. If not FALSE, then replace
#' the hypermutated base with the letter indicated.
#'
#' @return The return value from scan_seq is a list that contains the number of
#' mutated hypermutation and control positions, the total number of potential
#' hypermutation and control positions, the p-value of the one-sided Fischer
#' exact test, the (possibly corrected) query sequence and the data.frame that
#' catalogs each individual position.
#'
#' @examples
#' scan_seq(paste(as.character(hd_seqs[1][[1]]), collapse = ''), paste(as.character(hd_seqs[2][[1]]), collapse = ''))
#'
#' @export
scan_seq <- function(cons, the_seq, fix_with = FALSE){
if (fix_with != FALSE){
fix_with <- tolower(fix_with)
stopifnot(tolower(fix_with) %in% letters)
}
cons <- strsplit(toupper(cons), '')[[1]]
the_seq <- strsplit(toupper(the_seq), '')[[1]]
stopifnot(length(cons) == length(the_seq))
if( length(cons) < 3){
return(list(num_mut = 0,
num_potential_mut = 0,
num_control = 0,
num_potential_control = 0,
p_value = 1,
all_mut_pos = NULL,
the_seq = cons)
)
}
num_potential_mut <- 0
num_mut <- 0
num_potential_control <- 0
num_control <- 0
all_mut_pos <- NULL
for( window_start_i in 1:(length(cons) - 2) ) {
context_indx1 <- 1
context_indx2 <- 2
# Move the context forward if gaps are encountered to ensure that the
# context pattern is matched to the sequence that the APOBEC enzyme
# would have encountered.
while( as.character( the_seq[ window_start_i + context_indx1 ] ) == "-" ){
context_indx1 <- context_indx1 + 1
context_indx2 <- context_indx2 + 1
if (window_start_i + context_indx2 > length(cons)){
break
}
}
if (window_start_i + context_indx2 > length(cons)){
next
}
while( as.character( the_seq[ window_start_i + context_indx2 ] ) == "-" ){
context_indx2 <- context_indx2 + 1
if (window_start_i + context_indx2 > length(cons)){
break
}
}
if (window_start_i + context_indx2 > length(cons)){
next
}
# Check for hypermutated spots
if( ( cons[window_start_i + 0 ] == "G" ) &&
# Reference must mutate from G
( the_seq[window_start_i + context_indx1 ] %in% c( "A", "G" ) ) &&
# Context position 1 must match R = [AG] in query
( the_seq[window_start_i + context_indx2 ] %in% c( "A", "G", "T" ) ) ){
# Context position 2 must match D = [AGT] in query
num_potential_mut <- num_potential_mut + 1;
hyper_muted <- as.character( the_seq[ window_start_i + 0 ] ) == "A"
all_mut_pos <- rbind(all_mut_pos,
data.frame(pos = window_start_i,
base_in_query = as.character( the_seq[ window_start_i + 0 ] ),
full_seq = paste(as.character( the_seq[ window_start_i + c(0, context_indx1, context_indx2) ] ),
sep = '', collapse = ''),
cons_seq = paste(as.character( cons[ window_start_i + c(0, context_indx1, context_indx2) ] ),
sep = '', collapse = ''),
type = 'hyp',
muted = hyper_muted,
stringsAsFactors = F)
)
if( hyper_muted ) {
# If G -> A mutation occurred
num_mut <- num_mut + 1;
if ( fix_with != FALSE ) {
the_seq[ window_start_i ] <- fix_with
}
}
}
# Check for control spots
if( ( cons[ window_start_i + 0 ] == "G" ) &&
# Reference must mutate from G
( ( ( the_seq[ window_start_i + context_indx1 ] %in% c( "C", "T" ) ) &&
# Option 1 Context position 1 must match Y = [CT] in query
( the_seq[ window_start_i + context_indx2 ] %in% c( "A", "C", "G", "T" ) )) ||
# Option 1 Context position 2 must match N = [ACGT] in query
( ( the_seq[ window_start_i + context_indx1 ] %in% c( "A", "G" ) ) &&
# Option 2 Context position 1 must match R = [AG] in query
( the_seq[ window_start_i + context_indx2 ] ) == "C" ) ) ){
# Option 2 Context position 2 must match C in query
num_potential_control <- num_potential_control + 1;
control_muted <- as.character( the_seq[ window_start_i + 0 ] ) == "A"
all_mut_pos <- rbind(all_mut_pos,
data.frame(pos = window_start_i,
base_in_query = as.character( the_seq[ window_start_i + 0 ] ),
full_seq = paste(as.character( the_seq[ window_start_i + c(0, context_indx1, context_indx2) ] ),
sep = '', collapse = ''),
cons_seq = paste(as.character( cons[ window_start_i + c(0, context_indx1, context_indx2) ] ),
sep = '', collapse = ''),
type = 'con',
muted = control_muted,
stringsAsFactors = F)
)
if( control_muted ) {
# If G -> A mutation occureed
num_control <- num_control + 1;
}
}
} # for window_start_i
p_value <- fisher.test( matrix( c( num_control, ( num_potential_control - num_control ), num_mut, ( num_potential_mut - num_mut ) ), nrow = 2, byrow = T ), alternative = 'less' )$p.value;
return(list(num_mut = num_mut,
num_potential_mut = num_potential_mut,
num_control = num_control,
num_potential_control = num_potential_control,
p_value = p_value,
all_mut_pos = all_mut_pos,
the_seq = the_seq)
)
}
#' Wrapper for C++ implementation of scan_seq
#' @export
rcpp_scan_seq <- function(cons, the_seq, fix_with = FALSE){
if (length(cons) > 1){
cons <- toupper(paste(cons, collate = ''))
} else {
cons <- toupper(cons)
}
if (length(the_seq) > 1){
the_seq <- toupper(paste(the_seq, collate = ''))
} else {
the_seq <- toupper(the_seq)
}
stopifnot(nchar(cons) == nchar(the_seq))
if (fix_with != FALSE){
fix_with <- tolower(fix_with)
stopifnot(tolower(fix_with) %in% letters)
} else {
fix_with <- "FALSE"
}
result <- hypermutR:::rcpp_scan_seq_int(cons, the_seq, fix_with)
p_value <- fisher.test( matrix( c( result$num_control, ( result$num_potential_control - result$num_control ), result$num_mut, ( result$num_potential_mut - result$num_mut ) ), nrow = 2, byrow = T ), alternative = 'less' )$p.value;
all_mut_pos <- data.frame(
pos = result$all_mut_pos$pos,
base_in_query = result$all_mut_pos$base_in_query,
full_seq = result$all_mut_pos$full_seq,
cons_seq = result$all_mut_pos$cons_seq,
type = result$all_mut_pos$type,
muted = ifelse(result$all_mut_pos$muted == "yes", T, F),
stringsAsFactors = FALSE
)
if (nrow(all_mut_pos) == 0) {all_mut_pos <- NULL}
return(list(num_mut = result$num_mut,
num_potential_mut = result$num_potential_mut,
num_control = result$num_control,
num_potential_control = result$num_potential_control,
p_value = p_value,
all_mut_pos = all_mut_pos,
the_seq = strsplit(result$the_seq, '')[[1]])
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.