#' Encode sequences into binyary format
#'
#' Split data into cytosines and thymines and encode mutations
#' and sequence contexts
#'
#' @param position_collection.gr GRanges object. With mutation types and sequence contexts
#' @param kmers Character vector. Kmers to encode
#' @export
encode_seqs <- function(position_collection.gr, kmers) {
log_debug("In function: `encode_seqs`")
position_collection.gr$mut_pyrbased = muts_encode(position_collection.gr$nucl_pyrbased, position_collection.gr$variant_pyrbased)
position_collection.gr = verify_input_encodeSeqs(position_collection.gr, kmers)
kmers_encoded = binary_enc(position_collection.gr$sequence_pyrbased, kmers, as_equal = FALSE)
cytosine_ix = position_collection.gr$nucl_pyrbased == 'C'
thymine_ix = position_collection.gr$nucl_pyrbased == 'T'
trinucs_c_encoded = binary_enc(position_collection.gr$trinuc_pyrbased[cytosine_ix], trinucs.c, as_equal = TRUE)
trinucs_t_encoded = binary_enc(position_collection.gr$trinuc_pyrbased[thymine_ix], trinucs.t, as_equal = TRUE)
return(list(
cytosines = list(mutations = relevel(factor(position_collection.gr$mut_pyrbased[cytosine_ix]), ref = 'NA'),
contexts = cbind(trinucs_c_encoded, kmers_encoded[cytosine_ix,] * 1)),
thymines = list(mutations = relevel(factor(position_collection.gr$mut_pyrbased[thymine_ix]), ref = 'NA'),
contexts = cbind(trinucs_t_encoded, kmers_encoded[thymine_ix,] * 1))
))
}
muts_encode <- function(ref_base, var_base) {
pyrimidines = c("C", "T"); pyrimidines = c(pyrimidines, tolower(pyrimidines))
nucs = c("A", "C", "G", "T"); nucs = c(nucs, tolower(nucs))
is_na = is.na(var_base)
is_cytosine = ref_base == 'C'
is_thymine = ref_base == 'T'
enc = character(length(ref_base))
enc[is_na] = 'NA'
enc[!is_na & is_cytosine] = paste0('C>', var_base[!is_na & is_cytosine])
enc[!is_na & is_thymine] = paste0('T>', var_base[!is_na & is_thymine])
return(enc)
}
verify_input_encodeSeqs <- function(position_collection.gr, kmers) {
# Classes
handle_error(class(position_collection.gr)[1] == 'GRanges', "Input `position_collection.gr` need to be of class 'GRanges'")
handle_error(class(kmers) == 'character', "Input `kmers` need to be of class 'character'")
# Columns
pc_values = position_collection.gr@elementMetadata
cnames = colnames(pc_values)
handle_error("sequence_pyrbased" %in% cnames, "The column 'sequence_pyrbased' must exist in input `position_collection.gr`")
handle_error("nucl_pyrbased" %in% cnames, "The column 'nucl_pyrbased' must exist in input `position_collection.gr`")
handle_error("mut_pyrbased" %in% cnames, "The column 'mut_pyrbased' must exist in input `position_collection.gr`")
handle_error("trinuc_pyrbased" %in% cnames, "The column 'trinuc_pyrbased' must exist in input `position_collection.gr`")
# Values
handle_error(all(nchar(pc_values$nucl_pyrbased) == 1), "The reference base may only contain single nucleotides")
handle_error(all(pc_values$nucl_pyrbased %in% c("C", "T")), "Only cytosines and thymines can be pyrimidines (nucl_pyrbased column contains other stuff)")
handle_error(all(nchar(pc_values$trinuc_pyrbased) == 3), "All trinucleotides must be of length 3")
# Sequences
has_ambiguous = stringr::str_detect(position_collection.gr$sequence_pyrbased, '[NYRWSKMDVHB]')
warn.msg = paste0("encode_seqs: The supplied sequences have ambiguous nucleotides in them, disregarding those\n",
"Number of remaining sequences is: ", sum(!(has_ambiguous)))
correct_ambiguous = handle_warning(sum(has_ambiguous) == 0,
warn.msg)
if (correct_ambiguous) {
return(position_collection.gr[!(has_ambiguous)])
}
position_collection.gr
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.