contextendR: Analyse genomic data with regard to sequence contexts

Documented in encode_seqs

#' Encode sequences into binyary format
#'
#' Split data into cytosines and thymines and encode mutations
#' and sequence contexts
#'
#' @param position_collection.gr GRanges object. With mutation types and sequence contexts
#' @param kmers Character vector. Kmers to encode
#' @export
encode_seqs <- function(position_collection.gr, kmers) {
  log_debug("In function: `encode_seqs`")
  position_collection.gr$mut_pyrbased = muts_encode(position_collection.gr$nucl_pyrbased, position_collection.gr$variant_pyrbased)
  position_collection.gr = verify_input_encodeSeqs(position_collection.gr, kmers)

  kmers_encoded    = binary_enc(position_collection.gr$sequence_pyrbased, kmers, as_equal = FALSE)

  cytosine_ix       = position_collection.gr$nucl_pyrbased == 'C'
  thymine_ix        = position_collection.gr$nucl_pyrbased == 'T'

  trinucs_c_encoded = binary_enc(position_collection.gr$trinuc_pyrbased[cytosine_ix], trinucs.c, as_equal = TRUE)
  trinucs_t_encoded = binary_enc(position_collection.gr$trinuc_pyrbased[thymine_ix],  trinucs.t, as_equal = TRUE)

  return(list(
    cytosines = list(mutations = relevel(factor(position_collection.gr$mut_pyrbased[cytosine_ix]), ref = 'NA'),
                     contexts = cbind(trinucs_c_encoded, kmers_encoded[cytosine_ix,] * 1)),
    thymines  = list(mutations = relevel(factor(position_collection.gr$mut_pyrbased[thymine_ix]), ref = 'NA'),
                     contexts = cbind(trinucs_t_encoded, kmers_encoded[thymine_ix,] * 1))
  ))
}

muts_encode <- function(ref_base, var_base) {
  pyrimidines = c("C", "T"); pyrimidines = c(pyrimidines, tolower(pyrimidines))
  nucs        = c("A", "C", "G", "T"); nucs = c(nucs, tolower(nucs))
  
  is_na       = is.na(var_base)
  is_cytosine = ref_base == 'C'
  is_thymine  = ref_base == 'T'
  
  enc = character(length(ref_base))
  enc[is_na] = 'NA'
  enc[!is_na & is_cytosine] = paste0('C>', var_base[!is_na & is_cytosine])
  enc[!is_na & is_thymine]  = paste0('T>', var_base[!is_na & is_thymine])
 
  return(enc)
}

verify_input_encodeSeqs <- function(position_collection.gr, kmers) {
  # Classes
  handle_error(class(position_collection.gr)[1] == 'GRanges', "Input `position_collection.gr` need to be of class 'GRanges'")
  handle_error(class(kmers) == 'character', "Input `kmers` need to be of class 'character'")

  # Columns
  pc_values = position_collection.gr@elementMetadata
  cnames = colnames(pc_values)
  handle_error("sequence_pyrbased" %in% cnames, "The column 'sequence_pyrbased' must exist in input `position_collection.gr`")
  handle_error("nucl_pyrbased"     %in% cnames, "The column 'nucl_pyrbased' must exist in input `position_collection.gr`")
  handle_error("mut_pyrbased"      %in% cnames, "The column 'mut_pyrbased' must exist in input `position_collection.gr`")
  handle_error("trinuc_pyrbased"   %in% cnames, "The column 'trinuc_pyrbased' must exist in input `position_collection.gr`")

  # Values
  handle_error(all(nchar(pc_values$nucl_pyrbased) == 1), "The reference base may only contain single nucleotides")
  handle_error(all(pc_values$nucl_pyrbased %in% c("C", "T")), "Only cytosines and thymines can be pyrimidines (nucl_pyrbased column contains other stuff)")
  handle_error(all(nchar(pc_values$trinuc_pyrbased) == 3), "All trinucleotides must be of length 3")

  # Sequences
  has_ambiguous = stringr::str_detect(position_collection.gr$sequence_pyrbased, '[NYRWSKMDVHB]')
  warn.msg =  paste0("encode_seqs: The supplied sequences have ambiguous nucleotides in them, disregarding those\n",
                     "Number of remaining sequences is: ", sum(!(has_ambiguous)))
  correct_ambiguous = handle_warning(sum(has_ambiguous) == 0,
                                         warn.msg)
  if (correct_ambiguous) {
    return(position_collection.gr[!(has_ambiguous)])
  }
  position_collection.gr
}