R/get_context.R

Defines functions get_context

get_context <- function(x, bsgenome, flank_size = 10, .pos_shift = NULL) {
  log_debug("In function: `get_context`")
  handle_error(flank_size >= 1, "`get_context` :: Flank size must be greater or equal to 1")

  get_trinucs <- function(z) substr(z, flank_size, flank_size + 2)

  handle_error(length(unique(IRanges::width(x))) == 1, "`get_context` :: Some positions are intervals, need to have single positions as input")

  x$position = GenomicRanges::start(x)

  # Check that flanking regions doesn't violate chromosome boundaries
  flank_size_adjusted = flank_size + 1
  x <- GenomicRanges::resize(x, flank_size_adjusted, fix='start')
  x <- GenomicRanges::resize(x, flank_size * 2 + 1,  fix='end')

  invalid_small = (GenomicRanges::start(x) - flank_size) < 1
  if (any(invalid_small)) {
    log_warning("get_context: Some positions got flanking regions starting before the first nuc of chromosome, these will be removed")
    log_debug(paste("Small values:", paste(
      GenomicRanges::seqnames(x)[invalid_small],
      GenomicRanges::start(x)[invalid_small])))
    x <- x[!invalid_small]
  }

  upper_limits         =  genome_seqlengths = bsgenome@seqinfo@seqlengths
  names(upper_limits)  <- as.character(bsgenome@seqinfo@seqnames)
  invalid_large = GenomicRanges::end(x) > upper_limits[as.character(GenomicRanges::seqnames(x))]
  if (any(invalid_large, na.rm = TRUE)) {
    log_warning("get_context: Some positions got flanking regions ending after the last nuc of chromosome, these will be removed")
    log_debug(paste("Large values:", paste(
      GenomicRanges::seqnames(x)[invalid_large],
      GenomicRanges::start(x)[invalid_large])))
    x <- x[!invalid_large]
  }

  x_vals = GenomicRanges::values(x)

  # Obtain sequence context and choose the pyrimidine based strand
  x.seq         <- Biostrings::getSeq(bsgenome, x)
  x.seq_rc      <- Biostrings::reverseComplement(x.seq)

  x.ref_nuc     <- substr(x.seq, flank_size_adjusted, flank_size_adjusted)
  x.ref_nuc_rc  <- compl_map[x.ref_nuc]
  x.trinuc      <- substr(x.seq,    flank_size, flank_size + 2)
  x.trinuc_rc   <- substr(x.seq_rc, flank_size, flank_size + 2)

  if (!(is.null(.pos_shift))) {
    log_warning("Using shifted position for troubleshooting, results shouldn't make sense")
    x = GenomicRanges::shift(x, .pos_shift)
    x.seq    <- Biostrings::getSeq(bsgenome, x)
    x.seq_rc <- Biostrings::reverseComplement(x.seq)
  }

  x.pyr_strand  <- toupper(x.ref_nuc) %in% c("C", "T")
  BiocGenerics::strand(x) = ifelse(x.pyr_strand, '+', '-')

  x.pyr_nuc     <- ifelse(x.pyr_strand, x.ref_nuc, x.ref_nuc_rc)
  x.pyr_seq     <- ifelse(x.pyr_strand, x.seq,     x.seq_rc)
  x.pyr_trinuc  <- ifelse(x.pyr_strand, x.trinuc,  x.trinuc_rc)

  x_new_vals = cbind(x_vals, data.frame(
    sequence_pyrbased = x.pyr_seq,
    trinuc_pyrbased   = x.pyr_trinuc,
    nucl_pyrbased     = x.pyr_nuc,
    stringsAsFactors  = FALSE
  ))
  GenomicRanges::values(x) = x_new_vals
  return(x)
}
lindberg-m/contextendR documentation built on Jan. 8, 2022, 3:16 a.m.