contextendR: Analyse genomic data with regard to sequence contexts

Documented in extend_positions

#' Add nucleotide positions and extract sequence contexts
#'
#' Add genomic positions to generate a dataset of mutated and non mutated nucleotide positions.
#' Supply known variants either thruogh a data.frame using the `mut_df` argument, or by supplying
#' the genomic positions and variants throu the three arguments `chrom`, `pos` and `alt`. Currently
#' this function will only allow positions where all contextual nucleotides are non-ambiguous.
#'
#' @param genome BSgenome object. The genome which the regions and annotated positions refer to
#' @param N Integer. Number of positions to include in final dataset. Can be used instead of `N_factor`
#' @param N_factor Integer. Multiplied with the number of mutations (after subsetting) to reach a target size (instead of specifying directly with `N`)
#' @param mut_df data.frame. A data frame containing positions and variant annotations in three columns: `chrom`, `pos` and `variant`
#' @param chrom Character. A vector of chromosomes. Not necessary if `mut_df` is supplied
#' @param pos Integer. A vector of positions. Not necessary if `mut_df` is supplied
#' @param reference Character. A vector of reference nucleotides. If omitted, reference will be extracted from the genome using `genome`, `chrom` and `pos`
#' @param variant Character. A vector of variants. Not necessary if `mut_df` is supplied
#' @param regions.gr Optional GRanges object. Genomic regions from which to draw random positions
#' @param n_max_muts Optional Integer. Filter mutation dataset to only include `n_max_muts` number of mutations
#' @param flank_size Integer. Size of region on each side of mutated and sampled positions to include as contextual regions
#' @param num_retries Int
#' @param is_pyrbased Bool
#' @param olap Bool. Overlap the current mutations with regions?
#' @param remove_ambiguous Remove positions where contextual sequences contains ambiguous nucleotides?
#' @param pos_shift Int. Only for debugging purposes. Do not set this
#' If so, new positions will be sampled to reach `N` positions in total
#' @export
extend_positions <- function(genome,
                             N = NULL,
                             N_factor = NULL,
                             mut_df = NULL,
                             chrom = NULL,
                             pos = NULL,
                             reference = NULL,
                             variant = NULL,
                             regions.gr = NULL,
                             n_max_muts = NULL,
                             flank_size = 10,
                             num_retries = 10,
                             is_pyrbased = F,
                             olap = TRUE,
                             remove_ambiguous = TRUE,
                             .pos_shift = NULL) {

  log_debug("In function: `extend_positions`")

  mode_flag = verifyInput_extendPositions(genome = genome,
                                          N = N,
                                          N_factor = N_factor,
                                          mut_df = mut_df,
                                          chrom = chrom,
                                          pos = pos,
                                          variant = variant,
                                          reference = reference,
                                          regions.gr = regions.gr,
                                          n_max_muts = n_max_muts,
                                          flank_size = flank_size,
                                          num_retries = num_retries,
                                          is_pyrbased = is_pyrbased,
                                          olap = olap)
  log_debug(paste(names(mode_flag), mode_flag, sep = '=', collapse = ', '))

  if (mode_flag['param_error']) {
    log_error("An error occured when checking input to function `extend_positions`", stop = TRUE)
  }

  # Define Initial GRanges mutation set and GRanges regions-set
  INIT_SET = NULL
  REGIONS  = NULL
  if (mode_flag['include_obs']) {
    if (mode_flag['use_df']) {
      chrom = as.character(mut_df$chrom)
      pos   = as.integer(mut_df$pos)
      var   = as.character(mut_df$variant)
    } else {
      chrom = as.character(chrom)
      pos   = as.integer(pos)
      var   = as.character(variant)
    }
    INIT_SET <- GenomicRanges::GRanges(seqnames = chrom, ranges = IRanges::IRanges(pos, width = 1))
    INIT_SET$variant = var

    log_debug("Checking for non-SNV mutations...")
    var_is_single_nucl = stringr::str_detect(INIT_SET$variant, c('^[acgtACGT]$'))
    if (sum(!var_is_single_nucl, na.rm = TRUE) > 0) {
      log_warning(paste("Currently, contextendR only supports SNVs. Found", sum(!(var_is_single_nucl)), "non-SNV variants in original dataset, removing those positions now"))
      INIT_SET = INIT_SET[var_is_single_nucl]
    }
  }

  if (mode_flag['use_regions']) {
    REGIONS = regions.gr
  } else {
    chrom_offset = 10
    # In order to not sample positions too close to chromosome boundaries,
    # we apply an offset to the start and end of each chromosome
    REGIONS = GenomicRanges::GRanges(seqnames = BSgenome::seqnames(genome),
                                     ranges = IRanges::IRanges(start = chrom_offset,
                                                               end = BSgenome::seqinfo(genome)@seqlengths - chrom_offset
                                                               )
                                     )
  }

  # Overlap INIT_SET vs REGIONS only if `olap` argument is set to TRUE
  if (mode_flag['use_regions']) {
    if (olap) {
      if (is.null(INIT_SET)) {
        log_error("You specified `olap=TRUE`, but there are no mutations to overlap", stop = TRUE)
      }
      log_info("Overlapping mutations over the regions specified")
      olaps = GenomicRanges::findOverlaps(INIT_SET, REGIONS)
      INIT_SET = INIT_SET[olaps@from]
      log_info(paste(length(INIT_SET), "number of mutations after overlapping"))
    } else {
      log_warning("You supplied genomic regions for overlapping, but olap=FALSE. Mutations will NOT be overlapped")
    }
  }

  n_muts = BiocGenerics::NROW(INIT_SET)
  # INIT_SET is now defined, use `N` as supplied or calculate new `N` based
  # on elements in INIT_SET and `N_factor`
  #if (mode_flag['sample'] && !(is.null(N_factor))) {
  target_size = N
  if (!(is.null(N_factor))) {
    target_size = round(n_muts * N_factor)
    use_n_factor = TRUE
  } else {
    use_n_factor = FALSE
  }

  if (mode_flag['sub_sample'] && mode_flag['include_obs']) {
    n = min(c(n_max_muts, target_size), na.rm = TRUE)
    if (n < n_muts) {
      log_info(paste("Subsampling mutations to to reach a target size of:", n))
      ix = sort(sample(1:n_muts, n, replace = FALSE))
      INIT_SET = INIT_SET[ix]
      if (use_n_factor) {
        if (N_factor >= 1) {
          target_size = round(length(INIT_SET) * N_factor)
        }
      }
    }
  }

  # Sample positions
  if (mode_flag['sample']) {
    if (mode_flag['include_obs']) {
      log_info(paste("Adding positions until we reach", target_size, "positions"))
      genomic_positions = add_positions(REGIONS, INIT_SET, target_size, .n_retries = num_retries)
    } else {
      log_info(paste("Sampling", target_size, "positions"))
      genomic_positions = sample_positions(REGIONS, target_size)
      genomic_positions$variant = as.character(NA)
    }
  } else {
    genomic_positions = INIT_SET
  }

  # This is no longer needed. `genomic_position` is the main object now
  rm("INIT_SET")

  # Extract sequence context
  if (mode_flag['extract']) {
    log_debug("Extracting context")
    ret = get_context(genomic_positions, genome, flank_size = flank_size, .pos_shift = .pos_shift)

    # Remove positions which has ambiguous nucleotides in them, and then
    # re-sample new positions if `sample` is true
    if (remove_ambiguous) {
      ambig_regex  = paste0('[', paste0(ambig_nucs, collapse = ''), ']')
      ambig_ix     = grepl(ambig_regex, ret$sequence_pyrbased) | grepl(ambig_regex, ret$trinuc_pyrbased)

      if (mode_flag['sample']) {
        tries_left = num_retries
        while((tries_left > 0) && (sum(ambig_ix) > 0)) {
          ret = ret[!ambig_ix]
          log_debug(paste(sum(ambig_ix), "number of ambiguous sequences"))
          ret$aux           = TRUE # Just an aux for subsetting before getting context sequences (no need to get contexts already extracted)
          genomic_positions = add_positions(REGIONS, ret, target_size, num_retries)
          genomic_positions = genomic_positions[is.na(genomic_positions$aux)]; GenomicRanges::values(genomic_positions) = NULL
          new_pos_w_ctx     = get_context(genomic_positions, genome, flank_size = flank_size, .pos_shift = .pos_shift)
          ret$aux           = NULL # aux not needed anymore
          ret               = GenomicRanges::sort(c(ret, new_pos_w_ctx))
          ambig_ix          = grepl(ambig_regex, ret$sequence_pyrbased) | grepl(ambig_regex, ret$trinuc_pyrbased)
          tries_left        = tries_left - 1
        }

        if (sum(ambig_ix) > 0) {
          log_warning("OBS: There were still some ambiguous nucleotides left after trying to remove them. Maybe the region you are sampling from is rich in ambiguous nucleotides
                  or is too small compared to the target size.")
        }
      } else if(sum(ambig_ix) > 0) {
        log_info(paste("Removing", sum(ambig_ix), "positions due to ambiguous sequence contexts detected. Will NOT sample new positions to replace them.",
                       "Set a greater target size (N) to sample non-mutated sites"))
        ret = ret[!ambig_ix]
      }
    }
  } else {
    ret = genomic_positions
  }

  # Make variant pyrbased, if not already
  if (is_pyrbased) {
    ret$variant_pyrbased = ret$variant
  } else {
    is_variant       = !is.na(ret$variant)
    variant          = ret$variant[is_variant]
    variants.dss     = Biostrings::DNAStringSet(variant)
    variants.dss.rc  = Biostrings::reverseComplement(variants.dss)
    variants.rc      = as.character(variants.dss.rc)
    variant_pyrbased = ifelse(BiocGenerics::strand(ret)[is_variant] == '+',
                              variant,
                              variants.rc)

    ret$variant_pyrbased = NA
    ret$variant_pyrbased[is_variant] = variant_pyrbased
  }

  # In case pyrbased variants and pyrbased reference are the same, we
  # suspect input isn't correct.
  if (any(ret$variant_pyrbased == ret$nucl_pyrbased, na.rm = TRUE)) {
    log_error(paste("There are some positions where the variant and",
                    "the reference nucleotide are identical. Make sure that",
                    "all data refer to the same genome build, and check that",
                    "`pyrbased` flag is set/unset depending on if variants in",
                    "your dataset are all based on the pyrimidine reference",
                    "strand"))
  }

  return(GenomicRanges::sort(ret))
}

# Used internally to parse input to `extend_positions`
verifyInput_extendPositions = function(genome, N, N_factor, mut_df, chrom, pos,
                                       reference, variant, regions.gr, n_max_muts,
                                       flank_size, num_retries, is_pyrbased, olap){

  mode_flag = logical(7)
  names(mode_flag) = c("include_obs", "use_df", "sample", "use_regions", "sub_sample", "extract", "param_error")
  # include_obs : Include mutations from either `mut_df` or chrom+pos+variant
  # use_df      : Use data from `mut_df` instead of the vectors `chrom` `pos` and `variant`
  # sample      : Sample extra positions from genome
  # use_regions : Sample from regions specified in `regions.gr`
  # sub_sample  : Subsample mutations (if N < obs or if n_max_muts is set)
  # extract     : extract sequence context
  # param_error : Invalid combination of params

  if (is.null(genome)) {
    log_error("`extend_positions` :: argument `genome` is lacking", stop=FALSE)
    mode_flag['param_error'] = TRUE
  }

  if (!(inherits(genome, "BSgenome"))) {
    log_error("`extend_positions` :: argument `genome` isn't of class 'BSgenome'", stop=FALSE)
    mode_flag['param_error'] = TRUE
  }

  if (is.null(regions.gr)) {
    log_info("No region limitation specified")
  } else {
    mode_flag['use_regions'] = TRUE
  }

  if (!(is.null(n_max_muts))) {
    if (n_max_muts < 0) {
      log_error("Argument `n_max_muts` must be a positive integer", stop =FALSE)
      mode_flag['param_error'] = TRUE
    } else {
      mode_flag['sub_sample'] = TRUE
    }
  }

  if (!(is.na(flank_size))) {
    if (flank_size <= 0) {
      log_warning("flank_size must be a positive integer in order to extract sequence context")
    } else {
      mode_flag['extract'] = TRUE
    }
  }

  if (all(is.null(mut_df),is.null(pos),is.null(chrom),is.null(variant))) {
    if (is.null(N) & is.null(N_factor)) {
      log_error("`extend_positions` :: Cannot sample positions unless parameter `N` or `N_factor` is specified", stop = FALSE)
      mode_flag['param_error'] = TRUE
    }
  }

  if (!(is.null(variant)) && !(is.null(reference))) {
    if (length(variant) != length(reference)) {
      log_error("`variant` and `reference` are not of equal length", stop = FALSE)
      mode_flag['param_error'] = TRUE
    }
  }

  if (!(is.null(N))) {
    if (N > 0) {
      mode_flag['sample'] = TRUE
    } else {
      log_error("`N` must be a positive integer")
    }
  }

  if (!(is.null(N_factor))) {
    if (N_factor > 0) {
      if (mode_flag['sample']) {
        log_warning("Both `N` and `N_factor` was supplied. `N_factor` takes precedence")
      }
      mode_flag['sample'] = TRUE
    } else {
      log_error("`N_sample` must be a positive number")
    }
  }

  if (is.null(mut_df)) {
    log_info("No data frame supplied")
    if (mode_flag['include_obs']) {
      if (is.null(chrom)) {
        log_error("`extend_positions` :: Neither a data frame or a vector of chromosomes was supplied", stop = FALSE)
        mode_flag['param_error'] = TRUE
      }
      if (is.null(pos)) {
        log_error("`extend_positions` :: Neither a data frame or a vector of positions was supplied", stop = FALSE)
        mode_flag['param_error'] = TRUE
      }
      if (is.null(variant)) {
        log_error("`extend_positions` :: Neither a data frame or a vector of variants was supplied", stop = FALSE)
        mode_flag['param_error'] = TRUE
      }
    }

  } else {
    # If a `mut_df` data frame is supplied, it is assumed to be used rather than the vectors in `chrom`, `pos` and `variant`.
    # Warn if those vectors are defined, since those are ignored, and check that column names and column types are
    # proper
    mode_flag['use_df'] = TRUE
    mode_flag['include_obs'] = TRUE


    if (!(inherits(mut_df, "data.frame"))){
      log_error("`extend_positions` :: mut_df is not a data frame", stop=FALSE)
      mode_flag['param_error'] = TRUE
    } else {
      if (!(is.null(N)) ) {
        if (nrow(mut_df) > N) {
          mode_flag['sub_sample'] = TRUE
        }
      }
      if (!(is.null(N_factor))) {
        mode_flag['sub_sample'] = TRUE
      }
    }
    cnames = colnames(mut_df)
    if (!("chrom" %in% cnames)) {
      log_error("`extend_positions` :: couldn't find the column 'chrom' in the table `mut_df`", stop=FALSE)
      mode_flag['param_error'] = TRUE
    }

    if (!("pos" %in% cnames)) {
      log_error("`extend_positions` :: couldn't find the column 'pos' in the table `mut_df`", stop=FALSE)
      mode_flag['param_error'] = TRUE
    }

    if (!("variant" %in% cnames)) {
      log_error("`extend_positions` :: couldn't find the column 'variant' in the table `mut_df`", stop=FALSE)
      mode_flag['param_error'] = TRUE
    }

    if (!(is.null(chrom))) {
      log_warning("parameter `chrom` is defined, but will be ignored since a data.frame in `mut_df` is supplied")
    }

    if (!(is.null(pos))) {
      log_warning("parameter `pos` is defined, but will be ignored since a data.frame in `mut_df` is supplied")
    }

    if (!(is.null(variant))) {
      log_warning("parameter `variant` is defined, but will be ignored since a data.frame in `mut_df` is supplied")
    }
  }

  chrom_n   = length(chrom)
  pos_n     = length(pos)
  variant_n = length(variant)
  if (!(unique(c(chrom_n, pos_n, variant_n)) == chrom_n)) {
    log_error("`extend_positions` :: Number of chromosomes, positions, and variants are not identical", stop=FALSE)
    mode_flag['param_error'] = TRUE
  }

  if (chrom_n > 0) {
    mode_flag['include_obs'] = TRUE
  }

  if(!mode_flag['use_df']) {
    if (mode_flag['sample'] && (chrom_n > N)) {
      mode_flag['sub_sample'] = TRUE
    }

    if (mode_flag['include_obs']) {
      if (!(is.numeric(pos))) {
        log_error("Argument `pos` must be numeric", stop=FALSE)
        mode_flag['param_error'] = TRUE
      }
      if (!(is.character(chrom))) {
        log_error("Argument `chrom` must be a character", stop = FALSE)
        mode_flag['param_error'] = TRUE
      }
      if (!(is.character(variant))) {
        log_error("Argument `variant` must be a character", stop = FALSE)
        mode_flag['param_error'] = TRUE
      }
    }
  }

  if (!mode_flag['use_regions'] && olap) {
    log_error("Cannot overlap mutations without specifying a region. Set `olap` to FALSE or specify a GRanges object to `region.gr`.", stop = FALSE)
    mode_flag['param_error'] = TRUE
  }

  if (mode_flag['sub_sample'] && !mode_flag['include_obs']) {
    log_error("Cannot subsample positions without including observations", stop = FALSE)
    mode_flag['param_error'] = TRUE
  }
  return(mode_flag)
}