contextendR: Analyse genomic data with regard to sequence contexts

Documented in plot_kmer_positions

`%>%` = dplyr::`%>%`
aes = ggplot2::aes


#' Plot positional effects of kmers
#' 
#' Visualize the ratio between expected and observed number of mutations at
#' positions around a certain kmer
#' @param dataset A dataset generated from `extend_positions`
#' @param kmer A single kmer of interest
#' @param lambda A penalty factor, giving positions with low counts a lower estimate
#' @return A ggplot object
#' @export
plot_kmer_positions = function(dataset, kmer, lambda = 0) {
  position_ratios = kmer_position_ratios(dataset, kmer, per_trinuc = FALSE)
  position_ratios.tbl = on_pyrs(position_ratios, function(x) {
    ratio = x[,'ratio',] %>% 
      dplyr::as_tibble(rownames = 'rel_pos') %>% 
      tidyr::pivot_longer(2:ncol(.), names_to = 'var_nuc', values_to = 'ratio')
    
    ratio$observed =  x[,'observed',] %>% 
      dplyr::as_tibble(rownames = 'rel_pos') %>% 
      tidyr::pivot_longer(2:ncol(.), names_to = 'var_nuc', values_to = 'observed') %>% 
      dplyr::pull(observed)
    
    
    ratio$expected = x[,'expected',] %>% 
      dplyr::as_tibble(rownames = 'rel_pos') %>% 
      tidyr::pivot_longer(2:ncol(.), names_to = 'var_nuc', values_to = 'expected') %>% 
      dplyr::pull(expected)
    
    ratio
  })
  
  penalty = function(x, l = lambda) {
    p =  (x*2 + l) / 2
    x / p
  }
    
  position_ratios.tbl$cytosines$ref_base = 'C'
  position_ratios.tbl$thymines$ref_base = 'T'
  position_ratios.tbl = dplyr::bind_rows(position_ratios.tbl$cytosines, position_ratios.tbl$thymines)
  position_ratios.tbl %>% 
    dplyr::filter(ref_base != var_nuc) %>% 
    dplyr::mutate(rel_pos = as.integer(rel_pos),
                  mutation = paste0(ref_base, '>', var_nuc),
                  log2_ratio = log2(ratio),
                  penalized_l2_ratio = log2_ratio * penalty(observed + expected)) %>% 
    ggplot2::ggplot(aes(x = rel_pos, y = penalized_l2_ratio, color = mutation)) +
    ggplot2::geom_hline(yintercept = 0) +
    ggplot2::geom_point() +
    ggplot2::scale_color_manual(values = cosmic_palette)
}

kmer_position_ratios = function(dataset, kmer, per_trinuc = FALSE) {
  kmer_poscounts = count_kmer_positions(dataset, kmer)
  dn = dimnames(kmer_poscounts)
  kmer_rel_pos = dn[[3]]
  
  aux = function(x, pos_ix = '0', per_trinuc = FALSE) {
    # Function for calculation ratio between observed vs expected
    # number of mutations with kmers at specific positions,
    # requires that `x` is an array from `count_kmer_position`
    has_kmer = x[,,pos_ix, 'yes','counts']
    has_trinuc.ix = rowSums(has_kmer) > 0
    ni            = sum(has_trinuc.ix)
    
    nj  = ncol(has_kmer)
    rn  = rownames(has_kmer)[has_trinuc.ix]
    cn  = colnames(has_kmer)
    counts_array = array(0, dim = c(ni, nj, 3), dimnames = list(trinuc = rn, varbase = cn, val_type = c("observed", "expected", "ratio")))
    
    if (ni < 1) {
      log_info(paste("Kmer", kmer, "wasn't found in position", pos_ix))
    }
    
    counts_array[,,'observed'] = x[has_trinuc.ix,,pos_ix,'yes','counts']
    summer = if(ni == 1) sum else rowSums
    
    counts_array[,,'expected'] = x[has_trinuc.ix,,pos_ix,'no','means'] * summer(counts_array[,,'observed'])
    counts_array[,,'ratio']    = counts_array[,,'observed'] / counts_array[,,'expected']
    
    if (per_trinuc) {
      counts_arrayurn(counts_array) 
    } else {
      counts_array2 = list(
        cytosines = rownames(counts_array) %in% trinucs.c,
        thymines = rownames(counts_array) %in% trinucs.t
      )
      
      on_pyrs(counts_array2, function(ix) {
        observed = colSums(matrix(counts_array[ix,,'observed'], ncol = 4))
        expected = colSums(matrix(counts_array[ix,,'expected'], ncol = 4))
        res = matrix(c(observed, expected, observed/expected), ncol = 4 ,byrow = T)
        colnames(res) = colnames(counts_array) 
        rownames(res) = c("observed", "expected", "ratio")
        res
      })
    }
  }
  
  ratios = lapply(kmer_rel_pos, function(x) aux(kmer_poscounts, pos_ix=x, per_trinuc = per_trinuc))
  names(ratios) = kmer_rel_pos
  res = on_pyrs(list(cytosines = NA, thymines = NA), function(x) {
    ret = array(dim = c(length(ratios), 3, 4), dimnames = list(
      rel_pos = kmer_rel_pos,
      valtype = c("observed", "expected", "ratio"),
      var     = colnames(kmer_poscounts)
    ))
  })
  for (i in seq_along(kmer_rel_pos)) {
    res$cytosines[i,,] = ratios[[i]]$cytosines
    res$thymines[i,,] = ratios[[i]]$thymines
  }
  res  
}
  
count_kmer_positions = function(dataset, kmer) {
  # Transform `NA` variants to refbases
  # This simplifies things, and makes it possible to present all counts in one array, with
  # common columns (var-bases) and rows (trinuc contexts)
  dataset$variant_pyrbased[is.na(dataset$variant_pyrbased)] <- dataset$nucl_pyrbased[is.na(dataset$variant_pyrbased)]
  
  kp        = stringr::str_locate(dataset$sequence_pyrbased, kmer)[,'start']
  has_kmer  = !(is.na(kp))
  dataset$rel_pos  = (nchar(dataset$sequence_pyrbased) - 1) / 2 - (kp - 1)
  pos_range = range(dataset$rel_pos, na.rm = TRUE)
  pos_seq   = seq(pos_range[1], pos_range[2])
  n_pos     = length(pos_seq)
  
  # A 4 dimentional array of counts with the following dimensions:
  #  1) trinuc (rows)
  #  2) varbase (columns)
  #  3) Relative position, where 0 == first nuc in kmer, -1 == first nuc upstream of kmer, 1 == second nuc in kmer...
  #  4) The data subsetted as the kmer detected at that position ('yes') or not ('no')
  positional_kmer_muts = array(0,
                               dim = c(32, 4, n_pos, 2),
                               dimnames = list(
                                 trinuc = c(trinucs.c, trinucs.t),
                                 varnuc = c("A", "C", "G", "T"),
                                 kmer_rel_pos = as.character(pos_seq),
                                 kmer_detected = c("yes", "no")
                               ))
  for (pos in pos_seq) {
    kmer_pos_ix  = has_kmer & (dataset$rel_pos == pos)
    positional_kmer_muts[,,as.character(pos),'yes'] = count_trinucs(dataset[kmer_pos_ix])
    positional_kmer_muts[,,as.character(pos),'no']  = count_trinucs(dataset[!kmer_pos_ix])
  }
  
  positional_kmer_means = apply(positional_kmer_muts, c(1,3,4), function(x) x / sum(x))
  
  # `apply()` will rearrange dimensions, so return to the same format as for the counts data
  positional_kmer_means = aperm(positional_kmer_means, c(2,1,3,4))
  
  dimnames_new = dimnames(positional_kmer_muts); dimnames_new[[5]] = c('counts', 'means'); 
  names(dimnames_new) = c(names(dimnames(positional_kmer_means)), 'val_type')
  
  # Create a 5-dim array with the counts and means together over the 5th dimension
  ret = array(c(positional_kmer_muts, positional_kmer_means), dim = c(dim(positional_kmer_muts), 2),
              dimnames = dimnames_new)
  
  ret[is.nan(ret)] <- 0
  return(ret)
}