`%>%` = dplyr::`%>%`
aes = ggplot2::aes
#' Plot positional effects of kmers
#'
#' Visualize the ratio between expected and observed number of mutations at
#' positions around a certain kmer
#' @param dataset A dataset generated from `extend_positions`
#' @param kmer A single kmer of interest
#' @param lambda A penalty factor, giving positions with low counts a lower estimate
#' @return A ggplot object
#' @export
plot_kmer_positions = function(dataset, kmer, lambda = 0) {
position_ratios = kmer_position_ratios(dataset, kmer, per_trinuc = FALSE)
position_ratios.tbl = on_pyrs(position_ratios, function(x) {
ratio = x[,'ratio',] %>%
dplyr::as_tibble(rownames = 'rel_pos') %>%
tidyr::pivot_longer(2:ncol(.), names_to = 'var_nuc', values_to = 'ratio')
ratio$observed = x[,'observed',] %>%
dplyr::as_tibble(rownames = 'rel_pos') %>%
tidyr::pivot_longer(2:ncol(.), names_to = 'var_nuc', values_to = 'observed') %>%
dplyr::pull(observed)
ratio$expected = x[,'expected',] %>%
dplyr::as_tibble(rownames = 'rel_pos') %>%
tidyr::pivot_longer(2:ncol(.), names_to = 'var_nuc', values_to = 'expected') %>%
dplyr::pull(expected)
ratio
})
penalty = function(x, l = lambda) {
p = (x*2 + l) / 2
x / p
}
position_ratios.tbl$cytosines$ref_base = 'C'
position_ratios.tbl$thymines$ref_base = 'T'
position_ratios.tbl = dplyr::bind_rows(position_ratios.tbl$cytosines, position_ratios.tbl$thymines)
position_ratios.tbl %>%
dplyr::filter(ref_base != var_nuc) %>%
dplyr::mutate(rel_pos = as.integer(rel_pos),
mutation = paste0(ref_base, '>', var_nuc),
log2_ratio = log2(ratio),
penalized_l2_ratio = log2_ratio * penalty(observed + expected)) %>%
ggplot2::ggplot(aes(x = rel_pos, y = penalized_l2_ratio, color = mutation)) +
ggplot2::geom_hline(yintercept = 0) +
ggplot2::geom_point() +
ggplot2::scale_color_manual(values = cosmic_palette)
}
kmer_position_ratios = function(dataset, kmer, per_trinuc = FALSE) {
kmer_poscounts = count_kmer_positions(dataset, kmer)
dn = dimnames(kmer_poscounts)
kmer_rel_pos = dn[[3]]
aux = function(x, pos_ix = '0', per_trinuc = FALSE) {
# Function for calculation ratio between observed vs expected
# number of mutations with kmers at specific positions,
# requires that `x` is an array from `count_kmer_position`
has_kmer = x[,,pos_ix, 'yes','counts']
has_trinuc.ix = rowSums(has_kmer) > 0
ni = sum(has_trinuc.ix)
nj = ncol(has_kmer)
rn = rownames(has_kmer)[has_trinuc.ix]
cn = colnames(has_kmer)
counts_array = array(0, dim = c(ni, nj, 3), dimnames = list(trinuc = rn, varbase = cn, val_type = c("observed", "expected", "ratio")))
if (ni < 1) {
log_info(paste("Kmer", kmer, "wasn't found in position", pos_ix))
}
counts_array[,,'observed'] = x[has_trinuc.ix,,pos_ix,'yes','counts']
summer = if(ni == 1) sum else rowSums
counts_array[,,'expected'] = x[has_trinuc.ix,,pos_ix,'no','means'] * summer(counts_array[,,'observed'])
counts_array[,,'ratio'] = counts_array[,,'observed'] / counts_array[,,'expected']
if (per_trinuc) {
counts_arrayurn(counts_array)
} else {
counts_array2 = list(
cytosines = rownames(counts_array) %in% trinucs.c,
thymines = rownames(counts_array) %in% trinucs.t
)
on_pyrs(counts_array2, function(ix) {
observed = colSums(matrix(counts_array[ix,,'observed'], ncol = 4))
expected = colSums(matrix(counts_array[ix,,'expected'], ncol = 4))
res = matrix(c(observed, expected, observed/expected), ncol = 4 ,byrow = T)
colnames(res) = colnames(counts_array)
rownames(res) = c("observed", "expected", "ratio")
res
})
}
}
ratios = lapply(kmer_rel_pos, function(x) aux(kmer_poscounts, pos_ix=x, per_trinuc = per_trinuc))
names(ratios) = kmer_rel_pos
res = on_pyrs(list(cytosines = NA, thymines = NA), function(x) {
ret = array(dim = c(length(ratios), 3, 4), dimnames = list(
rel_pos = kmer_rel_pos,
valtype = c("observed", "expected", "ratio"),
var = colnames(kmer_poscounts)
))
})
for (i in seq_along(kmer_rel_pos)) {
res$cytosines[i,,] = ratios[[i]]$cytosines
res$thymines[i,,] = ratios[[i]]$thymines
}
res
}
count_kmer_positions = function(dataset, kmer) {
# Transform `NA` variants to refbases
# This simplifies things, and makes it possible to present all counts in one array, with
# common columns (var-bases) and rows (trinuc contexts)
dataset$variant_pyrbased[is.na(dataset$variant_pyrbased)] <- dataset$nucl_pyrbased[is.na(dataset$variant_pyrbased)]
kp = stringr::str_locate(dataset$sequence_pyrbased, kmer)[,'start']
has_kmer = !(is.na(kp))
dataset$rel_pos = (nchar(dataset$sequence_pyrbased) - 1) / 2 - (kp - 1)
pos_range = range(dataset$rel_pos, na.rm = TRUE)
pos_seq = seq(pos_range[1], pos_range[2])
n_pos = length(pos_seq)
# A 4 dimentional array of counts with the following dimensions:
# 1) trinuc (rows)
# 2) varbase (columns)
# 3) Relative position, where 0 == first nuc in kmer, -1 == first nuc upstream of kmer, 1 == second nuc in kmer...
# 4) The data subsetted as the kmer detected at that position ('yes') or not ('no')
positional_kmer_muts = array(0,
dim = c(32, 4, n_pos, 2),
dimnames = list(
trinuc = c(trinucs.c, trinucs.t),
varnuc = c("A", "C", "G", "T"),
kmer_rel_pos = as.character(pos_seq),
kmer_detected = c("yes", "no")
))
for (pos in pos_seq) {
kmer_pos_ix = has_kmer & (dataset$rel_pos == pos)
positional_kmer_muts[,,as.character(pos),'yes'] = count_trinucs(dataset[kmer_pos_ix])
positional_kmer_muts[,,as.character(pos),'no'] = count_trinucs(dataset[!kmer_pos_ix])
}
positional_kmer_means = apply(positional_kmer_muts, c(1,3,4), function(x) x / sum(x))
# `apply()` will rearrange dimensions, so return to the same format as for the counts data
positional_kmer_means = aperm(positional_kmer_means, c(2,1,3,4))
dimnames_new = dimnames(positional_kmer_muts); dimnames_new[[5]] = c('counts', 'means');
names(dimnames_new) = c(names(dimnames(positional_kmer_means)), 'val_type')
# Create a 5-dim array with the counts and means together over the 5th dimension
ret = array(c(positional_kmer_muts, positional_kmer_means), dim = c(dim(positional_kmer_muts), 2),
dimnames = dimnames_new)
ret[is.nan(ret)] <- 0
return(ret)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.