#' Kmer frequency
#'
#' Calculate kmer frequency in mutated and non-mutated sites. Apply fisher test
#'
#' @param dataset Granges object, with a `sequence.pyr` column containing sequence region and `mut.pyr` column containing mutations
#' @param ks Int. Size of kmers to look for
#' @param pval_cutoff. Real. Filtering threshold for fisher test
#' @param n_keep. Int. Number of kmers to keep
#' @export
kmer_freq = function(dataset, ks = 5, pval_cutoff = 0.001, n_keep = 80) {
mut_counts = count_muts(dataset, ks)
fisher_test = fisher_test_mutations(mut_counts)
mtx_rank = function(x) {
matrix(data = rank(x),
nrow = nrow(x),
ncol = ncol(x),
dimnames = dimnames(x) )
}
kmers = {
mask = fisher_test[,,'pvalue'] < pval_cutoff
lor_vals = abs(log(fisher_test[,,'odds_ratio'])) * mask; lor_vals[is.nan(lor_vals)] = 0
lor_rank = mtx_rank(lor_vals) * mask
logp_vals = log(fisher_test[,,'pvalue']) * mask
logp_rank = mtx_rank(-logp_vals) * mask
comb_rank = mtx_rank(lor_rank + logp_rank) * mask
kmer_final_rank = sort(apply(comb_rank, 1, max), decreasing = TRUE); kmer_final_rank = kmer_final_rank[kmer_final_rank > 0]
n_ranked = length(kmer_final_rank)
if (n_ranked > n_keep) {
names(kmer_final_rank[1:n_keep])
} else{
log_info(paste("Only", n_ranked, "kmers passed preselection"))
names(kmer_final_rank)
}
}
ret = array(
data = c(fisher_test[kmers,,], mut_counts[kmers,,'yes',]),
dim = c(length(kmers), ncol(mut_counts), 4),
dimnames = list(kmers, colnames(mut_counts), c("odds_ratio", "pvalue", "mutated", "non_mutated")))
ret
}
count_muts = function(dataset, ks = 5, mut_classes = list(cytosines = c('C>A', 'C>G', 'C>T'), thymines = c('T>A', 'T>C', 'T>G' ))) {
log_debug("In function: `count_muts`")
count_aux = function(nucl) {
# Since default data frame creation generates factors from strings, merging two data frames
# on the string column need string data type to not mess up.
ret = count_kmers(sequences = dataset$sequence_pyrbased[dataset$nucl_pyrbased == nucl], kmer_size = ks)
ret$kmer = as.character(ret$kmer)
ret
}
bgfreq.all = local({
bgfreq.cytosines = count_aux(nucl = 'C')
bgfreq.thymines = count_aux(nucl = 'T')
ret = merge(bgfreq.cytosines, bgfreq.thymines, by = 'kmer', all = TRUE)
ret.arr = array(dim = c(nrow(ret), 2))
dimnames(ret.arr) = list(kmer = ret$kmer, counts = c("cytosines", "thymines"))
ret.arr[,'cytosines'] = ret$count.x
ret.arr[,'thymines'] = ret$count.y
ret.arr
})
dataset$mut.pyr = ifelse(is.na(dataset$variant_pyrbased),
NA,
paste0(dataset$nucl_pyrbased, '>', dataset$variant_pyrbased))
tot_muts = {
muts = dataset$mut.pyr
if (sum(is.na(muts)) < 96*10) {
# Don't know what a proper cutoff would be, 960 is a mean of 10 mutations
log_error("`kmer_freq` : Not enough non-mutated sites sampled to count mutation frequencies", stop = TRUE)
}
muts[is.na(muts)] = "wt"
tm = table(dataset$nucl_pyrbased, muts)
list(cytosines = tm['C', c(mut_classes$cytosines, 'wt')],
thymines = tm['T', c(mut_classes$thymines, 'wt')])
}
mut_counter = function(pyr_subset = 'cytosines') {
n_kmers = nrow(bgfreq.all)
n_mut_types = length(mut_classes[[pyr_subset]])
tot_positions = sum(tot_muts[[pyr_subset]])
ret = array(dim = c(n_kmers, n_mut_types, 2, 2))
dimnames(ret) = list(
kmer = rownames(bgfreq.all),
mut_type = mut_classes[[pyr_subset]],
has_kmer = c("yes", "no"),
is_mutated = c("yes", "no")
)
for (mut_class in mut_classes[[pyr_subset]]) {
mut_class.ix = dataset$mut.pyr == mut_class
mut_class.ix[is.na(mut_class.ix)] = FALSE
mut_counts = count_kmers(sequences = dataset$sequence_pyrbased[mut_class.ix], kmer_size = ks)
mut_counts.v = mut_counts$count
names(mut_counts.v) = mut_counts$kmer
ret[,mut_class, 'yes', 'yes'] = {x = mut_counts.v[rownames(ret)]; x[is.na(x)] = 0; x} # Mutated with kmer
ret[,mut_class, 'no', 'yes'] = tot_muts[[pyr_subset]][mut_class] - ret[,mut_class, 'yes', 'yes'] # Mutated without kmer
ret[,mut_class, 'yes', 'no' ] = bgfreq.all[,pyr_subset] - ret[,mut_class, 'yes', 'yes'] # Not mutated, but has kmer
ret[,mut_class, 'no', 'no' ] = tot_positions - (ret[,mut_class, 'yes', 'yes'] + # Not mutated, without kmer
ret[,mut_class, 'yes', 'no' ] +
ret[,mut_class, 'no', 'yes'])
}
ret[is.na(ret)] = 0
ret
}
cytosines = mut_counter('cytosines')
thymines = mut_counter('thymines')
result = abind::abind(cytosines,
thymines,
along = 2,
make.names = TRUE)
names(dimnames(result)) = names(dimnames(cytosines))
result
}
fisher_test_mutations = function(mut_counts) {
kmers = rownames(mut_counts)
muts = colnames(mut_counts)
res = array(
dim = c(length(kmers), length(muts), 2),
dimnames = list(kmers = kmers,
mut_type = muts,
statistic = c("odds_ratio", "pvalue")))
for (kmer in kmers) {
for (mut_type in muts){
fisher.res = fisher.test(mut_counts[kmer,mut_type,,])
res[kmer,mut_type,"odds_ratio"] = fisher.res$estimate
res[kmer,mut_type,"pvalue"] = fisher.res$p.value
}
}
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.