#' The get_trinuc_norm() can be used to determine the
#' normalization to be used to match the frequency in the
#' sequencing platform to the frequency in the whole genomes
#'
#' @param bed_file the path to the bed file that defines the
#' library of the sequencing platform
#' @param do_MC_sampling if set to TRUE, to speed up
#' the calculation the regions are sampled randomly rather
# than providing a full count
get_trinuc_norm <- function(bed_file, do_MC_sampling = F,
ref_genome = BSgenome.Hsapiens.UCSC.hg19:: BSgenome.Hsapiens.UCSC.hg19){
counts <- get_trinuc_counts(bed_file, do_MC_sampling, ref_genome)
counts_expanded <- c(rep(counts[1:16], 3), rep(counts[17:32], 3))
norm <- counts_expanded/counts_trinuc_genome
return(norm=100*norm/sum(norm))
}
get_trinuc_counts <- function(bed_file, do_MC_sampling = F,
ref_genome = BSgenome.Hsapiens.UCSC.hg19:: BSgenome.Hsapiens.UCSC.hg19){
counts_all <- rep(0, 64)
names(counts_all) <- c('ACA', 'ACC', 'ACG', 'ACT',
'CCA', 'CCC', 'CCG', 'CCT',
'GCA', 'GCC', 'GCG', 'GCT',
'TCA', 'TCC', 'TCG', 'TCT',
'TGT', 'GGT', 'CGT', 'AGT',
'TGG', 'GGG', 'CGG', 'AGG',
'TGC', 'GGC', 'CGC', 'AGC',
'TGA', 'GGA', 'CGA', 'AGA',
'ATA', 'ATC', 'ATG', 'ATT',
'CTA', 'CTC', 'CTG', 'CTT',
'GTA', 'GTC', 'GTG', 'GTT',
'TTA', 'TTC', 'TTG', 'TTT',
'TAT', 'GAT', 'CAT', 'AAT',
'TAG', 'GAG', 'CAG', 'AAG',
'TAC', 'GAC', 'CAC', 'AAC',
'TAA', 'GAA', 'CAA', 'AAA')
bed <- read.table(bed_file,header=F)
colnames(bed) <- c('chr','start','end','id')[1:dim(bed)[[2]]]
context_seq_vec <- character()
if(!grepl('chr', bed$chr[[1]])) bed$chr <- paste0('chr', bed$chr)
if(do_MC_sampling){
library_size <- sum(bed$end - bed$start)
scale_MC_sampling <- (library_size/4000000)
if(scale_MC_sampling < 1){
warning('library smaller than 4Mb quick exact calculation feasible')
do_MC_sampling <- F
}
}
for(i in 1:dim(bed)[[1]]){
starts <- (bed$start[[i]] - 1):(bed$end[[i]] - 1)
ends <- (bed$start[[i]] + 1):(bed$end[[i]] + 1)
if(do_MC_sampling){
inds <- 1:length(starts)
n <- length(inds)/scale_MC_sampling
inds <- unique(round(runif(n, 0.5, inds + 0.5), digit = 0))
starts <- starts[inds]
ends <- ends[inds]
}
context_seq <- VariantAnnotation::getSeq(ref_genome,
names = bed$chr[[i]],
start = starts,
end = ends,
as.character = TRUE)
context_seq_vec <- c(context_seq_vec, context_seq)
counts_context <- table(context_seq)
inds <- match(names(counts_context), names(counts_all))
counts_all[na.omit(inds)] <- counts_all[na.omit(inds)] + as.numeric(unlist(counts_context[!is.na(inds)]))
}
counts <- c(counts_all[1:16] + counts_all[17:32],
counts_all[33:48] + counts_all[49:64])
return(counts = counts)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.