Nothing
#' Converts mutation list to correct input format
#'
#' Given a mutation list, outputs a data frame with counts of how frequently a
#' mutation is found within each trinucleotide context per sample ID. Output
#' can be used as input into getTriContextFraction.
#'
#' The context sequence is taken from the BSgenome.Hsapiens.UCSC.hgX::Hsapiens
#' object. Therefore the coordinates must correspond to the human hgX assembly.
#' Default is set to the UCSC hg19 assembly, which corresponds to the GRCh37
#' assembly. If another assembly is required, it must already be present in the
#' R workspace and fed as a parameter. This method will translate chromosome
#' names from other versions of the assembly like NCBI or Ensembl. For instance,
#' the following transformation will be done: "1" -> "chr1"; "MT" -> "chrM";
#' "GL000245.1" -> "chrUn_gl000245"; etc.
#'
#' @param mut.ref Location of the mutation file that is to be converted or name
#' of data frame in environment
#' @param sample.id Column name in the mutation file corresponding to the Sample
#' ID
#' @param chr Column name in the mutation file corresponding to the chromosome
#' @param pos Column name in the mutation file corresponding to the mutation
#' position
#' @param ref Column name in the mutation file corresponding to the reference
#' base
#' @param alt Column name in the mutation file corresponding to the alternate
#' base
#' @param bsg Only set if another genome build is required. Must be a BSgenome
#' object.
#' @return A data frame that contains sample IDs for the rows and trinucleotide
#' contexts for the columns. Each entry is the count of how many times a
#' mutation with that trinucleotide context is seen in the sample.
#' @examples
#' \dontrun{
#'sigs.input = mut.to.sigs.input(mut.ref = sample.mut.ref, sample.id = "Sample",
#'chr = "chr", pos = "pos", ref = "ref", alt = "alt", bsg =
#'BSgenome.Hsapiens.UCSC.hg19)
#'}
#' @export
mut.to.sigs.input = function(mut.ref, sample.id = 'Sample', chr = 'chr', pos = 'pos', ref = 'ref', alt = 'alt', bsg = NULL){
if(exists("mut.ref", mode = "list")){
mut.full <- mut.ref
} else {
if(file.exists(mut.ref)){
mut.full <- utils::read.table(mut.ref, sep = "\t", header = TRUE, as.is = FALSE, check.names = FALSE)
} else {
stop("mut.ref is neither a file nor a loaded data frame")
}
}
mut <- mut.full[,c(sample.id, chr, pos, ref, alt)]
# Only take SNPs for now
#mut.lengths <- with(mut, nchar(as.character(mut[,ref])))
#mut.lengths <- with(mut, nchar(as.character(ref)))
#mut <- mut[which(mut.lengths == 1),]
#mut$mut.lengths <- nchar(as.character(mut[, ref]))
mut <- mut[which(mut[, ref] %in% c('A', 'T', 'C', 'G') & mut[, alt] %in% c('A', 'T', 'C', 'G')),]
# Fix the chromosome names (in case they come from Ensembl instead of UCSC)
levels(mut[, chr]) <- sub("^([0-9XY])", "chr\\1", levels(mut[, chr]))
levels(mut[, chr]) <- sub("^MT", "chrM", levels(mut[, chr]))
levels(mut[, chr]) <- sub("^(GL[0-9]+).[0-9]", "chrUn_\\L\\1", levels(mut[, chr]), perl = T)
# Check the genome version the user wants to use
# If set to default, carry on happily
if(is.null(bsg)){
# Remove any entry in chromosomes that do not exist in the BSgenome.Hsapiens.UCSC.hg19::Hsapiens object
unknown.regions <- levels(mut[, chr])[which(!(levels(mut[, chr]) %in% GenomeInfoDb::seqnames(BSgenome.Hsapiens.UCSC.hg19::Hsapiens)))]
if (length(unknown.regions) > 0) {
unknown.regions <- paste(unknown.regions, collapse = ',\ ')
warning(paste('Check chr names -- not all match BSgenome.Hsapiens.UCSC.hg19::Hsapiens object:\n', unknown.regions, sep = ' '))
mut <- mut[mut[, chr] %in% GenomeInfoDb::seqnames(BSgenome.Hsapiens.UCSC.hg19::Hsapiens), ]
}
# Add in context
mut$context = BSgenome::getSeq(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, mut[,chr], mut[,pos]-1, mut[,pos]+1, as.character = T)
}
# If set to another build, use that one
# bsg parameter should be BSgenome object already
if(!is.null(bsg)){
if(class(bsg) != 'BSgenome'){
stop('The bsg parameter needs to either be set to default or a BSgenome object.')
}
# Remove any entry in chromosomes that do not exist in the BSgenome.Hsapiens.UCSC.hgX::Hsapiens object
unknown.regions <- levels(mut[, chr])[which(!(levels(mut[, chr]) %in% GenomeInfoDb::seqnames(bsg)))]
if (length(unknown.regions) > 0) {
unknown.regions <- paste(unknown.regions, collapse = ',\ ')
warning(paste('Check chr names -- not all match',bsg,'object:\n', unknown.regions, sep = ' '))
mut <- mut[mut[, chr] %in% GenomeInfoDb::seqnames(bsg), ]
}
# Add in context
mut$context = BSgenome::getSeq(bsg, mut[,chr], mut[,pos]-1, mut[,pos]+1, as.character = T)
}
mut$mutcat = paste(mut[,ref], ">", mut[,alt], sep = "")
if(any(substr(mut[,ref], 1, 1) != substr(mut[,'context'], 2, 2))){
bad = mut[ which(substr(mut[,ref], 1, 1) != substr(mut[,'context'], 2, 2)), ]
bad = paste(bad[,sample.id], bad[,chr], bad[,pos], bad[,ref], bad[,alt], sep = ':')
bad = paste(bad, collapse = ',\ ')
warning(paste('Check ref bases -- not all match context:\n ', bad, sep = ' '))
}
# Reverse complement the G's and A's
gind = grep("G",substr(mut$mutcat,1,1))
tind = grep("A",substr(mut$mutcat,1,1))
mut$std.mutcat = mut$mutcat
mut$std.mutcat[c(gind, tind)] <- gsub("G", "g", gsub("C", "c", gsub("T", "t", gsub("A", "a", mut$std.mutcat[c(gind, tind)])))) # to lowercase
mut$std.mutcat[c(gind, tind)] <- gsub("g", "C", gsub("c", "G", gsub("t", "A", gsub("a", "T", mut$std.mutcat[c(gind, tind)])))) # complement
mut$std.context = mut$context
mut$std.context[c(gind, tind)] <- gsub("G", "g", gsub("C", "c", gsub("T", "t", gsub("A", "a", mut$std.context[c(gind, tind)])))) # to lowercase
mut$std.context[c(gind, tind)] <- gsub("g", "C", gsub("c", "G", gsub("t", "A", gsub("a", "T", mut$std.context[c(gind, tind)])))) # complement
mut$std.context[c(gind, tind)] <- sapply(strsplit(mut$std.context[c(gind, tind)], split = ""), function(str) {paste(rev(str), collapse = "")}) # reverse
# Make the tricontext
mut$tricontext = paste(substr(mut$std.context, 1, 1), "[", mut$std.mutcat, "]", substr(mut$std.context, 3, 3), sep = "")
# Generate all possible trinucleotide contexts
all.tri = c()
for(i in c("A", "C", "G", "T")){
for(j in c("C", "T")){
for(k in c("A", "C", "G", "T")){
if(j != k){
for(l in c("A", "C", "G", "T")){
tmp = paste(i, "[", j, ">", k, "]", l, sep = "")
all.tri = c(all.tri, tmp)
}
}
}
}
}
all.tri <- all.tri[order(substr(all.tri, 3, 5))]
final.matrix = matrix(0, ncol = 96, nrow = length(unique(mut[,sample.id])))
colnames(final.matrix) = all.tri
rownames(final.matrix) = unique(mut[,sample.id])
# print(paste("[", date(), "]", "Fill in the context matrix"))
for(i in unique(mut[,sample.id])){
tmp = subset(mut, mut[,sample.id] == i)
beep = table(tmp$tricontext)
for(l in 1:length(beep)){
trimer = names(beep[l])
if(trimer %in% all.tri){
final.matrix[i, trimer] = beep[trimer]
}
}
}
final.df = data.frame(final.matrix, check.names = F)
bad = names(which(rowSums(final.df) <= 50))
if(length(bad) > 0){
bad = paste(bad, collapse = ',\ ')
warning(paste('Some samples have fewer than 50 mutations:\n ', bad, sep = ' '))
}
return(final.df)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.