#' Read and preprocess an isomut file
#'
#' `read_isomut` reads an isomut file, adds a `read` column, optionally
#' filters according to given criteria and adds extra columns.
#'
#' @param file the name of the file which the data are to be read from.
#' @param minReads integer indicating the minimum number of reads
#' (default = 0L).
#' @param minCoverage double indicating the minimum coverage
#' (default = 0).
#' @param minMutFreq double indicating the minimum mutation frequency
#' (default = 0).
#' @param minScore double indicating the minimum isomut score
#' (default = 0).
#' @param minCleanliness double indicating the minimum isomut cleanliness
#' (default = 0).
#' @param removePatterns character vector providing patterns in
#' sample_names that are to be removed.
#' @param extraColumns named list to generate extra columns. The list names are
#' the names of the new columns. The list elements can either be a vector of
#' length one (each row gets the same value for this column), a named vector
#' with names being a subset of the `sample_name` column in the original isomut
#' file, or a function that accepts those names as arguments.
#' @param removeFeatures whether certain genomic features should be removed.
#' @param featureData `GRanges` that contains features to be removed.
#' @param removeMito whether variants in mitochondrial DNA should be removed.
#' @param asDataFrame whether to return a `data.frame`. If `FALSE`, a
#' `data.table` is returned.
#'
#' @return a `data.frame` or `data.table` containing the filtered and preprocessed data.
#'
#' @importFrom data.table fread setnames setorder
#' @export
read_isomut <- function(file, minReads = 0L, minCoverage = 0, minMutFreq = 0,
minScore = 0, minCleanliness = 0, removePatterns = NULL,
extraColumns = NULL, removeFeatures = TRUE,
featureData = SGD_features, removeMito = TRUE,
asDataFrame = TRUE) {
## read data
if(is.null(removePatterns)) {
isomut <- fread(file = file, header = TRUE, stringsAsFactors = TRUE)
} else {
cmd <- paste("egrep -v", ' "', paste(removePatterns, collapse = "|"), '" ', file, sep = "")
isomut <- fread(cmd = cmd, header = TRUE, stringsAsFactors = TRUE)
}
## rename `sample_name` column
names(isomut)[1] = "file_name"
## rename `cov` to `coverage` if necessary
names(isomut)[names(isomut) == "cov"] <- "coverage"
# ## remove samples that match certain patterns
# if(!is.null(removePatterns)) {
# for(pattern in removePatterns) {
# isomut <- isomut[!grepl(pattern, file_name)]
# }
# }
## remove variants in mito
if(removeMito) isomut <- isomut[chr != "Mito"]
## remove features
if(removeFeatures) {
isomut <- remove_features(isomut, featureData = featureData)
}
## add `read` column
isomut[,read := round(coverage * mut_freq)]
## remove rows with too few reads
if(minReads > 0) {
isomut <- isomut[read >= minReads]
}
## remove rows with low coverage
if(minCoverage > 0) {
isomut <- isomut[coverage >= minCoverage]
}
## remove rows with low mutation frequency
if(minMutFreq > 0) {
isomut <- isomut[mut_freq >= minMutFreq]
}
## remove rows with low isomut score
if(minScore > 0) {
isomut <- isomut[score >= minScore]
}
## replace cleanliness 42 with NA
isomut[cleanliness==42, cleanliness := NA]
if(minCleanliness > 0) {
isomut <- isomut[cleanliness >= minCleanliness]
}
## add extra columns
if(!is.null(extraColumns)) {
stopifnot(is.list(extraColumns), !is.null(names(extraColumns)))
for(col_name in names(extraColumns)) {
col_value <- extraColumns[[col_name]]
stopifnot(is.vector(col_value) || is.function(col_value))
if(is.function(col_value)) {
isomut[, (col_name) := factor(vapply(isomut$file_name, col_value, character(1)))]
} else {
if(length(col_value) == 1) {
isomut[,(col_name) := factor(col_value)]
} else {
stopifnot(isomut$file_name %in% names(col_value))
isomut[,(col_name) := factor(col_value[as.character(isomut$file_name)])]
}
}
}
}
## order and remove possible duplicates
isomut <- unique(isomut)
setorder(isomut, file_name, chr, pos)
if(asDataFrame) isomut <- as.data.frame(isomut)
return(isomut)
}
#' Remove features
#'
#' Remove variants from isomut data if they match given genomic features
#'
#' @param isomut a `data.frame` or `data.table` of isomut data as generated by
#' `read_isomut`.
#' @param featureData a `GRanges` object that contains the features to be removed.
#'
#' @return a `data.frame` or `data.table` containing the filtered data.
#'
#' @importFrom GenomicRanges GRanges findOverlaps
#' @importFrom IRanges IRanges
#'
#' @export
remove_features <- function(isomut, featureData = SGD_features) {
if(!is(featureData, "GRanges")) stop("`featureData` must be GRanges")
mut_ranges <- GRanges(isomut$chr,
IRanges(isomut$pos, width = nchar(as.character(isomut$ref))))
suppressWarnings(
hits <- findOverlaps(mut_ranges, SGD_features)
)
toKeep <- setdiff(seq_len(nrow(isomut)), hits@from)
return(isomut[toKeep,])
}
#' Format indels
#'
#' Converts the sequence information about indels to a format that can be used
#' to predict the coding consequences.
#'
#' @param isomut a `data.frame` or `data.table` of isomut data as generated by
#' `read_isomut`.
#' @param ref either the name of a fasta file containing the reference genome or
#' an object of class `FaFile`.
#'
#' @return A data.frame with formatted sequence information.
#'
#' @importFrom Rsamtools FaFile
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom Biostrings getSeq
#' @importFrom data.table as.data.table
#'
#' @export
format_indels <- function(isomut, ref = system.file("extdata", "Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa", package = "isomutR")) {
if(with(isomut, all(ref != "-") && all(mut != "-"))) {
return(isomut)
}
if(!is(isomut, "data.table")) {
retDF <- TRUE
isomut <- as.data.table(isomut)
} else {
retDF <- FALSE
}
stopifnot(is(ref, "FaFile") || is.character(ref))
if(is.character(ref)) ref <- FaFile(ref)
mut_ranges <- GRanges(isomut$chr, IRanges(isomut$pos, width = 1))
mut_seq <- getSeq(ref, mut_ranges)
## format insertions
mut_seq_ins <- as.character(mut_seq[isomut$type == "INS"])
isomut[type == "INS", `:=`(ref=mut_seq_ins, mut=paste0(mut_seq_ins, mut))]
## format deletions
mut_seq_del <- as.character(mut_seq[isomut$type == "DEL"])
isomut[type == "DEL", `:=`(ref=paste0(mut_seq_del, ref), mut=mut_seq_del)]
if(retDF) as.data.frame(isomut)
return(isomut)
}
#' Annotate isomut data
#'
#' `annotate_isomut` adds information about the location of variants (coding or
#' intergenic) and, for coding variants, about the affected gene(s) and the
#' position of the variant in the gene.
#'
#' @param isomut a data.frame of isomut data as generated by `read_isomut`
#' (minimally needs to contain `chr` and `pos` columns.
#' @param annotation path to annotation file in `gtf` or `gff` format or `GRanges`
#' object containing the annotations.
#' @param txdb optionally provide `TxDb` object (should match `anno`).
#' @param noDups don't keep multiple annotations (e.g. when variant is inside
#' two genes on opposite strands). Default `FALSE`.
#' @param predictCoding predict consequences of variants in coding regions.
#' @param ... further arguments for `predict_coding`.
#'
#' @return a data.frame with additional columns regarding annotation.
#'
#' @importFrom GenomicFeatures makeTxDbFromGFF makeTxDbFromGRanges
#' @importFrom rtracklayer import
#' @importFrom VariantAnnotation locateVariants CodingVariants IntergenicVariants
#' @importFrom data.table as.data.table rbindlist
#' @export
annotate_isomut <- function(isomut, ref = system.file("extdata", "Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa", package = "isomutR"), annotation = gtf_scer,
txdb = NULL, noDups = FALSE, predictCoding = TRUE,
...) {
if(nrow(isomut)==0) return(isomut)
if(!is(isomut, "data.table")) {
retDF <- TRUE
isomut <- as.data.table(isomut)
} else {
retDF <- FALSE
}
## load annotation
stopifnot(is.character(annotation) || is(annotation, "GRanges"))
if(is.character(annotation)) {
if(is.null(txdb)) {
suppressMessages(
txdb <- suppressWarnings(makeTxDbFromGFF(annotation, metadata = "gene_id"))
)
}
gr <- import(annotation)
} else {
if(is.null(txdb)) {
suppressMessages(
txdb <- suppressWarnings(makeTxDbFromGRanges(annotation))
)
}
gr <- annotation
}
gr_genes <- gr[gr$type == "gene"]
id2names <- setNames(gr_genes$gene_name, gr_genes$gene_id)
## convert isomut to GRanges
isomut_list <- split(isomut, isomut$file_name)
variants <- lapply(isomut_list, function(isomut_i) {
mut_ranges_i <- GRanges(isomut_i$chr, IRanges(isomut_i$pos, width = 1))
mut_ranges_i <- unique(mut_ranges_i)
## locate variants
variants_coding_i <- suppressMessages(suppressWarnings(
locateVariants(mut_ranges_i, txdb, CodingVariants())
))
variants_intergenic_i <- suppressMessages(suppressWarnings(
locateVariants(mut_ranges_i, txdb,
IntergenicVariants(upstream = 0 , downstream = 0))
))
variants_i <- rbind(as.data.table(variants_coding_i), as.data.table(variants_intergenic_i))
## get rid of additional annotations for the same variant
if(noDups) {
variants_i <- variants_i[!duplicated(QUERYID)]
}
return(variants_i)
})
## merge variants with isomut data
variants <- rbindlist(variants, idcol = "file_name")
variants <- variants[, c("seqnames", "start", "strand", "LOCATION", "LOCSTART", "GENEID", "file_name")]
names(variants) <- c("chr", "pos", "strand", "location", "nt_start", "gene_id", "file_name")
variants[, gene_name := id2names[gene_id]]
## get rid of previous annotation if present
cols <- c("chr", "pos", "file_name", setdiff(names(isomut), names(variants)))
isomut <- unique(isomut[, cols, with = FALSE])
isomut <- merge(isomut, as.data.table(variants), by = c("chr", "pos", "file_name"), all.x = TRUE)
isomut[location=="intergenic", `:=`(gene_id = "intergenic")]
isomut[is.na(gene_name), gene_name:=gene_id]
setorder(isomut, file_name, chr, pos)
if(predictCoding) isomut <- predict_coding(isomut, ref = ref,
annotation = annotation, ...)
if(retDF) isomut <- as.data.frame(isomut)
return(isomut)
}
#' Predict coding from variants
#'
#' predicts the consequences of variants in coding regions.
#'
#' @param isomut a data.frame of isomut data as generated by `read_isomut`.
#' @param ref either the name of a fasta file containing the reference genome or
#' an object of class `FaFile`.
#' @param annotation path to annotation file in `gtf` or `gff` format or `GRanges`
#' object containing the annotations.
#' @param txdb optionally provide `TxDb` object (should match `annotation`).
#' @param format_indels whether indels should be formatted (see `format_indels`
#' for details).
#' @param correct_coding whether coding be corrected if two variants hit the same
#' codon.
#' @param ... further arguments for `correct_coding`.
#'
#' @return a data.frame with additional columns regarding coding consequences.
#'
#' @importFrom Biostrings DNAStringSet translate reverseComplement
#' @importFrom VariantAnnotation predictCoding
#' @importFrom magrittr `%>%`
#' @importFrom dplyr mutate group_by filter n case_when
#'
#' @export
predict_coding <- function(isomut, ref = system.file("extdata", "Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa", package = "isomutR"),
annotation = gtf_scer, txdb = NULL,
format_indels = TRUE, correct_coding = FALSE, ...) {
if(!"gene_id" %in% names(isomut)) {
stop("Please annotate data first!")
}
isomut_cols <- names(isomut)
if(nrow(isomut)==0) return(isomut)
if(!is(isomut, "data.table")) {
retDF <- TRUE
isomut <- as.data.table(isomut)
} else {
retDF <- FALSE
}
## load genome
stopifnot(is(ref, "FaFile") || is.character(ref))
if(is.character(ref)) ref <- FaFile(ref)
## load annotation
if(is.null(txdb)) {
stopifnot(is.character(annotation) || is(annotation, "GRanges"))
if(is.character(annotation)) {
suppressMessages(
txdb <- suppressWarnings(makeTxDbFromGFF(annotation, metadata = "gene_id"))
)
} else {
suppressMessages(
txdb <- suppressWarnings(makeTxDbFromGRanges(annotation))
)
}
}
## format indels
if(format_indels) isomut <- format_indels(isomut, ref)
## generate GRanges
mut_ranges <- with(isomut, GRanges(chr, ranges = IRanges(start = pos,
width = nchar(as.character(ref)))))
varAllele <- DNAStringSet(isomut$mut)
coding <- suppressWarnings(predictCoding(mut_ranges, txdb, ref, varAllele))
if(length(coding) > 0) {
coding <- as.data.table(coding)
coding$AA_start <- vapply(coding$PROTEINLOC, function(l) l[[1]], numeric(1))
coding$mut <- ifelse(coding$strand=="+", coding$varAllele, reverseComplement(DNAStringSet(coding$varAllele)))
coding <- coding[, c("seqnames", "start", "mut", "GENEID", "CONSEQUENCE", "REFCODON",
"VARCODON", "REFAA", "VARAA", "AA_start")]
names(coding) <- c("chr", "pos", "mut", "gene_id", "consequence", "ref_codon",
"var_codon", "ref_AA", "var_AA", "AA_start")
coding <- unique(coding)
## get rid of previous coding if present
cols <- c("chr", "pos", "gene_id", "mut", setdiff(names(isomut), names(coding)))
isomut <- unique(isomut[, cols, with = FALSE])
isomut <- merge(isomut, coding, by = c("chr", "pos", "mut", "gene_id"),
all.x = TRUE, all.y = FALSE)
isomut <- unique(isomut)
if(correct_coding) isomut <- correct_coding(isomut, ...)
}
## rearrange columns
isomut <- isomut[,c(isomut_cols, setdiff(names(isomut), isomut_cols)), with = FALSE]
## adjust return format
if(retDF) isomut <- as.data.frame(isomut)
return(isomut)
}
#' Correct coding
#'
#' corrects the predicted AA if multiple variants are found in the same codon
#'
#' @param isomut a data.frame of isomut data as generated by `read_isomut` and
#' annotated by `annotate_isomut`.
#' @param keep_uncorrected boolean. Adds a column for the uncorrected coding.
#' @param highlight boolean. Adds a column that indicates which variants have
#' corrected condons.
#' @return a data.frame of the same format with corrected coding.
#'
#' @importFrom Biostrings DNAStringSet translate reverseComplement
#' @importFrom VariantAnnotation predictCoding
#' @importFrom dplyr case_when
#'
#' @export
correct_coding <- function(isomut, keep_uncorrected = FALSE, highlight = FALSE) {
if(!all(c("gene_id", "nt_start", "AA_start") %in% names(isomut))) {
stop("Please annotate data first!")
}
if(!is(isomut, "data.table")) {
retDF <- TRUE
isomut <- as.data.table(isomut)
} else {
retDF <- FALSE
}
isomut[,var_AA_uncorrected:=var_AA]
isomut[,var_codon_uncorrected:=var_codon]
isomut[,consequence_uncorrected:=consequence]
ind_coding <- which(!is.na(isomut$gene_id) & isomut$type=="SNV")
isomut_coding <- isomut[ind_coding]
isomut_coding[, ind:=ind_coding]
isomut_coding[, N:=.N , by=.(file_name, gene_id, AA_start)]
isomut_coding <- isomut_coding[N %in% 2:3]
isomut_coding[, codon_pos:=ifelse(nt_start %% 3 %in% 1:2, nt_start %% 3, 3)]
isomut_coding[, var_codon:=getVarCodon(
codon_pos, mut, ref_codon, var_codon, strand, type),
by = .(file_name, gene_id, AA_start)]
isomut_coding[, `:=`(
var_AA = case_when(
!is.na(var_codon) && nt_start>3 ~
as.character(translate(DNAStringSet(var_codon), no.init.codon = TRUE)),
!is.na(var_codon) ~
as.character(translate(DNAStringSet(var_codon), no.init.codon = FALSE)),
TRUE ~ var_AA))]
isomut_coding[, `:=`(
consequence = case_when(
var_AA == "*" ~ "nonsense",
var_AA == ref_AA ~ "synonymous",
var_AA != ref_AA ~ "nonsynonymous",
TRUE ~ as.character(consequence)))]
isomut[isomut_coding$ind, c("var_codon", "var_AA", "consequence")] <- isomut_coding[, c("var_codon", "var_AA", "consequence")]
if(highlight) isomut[,corrected_codon:= var_AA!=var_AA_uncorrected]
if(!keep_uncorrected) {
isomut[,var_AA_uncorrected:=NULL]
isomut[,var_codon_uncorrected:=NULL]
isomut[,consequence_uncorrected:=NULL]
}
if(retDF) isomut <- as.data.frame(isomut)
return(isomut)
}
getVarCodon <- function(codon_pos, MUT, REFCODON, VARCODON, strand, type) {
codon_pos <- codon_pos[!is.na(codon_pos)]
MUT <- MUT[!is.na(codon_pos)]
if(length(codon_pos)==0) return(NA)
MUT <- ifelse(strand=="+", as.character(MUT), reverseComplement(DNAStringSet(MUT)))
RC <- strsplit(REFCODON[[1]], "")[[1]]
RC[codon_pos] <- MUT
return(case_when(
type=="SNV" & !is.na(codon_pos) ~ paste(RC, collapse = ""),
TRUE ~ VARCODON
))
}
#' Read isomut data from directory
#'
#' @param dir path to the directory containing isomut data.
#' @param minReads integer indicating the minimum number of reads
#' (default = 0L).
#' @param minCoverage double indicating the minimum coverage
#' (default = 0).
#' @param minMutFreq double indicating the minimum mutation frequency
#' (default = 0).
#' @param minScore double indicating the minimum isomut score
#' (default = 0).
#' @param minCleanliness double indicating the minimum isomut cleanliness
#' (default = 0).
#' @param removePatterns character vector providing patterns in
#' sample_names that are to be removed.
#' @param extraColumns named list to generate extra columns. The list names are
#' the names of the new columns. The list elements can either be a vector of
#' length one (each row gets the same value for this column), a named vector
#' with names being a subset of the `sample_name` column in the original isomut
#' file, or a function that accepts those names as arguments.
#' @param removeFeatures whether certain genomic features should be removed.
#' @param featureData `GRanges` that contains features to be removed.
#' @param removeMito whether variants in mitochondrial DNA should be removed.
#' @param verbose monitor progress.
#' @param asDataFrame whether to return a `data.frame`. If `FALSE`, a
#' `data.table` is returned.
#' @param annotate whether data should be annotated. For details see
#' `annotate_isomut`.
#' @param ... further arguments for `annotate_isomut`.
#'
#' @return a `data.frame` or `data.table` containing the filtered and
#' preprocessed data.
#'
#' @importFrom data.table rbindlist
#' @importFrom GenomicFeatures makeTxDbFromGFF makeTxDbFromGRanges
#' @importFrom rtracklayer import
#' @export
read_isomut_from_dir <- function(dir, minReads = 0L,
minCoverage = 0, minMutFreq = 0, minScore = 0,
minCleanliness = 0, removePatterns = NULL,
extraColumns = NULL, removeFeatures = TRUE,
featureData = SGD_features, removeMito = TRUE,
verbose = TRUE,
asDataFrame = TRUE,
annotate = TRUE,
...
) {
isomut_files <- list.files(dir, pattern = "*.isomut$", recursive = TRUE,
full.names = TRUE)
if(length(isomut_files) == 0) stop("No isomut files found.")
if(verbose) {
message("Reading files...")
pb <- txtProgressBar(min = 0, max = length(isomut_files), style = 3)
i <- 1
}
isomut_list <- lapply(isomut_files,
function(file) {
isomut <- read_isomut(file, minReads = minReads,
minCoverage = minCoverage,
minMutFreq = minMutFreq,
minScore = minScore,
minCleanliness = minCleanliness,
removePatterns = removePatterns,
extraColumns = extraColumns,
removeFeatures = removeFeatures,
featureData = featureData,
removeMito = removeMito,
asDataFrame = FALSE
)
if(verbose) {
setTxtProgressBar(pb, i)
i <<- i+1
}
return(isomut)
})
if(annotate) {
args <- list(...)
## load annotation
if("annotation" %in% names(args)) {
annotation <- args[["annotation"]]
stopifnot(is.character(annotation) || is(annotation, "GRanges"))
if(is.character(annotation)) {
if(!"txdb" %in% names(args) || is.null(args[["txdb"]])) {
suppressMessages(
args[["txdb"]] <- suppressWarnings(makeTxDbFromGFF(annotation, metadata = "gene_id"))
)
}
args[["annotation"]] <- import(annotation)
} else {
if(!"txdb" %in% names(args) || is.null(args[["txdb"]])) {
suppressMessages(
args[["txdb"]] <- suppressWarnings(makeTxDbFromGRanges(annotation))
)
}
args[["annotation"]] <- annotation
}
} else {
if(!"txdb" %in% names(args) || is.null(args[["txdb"]])) {
suppressMessages(
args[["txdb"]] <- suppressWarnings(makeTxDbFromGRanges(gtf_scer))
)
}
args[["annotation"]] <- gtf_scer
}
if(verbose) {
message("\nAnnotating...")
pb2 <- txtProgressBar(min = 0, max = length(isomut_files), style = 3)
i <- 1
}
isomut_list <- lapply(isomut_list,
function(isomut) {
isomut <- do.call(annotate_isomut, c(list(isomut = isomut), args))
if(verbose) {
setTxtProgressBar(pb2, i)
i <<- i+1
}
return(isomut)
}
)
}
isomut <- rbindlist(isomut_list, fill = TRUE)
if(asDataFrame) isomut <- as.data.frame(isomut)
message("...done!\n")
return(isomut)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.