###############################################################################
# Utility functions
###############################################################################
#' Given a deletion and its sequence context, categorize it
#'
#' This function is primarily for internal use, but we export it
#' to document the underlying logic.
#'
#' See
#' \url{https://github.com/steverozen/ICAMS/raw/master/data-raw/PCAWG7_indel_classification_2017_12_08.xlsx}
#' for additional information on deletion
#' mutation classification.
#'
#' This function first handles deletions in homopolymers, then
#' handles deletions in simple repeats with
#' longer repeat units, (e.g. \code{CACACACA}, see
#' \code{\link{FindMaxRepeatDel}}),
#' and if the deletion is not in a simple repeat,
#' looks for microhomology (see \code{\link{FindDelMH}}).
#'
#' See the code for unexported function \code{\link{CanonicalizeID}}
#' and the functions it calls for handling of insertions.
#'
#' @param context The deleted sequence plus ample surrounding
#' sequence on each side (at least as long as \code{del.seq}).
#'
#' @param del.seq The deleted sequence in \code{context}.
#'
#' @param pos The position of \code{del.sequence} in \code{context}.
#'
#' @param trace If > 0, then generate messages tracing
#' how the computation is carried out.
#'
#' @param strand NULL by default. But when called by PlotTransBiasInternal,
#' \code{strand} is either \code{+} or \code{-}. The return value will
#' include :trans or :nontrans indicating whether the deletion occurred on
#' the transcribed or non-transcribed strand.
#'
#' @return A string that is the canonical representation
#' of the given deletion type. Return \code{NA}
#' and raise a warning if
#' there is an un-normalized representation of
#' the deletion of a repeat unit.
#' See \code{FindDelMH} for details.
#' (This seems to be very rare.)
#'
#' @keywords internal
Canonicalize1Del115 <- function(context, del.seq, pos, trace = 0, strand) {
# Is the deletion involved in a repeat?
rep.count <- FindMaxRepeatDel(context, del.seq, pos)
rep.count.string <- ifelse(rep.count >= 5, "5+", as.character(rep.count))
deletion.size <- nchar(del.seq)
deletion.size.string <-
ifelse(deletion.size >= 5, "5+", as.character(deletion.size))
# Category is "1bp deletion"
if (deletion.size == 1) {
if (rep.count == 0){
if (del.seq %in% c("A","G")){
context <- revc(context)
pos <- nchar(context) - pos + 1
del.seq <- substr(context,pos,pos)
preceding.base <- substr(context,pos+1,pos+1)
proceeding.base <- substr(context,pos-1,pos-1)
retval <- paste0("DEL:",del.seq,":1:0_",preceding.base,proceeding.base)
if (!is.na(strand)){
retval <- paste0(retval, ifelse(strand=="+", ":nontrans", ":trans"))
}
} else {
preceding.base <- substr(context,pos-1,pos-1)
proceeding.base <- substr(context,pos+1,pos+1)
retval <- paste0("DEL:",del.seq,":1:0_",preceding.base,proceeding.base)
if (!is.na(strand)){
retval <- paste0(retval, ifelse(strand=="+", ":trans", ":nontrans"))
}
}
return(retval)
} else {
if (del.seq == "A") del.seq <- "T"
if (del.seq == "G") del.seq <- "C"
return(paste0("DEL:", del.seq, ":1:", rep.count.string))
}
}
# Category is ">2bp deletion"
if (rep.count > 0) {
return(
paste0("DEL:repeats:", deletion.size.string, ":", rep.count.string))
}
# We have to look for microhomology
microhomology.len <- FindDelMH(context, del.seq, pos, trace = trace)
if (microhomology.len == -1) {
warning("Non-normalized deleted repeat ignored:",
"\ncontext: ", context,
"\ndeleted sequence: ", del.seq,
"\nposition of deleted sequence: ", pos)
return(NA)
}
if (microhomology.len == 0) {
stopifnot(rep.count.string == 0)
# Categorize and return non-repeat, non-microhomology deletion
return(paste0("DEL:repeats:", deletion.size.string, ":0"))
}
microhomology.len.str <-
ifelse(microhomology.len >= 5, "5+", as.character(microhomology.len))
return(paste0(
"DEL:MH:", deletion.size.string, ":", microhomology.len.str))
}
#' Given an insertion and its sequence context, categorize it.
#' @param context
#' @param ins.sequence
#' @param pos The position of \code{ins.sequence} in \code{context}.
#' @param trace
#' @param strand
#'
#' @return A string that is the canonical representation of
#' the given insertion type.
#'
#' @details same as Canonicalize1INS except when
#' \code{insertion.size == 1} & \code{rep.count == 0}
#' in which the format will be \code{INS:Y:1:0_PF}
#' where Y is C or T, P is the preceding nucleotide, and
#' F is the following nucleotide
#'
#' @keywords internal
Canonicalize1INS115 <- function(context, ins.sequence, pos, trace = 0, strand) {
if (trace > 0) {
message("Canonicalize1ID(", context, ",", ins.sequence, ",", pos, "\n")
}
rep.count <- ICAMS:::FindMaxRepeatIns(context, ins.sequence, pos)
rep.count.string <- ifelse(rep.count >= 5, "5+", as.character(rep.count))
insertion.size <- nchar(ins.sequence)
insertion.size.string <-
ifelse(insertion.size >= 5, "5+", as.character(insertion.size))
if (insertion.size == 1) {
if (rep.count == 0){
if (ins.sequence %in% c("A","G")){
context <- revc(context)
ins.sequence <- revc(ins.sequence)
pos <- nchar(context) - pos + 1
preceding.base <- substr(context, pos, pos)
proceeding.base <- substr(context, pos-1, pos-1)
retval <- paste0("INS:",ins.sequence,":1:0_",preceding.base,proceeding.base)
if (!is.na(strand)){
retval <- paste0(retval, ifelse(strand=="+",":nontrans", ":trans"))
}
} else {
preceding.base <- substr(context, pos, pos)
proceeding.base <- substr(context, pos+1, pos+1)
retval <- paste0("INS:",ins.sequence,":1:0_",preceding.base,proceeding.base)
if (!is.na(strand)){
retval <- paste0(retval, ifelse(strand=="+", ":trans", ":nontrans"))
}
}
} else {
if (ins.sequence == "A") ins.sequence <- "T"
if (ins.sequence == "G") ins.sequence <- "C"
retval <-
(paste0("INS:", ins.sequence, ":1:", rep.count.string))
}
} else {
retval <-
paste0("INS:repeats:", insertion.size.string, ":", rep.count.string)
}
if (trace > 0) message(retval)
return(retval)
}
#' @title Given a single insertion or deletion in context categorize it.
#'
#' @param context Ample surrounding
#' sequence on each side of the insertion or deletion.
#'
#' @param ref The reference allele (vector of length 1)
#'
#' @param alt The alternative allele (vector of length 1)
#'
#' @param pos The position of \code{ins.or.del.seq} in \code{context}.
#'
#' @param trace If > 0, then generate messages tracing
#' how the computation is carried out.
#'
#' @param strand NULL by default. But when called by \code{StrandBiasGetPlottablesID115},
#' \code{strand} is either \code{+} or \code{-}. The return value will
#' include :trans or :nontrans indicating whether the deletion occurred on
#' the transcribed or non-transcribed strand.
#
#' @return A string that is the canonical representation
#' of the type of the given
#' insertion or deletion.
#' Return \code{NA}
#' and raise a warning if
#' there is an un-normalized representation of
#' the deletion of a repeat unit.
#' See \code{FindDelMH} for details.
#' (This seems to be very rare.)
#'
#' @keywords internal
Canonicalize1ID115 <- function(context, ref, alt, pos, trace = 0, strand) {
if (trace > 0) {
message("Canonicalize1ID115(", context, ",", ref, ",", alt, ",", pos, "\n")
}
if (nchar(alt) < nchar(ref)) {
# A deletion
return(Canonicalize1Del115(context, ref, pos + 1, trace, strand))
} else if (nchar(alt) > nchar(ref)) {
# An insertion
return(Canonicalize1INS115(context, alt, pos, trace, strand))
} else {
stop("Non-insertion / non-deletion found: ", ref, " ", alt, " ", context)
}
}
#' @title Determine the mutation types of insertions and deletions.
#'
#' @param context A vector of ample surrounding
#' sequence on each side the variants
#'
#' @param ref Vector of reference alleles
#'
#' @param alt Vector of alternative alleles
#'
#' @param pos Vector of the positions of the insertions and deletions in
#' \code{context}.
#'
#' @param strand NULL by default. But when called by \code{StrandBiasGetPlottablesID115},
#' \code{strand} is either \code{+} or \code{-}. The return value will
#' include :trans or :nontrans indicating whether the deletion occurred on
#' the transcribed or non-transcribed strand.
#'
#' @return A vector of strings that are the canonical representations
#' of the given insertions and deletions.
#'
#' @importFrom utils head
#'
#' @keywords internal
CanonicalizeID115 <- function(context, ref, alt, pos, strand) {
if (all(substr(ref, 1, 1) == substr(alt, 1, 1))) {
ref <- substr(ref, 2, nchar(ref))
alt <- substr(alt, 2, nchar(alt))
} else {
stopifnot(ref != "" | alt != "")
}
ret <- mapply(Canonicalize1ID115, context, ref, alt, pos, 0, strand)
return(ret)
}
#' @title Create one column of the matrix for an indel catalog from *one* in-memory VCF.
#'
#' @param ID.vcf An in-memory VCF as a data.frame annotated by the
#' \code{\link{AnnotateIDVCF}} function. It must only
#' contain indels and must \strong{not} contain SBSs
#' (single base substitutions), DBSs, or triplet
#' base substitutions, etc.
#'
#' One design decision for variant callers is the representation of "complex
#' indels", e.g. mutations e.g. CAT > GC. Some callers represent this as C>G,
#' A>C, and T>_. Others might represent it as CAT > CG. Multiple issues can
#' arise. In PCAWG, overlapping indel/SBS calls from different callers were
#' included in the indel VCFs.
#'
#' @param SBS.vcf This argument defaults to \code{NULL} and
#' is not used. Ideally this should be an in-memory SBS VCF
#' as a data frame. The rational is that for some data,
#' complex indels might be represented as an indel with adjoining
#' SBSs.
#'
#' @return A list of a 1-column matrix containing the mutation catalog
#' information and the annotated VCF with ID categories information added.
#'
#' @keywords internal
CreateOneColID115Matrix <- function(ID.vcf, SBS.vcf = NULL) {
if (nrow(ID.vcf) == 0) {
# Create 1-column matrix with all values being 0 and the correct row labels.
catID <- matrix(0, nrow = length(ICAMSxtra::catalog.row.order$ID115), ncol = 1)
rownames(catID) <- ICAMSxtra::catalog.row.order$ID115
return(catID)
}
if (!is.null(SBS.vcf))
warning("Argument SBS.vcf in CreateOneColIDMatrix is always ignored")
canon.ID <- CanonicalizeID115(ID.vcf$seq.context,
ID.vcf$REF,
ID.vcf$ALT,
ID.vcf$seq.context.width + 1,
strand = NA)
if (any(is.na(canon.ID))) warning("NA ID categories ignored")
idx <- which(is.na(canon.ID))
canon.ID <- canon.ID[!is.na(canon.ID)]
if (length(idx) > 0) {
ID.vcf <- ID.vcf[-idx, ]
}
out.ID.vcf <- cbind(ID.vcf, ID.class = canon.ID)
# Create the ID catalog matrix
tab.ID <- table(canon.ID)
row.order <- data.table(rn = ICAMSxtra::catalog.row.order$ID115)
ID.dt <- as.data.table(tab.ID)
# ID.dt has two columns, names cannon.dt (from the table() function
# and N (the count)
ID.dt2 <-
merge(row.order, ID.dt, by.x = "rn", by.y = "canon.ID", all = TRUE)
ID.dt2[ is.na(N) , N := 0]
stopifnot(setequal(unlist(ID.dt2$rn), ICAMSxtra::catalog.row.order$ID115))
ID.mat <- as.matrix(ID.dt2[ , 2])
rownames(ID.mat) <- ID.dt2$rn
return(list(catalog = ID.mat[ICAMSxtra::catalog.row.order$ID115, , drop = FALSE],
annotated.VCF = out.ID.vcf))
}
#' Create a catalog from a \code{matrix}, \code{data.frame}, or \code{vector}
#'
#' @param object A numeric \code{matrix}, numeric \code{data.frame},
#' or \code{vector}.
#' If a \code{vector}, converted to a 1-column \code{matrix}
#' with rownames taken from the element names of the \code{vector}
#' and with column name \code{"Unknown"}.
#' If argument \code{infer.rownames}
#' is \code{FALSE} than this argument must have
#' rownames to denote the mutation types. See \code{\link{CatalogRowOrder}}
#' for more details.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#' \code{\link{ICAMS}}.
#'
#' @param region A character string designating a region, one of
#' \code{genome}, \code{transcript}, \code{exome}, \code{unknown};
#' see \code{\link{ICAMS}}.
#'
#' @param catalog.type One of "counts", "density", "counts.signature",
#' "density.signature".
#'
#' @param abundance If \code{NULL}, then
#' inferred if \code{ref.genome}
#' is one of
#' the reference genomes known to ICAMS and \code{region}
#' is not \code{unknown}. See \code{\link{ICAMS}}.
#' The argument \code{abundance} should
#' contain the counts of different source sequences for mutations
#' in the same format as the numeric vectors in \code{\link{all.abundance}}.
#'
#' @param infer.rownames If \code{TRUE}, and \code{object} has no
#' rownames, then assume the rows of \code{object} are in the
#' correct order and add the rownames implied by the number of rows
#' in \code{object} (e.g. rownames for SBS 192 if there are 192 rows).
#' If \code{TRUE}, \strong{be sure the order of rows is correct.}
#'
#' @return A catalog as described in \code{\link{ICAMS}}.
#'
#' @export
as.catalog.for.ID115 <- function(object,
ref.genome = NULL,
region = "unknown",
catalog.type = "counts",
abundance = NULL,
infer.rownames = FALSE) {
if (!is.matrix(object)) {
if (is.data.frame(object)) {
object <- as.matrix(object)
} else if (is.vector(object)) {
obj2 <- matrix(object, ncol = 1)
rownames(obj2) <- names(object)
colnames(obj2) <- "Unknown"
object <- obj2
} else {
stop("object must be numeric matrix, vector, or data frame")
}
}
stopifnot(mode(object) == "numeric")
if (is.null(rownames(object))) {
if (!infer.rownames) {
stop("Require correct rownames on object unless infer.rownames == TRUE")
}
rownames(object) <- ICAMS:::InferRownames(object)
} else {
#removed call to checkandreorder rownames
object <- object[ICAMSxtra::catalog.row.order$ID115, ,drop=FALSE]
}
# StopIfRegionIllegal(region)
class.string <- "ID115Catalog"
ICAMS:::StopIfCatalogTypeIllegal(catalog.type)
if (!is.null(ref.genome)) {
ref.genome <- ICAMS:::NormalizeGenomeArg(ref.genome)
}
attr(object, "ref.genome") <- ref.genome
attr(object, "catalog.type") <- catalog.type
attr(object, "abundance") <- NULL
class(object) <- append(class(object), class.string, after = 0)
class(object) <- unique(attributes(object)$class)
attr(object, "region") <- region
return(object)
}
###############################################################################
# Catalog functions
###############################################################################
#' Create ID (small insertion and deletion) catalog from ID VCFs
#'
#' @param list.of.vcfs List of in-memory ID VCFs. The list names will be
#' the sample ids in the output catalog.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#' \code{\link{ICAMS}}.
#'
#' @param region A character string acting as a region identifier, one of
#' "genome", "exome".
#'
#' @param flag.mismatches Optional. If > 0, then if there are mismatches to
#' references in the ID (insertion/deletion) VCF, generate messages showing
#' the mismatched rows and continue. Otherwise \code{stop} if there are
#' mismatched rows. See \code{\link{AnnotateIDVCF}} for more details.
#'
#' @return A list of two elements. 1st element \code{catalog} is the ID (small
#' insertion and deletion) catalog with attributes added. See
#' \code{\link{as.catalog}} for more details. 2nd element
#' \code{annotated.vcfs} is a list of data frames which contain the original
#' VCF with three additional columns \code{seq.context.width},
#' \code{seq.context} and \code{ID.class} added. The category assignment of
#' each ID mutation in VCF can be obtained from \code{ID.class} column.
#'
#' @note In ID (small insertion and deletion) catalogs, deletion repeat sizes
#' range from 0 to 5+, but for plotting and end-user documentation
#' deletion repeat sizes range from 1 to 6+.
#'
#' @export
VCFsToID115Catalogs<-function(list.of.vcfs, ref.genome, region = "unknown",
flag.mismatches = 0){
ncol <- length(list.of.vcfs)
# Create a 0-column matrix with the correct row labels.
catID <- matrix(0, nrow = length(ICAMSxtra::catalog.row.order$ID115), ncol = 0)
rownames(catID) <- ICAMSxtra::catalog.row.order$ID115
out.list.of.vcfs <- list()
vcf.names <- names(list.of.vcfs)
for (i in 1:ncol) {
ID <- list.of.vcfs[[i]]
list <- AnnotateIDVCF(ID, ref.genome = ref.genome,
flag.mismatches = flag.mismatches,
name.of.VCF = vcf.names[i])
# Unlike the case for SBS and DBS, we do not
# add transcript information.
tmp <- CreateOneColID115Matrix(list$annotated.vcf)
one.ID.column <- tmp[[1]]
out.list.of.vcfs <- c(out.list.of.vcfs, list(tmp[[2]]))
rm(ID)
catID <- cbind(catID, one.ID.column)
}
colnames(catID) <- names(list.of.vcfs)
names(out.list.of.vcfs) <- names(list.of.vcfs)
return(list(catalog =
as.catalog.for.ID115(catID, ref.genome = ref.genome,
region = region, catalog.type = "counts"),
annotated.vcfs = out.list.of.vcfs))
}
#' "Collapse" a catalog
#'
#' @description
#'
#' \code{Collapse115CatalogTo83} Collapse a ID 115 catalog
#' to a ID 83 catalog.
#'
#' @param catalog A catalog as defined in \code{\link{ICAMS}}.
#'
#' @return A catalog as defined in \code{\link{ICAMS}}.
#'
#' @export
Collapse115CatalogTo83 <- function(catalog) {
dt115 <- data.table(catalog)
list83 <- list()
list83[1] <- sum(dt115[1:9])
list83[7] <- sum(dt115[15:23])
list83[13] <- sum(dt115[29:37])
list83[19] <- sum(dt115[43:51])
for (i in 0:4){
list83[i+2] <- dt115[i+10]
list83[i+8] <- dt115[i+24]
list83[i+14] <- dt115[i+38]
}
for (i in 0:63){
list83[i+20] <- dt115[i+52]
}
mat83 <- matrix(unlist(list83),ncol=1,byrow=TRUE)
rownames(mat83) <- ICAMS::catalog.row.order$ID
mat83 <- mat83[ICAMS::catalog.row.order$ID, , drop = FALSE]
cat83 <-
as.catalog(
object = mat83,
ref.genome = attributes(catalog)$ref.genome,
region = attributes(catalog)$region,
catalog.type = attributes(catalog)$catalog.type,
abundance = NULL)
#hdp0 etc... otherwise plot will not have names
attributes(cat83)$dimnames[[2]] <- attributes(catalog)$dimnames[[2]]
return(cat83)
}
#' "Collapse" a matrix of ID 115 catalogs to ID 83 catalog
#'
#' @param catalogs A catalog as defined in \code{\link{ICAMS}}.
#'
#' @return A catalog as defined in \code{\link{ICAMS}}.
#' @export
CollapseID115CatalogsToID83s <- function(catalogs){
n <- ncol(catalogs)
cat115s <-catalogs[, 1, drop=FALSE]
cat83s <- Collapse115CatalogTo83(cat115s)
for (i in 2:n){
cat115_i <- catalogs[, i, drop=FALSE]
cat83_i <-Collapse115CatalogTo83(cat115_i)
cat83s <- cbind(cat83s,cat83_i)
}
return (cat83s)
}
###############################################################################
# Read and write functions
###############################################################################
#' @title Write a catalog to a file.
#'
#' @param catalog A catalog as defined in \code{\link{ICAMS}} with attributes added.
#' See \code{\link{as.catalog}} for more details.
#'
#' @param file The path of the file to be written.
#'
#' @param strict If TRUE, then stop if additional checks on the input fail.
#'
#' @export
WriteID115Catalog <- function(catalog, file, strict = TRUE) {
ICAMS:::WriteCat(catalog, file, 115, ICAMSxtra::catalog.row.order$ID115,
catalog.row.headers.ID115, strict)
}
#' Read catalog
#'
#' Read a catalog in standardized format from path.
#'
#' See also \code{\link{WriteCatalog}}
#'
#' @param file Path to a catalog on disk in the standardized format.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#' \code{\link{ICAMS}}.
#'
#' @param region region A character string designating a genomic region;
#' see \code{\link{as.catalog}} and \code{\link{ICAMS}}.
#'
#' @param catalog.type One of "counts", "density", "counts.signature",
#' "density.signature".
#'
#' @return A catalog as an S3 object; see \code{\link{as.catalog}}.
#'
#' @note In ID (small insertion and deletion) catalogs, deletion repeat sizes
#' range from 0 to 5+, but for plotting and end-user documentation
#' deletion repeat sizes range from 1 to 6+.
#'
#' @section Comments:
#' To add or change attributes of the catalog, you can use function
#' \code{\link[base]{attr}}. \cr For example, \code{attr(catalog, "abundance")
#' <- custom.abundance}.
#'
#' @export
ReadID115Catalog <- function(file, ref.genome = NULL, region = "unknown",
catalog.type = "counts") {
cos <- data.table::fread(file)
stopifnot(nrow(cos) == 115)
rn <- apply(cos[ , 1:4], MARGIN = 1, paste, collapse = ":")
out <- as.matrix(cos[ , -(1:4), drop = FALSE])
stopifnot(setdiff(rn, ICAMSxtra::catalog.row.order$ID115) == c())
stopifnot(setdiff(ICAMSxtra::catalog.row.order$ID115, rn) == c())
stopifnot(rn == ICAMSxtra::catalog.row.order$ID115)
rownames(out) <- rn
# if (ncol(out) == 1) colnames(out) <- colnames(cos)[3]
out <- out[ICAMSxtra::catalog.row.order$ID115, , drop = FALSE]
return(as.catalog.for.ID115(out, ref.genome, region, catalog.type))
}
###############################################################################
# Plotting functions for ID(insertion and deletion) catalog start here
###############################################################################
#' Plot \strong{one} spectrum or signature
#'
#' Plot the spectrum of \strong{one} sample or plot \strong{one} signature. The
#' type of graph is based on one attribute("catalog.type") of the input catalog.
#' You can first use \code{\link{TransformCatalog}} to get different types of
#' catalog and then do the plotting.
#'
#' @param catalog A catalog as defined in \code{\link{ICAMS}} with attributes added.
#' See \code{\link{as.catalog}} for more details.
#'
#' @param ylim Has the usual meaning. Only implemented for SBS96Catalog and
#' IndelCatalog.
#'
#' @export
PlotID115Catalog <- function(catalog, ylim = NULL){
stopifnot(dim(catalog) == c(115, 1))
indel.class.col <- c("#fdbe6f",
"#ff8001",
"#b0dd8b",
"#36a12e",
"#fdcab5",
"#fc8a6a",
"#f14432",
"#bc141a",
"#d0e1f2",
"#94c4df",
"#4a98c9",
"#1764ab",
"#e2e2ef",
"#b6b6d8",
"#8683bd",
"#61409b")
num.classes <- length(catalog)
cols <- rep(indel.class.col,
c(14, 14, 14, 14,
6, 6, 6, 6,
6, 6, 6, 6,
1, 2, 3, 5))
to.plot <- catalog[, 1]
catalog.type <- attributes(catalog)$catalog.type
if (catalog.type == "counts") {
# Set a minimum value for ymax to make the plot more informative
ymax <- 4 * ceiling(max(max(to.plot) * 1.3, 10) / 4)
} else if (catalog.type == "counts.signature") {
ymax <- ifelse(max(to.plot) * 1.3 > 1, 1, max(to.plot) * 1.3)
} else {
stop('\nCan only plot IndelCatalog with "counts" or "counts.signature" catalog.type.')
}
if (is.null(ylim)) {
ylim <- c(0, ymax)
} else {
ymax <- ylim[2]
}
# Barplot
bp <- barplot(catalog[, 1], ylim = c(0, ymax), axes = FALSE, xaxt = "n",
lwd = 3, space = 1.35, border = NA, col = cols, xpd = NA,
xaxs = "i", yaxt = "n")
if (catalog.type == "counts") {
# Calculate and draw the total counts for each major type
counts <- integer(16)
idx <- c(14, 28, 42, 56, seq(62, 104, 6), 105, 107, 110, 115)
# midpoints of major types
# c(7, 21, 35, 49, 59.5, 65.5, 71.5, 77.5, 83.5, 89.5, 95.5, 101.5, 105, 106.5, 109, 112.5)
midx <- c(seq(7, 49, 14), seq(59.5, 101.5, 6), 105, 106.5, 109, 112.5)
# scale midpoints to horizontal width of plot
idx2 <- midx * 270 / 115
for (i in 1:16) {
if (i == 1) {
counts[i] <- sum(catalog[1:idx[1], 1])
} else {
counts[i] <- sum(catalog[(idx[i - 1] + 1):idx[i], 1])
}
text(idx2[i], ymax * 0.6, labels = counts[i],
cex = 0.68, adj = 1, xpd = NA)
}
}
# Draw box and grid lines
rect(xleft = bp[1] - 1.0, 0, xright = bp[num.classes] + 1.0, ymax, col = NA,
border = "grey60", lwd = 0.5, xpd = NA)
segments(bp[1] - 1.0, seq(0, ymax, ymax / 4), bp[num.classes] + 1,
seq(0, ymax, ymax / 4), col = "grey60", lwd = 0.5, xpd = NA)
# Draw mutation class labels and lines above each class
maj.class.names <- c("1bp deletion", "1bp insertion",
">1bp deletions at repeats\n(Deletion length)",
">1bp insertions at repeats\n(Insertion length)",
"Deletions with microhomology\n(Deletion length)")
x.left <- bp[c(seq(1, 57, 14), seq(63, 99, 6), 105, 106, 108, 111)] - 0.8
x.right <- bp[c(seq(14, 56, 14), seq(62, 104, 6), 105, 107, 110, 115)] + 0.8
class.pos <- c((x.left[seq(1, 4, 2)] + x.right[seq(2, 5, 2)]) / 2,
(x.left[c(6, 10)] + x.right[c(8, 12)] - 12) / 2,
(x.left[13] + x.right[length(x.left)]) / 2)
category.lab <- c(rep(c("C", "T"), 2), rep(c("2", "3", "4", "5+"), 3))
category.col <- c(rep(c("black", "white"), 2),
rep(c("black", "black", "black", "white"), 3))
# Draw lines above each class
# draw at ymax instead of ymax
rect(xleft = x.left, ymax * 1.02, xright = x.right, ymax * 1.09,
col = indel.class.col, border = NA, xpd = NA)
text((x.left + x.right) / 2, ymax * 1.05, labels = category.lab,
cex = 0.65, col = category.col, xpd = NA)
# Draw mutation class labels at the top of the figure
text(class.pos, ymax * 1.2, labels = maj.class.names, cex = 0.75, xpd = NA)
# Draw the sample name information of the sample
text(1.5, ymax * 7 / 8, labels = colnames(catalog),
adj = 0, cex = 0.8, font = 2)
# Draw y axis
y.axis.values <- seq(0, ymax, ymax / 4)
if (attributes(catalog)$catalog.type != "counts") {
y.axis.labels <- format(round(y.axis.values, 2), nsmall = 2)
text(-9, ymax / 2, labels = "counts proportion",
srt = 90, xpd = NA, cex = 0.8)
} else {
y.axis.labels <- y.axis.values
text(-8, ymax / 2, labels = "counts",
srt = 90, xpd = NA, cex = 0.8)
}
text(0, y.axis.values, labels = y.axis.labels,
las = 1, adj = 1, xpd = NA, cex = 0.75)
# Draw x axis labels
mut.type <- c("AA","AT","AG","TA","TT","TG","GA","GT","GG",
"2", "3", "4", "5", "6+",
"AA","AC","AG","CA","CC","CG","GA","GC","GG",
"2", "3", "4", "5", "6+",
"AA","AT","AG","TA","TT","TG","GA","GT","GG",
"1", "2", "3", "4", "5+",
"AA","AC","AG","CA","CC","CG","GA","GC","GG",
"1", "2", "3", "4", "5+",
rep(c("1", "2", "3", "4", "5", "6+"), 4),
rep(c("0", "1", "2", "3", "4", "5+"), 4),
"1", "1", "2", "1", "2", "3", "1", "2", "3", "4", "5+")
bottom.pos <- c((x.left[1] + x.right[2]) / 2, (x.left[3] + x.right[4]) / 2,
class.pos[3 : length(class.pos)])
bottom.lab <- c("Homopolymer length", "Homopolymer length",
"Number of repeat units", "Number of repeat units",
"Microhomology length")
rect(xleft = x.left, -ymax * 0.09, xright = x.right, -ymax * 0.02,
col = indel.class.col, border = NA, xpd = NA)
text(bp, -ymax * 0.11, labels = mut.type, cex = 0.65, xpd = NA, srt = 270, adj = 0)
text(bottom.pos, -ymax * 0.27, labels = bottom.lab, cex = 0.75, xpd = NA)
return(list(plot.success = TRUE))
}
#' Plot catalog to a PDF file
#'
#' Plot catalog to a PDF file. The type of graph is based on one
#' attribute("catalog.type") of the input catalog. You can first use
#' \code{\link{TransformCatalog}} to get different types of catalog and then do
#' the plotting.
#'
#' @param catalog A catalog as defined in \code{\link{ICAMS}} with attributes added.
#' See \code{\link{as.catalog}} for more details.
#'
#' @param file The name of the PDF file to be produced.
#' @param ylim Has the usual meaning. Only implemented for SBS96Catalog and IndelCatalog.
#'
#' @return A list whose first element is a logic value indicating whether the
#' plot is successful. For \strong{SBS192Catalog} with "counts" catalog.type
#' and non-null abundance and \code{plot.SBS12 = TRUE}, the list will have a
#' second element which is a list containing the strand bias statistics.
#'
#' @note The sizes of repeats involved in deletions range from 0 to 5+ in the
#' mutational-spectra and signature catalog rownames, but for plotting and
#' end-user documentation deletion repeat sizes range from 1 to 6+.
#'
#' @export
PlotID115CatalogToPdf <-
function(catalog, file, ylim = NULL) {
# Setting the width and length for LANDSCAPE A4 size plotting
grDevices::pdf(file, width = 11.6929, height = 8.2677, onefile = TRUE)
n <- ncol(catalog)
opar <- par(mfrow = c(4, 1), mar = c(3, 4, 2.5, 2), oma = c(3, 3, 2, 2))
on.exit(par(opar))
for (i in 1 : n) {
cat <- catalog[, i, drop = FALSE]
PlotID115Catalog(cat, ylim = ylim)
}
grDevices::dev.off()
return(list(plot.success = TRUE))
}
#' @title Plot an ID 115 signatures (default) or catalog as standard ID83 and save as pdf file
#'
#' @param file The name of the PDF file to be produced.
#'
#' @param ylim Has the usual meaning. Only implemented for SBS96Catalog and
#' IndelCatalog.
#'
#' @param catalog A catalog as defined in \code{\link{ICAMS}}.
#'
#' @return A list whose first element is a logic value indicating whether the
#' plot is successful. For \strong{SBS192Catalog} with "counts" catalog.type
#' and non-null abundance and \code{plot.SBS12 = TRUE}, the list will have a
#' second element which is a list containing the strand bias statistics.
#'
#' @export
PlotID115AsID83ToPdf <-
function(catalog, file, ylim = NULL) {
# Setting the width and length for LANDSCAPE A4 size plotting
grDevices::pdf(file, width = 11.6929, height = 8.2677, onefile = TRUE)
n <- ncol(catalog)
opar <- par(mfrow = c(4, 1), mar = c(3, 4, 2.5, 2), oma = c(3, 3, 2, 2))
on.exit(par(opar))
for (i in 1 : n) {
cat115 <- catalog[, i, drop = FALSE]
cat83 <- Collapse115CatalogTo83(cat115)
ICAMS:::PlotCatalog.IndelCatalog(cat83, ylim = ylim)
}
grDevices::dev.off()
return(list(plot.success = TRUE))
}
###############################################################################
# Combined functions
###############################################################################
#' @title Read a list of vcfs and plot ID115 catalogs as pdf
#'
#' @param list.of.vcfs List of in-memory ID VCFs. The list names will be
#' the sample ids in the output catalog.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#' \code{\link{ICAMS}}.
#'
#' @param region A character string acting as a region identifier, one of
#' "genome", "exome".
#'
#' @param flag.mismatches Optional. If > 0, then if there are mismatches to
#' references in the ID (insertion/deletion) VCF, generate messages showing
#' the mismatched rows and continue. Otherwise \code{stop} if there are
#' mismatched rows. See \code{\link{AnnotateIDVCF}} for more details.
#'
#' @param file The name of the PDF file to be produced.
#'
#' @param ylim Has the usual meaning. Only implemented for SBS96Catalog and
#' IndelCatalog.
#'
#' @export
VCFsToID115CatalogsAndPlotToPdf <-
function(list.of.vcfs, ref.genome, region = "unknown",
flag.mismatches = 0, file, ylim = NULL){
catalogs <- VCFsToID115Catalogs(list.of.vcfs, ref.genome, region)$catalog
PlotID115CatalogToPdf(catalogs, file, ylim)
}
###############################################################################
# Strand bias functions
###############################################################################
#' @keywords internal
revcID115 <- function(string) {
if (string %in% c(target_pooled, reverse_pooled)){
idx_py <- which(grepl(string, target_pooled, fixed=TRUE))
idx_pu <- which(grepl(string, reverse_pooled, fixed=TRUE))
if (length(idx_py)==1){
return(reverse_pooled[idx_py])
}
if (length(idx_pu)==1){
return(target_pooled[idx_pu])
}
stop(paste0("Tried to reverse ", string," but it was not recognized\n"))
} else {
idx_py <- which(grepl(string, target, fixed=TRUE))
idx_pu <- which(grepl(string, reverse, fixed=TRUE))
if (length(idx_py)==1){
return(reverse[idx_py])
}
if (length(idx_pu)==1){
return(target[idx_pu])
}
stop(paste0("Tried to reverse ", string," but it was not recognized\n"))
}
}
#' @keywords internal
StrandBiasGetPlottablesID115 <-
function(annotated.ID.vcf, damaged.base, vcf.name, pool) {
df <- annotated.ID.vcf
df <- df[!is.na(trans.gene.symbol), ]
df1 <- df[, .(REF = REF[1], ALT= ALT[1], trans.strand = trans.strand[1],
seq.context = seq.context[1], seq.context.width = seq.context.width[1],
trans.gene.symbol = trans.gene.symbol[1],
trans.start.pos = trans.start.pos[1],
trans.end.pos = trans.end.pos[1]),
by = .(CHROM, POS)]
if (pool) {
target_i <- target_pooled
reverse_i <- reverse_pooled
} else {
target_i <- target
reverse_i <- reverse
}
# Construct a matrix that later can be used to plot transcriptional strand bias
result <- matrix(data = 0, nrow = 1, ncol = 2*length(target_i))
rownames(result) <- c(1)
mutation.type <- c(target_i, reverse_i)
colnames(result) <- mutation.type
if (nrow(df1) > 0){
df1$mutation <- CanonicalizeID115(df1$seq.context, df1$REF, df1$ALT, df1$seq.context.width + 1, strand = df1$trans.strand)
# filter for non-homopolymer single base indels
df1 <- df1[which(grepl(pattern = "trans", x = df1$mutation)),,]
# if aren't any non-homopolymer single base indels
# then return value is same as if original df1 == 0
if (nrow(df1) > 0){
if (pool){
df1$mutation1 <- substr(df1$mutation, 1, 5)
df1$mutation2 <- substr(df1$mutation, 14, nchar(df1$mutation))
df1$mutation <- paste0(df1$mutation1, ":", df1$mutation2)
df1 <- df1[, c(-12,-13)]
}
for (i in 1:length(target_i)) {
type <- mutation.type[i]
idx <- which(unname(mapply(grepl, type, df1$mutation)))
df2 <- df1[idx,,drop=FALSE]
if (nrow(df2) == 0) {
result[, type] <- 0
result[, revcID115(type)] <- 0
} else {
nrow.nontrans <- length(which(grepl("nontrans", df2$mutation,fixed=TRUE)))
result[1, revcID115(type)] <- nrow.nontrans
result[1, type] <- nrow(df2) - nrow.nontrans
}
}
}
}
# Carry out logistic regression and get the p-values
p.values <- CalculatePValuesID115(df1, pool = pool)
return(list(plotmatrix = result, vcf.df = df1,
p.values = p.values, vcf.name = vcf.name))
}
#' @importFrom stats binom.test p.adjust
#' @keywords internal
CalculatePValuesID115 <- function(dt, pool) {
if (pool){
target_i <- target_pooled
} else {
target_i <- target
}
p.values <- numeric(length(target_i))
names(p.values) <- target_i
#construct a matrix with 4 (or 36) columns and 2 rows
mat <- matrix(0L, nrow = 2, ncol = length(target_i))
colnames(mat) <- target_i
rownames(mat) <- c("trans", "nontrans")
if (nrow(dt)> 0){
for (i in 1:nrow(dt)){
mutation <- dt[i, mutation]
if (pool){
col.name <- substr(mutation, 1, 5)
row.name <- substr(mutation, 7, nchar(mutation))
} else {
col.name <- substr(mutation, 1, 12)
row.name <- substr(dt[i, mutation], 14, nchar(mutation))
}
mat[row.name, col.name] <- mat[row.name, col.name] + 1
}
}
# proportion of indels on transcribed strand of genes
prop.transcribed <- 0.5
for (type in target_i){
htest <- binom.test(x = mat[, type], p = prop.transcribed,
alternative = "two.sided")
p.values[type] <- htest$p.value
}
# Adjust p-values for multiple comparisons to Benjamini-Hochberg false discovery rate
q.values <- p.adjust(p.values, method = "BH")
return(q.values)
}
#' @keywords internal
PlotTransBiasID115Internal <- function(list, ymax = NULL) {
result <- list$plotmatrix
# determined whether to plot pooled version based on ncol of plotmatrix
if (ncol(result) == 8){
pool = TRUE
} else {
if (ncol(result) == 72){
pool = FALSE
} else {
stop("Error in `list` argument passed to PlotTransBiasID115Internal. ncol of plotmatrix is neither 8 nor 72")
}
}
if (is.null(ymax)) {
ymax <- max(result)
}
if (pool){
tmp <- matrix(0, nrow = 2, ncol = 4)
tmp[1,] <- result[1:4]
tmp[2,] <- result[5:8]
bpcol = rep(c('#394398', '#e83020'), 4)
bplabels = colnames(list$plotmatrix)[1:4]
} else {
tmp <- matrix(0, nrow = 2, ncol = 36)
tmp[1,] <- result[1:36]
tmp[2,] <- result[37:72]
bpcol = rep(c('#394398', '#e83020'), 36)
bplabels = colnames(list$plotmatrix)[1:36]
}
bp <- barplot(tmp, beside = TRUE, main = list$vcf.name,
ylim = c(0, ifelse(ymax != 0, 1.5 * ymax, 5)),
col = bpcol, axisnames = FALSE,
ylab = "counts")
colnames(bp) <- bplabels
rownames(bp) <- c("Transcribed", "Untranscribed")
legend.list <- legend("topright", legend = c("Transcribed", "Untranscribed"),
fill = c('#394398', '#e83020'), bty = "n", cex = 0.8)
if (pool){
text(bp, x = (bp[1,]+bp[2,])/2, y = -0.1 * ymax, labels = bplabels,
pos = 1, xpd = NA, cex = 1)
} else {
text(bp, x = (bp[1,]+bp[2,])/2, y = - 0.3 * ymax, labels = bplabels,
pos = 1, xpd = NA, cex = 0.5, srt = 270)
}
for (type in bplabels){
p.value <- list$p.values[type]
if (!is.na(p.value)) {
# draw asterisk if p.value < 0.05
if (p.value < 0.05){
# Get the x coordinates of the line segment to be drawn
x1 <- bp[1, type]
x2 <- bp[2, type]
# Get the y coordinates of the line segment to be drawn
y1 <- y2 <- max(tmp) + 0.03 * max(result)
# Draw the line segment
segments(x1, y1, x2, y2)
# Draw the asterisk on top of line segment
x3 <- mean(c(x1, x2))
y3 <- y1 + max(result) * 0.05
label <- ICAMS:::AssignNumberOfAsterisks(p.value) #p.values actually are q.value
text(x3, y3, label)
}
}
}
}
#' @keywords internal
CheckSeqContextInIDVCF <- function(vcf, column.to.use) {
if (0 == nrow(vcf)) return()
stopifnot(!any(vcf$REF == '-'))
stopifnot(!any(vcf$ALT == '-'))
cut.pos <- 1 + (nchar(vcf$column.to.use) - 1) / 2
stopifnot(cut.pos == round(cut.pos))
cut.from.ref <- substr(vcf$column.to.use, cut.pos,
(cut.pos + nchar(vcf$REF)) - 1)
error.rows <- which(vcf$REF != cut.from.ref)
if (any(error.rows > 0)) {
temp <- tempfile(fileext = ".csv")
write.csv(vcf[error.rows, ], file = temp)
stop("Seqence context of reference allele is inconsistent,",
"see file ", temp)
}
}
#' Plot transcription strand bias
#'
#' @param annotated.ID.vcf An ID VCF annotated by
#' \code{AnnotateIDVCFsWithTransRanges}. It \strong{must} have transcript range
#' information added.
#'
#' @param pool if true, 36 categories will be pooled to 4 categories by removing
#' trinucleotide context. This can be done if the counts of individual categories
#' are too low, to increase power.
#'
#' @param damaged.base One of \code{NULL}, \code{"purine"} or
#' \code{"pyrimidine"}. This function allocates approximately
#' equal numbers of mutations from \code{damaged.base} into
#' each of \code{num.of.bins} bin by expression level. E.g.
#' if \code{damaged.base} is \code{"purine"}, then mutations from
#' A and G will be allocated in approximately equal numbers to
#' each expression-level bin. The rationale for the name \code{damaged.base}
#' is that the direction of strand bias is a result of whether the damage
#' occurs on a purine or pyrimidine.
#' If \code{NULL}, the function attempts to infer the \code{damaged.base}
#' based on mutation counts.
#'
#' @param ymax Limit for the y axis. If not specified, it defaults to NULL and
#' the y axis limit equals 1.5 times of the maximum mutation counts in a
#' specific mutation type.
#'
#' @return A list whose first element is a logic value indicating whether the
#' plot is successful. The second element is a named numeric vector containing
#' the p-values printed on the plot.
#'
#' @section Note:
#' The strand bias statistics are Benjamini-Hochberg q-values based on two-sided
#' binomial tests of the mutation counts on the transcribed and untranscribed strands
#' relative to the actual abundances of C and T on the transcribed strand. On the
#' plot, asterisks indicate q-values as follows *, \eqn{Q<0.05}; **, \eqn{Q<0.01}; ***,
#' \eqn{Q<0.001}.
#'
#' @import data.table
#'
#' @export
#'
#' @examples
#' file <- c(system.file("extdata/Strelka-ID-vcf/",
#' "Strelka.ID.GRCh37.s1.vcf",
#' package = "ICAMSxtra"))
#' ID.vcf <- ICAMS::ReadAndSplitVCFs(file, variant.caller = "strelka")$ID
#' if (requireNamespace("BSgenome.Hsapiens.1000genomes.hs37d5", quietly = TRUE)) {
#' annotated.ID.vcf <- AnnotateIDVCFsWithTransRanges(ID.vcf, ref.genome = "hg19",
#' trans.ranges = ICAMS::trans.ranges.GRCh37,
#' vcf.names = "Strelka.ID.GRCh37.s1.vcf")
#' #' alternatively run below code to skip call to AnnotateIDVCFsWithTransRanges
#' #' load(c(system.file("extdata/annotated.ID.vcf.rda",package = "ICAMSxtra")))
#' PlotTransBiasID115(annotated.ID.vcf = annotated.ID.vcf[[1]],
#' pool = TRUE)
#' }
PlotTransBiasID115 <-
function(annotated.ID.vcf, pool, damaged.base = NULL, ymax = NULL) {
if (is.null(names(annotated.ID.vcf))){
stop("annotated.ID.vcf does not have 'name' property")
}
list <- StrandBiasGetPlottablesID115(annotated.ID.vcf, damaged.base, pool=pool, vcf.name=names(annotated.ID.vcf))
PlotTransBiasID115Internal(list = list, ymax = ymax)
return(list(plot.success = TRUE, p.values = list$p.values))
}
#' Plot transcription strand bias to a PDF file
#'
#' @inheritParams PlotTransBiasID115
#'
#' @param annotated.ID.vcfs ID vcfs which have been annotated with
#' \code{AnnotateIDVCFsWithTransRanges}.
#'
#' @param file The name of output file.
#'
#' @importFrom stats glm
#'
#' @inherit PlotTransBiasID115 return
#'
#' @inheritSection PlotTransBiasID115 Note
#'
#' @export
#'
#' @examples
#' library(ICAMS)
#' file <- c(system.file("extdata/Strelka-ID-vcf/",
#' "Strelka.ID.GRCh37.s1.vcf",
#' package = "ICAMSxtra"))
#' ID.vcf <- ICAMS::ReadAndSplitVCFs(file, variant.caller = "strelka")$ID
#' if (requireNamespace("BSgenome.Hsapiens.1000genomes.hs37d5", quietly = TRUE)) {
#' annotated.ID.vcf <- AnnotateIDVCFsWithTransRanges(ID.vcf, ref.genome = "hg19",
#' trans.ranges = ICAMS::trans.ranges.GRCh37,
#' vcf.names = "Strelka.ID.GRCh37.s1")
#' PlotTransBiasID115ToPdf(annotated.ID.vcfs = annotated.ID.vcf,
#' file = file.path(tempdir(), "test.pdf"),
#' pool = TRUE)
#' }
PlotTransBiasID115ToPdf <-
function(annotated.ID.vcfs, file, pool, damaged.base = NULL) {
# Setting the width and length for A4 size plotting
grDevices::pdf(file, width = 8.2677, height = 11.6929, onefile = TRUE)
opar <- par(mfrow = c(4, 1), mar = c(8, 5.5, 2, 1), oma = c(1, 1, 2, 1))
on.exit(par(opar))
plot.type <- ifelse(pool, target_pooled, target)
n <- length(annotated.ID.vcfs)
vcf.names <- names(annotated.ID.vcfs)
results <- list()
for (i in 1:n){
list <- StrandBiasGetPlottablesID115(annotated.ID.vcf = annotated.ID.vcfs[[i]],
vcf.name = vcf.names[[i]],
pool = pool,
damaged.base = damaged.base)
ymax <- max(list$plotmatrix[, c(plot.type, unname(mapply(revcID115, plot.type)))])
PlotTransBiasID115Internal(list = list, ymax = NULL)
results[[vcf.names[i]]] <- list(plot.success = TRUE, p.values = list$p.values)
}
grDevices::dev.off()
return(results)
}
#' Add sequence context and transcript information to an in-memory ID VCF
#'
#' @param ID.vcfs A list of in-memory ID VCF as a \code{data.frame}.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#' \code{\link{ICAMS}}.
#'
#' @param trans.ranges Optional. If \code{ref.genome} specifies one of the
#' \code{BSgenome} object
#' \enumerate{
#' \item \code{BSgenome.Hsapiens.1000genomes.hs37d5}
#' \item \code{BSgenome.Hsapiens.UCSC.hg38}
#' \item \code{BSgenome.Mmusculus.UCSC.mm10}
#' }
#' then the function will infer \code{trans.ranges} automatically. Otherwise,
#' user will need to provide the necessary \code{trans.ranges}. Please refer to
#' \code{\link{TranscriptRanges}} for more details.
#' If \code{is.null(trans.ranges)} do not add transcript range
#' information.
#'
#' @param vcf.names list of names of the vcfs
#'
#' @import ICAMS
#'
#' @return A list of in-memory ID VCFs each a \code{data.table}. These have been annotated
#' with the sequence context (column name \code{seq.context}) and with
#' transcript information in the form of a gene symbol (e.g. \code{"TP53"})
#' and transcript strand. This information is in the columns
#' \code{trans.start.pos}, \code{trans.end.pos} , \code{trans.strand},
#' \code{trans.Ensembl.gene.ID} and \code{trans.gene.symbol} in the output.
#' These columns are not added if \code{is.null(trans.ranges)}.
#'
#' @export
#'
#' @examples
#' file <- c(system.file("extdata/Strelka-ID-vcf",
#' "Strelka.ID.GRCh37.s1.vcf",
#' package = "ICAMSxtra"))
#' list.of.vcfs <- ICAMS::ReadAndSplitVCFs(file, variant.caller = "strelka")$ID
#' if (requireNamespace("BSgenome.Hsapiens.1000genomes.hs37d5", quietly = TRUE)) {
#' annotated.ID.vcfs <- AnnotateIDVCFsWithTransRanges(list.of.vcfs, ref.genome = "hg19",
#' trans.ranges = ICAMS::trans.ranges.GRCh37)
#' }
AnnotateIDVCFsWithTransRanges <-
function(ID.vcfs, ref.genome, trans.ranges = NULL, vcf.names = NULL) {
if (is.null(vcf.names)) {
vcf.names <- names(ID.vcfs)
}
n <- length(ID.vcfs)
retval <- list()
for (i in 1:n){
ID.vcf <- AnnotateIDVCF(ID.vcfs[[i]], ref.genome=ref.genome)[[1]]
CheckSeqContextInIDVCF(ID.vcf, "seq.context")
trans.ranges <- ICAMS:::InferTransRanges(ref.genome, trans.ranges)
if (!is.null(trans.ranges)) {
ID.vcf <-
ICAMS:::AddTranscript(ID.vcf,
trans.ranges = trans.ranges,
ref.genome = ref.genome)
}
retval[[vcf.names[[i]]]] <- (as.data.table(ID.vcf))
}
return(retval)
}
###############################################################################
# S3 Methods
###############################################################################
#' @export
`[.ID115Catalog` <- function (x, i, j, drop = if (missing(i)) TRUE else length(cols) ==
1) {
y <- NextMethod("[")
if (inherits(y, c("integer", "numeric"))) {
return(y)
} else {
class(y) <- class(x)
for (at in c("ref.genome", "catalog.type", "abundance", "region")) {
attr(y, at) <- attr(x, at, exact = TRUE)
}
return(y)
}
}
#' @export
`cbind.ID115Catalog` <- function (..., deparse.level = 1) {
x <- base::cbind.data.frame(..., deparse.level = deparse.level)
x <- data.matrix(x)
class(x) <- class(..1)
list0 <- list(...)
x <- CheckAndAssignAttributes(x, list0)
return(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.