R/ID115_fns.R

Defines functions StrandBiasGetPlottablesID115 revcID115 VCFsToID115CatalogsAndPlotToPdf PlotID115AsID83ToPdf PlotID115CatalogToPdf PlotID115Catalog ReadID115Catalog WriteID115Catalog CollapseID115CatalogsToID83s Collapse115CatalogTo83 VCFsToID115Catalogs as.catalog.for.ID115 CreateOneColID115Matrix CanonicalizeID115 Canonicalize1ID115 Canonicalize1INS115 Canonicalize1Del115

Documented in as.catalog.for.ID115 Canonicalize1Del115 Canonicalize1ID115 Canonicalize1INS115 CanonicalizeID115 Collapse115CatalogTo83 CollapseID115CatalogsToID83s CreateOneColID115Matrix PlotID115AsID83ToPdf PlotID115Catalog PlotID115CatalogToPdf ReadID115Catalog VCFsToID115Catalogs VCFsToID115CatalogsAndPlotToPdf WriteID115Catalog

###############################################################################
# Utility functions
###############################################################################

#' Given a deletion and its sequence context, categorize it
#' 
#' This function is primarily for internal use, but we export it
#' to document the underlying logic.
#' 
#' See 
#' \url{https://github.com/steverozen/ICAMS/raw/master/data-raw/PCAWG7_indel_classification_2017_12_08.xlsx}
#' for additional information on deletion 
#' mutation classification.
#' 
#' This function first handles deletions in homopolymers, then
#' handles deletions in simple repeats with
#' longer repeat units, (e.g. \code{CACACACA}, see
#' \code{\link{FindMaxRepeatDel}}),
#' and if the deletion is not in a simple repeat,
#' looks for microhomology (see \code{\link{FindDelMH}}).
#' 
#' See the code for unexported function \code{\link{CanonicalizeID}}
#' and the functions it calls for handling of insertions.
#'
#' @param context The deleted sequence plus ample surrounding
#'   sequence on each side (at least as long as \code{del.seq}).
#'
#' @param del.seq The deleted sequence in \code{context}.
#'
#' @param pos The position of \code{del.sequence} in \code{context}.
#'
#' @param trace If > 0, then generate messages tracing
#' how the computation is carried out.
#' 
#' @param strand NULL by default. But when called by PlotTransBiasInternal,
#' \code{strand} is either \code{+} or \code{-}. The return value will 
#' include :trans or :nontrans indicating whether the deletion occurred on
#' the transcribed or non-transcribed strand.
#' 
#' @return A string that is the canonical representation
#'  of the given deletion type. Return \code{NA} 
#'  and raise a warning if
#'  there is an un-normalized representation of
#'  the deletion of a repeat unit.
#'  See \code{FindDelMH} for details.
#'  (This seems to be very rare.)
#'  
#'  @keywords internal
Canonicalize1Del115 <- function(context, del.seq, pos, trace = 0, strand) {
  # Is the deletion involved in a repeat?
  rep.count <- FindMaxRepeatDel(context, del.seq, pos)
  
  rep.count.string <- ifelse(rep.count >= 5, "5+", as.character(rep.count))
  deletion.size <- nchar(del.seq)
  deletion.size.string <- 
    ifelse(deletion.size >= 5, "5+", as.character(deletion.size))
  
  # Category is "1bp deletion"
  if (deletion.size == 1) {
    if (rep.count == 0){
      if (del.seq %in% c("A","G")){
        context <- revc(context)
        pos <- nchar(context) - pos + 1
        del.seq <- substr(context,pos,pos)
        preceding.base <- substr(context,pos+1,pos+1)
        proceeding.base <- substr(context,pos-1,pos-1)
        retval <- paste0("DEL:",del.seq,":1:0_",preceding.base,proceeding.base)
        if (!is.na(strand)){
          retval <- paste0(retval, ifelse(strand=="+", ":nontrans", ":trans"))
        }
      } else {
        preceding.base <- substr(context,pos-1,pos-1)
        proceeding.base <- substr(context,pos+1,pos+1)
        retval <- paste0("DEL:",del.seq,":1:0_",preceding.base,proceeding.base)
        if (!is.na(strand)){
          retval <- paste0(retval, ifelse(strand=="+", ":trans", ":nontrans"))
        }
      }
      return(retval)
      
    } else {
      if (del.seq == "A") del.seq <- "T"
      if (del.seq == "G") del.seq <- "C"
      return(paste0("DEL:", del.seq, ":1:", rep.count.string))
    }
  }
  
  # Category is ">2bp deletion"
  if (rep.count > 0) {
    return(
      paste0("DEL:repeats:", deletion.size.string, ":", rep.count.string))
  }
  
  # We have to look for microhomology
  microhomology.len <- FindDelMH(context, del.seq, pos, trace = trace)
  if (microhomology.len == -1) {
    warning("Non-normalized deleted repeat ignored:",
            "\ncontext: ", context,
            "\ndeleted sequence: ", del.seq,
            "\nposition of deleted sequence: ", pos)
    return(NA)
  }
  
  if (microhomology.len == 0) {
    stopifnot(rep.count.string == 0)
    # Categorize and return non-repeat, non-microhomology deletion
    return(paste0("DEL:repeats:", deletion.size.string, ":0"))
  }
  
  microhomology.len.str <-
    ifelse(microhomology.len >= 5, "5+", as.character(microhomology.len))
  
  return(paste0(
    "DEL:MH:", deletion.size.string, ":", microhomology.len.str))
}


#' Given an insertion and its sequence context, categorize it.
#' @param context 
#' @param ins.sequence 
#' @param pos The position of \code{ins.sequence} in \code{context}.
#' @param trace 
#' @param strand
#'
#' @return A string that is the canonical representation of 
#' the given insertion type.
#' 
#' @details same as Canonicalize1INS except when 
#' \code{insertion.size == 1} & \code{rep.count == 0}
#' in which the format will be \code{INS:Y:1:0_PF}
#' where Y is C or T, P is the preceding nucleotide, and
#' F is the following nucleotide
#' 
#' @keywords internal
Canonicalize1INS115 <- function(context, ins.sequence, pos, trace = 0, strand) {
  if (trace > 0) {
    message("Canonicalize1ID(", context, ",", ins.sequence, ",", pos, "\n")
  }
  rep.count <- ICAMS:::FindMaxRepeatIns(context, ins.sequence, pos)
  rep.count.string <- ifelse(rep.count >= 5, "5+", as.character(rep.count))
  insertion.size <- nchar(ins.sequence)
  insertion.size.string <-
    ifelse(insertion.size >= 5, "5+", as.character(insertion.size))
  
  if (insertion.size == 1) {
    if (rep.count == 0){
      if (ins.sequence %in% c("A","G")){
        context <-  revc(context)
        ins.sequence <-  revc(ins.sequence)
        pos <- nchar(context) - pos + 1
        preceding.base <- substr(context, pos, pos)
        proceeding.base <- substr(context, pos-1, pos-1)
        retval <- paste0("INS:",ins.sequence,":1:0_",preceding.base,proceeding.base)
        if (!is.na(strand)){
          retval <- paste0(retval, ifelse(strand=="+",":nontrans", ":trans"))
        }
      } else {
        preceding.base <- substr(context, pos, pos)
        proceeding.base <- substr(context, pos+1, pos+1)
        retval <- paste0("INS:",ins.sequence,":1:0_",preceding.base,proceeding.base)
        if (!is.na(strand)){
          retval <- paste0(retval, ifelse(strand=="+", ":trans", ":nontrans"))
        }
      }
    } else {
      if (ins.sequence == "A") ins.sequence <- "T"
      if (ins.sequence == "G") ins.sequence <- "C"
      retval <-
        (paste0("INS:", ins.sequence, ":1:", rep.count.string))
    }
  } else { 
    retval <-
      paste0("INS:repeats:", insertion.size.string, ":", rep.count.string)
  }
  if (trace > 0) message(retval)
  return(retval)
}

#' @title Given a single insertion or deletion in context categorize it.
#'
#' @param context Ample surrounding
#'   sequence on each side of the insertion or deletion.
#'
#' @param ref The reference allele (vector of length 1)
#'
#' @param alt The alternative allele (vector of length 1)
#'
#' @param pos The position of \code{ins.or.del.seq} in \code{context}.
#'
#' @param trace If > 0, then generate messages tracing
#' how the computation is carried out.
#' 
#' @param strand NULL by default. But when called by \code{StrandBiasGetPlottablesID115},
#'  \code{strand} is either \code{+} or \code{-}. The return value will 
#'  include :trans or :nontrans indicating whether the deletion occurred on
#'  the transcribed or non-transcribed strand.
#
#' @return A string that is the canonical representation
#'  of the type of the given
#'  insertion or deletion.
#'  Return \code{NA} 
#'  and raise a warning if
#'  there is an un-normalized representation of
#'  the deletion of a repeat unit.
#'  See \code{FindDelMH} for details.
#'  (This seems to be very rare.)
#'
#' @keywords internal
Canonicalize1ID115 <- function(context, ref, alt, pos, trace = 0, strand) {
  if (trace > 0) {
    message("Canonicalize1ID115(", context, ",", ref, ",", alt, ",", pos, "\n")
  }
  if (nchar(alt) < nchar(ref)) {
    # A deletion
    return(Canonicalize1Del115(context, ref, pos + 1, trace, strand))
  } else if (nchar(alt) > nchar(ref)) {
    # An insertion
    return(Canonicalize1INS115(context, alt, pos, trace, strand))
  } else {
    stop("Non-insertion / non-deletion found: ", ref, " ", alt, " ", context)
  }
}

#' @title Determine the mutation types of insertions and deletions.
#'
#' @param context A vector of ample surrounding
#'   sequence on each side the variants
#'
#' @param ref Vector of reference alleles
#'
#' @param alt Vector of alternative alleles
#'
#' @param pos Vector of the positions of the insertions and deletions in
#'  \code{context}.
#'  
#' @param strand NULL by default. But when called by \code{StrandBiasGetPlottablesID115},
#'  \code{strand} is either \code{+} or \code{-}. The return value will 
#'  include :trans or :nontrans indicating whether the deletion occurred on
#'  the transcribed or non-transcribed strand.
#' 
#' @return A vector of strings that are the canonical representations
#'  of the given insertions and deletions.
#'
#' @importFrom utils head
#'
#' @keywords internal
CanonicalizeID115 <- function(context, ref, alt, pos, strand) {
  
  if (all(substr(ref, 1, 1) == substr(alt, 1, 1))) {
    ref <- substr(ref, 2, nchar(ref))
    alt <- substr(alt, 2, nchar(alt))
  } else {
    stopifnot(ref != "" | alt != "")
  }
  ret <- mapply(Canonicalize1ID115, context, ref, alt, pos, 0, strand)  
  return(ret)
}

#' @title Create one column of the matrix for an indel catalog from *one* in-memory VCF.
#'
#' @param ID.vcf An in-memory VCF as a data.frame annotated by the
#'   \code{\link{AnnotateIDVCF}} function. It must only
#'   contain indels and must \strong{not} contain SBSs
#'   (single base substitutions), DBSs, or triplet
#'   base substitutions, etc.
#'
#'   One design decision for variant callers is the representation of "complex
#'   indels", e.g. mutations e.g. CAT > GC. Some callers represent this as C>G,
#'   A>C, and T>_. Others might represent it as CAT > CG. Multiple issues can
#'   arise. In PCAWG, overlapping indel/SBS calls from different callers were
#'   included in the indel VCFs.
#'
#' @param SBS.vcf This argument defaults to \code{NULL} and
#'   is not used. Ideally this should be an in-memory SBS VCF 
#'   as a data frame. The rational is that for some data,
#'   complex indels might be represented as an indel with adjoining
#'   SBSs. 
#'
#' @return A list of a 1-column matrix containing the mutation catalog
#'   information and the annotated VCF with ID categories information added.
#'
#' @keywords internal
CreateOneColID115Matrix <- function(ID.vcf, SBS.vcf = NULL) {
  if (nrow(ID.vcf) == 0) {
    # Create 1-column matrix with all values being 0 and the correct row labels.
    catID <- matrix(0, nrow = length(ICAMSxtra::catalog.row.order$ID115), ncol = 1)
    rownames(catID) <- ICAMSxtra::catalog.row.order$ID115
    return(catID)
  }
  
  if (!is.null(SBS.vcf)) 
    warning("Argument SBS.vcf in CreateOneColIDMatrix is always ignored")
  
  canon.ID <- CanonicalizeID115(ID.vcf$seq.context,
                                ID.vcf$REF,
                                ID.vcf$ALT,
                                ID.vcf$seq.context.width + 1,
                                strand = NA)
  
  if (any(is.na(canon.ID))) warning("NA ID categories ignored")
  idx <- which(is.na(canon.ID))
  canon.ID <- canon.ID[!is.na(canon.ID)]
  if (length(idx) > 0) {
    ID.vcf <- ID.vcf[-idx, ]
  }
  
  out.ID.vcf <- cbind(ID.vcf, ID.class = canon.ID)
  
  # Create the ID catalog matrix
  tab.ID <- table(canon.ID)
  
  row.order <- data.table(rn = ICAMSxtra::catalog.row.order$ID115)
  
  ID.dt <- as.data.table(tab.ID)
  # ID.dt has two columns, names cannon.dt (from the table() function
  # and N (the count)
  
  ID.dt2 <-
    merge(row.order, ID.dt, by.x = "rn", by.y = "canon.ID", all = TRUE)
  ID.dt2[ is.na(N) , N := 0]
  stopifnot(setequal(unlist(ID.dt2$rn), ICAMSxtra::catalog.row.order$ID115))
  
  ID.mat <- as.matrix(ID.dt2[ , 2])
  rownames(ID.mat) <- ID.dt2$rn
  return(list(catalog = ID.mat[ICAMSxtra::catalog.row.order$ID115, , drop = FALSE],
              annotated.VCF = out.ID.vcf))
}


#' Create a catalog from a \code{matrix}, \code{data.frame}, or \code{vector}
#'
#' @param object A numeric \code{matrix}, numeric \code{data.frame},
#' or \code{vector}.
#' If a \code{vector}, converted to a 1-column \code{matrix}
#' with rownames taken from the element names of the \code{vector}
#' and with column name \code{"Unknown"}.
#' If argument \code{infer.rownames}
#'  is \code{FALSE} than this argument must have
#'   rownames to denote the mutation types. See \code{\link{CatalogRowOrder}}
#'   for more details.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'
#' @param region A character string designating a region, one of
#' \code{genome}, \code{transcript}, \code{exome}, \code{unknown};
#' see \code{\link{ICAMS}}.
#'
#' @param catalog.type One of "counts", "density", "counts.signature",
#'   "density.signature".
#'
#' @param abundance If \code{NULL}, then
#'  inferred if \code{ref.genome}
#'  is one of
#'  the reference genomes known to ICAMS and \code{region}
#'  is not \code{unknown}. See \code{\link{ICAMS}}.
#'  The argument \code{abundance} should
#'  contain the counts of different source sequences for mutations
#'  in the same format as the numeric vectors in \code{\link{all.abundance}}.
#'  
#' @param infer.rownames If \code{TRUE}, and \code{object} has no
#' rownames, then assume the rows of \code{object} are in the
#' correct order and add the rownames implied by the number of rows
#' in \code{object} (e.g. rownames for SBS 192 if there are 192 rows).
#' If \code{TRUE}, \strong{be sure the order of rows is correct.}
#'
#' @return A catalog as described in \code{\link{ICAMS}}.
#'
#' @export
as.catalog.for.ID115 <- function(object, 
                                 ref.genome = NULL, 
                                 region = "unknown", 
                                 catalog.type = "counts", 
                                 abundance = NULL,
                                 infer.rownames = FALSE) {
  if (!is.matrix(object)) {
    if (is.data.frame(object)) {
      object <- as.matrix(object)
    } else if (is.vector(object)) {
      obj2 <- matrix(object, ncol = 1)
      rownames(obj2) <- names(object)
      colnames(obj2) <- "Unknown"
      object <- obj2
    } else {
      stop("object must be numeric matrix, vector, or data frame")
    }
  }
  stopifnot(mode(object) == "numeric")
  
  if (is.null(rownames(object))) {
    if (!infer.rownames) {
      stop("Require correct rownames on object unless infer.rownames == TRUE")
    }
    rownames(object) <- ICAMS:::InferRownames(object)
  } else {
    #removed call to checkandreorder rownames
    object <- object[ICAMSxtra::catalog.row.order$ID115, ,drop=FALSE] 
  }
  
  # StopIfRegionIllegal(region)
  
  class.string  <- "ID115Catalog"
  
  ICAMS:::StopIfCatalogTypeIllegal(catalog.type)
  
  if (!is.null(ref.genome)) {
    ref.genome <- ICAMS:::NormalizeGenomeArg(ref.genome)
  }
  
  attr(object, "ref.genome") <- ref.genome
  
  attr(object, "catalog.type") <- catalog.type
  
  attr(object, "abundance") <- NULL
  
  class(object) <- append(class(object), class.string, after = 0)
  class(object) <- unique(attributes(object)$class)
  
  attr(object, "region") <- region
  
  return(object)
}


###############################################################################
# Catalog functions 
###############################################################################

#' Create ID (small insertion and deletion) catalog from ID VCFs
#'
#' @param list.of.vcfs List of in-memory ID VCFs. The list names will be
#' the sample ids in the output catalog.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'
#' @param region A character string acting as a region identifier, one of
#' "genome", "exome".
#' 
#' @param flag.mismatches Optional. If > 0, then if there are mismatches to
#'   references in the ID (insertion/deletion) VCF, generate messages showing
#'   the mismatched rows and continue. Otherwise \code{stop} if there are
#'   mismatched rows. See \code{\link{AnnotateIDVCF}} for more details.
#'
#' @return A list of two elements. 1st element \code{catalog} is the ID (small
#'   insertion and deletion) catalog with attributes added. See
#'   \code{\link{as.catalog}} for more details. 2nd element
#'   \code{annotated.vcfs} is a list of data frames which contain the original
#'   VCF with three additional columns \code{seq.context.width},
#'   \code{seq.context} and \code{ID.class} added. The category assignment of
#'   each ID mutation in VCF can be obtained from \code{ID.class} column.
#'   
#' @note In ID (small insertion and deletion) catalogs, deletion repeat sizes
#'   range from 0 to 5+, but for plotting and end-user documentation
#'   deletion repeat sizes range from 1 to 6+.
#'   
#' @export
VCFsToID115Catalogs<-function(list.of.vcfs, ref.genome, region = "unknown",
                              flag.mismatches = 0){
  ncol <- length(list.of.vcfs)
  
  # Create a 0-column matrix with the correct row labels.
  catID <- matrix(0, nrow = length(ICAMSxtra::catalog.row.order$ID115), ncol = 0)
  rownames(catID) <- ICAMSxtra::catalog.row.order$ID115
  out.list.of.vcfs <- list()
  
  vcf.names <- names(list.of.vcfs)
  for (i in 1:ncol) {
    ID <- list.of.vcfs[[i]]
    list <- AnnotateIDVCF(ID, ref.genome = ref.genome,
                          flag.mismatches = flag.mismatches,
                          name.of.VCF = vcf.names[i])
    # Unlike the case for SBS and DBS, we do not
    # add transcript information.
    tmp <- CreateOneColID115Matrix(list$annotated.vcf)
    one.ID.column <- tmp[[1]]
    out.list.of.vcfs <- c(out.list.of.vcfs, list(tmp[[2]]))
    rm(ID)
    catID <- cbind(catID, one.ID.column)
  }
  
  colnames(catID) <- names(list.of.vcfs)
  names(out.list.of.vcfs) <- names(list.of.vcfs)
  
  return(list(catalog = 
                as.catalog.for.ID115(catID, ref.genome = ref.genome,
                                     region = region, catalog.type = "counts"),
              annotated.vcfs = out.list.of.vcfs))
}

#' "Collapse" a catalog
#' 
#' @description 
#' 
#' \code{Collapse115CatalogTo83} Collapse a ID 115 catalog
#' to a ID 83 catalog.
#'
#' @param catalog A catalog as defined in \code{\link{ICAMS}}.
#'
#' @return A catalog as defined in \code{\link{ICAMS}}.
#' 
#' @export
Collapse115CatalogTo83 <- function(catalog) {
  dt115 <- data.table(catalog)
  list83 <- list()
  list83[1] <- sum(dt115[1:9])
  list83[7] <- sum(dt115[15:23])
  list83[13] <- sum(dt115[29:37])
  list83[19] <- sum(dt115[43:51])
  for (i in 0:4){
    list83[i+2] <- dt115[i+10]
    list83[i+8] <- dt115[i+24]
    list83[i+14] <- dt115[i+38]
  }
  for (i in 0:63){
    list83[i+20] <- dt115[i+52] 
  }
  mat83 <- matrix(unlist(list83),ncol=1,byrow=TRUE)
  rownames(mat83) <- ICAMS::catalog.row.order$ID
  mat83 <- mat83[ICAMS::catalog.row.order$ID, , drop = FALSE]
  
  cat83 <-
    as.catalog(
      object = mat83,
      ref.genome = attributes(catalog)$ref.genome,
      region = attributes(catalog)$region,
      catalog.type = attributes(catalog)$catalog.type,
      abundance = NULL)
  
  #hdp0 etc... otherwise plot will not have names
  attributes(cat83)$dimnames[[2]] <- attributes(catalog)$dimnames[[2]]
  
  return(cat83)
}

#' "Collapse" a matrix of ID 115 catalogs to ID 83 catalog
#'
#' @param catalogs A catalog as defined in \code{\link{ICAMS}}.
#'
#' @return A catalog as defined in \code{\link{ICAMS}}.
#' @export
CollapseID115CatalogsToID83s <- function(catalogs){
  n <- ncol(catalogs)
  cat115s <-catalogs[, 1, drop=FALSE]
  cat83s <- Collapse115CatalogTo83(cat115s)
  for (i in 2:n){
    cat115_i <- catalogs[, i, drop=FALSE]
    cat83_i <-Collapse115CatalogTo83(cat115_i)
    cat83s <- cbind(cat83s,cat83_i)
  }
  return (cat83s)
}

###############################################################################
# Read and write functions
###############################################################################

#' @title Write a catalog to a file.
#' 
#' @param catalog A catalog as defined in \code{\link{ICAMS}} with attributes added.
#' See \code{\link{as.catalog}} for more details.
#'
#' @param file The path of the file to be written.
#' 
#' @param strict If TRUE, then stop if additional checks on the input fail.
#'
#' @export
WriteID115Catalog <- function(catalog, file, strict = TRUE) {
  ICAMS:::WriteCat(catalog, file, 115, ICAMSxtra::catalog.row.order$ID115,
           catalog.row.headers.ID115, strict)
}

#' Read catalog
#'
#' Read a catalog in standardized format from path.
#'
#' See also \code{\link{WriteCatalog}}
#'
#' @param file Path to a catalog on disk in the standardized format.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'
#' @param region region A character string designating a genomic region;
#'  see \code{\link{as.catalog}} and \code{\link{ICAMS}}.
#'
#' @param catalog.type One of "counts", "density", "counts.signature",
#'   "density.signature".
#'
#' @return A catalog as an S3 object; see \code{\link{as.catalog}}.
#'
#' @note In ID (small insertion and deletion) catalogs, deletion repeat sizes
#'   range from 0 to 5+, but for plotting and end-user documentation
#'   deletion repeat sizes range from 1 to 6+.
#'
#' @section Comments:
#' To add or change attributes of the catalog, you can use function
#' \code{\link[base]{attr}}. \cr For example, \code{attr(catalog, "abundance")
#' <- custom.abundance}.
#' 
#' @export
ReadID115Catalog <- function(file, ref.genome = NULL, region = "unknown", 
                             catalog.type = "counts") {
  cos <- data.table::fread(file)
  stopifnot(nrow(cos) == 115)
  
  rn <- apply(cos[ , 1:4], MARGIN = 1, paste, collapse = ":")
  out <- as.matrix(cos[ , -(1:4), drop = FALSE])
  
  stopifnot(setdiff(rn, ICAMSxtra::catalog.row.order$ID115) == c())
  stopifnot(setdiff(ICAMSxtra::catalog.row.order$ID115, rn) == c())
  stopifnot(rn == ICAMSxtra::catalog.row.order$ID115)
  rownames(out) <- rn
  #   if (ncol(out) == 1) colnames(out) <- colnames(cos)[3] 
  out <- out[ICAMSxtra::catalog.row.order$ID115, , drop = FALSE]
  return(as.catalog.for.ID115(out, ref.genome, region, catalog.type))
}

###############################################################################
# Plotting functions for ID(insertion and deletion) catalog start here
###############################################################################

#' Plot \strong{one} spectrum or signature
#' 
#' Plot the spectrum of \strong{one} sample or plot \strong{one} signature. The
#' type of graph is based on one attribute("catalog.type") of the input catalog.
#' You can first use \code{\link{TransformCatalog}} to get different types of
#' catalog and then do the plotting.
#' 
#' @param catalog A catalog as defined in \code{\link{ICAMS}} with attributes added.
#' See \code{\link{as.catalog}} for more details.
#'
#' @param ylim Has the usual meaning. Only implemented for SBS96Catalog and
#'   IndelCatalog.
#'
#' @export
PlotID115Catalog <- function(catalog, ylim = NULL){
  stopifnot(dim(catalog) == c(115, 1))
  
  indel.class.col <- c("#fdbe6f",
                       "#ff8001",
                       "#b0dd8b",
                       "#36a12e",
                       "#fdcab5",
                       "#fc8a6a",
                       "#f14432",
                       "#bc141a",
                       "#d0e1f2",
                       "#94c4df",
                       "#4a98c9",
                       "#1764ab",
                       "#e2e2ef",
                       "#b6b6d8",
                       "#8683bd",
                       "#61409b")
  
  num.classes <- length(catalog)
  cols <- rep(indel.class.col,
              c(14, 14, 14, 14,
                6, 6, 6, 6,
                6, 6, 6, 6,
                1, 2, 3, 5))
  
  to.plot <- catalog[, 1]
  
  catalog.type <- attributes(catalog)$catalog.type
  if (catalog.type == "counts") {
    # Set a minimum value for ymax to make the plot more informative
    ymax <- 4 * ceiling(max(max(to.plot) * 1.3, 10) / 4)
  } else if (catalog.type == "counts.signature") {
    ymax <- ifelse(max(to.plot) * 1.3 > 1, 1, max(to.plot) * 1.3)
  } else {
    stop('\nCan only plot IndelCatalog with "counts" or "counts.signature" catalog.type.')
  }
  if (is.null(ylim)) {
    ylim <- c(0, ymax)
  } else {
    ymax <- ylim[2]
  }
  
  # Barplot
  bp <- barplot(catalog[, 1], ylim = c(0, ymax), axes = FALSE, xaxt = "n",
                lwd = 3, space = 1.35, border = NA, col = cols, xpd = NA,
                xaxs = "i", yaxt = "n")
  
  if (catalog.type == "counts") {
    # Calculate and draw the total counts for each major type
    counts <- integer(16)
    idx <- c(14, 28, 42, 56, seq(62, 104, 6), 105, 107, 110, 115)
    # midpoints of major types
    # c(7, 21, 35, 49, 59.5, 65.5, 71.5, 77.5, 83.5, 89.5, 95.5, 101.5, 105, 106.5, 109, 112.5)
    midx <- c(seq(7, 49, 14), seq(59.5, 101.5, 6), 105, 106.5, 109, 112.5)
    # scale midpoints to horizontal width of plot
    idx2 <- midx * 270 / 115
    for (i in 1:16) {
      if (i == 1) {
        counts[i] <- sum(catalog[1:idx[1], 1])
      } else {
        counts[i] <- sum(catalog[(idx[i - 1] + 1):idx[i], 1])
      }
      text(idx2[i], ymax * 0.6, labels = counts[i],
           cex = 0.68, adj = 1, xpd = NA)
    }
  } 
  
  # Draw box and grid lines
  rect(xleft = bp[1] - 1.0, 0, xright = bp[num.classes] + 1.0, ymax, col = NA,
       border = "grey60", lwd = 0.5, xpd = NA)
  segments(bp[1] - 1.0, seq(0, ymax, ymax / 4), bp[num.classes] + 1,
           seq(0, ymax, ymax / 4), col = "grey60", lwd = 0.5, xpd = NA)
  
  # Draw mutation class labels and lines above each class
  maj.class.names <- c("1bp deletion", "1bp insertion",
                       ">1bp deletions at repeats\n(Deletion length)",
                       ">1bp insertions at repeats\n(Insertion length)",
                       "Deletions with microhomology\n(Deletion length)")
  x.left <- bp[c(seq(1, 57, 14), seq(63, 99, 6), 105, 106, 108, 111)] - 0.8
  x.right <- bp[c(seq(14, 56, 14), seq(62, 104, 6), 105, 107, 110, 115)] + 0.8
  class.pos <- c((x.left[seq(1, 4, 2)] + x.right[seq(2, 5, 2)]) / 2,
                 (x.left[c(6, 10)] + x.right[c(8, 12)] - 12) / 2,
                 (x.left[13] + x.right[length(x.left)]) / 2)
  category.lab <- c(rep(c("C", "T"), 2), rep(c("2", "3", "4", "5+"), 3))
  category.col <- c(rep(c("black", "white"), 2),
                    rep(c("black", "black", "black", "white"), 3))
  
  # Draw lines above each class
  # draw at ymax instead of ymax
  rect(xleft = x.left, ymax * 1.02, xright = x.right, ymax * 1.09,
       col = indel.class.col, border = NA, xpd = NA)
  text((x.left + x.right) / 2, ymax * 1.05, labels = category.lab,
       cex = 0.65, col = category.col, xpd = NA)
  
  # Draw mutation class labels at the top of the figure
  text(class.pos, ymax * 1.2, labels = maj.class.names, cex = 0.75, xpd = NA)
  
  # Draw the sample name information of the sample
  text(1.5, ymax * 7 / 8, labels = colnames(catalog),
       adj = 0, cex = 0.8, font = 2)
  
  # Draw y axis
  y.axis.values <- seq(0, ymax, ymax / 4)
  if (attributes(catalog)$catalog.type != "counts") {
    y.axis.labels <- format(round(y.axis.values, 2), nsmall = 2)
    text(-9, ymax / 2, labels = "counts proportion",
         srt = 90, xpd = NA, cex = 0.8)
  } else {
    y.axis.labels <- y.axis.values 
    text(-8, ymax / 2, labels = "counts",
         srt = 90, xpd = NA, cex = 0.8)
  }
  text(0, y.axis.values, labels = y.axis.labels,
       las = 1, adj = 1, xpd = NA, cex = 0.75)
  
  # Draw x axis labels
  mut.type <- c("AA","AT","AG","TA","TT","TG","GA","GT","GG",
                "2", "3", "4", "5", "6+",
                "AA","AC","AG","CA","CC","CG","GA","GC","GG",
                "2", "3", "4", "5", "6+",
                "AA","AT","AG","TA","TT","TG","GA","GT","GG",
                "1", "2", "3", "4", "5+",
                "AA","AC","AG","CA","CC","CG","GA","GC","GG",
                "1", "2", "3", "4", "5+",
                rep(c("1", "2", "3", "4", "5", "6+"), 4),
                rep(c("0", "1", "2", "3", "4", "5+"), 4),
                "1", "1", "2", "1", "2", "3", "1", "2", "3", "4", "5+")
  bottom.pos <- c((x.left[1] + x.right[2]) / 2, (x.left[3] + x.right[4]) / 2,
                  class.pos[3 : length(class.pos)])
  bottom.lab <- c("Homopolymer length", "Homopolymer length",
                  "Number of repeat units", "Number of repeat units",
                  "Microhomology length")
  rect(xleft = x.left, -ymax * 0.09, xright = x.right, -ymax * 0.02,
       col = indel.class.col, border = NA, xpd = NA)
  text(bp, -ymax * 0.11, labels = mut.type, cex = 0.65, xpd = NA, srt = 270, adj = 0)
  text(bottom.pos, -ymax * 0.27, labels = bottom.lab, cex = 0.75, xpd = NA)
  
  return(list(plot.success = TRUE))
}

#' Plot catalog to a PDF file
#'
#' Plot catalog to a PDF file. The type of graph is based on one
#' attribute("catalog.type") of the input catalog. You can first use
#' \code{\link{TransformCatalog}} to get different types of catalog and then do
#' the plotting.
#' 
#' @param catalog A catalog as defined in \code{\link{ICAMS}} with attributes added.
#' See \code{\link{as.catalog}} for more details.
#'
#' @param file The name of the PDF file to be produced.
#' @param ylim Has the usual meaning. Only implemented for SBS96Catalog and IndelCatalog.
#'
#' @return A list whose first element is a logic value indicating whether the
#'   plot is successful. For \strong{SBS192Catalog} with "counts" catalog.type
#'   and non-null abundance and \code{plot.SBS12 = TRUE}, the list will have a
#'   second element which is a list containing the strand bias statistics.
#'   
#' @note The sizes of repeats involved in deletions range from 0 to 5+ in the
#'   mutational-spectra and signature catalog rownames, but for plotting and
#'   end-user documentation deletion repeat sizes range from 1 to 6+.
#' 
#' @export
PlotID115CatalogToPdf <-
  function(catalog, file, ylim = NULL) {
    # Setting the width and length for LANDSCAPE A4 size plotting
    grDevices::pdf(file, width = 11.6929, height = 8.2677, onefile = TRUE)
    
    n <- ncol(catalog)
    opar <- par(mfrow = c(4, 1), mar = c(3, 4, 2.5, 2), oma = c(3, 3, 2, 2))
    on.exit(par(opar))
    
    for (i in 1 : n) {
      cat <- catalog[, i, drop = FALSE]
      PlotID115Catalog(cat, ylim = ylim)
    }
    grDevices::dev.off()
    return(list(plot.success = TRUE))
  }


#' @title Plot an ID 115 signatures (default) or catalog as standard ID83 and save as pdf file
#'
#' @param file The name of the PDF file to be produced.
#' 
#' @param ylim Has the usual meaning. Only implemented for SBS96Catalog and
#'   IndelCatalog.
#'   
#' @param catalog A catalog as defined in \code{\link{ICAMS}}.
#' 
#' @return A list whose first element is a logic value indicating whether the
#'   plot is successful. For \strong{SBS192Catalog} with "counts" catalog.type
#'   and non-null abundance and \code{plot.SBS12 = TRUE}, the list will have a
#'   second element which is a list containing the strand bias statistics.
#' 
#' @export 
PlotID115AsID83ToPdf <-
  function(catalog, file, ylim = NULL) {
    # Setting the width and length for LANDSCAPE A4 size plotting
    grDevices::pdf(file, width = 11.6929, height = 8.2677, onefile = TRUE)
    
    n <- ncol(catalog)
    opar <- par(mfrow = c(4, 1), mar = c(3, 4, 2.5, 2), oma = c(3, 3, 2, 2))
    on.exit(par(opar))
    
    for (i in 1 : n) {
      cat115 <- catalog[, i, drop = FALSE]
      cat83 <- Collapse115CatalogTo83(cat115)
      ICAMS:::PlotCatalog.IndelCatalog(cat83, ylim = ylim)
    }
    grDevices::dev.off()
    return(list(plot.success = TRUE))
  }

###############################################################################
# Combined functions
###############################################################################

#' @title Read a list of vcfs and plot ID115 catalogs as pdf
#' 
#' @param list.of.vcfs List of in-memory ID VCFs. The list names will be
#' the sample ids in the output catalog.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'   
#' @param region A character string acting as a region identifier, one of
#' "genome", "exome".
#' 
#' @param flag.mismatches Optional. If > 0, then if there are mismatches to
#'   references in the ID (insertion/deletion) VCF, generate messages showing
#'   the mismatched rows and continue. Otherwise \code{stop} if there are
#'   mismatched rows. See \code{\link{AnnotateIDVCF}} for more details.
#'   
#' @param file The name of the PDF file to be produced.
#' 
#' @param ylim Has the usual meaning. Only implemented for SBS96Catalog and
#'   IndelCatalog.
#'
#' @export
VCFsToID115CatalogsAndPlotToPdf <-
  function(list.of.vcfs, ref.genome, region = "unknown",
           flag.mismatches = 0, file, ylim = NULL){
    catalogs <- VCFsToID115Catalogs(list.of.vcfs, ref.genome, region)$catalog
    PlotID115CatalogToPdf(catalogs, file, ylim)
  }

###############################################################################
# Strand bias functions
###############################################################################

#' @keywords internal
revcID115 <- function(string) {
  if (string %in% c(target_pooled, reverse_pooled)){
    idx_py <- which(grepl(string, target_pooled, fixed=TRUE))
    idx_pu <- which(grepl(string, reverse_pooled, fixed=TRUE))
    if (length(idx_py)==1){
      return(reverse_pooled[idx_py])
    }
    if (length(idx_pu)==1){
      return(target_pooled[idx_pu])
    }
    stop(paste0("Tried to reverse ", string," but it was not recognized\n"))
  } else {
    idx_py <- which(grepl(string, target, fixed=TRUE))
    idx_pu <- which(grepl(string, reverse, fixed=TRUE))
    if (length(idx_py)==1){
      return(reverse[idx_py])
    }
    if (length(idx_pu)==1){
      return(target[idx_pu])
    }
    stop(paste0("Tried to reverse ", string," but it was not recognized\n"))
  }
}

#' @keywords internal
StrandBiasGetPlottablesID115 <- 
  function(annotated.ID.vcf, damaged.base, vcf.name, pool) {
    df <- annotated.ID.vcf
    df <- df[!is.na(trans.gene.symbol), ]
    
    df1 <- df[, .(REF = REF[1], ALT= ALT[1], trans.strand = trans.strand[1],
                  seq.context = seq.context[1], seq.context.width = seq.context.width[1],
                  trans.gene.symbol = trans.gene.symbol[1],
                  trans.start.pos = trans.start.pos[1], 
                  trans.end.pos = trans.end.pos[1]), 
              by = .(CHROM, POS)]
    
    if (pool) {
      target_i <- target_pooled
      reverse_i <- reverse_pooled
    } else {
      target_i <- target
      reverse_i <- reverse
    }
    
    # Construct a matrix that later can be used to plot transcriptional strand bias
    result <- matrix(data = 0, nrow = 1, ncol = 2*length(target_i))
    rownames(result) <- c(1)
    
    mutation.type <- c(target_i, reverse_i)
    colnames(result) <- mutation.type
    
    if (nrow(df1) > 0){
      df1$mutation <- CanonicalizeID115(df1$seq.context, df1$REF, df1$ALT, df1$seq.context.width + 1, strand = df1$trans.strand)
      
      # filter for non-homopolymer single base indels
      df1 <- df1[which(grepl(pattern = "trans", x = df1$mutation)),,]
      
      # if aren't any non-homopolymer single base indels
      # then return value is same as if original df1 == 0
      if (nrow(df1) > 0){
        if (pool){
          df1$mutation1 <- substr(df1$mutation, 1, 5)
          df1$mutation2 <- substr(df1$mutation, 14, nchar(df1$mutation))
          df1$mutation <- paste0(df1$mutation1, ":", df1$mutation2)
          df1 <- df1[, c(-12,-13)]
        }
        
        for (i in 1:length(target_i)) {
          type <- mutation.type[i]
          idx <- which(unname(mapply(grepl, type, df1$mutation)))
          df2 <- df1[idx,,drop=FALSE]
          if (nrow(df2) == 0) {
            result[, type] <- 0
            result[, revcID115(type)] <- 0
          } else {
            nrow.nontrans <- length(which(grepl("nontrans", df2$mutation,fixed=TRUE)))
            result[1, revcID115(type)] <- nrow.nontrans
            result[1, type] <- nrow(df2) - nrow.nontrans
          }
        }  
      }
    }
    # Carry out logistic regression and get the p-values
    p.values <- CalculatePValuesID115(df1, pool = pool)
    
    return(list(plotmatrix = result, vcf.df = df1, 
                p.values = p.values, vcf.name = vcf.name))
  }

#' @importFrom stats binom.test p.adjust
#' @keywords internal
CalculatePValuesID115 <- function(dt, pool) {
  if (pool){
    target_i <- target_pooled
  } else {
    target_i <- target
  }
  p.values <- numeric(length(target_i))
  names(p.values) <- target_i
  
  #construct a matrix with 4 (or 36) columns and 2 rows
  mat <- matrix(0L, nrow = 2, ncol = length(target_i))
  colnames(mat) <- target_i
  rownames(mat) <- c("trans", "nontrans")
  if (nrow(dt)> 0){
    for (i in 1:nrow(dt)){
      mutation <- dt[i, mutation]
      if (pool){
        col.name <- substr(mutation, 1, 5)
        row.name <- substr(mutation, 7, nchar(mutation))
      } else {
        col.name <- substr(mutation, 1, 12)
        row.name <- substr(dt[i, mutation], 14, nchar(mutation))
      }
      mat[row.name, col.name] <- mat[row.name, col.name] + 1
    }
  }
  
  # proportion of indels on transcribed strand of genes
  prop.transcribed <- 0.5
  
  for (type in target_i){
    htest <- binom.test(x = mat[, type], p = prop.transcribed, 
                        alternative = "two.sided")
    p.values[type] <- htest$p.value
  }
  
  # Adjust p-values for multiple comparisons to Benjamini-Hochberg false discovery rate
  q.values <- p.adjust(p.values, method = "BH")
  return(q.values)
}

#' @keywords internal
PlotTransBiasID115Internal <- function(list, ymax = NULL) {
  result <- list$plotmatrix
  # determined whether to plot pooled version based on ncol of plotmatrix
  if (ncol(result) == 8){
    pool = TRUE
  } else {
    if (ncol(result) == 72){
      pool = FALSE
    }  else {
      stop("Error in `list` argument passed to PlotTransBiasID115Internal. ncol of plotmatrix is neither 8 nor 72")
    }
  }
  if (is.null(ymax)) {
    ymax <- max(result)
  }
  if (pool){
    tmp <- matrix(0, nrow = 2, ncol = 4)
    tmp[1,] <- result[1:4]
    tmp[2,] <- result[5:8]
    bpcol = rep(c('#394398', '#e83020'), 4)
    bplabels = colnames(list$plotmatrix)[1:4]
  } else {
    tmp <- matrix(0, nrow = 2, ncol = 36)
    tmp[1,] <- result[1:36]
    tmp[2,] <- result[37:72]
    bpcol = rep(c('#394398', '#e83020'), 36)
    bplabels = colnames(list$plotmatrix)[1:36]
  }
  
  bp <- barplot(tmp, beside = TRUE, main = list$vcf.name,
                ylim = c(0, ifelse(ymax != 0, 1.5 * ymax, 5)), 
                col = bpcol, axisnames = FALSE,
                ylab = "counts")
  colnames(bp) <- bplabels
  rownames(bp) <- c("Transcribed", "Untranscribed")
  
  legend.list <- legend("topright", legend = c("Transcribed", "Untranscribed"), 
                        fill = c('#394398', '#e83020'), bty = "n", cex = 0.8)
  
  if (pool){
    text(bp, x = (bp[1,]+bp[2,])/2, y = -0.1 * ymax, labels = bplabels, 
         pos = 1, xpd = NA, cex = 1)  
  } else {
    text(bp, x = (bp[1,]+bp[2,])/2, y = - 0.3 * ymax, labels = bplabels, 
         pos = 1, xpd = NA, cex = 0.5, srt = 270)
  }
  
  
  for (type in bplabels){
    p.value <- list$p.values[type]
    if (!is.na(p.value)) {
      # draw asterisk if p.value < 0.05
      if (p.value < 0.05){
        # Get the x coordinates of the line segment to be drawn
        x1 <- bp[1, type]
        x2 <- bp[2, type]
        
        # Get the y coordinates of the line segment to be drawn
        y1 <- y2 <- max(tmp) + 0.03 * max(result)
        
        # Draw the line segment
        segments(x1, y1, x2, y2)
        
        # Draw the asterisk on top of line segment
        x3 <- mean(c(x1, x2))
        y3 <- y1 + max(result) * 0.05
        label <- ICAMS:::AssignNumberOfAsterisks(p.value) #p.values actually are q.value
        text(x3, y3, label)
      }
    }
  }
}

#' @keywords internal
CheckSeqContextInIDVCF <- function(vcf, column.to.use) {
  if (0 == nrow(vcf)) return()
  stopifnot(!any(vcf$REF == '-'))
  stopifnot(!any(vcf$ALT == '-'))
  cut.pos <- 1 + (nchar(vcf$column.to.use) - 1) / 2
  stopifnot(cut.pos == round(cut.pos))
  cut.from.ref <- substr(vcf$column.to.use, cut.pos,
                         (cut.pos + nchar(vcf$REF)) - 1)
  error.rows <- which(vcf$REF != cut.from.ref)
  if (any(error.rows > 0)) {
    temp <- tempfile(fileext = ".csv")
    write.csv(vcf[error.rows, ], file = temp)
    stop("Seqence context of reference allele is inconsistent,",
         "see file ", temp)
  }
}

#' Plot transcription strand bias 
#'
#' @param annotated.ID.vcf An ID VCF annotated by
#'   \code{AnnotateIDVCFsWithTransRanges}. It \strong{must} have transcript range
#'   information added.
#'   
#' @param pool if true, 36 categories will be pooled to 4 categories by removing
#' trinucleotide context. This can be done if the counts of individual categories 
#' are too low, to increase power.
#'   
#' @param damaged.base One of \code{NULL}, \code{"purine"} or
#'   \code{"pyrimidine"}. This function allocates approximately
#'   equal numbers of mutations from \code{damaged.base} into
#'   each of \code{num.of.bins} bin by expression level. E.g.
#'   if \code{damaged.base} is \code{"purine"}, then mutations from
#'   A and G will be allocated in approximately equal numbers to
#'   each expression-level bin. The rationale for the name \code{damaged.base}
#'   is that the direction of strand bias is a result of whether the damage
#'   occurs on a purine or pyrimidine.
#'   If \code{NULL}, the function attempts to infer the \code{damaged.base}
#'   based on mutation counts.
#'   
#' @param ymax Limit for the y axis. If not specified, it defaults to NULL and
#'   the y axis limit equals 1.5 times of the maximum mutation counts in a
#'   specific mutation type.
#'   
#' @return A list whose first element is a logic value indicating whether the
#'   plot is successful. The second element is a named numeric vector containing
#'   the p-values printed on the plot.
#'   
#' @section Note: 
#' The strand bias statistics are Benjamini-Hochberg q-values based on two-sided
#' binomial tests of the mutation counts on the transcribed and untranscribed strands
#' relative to the actual abundances of C and T on the transcribed strand. On the
#' plot, asterisks indicate q-values as follows *, \eqn{Q<0.05}; **, \eqn{Q<0.01}; ***,
#'  \eqn{Q<0.001}.
#'  
#' @import data.table
#' 
#' @export
#'
#' @examples 
#' file <- c(system.file("extdata/Strelka-ID-vcf/",
#'                       "Strelka.ID.GRCh37.s1.vcf",
#'                       package = "ICAMSxtra"))
#' ID.vcf <- ICAMS::ReadAndSplitVCFs(file, variant.caller = "strelka")$ID
#' if (requireNamespace("BSgenome.Hsapiens.1000genomes.hs37d5", quietly = TRUE)) {
#'   annotated.ID.vcf <- AnnotateIDVCFsWithTransRanges(ID.vcf, ref.genome = "hg19",
#'                                                     trans.ranges = ICAMS::trans.ranges.GRCh37,
#'                                                     vcf.names = "Strelka.ID.GRCh37.s1.vcf")
#'   #' alternatively run below code to skip call to AnnotateIDVCFsWithTransRanges 
#'   #' load(c(system.file("extdata/annotated.ID.vcf.rda",package = "ICAMSxtra")))
#'   PlotTransBiasID115(annotated.ID.vcf = annotated.ID.vcf[[1]],
#'                       pool = TRUE)
#' }
PlotTransBiasID115 <-
  function(annotated.ID.vcf, pool, damaged.base = NULL, ymax = NULL) { 
    if (is.null(names(annotated.ID.vcf))){
      stop("annotated.ID.vcf does not have 'name' property")
    }
    list <- StrandBiasGetPlottablesID115(annotated.ID.vcf, damaged.base, pool=pool, vcf.name=names(annotated.ID.vcf))
    PlotTransBiasID115Internal(list = list, ymax = ymax)
    return(list(plot.success = TRUE, p.values = list$p.values))
  }

#' Plot transcription strand bias to a PDF file
#' 
#' @inheritParams PlotTransBiasID115
#' 
#' @param annotated.ID.vcfs ID vcfs which have been annotated with 
#' \code{AnnotateIDVCFsWithTransRanges}.
#' 
#' @param file The name of output file.
#'   
#' @importFrom stats glm
#' 
#' @inherit PlotTransBiasID115 return
#' 
#' @inheritSection PlotTransBiasID115 Note
#' 
#' @export
#'
#' @examples
#' library(ICAMS)
#' file <- c(system.file("extdata/Strelka-ID-vcf/",
#'                       "Strelka.ID.GRCh37.s1.vcf",
#'                       package = "ICAMSxtra"))
#' ID.vcf <- ICAMS::ReadAndSplitVCFs(file, variant.caller = "strelka")$ID  
#' if (requireNamespace("BSgenome.Hsapiens.1000genomes.hs37d5", quietly = TRUE)) {
#'   annotated.ID.vcf <- AnnotateIDVCFsWithTransRanges(ID.vcf, ref.genome = "hg19",
#'                                       trans.ranges = ICAMS::trans.ranges.GRCh37, 
#'                                       vcf.names = "Strelka.ID.GRCh37.s1")
#'   PlotTransBiasID115ToPdf(annotated.ID.vcfs = annotated.ID.vcf,
#'                            file = file.path(tempdir(), "test.pdf"),
#'                            pool = TRUE)
#' }
PlotTransBiasID115ToPdf <- 
  function(annotated.ID.vcfs, file, pool, damaged.base = NULL) {
    # Setting the width and length for A4 size plotting
    grDevices::pdf(file, width = 8.2677, height = 11.6929, onefile = TRUE)
    
    opar <- par(mfrow = c(4, 1), mar = c(8, 5.5, 2, 1), oma = c(1, 1, 2, 1))
    on.exit(par(opar))
    
    plot.type <- ifelse(pool, target_pooled, target)
    n <- length(annotated.ID.vcfs)
    vcf.names <- names(annotated.ID.vcfs)
    results <- list()
    for (i in 1:n){
      list <- StrandBiasGetPlottablesID115(annotated.ID.vcf = annotated.ID.vcfs[[i]], 
                                            vcf.name = vcf.names[[i]], 
                                            pool = pool, 
                                            damaged.base = damaged.base)
      ymax <- max(list$plotmatrix[, c(plot.type, unname(mapply(revcID115, plot.type)))])
      PlotTransBiasID115Internal(list = list, ymax = NULL)
      results[[vcf.names[i]]] <- list(plot.success = TRUE, p.values = list$p.values)
    }
    grDevices::dev.off()
    return(results)
  }

#' Add sequence context and transcript information to an in-memory ID VCF
#' 
#' @param ID.vcfs A list of in-memory ID VCF as a \code{data.frame}.
#' 
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'
#' @param trans.ranges Optional. If \code{ref.genome} specifies one of the
#'   \code{BSgenome} object 
#'   \enumerate{
#'     \item \code{BSgenome.Hsapiens.1000genomes.hs37d5}
#'     \item \code{BSgenome.Hsapiens.UCSC.hg38}
#'     \item \code{BSgenome.Mmusculus.UCSC.mm10}
#'   }
#'   then the function will infer \code{trans.ranges} automatically. Otherwise,
#'   user will need to provide the necessary \code{trans.ranges}. Please refer to
#'   \code{\link{TranscriptRanges}} for more details.
#'   If \code{is.null(trans.ranges)} do not add transcript range
#'   information.
#'   
#' @param vcf.names list of names of the vcfs
#' 
#' @import ICAMS
#'
#' @return A list of in-memory ID VCFs each a \code{data.table}. These have been annotated
#'   with the sequence context (column name \code{seq.context}) and with
#'   transcript information in the form of a gene symbol (e.g. \code{"TP53"})
#'   and transcript strand. This information is in the columns
#'   \code{trans.start.pos}, \code{trans.end.pos} , \code{trans.strand},
#'   \code{trans.Ensembl.gene.ID} and \code{trans.gene.symbol} in the output.
#'   These columns are not added if \code{is.null(trans.ranges)}.
#'   
#' @export
#' 
#' @examples 
#' file <- c(system.file("extdata/Strelka-ID-vcf",
#'                       "Strelka.ID.GRCh37.s1.vcf",
#'                       package = "ICAMSxtra"))
#' list.of.vcfs <- ICAMS::ReadAndSplitVCFs(file, variant.caller = "strelka")$ID
#' if (requireNamespace("BSgenome.Hsapiens.1000genomes.hs37d5", quietly = TRUE)) {
#'   annotated.ID.vcfs <- AnnotateIDVCFsWithTransRanges(list.of.vcfs, ref.genome = "hg19",
#'                                                     trans.ranges = ICAMS::trans.ranges.GRCh37)
#'   }
AnnotateIDVCFsWithTransRanges <- 
  function(ID.vcfs, ref.genome, trans.ranges = NULL, vcf.names = NULL) {
    if (is.null(vcf.names)) {
      vcf.names <- names(ID.vcfs)
    }
    n <- length(ID.vcfs)
    retval <- list()
    for (i in 1:n){
      ID.vcf <- AnnotateIDVCF(ID.vcfs[[i]], ref.genome=ref.genome)[[1]]
      CheckSeqContextInIDVCF(ID.vcf, "seq.context")
      trans.ranges <- ICAMS:::InferTransRanges(ref.genome, trans.ranges)
      if (!is.null(trans.ranges)) {
        ID.vcf <- 
          ICAMS:::AddTranscript(ID.vcf,
                                trans.ranges = trans.ranges,
                                ref.genome   = ref.genome)
      }
      retval[[vcf.names[[i]]]] <- (as.data.table(ID.vcf))
    }
    return(retval)
  }


###############################################################################
# S3 Methods
###############################################################################
#' @export
`[.ID115Catalog` <- function (x, i, j, drop = if (missing(i)) TRUE else length(cols) ==
                                1) {
  y <- NextMethod("[")
  if (inherits(y, c("integer", "numeric"))) {
    return(y)
  } else {
    class(y) <- class(x)
    for (at in c("ref.genome", "catalog.type", "abundance", "region")) {
      attr(y, at) <- attr(x, at, exact = TRUE)
    }
    return(y)
  }
}

#' @export
`cbind.ID115Catalog` <- function (..., deparse.level = 1) {
  x <- base::cbind.data.frame(..., deparse.level = deparse.level)
  x <- data.matrix(x)
  class(x) <- class(..1)
  list0 <- list(...)
  x <- CheckAndAssignAttributes(x, list0)
  return(x)
}
steverozen/ICAMSxtra documentation built on Feb. 9, 2022, 7:01 a.m.