R/utility_functions.R

Defines functions CreateTransRanges TransformCatalog CheckCatalogAttributes RegionIsSame RefGenomeIsSame AbundanceIsSame TCFromDenSigDen TCFromCou TCFromCouSig TCFromCouSigCou IsTransformationLegal CheckAndNormalizeTranCatArgs IsSignature IsCounts IsDensity Collapse144AbundanceTo78 Collapse144CatalogTo78 Collapse5bpAbundanceTo3bp Collapse1536CatalogTo96 Collapse192AbundanceTo96 Collapse192CatalogTo96

Documented in Collapse144CatalogTo78 Collapse1536CatalogTo96 Collapse192CatalogTo96 CreateTransRanges TCFromCouSigCou TCFromDenSigDen TransformCatalog

#' "Collapse" a catalog
#' 
#' @description 
#' \enumerate{
#' \item Take a mutational spectrum or signature catalog
#' that is based on a fined-grained set of features (for example, single-nucleotide
#' substitutions in the context of the preceding and following 2 bases).
#'
#' \item Collapse it to a catalog based on a coarser-grained set of features
#' (for example, single-nucleotide substitutions in the context of the
#' immediately preceding and following bases).
#' }
#'
#' \code{Collapse192CatalogTo96} Collapse an SBS 192 catalog
#' to an SBS 96 catalog.
#'
#' \code{Collapse1536CatalogTo96} Collapse an SBS 1536 catalog
#'  to an SBS 96 catalog.
#'
#' \code{Collapse144CatalogTo78} Collapse a DBS 144 catalog
#' to a DBS 78 catalog.
#'
#' @param catalog A catalog as defined in \code{\link{ICAMS}}.
#'
#' @return A catalog as defined in \code{\link{ICAMS}}.
#'
#' @name CollapseCatalog
#' 
#' @examples 
#' # Create an SBS192 catalog and collapse it to an SBS96 catalog
#' object <- matrix(1, nrow = 192, ncol = 1, 
#'                  dimnames = list(catalog.row.order$SBS192))
#' catSBS192 <- as.catalog(object, region = "transcript")
#' catSBS96 <- Collapse192CatalogTo96(catSBS192)
NULL

#' @rdname CollapseCatalog
#' @export
Collapse192CatalogTo96 <- function(catalog) {
  dt192 <- data.table(catalog)
  dt192$rn <- PyrTri(rownames(catalog))
  dt96 <- dt192[, lapply(.SD, sum), by = rn, .SDcols = ]
  mat96 <- as.matrix(dt96[, -1])
  rownames(mat96) <- dt96$rn
  mat96 <- mat96[ICAMS::catalog.row.order$SBS96, , drop = FALSE]
  
  abundance <- attributes(catalog)$abundance
  if (!is.null(abundance)) {
      abundance <- Collapse192AbundanceTo96(abundance)
  }
  
  cat96 <-
    as.catalog(
      object = mat96,
      ref.genome = attributes(catalog)$ref.genome,
      region = attributes(catalog)$region,
      catalog.type = attributes(catalog)$catalog.type,
      abundance = abundance
    )
  return(cat96)
}

#' @keywords internal
Collapse192AbundanceTo96 <- function(abundance192) {
  PyrTri <- function(string) {
    stopifnot(nchar(string) == rep(3, length(string)))
    output <-
      ifelse(substr(string, 2, 2) %in% c("A", "G"), revc(string), string)
    return(output)
  }
  dt <- data.table(abundance192)
  rownames(dt) <- names(abundance192)
  dt$rn <- PyrTri(rownames(dt))
  dt1 <- dt[, lapply(.SD, sum), by = rn, .SDcols = ]
  abundance96 <- unlist(dt1[, -1])
  names(abundance96) <- dt1$rn
  return(abundance96)
}

#' @rdname CollapseCatalog
#' @export
Collapse1536CatalogTo96 <- function(catalog) {
  dt <- data.table(catalog)
  rn <- rownames(catalog)

  # The next gsub replaces the string representing a
  # single-base mutation in pentanucleotide with the corresponding
  # string for that mutation in a trinucleotide context.
  dt$rn <- gsub(".(...).(.)", "\\1\\2", rn, perl = TRUE)
  dt96 <- dt[, lapply(.SD, sum), by = rn, .SDcols = ]
  mat96 <- as.matrix(dt96[, -1])
  rownames(mat96) <- dt96$rn
  mat96 <- mat96[ICAMS::catalog.row.order$SBS96, , drop = FALSE]
  
  abundance <- attributes(catalog)$abundance
  if (!is.null(abundance)) {
    abundance <- Collapse5bpAbundanceTo3bp(abundance)
  }
  
  cat96 <-
    as.catalog(
      object = mat96,
      ref.genome = attributes(catalog)$ref.genome,
      region = attributes(catalog)$region,
      catalog.type = attributes(catalog)$catalog.type,
      abundance = abundance
    )
  return(cat96)
}

#' @keywords internal
Collapse5bpAbundanceTo3bp <- function(abundance1536) {
  dt <- data.table(abundance1536)
  rownames(dt) <- names(abundance1536)
  dt$rn <- substr(rownames(dt), 2, 4)
  dt1 <- dt[, lapply(.SD, sum), by = rn, .SDcols = ]
  abundance96 <- unlist(dt1[, -1])
  names(abundance96) <- dt1$rn
  return(abundance96)
}

#' @rdname CollapseCatalog
#' @export
Collapse144CatalogTo78 <- function(catalog) {
  dt144 <- data.table(catalog)
  ref <- substr(rownames(catalog), 1, 2)
  alt <- substr(rownames(catalog), 3, 4)
  dt144$rn <- CanonicalizeDBS(ref, alt)
  dt78 <- dt144[, lapply(.SD, sum), by = rn, .SDcols = ]
  mat78 <- as.matrix(dt78[ , -1])
  rownames(mat78) <- dt78$rn
  mat78 <- mat78[ICAMS::catalog.row.order$DBS78, , drop = FALSE]
  
  abundance <- attributes(catalog)$abundance
  if (!is.null(abundance)) {
    abundance <- Collapse144AbundanceTo78(abundance)
  }
  
  cat78 <-
    as.catalog(
      object = mat78,
      ref.genome = attributes(catalog)$ref.genome,
      region = attributes(catalog)$region,
      catalog.type = attributes(catalog)$catalog.type,
      abundance = abundance)
    
  return(cat78)
}

#' @keywords internal
Collapse144AbundanceTo78 <- function(abundance144) {
  canonical.ref <-
    c("AC", "AT", "CC", "CG", "CT", "GC", "TA", "TC", "TG", "TT")
  dt <- data.table(abundance144)
  rownames(dt) <- names(abundance144)
  dt$rn <- ifelse(rownames(dt) %in% canonical.ref, rownames(dt), 
                  revc(rownames(dt)))
  dt1 <- dt[, lapply(.SD, sum), by = rn, .SDcols = ]
  abundance78 <- unlist(dt1[, -1])
  names(abundance78) <- dt1$rn
  return(abundance78)
}

#' @keywords internal
IsDensity <- function(x) {
  return(x %in% c("density", "density.signature"))
}

#' @keywords internal
IsCounts <- function(x) {
  return(x %in% c("counts", "counts.signature"))
}

#' @keywords internal
IsSignature <- function(x) {
  return(x %in% c("counts.signature", "density.signature"))
}

#' @keywords internal
CheckAndNormalizeTranCatArgs <-
  function(catalog, 
           target.ref.genome, 
           target.region, 
           target.catalog.type, 
           target.abundance) {
    
    StopIfNrowIllegal(catalog)
    
    s <- list(
      ref.genome    = attr(catalog, "ref.genome",   exact = TRUE),
      catalog.type = attr(catalog,  "catalog.type", exact = TRUE),
      abundance    = attr(catalog,  "abundance",    exact = TRUE),
      region       = attr(catalog,  "region",       exact = TRUE))
    
    if (is.null(target.ref.genome))   target.ref.genome   <- s$ref.genome
    if (is.null(target.catalog.type)) target.catalog.type <- s$catalog.type 
    StopIfCatalogTypeIllegal(target.catalog.type)
    if (is.null(target.region))       target.region       <-s$region
    StopIfRegionIllegal(target.region)

    inferred.abundance <- 
        InferAbundance(catalog,
                       target.ref.genome,
                       target.region,
                       target.catalog.type)
                   
    if (!is.null(target.abundance) &&
        !is.null(inferred.abundance)) {
      if (!all.equal(target.abundance,
                     inferred.abundance)) {
        stop("Caller supplied abundance is different from inferred abundance")
      }
      stopifnot(names(inferred.abundance) == names(target.abundance))
    }
    if (is.null(target.abundance)) target.abundance <- inferred.abundance
    stopifnot(names(s[["abundance"]]) == names(target.abundance))

    if (s$catalog.type == target.catalog.type) {
      if (all(s[["abundance"]] == target.abundance)) { 
        warning("Input and target catalog.type and abundance are the same\n",
                "no transformation")
      }
    }
    return(list(s = s, 
                t = 
                  list(ref.genome   = target.ref.genome,
                       catalog.type = target.catalog.type,
                       region       = target.region,
                       abundance    = target.abundance)))
  }


#' @keywords internal
IsTransformationLegal <- function(s, t) {
 
  if (IsSignature(s$catalog.type) &&
      !IsSignature(t$catalog.type)) {
    stop("cannot transform from a signature to a counts or density catalog")
  }
  
  if (("COMPOSITECatalog" %in% class(s)) ||
       ("COMPOSITECatalog" %in% class(t))) {
    stop("Transformation of class COMPOSITECatalog not supported")
  }
  
  if (IsDensity(s$catalog.type))
    return(TCFromDenSigDen(s, t))
  else
    return(TCFromCouSigCou(s, t))
}


#' Source catalog type is counts or counts.signature
#' 
#' counts.signature -> density.signature, counts.signature
#' counts -> anything
#' 
#' @keywords internal
TCFromCouSigCou <- function(s, t) {
  
  stopifnot(IsCounts(s$catalog.type))
  
  if (IsSignature(s$catalog.type)) {
    # counts.signature -> density.signatyre, counts.signature
    return(TCFromCouSig(s, t))
  } else {
    # counts -> anything
    return(TCFromCou(s, t))
  }
}

#' @keywords internal
#' 
#' counts.signature -> counts.signature, density.signature
TCFromCouSig <- function(s, t) {
  
  if (t$catalog.type == "counts.signature") {
    # counts.signature -> counts.signature
    if (is.null(s[["abundance"]])) {
      stop("Cannot transform from counts.signature -> counts.signature ",
           "source abundance is NULL")
    }
    if (is.null(t[["abundance"]])) {
      stop("Cannot transform from counts.signature -> counts.signature ",
           "target abundance is NULL")
    }
    if (all(s[["abundance"]] == t[["abundance"]])) {
      warning("Transformation from counts.signature -> counts.sinature",
              "with equal abundance is a null operation")
      return("null.op")
    }
    return(TRUE)
  } else if (t$catalog.type == "counts") {
    # counts.signature -> counts
    stop("Cannot transform from counts.signature -> counts")
  } else if (t$catalog.type == "density.signature") {
    # counts.signature -> density.signature
     if (is.null(s[["abundance"]])) {
       stop("Cannot transform from counts.signature -> denisity.signature ",
            "if source abundance is NULL")
     } else {
       return(TRUE)
     }

  } else if (t$catalog.type == "density") {
    # counts.signature -> density
    stop("Cannnot tranform from counts.signature -> density")
  }
  stop("programming error, unexpected catalog.type ", t$catalog.type)
}


#' @keywords internal
#' 
#' counts -> <anything>
TCFromCou <- function(s, t) {
  if (t$catalog.type == "counts.signature") {
    # counts -> counts.signature
    if (is.null(s[["abundance"]])) {
      if (!is.null(t[["abundance"]]))
      stop("Cannot transform from counts -> counts.signature ",
           "source abundance is NULL and target abundance is not NULL")
    }
    if (is.null(t[["abundance"]])) {
      if (!is.null(s[["abundance"]]))
      stop("Cannot transform from counts -> counts.signature ",
           "target abundance is NULL and source abundance in not NULL")
    }
    # Source and target catalog both have NULL abundance
    if (is.null(s[["abundance"]]) && is.null(t[["abundance"]])) {
      # If source and target catalog have different ref.genome, 
      # raise an error
      if (!RefGenomeIsSame(s[["ref.genome"]], t[["ref.genome"]])) {
        stop("Cannot transform from counts -> counts.signature ",
             "source and target catalog both have NULL abundance,",
             "but have different ref.genome")
      }
      # If source and target catalog have different region, 
      # raise an error
      if (!RegionIsSame(s[["region"]], t[["region"]])) {
        stop("Cannot transform from counts -> counts.signature ",
             "source and target catalog both have NULL abundance,",
             "but have different region")
      }
    }
    if (AbundanceIsSame(s[["abundance"]], t[["abundance"]])) {
      return("sig.only")
    }
    return(TRUE)
  } else if (t$catalog.type == "counts") {
    # counts -> counts
    warning("counts -> counts is deprecated; ",
            "it simply infers new counts based on ",
            "changes in abundance; ", 
            "we strongly suggest that you work with densities")
    if (is.null(s[["abundance"]]) || is.null(t[["abundance"]])) {
      stop("Cannot transform from counts -> counts if either ",
           "source or target abundance is null")
    }
    if (all(s[["abundance"]] == t[["abundance"]])) {
      warning("Trasformation from counts -> counts with equal abundances ",
              "is a null operation")
      return("null.op")
    }
    return(TRUE)
        
  } else if (t$catalog.type == "density.signature") {
    # counts -> density.signature
    if (is.null(s[["abundance"]])) {
      stop("Cannot transform from counts -> denisity.signature ",
           "if source abundance is NULL")
    } 
    return(TRUE)
  } else if (t$catalog.type == "density") {
    # counts -> density
    if (is.null(s[["abundance"]])) {
      stop("Cannot transform from counts -> denisity ",
           "if source abundance is NULL")
    } 
    return(TRUE)
    
  }
  stop("programming error, unexpected catalog.type ", t$catalog.type)
}

#' density -> <anything>
#' density.signature -> density.signature, counts.signature
#' 
#' @keywords internal
TCFromDenSigDen <- function(s, t) {

  if (IsSignature(s$catalog.type))
    return(TCFromDenSig(s, t))
  else
    return(TCFromDen(s,t))
}

#' @keywords internal
#' 
#' density.signature -> anything
TCFromDenSig <- function (s, t) {
  if (t$catalog.type == "density") {
    # density.signature -> density
    stop("programming error: density.signature -> density")
  } else if (t$catalog.type == "density.signature") {
    # density.signature -> density.signature
    warning("Transform density.signature -> density signature",
            " is a null operation")
    return("null.op")
  } else if (t$catalog.type == "counts") {
    # density.signature -> counts
    stop("programming error: density.signature -> counts")
  } else if (t$catalog.type == "counts.signature") {
    # density.signature -> counts.signature
    if (is.null(t[["abundance"]])) {
      stop("Cannot transform density.signature -> counts.signature",
           " if target abundance is NULL")
    } else {  return(TRUE)   }
  }
  stop("programming error, unexpected catalog.type ", t$catalog.type)
}

#' @keywords internal
#' density -> anything
TCFromDen <- function (s, t) {  
  if (t$catalog.type == "density") {
    # density -> density
    warning("Transformation from density to density ",
            "is a null operation")
    return("null.op")
  } else if (t$catalog.type == "density.signature") { 
    # density -> density.signature
    return("sig.only")
  } else if (t$catalog.type == "counts") {
    # density -> counts
    if (is.null(t[["abundance"]])) {
      stop("Cannot tranform density -> counts; target abundance is NULL")
    } else {
      return(TRUE)
    }
  } else if (t$catalog.type == "counts.signature") {
    # density -> counts.signature
    if (is.null(t[["abundance"]])) {
      stop("Cannot tranform density -> counts.signature; target abundance is NULL")
    } else {
      return(TRUE)
    }    
  }
  stop("programming error, unexpected catalog.type ", t$catalog.type)
}

#' @keywords internal
AbundanceIsSame <- function(a1, a2) {
  if (is.null(a1)  && is.null(a2))  return(TRUE)
  if (is.null(a1)  && !is.null(a2)) return(FALSE)
  if (!is.null(a1) && is.null(a2))  return(FALSE)
  if (all(a1 == a2) && all(names(a1) == names(a2))) return(TRUE)
  return(FALSE)
}

#' @keywords internal
RefGenomeIsSame <- function(a1, a2) {
  if (is.null(a1)  && is.null(a2))  return(TRUE)
  if (is.null(a1)  && !is.null(a2)) return(FALSE)
  if (!is.null(a1) && is.null(a2))  return(FALSE)
  a1 <- NormalizeGenomeArg(a1)
  a2 <- NormalizeGenomeArg(a2)
  if (a1@pkgname == a2@pkgname) return(TRUE)
  return(FALSE)
}

#' @keywords internal
RegionIsSame <- function(a1, a2) {
  if (is.null(a1)  && is.null(a2))  return(TRUE)
  if (is.null(a1)  && !is.null(a2)) return(FALSE)
  if (!is.null(a1) && is.null(a2))  return(FALSE)
  if (a1 == a2) return(TRUE)
  return(FALSE)
}

#' @keywords internal
CheckCatalogAttributes <- function(catalog, target.catalog.type) {
  ref.genome <- attr(catalog, "ref.genome", exact = TRUE)
  catalog.type <- attr(catalog, "catalog.type", exact = TRUE)
  abundance <- attr(catalog, "abundance", exact = TRUE)
  region <- attr(catalog, "region", exact = TRUE)
  check.result <- TRUE
  
  if (is.null(ref.genome)) {
    check.result <<- TRUE
  } else if (inherits(ref.genome, "BSgenome")) {
    check.result <<- TRUE
  } else if (ref.genome == "mixed") {
    stop('Cannot perform transformation from a catalog with "mixed" ref.genome')
  }
  if (is.null(catalog.type)) {
    stop("Cannot perform transformation from a catalog with NULL catalog.type")
  }
  if (is.null(abundance)) {
    if (!(catalog.type == "counts" && target.catalog.type == "counts.signature")) {
      stop("Cannot perform transformation from a catalog with NULL abundance")
    }
  } else if (!inherits(abundance, "integer")) {
    stop("Cannot perform transformation from a catalog with non integer abundance")
  }
  if (is.null(region)) {
    stop("Cannot perform transformation from a catalog with NULL region")
  } else if (region == "mixed") {
    stop('Cannot perform transformation from a catalog with "mixed" region')
  }
  return(check.result)
}

#' Transform between counts and density spectrum catalogs
#' and counts and density signature catalogs
#'
#' @details Only the following transformations are legal:
#'
#' \enumerate{
#'
#' \item \code{counts -> counts} (deprecated, generates a warning;
#' we strongly suggest that you work with densities if comparing
#' spectra or signatures generated from data with
#' different underlying abundances.)
#' 
#' \item \code{counts -> density}
#'
#' \item \code{counts -> (counts.signature, density.signature)}
#'
#' \item \code{density -> counts} (the semantics are to
#' infer the genome-wide or exome-wide counts based on the
#' densities)
#'
#' \item \code{density -> density} (a null operation, generates
#' a warning)
#'
#' \item \code{density -> (counts.signature, density.signature)}
#'
#' \item \code{counts.signature -> counts.signature} (used to transform
#'    between the source abundance and \code{target.abundance})
#'
#' \item \code{counts.signature -> density.signature}
#' 
#' \item \code{counts.signature -> (counts, density)} (generates an error)
#' 
#' \item \code{density.signature -> density.signature} (a null operation,
#' generates a warning)
#'
#' \item \code{density.signature -> counts.signature}
#' 
#' \item \code{density.signature -> (counts, density)} (generates an error)
#'
#' }
#'
#'
#' @param catalog An SBS or DBS catalog as described in \code{\link{ICAMS}};
#'  must \strong{not} be an ID (small insertion and deletion) catalog.
#'
#' @param target.ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}. If \code{NULL}, then defaults to the
#'   \code{ref.genome} attribute of \code{catalog}.
#'
#' @param target.region A \code{region} argument; see \code{\link{as.catalog}}
#' and \code{\link{ICAMS}}. If \code{NULL}, then defaults to the
#'   \code{region} attribute of \code{catalog}.
#'
#' @param target.catalog.type A character string acting as a catalog type
#'   identifier, one of "counts", "density", "counts.signature",
#'   "density.signature"; see \code{\link{as.catalog}}. If \code{NULL}, then defaults to the
#'   \code{catalog.type} attribute of \code{catalog}.
#'   
#' @param target.abundance  
#'   A vector of counts, one for each source K-mer for mutations (e.g. for
#'   strand-agnostic single nucleotide substitutions in trinucleotide -- i.e.
#'   3-mer -- context, one count each for ACA, ACC, ACG, ... TTT). See
#'   \code{\link{all.abundance}}. If \code{NULL}, the function tries to infer
#'   \code{target.abundace} from the class of \code{catalog} and the value of
#'   the \code{target.ref.genome}, \code{target.region}, and
#'   \code{target.catalog.type}. If the \code{target.abundance} can be inferred
#'   and is different from a supplied non-\code{NULL} value of
#'   \code{target.abundance}, raise an error.
#'   
#' @return A catalog as defined in \code{\link{ICAMS}}.
#' 
#' @section Rationale:  
#' The \code{\link{TransformCatalog}}
#' function transforms catalogs of mutational spectra or
#' signatures to account for differing abundances of the source
#' sequence of the mutations in the genome.
#'
#' For example, mutations from
#' ACG are much rarer in the human genome than mutations from ACC
#' simply because CG dinucleotides are rare in the genome.
#' Consequently, there are two possible representations of
#' mutational spectra or signatures. One representation is
#' based on mutation counts as observed in a given genome
#' or exome,
#' and this approach is widely used, as, for example, at
#' https://cancer.sanger.ac.uk/cosmic/signatures, which
#' presents signatures based on observed mutation counts
#' in the human genome. We call these "counts-based spectra"
#' or "counts-based signatures".
#'
#' Alternatively,
#' mutational spectra or signatures can be represented as
#' mutations per source sequence, for example
#' the number of ACT > AGT mutations occurring at all
#' ACT 3-mers in a genome. We call these "density-based
#' spectra" or "density-based signatures".
#'
#' This function can also transform spectra
#' based on observed genome-wide counts to "density"-based
#' catalogs. In density-based catalogs
#' mutations are expressed as mutations per
#' source sequences. For example,
#' a density-based catalog represents
#' the proportion of ACCs mutated to
#' ATCs, the proportion of ACGs mutated to ATGs, etc.
#' This is
#' different from counts-based mutational spectra catalogs, which
#' contain the number of ACC > ATC mutations, the number of
#' ACG > ATG mutations, etc.
#'
#' This function can also transform observed-count based
#' spectra or signatures from genome to exome based counts,
#' or between different species (since the abundances of
#' source sequences vary between genome and exome and between
#' species).
#'
#' @export
#' 
#' @examples 
#' file <- system.file("extdata",
#'                     "strelka.regress.cat.sbs.96.csv",
#'                     package = "ICAMS")
#' if (requireNamespace("BSgenome.Hsapiens.1000genomes.hs37d5", quietly = TRUE)) {
#'   catSBS96.counts <- ReadCatalog(file, ref.genome = "hg19", 
#'                                  region = "genome",
#'                                  catalog.type = "counts")
#'   catSBS96.density <- TransformCatalog(catSBS96.counts,
#'                                        target.ref.genome = "hg19",
#'                                        target.region = "genome",
#'                                        target.catalog.type = "density")}
TransformCatalog <-
  function(catalog, 
           target.ref.genome   = NULL,
           target.region       = NULL, 
           target.catalog.type = NULL,
           target.abundance    = NULL) {
    # Check the attributes of the catalog
    stopifnot(CheckCatalogAttributes(catalog, target.catalog.type))
    
    # Check and normalize the arguments
    args <-
      CheckAndNormalizeTranCatArgs(
        catalog             = catalog,
        target.ref.genome   = target.ref.genome,
        target.catalog.type = target.catalog.type,
        target.region       = target.region,
        target.abundance    = target.abundance)
    rm(target.ref.genome, target.catalog.type, target.region, target.abundance)
    s <- args$s
    t <- args$t
    
    # Check if the transformation is legal
    test <- IsTransformationLegal(s, t)
    if (test == "null.op") return(catalog)
    if (test == "sig.only") {
      out <- apply(catalog, MARGIN = 2, function (x) x / sum(x))
      return(as.catalog(out, t$ref.genome,
                        t[["region"]], t[["catalog.type"]], t[["abundance"]])) 
    }

    factor <- t[["abundance"]] / s[["abundance"]]
    if (any(is.infinite(factor)) || any(is.na(factor))) {
      factor[is.infinite(factor)] <- 0
      factor[is.na(factor)] <- 0
    }

    names(factor) <- names(t[["abundance"]])
    out.catalog <- catalog

    for(source.n.mer in names(s[["abundance"]])) {
      # CAUTION: this loop depends on how mutations are encoded in
      # the row names of catalogs!
      
      # For 96 and 192 SBS, source.n.mer is e.g. "ACT" (for the encoding of ACT >
      # AGT as "ACTG"); for SBS1536 the n-mer for AACAG > AATAG is AACAG, in the
      # encoding AACAGT. For DBS78 and DBS144 TGGA represents TG >GA, and the
      # source n-mer is TG. For DBS136, TTGA represents TTGA > TNNA, and the
      # source n-mer is TTGA.
      
      # First, get the rows with the given source.n.mer
      rows <- grep(paste("^", source.n.mer, sep=''), rownames(out.catalog))
      # Then update those rows using the factor for that source.n.mer
      
      out.catalog[rows, ] <- out.catalog[rows, ] * factor[source.n.mer]
    }

    if (t[["catalog.type"]] %in% c("counts.signature", "density.signature")) {
      out2 <- apply(out.catalog, MARGIN = 2, function (x) x / sum(x))
    } else {
      out2 <- out.catalog
    }
    return(as.catalog(out2, t[["ref.genome"]], t[["region"]],
                      t[["catalog.type"]], t[["abundance"]]))

  }


#' Create a transcript range file from the raw GFF3 File
#'
#' @param file The name/path of the raw GFF3 File, or a complete URL.
#'
#' @importFrom  stringi stri_split_fixed
#'
#' @return A data table which contains chromosome name, start, end position,
#'   strand information and gene name. It is keyed by chrom, start, and
#'   end. Only genes that are associated with a CCDS ID are kept for
#'   transcriptional strand bias analysis.
#'
#' @keywords internal
CreateTransRanges <- function(file) {
  df <- read.csv(file, header = FALSE, fill = TRUE, nrows = 20)
  # Count the number of comment lines
  n <- sum(grepl("#", df[, 1]))

  # Read in the raw GFF3 File while skipping the comment lines
  dt <- data.table::fread(file, header = FALSE, sep = "\t", fill = TRUE, skip = n)

  # Extract the gene ID associated with CCDS ID
  dt1 <- dt[grep("CCDS", dt$V9), ]
  info <- stri_split_fixed(dt1$V9, ";")
  gene.id.CCDS.idx <- lapply(info, grep, pattern = "gene_id")
  ExtractInfo <- function(idx, list1, list2) {
    return(list1[[idx]][list2[[idx]]])
  }
  gene.id.CCDS <-
    unique(sapply(1:length(info), ExtractInfo,
                  list1 = info, list2 = gene.id.CCDS.idx))
  gene.id.CCDS <- sapply(stri_split_fixed(gene.id.CCDS, "="), "[", 2)

  # Extract the gene ID and gene name from entries in dt which belong to the
  # type "gene"
  dt2 <- dt[dt$V3 == "gene", ]
  attributes.info <- stri_split_fixed(dt2$V9, ";")
  gene.id.idx <- lapply(attributes.info, grep, pattern = "gene_id")
  gene.name.idx <- lapply(attributes.info, grep, pattern = "gene_name")
  len <- length(attributes.info)
  gene.id <-
    sapply(1:len, ExtractInfo, list1 = attributes.info, list2 = gene.id.idx)
  gene.id <- sapply(stri_split_fixed(gene.id, "="), "[", 2)
  gene.name <-
    sapply(1:len, ExtractInfo, list1 = attributes.info, list2 = gene.name.idx)
  gene.name <- sapply(stri_split_fixed(gene.name, "="), "[", 2)
  dt3 <- dt2[, c("gene.id", "gene.name") := .(gene.id, gene.name)]

  # Only keep the entries in dt3 with gene ID that is associated with CCDS ID.
  # Select the necessary columns and standardize the chromosome names.
  dt4 <- dt3[gene.id %in% gene.id.CCDS, c(1, 4, 5, 7, 10, 11)]
  colnames(dt4) <- c("chrom", "start", "end", "strand", 
                     "Ensembl.gene.ID", "gene.name")
  dt5 <- StandardChromName(dt4)

  # Reorder dt5 according to chromosome name, start and end position
  chrOrder <- c((1:22), "X", "Y")
  dt5$chrom <- factor(dt5$chrom, chrOrder, ordered = TRUE)
  setkeyv(dt5, c("chrom", "start", "end"))

  # Check whether one gene.name have multiple entries in dt5
  dt6 <- dt5[, count := .N, by = .(chrom, gene.name)]
  dt7 <- dt6[count == 1, ]
  dt8 <- dt6[count != 1, ]
  if (nrow(dt8) == 0) {
    return(dt7[, c(1:6)])
  } else {
    # Check and remove entries in dt8 if it has readthrough transcripts
    dt9 <- dt[grep("readthrough", dt$V9), ]
    info1 <- stri_split_fixed(dt9$V9, ";")
    gene.id.readthrough.idx <- lapply(info1, grep, pattern = "gene_id")
    gene.id.readthrough <-
      unique(sapply(1:length(info1), ExtractInfo,
                    list1 = info1, list2 = gene.id.readthrough.idx))
    gene.id.readthrough1 <- 
      sapply(stri_split_fixed(gene.id.readthrough, "="), "[", 2)
    dt8$readthrough <- FALSE
    dt9 <- dt8[Ensembl.gene.ID %in% gene.id.readthrough1, readthrough := TRUE]
    dt10 <- dt9[readthrough == FALSE, ]
    dt11 <- dplyr::distinct(dt10, chrom, gene.name, .keep_all = TRUE)
    
    # Rbind dt7 and dt11
    dt12 <- rbind(dt7[, c(1:6)], dt11[, c(1:6)], use.names = FALSE)
    
    # Remove the string after dot in the Ensembl gene ID
    Ensembl.gene.ID.withoutdot <- gsub("\\..*", "", dt12$Ensembl.gene.ID)
    dt12$Ensembl.gene.ID <- Ensembl.gene.ID.withoutdot
    
    return(setkeyv(dt12, c("chrom", "start", "end")))
  }
}

#' Create exome transcriptionally stranded regions
#'
#' @param file Path to a SureSelect BED file which contains unstranded exome
#'   ranges.
#'
#' @param trans.ranges A data.table which contains transcript range and strand
#'   information. Please refer to \code{\link{TranscriptRanges}} for more
#'   details.
#'
#' @import data.table
#'
#' @return A data table which contains chromosome name, start, end position,
#'   strand information. It is keyed by chrom, start, and
#'   end.
#'
#' @keywords internal
CreateExomeStrandedRanges <- function(file, trans.ranges) {
  exome.ranges <- ReadBedRanges(file)
  colnames(exome.ranges) <- c("chrom", "exome.start", "exome.end")

  # Remove ranges which fall on transcripts on both strands and get
  # transcriptionally stranded regions
  trans.ranges <- RemoveRangesOnBothStrand(trans.ranges)

  # Find range overlaps between the exome.ranges and trans.ranges
  dt <- foverlaps(exome.ranges, trans.ranges, type = "any", mult = "all")
  dt1 <- dt[!is.na(strand)]

  # Find out and remove exome ranges that fall on transcripts on both strands
  dt2 <- dt1[, bothstrand := "+" %in% strand && "-" %in% strand,
             by = .(chrom, exome.start, exome.end)]
  dt3 <- dt2[bothstrand == FALSE]

  # One unstranded exome range can be represented by more than 1 row in dt3 if it
  # overlaps with multiple transcriptionally stranded regions. When creating the
  # exome transcriptionally stranded regions, we only need to count those ranges once.
  dt4 <- dt3[, count := .N, by = .(chrom, exome.start, exome.end)]
  dt4 <- dt3[, .(strand = strand[1]), by = .(chrom, exome.start, exome.end)]

  # Intersect dt4 ranges with the transcriptionally stranded regions
  gr1 <- GenomicRanges::makeGRangesFromDataFrame(dt4)
  gr2 <- GenomicRanges::makeGRangesFromDataFrame(trans.ranges)
  gr3 <- GenomicRanges::intersect(gr1, gr2)

  dt5 <- as.data.table(gr3)[, c(1:3, 5)]
  colnames(dt5)[1] <- "chrom"
  chrOrder <- c((1:22), "X", "Y")
  dt5$chrom <- factor(dt5$chrom, chrOrder, ordered = TRUE)
  return(setkeyv(dt5, c("chrom", "start", "end")))
}

#' @keywords internal
PyrTri <- function(mutstring) {
  stopifnot(nchar(mutstring) == rep(4, length(mutstring)))
  output <-
    ifelse(substr(mutstring, 2, 2) %in% c("A", "G"),
           paste0(revc(substr(mutstring, 1, 3)),
                  revc(substr(mutstring, 4, 4))),
           mutstring)
  return(output)
}

#' @keywords internal
PyrPenta <- function(mutstring) {
  stopifnot(nchar(mutstring) == rep(6, length(mutstring)))
  output <-
    ifelse(substr(mutstring, 3, 3) %in% c("A", "G"),
           paste0(revc(substr(mutstring, 1, 5)),
                  revc(substr(mutstring, 6, 6))),
           mutstring)
  return(output)
}

#' Reverse complement every string in \code{string.vec}
#' 
#' Based on \code{\link{reverseComplement}}.
#' Handles IUPAC ambiguity codes but not "u" (uracil). \cr
#' (see <https://en.wikipedia.org/wiki/Nucleic_acid_notation>).
#'
#' @param string.vec A character vector.
#'
#' @importFrom Biostrings reverseComplement DNAStringSet
#'
#' @return A character vector with the reverse complement of every
#'   string in \code{string.vec}.
#'
#' @export
#' 
#' @examples 
#' revc("aTgc") # GCAT
#' 
#' # A vector and strings with ambiguity codes
#' revc(c("ATGC", "aTGc", "wnTCb")) # GCAT GCAT VGANW
#' 
#' \dontrun{
#' revc("ACGU") # An error}
revc <- function(string.vec) {
  return(
    as.character(reverseComplement(DNAStringSet(string.vec)))
  )
}

#' @title Reverse complement strings that represent stranded SBSs
#'
#' @param mutstring A vector of 4-character strings representing
#' stranded SBSs in trinucleotide context,
#' for example "AATC" represents AAT > ACT mutations.
#'
#' @return Return the vector of
#' reverse complements of the first 3 characters
#' concatenated with the reverse complement of the
#' last character, e.g. "AATC" returns "ATTG".
#'
#' @keywords internal
RevcSBS96 <- function(mutstring) {
  stopifnot(nchar(mutstring) == rep(4, length(mutstring)))
  context <- revc(substr(mutstring, 1, 3))
  target  <- revc(substr(mutstring, 4, 4))
  return(paste0(context, target))
}

#' @title Reverse complement strings that represent stranded DBSs
#'
#' @param mutstring A vector of 4-character strings representing
#' stranded DBSs, for example "AATC" represents AA > TC mutations.
#'
#' @return Return the vector of
#' reverse complements of the first 2 characters
#' concatenated with the reverse complement of the second
#' 2 characters, e.g. "AATC" returns "TTGA".
#'
#' @keywords internal
RevcDBS144 <- function(mutstring) {
  stopifnot(nchar(mutstring) == rep(4, length(mutstring)))
  context <- revc(substr(mutstring, 1, 2))
  target  <- revc(substr(mutstring, 3, 4))
  return(paste0(context, target))
}

#' Read transcript ranges and strand information from a gff3 format file.
#' Use this one for the new, cut down gff3 file (2018 11 24)
#'
#' @param file Path to the file with the transcript information with 1-based
#'   start end positions of genomic ranges.
#'
#' @return A data.table keyed by chrom, start, and end.
#'
#' @keywords internal
ReadTranscriptRanges <- function(file) {
  dt <- data.table::fread(file)
  colnames(dt) <- c("chrom", "start", "end", "strand", 
                    "Ensembl.gene.ID", "gene.symbol")
  
  # Check whether the transcript ranges come from human or mouse genomes
  if (length(unique(dt$chrom)) == 24) {
    chrOrder <- c((1:22), "X", "Y")
  } else if (length(unique(dt$chrom)) == 21) {
    chrOrder <- c((1:19), "X", "Y")
  }
  
  dt$chrom <- factor(dt$chrom, chrOrder, ordered = TRUE)
  data.table::setkeyv(dt, c("chrom", "start", "end"))
  return(dt)
}

#' Read chromosome and position information from a bed format file.
#'
#' @param file Path to the file in bed format.
#'
#' @return A data.table keyed by chrom, start, and end. It uses one-based
#'   coordinates.
#'   
#' @keywords internal
ReadBedRanges <- function(file) {
  dt <- data.table::fread(file)
  dt1 <- StandardChromName(dt)
  colnames(dt1)[1:3] <- c("chrom", "start", "end")

  # Delete duplicate entries in the BED file
  dt2 <- dplyr::distinct(dt1, chrom, start, end, .keep_all = TRUE)

  # Bed file are 0 based start and 1 based end (an oversimplification).
  # We need to add 1L and not 1, otherwise the column turns to a double
  # we get a warning from data.table.
  dt2$start <- dt2$start + 1L

  chrOrder <- c((1:22), "X", "Y")
  dt2$chrom <- factor(dt2$chrom, chrOrder, ordered = TRUE)
  return(data.table::setkeyv(dt2, c("chrom", "start", "end")))
}

#' Create trinucleotide abundance
#'
#' @param file Path to the file with the nucleotide abundance information with 3
#'   base pairs.
#'
#' @return A numeric vector whose names indicate 32 different types of 3 base pairs
#'   combinations while its values indicate the occurrences of each type.
#'
#' @keywords internal
CreateTrinucAbundance <- function(file) {
  dt <- fread(file)
  colnames(dt) <- c("3bp", "occurrences")
  dt$type <-
    ifelse(substr(dt[[1]], 2, 2) %in% c("A", "G"), revc(dt[[1]]), dt[[1]])
  dt1 <- dt[, .(counts = sum(occurrences)), by = type]
  abundance <- dt1$counts
  names(abundance) <- dt1$type
  return(abundance)
}

#' Create stranded trinucleotide abundance
#'
#' @param file Path to the file with the nucleotide abundance information with 3
#'   base pairs.
#'
#' @return A numeric vector whose names indicate 64 different types of 3 base pairs
#'   combinations while its values indicate the occurrences of each type.
#'
#' @keywords internal
CreateStrandedTrinucAbundance <- function(file) {
  dt <- fread(file)
  colnames(dt) <- c("3bp", "occurrences")
  dt$type <- dt[[1]]
  dt1 <- dt[, .(counts = sum(occurrences)), by = type]
  abundance <- dt1$counts
  names(abundance) <- dt1$type
  return(abundance)
}

#' Create dinucleotide abundance
#'
#' @param file Path to the file with the nucleotide abundance information with 2
#'   base pairs.
#'
#' @import data.table
#'
#' @return A numeric vector whose names indicate 10 different types of 2 base pairs
#'   combinations while its values indicate the occurrences of each type.
#'
#' @keywords internal
CreateDinucAbundance <- function(file) {
  dt <- fread(file)
  colnames(dt) <- c("2bp", "occurrences")
  canonical.ref <-
    c("AC", "AT", "CC", "CG", "CT", "GC", "TA", "TC", "TG", "TT")
  dt$type <-
    ifelse((dt[[1]]) %in% canonical.ref, dt[[1]], revc(dt[[1]]))
  dt1 <- dt[, .(counts = sum(occurrences)), by = type]
  abundance <- dt1$counts
  names(abundance) <- dt1$type
  return(abundance)
}

#' Create stranded dinucleotide abundance
#'
#' @param file Path to the file with the nucleotide abundance information with 2
#'   base pairs.
#'
#' @import data.table
#'
#' @return A numeric vector whose names indicate 16 different types of 2 base pairs
#'   combinations while its values indicate the occurrences of each type.
#'
#' @keywords internal
CreateStrandedDinucAbundance <- function(file) {
  dt <- fread(file)
  colnames(dt) <- c("2bp", "occurrences")
  dt$type <- dt[[1]]
  dt1 <- dt[, .(counts = sum(occurrences)), by = type]
  abundance <- dt1$counts
  names(abundance) <- dt1$type
  return(abundance)
}

#' Create tetranucleotide abundance
#'
#' @param file Path to the file with the nucleotide abundance information with 4
#'   base pairs.
#'
#' @import data.table
#'
#' @return A numeric vector whose names indicate 136 different types of 4 base pairs
#'   combinations while its values indicate the occurrences of each type.
#'
#' @keywords internal
CreateTetranucAbundance <- function(file) {
  dt <- fread(file)
  colnames(dt) <- c("4bp", "occurrences")
  dt$type <- CanonicalizeQUAD(dt[[1]])
  dt1 <- dt[, .(counts = sum(occurrences)), by = type]
  abundance <- dt1$counts
  names(abundance) <- dt1$type
  return(abundance)
}

#' Create pentanucleotide abundance
#'
#' @param file Path to the file with the nucleotide abundance information
#'   with 5 base pairs.
#'
#' @import data.table
#'
#' @return A numeric vector whose names indicate 512 different types of 5 base
#'   pairs combinations while its values indicate the occurrences of each type.
#'
#' @keywords internal
CreatePentanucAbundance <- function(file) {
  dt <- fread(file)
  colnames(dt) <- c("5bp", "occurrences")
  dt$type <-
    ifelse(substr(dt[[1]], 3, 3) %in% c("A", "G"), revc(dt[[1]]), dt[[1]])
  dt1 <- dt[, .(counts = sum(occurrences)), by = type]
  abundance <- dt1$counts
  names(abundance) <- dt1$type
  return(abundance)
}

#' Take strings representing a genome and return the \code{\link{BSgenome}} object.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'
#' @return If \code{ref.genome} is 
#' a \code{\link{BSgenome}} object, return it.
#' Otherwise return the \code{\link{BSgenome}} object identified by the
#' string \code{ref.genome}.
#'
#' @keywords internal
NormalizeGenomeArg <- function(ref.genome) {
  
  if (is.null(ref.genome)) stop("Need a non-NULL ref.genome")
  if (class(ref.genome) == "BSgenome") return(ref.genome)
  
  stopifnot(class(ref.genome) == "character")
  
  if (ref.genome %in%
      c("GRCh38", "hg38", "BSgenome.Hsapiens.UCSC.hg38")) {
    if ("" == system.file(package = "BSgenome.Hsapiens.UCSC.hg38")) {
      stop("\nPlease install BSgenome.Hsapiens.UCSC.hg38:\n",
           "BiocManager::install(\"BSgenome.Hsapiens.UCSC.hg38\")")
    }
    stopifnot(requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly = TRUE))
    ref.genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
  } else if (ref.genome %in%
             c("GRCh37", "hg19", "BSgenome.Hsapiens.1000genomes.hs37d5")) {
    if ("" == system.file(package = "BSgenome.Hsapiens.1000genomes.hs37d5")) {
      stop("\nPlease install BSgenome.Hsapiens.1000genomes.hs37d5:\n",
           "BiocManager::install(\"BSgenome.Hsapiens.1000genomes.hs37d5\")")
    }
    stopifnot(requireNamespace("BSgenome.Hsapiens.1000genomes.hs37d5", quietly = TRUE))
    ref.genome <- 
      BSgenome.Hsapiens.1000genomes.hs37d5::BSgenome.Hsapiens.1000genomes.hs37d5
  } else if (ref.genome %in%
             c("GRCm38", "mm10", "BSgenome.Mmusculus.UCSC.mm10")) {
    if ("" == system.file(package = "BSgenome.Mmusculus.UCSC.mm10")) {
      stop("\nPlease install BSgenome.Mmusculus.UCSC.mm10:\n",
           "BiocManager::install(\"BSgenome.Mmusculus.UCSC.mm10\")")
    }
    stopifnot(requireNamespace("BSgenome.Mmusculus.UCSC.mm10", quietly = TRUE))
    ref.genome <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
  } else {
    stop("Unrecoginzed ref.genome:\n", ref.genome,
         "\nNeed a BSgenome reference genome\n",
         "or one of the character strings GRCh38, hg38, GRCh37, hg19, ",
         "GRCm38, mm10")
  }
  
  return(ref.genome)
}

#' Stop if \code{region} is illegal for an in-transcript catalogs
#'
#' @param region The region to test (a character string)
#' 
#' @keywords internal

StopIfTranscribedRegionIllegal <- function(region) {
  if (!region %in% c("transcript", "exome", "unknown")) {
    stop("Require region to be one of transcript, exome, or unknown for ",
         "SBS192Catalog and DBS144Catalog\n",
         "Got ", region)
  }
}

#' Stop if the number of rows in \code{object} is illegal
#' 
#' @param object A \code{catalog}, numeric \code{matrix}, or numeric \code{data.fram}
#' 
#' @keywords internal
StopIfNrowIllegal <- function(object) {
  # Will call stop() if the number or rows is illegal.
  InferCatalogClassString(object)

}

#' Stop if \code{region} is illegal.
#'
#' @param region Character string to check.
#' 
#' @keywords internal
StopIfRegionIllegal <- function(region) {
  if (!region %in% c("genome", "exome", "transcript", "unknown")) {
    stop("Unrecoginzed region identifier: ", region,
         "\nNeed one of genome, exome, transcript, unknown")
  }
 return(NULL)
}

#' Stop if \code{catalog.type} is illegal.
#'
#' @param catalog.type Character string to check.
#'
#' @keywords internal
StopIfCatalogTypeIllegal <- function(catalog.type) {
  if (is.null(catalog.type)) return (NULL)
  if (!catalog.type %in% c("counts", "density",
                           "counts.signature", "density.signature")) {
    stop("Unrecoginzed catalog type identifier: ", catalog.type,
         "\nNeed one of counts, density, counts.signature, density.signature")
  }
  return(NULL)
}


#' Check whether the rownames of \code{object} are correct, if yes then put the
#' rows in the correct order.
#'
#' @keywords internal
CheckAndReorderRownames <- function(object) {
  prefix <- InferCatalogClassPrefix(object)
  correct.rownames <- ICAMS::catalog.row.order[[prefix]]
  difference <- setdiff(correct.rownames, rownames(object))
  if(identical(difference, character(0))) {
    return(object[correct.rownames, , drop = FALSE])
  } else {
    stop("\nThe input object does not have correct rownames to denote the\n", 
         "mutation types. Mutation types that are missing are\n", 
         paste(difference, collapse = " "))
  }
}

#' Test if object is \code{BSgenome.Hsapiens.1000genome.hs37d5}.
#'
#' @param x Object to test.
#' 
#' @return TRUE if \code{x} is \code{BSgenome.Hsapiens.1000genome.hs37d5}.
#' 
#' @keywords internal
IsGRCh37 <- function(x) {
  if (is.null(x)) return(FALSE)
  return(NormalizeGenomeArg(x)@pkgname == 
           "BSgenome.Hsapiens.1000genomes.hs37d5")
}

#' Test if object is \code{BSgenome.Hsapiens.UCSC.hg38}.
#'
#' @param x Object to test.
#' 
#' @return TRUE if \code{x} is \code{BSgenome.Hsapiens.UCSC.hg38}.
#' 
#' @keywords internal
IsGRCh38 <- function(x) {
  if (is.null(x)) return(FALSE)
  return(NormalizeGenomeArg(x)@pkgname == 
           "BSgenome.Hsapiens.UCSC.hg38")
}

#' Test if object is \code{BSgenome.Mmusculus.UCSC.mm10}.
#'
#' @param x Object to test.
#' 
#' @return TRUE if \code{x} is \code{BSgenome.Mmusculus.UCSC.mm10}.
#' 
#' @keywords internal
IsGRCm38 <- function(x) {
  if (is.null(x)) return(FALSE)
  return(NormalizeGenomeArg(x)@pkgname == 
           "BSgenome.Mmusculus.UCSC.mm10")
}


#' Infer \code{abundance} given a matrix-like \code{object} and additional information.
#'
#' @param object A numeric matrix, numeric data frame, or \code{catalog}.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'
#' @param region A character string designating a genomic region;
#'  see \code{\link{as.catalog}} and \code{\link{ICAMS}}.
#'
#' @param catalog.type A character string for \code{catalog.type}
#' as described in \code{\link{ICAMS}}.
#'
#' @return A value that can be set as the abundance attribute of
#' a \code{catalog} (which may be \code{NULL} if no abundance
#' can be inferred).
#'
#' @keywords internal
InferAbundance <- function(object, ref.genome, region, catalog.type) {
    
    StopIfNrowIllegal(object)
    StopIfRegionIllegal(region)
    StopIfCatalogTypeIllegal(catalog.type)
    if (nrow(object) == 1697) {
      return(NULL)
      # There are no meaningful abundances for COMPOSITE catalogs.
    }

    if (IsDensity(catalog.type)) {
      ab <- flat.abundance[[as.character(nrow(object))]]
      stopifnot(!is.null(ab))
      return(ab)
    }
    
    if (is.null(ref.genome)) return(NULL)
    ref.genome <- NormalizeGenomeArg(ref.genome)
    ab <- ICAMS::all.abundance[[ref.genome@pkgname]]
    if (is.null(ab)) return(NULL)

    ab2 <- ab[[region]]
    if (is.null(ab2)) return(NULL)

    ab3 <- ab2[[as.character(nrow(object))]]

    return(ab3)

  }

#' Create a catalog from a \code{matrix}, \code{data.frame}, or \code{vector}
#'
#' @param object A numeric \code{matrix}, numeric \code{data.frame},
#' or \code{vector}.
#' If a \code{vector}, converted to a 1-column \code{matrix}
#' with rownames taken from the element names of the \code{vector}
#' and with column name \code{"Unknown"}.
#' If argument \code{infer.rownames}
#'  is \code{FALSE} then this argument must have
#'   rownames to denote the mutation types. See \code{\link{CatalogRowOrder}}
#'   for more details.
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'
#' @param region A character string designating a region, one of
#' \code{genome}, \code{transcript}, \code{exome}, \code{unknown};
#' see \code{\link{ICAMS}}. If the catalog type is a stranded
#' catalog type (SBS192 or DBS144), region = "genome" will
#' be silently converted to "transcript".
#'
#' @param catalog.type One of "counts", "density", "counts.signature",
#'   "density.signature".
#'
#' @param abundance If \code{NULL}, then
#'  inferred if \code{ref.genome}
#'  is one of
#'  the reference genomes known to ICAMS and \code{region}
#'  is not \code{unknown}. See \code{\link{ICAMS}}.
#'  The argument \code{abundance} should
#'  contain the counts of different source sequences for mutations
#'  in the same format as the numeric vectors in \code{\link{all.abundance}}.
#'  
#' @param infer.rownames If \code{TRUE}, and \code{object} has no
#' rownames, then assume the rows of \code{object} are in the
#' correct order and add the rownames implied by the number of rows
#' in \code{object} (e.g. rownames for SBS 192 if there are 192 rows).
#' If \code{TRUE}, \strong{be sure the order of rows is correct.}
#'
#' @return A catalog as described in \code{\link{ICAMS}}.
#'
#' @export
#' 
#' @examples
#' # Create an SBS96 catalog with all mutation counts equal to 1.  
#' object <- matrix(1, nrow = 96, ncol = 1, 
#'                  dimnames = list(catalog.row.order$SBS96))
#' catSBS96 <- as.catalog(object)
             
as.catalog <- function(object, 
                       ref.genome = NULL, 
                       region = "unknown", 
                       catalog.type = "counts", 
                       abundance = NULL,
                       infer.rownames = FALSE) {
  if (!is.matrix(object)) {
    if (is.data.frame(object)) {
      object <- as.matrix(object)
    } else if (is.vector(object)) {
      obj2 <- matrix(object, ncol = 1)
      rownames(obj2) <- names(object)
      colnames(obj2) <- "Unknown"
      object <- obj2
    } else {
      stop("object must be numeric matrix, vector, or data frame")
    }
  }
  stopifnot(mode(object) == "numeric")

  if (is.null(rownames(object))) {
    if (!infer.rownames) {
      stop("Require correct rownames on object unless infer.rownames == TRUE")
    }
    rownames(object) <- InferRownames(object)
  } else {
    object <- CheckAndReorderRownames(object)
  }

  StopIfRegionIllegal(region)
  
  # Will call stop() if nrow(object) is illegal
  class.string  <- InferCatalogClassString(object)
  
  StopIfCatalogTypeIllegal(catalog.type)
  
  if (!is.null(ref.genome)) {
    ref.genome <- NormalizeGenomeArg(ref.genome)
  }
  attr(object, "ref.genome") <- ref.genome

  attr(object, "catalog.type") <- catalog.type
  
  if (is.null(abundance)) {
    abundance <- InferAbundance(object, ref.genome, region, catalog.type)
  } 
  attr(object, "abundance") <- abundance

  class(object) <- append(class(object), class.string, after = 0)
  class(object) <- unique(attributes(object)$class)
  
  # TODO(Steve): check that his has been moved 
  # to the VCF -> catalog functions
  if (attributes(object)$class[1] %in% c("SBS192Catalog", "DBS144Catalog")) {
    if (region == "genome") region <- "transcript"
    StopIfTranscribedRegionIllegal(region)
  }
  attr(object, "region") <- region
  
  return(object)
}

#' Generate all possible k-mers of length k.
#'
#' @param k Length of k-mers (k>=2)
#'
#' @return Character vector containing all possible k-mers.
#'
#' @keywords internal
GenerateKmer <- function(k) {
  base <- c("A", "C", "G", "T")
  list.of.base <- list()
  for (i in 1:k){
    list.of.base[[i]] <- base
  }
  permutation <- expand.grid(list.of.base, stringsAsFactors = FALSE)
  all.kmer.list <- character(nrow(permutation))
  for (i in 1:k) {
    all.kmer.list <- stringi::stri_c(all.kmer.list, permutation[[i]])
  }
  all.kmer.list <- stringi::stri_sort(all.kmer.list)
}

#' Generate an empty matrix of k-mer abundance
#'
#' @param k Length of k-mers (k>=2)
#'
#' @return An empty matrix of k-mer abundance
#'
#' @keywords internal
GenerateEmptyKmerCounts <- function(k) {
  all.kmer.list <- GenerateKmer(k)
  kmer.counts <- matrix(0, nrow = length(all.kmer.list))
  rownames(kmer.counts) <- all.kmer.list
  colnames(kmer.counts) <- "Freq"
  return(kmer.counts)
}

#' Generate k-mer abundance from given nucleotide sequences
#'
#' @param sequences A vector of nucleotide sequences
#'
#' @param k Length of k-mers (k>=2)
#'
#' @return Matrix of the counts of each k-mer inside \code{sequences}
#'
#' @keywords  internal
GetSequenceKmerCounts <- function(sequences, k) {
  kmer.counts <- GenerateEmptyKmerCounts(k)

  for(start_idx in 1:k){
    temp.seqs <- substring(sequences, start_idx, nchar(sequences))
    temp.kmers <-
      stringi::stri_extract_all_regex(
        temp.seqs, pattern = paste(rep(".", each = k), collapse = ""))
    temp.kmers <- unlist(temp.kmers)
    temp.kmer.counts <- data.frame(table(temp.kmers))
    row.names(temp.kmer.counts) <- temp.kmer.counts[, 1]
    if (any(grepl("N", temp.kmer.counts[, 1]))) {
      temp.kmer.counts <- temp.kmer.counts[-grep("N", temp.kmer.counts[, 1]), ]
    }

    kmer.counts[row.names(temp.kmer.counts), ] <-
      kmer.counts[row.names(temp.kmer.counts), ]  +
      temp.kmer.counts$Freq
  }
  return(kmer.counts)
}

#' Generate k-mer abundance from a given genome
#'
#' @param k Length of k-mers (k>=2)
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'
#' @param filter.path If given, homopolymers will be masked from
#'   genome(sequence). Only simple repeat masking is accepted now.
#'   
#' @param verbose If \code{TRUE}, generate progress messages.
#'
#' @importFrom GenomicRanges GRanges
#'
#' @importFrom IRanges IRanges
#'
#' @return Matrix of the counts of each k-mer across the \code{ref.genome}
#'
#' @keywords internal
GetGenomeKmerCounts <- function(k, ref.genome, filter.path, verbose = FALSE) {
  kmer.counts <- GenerateEmptyKmerCounts(k)
  genome <- NormalizeGenomeArg(ref.genome)

  # Remove decoyed chromosomes and mitochondrial DNA
  chr.list <- seqnames(genome)[which(nchar(seqnames(genome)) <= 5)]
  if (any(grepl("M", chr.list))) {
    chr.list <- chr.list[-grep("M", chr.list)]
  }

  if (!missing(filter.path)) {
    filter.df <- fread(filter.path, header = FALSE, stringsAsFactors = FALSE)
    filter.df <- filter.df[filter.df$V6 <= 6]
    filter.df <- StandardChromName(filter.df[, 2:ncol(filter.df)])
    # Check whether chromosome names in filter.df are the same as in ref.genome
    if (!(filter.df$V2[1] %in% seqnames(genome))){
      filter.df$V2 <- paste0("chr", filter.df$V2)
    }
  }

  if (verbose) message("Start counting by chromosomes")

  for (idx in 1:length(chr.list)) {
    if (verbose) message(chr.list[idx])

    if (!missing(filter.path)) {
      chr.filter.df <- filter.df[which(filter.df$V2 == chr.list[idx]), ]
      filter.chr <- with(chr.filter.df, GRanges(V2, IRanges(V3 + 1, V4)))
      genome.ranges.chr <-
        GRanges(chr.list[idx],
                IRanges(1, as.numeric(GenomeInfoDb::seqlengths(genome)[idx])))
      filtered.genome.ranges.chr <- GenomicRanges::setdiff(genome.ranges.chr, filter.chr)
      genome.seq <- BSgenome::getSeq(genome, filtered.genome.ranges.chr,
                                     as.character = TRUE)
      #Filter shorter homopolymer and microsatellites by regex
      genome.seq <- gsub(homopolymer.ms.regex.pattern, "N", genome.seq)

    } else {
      genome.seq <- BSgenome::getSeq(genome, chr.list[idx], as.character = TRUE)
    }

    kmer.counts <- kmer.counts + GetSequenceKmerCounts(genome.seq, k)
  }
  return(kmer.counts)
}

#' Remove ranges that fall on both strands
#'
#' @param stranded.ranges A keyed data table which has stranded ranges information.
#' It has four columns: chrom, start, end and strand.
#'
#' @return A data table which has removed ranges that fall on both strands from
#'   the input \code{stranded.ranges}.
#'
#' @keywords internal
RemoveRangesOnBothStrand <- function(stranded.ranges) {
  dt <- as.data.table(stranded.ranges)
  dt.plus <- dt[strand == "+", ]
  dt.minus <- dt[strand == "-", ]
  gr.plus <-
    GenomicRanges::makeGRangesFromDataFrame(dt.plus, keep.extra.columns = TRUE)
  gr.minus <-
    GenomicRanges::makeGRangesFromDataFrame(dt.minus, keep.extra.columns = TRUE)
  gr.plus.reduced <- GenomicRanges::reduce(gr.plus, with.revmap = TRUE)
  gr.minus.reduced <- GenomicRanges::reduce(gr.minus, with.revmap = TRUE)

  # Find ranges that have transcripts on both strands and remove these
  # ranges from each strand
  gr <-
    GenomicRanges::intersect(gr.plus.reduced, gr.minus.reduced,
                             ignore.strand = TRUE)

  if (length(gr) != 0) {
    gr1 <- gr
    gr1@strand@values <- "+"
    gr2 <- gr
    gr2@strand@values <- "-"
    gr3 <- GenomicRanges::setdiff(gr.plus.reduced, gr1)
    gr4 <- GenomicRanges::setdiff(gr.minus.reduced, gr2)
  } else {
    gr3 <- gr.plus.reduced
    gr4 <- gr.minus.reduced
  }

  # Get a new data table which does not have ranges on both strands
  dt1 <- as.data.table(gr3)
  dt2 <- as.data.table(gr4)
  dt3 <- rbind(dt1, dt2)
  dt4 <- dt3[, c(1:3, 5)]
  colnames(dt4) <- c("chrom", "start", "end", "strand")
  return(setkeyv(dt4, c("chrom", "start", "end")))
}

#' Generate stranded k-mer abundance from a given genome and gene annotation file
#'
#' @param k Length of k-mers (k>=2)
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'
#' @param filter.path If given, homopolymers will be masked from
#'   genome(sequence). Only simple repeat masking is accepted now.
#'
#' @param stranded.ranges A keyed data table which has stranded ranges
#'   information. It has four columns: chrom, start, end and strand. It should
#'   use one-based coordinate system.
#'   
#' @param verbose If \code{TRUE} generate progress messages.
#'
#' @importFrom stats start end
#'
#' @importFrom GenomicRanges GRanges
#'
#' @importFrom IRanges IRanges
#'
#' @return Matrix of the counts of each stranded k-mer across the \code{ref.genome}
#'
#' @keywords internal
GetStrandedKmerCounts <- 
  function(k, ref.genome, stranded.ranges, filter.path, verbose = FALSE) {
  stranded.ranges <- RemoveRangesOnBothStrand(stranded.ranges)
  stranded.ranges <- StandardChromName(stranded.ranges)
  genome <- NormalizeGenomeArg(ref.genome)
  kmer.counts <- GenerateEmptyKmerCounts(k)

  # Check whether chromosome names in stranded.ranges are the same as in ref.genome
  if (!(stranded.ranges$chrom[1] %in% seqnames(genome))) {
    stranded.ranges$chrom <- paste0("chr", stranded.ranges$chrom)
  }

  if (!missing(filter.path)) {
    filter.df <- fread(filter.path, header = FALSE, stringsAsFactors = FALSE)
    filter.df <- filter.df[filter.df$V6 <= 6]
    filter.df <- StandardChromName(filter.df[, 2:ncol(filter.df)])
    # Check whether chromosome names in filter.df are the same as in ref.genome
    if (!(filter.df$V2[1] %in% seqnames(genome))){
      filter.df$V2 <- paste0("chr", filter.df$V2)
    }
  }

  if (verbose) message("Start counting by chromosomes")

  for (chr in unique(stranded.ranges$chrom)) {
    if (verbose) message(chr)
    temp.stranded.ranges <- stranded.ranges[stranded.ranges$chrom == chr, ]
    stranded.ranges.chr <-
      with(temp.stranded.ranges,
           GRanges(chrom, IRanges(start, end), strand = strand))

    # Remove the overlapping ranges in stranded.ranges.chr
    stranded.ranges.chr <- IRanges::reduce(stranded.ranges.chr)

    if (!missing(filter.path)) {
      chr.filter.df <- filter.df[which(filter.df$V2 == chr), ]

      # Add strand information to chr.filter.df because
      # the setdiff function is strand specific
      filter.chr <-
        c(with(chr.filter.df, GRanges(V2, IRanges(V3 + 1, V4), strand = "+")),
          with(chr.filter.df, GRanges(V2, IRanges(V3 + 1, V4), strand = "-")))
      filtered.stranded.ranges.chr <-
        GenomicRanges::setdiff(stranded.ranges.chr, filter.chr)

      stranded.seq <- BSgenome::getSeq(genome, filtered.stranded.ranges.chr,
                                       as.character = TRUE)
      # Filter shorter homopolymer and microsatellites by regex
      stranded.seq <- gsub(homopolymer.ms.regex.pattern, "N", stranded.seq)
    } else {
      stranded.seq <- BSgenome::getSeq(genome, stranded.ranges.chr,
                                       as.character = TRUE)
    }
    kmer.counts <- kmer.counts + GetSequenceKmerCounts(stranded.seq, k)
  }
  return(kmer.counts)
}

#' Generate custom k-mer abundance from a given reference genome
#'
#' @param k Length of k-mers (k>=2)
#'
#' @param ref.genome A \code{ref.genome} argument as described in
#'   \code{\link{ICAMS}}.
#'
#' @param custom.range A keyed data table which has custom ranges information. It
#'   has three columns: chrom, start and end. It should use one-based coordinate
#'   system. You can use the internal function in this package
#'   \code{ICAMS:::ReadBedRanges} to read a BED file in 0-based coordinates and
#'   convert it to 1-based coordinates.
#'   
#' @param filter.path If given, homopolymers will be masked from
#'   genome(sequence). Only simple repeat masking is accepted now.
#'   
#' @param verbose If \code{TRUE} generate progress messages.
#'
#' @importFrom stats start end
#'
#' @importFrom GenomicRanges GRanges
#'
#' @importFrom IRanges IRanges
#'
#' @return Matrix of the counts of custom k-mer across the \code{ref.genome}
#'
#' @keywords internal
GetCustomKmerCounts <- function(k, ref.genome, custom.ranges, filter.path, 
                                verbose = FALSE) {
  custom.ranges <- StandardChromName(custom.ranges)
  genome <- NormalizeGenomeArg(ref.genome)
  kmer.counts <- GenerateEmptyKmerCounts(k)
  
  # Check whether chromosome names in custom.ranges are the same as in ref.genome
  if (!(custom.ranges$chrom[1] %in% seqnames(genome))) {
    custom.ranges$chrom <- paste0("chr", custom.ranges$chrom)
  }
  
  if (!missing(filter.path)) {
    filter.df <- fread(filter.path, header = FALSE, stringsAsFactors = FALSE)
    filter.df <- filter.df[filter.df$V6 <= 6]
    filter.df <- StandardChromName(filter.df[, 2:ncol(filter.df)])
    # Check whether chromosome names in filter.df are the same as in ref.genome
    if (!(filter.df$V2[1] %in% seqnames(genome))){
      filter.df$V2 <- paste0("chr", filter.df$V2)
    }
    
  }
  if (verbose) message("Start counting by chromosomes")
  
  for (chr in unique(custom.ranges$chrom)) {
    if (verbose) message(chr)
    temp.custom.ranges <- custom.ranges[custom.ranges$chrom == chr, ]
    custom.range.chr <-
      with(temp.custom.ranges, GRanges(chrom, IRanges(start, end)))
    
    # Remove the overlapping ranges in custom.range.chr
    custom.range.chr <- IRanges::reduce(custom.range.chr)
    
    if (!missing(filter.path)) {
      chr.filter.df <- filter.df[which(filter.df$V2 == chr), ]
      filter.chr <- with(chr.filter.df, GRanges(V2, IRanges(V3 + 1, V4)))
      filtered.custom.range.chr <-
        GenomicRanges::setdiff(custom.range.chr, filter.chr)
      custom.seq <- BSgenome::getSeq(genome, filtered.custom.range.chr,
                                     as.character = TRUE)
      #Filter shorter homopolymer and microsatellites by regex
      custom.seq <- gsub(homopolymer.ms.regex.pattern, "N", custom.seq)
      
    } else {
      custom.seq <- BSgenome::getSeq(genome, custom.range.chr,
                                     as.character = TRUE)
    }
    kmer.counts <- kmer.counts + GetSequenceKmerCounts(custom.seq, k)
  }
  return(kmer.counts)
}

# Redefine the [ methods for catalogs
#' @export
`[.SBS96Catalog` <- function (x, i, j, drop = if (missing(i)) TRUE else length(cols) ==
                                1) {
  y <- NextMethod("[")
  if (inherits(y, c("integer", "numeric"))) {
    return(y)
  } else {
    class(y) <- class(x)
    for (at in c("ref.genome", "catalog.type", "abundance", "region")) {
      attr(y, at) <- attr(x, at, exact = TRUE)
    }
    return(y)
  }
}

#' @export
`[.SBS192Catalog` <- function (x, i, j, drop = if (missing(i)) TRUE else length(cols) ==
                                1) {
  y <- NextMethod("[")
  if (inherits(y, c("integer", "numeric"))) {
    return(y)
  } else {
    class(y) <- class(x)
    for (at in c("ref.genome", "catalog.type", "abundance", "region")) {
      attr(y, at) <- attr(x, at, exact = TRUE)
    }
    return(y)
  }
}

#' @export
`[.SBS1536Catalog` <- function (x, i, j, drop = if (missing(i)) TRUE else length(cols) ==
                                 1) {
  y <- NextMethod("[")
  if (inherits(y, c("integer", "numeric"))) {
    return(y)
  } else {
    class(y) <- class(x)
    for (at in c("ref.genome", "catalog.type", "abundance", "region")) {
      attr(y, at) <- attr(x, at, exact = TRUE)
    }
    return(y)
  }
}

#' @export
`[.DBS78Catalog` <- function (x, i, j, drop = if (missing(i)) TRUE else length(cols) ==
                                  1) {
  y <- NextMethod("[")
  if (inherits(y, c("integer", "numeric"))) {
    return(y)
  } else {
    class(y) <- class(x)
    for (at in c("ref.genome", "catalog.type", "abundance", "region")) {
      attr(y, at) <- attr(x, at, exact = TRUE)
    }
    return(y)
  }
}

#' @export
`[.DBS144Catalog` <- function (x, i, j, drop = if (missing(i)) TRUE else length(cols) ==
                                1) {
  y <- NextMethod("[")
  if (inherits(y, c("integer", "numeric"))) {
    return(y)
  } else {
    class(y) <- class(x)
    for (at in c("ref.genome", "catalog.type", "abundance", "region")) {
      attr(y, at) <- attr(x, at, exact = TRUE)
    }
    return(y)
  }
}

#' @export
`[.DBS136Catalog` <- function (x, i, j, drop = if (missing(i)) TRUE else length(cols) ==
                                 1) {
  y <- NextMethod("[")
  if (inherits(y, c("integer", "numeric"))) {
    return(y)
  } else {
    class(y) <- class(x)
    for (at in c("ref.genome", "catalog.type", "abundance", "region")) {
      attr(y, at) <- attr(x, at, exact = TRUE)
    }
    return(y)
  }
}

#' @export
`[.IndelCatalog` <- function (x, i, j, drop = if (missing(i)) TRUE else length(cols) ==
                                 1) {
  y <- NextMethod("[")
  if (inherits(y, c("integer", "numeric"))) {
    return(y)
  } else {
    class(y) <- class(x)
    for (at in c("ref.genome", "catalog.type", "abundance", "region")) {
      attr(y, at) <- attr(x, at, exact = TRUE)
    }
    return(y)
  }
}

#' @keywords internal
CheckAndAssignAttributes <- function(x, list0) {
  for (at in c("ref.genome", "catalog.type", "abundance", "region")) {
    
    GetAttribute2 <- function(item, at) {
      attr(item, at, exact = TRUE)
    }
    attributes.list <- lapply(list0, FUN = GetAttribute2, at = at)
    
    CheckNullAttribute <- function(x, attributes.list) {
      is.null.result <- sapply(attributes.list, FUN = is.null)
      # If all the attributes are NULL, then x will also have NULL attribute
      if (all(is.null.result)) {
        attr(x, at) <- NULL
        return(list(is.null = TRUE, x = x))
      } else if (any(is.null.result)) {
        # If some attributes are NULL, then x will have a "mixed" attribute
        attr(x, at) <- "mixed"
        return(list(is.null = TRUE, x = x))
      }
    }
    
    if (at == "ref.genome") {
      result.list <- CheckNullAttribute(x, attributes.list)
      if (isTRUE(result.list$is.null)) {
        x <- result.list$x
      } else {
        GetRefGenomeName <- function(object) {
          return(object@pkgname)
        }
        ref.genome.check.result <- 
          sapply(attributes.list, FUN = GetRefGenomeName)
        if (length(unique(ref.genome.check.result)) == 1) {
          attr(x, at) <- attributes.list[[1]]
        } else {
          attr(x, at) <- "mixed"
        }
      }
    }
    
    if (at == "catalog.type") {
      is.null.result <- sapply(attributes.list, FUN = is.null)
      if (any(is.null.result)) {
        # If any attribute is NULL, then stop
        stop("Cannot perform cbind operation to catalogs with NULL",
             " catalog.type")
      } else {
        catalog.type.check.result <- do.call("c", attributes.list)
        if (length(unique(catalog.type.check.result)) == 1) {
          attr(x, at) <- attributes.list[[1]]
        } else {
          stop("Cannot perform cbind operation to catalogs with different",
               " catalog.type")
        }
      }
    }
    
    if (at == "abundance") {
      result.list <- CheckNullAttribute(x, attributes.list)
      if (isTRUE(result.list$is.null)) {
        x <- result.list$x
      } else {
        abundance.length.result <- sapply(attributes.list, FUN = length)
        if (length(unique(abundance.length.result)) == 1) {
          GetAbundanceNames <- function(abundance) {
            return(sort(names(abundance)))
          }
          
          abundance.names.list <- 
            lapply(attributes.list, FUN = GetAbundanceNames)
          CheckNamesOfAbundance <- function(abundance.names.list) {
            num <- length(abundance.names.list)
            ref.names <- abundance.names.list[[1]]
            for (i in 2:num) {
              if (!all(abundance.names.list[[i]] == ref.names)) {
                return(FALSE)
              } 
            }
            return(TRUE)
          }
          
          CheckValuesOfAbundance <- function(abundance.list) {
            num <- length(abundance.list)
            ref.values <- abundance.list[[1]][abundance.names.list[[1]]]
            for (i in 2:num) {
              if (!all(abundance.list[[i]][abundance.names.list[[i]]] == 
                       ref.values)) {
                return(FALSE)
              }
            }
            return(TRUE)
          }
          
          if (CheckNamesOfAbundance(abundance.names.list) && 
              CheckValuesOfAbundance(attributes.list)) {
            attr(x, at) <- attributes.list[[1]]
          } else {
            attr(x, at) <- "mixed"
          }
        } else {
          attr(x, at) <- "mixed"
        }
      }
    }
    
    if (at == "region") {
      result.list <- CheckNullAttribute(x, attributes.list)
      if (isTRUE(result.list$is.null)) {
        x <- result.list$x
      } else {
        region.check.result <- do.call("c", attributes.list)
        if (length(unique(region.check.result)) == 1) {
          attr(x, at) <- attributes.list[[1]]
        } else {
          attr(x, at) <- "mixed"
        }
      }
    }
  }
  return(x)
}

# Redefine the cbind methods for catalogs
#' @export
`cbind.SBS96Catalog` <- function (..., deparse.level = 1) {
  x <- base::cbind.data.frame(..., deparse.level = deparse.level)
  x <- data.matrix(x)
  class(x) <- class(..1)
  list0 <- list(...)
  x <- CheckAndAssignAttributes(x, list0)
  return(x)
}

#' @export
`cbind.SBS192Catalog` <- function (..., deparse.level = 1) {
  x <- base::cbind.data.frame(..., deparse.level = deparse.level)
  x <- data.matrix(x)
  class(x) <- class(..1)
  list0 <- list(...)
  x <- CheckAndAssignAttributes(x, list0)
  return(x)
}

#' @export
`cbind.SBS1536Catalog` <- function (..., deparse.level = 1) {
  x <- base::cbind.data.frame(..., deparse.level = deparse.level)
  x <- data.matrix(x)
  class(x) <- class(..1)
  list0 <- list(...)
  x <- CheckAndAssignAttributes(x, list0)
  return(x)
}

#' @export
`cbind.DBS78Catalog` <- function (..., deparse.level = 1) {
  x <- base::cbind.data.frame(..., deparse.level = deparse.level)
  x <- data.matrix(x)
  class(x) <- class(..1)
  list0 <- list(...)
  x <- CheckAndAssignAttributes(x, list0)
  return(x)
}

#' @export
`cbind.DBS144Catalog` <- function (..., deparse.level = 1) {
  x <- base::cbind.data.frame(..., deparse.level = deparse.level)
  x <- data.matrix(x)
  class(x) <- class(..1)
  list0 <- list(...)
  x <- CheckAndAssignAttributes(x, list0)
  return(x)
}

#' @export
`cbind.DBS136Catalog` <- function (..., deparse.level = 1) {
  x <- base::cbind.data.frame(..., deparse.level = deparse.level)
  x <- data.matrix(x)
  class(x) <- class(..1)
  list0 <- list(...)
  x <- CheckAndAssignAttributes(x, list0)
  return(x)
}

#' @export
`cbind.IndelCatalog` <- function (..., deparse.level = 1) {
  x <- base::cbind.data.frame(..., deparse.level = deparse.level)
  x <- data.matrix(x)
  class(x) <- class(..1)
  list0 <- list(...)
  x <- CheckAndAssignAttributes(x, list0)
  return(x)
}

#' Calculate base counts from three mer abundance
#' @keywords internal
CalBaseCountsFrom3MerAbundance <- function(three.mer.abundance) {
  base.counts <- integer(4)
  names(base.counts) <- c("A", "C", "G", "T")
  
  tmp <- three.mer.abundance
  names(tmp) <- substr(names(tmp), 2, 2)
  for (base in c("A", "C", "G", "T")) {
    base.counts[base] <- sum(tmp[names(tmp) == base])
  }
  return(base.counts)
}

#' @keywords internal
IsBinomialTestApplicable <- function(catalog) {
  catalog.type <- attributes(catalog)$catalog.type
  abundance <- attributes(catalog)$abundance
  
  if(catalog.type == "counts" && !is.null(abundance)) {
    if (length(abundance) == 64) {
      return(TRUE)
    } else {
      return(FALSE)
    }
  } else {
    return(FALSE)
  }
}

#' @keywords internal
AssignNumberOfAsterisks <- function(value) {
  label <- NULL
  if (value < 0.001) {
    label <- "***"
  } else if (value < 0.01) {
    label <- "**"
  } else if (value < 0.05) {
    label <- "*"
  }
  return(label)
}

#' @keywords internal
AdjustNumberOfCores <- function(num.of.cores) {
  if (.Platform$OS.type == "windows" && num.of.cores > 1) {
    message("On Windows, changing num.of.cores from ", num.of.cores, " to 1")
    return(1)
  }
  return(num.of.cores)
}

Try the ICAMS package in your browser

Any scripts or data that you put into this service are public.

ICAMS documentation built on April 3, 2021, 5:07 p.m.