R/infer_catalog_format.R

Defines functions InferRownames MakeCOMPOSITECatalogFromICAMSExt MakeCOMPOSITECatalog MakeID83CatalogFromSigPro MakeID83CatalogFromICAMSExt MakeID83Catalog MakeDBS144CatalogFromICAMSExt MakeDBS144Catalog MakeDBS136CatalogFromICAMSExt MakeDBS136Catalog MakeDBS78CatalogFromSigPro MakeDBS78CatalogFromICAMSExt MakeDBS78Catalog MakeSBS1536CatalogFromICAMSExt MakeSBS1536Catalog MakeSBS192CatalogFromICAMSExt MakeSBS192Catalog MakeID83ICAMSFromSigProID96 MakeSBS96CatalogFromSigPro MakeSBS96CatalogFromICAMSExt Make96Catalog InferCatalogClassString InferCatalogClassPrefix InferCatalogInfo

Documented in InferCatalogClassPrefix InferCatalogInfo InferRownames

#' This function converts an data.table imported
#' from external catalog text file into ICAMS
#' internal catalog object of appropriate type.
#'
#' 
#' @keywords internal
InferCatalogInfo <- function(object) {
  
  nrow <- nrow(object)

  if (nrow == 96)  {
    return(Make96Catalog(object))
  } 
  if (nrow == 192)  {
    return(MakeSBS192Catalog(object))
  }
  if (nrow == 1536) {
    return(MakeSBS1536Catalog(object))
  }
  if (nrow == 78)   {
    return(MakeDBS78Catalog(object))
  }
  if (nrow == 144){
    return(MakeDBS144Catalog(object))
  }
  if (nrow == 136){
    return(MakeDBS136Catalog(object))
  }
  if (nrow == 83)  {
    return(MakeID83Catalog(object))
  }
  if (nrow == 1697){ # TODO(Wuyang)
    return(MakeCOMPOSITECatalog(object))
  }  
  stop("\nThe number of rows in the input object must be one of\n",
       "96 192 1536 78 144 136 83 1697\ngot ", nrow)
  
}

#' These two functions is applicable only for 
#' internal ICAMS-formatted catalog object.
#'
#' @keywords internal
InferCatalogClassPrefix <- function(object) {
  
  nrow <- nrow(object)
  
  if (nrow == 96)   return("SBS96")
  if (nrow == 192)  return("SBS192")
  if (nrow == 1536) return("SBS1536")
  if (nrow == 78)   return("DBS78")
  if (nrow == 144)  return("DBS144")
  if (nrow == 136)  return("DBS136")
  if (nrow == 83)   return("ID")
  if (nrow == 1697) return("COMPOSITE")
  
  stop("\nThe number of rows in the input object must be one of\n",
       "96 192 1536 78 144 136 83 1697\ngot ", nrow)
  
}


InferCatalogClassString <- function(object) {
  
  prefix <- InferCatalogClassPrefix(object)
  if (prefix == "ID") prefix <- "Indel"
  return(paste0(prefix, "Catalog"))
}



## Convert external catalog files with 96 rows into ICAMS internal catalog format.
#' @keywords internal
Make96Catalog <- function(object) {
  # ICAMS and COSMIC SBS96 csv format
  if("C>A" %in% unlist(object[,1]) && "TTT" %in% unlist(object[,2])) {
    # Convert ICAMS / COSMIC csv into ICAMS internal format
    return(MakeSBS96CatalogFromICAMSExt(object))
  }
  ## SigPro SBS96 txt format
  if("A[C>A]A" %in% unlist(object[,1])) {
    return(MakeSBS96CatalogFromSigPro(object))
  }
  ## COSMIC / SigPro ID96 text format
  if(all(c("1:Del:C:0","1:Ins:C:0","1:Del:T:0","1:Ins:T:0",
           "2:Del:R:0","5:Ins:R:0","5:Del:M:5","5:Ins:M:5",
           "complex","non_matching") %in% unlist(object[,1]))) {
    return(MakeID83ICAMSFromSigProID96(object))
  }
  stop("96 mutation types, but not an SBS96 catalog in ICAMS/COSMIC/SigProfiler",
       " or ID96 catalog in ICAMS/COSMIC format")
}
## Both COSMIC and ICAMS use ICAMS-external SBS96 format: 
## C/T>D/V NYN
MakeSBS96CatalogFromICAMSExt <- function(cos) {
  ref.gt.var       <- unlist(cos[, 1])
  before.ref.after <- unlist(cos[, 2])
  var <- substring(ref.gt.var, 3, 3)
  out <- cos[, -(1 : 2), drop = FALSE]
  out <- as.matrix(out)
  rownames(out) <- paste0(before.ref.after, var)
  if (ncol(out) == 1) colnames(out) <- colnames(cos)[3]
  
  if(!setequal(rownames(out),ICAMS::catalog.row.order$SBS96)){
    stop("The mutation types in this SBS96 catalog is not in correct ICAMS format",
      " check ICAMS::catalog.row.order$SBS96 for more details.")
  }
  out <- out[ICAMS::catalog.row.order$SBS96, , drop = FALSE]
  out <- as.matrix(out)
  class(out) <- c("SBS96Catalog", class(out))
  return(out)
}
## N[C/T>D/V]N
MakeSBS96CatalogFromSigPro <- function(cos) {
  rownames <- Unstaple96(unlist(cos[ , 1]))
  cos <- cos[ , -1]
  out <- as.matrix(cos)
  rownames(out) <- rownames
  if(!setequal(rownames(out),ICAMS::catalog.row.order$SBS96)){
    stop("The mutation types in this SBS96 catalog is not in correct ICAMS format",
      " check ICAMS::catalog.row.order$SBS96 for more details.")
  }
  out <- out[ICAMS::catalog.row.order$SBS96, ]
  class(out) <- c("SBS96Catalog", class(out))
  return(out)
}
## See ICAMS:::TransRownames.ID.SigPro.PCAWG() for more details
## on ID96/ID83 format.
MakeID83ICAMSFromSigProID96 <- function(cos) {

  ## Row names before conversion.
  headerPrev <- unlist(cos[,1])
  names(headerPrev) <- NULL

  ## Row headers not in ID83
  headerIDsToBeDeleted <- which(headerPrev %in% c("complex","non_matching","non-matching", "non.matching"))
  headerIDsToBeDeleted <- c(headerIDsToBeDeleted, grep("Ins:M",headerPrev))
  cos <- cos[setdiff(1:96,headerIDsToBeDeleted),]
  headers <- unlist(cos[,1])
  headers <- TransRownames.ID.SigPro.PCAWG(headers)

  out <- cos[, -1] # No need to add drop = FALSE
  out <- as.matrix(out)
  rownames(out) <- headers
  
  if(!setequal(rownames(out),ICAMS::catalog.row.order$ID)){
    stop("The mutation types in this ID83 catalog is not in correct ICAMS format",
      " check ICAMS::catalog.row.order$ID for more details.")
  }
  
  # Need drop = FALSE, otherwise out will not be a matrix and has attributes
  # other than names. Then in ReadCatalogInternal(), as.catalog() will throw an
  # error as is.vector(out) is FALSE.
  out <- out[ICAMS::catalog.row.order$ID, , drop = FALSE]
  ## TODO(Wuyang): check if the matrix is unified.
  class(out) <- c("IndelCatalog", class(out))
  return(out)
}

## Convert external catalog files with 192 rows (should always be SBS192)
## into ICAMS SBS192 internal catalog object.
MakeSBS192Catalog <- function(object) {
  # ICAMS / COSMIC SBS192 csv format
  if(setequal(c("T","U"), unique(unlist(object[,1]))) && "C>A" %in% unlist(object[,2]) && "TTT" %in% unlist(object[,3])) {
    # Convert ICAMS / COSMIC csv into ICAMS internal format
    return(MakeSBS192CatalogFromICAMSExt(object))
  }
  stop("192 mutation types, but not an SBS192 catalog in",
       " ICAMS or COSMIC format")
}
## COSMIC, ICAMS and SigPro all use ICAMS-external SBS192 format:
## T/U C/T>D/V NYN
MakeSBS192CatalogFromICAMSExt <- function(cos) {
  
  stopifnot(nrow(cos) == 192)
  ref.gt.var       <- unlist(cos[, 2])
  before.ref.after <- unlist(cos[, 3])

  ## Find the rows labeled with "T", indicating the
  ## SBS is on the transcribed (which is the *antisense*) strand.
  transcribed.strand.pos <- which(cos[, 1] == 'T')

  before.ref.after[transcribed.strand.pos] <-
    revc(before.ref.after[transcribed.strand.pos])

  var <- substring(ref.gt.var, 3, 3)
  var[transcribed.strand.pos] <- revc(var[transcribed.strand.pos])

  internalHeaders <- paste0(before.ref.after, var)
  stopifnot(setequal(internalHeaders, ICAMS::catalog.row.order$SBS192))

  out <- cos[, -(1 : 3), drop = FALSE]
  out <- as.matrix(out)
  rownames(out) <- internalHeaders
  if(!setequal(rownames(out),ICAMS::catalog.row.order$SBS192)){
    stop("The mutation types in this SBS192 catalog is not in correct ICAMS format",
      " check ICAMS::catalog.row.order$SBS192 for more details.")
  }
  out <- out[ICAMS::catalog.row.order$SBS192, , drop = FALSE]
  class(out) <- c("SBS192Catalog", class(out))
  return(out)
}


## Convert external catalog files with 1536 rows (should always be SBS1536)
## into ICAMS SBS1536 internal catalog object.
MakeSBS1536Catalog <- function(object) {
  # ICAMS SBS1536 csv format
  if("C>A" %in% unlist(object[,1]) && "TTTGT" %in% unlist(object[,2])) {
    # Convert ICAMS / COSMIC csv into ICAMS internal format
    return(MakeSBS1536CatalogFromICAMSExt(object))
  }
  stop("1536 mutation types, but not an SBS1536 catalog in",
       " ICAMS format")
}
## ICAMS uses ICAMS-external SBS1536 format:
## C/T>D/V NNYNN
MakeSBS1536CatalogFromICAMSExt <- function(cos) {
  
  stopifnot(nrow(cos) == 1536)
  names(cos)[1:2] <- c("Mutation type", "Pentanucleotide")
  ref.gt.var       <- cos[["Mutation type"]]
  before.ref.after <- cos[["Pentanucleotide"]]
  var <- substring(ref.gt.var, 3, 3)
  out <- as.matrix(cos[ , -(1 : 2)], drop = FALSE)
  rownames(out) <- paste0(before.ref.after, var)
  if (ncol(out) == 1) colnames(out) <- colnames(cos)[3]
  # Check whether the row names are setequal to standard internal row names.
  if(!setequal(rownames(out),ICAMS::catalog.row.order$SBS1536)){
    stop("The mutation types in this SBS1536 catalog is not in correct ICAMS format",
      " check ICAMS::catalog.row.order$SBS1536 for more details.")
  }
  out <- out[ICAMS::catalog.row.order$SBS1536, , drop = FALSE]
  return(out)
}


## Convert external catalog files with 78 rows into ICAMS DBS78 internal catalog format.
MakeDBS78Catalog <- function(object) {
  # ICAMS DBS78 csv format
  if("AC" %in% unlist(object[,1]) && "GA" %in% unlist(object[,2])) {
    # Convert ICAMS-external csv into ICAMS internal format
    return(MakeDBS78CatalogFromICAMSExt(object))
  }
  ## SigPro DBS78 txt format
  if("AC>GA" %in% unlist(object[,1])) {
    return(MakeDBS78CatalogFromSigPro(object))
  }
  stop("78 mutation types, but not a DBS78 catalog in",
       " ICAMS, COSMIC, or SigProfiler format")
}
## NN(Ref) NN(Var)
MakeDBS78CatalogFromICAMSExt <- function(cos) {
  ref <- unlist(cos[, 1])
  var <- unlist(cos[, 2])
  out <- cos[, -(1 : 2), drop = FALSE]
  out <- as.matrix(out)
  rownames(out) <- paste0(ref, var)
  if (ncol(out) == 1) colnames(out) <- colnames(cos)[3]
  
  if(!setequal(rownames(out),ICAMS::catalog.row.order$DBS78)){
    stop("The mutation types in this DBS78 catalog is not in correct ICAMS format",
      " check ICAMS::catalog.row.order$DBS78 for more details.")
  }
  out <- out[ICAMS::catalog.row.order$DBS78, , drop = FALSE]
  out <- as.matrix(out)
  class(out) <- c("DBS78Catalog", class(out))
  return(out)
}
## Both COSMIC and SigPro use SigPro DBS78 format:
## NN>NN
MakeDBS78CatalogFromSigPro <- function(cos) {
  rownames <- Unstaple78(unlist(cos[ , 1]))
  cos <- cos[ , -1]
  out <- as.matrix(cos)
  rownames(out) <- rownames
  
  if(!setequal(rownames(out),ICAMS::catalog.row.order$DBS78)){
    stop("The mutation types in this DBS78 catalog is not in correct ICAMS format",
      " check ICAMS::catalog.row.order$DBS78 for more details.")
  }
  out <- out[ICAMS::catalog.row.order$DBS78, ]
  class(out) <- c("DBS78Catalog", class(out))
  return(out)
}

## Convert external catalog files with 136 rows into ICAMS DBS136 internal catalog format.
MakeDBS136Catalog <- function(object) {
  # ICAMS DBS136 csv format
  if("AATA" %in% unlist(object[,1])) {
    # Convert ICAMS-external csv into ICAMS internal format
    return(MakeDBS136CatalogFromICAMSExt(object))
  }
  stop("136 mutation types, but not a DBS136 catalog in",
       " ICAMS format")
}
## Quad
## NNNN
MakeDBS136CatalogFromICAMSExt <- function(cos) {

  names(cos)[1] <- "Quad"
  out <- cos[, -1, drop = FALSE]
  out <- as.matrix(out)
  rownames(out) <- cos$Quad

  if(!setequal(rownames(out), ICAMS::catalog.row.order$DBS136)){
    stop("The mutation types in this DBS136 catalog is not in correct ICAMS format",
      " check ICAMS::catalog.row.order$DBS136 for more details.")
  }
  out <- out[ICAMS::catalog.row.order$DBS136, , drop = FALSE]
  return(out)
}


## Convert external catalog files with 144 rows into ICAMS DBS144 internal catalog format.
MakeDBS144Catalog <- function(object) {
  # ICAMS DBS144 csv format
  if("AC" %in% unlist(object[,1]) && "GA" %in% unlist(object[,2])) {
    # Convert ICAMS-external csv into ICAMS internal format
    return(MakeDBS144CatalogFromICAMSExt(object))
  }
  stop("144 mutation types, but not a DBS144 catalog in",
       " ICAMS format")
}
## Ref Var
## NN > NN 
MakeDBS144CatalogFromICAMSExt <- function(cos) {
  
  stopifnot(nrow(cos) == 144)
  names(cos)[1 : 2] <- c("Ref", "Var")
  out <- cos[, -(1 : 2), drop = FALSE]
  out <- as.matrix(out)
  rn <- paste0(cos$Ref, cos$Var)
  rownames(out) <- rn
  if(!setequal(rownames(out), ICAMS::catalog.row.order$DBS144)){
    stop("The mutation types in this DBS144 catalog is not in correct ICAMS format",
      " check ICAMS::catalog.row.order$DBS144 for more details.")
  }
  out <- out[ICAMS::catalog.row.order$DBS144, , drop = FALSE]
  return(out)
}


## Convert external catalog files with 83 rows into ICAMS ID83 internal catalog format.
MakeID83Catalog <- function(object) {
  # ICAMS ID83 csv format
  if(setequal(c("DEL","INS"),unique(unlist(object[,1]))) &&
     "repeats" %in% unlist(object[,2]) && 
     "5+" %in% unlist(object[,3]) && 
     "5+" %in% unlist(object[,4])) {
    # Convert ICAMS-external csv into ICAMS internal format
    return(MakeID83CatalogFromICAMSExt(object))
  }
  ## COSMIC and SigPro ID83 txt format
  if("3:Del:R:5" %in% unlist(object[,1])) {
    return(MakeID83CatalogFromSigPro(object))
  }
  stop("83 mutation types, but not a ID83 catalog in",
       " ICAMS, COSMIC, or SigProfiler format")
}
## Type Subtype Indel_size Repeat_MH_size
## INS/DEL C/T/repeats/MH 1~5+ 0~5+
MakeID83CatalogFromICAMSExt <- function(cos) {
     
    cn <- names(cos)
    ex.cn <- c("Type", "Subtype", "Indel_size", "Repeat_MH_size")
    # Repeat_MH_size is the size of repeat OR microhomology (MH)
    # if (strict) { for (i in 1:4) { stopifnot(cn[i] == ex.cn[i]) } }
    names(cos)[1:4] <- ex.cn
    rn <- apply(cos[ , 1:4], MARGIN = 1, paste, collapse = ":")
    out <- as.matrix(cos[ , -(1:4), drop = FALSE])
    
    
    # null.out <- matrix(NA, ncol = ncol(out), nrow = nrow(out))
    if (!setequal(rn, ICAMS::catalog.row.order$ID)) {
      msg <- 
        paste("The row names are not correct:\n",
              "got", paste(rn, collapse = ", "),
              "\nexpected", paste(ICAMS::catalog.row.order$ID,
                                  collapse = ", "))
      stop(msg)
    }
    rownames(out) <- rn
    #   if (ncol(out) == 1) colnames(out) <- colnames(cos)[3] 
    out <- out[ICAMS::catalog.row.order$ID, , drop = FALSE]
    return(out)
}
## <Indel_size:Type:Subtype:Repeat_MH_size>
## 1~5:Ins/Del:C/T/R/M:0~5
MakeID83CatalogFromSigPro <- function(cos) {
  rn <- unlist(cos[,1])
  if(!setequal(rn, catalog.row.headers.sp$ID83)) {
      msg <- 
        paste("The row names are not correct:\n",
              "got", paste(rn, collapse = ", "),
              "\nexpected", paste(catalog.row.headers.sp$ID83,
                                  collapse = ", "))
      stop(msg)
  }
  internalHeader <- TransRownames.ID.SigPro.PCAWG(rn)
  cos <- as.matrix(cos[,-1])
  rownames(cos) <- internalHeader
  return(cos)
}

## Convert external catalog files with 1697 rows into ICAMS COMPOSITE internal catalog format.
MakeCOMPOSITECatalog <- function(object) {
  # ICAMS COMPOSITE csv format
  if("TCTAAA" %in% unlist(object[,1])) {
    # Convert ICAMS-external csv into ICAMS internal format
    return(MakeCOMPOSITECatalogFromICAMSExt(object))
  }
  stop("1697 mutation types, but not a COMPOSITE catalog in",
       " ICAMS format")
}
## NN(Ref) NN(Var)
MakeCOMPOSITECatalogFromICAMSExt <- function(cos) {
    
  names <- cos[[1]]
  cos <- cos[, -1]
  out <- as.matrix(cos)
  rownames(out) <- names
  
  if(!setequal(rownames(out),ICAMS::catalog.row.order$COMPOSITE)){
    stop("The mutation types in this COMPOSITE catalog is not in correct ICAMS format",
      " check ICAMS::catalog.row.order$COMPOSITE for more details.")
  }
  out <- out[ICAMS::catalog.row.order$COMPOSITE, , drop = FALSE]
  out <- as.matrix(out)
  class(out) <- c("COMPOSITECatalog", class(out))
  return(out)
}




#' Infer the correct rownames for a matrix based on its number of rows
#' 
#' @keywords internal
InferRownames <- function(object) {
  prefix <- InferCatalogClassPrefix(object)
  return(ICAMS::catalog.row.order[[prefix]])
}

Try the ICAMS package in your browser

Any scripts or data that you put into this service are public.

ICAMS documentation built on April 3, 2021, 5:07 p.m.